class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # TODO Implement the following pseudocode: DONE
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss
        loss['Actor_Loss'] = actor_loss
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # TODO Implement the following pseudocode: DONE
        # 1) query the critic with ob_no, to get V(s)
        # 2) query the critic with next_ob_no, to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
        # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1)
        # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s)
        V_s = self.critic.forward_np(ob_no)
        V_s_prime = self.critic.forward_np(next_ob_no)
        Q_s_a = re_n + self.gamma * V_s_prime * (1 - terminal_n)
        adv_n = Q_s_a - V_s

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent: def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update'] self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update'] self.device = agent_params['device'] self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params['standardize_advantages'] self.actor = MLPPolicyAC(self.agent_params['ob_dim'], self.agent_params['ac_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['device'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer(agent_params['replay_size']) def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): ob, next_ob, rew, done = map(lambda x: torch.from_numpy(x).to(self.device), [ob_no, next_ob_no, re_n, terminal_n]) value = self.critic.value_func(ob).squeeze() next_value = self.critic.value_func(next_ob).squeeze() * (1 - done) adv_n = rew + (self.gamma * next_value) - value adv_n = adv_n.cpu().detach().numpy() if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): loss = OrderedDict() for critic_update in range(self.num_critic_updates_per_agent_update): loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n) adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # put final critic loss here for actor_update in range(self.num_actor_updates_per_agent_update): loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv_n) # put final actor loss here return loss def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent): def __init__(self, sess, env, agent_params): super(ACAgent, self).__init__() self.env = env self.sess = sess self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params['standardize_advantages'] self.actor = MLPPolicyAC(sess, self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(sess, self.agent_params) self.replay_buffer = ReplayBuffer() def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # TODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) vs = self.sess.run(self.critic.critic_prediction, feed_dict = {self.critic.sy_ob_no : ob_no}) vsprime = self.sess.run(self.critic.critic_prediction, feed_dict = {self.critic.sy_ob_no : next_ob_no})*(1-terminal_n) q_val = re_n + self.gamma * vsprime adv_n = q_val - vs if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # TODO Implement the following pseudocode: # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic # advantage = estimate_advantage(...) # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor for x in range(self.agent_params['num_critic_updates_per_agent_update']): closs = self.critic.update(ob_no, next_ob_no, re_n, terminal_n) advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) for x in range(self.agent_params['num_actor_updates_per_agent_update']): aloss = self.actor.update(ob_no, ac_na, advantage) loss = OrderedDict() loss['Critic_Loss'] = closs # put final critic loss here loss['Actor_Loss'] = aloss # put final actor loss here return loss def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent:
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()
        self.env = env
        self.agent_params = agent_params
        self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update']
        self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update']
        self.device = agent_params['device']
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(self.agent_params['ac_dim'],
                                 self.agent_params['ob_dim'],
                                 self.agent_params['n_layers'],
                                 self.agent_params['size'],
                                 self.agent_params['device'],
                                 discrete=self.agent_params['discrete'],
                                 learning_rate=self.agent_params['learning_rate'],
                                 )
        self.critic = BootstrappedContinuousCritic(self.agent_params)
        self.replay_buffer = ReplayBuffer()

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        ob, next_ob, rew, done = map(
            lambda x: torch.from_numpy(x).to(self.device),
            [ob_no, next_ob_no, re_n, terminal_n])
        # 1) query the critic with ob_no, to get V(s)
        # 2) query the critic with next_ob_no, to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
        #    (cut off the V(s') term, i.e. set it to 0, at terminal states where terminal_n=1)
        # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s)
        value_s = self.critic.value_func(ob).squeeze()
        value_next_s = self.critic.value_func(next_ob).squeeze() * (1 - done)
        q_val = rew + self.gamma * value_next_s
        adv_n = (q_val - value_s).cpu().detach().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()
        # for agent_params['num_critic_updates_per_agent_update'] steps, update the critic
        for _ in range(self.num_critic_updates_per_agent_update):
            loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n)  # final critic loss
        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        # for agent_params['num_actor_updates_per_agent_update'] steps, update the actor
        for _ in range(self.num_actor_updates_per_agent_update):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantage)  # final actor loss
        return loss

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class PGAgent(BaseAgent): def __init__(self, env, agent_params): super(PGAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params['standardize_advantages'] self.nn_baseline = self.agent_params['nn_baseline'] self.reward_to_go = self.agent_params['reward_to_go'] # actor/policy self.actor = MLPPolicyPG( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], nn_baseline=self.agent_params['nn_baseline'] ) # replay buffer self.replay_buffer = ReplayBuffer(1000000) def train(self, observations, actions, rewards_list, next_observations, terminals): """ Training a PG agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. """ # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T) q_values = self.calculate_q_vals(rewards_list) # step 2: calculate advantages that correspond to each (s_t, a_t) point advantages = self.estimate_advantage(observations, q_values) # step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy train_log = self.actor.update(observations, actions, advantages, q_values=q_values) return train_log def calculate_q_vals(self, rewards_list): """ Monte Carlo estimation of the Q function. """ # Case 1: trajectory-based PG # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over entire trajectory if not self.reward_to_go: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'} q_values = np.concatenate([self._discounted_return(r) for r in rewards_list]) # Case 2: reward-to-go PG # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t else: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'} q_values = np.concatenate([self._discounted_cumsum(r) for r in rewards_list]) return q_values def estimate_advantage(self, obs, q_values): """ Computes advantages by (possibly) subtracting a baseline from the estimated Q values """ # Estimate the advantage when nn_baseline is True, # by querying the neural network that you're using to learn the baseline if self.nn_baseline: baselines_unnormalized = self.actor.run_baseline_prediction(obs) ## ensure that the baseline and q_values have the same dimensionality ## to prevent silent broadcasting errors assert baselines_unnormalized.ndim == q_values.ndim ## baseline was trained with standardized q_values, so ensure that the predictions ## have the same mean and standard deviation as the current batch of q_values baselines = baselines_unnormalized * np.std(q_values) + np.mean(q_values) advantages = q_values - baselines # Else, just set the advantage to [Q] else: advantages = q_values.copy() # Normalize the resulting advantages if self.standardize_advantages: advantages = utils.normalize(advantages, np.mean(advantages), np.std(advantages)) return advantages ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, 
batch_size): return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False) ##################################################### ################## HELPER FUNCTIONS ################# ##################################################### def _discounted_return(self, rewards): """ Helper function Input: list of rewards {r_0, r_1, ..., r_t', ... r_T} from a single rollout of length T Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'} """ discounted_return = sum([(self.gamma**t) * r for t, r in enumerate(rewards)]) return [discounted_return] * len(rewards) def _discounted_cumsum(self, rewards): """ Helper function which -takes a list of rewards {r_0, r_1, ..., r_t', ... r_T}, -and returns a list where the entry in each index t is sum_{t'=t}^T gamma^(t'-t) * r_{t'} """ discounted_returns_to_go = [] for t in range(len(rewards)): return_to_go = sum([(self.gamma**tp) * r for tp, r in enumerate(rewards[t:])]) discounted_returns_to_go.append(return_to_go) return discounted_returns_to_go
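The `_discounted_cumsum` helper above recomputes the inner sum for every t, which is O(T^2) in the trajectory length. A minimal vectorized sketch that computes the same reward-to-go in O(T), assuming NumPy arrays and a scalar `gamma` (the standalone function name is hypothetical):

import numpy as np

def discounted_cumsum(rewards, gamma):
    # out[t] = sum_{t'=t}^{T-1} gamma^(t'-t) * rewards[t']
    rewards = np.asarray(rewards, dtype=np.float64)
    discounts = gamma ** np.arange(len(rewards))
    # discount everything back to t=0, cumulative-sum from the right,
    # then rescale so each entry is discounted relative to its own time step
    tail_sums = np.flip(np.cumsum(np.flip(rewards * discounts)))
    return tail_sums / discounts

Note that dividing by `discounts` can underflow for very long horizons with small gamma; the backward recursion used in other implementations in this section is numerically safer in that regime.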
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic loss = OrderedDict() for _ in range( self.agent_params['num_critic_updates_per_agent_update']): loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n) advantages = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor for _ in range( self.agent_params['num_actor_updates_per_agent_update']): loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantages) return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) ob_no = ptu.from_numpy(ob_no) next_ob_no = ptu.from_numpy(next_ob_no) re_n = ptu.from_numpy(re_n) terminal_n = ptu.from_numpy(terminal_n).bool() v_s = self.critic(ob_no) v_sp1 = self.critic(next_ob_no) v_sp1[terminal_n] = 0 q_sa = re_n + self.gamma * v_sp1 adv_n = q_sa - v_s assert adv_n.size() == re_n.size() adv_n = adv_n.detach().cpu().numpy() if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): ob_no = ptu.from_numpy(ob_no) next_ob_no = ptu.from_numpy(next_ob_no) terminal_n = ptu.from_numpy(terminal_n) re_n = ptu.from_numpy(re_n) ac_na = ptu.from_numpy(ac_na) loss_critic = 0. for i in range( self.agent_params['num_critic_updates_per_agent_update']): loss_critic += self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n) # advantage = estimate_advantage(...) : adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # a tensor is returned loss_actor = 0. for i in range( self.agent_params['num_actor_updates_per_agent_update']): loss_actor += self.actor.update(ob_no, ac_na, adv_n) loss = OrderedDict() loss['Critic_Loss'] = loss_critic loss[ 'Actor_Loss'] = loss_actor # in TensorBoard, loss_actor actually increases as we actually minimize -loss_actor return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # TODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) # V_s_prime = self.critic.critic_network(next_ob_no) # V_s_prime = V_s_prime.squeeze() # mask = (terminal_n == 1.) # V_s_prime= V_s_prime.masked_fill(mask, 0.) # # V_s = self.critic.critic_network(ob_no) # V_s = V_s.squeeze() # # assert V_s_prime.ndim == V_s.ndim # TODO-assert enable this assert in debug # adv_n2 = re_n + self.gamma * V_s_prime - V_s # another way to calculate: V_s_prime = re_n + ( 1 - terminal_n) * self.gamma * self.critic.forward(next_ob_no) adv_n = V_s_prime - self.critic.forward(ob_no) # assert adv_n2 == adv_n if self.standardize_advantages: adv_n = (adv_n - torch.mean(adv_n)) / (torch.std(adv_n) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class PGAgent(BaseAgent): def __init__(self, env, agent_params): super(PGAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.nn_baseline = self.agent_params['nn_baseline'] self.reward_to_go = self.agent_params['reward_to_go'] # actor/policy self.actor = MLPPolicyPG( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], nn_baseline=self.agent_params['nn_baseline']) # replay buffer self.replay_buffer = ReplayBuffer(1000000) def train(self, observations, actions, rewards_list, next_observations, terminals): """ Training a PG agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. """ # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T) q_values = self.calculate_q_vals(rewards_list) # step 2: calculate advantages that correspond to each (s_t, a_t) point advantages = self.estimate_advantage(observations, q_values) # TODO: step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy ## HINT: `train_log` should be returned by your actor update method train_log = self.actor.update(observations, actions, advantages, q_values) return train_log def calculate_q_vals(self, rewards_list): """ Monte Carlo estimation of the Q function. """ # Case 1: trajectory-based PG # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over entire trajectory if not self.reward_to_go: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'} q_values = np.concatenate( [self._discounted_return(r) for r in rewards_list]) # Case 2: reward-to-go PG # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t else: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'} q_values = np.concatenate( [self._discounted_cumsum(r) for r in rewards_list]) return q_values def estimate_advantage(self, obs, q_values): """ Computes advantages by (possibly) subtracting a baseline from the estimated Q values """ # Estimate the advantage when nn_baseline is True, # by querying the neural network that you're using to learn the baseline if self.nn_baseline: baselines_unnormalized = self.actor.run_baseline_prediction(obs) ## ensure that the baseline and q_values have the same dimensionality ## to prevent silent broadcasting errors assert baselines_unnormalized.ndim == q_values.ndim ## baseline was trained with standardized q_values, so ensure that the predictions ## have the same mean and standard deviation as the current batch of q_values baselines = baselines_unnormalized * np.std(q_values) + np.mean( q_values) ## TODO: compute advantage estimates using q_values and baselines advantages = q_values - baselines # Else, just set the advantage to [Q] else: advantages = q_values.copy() # Normalize the resulting advantages if self.standardize_advantages: ## TODO: standardize the advantages to have a mean of zero ## and a standard deviation of one ## HINT: there is a `normalize` function in `infrastructure.utils` advantages = 
(advantages - advantages.mean()) / (advantages.std() + 1e-8) return advantages ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False) ##################################################### ################## HELPER FUNCTIONS ################# ##################################################### def _discounted_return(self, rewards): """ Helper function Input: list of rewards {r_0, r_1, ..., r_t', ... r_T} from a single rollout of length T Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'} """ # TODO: create list_of_discounted_returns # Hint: note that all entries of this output are equivalent # because each sum is from 0 to T (and doesnt involve t) out = sum(self.gamma**t * rew for t, rew in enumerate(rewards)) return [out for _ in range(len(rewards))] def _discounted_cumsum(self, rewards): """ Helper function which -takes a list of rewards {r_0, r_1, ..., r_t', ... r_T}, -and returns a list where the entry in each index t' is sum_{t'=t}^T gamma^(t'-t) * r_{t'} """ # TODO: create `list_of_discounted_returns` # HINT1: note that each entry of the output should now be unique, # because the summation happens over [t, T] instead of [0, T] # HINT2: it is possible to write a vectorized solution, but a solution # using a for loop is also fine ret, q = [], 0 for rew in reversed(rewards): ret.append(q * self.gamma + rew) q = ret[-1] return ret[::-1]
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.gae = self.agent_params['gae'] self.gae_lambda = self.agent_params['gae_lambda'] self.ppo = self.agent_params['ppo'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], self.agent_params['clip_eps'], ) if self.ppo: self.old_actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], self.agent_params['clip_eps'], ) self.old_actor.load_state_dict(self.actor.state_dict()) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # TODO Implement the following pseudocode: # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic rewards = np.concatenate([r for r in re_n]) if self.gae else re_n assert rewards.shape == terminal_n.shape for i in range( self.agent_params['num_critic_updates_per_agent_update']): loss_critic = self.critic.update(ob_no, ac_na, next_ob_no, rewards, terminal_n) advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) old_log_prob = self.get_old_prob(self.old_actor, ob_no, ac_na) if self.ppo else None # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor for i in range( self.agent_params['num_actor_updates_per_agent_update']): loss_actor = self.actor.update(ob_no, ac_na, advantage, old_log_prob) if self.ppo: self.old_actor.load_state_dict(self.actor.state_dict()) loss = OrderedDict() loss['Critic_Loss'] = loss_critic loss['Actor_Loss'] = loss_actor return loss def get_old_prob(self, old_policy, ob_no, ac_na): observations = ptu.from_numpy(ob_no) actions = ptu.from_numpy(ac_na) log_prob = old_policy.forward(observations).log_prob(actions) return ptu.to_numpy(log_prob) def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # TODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) v_s = self.critic.forward_np(ob_no) if not self.gae: v_s_next = self.critic.forward_np(next_ob_no) * (1 - terminal_n) adv_n = re_n + self.gamma * v_s_next - v_s else: index = 0 adv_n = np.zeros(len(ob_no)) for rewards in re_n: gae_deltas = [] for i in range(len(rewards) - 1): delta = rewards[i] + self.gamma * v_s[index + i + 1] - v_s[index + i] gae_deltas.append(delta) i = len(rewards) - 1 gae_deltas.append(rewards[i] - v_s[index + i]) assert len(gae_deltas) == len(rewards) sum_deltas = 0 for t in range(len(gae_deltas) - 1, -1, -1): sum_deltas = gae_deltas[ t] + sum_deltas * self.gamma * self.gae_lambda adv_n[t + index] = sum_deltas index += len(rewards) if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): 
concat_rew = False if self.gae else True return self.replay_buffer.sample_recent_data(batch_size, concat_rew)
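The per-trajectory GAE loop in the agent above can be isolated into a small helper. A sketch under the same assumptions (rewards and critic values for a single trajectory, with V(s_T) bootstrapped to 0 when the trajectory ends at a true terminal state; the function name is hypothetical):

import numpy as np

def gae_advantages(rewards, values, last_value, gamma, lam):
    # values[t] = V(s_t) for t = 0..T-1; last_value bootstraps V(s_T)
    values = np.append(values, last_value)
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        advantages[t] = gae
    return advantages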
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): loss = OrderedDict() ob_no = ptu.from_numpy(ob_no) ac_na = ptu.from_numpy(ac_na) re_n = ptu.from_numpy(re_n) next_ob_no = ptu.from_numpy(next_ob_no) terminal_n = ptu.from_numpy(terminal_n) for _ in range( self.agent_params['num_critic_updates_per_agent_update']): loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n) advantages = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) advantages = ptu.from_numpy(advantages) for _ in range( self.agent_params['num_actor_updates_per_agent_update']): loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv_n=advantages) return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): v_s_n = self.critic(ob_no) v_s_prime_n = self.critic(next_ob_no) # setting V(s') to zero if the next state is a terminal state q_n = re_n + self.gamma * v_s_prime_n * (1 - terminal_n) adv_n = q_n - v_s_n assert adv_n.size() == re_n.size() adv_n = adv_n.detach().cpu().numpy() if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
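Several of the agents in this section delegate to `BootstrappedContinuousCritic.update` without showing it. A minimal PyTorch sketch of what such a critic plausibly does (this class, its constructor arguments, and its update signature are assumptions, not the provided implementation): regress V(s) toward the bootstrapped TD target r + gamma * V(s') * (1 - done), holding the target fixed.

import torch
from torch import nn, optim

class ValueCritic(nn.Module):
    def __init__(self, ob_dim, size=64, lr=5e-3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(ob_dim, size), nn.Tanh(),
            nn.Linear(size, size), nn.Tanh(),
            nn.Linear(size, 1),
        )
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

    def forward(self, ob):
        return self.net(ob).squeeze(-1)

    def update(self, ob, next_ob, rew, done, gamma):
        # bootstrapped TD(0) target, computed without gradients so it stays fixed
        with torch.no_grad():
            target = rew + gamma * self.forward(next_ob) * (1.0 - done)
        loss = self.loss_fn(self.forward(ob), target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()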
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)
        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # for agent_params['num_critic_updates_per_agent_update'] steps, update the critic
        # advantage = estimate_advantage(...)
        # for agent_params['num_actor_updates_per_agent_update'] steps, update the actor
        loss = OrderedDict()
        critic_losses = []
        actor_losses = []
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_losses.append(
                self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n))
        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_losses.append(self.actor.update(ob_no, ac_na, advantage))
        loss['Critic_Loss'] = np.mean(critic_losses)
        loss['Actor_Loss'] = np.mean(actor_losses)
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no, to get V(s)
        # 2) query the critic with next_ob_no, to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
        #    (cut off the V(s') term, i.e. set it to 0, at terminal states where terminal_n=1)
        # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s)
        Vs = self.critic(ob_no)
        Vs_prime = self.critic(next_ob_no)
        terminal_index = [i for i, x in enumerate(terminal_n) if x]
        if len(terminal_index):
            Vs_prime[terminal_index] = 0
        Qs = re_n + self.gamma * Vs_prime
        adv_n = Qs - Vs

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class PGAgent(BaseAgent): def __init__(self, env, agent_params, batch_size=500000, **kwargs): super(PGAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.nn_baseline = self.agent_params['nn_baseline'] self.reward_to_go = self.agent_params['reward_to_go'] # actor/policy if self.agent_params['discrete']: self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size']) else: self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size']) self.policy_optimizer = tf.keras.optimizers.Adam( learning_rate=self.agent_params['learning_rate']) # replay buffer self.replay_buffer = ReplayBuffer(2 * batch_size) self.baseline_model = None if self.agent_params['nn_baseline']: self.baseline_model = build_mlp( (self.agent_params['ob_dim'], ), output_size=1, n_layers=self.agent_params['n_layers'], size=self.agent_params['size'], name='baseline_model') self.baseline_loss = tf.keras.losses.MeanSquaredError() self.baseline_optimizer = tf.keras.optimizers.Adam( learning_rate=self.agent_params['learning_rate']) self.baseline_model.compile(optimizer=self.baseline_optimizer, loss=self.baseline_loss) def train(self, obs, acs, rews_list, next_obs, terminals): """ Training a PG agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. ---------------------------------------------------------------------------------- Recall that the expression for the policy gradient PG is PG = E_{tau} [sum_{t=0}^{T-1} grad log pi(a_t|s_t) * (Q_t - b_t )] where tau=(s_0, a_0, s_1, a_1, s_2, a_2, ...) is a trajectory, Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), b_t is a baseline which may depend on s_t, and (Q_t - b_t ) is the advantage. Thus, the PG update performed by the actor needs (s_t, a_t, q_t, adv_t), and that is exactly what this function provides. 
---------------------------------------------------------------------------------- """ # step 1: calculate q values of each (s_t, a_t) point, # using rewards from that full rollout of length T: (r_0, ..., r_t, ..., r_{T-1}) q_values = self.calculate_q_vals(rews_list) # step 2: calculate advantages that correspond to each (s_t, a_t) point advantage_values = self.estimate_advantage(obs, q_values) # step 3: # TODO: pass the calculated values above into the actor/policy's update, # which will perform the actual PG update step # TODO: define the loss that should be optimized when training a policy with policy gradient # HINT1: Recall that the expression that we want to MAXIMIZE # is the expectation over collected trajectories of: # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]] # HINT2: see define_log_prob (above) # to get log pi(a_t|s_t) # HINT3: look for a placeholder above that will be populated with advantage values # to get [Q_t - b_t] # HINT4: don't forget that we need to MINIMIZE this self.loss # but the equation above is something that should be maximized # define the log probability of seen actions/observations under the current policy with tf.GradientTape() as tape: log_action_probas = self.actor.get_log_prob(obs, acs) advantage_values_no_grad = tf.stop_gradient(advantage_values) loss = -tf.reduce_mean( advantage_values_no_grad * log_action_probas) actor_vars = self.actor.trainable_variables grads = tape.gradient(loss, actor_vars) self.policy_optimizer.apply_gradients(zip(grads, actor_vars)) if self.nn_baseline: targets_n = (q_values - np.mean(q_values)) / (np.std(q_values) + 1e-8) dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(obs, tf.float32), tf.cast(targets_n, tf.float32))) dataset = dataset.batch(batch_size=targets_n.shape[0]).repeat() # 20 baseline gradient updates with the current data batch. self.baseline_model.fit(dataset, epochs=1, steps_per_epoch=20) return loss.numpy().item() def calculate_q_vals(self, rews_list): """ Monte Carlo estimation of the Q function. arguments: rews_list: length: number of sampled rollouts Each element corresponds to a particular rollout, and contains an array of the rewards for every step of that particular rollout returns: q_values: shape: (sum/total number of steps across the rollouts) Each entry corresponds to the estimated q(s_t,a_t) value of the corresponding obs/ac point at time t. 
""" # Case 1: trajectory-based PG if not self.reward_to_go: # TODO: Estimate the Q value Q^{pi}(s_t, a_t) using rewards from that entire trajectory # HINT1: value of each point (t) = total discounted reward summed over the entire trajectory (from 0 to T-1) # In other words, q(s_t, a_t) = sum_{t'=0}^{T-1} gamma^t' r_{t'} # Hint3: see the helper functions at the bottom of this file q_values = np.concatenate( [self._discounted_return(r) for r in rews_list]) # Case 2: reward-to-go PG else: # TODO: Estimate the Q value Q^{pi}(s_t, a_t) as the reward-to-go # HINT1: value of each point (t) = total discounted reward summed over the remainder of that trajectory # (from t to T-1) # In other words, q(s_t, a_t) = sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'} # Hint3: see the helper functions at the bottom of this file q_values = np.concatenate( [self._discounted_cumsum(r) for r in rews_list]) return q_values.astype(np.float32) def estimate_advantage(self, obs, q_values): """ Computes advantages by (possibly) subtracting a baseline from the estimated Q values """ # TODO: Estimate the advantage when nn_baseline is True # HINT1: pass obs into the neural network that you're using to learn the baseline # extra hint if you're stuck: see your actor's run_baseline_prediction # HINT2: advantage should be [Q-b] if self.nn_baseline: b_n_unnormalized = self.baseline_model(obs) b_n = b_n_unnormalized * np.std(q_values) + np.mean(q_values) adv_n = (q_values - tf.squeeze(b_n)).numpy() # Else, just set the advantage to [Q] else: adv_n = q_values.copy() # Normalize the resulting advantages if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n.astype(np.float32) ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False) ##################################################### ################## HELPER FUNCTIONS ################# ##################################################### def _discounted_return(self, rewards): """ Helper function Input: a list of rewards {r_0, r_1, ..., r_t', ... r_{T-1}} from a single rollout of length T Output: list where each index t contains sum_{t'=0}^{T-1} gamma^t' r_{t'} note that all entries of this output are equivalent because each index t is a sum from 0 to T-1 (and doesnt involve t) """ q = sum(reward * (self.gamma**t) for t, reward in enumerate(rewards)) return [q for _ in rewards] def _discounted_cumsum(self, rewards): """ Input: a list of length T a list of rewards {r_0, r_1, ..., r_t', ... r_{T-1}} from a single rollout of length T Output: a list of length T a list where the entry in each index t is sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'} """ all_discounted_cumsums = rewards.copy() for t in range(len(all_discounted_cumsums) - 1, 0, -1): all_discounted_cumsums[t - 1] += self.gamma * all_discounted_cumsums[t] return all_discounted_cumsums
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super().__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params['standardize_advantages'] self.n_drivers = self.agent_params['n_drivers'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size_ac'], self.agent_params['shared_exp'], self.agent_params['shared_exp_lambda'], self.agent_params['is_city'], self.agent_params['learning_rate'], self.agent_params['n_drivers'] ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # TODO Implement the following pseudocode: # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic loss = OrderedDict() for i in range(self.agent_params['num_critic_updates_per_agent_update']): if not self.agent_params['shared_exp']: loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n) else: action_distributions = self.actor.shared_forward(ptu.from_numpy(ob_no)) loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n, action_distributions) # advantage = estimate_advantage(...) if self.agent_params['shared_exp']: advantage = self.estimate_shared_advantage(ob_no, next_ob_no, re_n, terminal_n) else: advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor for i in range(self.agent_params['num_actor_updates_per_agent_update']): loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantage) return loss def estimate_shared_advantage(self, ob_no, next_ob_no, re_n, terminal_n): value_s = self.critic.shared_forward(ptu.from_numpy(ob_no)) value_next_s = self.critic.shared_forward(ptu.from_numpy(next_ob_no)) adv_n = dict() for i in range(self.n_drivers): for k in range(self.n_drivers): adv_n[(i,k)] = re_n[:,k] + self.gamma*ptu.to_numpy(value_next_s[(i,k)]) - ptu.to_numpy(value_s[(i,k)]) if self.standardize_advantages: adv_n[(i,k)] = (adv_n[(i,k)]- np.mean(adv_n[(i,k)]))/(np.std(adv_n[(i,k)])+1e-8) return adv_n def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # TODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) value_s = self.critic.forward_np(ob_no) value_next_s = self.critic.forward_np(next_ob_no) value_next_s[terminal_n==1] = 0 adv_n = re_n + self.gamma*value_next_s - value_s if self.standardize_advantages: for i in range(self.n_drivers): adv_n[:,i] = (adv_n[:,i] - np.mean(adv_n[:,i])) / (np.std(adv_n[:,i]) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class PPOAgent(BaseAgent): def __init__(self, env, agent_params): super(PPOAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.use_gae = self.agent_params['use_gae'] self.lam = self.agent_params['gae_lam'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.ppo_epochs = self.agent_params['ppo_epochs'] self.ppo_min_bacth_size = self.agent_params['ppo_min_batch_size'] # actor/policy self.actor = PPOPolicy( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['clip_eps'], self.agent_params['ent_coeff'], self.agent_params['max_grad_norm'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate_policyfn'], ) self.critic = PPOCritic(self.agent_params) # replay buffer self.replay_buffer = ReplayBuffer(1000000) def train(self, ob_no, ac_no, re_n, next_ob_no, terminal_n, logprobs): """ Training a PPO agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. """ # calculate advantages and target returs for value_function that correspond to each (s_t, a_t) point advantages, targets = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # step 1: use all datapoints (s_t, a_t, q_t, adv_t) to update the PPO actor/policy ## HINT: `train_log` should be returned by your actor update method loss = OrderedDict() #print(self.actor.parameters()) if self.ppo_min_bacth_size: n_batches = math.ceil(terminal_n.shape[0] / self.ppo_min_bacth_size) inds = np.arange(terminal_n.shape[0]) for _ in range(self.ppo_epochs): np.random.shuffle(inds) for i in range(n_batches): rand_indices = inds[slice( i * self.ppo_min_bacth_size, min(inds.shape[0], ((i + 1) * self.ppo_min_bacth_size)))] mb_ob_no = ob_no[rand_indices] mb_ac_no = ac_no[rand_indices] mb_adv = advantages[rand_indices] mb_targets = targets[rand_indices] mb_logprobs = logprobs[rand_indices] loss['critic_loss'] = self.critic.update( mb_ob_no, mb_targets) loss['agent_loss'] = self.actor.update( mb_ob_no, mb_ac_no, mb_adv, mb_logprobs) else: for _ in range(self.ppo_epochs): loss['critic_loss'] = self.critic.update(ob_no, targets) loss['agent_loss'] = self.actor.update(ob_no, ac_no, advantages, logprobs) return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): """ Computes advantages (both gae and standard) from the estimated Q values """ v_t = self.critic.forward_np(ob_no) v_tp1 = self.critic.forward_np(next_ob_no) if self.use_gae: last_gae = 0 gaes = np.zeros(re_n.shape[0]) for i in range(re_n.shape[0] - 1, -1, -1): next_value = v_tp1[i] value = v_t[i] delta = re_n[i] + (self.gamma * next_value * (1 - terminal_n[i])) - value last_gae = delta + self.gamma * self.lam * last_gae * ( 1 - terminal_n[i]) gaes[i] = last_gae valuefn_targets = gaes + v_t advantages = gaes else: q_value = re_n + self.gamma * (v_tp1 * (1 - terminal_n)) valuefn_targets = q_value advantages = q_value - v_t # Normalize the resulting advantages if self.standardize_advantages: advantages = normalize(advantages, np.mean(advantages), np.std(advantages)) return advantages, valuefn_targets ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size) 
##################################################### ################## HELPER FUNCTIONS ################# ##################################################### def save(self, path): torch.save( { "actor": self.actor.state_dict(), "critic": self.critic.state_dict(), "actor_optimizer": self.actor.optimizer.state_dict(), "critic_optimizer": self.critic.optimizer.state_dict() }, path)
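`PPOPolicy.update` above is driven by `clip_eps` and the stored `logprobs`, which suggests the standard clipped surrogate objective. A hedged sketch of that loss (the function name is hypothetical; advantages and old log-probabilities are assumed to be detached constants):

import torch

def ppo_policy_loss(new_log_prob, old_log_prob, advantages, clip_eps=0.2):
    # probability ratio pi_new(a|s) / pi_old(a|s)
    ratio = torch.exp(new_log_prob - old_log_prob)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    # pessimistic bound, negated so it can be minimized
    return -torch.min(unclipped, clipped).mean()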
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # TODO Implement the following pseudocode: # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic # ob_no = ptu.from_numpy(ob_no) # ac_na = ptu.from_numpy(ac_na).to(torch.long) # next_ob_no = ptu.from_numpy(next_ob_no) # re_n = ptu.from_numpy(re_n) # terminal_n = ptu.from_numpy(terminal_n) for _ in range( self.agent_params['num_critic_updates_per_agent_update']): critic_loss = self.critic.update(ob_no=ob_no, ac_na=ac_na, reward_n=re_n, next_ob_no=next_ob_no, terminal_n=terminal_n) # targets = re_n + self.gamma * self.critic(next_ob_no) * (1-terminal_n) # pred = self.critic(ob_no) # #advantage = re_n + self.gamma * self.critic(next_ob_no) * (1-terminal_n) - self.critic(ob_no) # advantage = targets - pred advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor for _ in range( self.agent_params['num_actor_updates_per_agent_update']): actor_loss = self.actor.update(ob_no, ac_na, adv_n=advantage) loss = OrderedDict() loss['Critic_Loss'] = critic_loss loss['Actor_Loss'] = actor_loss return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # TODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) value_targets = re_n + self.gamma * self.critic(next_ob_no) * ( 1. - terminal_n) value_pred = self.critic(ob_no) #advantage = re_n + self.gamma * self.critic(next_ob_no) * (1-terminal_n) - self.critic(ob_no) adv_n = value_targets - value_pred if self.standardize_advantages: adv_n = (adv_n - torch.mean(adv_n)) / (torch.std(adv_n) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class TRPOAgent(BaseAgent): def __init__(self, env, agent_params): super(TRPOAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.use_gae = self.agent_params['use_gae'] self.lam = self.agent_params['gae_lam'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] # actor/policy self.actor = TRPOPolicy( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['cg_steps'], self.agent_params['damping'], self.agent_params['max_backtracks'], self.agent_params['max_kl_increment'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], ) self.critic = TRPOCritic(self.agent_params) # replay buffer self.replay_buffer = ReplayBuffer(1000000) def train(self, ob_no, ac_no, re_n, next_ob_no, terminal_n): """ Training a TRPO agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. """ # calculate advantages and target returs for value_function that correspond to each (s_t, a_t) point advantages, targets = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) loss = OrderedDict() #print(self.actor.parameters()) loss['critic_loss'] = self.critic.update(ob_no, targets) loss['agent_loss'] = self.actor.update(ob_no, ac_no, advantages) return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): """ Computes advantages (both gae and standard) from the estimated Q values """ v_t = self.critic.forward_np(ob_no) v_tp1 = self.critic.forward_np(next_ob_no) if self.use_gae: last_gae = 0 gaes = np.zeros(re_n.shape[0]) for i in range(re_n.shape[0] - 1, -1, -1): next_value = v_tp1[i] value = v_t[i] delta = re_n[i] + (self.gamma * next_value * (1 - terminal_n[i])) - value last_gae = delta + self.gamma * self.lam * last_gae * ( 1 - terminal_n[i]) gaes[i] = last_gae valuefn_targets = gaes + v_t advantages = gaes else: q_value = re_n + self.gamma * (v_tp1 * (1 - terminal_n)) valuefn_targets = q_value advantages = q_value - v_t # Normalize the resulting advantages if self.standardize_advantages: advantages = normalize(advantages, np.mean(advantages), np.std(advantages)) return advantages, valuefn_targets ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size) ##################################################### ################## HELPER FUNCTIONS ################# ##################################################### def save(self, path): torch.save( { "actor": self.actor.state_dict(), "critic": self.critic.state_dict(), "actor_optimizer": self.actor.optimizer.state_dict(), "critic_optimizer": self.critic.optimizer.state_dict() }, path)
class ACAgent: def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.num_critic_updates_per_agent_update = agent_params[ 'num_critic_updates_per_agent_update'] self.num_actor_updates_per_agent_update = agent_params[ 'num_actor_updates_per_agent_update'] self.device = agent_params['device'] self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['device'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], ) # introduced in actor-critic to improve advantage function. self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): ob, next_ob, rew, done = map( lambda x: torch.from_numpy(x).to(self.device), [ob_no, next_ob_no, re_n, terminal_n]) # DoneTODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) v_s = self.critic.value_func(ob) v_s_prime = self.critic.value_func(next_ob).squeeze() v_s_prime[done >= 1] = 0 estimated_q = rew + self.gamma * v_s_prime adv_n = estimated_q - v_s adv_n = adv_n.cpu().detach().numpy() if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # DoneTODO Implement the following pseudocode: # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic # advantage = estimate_advantage(...) # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor loss = OrderedDict() for i in range(self.num_critic_updates_per_agent_update): loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n) adv = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) for i in range(self.num_actor_updates_per_agent_update): loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv) return loss def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
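The `MLPPolicyAC.update(ob_no, ac_na, adv)` calls used throughout this section are likewise not shown. A minimal sketch of an advantage-weighted policy-gradient step, assuming the policy's forward pass returns a `torch.distributions` object (for multi-dimensional continuous actions the log-probability may additionally need a sum over the action dimension); the helper name and signature are hypothetical:

import torch

def actor_update(policy, optimizer, ob, ac, adv):
    # maximize E[log pi(a|s) * A(s, a)] by minimizing its negative
    adv = torch.as_tensor(adv, dtype=torch.float32)
    log_prob = policy(ob).log_prob(ac)
    loss = -(log_prob * adv).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()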
class PGAgent(BaseAgent): def __init__(self, env, agent_params): super(PGAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params["gamma"] self.standardize_advantages = self.agent_params["standardize_advantages"] self.nn_baseline = self.agent_params["nn_baseline"] self.reward_to_go = self.agent_params["reward_to_go"] # actor/policy self.actor = MLPPolicyPG( self.agent_params["ac_dim"], self.agent_params["ob_dim"], self.agent_params["n_layers"], self.agent_params["size"], discrete=self.agent_params["discrete"], learning_rate=self.agent_params["learning_rate"], nn_baseline=self.agent_params["nn_baseline"], ) # replay buffer self.replay_buffer = ReplayBuffer(1000000) def train(self, observations, actions, rewards_list, next_observations, terminals): """ Training a PG agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. """ # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T) q_values = self.calculate_q_vals(rewards_list) # step 2: calculate advantages that correspond to each (s_t, a_t) point advantages = self.estimate_advantage(observations, q_values) # step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy train_log = self.actor.update(observations, actions, advantages, q_values) return train_log def calculate_q_vals(self, rewards_list): """ Monte Carlo estimation of the Q function. """ # Case 1: trajectory-based PG # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over entire trajectory if not self.reward_to_go: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'} q_values = np.concatenate( [self._discounted_return(r) for r in rewards_list] ) # Case 2: reward-to-go PG # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t else: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'} q_values = np.concatenate( [self._discounted_cumsum(r) for r in rewards_list] ) return q_values def estimate_advantage(self, obs, q_values): """ Computes advantages by (possibly) subtracting a baseline from the estimated Q values """ # Estimate the advantage when nn_baseline is True, # by querying the neural network that you're using to learn the baseline if self.nn_baseline: baselines_unnormalized = self.actor.run_baseline_prediction(obs) ## ensure that the baseline and q_values have the same dimensionality ## to prevent silent broadcasting errors assert baselines_unnormalized.ndim == q_values.ndim ## baseline was trained with standardized q_values, so ensure that the predictions ## have the same mean and standard deviation as the current batch of q_values baselines = baselines_unnormalized * np.std(q_values) + np.mean(q_values) ## TODO: compute advantage estimates using q_values and baselines advantages = q_values - baselines # Else, just set the advantage to [Q] else: advantages = q_values.copy() # Normalize the resulting advantages if self.standardize_advantages: ## standardize the advantages to have a mean of zero ## and a standard deviation of one advantages = normalize(advantages) return advantages ##################################################### 
    #####################################################

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False)

    #####################################################
    ################## HELPER FUNCTIONS #################
    #####################################################

    def _discounted_return(self, rewards):
        """
        Helper function

        Input: list of rewards {r_0, r_1, ..., r_t', ... r_T}
            from a single rollout of length T

        Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'}
        """
        T = rewards.shape[0]
        discount_factors = np.power(self.gamma, np.arange(T))
        discounted_rewards = rewards * discount_factors
        ret = np.sum(discounted_rewards)
        return np.repeat(ret, T)

    def _discounted_cumsum(self, rewards):
        """
        Helper function which
        - takes a list of rewards {r_0, r_1, ..., r_t', ... r_T},
        - and returns a list where the entry in each index t is
          sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        """
        # HINT1: note that each entry of the output should now be unique,
        # because the summation happens over [t, T] instead of [0, T]
        # HINT2: it is possible to write a vectorized solution, but a solution
        # using a for loop is also fine
        T = rewards.shape[0]
        discount_factors = np.power(self.gamma, np.arange(T))
        discounted_rewards = rewards * discount_factors
        # We can write RTG(t) = (sum_{t'=t}^T gamma^t' r_{t'}) / gamma^t
        # Need a cumulative sum from the right, i.e. flip -> cumsum -> flip
        partial_sums = np.flip(np.cumsum(np.flip(discounted_rewards)))
        rewards_to_go = partial_sums / discount_factors
        return rewards_to_go
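# A quick, standalone sanity check (not part of the agent) for the vectorized
# reward-to-go trick above: RTG(t) = (sum_{t'>=t} gamma^t' r_{t'}) / gamma^t.
# The reward values here are arbitrary and only serve to compare the
# flip -> cumsum -> flip version against a naive double loop.
import numpy as np

gamma = 0.9
rewards = np.array([1.0, 0.0, 2.0, 3.0])
T = rewards.shape[0]

# vectorized version (same idea as _discounted_cumsum above)
discounts = np.power(gamma, np.arange(T))
rtg_vectorized = np.flip(np.cumsum(np.flip(rewards * discounts))) / discounts

# naive reference: sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
rtg_naive = np.array([
    sum(gamma ** (tp - t) * rewards[tp] for tp in range(t, T))
    for t in range(T)
])

assert np.allclose(rtg_vectorized, rtg_naive)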
class PGAgent(BaseAgent):
    def __init__(self, sess, env, agent_params):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.sess = sess
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']
        self.nn_baseline = self.agent_params['nn_baseline']
        self.reward_to_go = self.agent_params['reward_to_go']

        # actor/policy
        # NOTICE that we are using MLPPolicyPG (hw2), instead of MLPPolicySL (hw1),
        # which indicates a similar network structure (layout/inputs/outputs),
        # but a different training procedure
        # (supervised learning vs. policy gradients)
        self.actor = MLPPolicyPG(
            sess,
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            nn_baseline=self.agent_params['nn_baseline'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)

    def train(self, obs, acs, rews_list, next_obs, terminals):
        """
        Training a PG agent refers to updating its actor using the given observations/actions
        and the calculated qvals/advantages that come from the seen rewards.

        ----------------------------------------------------------------------------------

        Recall that the expression for the policy gradient PG is

            PG = E_{tau} [sum_{t=0}^{T-1} grad log pi(a_t|s_t) * (Q_t - b_t)]

            where
            tau = (s_0, a_0, s_1, a_1, s_2, a_2, ...) is a trajectory,
            Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
            b_t is a baseline which may depend on s_t,
            and (Q_t - b_t) is the advantage.

        Thus, the PG update performed by the actor needs (s_t, a_t, q_t, adv_t),
        and that is exactly what this function provides.

        ----------------------------------------------------------------------------------
        """

        # step 1: calculate q values of each (s_t, a_t) point,
        # using rewards from that full rollout of length T: (r_0, ..., r_t, ..., r_{T-1})
        q_values = self.calculate_q_vals(rews_list)

        # step 2: calculate advantages that correspond to each (s_t, a_t) point
        advantage_values = self.estimate_advantage(obs, q_values)

        # step 3: pass the calculated values above into the actor/policy's update,
        # which performs the actual PG update step
        loss = self.actor.update(obs, acs, qvals=q_values, adv_n=advantage_values)
        return loss

    def calculate_q_vals(self, rews_list):
        """
        Monte Carlo estimation of the Q function.

        arguments:
            rews_list: length: number of sampled rollouts
                Each element corresponds to a particular rollout,
                and contains an array of the rewards for every step of that particular rollout

        returns:
            q_values: shape: (sum/total number of steps across the rollouts)
                Each entry corresponds to the estimated q(s_t, a_t) value
                of the corresponding obs/ac point at time t.
""" # Case 1: trajectory-based PG if not self.reward_to_go: # TODO: Estimate the Q value Q^{pi}(s_t, a_t) using rewards from that entire trajectory # HINT1: value of each point (t) = total discounted reward summed over the entire trajectory (from 0 to T-1) # In other words, q(s_t, a_t) = sum_{t'=0}^{T-1} gamma^t' r_{t'} # Hint3: see the helper functions at the bottom of this file q_values = np.concatenate([TODO for r in rews_list]) # Case 2: reward-to-go PG else: # TODO: Estimate the Q value Q^{pi}(s_t, a_t) as the reward-to-go # HINT1: value of each point (t) = total discounted reward summed over the remainder of that trajectory (from t to T-1) # In other words, q(s_t, a_t) = sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'} # Hint3: see the helper functions at the bottom of this file q_values = np.concatenate([TODO for r in rews_list]) return q_values def estimate_advantage(self, obs, q_values): """ Computes advantages by (possibly) subtracting a baseline from the estimated Q values """ # TODO: Estimate the advantage when nn_baseline is True # HINT1: pass obs into the neural network that you're using to learn the baseline # extra hint if you're stuck: see your actor's run_baseline_prediction # HINT2: advantage should be [Q-b] if self.nn_baseline: b_n_unnormalized = TODO b_n = b_n_unnormalized * np.std(q_values) + np.mean(q_values) adv_n = TODO # Else, just set the advantage to [Q] else: adv_n = q_values.copy() # Normalize the resulting advantages if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False) ##################################################### ################## HELPER FUNCTIONS ################# ##################################################### # TODO: implement this function def _discounted_return(self, rewards): """ Helper function Input: a list of rewards {r_0, r_1, ..., r_t', ... r_{T-1}} from a single rollout of length T Output: list where each index t contains sum_{t'=0}^{T-1} gamma^t' r_{t'} note that all entries of this output are equivalent because each index t is a sum from 0 to T-1 (and doesnt involve t) """ # 1) create a list of indices (t'): from 0 to T-1 indices = TODO # 2) create a list where the entry at each index (t') is gamma^(t') discounts = TODO # 3) create a list where the entry at each index (t') is gamma^(t') * r_{t'} discounted_rewards = TODO # 4) calculate a scalar: sum_{t'=0}^{T-1} gamma^(t') * r_{t'} sum_of_discounted_rewards = TODO # 5) create a list of length T-1, where each entry t contains that scalar list_of_discounted_returns = TODO return list_of_discounted_returns def _discounted_cumsum(self, rewards): """ Input: a list of length T a list of rewards {r_0, r_1, ..., r_t', ... 
            r_{T-1}} from a single rollout of length T

        Output:
            a list of length T
            a list where the entry in each index t is sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
        """
        all_discounted_cumsums = []

        # for loop over steps (t) of the given rollout
        for start_time_index in range(len(rewards)):
            # 1) create a list of indices (t'): goes from t to T-1
            indices = np.arange(start_time_index, len(rewards))
            # 2) create a list where the entry at each index (t') is gamma^(t'-t)
            discounts = np.power(self.gamma, indices - start_time_index)
            # 3) create a list where the entry at each index (t') is gamma^(t'-t) * r_{t'}
            #    (t' goes from t to T-1, so use the rewards from those indices as well)
            discounted_rtg = discounts * np.array(rewards[start_time_index:])
            # 4) calculate a scalar: sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
            sum_discounted_rtg = np.sum(discounted_rtg)
            # append each of these calculated sums into the list to return
            all_discounted_cumsums.append(sum_discounted_rtg)

        list_of_discounted_cumsums = np.array(all_discounted_cumsums)
        return list_of_discounted_cumsums
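# A tiny, standalone illustration (hypothetical numbers, not part of the agent)
# of the difference between the two Q-value estimators above: with gamma = 1
# and rewards [1, 2, 3], the full-trajectory estimator assigns the same total
# return to every step, while reward-to-go only sums rewards from t onward.
import numpy as np

rewards = np.array([1.0, 2.0, 3.0])
full_trajectory = np.repeat(rewards.sum(), len(rewards))   # -> [6., 6., 6.]
reward_to_go = np.flip(np.cumsum(np.flip(rewards)))        # -> [6., 5., 3.]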
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params, **kwargs):
        super(ACAgent, self).__init__()
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        # actor/policy
        if self.agent_params['discrete']:
            self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                           self.agent_params['ob_dim'],
                                           self.agent_params['n_layers'],
                                           self.agent_params['size'])
        else:
            self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                             self.agent_params['ob_dim'],
                                             self.agent_params['n_layers'],
                                             self.agent_params['size'])
        self.policy_optimizer = tf.keras.optimizers.Adam(learning_rate=self.agent_params['learning_rate'])

        self.critic = BootstrappedContinuousCritic(self.agent_params)
        self.critic_loss = tf.keras.losses.MeanSquaredError()
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=self.agent_params['learning_rate'])
        self.critic.nn_critic.compile(optimizer=self.critic_optimizer, loss=self.critic_loss)

        self.replay_buffer = ReplayBuffer()

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no, to get V(s)
        # 2) query the critic with next_ob_no, to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
        #    (cut off the V(s') term, i.e. set it to 0, at terminal states where terminal_n=1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        current_state_values = self.critic(ob_no)
        next_state_values = self.gamma * self.critic(next_ob_no) * (
            1.0 - tf.expand_dims(tf.cast(terminal_n, tf.float32), axis=1))
        adv_n = next_state_values + re_n - current_state_values
        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # for agent_params['num_critic_updates_per_agent_update'] steps, update the critic
        # advantage = estimate_advantage(...)
        # for agent_params['num_actor_updates_per_agent_update'] steps, update the actor
        critic_history = None
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            for _ in range(self.agent_params['num_target_updates']):
                critic_targets = self.critic.get_training_targets(next_ob_no, re_n, terminal_n, self.gamma)
                critic_dataset = tf.data.Dataset.from_tensor_slices(
                    (tf.cast(ob_no, tf.float32), tf.cast(critic_targets, tf.float32)))
                critic_dataset = critic_dataset.batch(batch_size=critic_targets.shape[0]).repeat()
                critic_history = self.critic.nn_critic.fit(
                    critic_dataset, epochs=1,
                    steps_per_epoch=self.agent_params['num_grad_steps_per_target_update'])

        advantage = tf.stop_gradient(self.estimate_advantage(
            ob_no, next_ob_no, tf.expand_dims(re_n, axis=1), terminal_n))

        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            with tf.GradientTape() as tape:
                log_action_probas = self.actor.get_log_prob(ob_no, ac_na)
                loss = -tf.reduce_mean(advantage * tf.expand_dims(log_action_probas, axis=1))
            actor_vars = self.actor.trainable_variables
            grads = tape.gradient(loss, actor_vars)
            self.policy_optimizer.apply_gradients(zip(grads, actor_vars))

        loss_dict = OrderedDict()
        # final critic loss (last value recorded by Keras during the last fit call)
        loss_dict['Critic_Loss'] = critic_history.history['loss'][-1] if critic_history is not None else 0
        # final actor loss
        loss_dict['Actor_Loss'] = loss.numpy().item()
        return loss_dict

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
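# A minimal sketch (standalone, with made-up numbers) of the actor loss used in
# the GradientTape block above: minimizing -mean(advantage * log pi(a|s)) pushes
# up the log-probability of actions with positive advantage and pushes down
# the log-probability of actions with negative advantage.
import numpy as np

log_probs = np.array([-0.5, -1.2, -0.1])     # hypothetical log pi(a_t | s_t)
advantages = np.array([2.0, -1.0, 0.5])      # hypothetical A(s_t, a_t)

actor_loss = -np.mean(advantages * log_probs)
# the gradient w.r.t. each log-prob is -advantage / N, so gradient descent
# increases log pi for positive-advantage actions and decreases it otherwise
grad_wrt_log_probs = -advantages / len(log_probs)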