# NOTE: these imports assume the assignment's usual package layout
# (agents/, policies/, infrastructure/); adjust the paths if your repo differs.
import numpy as np

from cs285.agents.base_agent import BaseAgent
from cs285.policies.MLP_policy import MLPPolicyPG
from cs285.infrastructure.replay_buffer import ReplayBuffer


class PGAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.nn_baseline = self.agent_params['nn_baseline']
        self.reward_to_go = self.agent_params['reward_to_go']

        # actor/policy
        self.actor = MLPPolicyPG(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            nn_baseline=self.agent_params['nn_baseline'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)

    def train(self, observations, actions, rewards_list, next_observations, terminals):
        """
        Training a PG agent refers to updating its actor using the given
        observations/actions and the calculated qvals/advantages that come
        from the seen rewards.
        """
        # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T)
        q_values = self.calculate_q_vals(rewards_list)

        # step 2: calculate advantages that correspond to each (s_t, a_t) point
        advantages = self.estimate_advantage(observations, q_values)

        # step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy;
        # the actor's update method returns its training log
        train_log = self.actor.update(observations, actions, advantages, q_values)
        return train_log

    def calculate_q_vals(self, rewards_list):
        """
        Monte Carlo estimation of the Q function.
        """
        # Case 1: trajectory-based PG
        # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over the entire trajectory
        if not self.reward_to_go:
            # For each point (s_t, a_t), its value is the discounted sum of rewards over the full trajectory:
            # value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'}
            q_values = np.concatenate(
                [self._discounted_return(r) for r in rewards_list])

        # Case 2: reward-to-go PG
        # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t
        else:
            # For each point (s_t, a_t), its value is the discounted sum of rewards from t onward:
            # value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
            q_values = np.concatenate(
                [self._discounted_cumsum(r) for r in rewards_list])

        return q_values

    def estimate_advantage(self, obs, q_values):
        """
        Computes advantages by (possibly) subtracting a baseline from the estimated Q values
        """
        # Estimate the advantage when nn_baseline is True,
        # by querying the neural network that is learning the baseline
        if self.nn_baseline:
            baselines_unnormalized = self.actor.run_baseline_prediction(obs)

            # ensure that the baseline and q_values have the same dimensionality
            # to prevent silent broadcasting errors
            assert baselines_unnormalized.ndim == q_values.ndim

            # the baseline was trained with standardized q_values, so rescale its predictions
            # to have the same mean and standard deviation as the current batch of q_values
            baselines = baselines_unnormalized * np.std(q_values) + np.mean(q_values)

            # advantage estimates: A = Q - b
            advantages = q_values - baselines

        # Else, just set the advantage to [Q]
        else:
            advantages = q_values.copy()

        # Standardize the resulting advantages to have mean zero and standard deviation one
        # (equivalently, the `normalize` helper in `infrastructure.utils` could be used here)
        if self.standardize_advantages:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        return advantages

    #####################################################
    #####################################################

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False)

    #####################################################
    ################## HELPER FUNCTIONS #################
    #####################################################

    def _discounted_return(self, rewards):
        """
        Helper function

        Input: list of rewards {r_0, r_1, ..., r_t', ... r_T} from a single rollout of length T

        Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'}
        """
        # all entries of the output are identical because the sum runs
        # from 0 to T and does not depend on t
        out = sum(self.gamma**t * rew for t, rew in enumerate(rewards))
        return [out for _ in range(len(rewards))]

    def _discounted_cumsum(self, rewards):
        """
        Helper function which
        - takes a list of rewards {r_0, r_1, ..., r_t', ... r_T},
        - and returns a list where the entry at each index t is sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        """
        # each entry of the output is unique because the summation runs over [t, T]
        # rather than [0, T]; accumulate the discounted sum from the back of the rollout
        ret, q = [], 0
        for rew in reversed(rewards):
            ret.append(q * self.gamma + rew)
            q = ret[-1]
        return ret[::-1]
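# Standalone sanity check (illustrative only, not part of the agent above): compare the
# backward accumulation used in `_discounted_cumsum` against a naive O(T^2) double sum
# on a random reward sequence; the two should agree to numerical precision.
import numpy as np

rng = np.random.default_rng(0)
rewards = rng.normal(size=10)
gamma = 0.9

# backward accumulation: rtg[t] = r_t + gamma * rtg[t+1]
rtg_fast, running = [], 0.0
for rew in reversed(rewards):
    running = running * gamma + rew
    rtg_fast.append(running)
rtg_fast = np.array(rtg_fast[::-1])

# naive definition: rtg[t] = sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
rtg_naive = np.array([
    sum(gamma ** (tp - t) * rewards[tp] for tp in range(t, len(rewards)))
    for t in range(len(rewards))
])

assert np.allclose(rtg_fast, rtg_naive)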
class PGAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.nn_baseline = self.agent_params['nn_baseline']
        self.reward_to_go = self.agent_params['reward_to_go']

        # actor/policy
        self.actor = MLPPolicyPG(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            nn_baseline=self.agent_params['nn_baseline']
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)

    def train(self, observations, actions, rewards_list, next_observations, terminals):
        """
        Training a PG agent refers to updating its actor using the given
        observations/actions and the calculated qvals/advantages that come
        from the seen rewards.
        """
        # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T)
        q_values = self.calculate_q_vals(rewards_list)

        # step 2: calculate advantages that correspond to each (s_t, a_t) point
        advantages = self.estimate_advantage(observations, q_values)

        # step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy
        train_log = self.actor.update(observations, actions, advantages, q_values=q_values)
        return train_log

    def calculate_q_vals(self, rewards_list):
        """
        Monte Carlo estimation of the Q function.
        """
        # Case 1: trajectory-based PG
        # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over the entire trajectory:
        # value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'}
        if not self.reward_to_go:
            q_values = np.concatenate([self._discounted_return(r) for r in rewards_list])

        # Case 2: reward-to-go PG
        # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t:
        # value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        else:
            q_values = np.concatenate([self._discounted_cumsum(r) for r in rewards_list])

        return q_values

    def estimate_advantage(self, obs, q_values):
        """
        Computes advantages by (possibly) subtracting a baseline from the estimated Q values
        """
        # Estimate the advantage when nn_baseline is True,
        # by querying the neural network that is learning the baseline
        if self.nn_baseline:
            baselines_unnormalized = self.actor.run_baseline_prediction(obs)

            # ensure that the baseline and q_values have the same dimensionality
            # to prevent silent broadcasting errors
            assert baselines_unnormalized.ndim == q_values.ndim

            # the baseline was trained with standardized q_values, so rescale its predictions
            # to have the same mean and standard deviation as the current batch of q_values
            baselines = baselines_unnormalized * np.std(q_values) + np.mean(q_values)
            advantages = q_values - baselines

        # Else, just set the advantage to [Q]
        else:
            advantages = q_values.copy()

        # Standardize the resulting advantages to have mean zero and standard deviation one
        if self.standardize_advantages:
            advantages = utils.normalize(advantages, np.mean(advantages), np.std(advantages))

        return advantages

    #####################################################
    #####################################################

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False)

    #####################################################
    ################## HELPER FUNCTIONS #################
    #####################################################

    def _discounted_return(self, rewards):
        """
        Helper function

        Input: list of rewards {r_0, r_1, ..., r_t', ... r_T} from a single rollout of length T

        Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'}
        """
        discounted_return = sum((self.gamma**t) * r for t, r in enumerate(rewards))
        return [discounted_return] * len(rewards)

    def _discounted_cumsum(self, rewards):
        """
        Helper function which
        - takes a list of rewards {r_0, r_1, ..., r_t', ... r_T},
        - and returns a list where the entry at each index t is sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        """
        discounted_returns_to_go = []
        for t in range(len(rewards)):
            return_to_go = sum((self.gamma**tp) * r for tp, r in enumerate(rewards[t:]))
            discounted_returns_to_go.append(return_to_go)
        return discounted_returns_to_go
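# The implementation above calls `utils.normalize(...)`, which lives in the assignment's
# infrastructure/utils.py and is not shown here. The sketch below is an assumed, minimal
# version of such a helper (name, signature, and epsilon are illustrative, not the
# assignment's actual code): shift to zero mean, scale to unit standard deviation.
import numpy as np

def normalize(data, mean, std, eps=1e-8):
    # standardize `data` to mean 0 / std 1 using the provided statistics;
    # eps guards against division by zero when all entries are equal
    return (data - mean) / (std + eps)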
class PGAgent(BaseAgent):
    def __init__(self, sess, env, agent_params):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.sess = sess
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.nn_baseline = self.agent_params['nn_baseline']
        self.reward_to_go = self.agent_params['reward_to_go']

        # actor/policy
        # NOTICE that we are using MLPPolicyPG (hw2) instead of MLPPolicySL (hw1),
        # which indicates a similar network structure (layout/inputs/outputs),
        # but a different training procedure:
        # supervised learning vs. policy gradients
        self.actor = MLPPolicyPG(sess,
                                 self.agent_params['ac_dim'],
                                 self.agent_params['ob_dim'],
                                 self.agent_params['n_layers'],
                                 self.agent_params['size'],
                                 discrete=self.agent_params['discrete'],
                                 learning_rate=self.agent_params['learning_rate'],
                                 nn_baseline=self.agent_params['nn_baseline'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)

    def train(self, obs, acs, rews_list, next_obs, terminals):
        """
        Training a PG agent refers to updating its actor using the given
        observations/actions and the calculated qvals/advantages that come
        from the seen rewards.

        ----------------------------------------------------------------------------------

        Recall that the expression for the policy gradient PG is

            PG = E_{tau} [sum_{t=0}^{T-1} grad log pi(a_t|s_t) * (Q_t - b_t)]

        where
            tau = (s_0, a_0, s_1, a_1, s_2, a_2, ...) is a trajectory,
            Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
            b_t is a baseline which may depend on s_t,
            and (Q_t - b_t) is the advantage.

        Thus, the PG update performed by the actor needs (s_t, a_t, q_t, adv_t),
        and that is exactly what this function provides.

        ----------------------------------------------------------------------------------
        """
        # step 1: calculate q values of each (s_t, a_t) point,
        # using rewards from that full rollout of length T: (r_0, ..., r_t, ..., r_{T-1})
        q_values = self.calculate_q_vals(rews_list)

        # step 2: calculate advantages that correspond to each (s_t, a_t) point
        advantage_values = self.estimate_advantage(obs, q_values)

        # step 3: pass the calculated values above into the actor/policy's update,
        # which performs the actual PG update step
        loss = self.actor.update(obs, acs, qvals=q_values, adv_n=advantage_values)
        return loss

    def calculate_q_vals(self, rews_list):
        """
        Monte Carlo estimation of the Q function.

        arguments:
            rews_list: length: number of sampled rollouts
                Each element corresponds to a particular rollout, and contains an array
                of the rewards for every step of that particular rollout

        returns:
            q_values: shape: (sum/total number of steps across the rollouts)
                Each entry corresponds to the estimated q(s_t, a_t) value
                of the corresponding obs/ac point at time t.
        """
        # Case 1: trajectory-based PG
        if not self.reward_to_go:
            # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over the entire
            # trajectory (from 0 to T-1): q(s_t, a_t) = sum_{t'=0}^{T-1} gamma^t' r_{t'}
            q_values = np.concatenate([self._discounted_return(r) for r in rews_list])

        # Case 2: reward-to-go PG
        else:
            # Estimate Q^{pi}(s_t, a_t) as the reward-to-go: the total discounted reward summed
            # over the remainder of the trajectory (from t to T-1):
            # q(s_t, a_t) = sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
            q_values = np.concatenate([self._discounted_cumsum(r) for r in rews_list])

        return q_values

    def estimate_advantage(self, obs, q_values):
        """
        Computes advantages by (possibly) subtracting a baseline from the estimated Q values
        """
        # Estimate the advantage when nn_baseline is True:
        # query the baseline network on obs (see the actor's run_baseline_prediction),
        # rescale its predictions to match the statistics of the current batch of q_values,
        # and set the advantage to [Q - b]
        if self.nn_baseline:
            b_n_unnormalized = self.actor.run_baseline_prediction(obs)
            b_n = b_n_unnormalized * np.std(q_values) + np.mean(q_values)
            adv_n = q_values - b_n

        # Else, just set the advantage to [Q]
        else:
            adv_n = q_values.copy()

        # Normalize the resulting advantages
        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)

        return adv_n

    #####################################################
    #####################################################

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False)

    #####################################################
    ################## HELPER FUNCTIONS #################
    #####################################################

    def _discounted_return(self, rewards):
        """
        Helper function

        Input: a list of rewards {r_0, r_1, ..., r_t', ... r_{T-1}}
            from a single rollout of length T

        Output: list where each index t contains sum_{t'=0}^{T-1} gamma^t' r_{t'}
            note that all entries of this output are equivalent,
            because each entry is a sum from 0 to T-1 (and doesn't involve t)
        """
        # 1) create a list of indices (t'): from 0 to T-1
        indices = list(range(len(rewards)))

        # 2) create a list where the entry at each index (t') is gamma^(t')
        discounts = np.power(self.gamma, indices)

        # 3) create a list where the entry at each index (t') is gamma^(t') * r_{t'}
        discounted_rewards = np.multiply(discounts, rewards)

        # 4) calculate a scalar: sum_{t'=0}^{T-1} gamma^(t') * r_{t'}
        sum_of_discounted_rewards = np.sum(discounted_rewards)

        # 5) create a list of length T, where every entry contains that scalar
        list_of_discounted_returns = [sum_of_discounted_rewards] * len(rewards)

        return list_of_discounted_returns

    def _discounted_cumsum(self, rewards):
        """
        Input:
            a list of length T of rewards {r_0, r_1, ..., r_t', ... r_{T-1}}
            from a single rollout of length T
        Output:
            a list of length T where the entry at each index t is
            sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
        """
        all_discounted_cumsums = []

        # for loop over steps (t) of the given rollout
        for start_time_index in range(len(rewards)):
            # 1) create a list of indices (t'): goes from t to T-1
            indices = list(range(start_time_index, len(rewards)))

            # 2) create a list where the entry at each index (t') is gamma^(t'-t)
            discounts = np.power(self.gamma, np.subtract(indices, start_time_index))

            # 3) create a list where the entry at each index (t') is gamma^(t'-t) * r_{t'}
            # (t' goes from t to T-1, so use the rewards from those same indices)
            discounted_rtg = np.multiply(discounts, rewards[start_time_index:])

            # 4) calculate a scalar: sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
            sum_discounted_rtg = np.sum(discounted_rtg)

            # append each of these calculated sums into the list to return
            all_discounted_cumsums.append(sum_discounted_rtg)

        list_of_discounted_cumsums = np.array(all_discounted_cumsums)
        return list_of_discounted_cumsums
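# Small standalone example (illustrative only) that contrasts the two Q-value estimators
# used in `calculate_q_vals`: with rewards [1, 1, 1] and gamma = 0.5, the trajectory-based
# estimate assigns the same total discounted return (1.75) to every step, while the
# reward-to-go estimate shrinks as t increases: [1.75, 1.5, 1.0].
import numpy as np

rewards = np.array([1.0, 1.0, 1.0])
gamma = 0.5
T = len(rewards)

# Case 1: trajectory-based, q(s_t, a_t) = sum_{t'=0}^{T-1} gamma^t' r_{t'} for every t
full_return = np.sum(rewards * gamma ** np.arange(T))
print(np.full(T, full_return))   # [1.75 1.75 1.75]

# Case 2: reward-to-go, q(s_t, a_t) = sum_{t'=t}^{T-1} gamma^(t'-t) r_{t'}
rtg = np.array([np.sum(rewards[t:] * gamma ** np.arange(T - t)) for t in range(T)])
print(rtg)                       # [1.75 1.5  1.  ]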
class PGAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params["gamma"]
        self.standardize_advantages = self.agent_params["standardize_advantages"]
        self.nn_baseline = self.agent_params["nn_baseline"]
        self.reward_to_go = self.agent_params["reward_to_go"]

        # actor/policy
        self.actor = MLPPolicyPG(
            self.agent_params["ac_dim"],
            self.agent_params["ob_dim"],
            self.agent_params["n_layers"],
            self.agent_params["size"],
            discrete=self.agent_params["discrete"],
            learning_rate=self.agent_params["learning_rate"],
            nn_baseline=self.agent_params["nn_baseline"],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)

    def train(self, observations, actions, rewards_list, next_observations, terminals):
        """
        Training a PG agent refers to updating its actor using the given
        observations/actions and the calculated qvals/advantages that come
        from the seen rewards.
        """
        # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T)
        q_values = self.calculate_q_vals(rewards_list)

        # step 2: calculate advantages that correspond to each (s_t, a_t) point
        advantages = self.estimate_advantage(observations, q_values)

        # step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy
        train_log = self.actor.update(observations, actions, advantages, q_values)
        return train_log

    def calculate_q_vals(self, rewards_list):
        """
        Monte Carlo estimation of the Q function.
        """
        # Case 1: trajectory-based PG
        # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over the entire trajectory:
        # value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'}
        if not self.reward_to_go:
            q_values = np.concatenate(
                [self._discounted_return(r) for r in rewards_list]
            )

        # Case 2: reward-to-go PG
        # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t:
        # value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        else:
            q_values = np.concatenate(
                [self._discounted_cumsum(r) for r in rewards_list]
            )

        return q_values

    def estimate_advantage(self, obs, q_values):
        """
        Computes advantages by (possibly) subtracting a baseline from the estimated Q values
        """
        # Estimate the advantage when nn_baseline is True,
        # by querying the neural network that is learning the baseline
        if self.nn_baseline:
            baselines_unnormalized = self.actor.run_baseline_prediction(obs)

            # ensure that the baseline and q_values have the same dimensionality
            # to prevent silent broadcasting errors
            assert baselines_unnormalized.ndim == q_values.ndim

            # the baseline was trained with standardized q_values, so rescale its predictions
            # to have the same mean and standard deviation as the current batch of q_values
            baselines = baselines_unnormalized * np.std(q_values) + np.mean(q_values)

            # advantage estimates: A = Q - b
            advantages = q_values - baselines

        # Else, just set the advantage to [Q]
        else:
            advantages = q_values.copy()

        # Standardize the resulting advantages to have mean zero and standard deviation one
        if self.standardize_advantages:
            advantages = normalize(advantages)

        return advantages

    #####################################################
    #####################################################

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False)

    #####################################################
    ################## HELPER FUNCTIONS #################
    #####################################################

    def _discounted_return(self, rewards):
        """
        Helper function

        Input: list of rewards {r_0, r_1, ..., r_t', ... r_T} from a single rollout of length T

        Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'}
        """
        T = rewards.shape[0]
        discount_factors = np.power(self.gamma, np.arange(T))
        discounted_rewards = rewards * discount_factors
        ret = np.sum(discounted_rewards)
        return np.repeat(ret, T)

    def _discounted_cumsum(self, rewards):
        """
        Helper function which
        - takes a list of rewards {r_0, r_1, ..., r_t', ... r_T},
        - and returns a list where the entry at each index t is sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        """
        # Each entry of the output is unique because the summation runs over [t, T]
        # instead of [0, T]. Vectorized solution:
        # RTG(t) = (sum_{t'=t}^T gamma^t' r_{t'}) / gamma^t,
        # so take a cumulative sum of the discounted rewards from the right
        # (flip -> cumsum -> flip), then divide by gamma^t.
        T = rewards.shape[0]
        discount_factors = np.power(self.gamma, np.arange(T))
        discounted_rewards = rewards * discount_factors
        partial_sums = np.flip(np.cumsum(np.flip(discounted_rewards)))
        rewards_to_go = partial_sums / discount_factors
        return rewards_to_go
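# Optional note (illustrative, not part of the assignment code): the flip/cumsum/flip
# trick above divides by gamma^t, which can lose precision or underflow for very long
# rollouts with gamma < 1, since gamma^t becomes vanishingly small. A common alternative
# vectorized form runs an IIR filter over the reversed reward sequence with
# scipy.signal.lfilter and never divides by small numbers.
import numpy as np
from scipy.signal import lfilter

def discounted_cumsum_lfilter(rewards, gamma):
    # y[t] = r[t] + gamma * y[t+1], computed by filtering the reversed reward
    # sequence and reversing the result back
    return lfilter([1.0], [1.0, -gamma], rewards[::-1])[::-1]

# quick check against a naive double sum on a short random rollout
rng = np.random.default_rng(1)
r, g = rng.normal(size=8), 0.99
naive = np.array([
    sum(g ** (tp - t) * r[tp] for tp in range(t, len(r)))
    for t in range(len(r))
])
assert np.allclose(discounted_cumsum_lfilter(r, g), naive)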