class PGAgent(BaseAgent):
    def __init__(self, env, agent_params):
        # Store the environment handle and the hyperparameter dict, then build
        # the policy (actor) and the replay buffer used by this agent.
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        # NOTE(review): the next statement is cut off in this copy of the file
        # (the subscript key and closing bracket are missing) -- restore it
        # before this module can be imported.
        self.standardize_advantages = self.agent_params[
        self.nn_baseline = self.agent_params['nn_baseline']
        self.reward_to_go = self.agent_params['reward_to_go']

        # actor/policy
        # NOTE(review): the MLPPolicyPG(...) argument list is missing here;
        # recover it from the policy class's constructor signature.
        self.actor = MLPPolicyPG(

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)

    def train(self, observations, actions, rewards_list, next_observations,
              terminals):
        """
            Training a PG agent refers to updating its actor using the given observations/actions
            and the calculated qvals/advantages that come from the seen rewards.

            Q-value estimates are computed from `rewards_list`, turned into
            advantages, and both are handed to the actor's update method.
            `next_observations` and `terminals` are accepted but not used here.

            Returns whatever `self.actor.update` returns (the training log).
        """
        # NOTE(review): the signature and the update call below were truncated
        # in the source copy; `terminals` and the trailing `q_values` argument
        # are reconstructions -- confirm against callers and the actor's
        # `update` signature.

        # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T)
        q_values = self.calculate_q_vals(rewards_list)

        # step 2: calculate advantages that correspond to each (s_t, a_t) point
        advantages = self.estimate_advantage(observations, q_values)

        # step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy
        train_log = self.actor.update(observations, actions, advantages,
                                      q_values)

        return train_log

    def calculate_q_vals(self, rewards_list):
        """
            Monte Carlo estimation of the Q function.

            `rewards_list` is an iterable with one reward sequence per rollout.
            The per-rollout estimates are concatenated into a single flat array
            and returned.
        """
        if self.reward_to_go:
            # Case 2: reward-to-go PG
            # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t:
            # value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
            per_rollout = self._discounted_cumsum
        else:
            # Case 1: trajectory-based PG
            # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over the
            # entire trajectory: value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'}
            per_rollout = self._discounted_return

        q_values = np.concatenate([per_rollout(r) for r in rewards_list])
        return q_values

    def estimate_advantage(self, obs, q_values):
        """
            Computes advantages by (possibly) subtracting a baseline from the estimated Q values

            When `self.nn_baseline` is set, the baseline comes from
            `self.actor.run_baseline_prediction(obs)`; otherwise the advantages
            are a copy of `q_values`. When `self.standardize_advantages` is set,
            the result is shifted to zero mean and scaled to unit standard
            deviation before being returned.
        """
        if self.nn_baseline:
            # Estimate the advantage by querying the neural network that is
            # used to learn the baseline
            baselines_unnormalized = self.actor.run_baseline_prediction(obs)
            ## ensure that the baseline and q_values have the same dimensionality
            ## to prevent silent broadcasting errors
            assert baselines_unnormalized.ndim == q_values.ndim
            ## baseline was trained with standardized q_values, so ensure that the predictions
            ## have the same mean and standard deviation as the current batch of q_values
            baselines = (baselines_unnormalized * np.std(q_values)
                         + np.mean(q_values))
            advantages = q_values - baselines
        else:
            # Else, just set the advantage to [Q]
            advantages = q_values.copy()

        # Normalize the resulting advantages
        if self.standardize_advantages:
            # The small epsilon keeps the division finite when every
            # advantage in the batch is identical (std == 0).
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        return advantages


    def add_to_replay_buffer(self, paths):
        # NOTE(review): the body of this method is missing from this copy of
        # the file. Presumably it forwards `paths` to `self.replay_buffer` --
        # confirm against the ReplayBuffer API and restore it.

    def sample(self, batch_size):
        """Return the most recent data from the replay buffer.

        Forwards `batch_size` to `self.replay_buffer.sample_recent_data` and
        returns its result unchanged.
        """
        # NOTE(review): the call was truncated after `batch_size,` in the
        # source copy; `concat_rew=False` is restored to match the other
        # `sample_recent_data` call in this file -- confirm.
        return self.replay_buffer.sample_recent_data(batch_size,
                                                     concat_rew=False)

    ################## HELPER FUNCTIONS #################

    def _discounted_return(self, rewards):
        """
            Helper function

            Input: list of rewards {r_0, r_1, ..., r_t', ... r_T} from a single rollout of length T

            Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'}
        """
        # All entries of the output are equal because each sum runs over the
        # whole rollout (from 0 to T) and does not involve t.
        total = sum(self.gamma ** t * rew for t, rew in enumerate(rewards))
        return [total] * len(rewards)

    def _discounted_cumsum(self, rewards):
        """
            Helper function which
            -takes a list of rewards {r_0, r_1, ..., r_t', ... r_T},
            -and returns a list where the entry in each index t is sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        """
        rewards_to_go = []
        running = 0
        # Walk the rollout backwards so each step reuses the already-discounted
        # tail: G_t = r_t + gamma * G_{t+1}.
        for rew in reversed(rewards):
            running = running * self.gamma + rew
            rewards_to_go.append(running)
        # Entries were produced last-step-first; flip back to time order.
        rewards_to_go.reverse()
        return rewards_to_go
class PGAgent(BaseAgent):
    def __init__(self, sess, env, agent_params):
        # Store the session, environment handle and hyperparameter dict, then
        # build the policy (actor) and the replay buffer used by this agent.
        super(PGAgent, self).__init__()

        # init vars
        self.env = env 
        self.sess = sess
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.nn_baseline = self.agent_params['nn_baseline'] 
        self.reward_to_go = self.agent_params['reward_to_go'] 

        # actor/policy
        # NOTICE that we are using MLPPolicyPG (hw2), instead of MLPPolicySL (hw1)
            # which indicates similar network structure (layout/inputs/outputs), 
            # but differences in training procedure 
            # between supervised learning and policy gradients
        # NOTE(review): the MLPPolicyPG(...) call below is cut off after `sess,`
        # in this copy of the file; recover the remaining arguments from the
        # policy class's constructor signature.
        self.actor = MLPPolicyPG(sess, 

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)

    def train(self, obs, acs, rews_list, next_obs, terminals):
        """
            Training a PG agent refers to updating its actor using the given observations/actions
            and the calculated qvals/advantages that come from the seen rewards.

            Recall that the expression for the policy gradient PG is
                PG = E_{tau} [sum_{t=0}^{T-1} grad log pi(a_t|s_t) * (Q_t - b_t )]
            where
                tau=(s_0, a_0, s_1, a_1, s_2, a_2, ...) is a trajectory,
                Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
                b_t is a baseline which may depend on s_t,
                and (Q_t - b_t ) is the advantage.

            Thus, the PG update performed by the actor needs (s_t, a_t, q_t, adv_t),
                and that is exactly what this function provides.

            `next_obs` and `terminals` are accepted but not used here.
            Returns whatever `self.actor.update` returns.
        """
        # step 1: calculate q values of each (s_t, a_t) point,
        # using rewards from that full rollout of length T: (r_0, ..., r_t, ..., r_{T-1})
        q_values = self.calculate_q_vals(rews_list)

        # step 2: calculate advantages that correspond to each (s_t, a_t) point
        advantage_values = self.estimate_advantage(obs, q_values)

        # step 3: pass the calculated values into the actor/policy's update,
        # which performs the actual PG update step
        loss = self.actor.update(obs, acs, qvals=q_values, adv_n=advantage_values)
        return loss

    def calculate_q_vals(self, rews_list):
        """
            Monte Carlo estimation of the Q function.

            arguments:
                rews_list: length: number of sampled rollouts
                    Each element corresponds to a particular rollout,
                    and contains an array of the rewards for every step of that particular rollout

            returns:
                q_values: shape: (sum/total number of steps across the rollouts)
                    Each entry corresponds to the estimated q(s_t,a_t) value
                    of the corresponding obs/ac point at time t.
        """
        if self.reward_to_go:
            # Case 2: reward-to-go PG
            # value of each point (t) = total discounted reward summed over the
            # remainder of that trajectory (from t to T-1), i.e.
            # q(s_t, a_t) = sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
            per_rollout = self._discounted_cumsum
        else:
            # Case 1: trajectory-based PG
            # value of each point (t) = total discounted reward summed over the
            # entire trajectory (from 0 to T-1), i.e.
            # q(s_t, a_t) = sum_{t'=0}^{T-1} gamma^t' r_{t'}
            per_rollout = self._discounted_return

        q_values = np.concatenate([per_rollout(r) for r in rews_list])
        return q_values

    def estimate_advantage(self, obs, q_values):
        """
            Computes advantages by (possibly) subtracting a baseline from the estimated Q values

            When `self.nn_baseline` is set, the baseline comes from
            `self.actor.run_baseline_prediction(obs)`, rescaled to the mean and
            standard deviation of `q_values`; otherwise the advantages are a
            copy of `q_values`. When `self.standardize_advantages` is set, the
            result is shifted to zero mean and scaled to unit standard
            deviation before being returned.
        """
        if self.nn_baseline:
            # advantage is [Q - b], with b predicted from obs by the baseline network
            b_n_unnormalized = self.actor.run_baseline_prediction(obs)
            b_n = b_n_unnormalized * np.std(q_values) + np.mean(q_values)
            adv_n = q_values - b_n
        else:
            # Else, just set the advantage to [Q]
            adv_n = q_values.copy()

        # Normalize the resulting advantages
        if self.standardize_advantages:
            # The small epsilon keeps the division finite when every
            # advantage in the batch is identical (std == 0).
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)

        return adv_n


    def add_to_replay_buffer(self, paths):
        # NOTE(review): the body of this method is missing from this copy of
        # the file. Presumably it forwards `paths` to `self.replay_buffer` --
        # confirm against the ReplayBuffer API and restore it.

    def sample(self, batch_size):
        """Return the replay buffer's most recent data for `batch_size`.

        The request is forwarded to `self.replay_buffer.sample_recent_data`
        with `concat_rew=False`, and its result is returned unchanged.
        """
        recent_batch = self.replay_buffer.sample_recent_data(
            batch_size,
            concat_rew=False,
        )
        return recent_batch

    ################## HELPER FUNCTIONS #################

    # TODO: implement this function
    def _discounted_return(self, rewards):
        """
            Helper function

            Input: a list of rewards {r_0, r_1, ..., r_t', ... r_{T-1}} from a single rollout of length T

            Output: list where each index t contains sum_{t'=0}^{T-1} gamma^t' r_{t'}
                note that all entries of this output are equivalent
                because each index t is a sum from 0 to T-1 (and doesnt involve t)
        """
        horizon = len(rewards)

        # gamma^(t') for every index t' from 0 to T-1
        discounts = np.power(self.gamma, np.arange(horizon))

        # scalar: sum_{t'=0}^{T-1} gamma^(t') * r_{t'}
        sum_of_discounted_rewards = np.sum(np.multiply(discounts, rewards))

        # list of length T, where every entry t contains that same scalar
        return [sum_of_discounted_rewards] * horizon

    def _discounted_cumsum(self, rewards):
        """
            Input:
                a list of length T
                a list of rewards {r_0, r_1, ..., r_t', ... r_{T-1}} from a single rollout of length T
            Output:
                a list of length T
                a list where the entry in each index t is sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}

            The result is returned as a numpy array of floats.
        """
        horizon = len(rewards)
        list_of_discounted_cumsums = np.zeros(horizon)

        # Walk the rollout backwards so each step reuses the already-discounted
        # tail, G_t = r_t + gamma * G_{t+1}. This is O(T) instead of re-summing
        # the remainder of the rollout for every t.
        running = 0.0
        for t in range(horizon - 1, -1, -1):
            running = rewards[t] + self.gamma * running
            # Store each reward-to-go; the source copy had lost this step and
            # therefore always returned an empty array.
            list_of_discounted_cumsums[t] = running

        return list_of_discounted_cumsums
