Example #1
    def __init__(self, filename, **kwargs):
        super().__init__(**kwargs)

        # Open the pickled expert policy.
        with open(filename, 'rb') as f:
            data = pickle.loads(f.read())

        # Define the hidden layer activation functions.
        self.nonlin_type = data['nonlin_type']
        if self.nonlin_type == 'lrelu':
            self.non_lin = nn.LeakyReLU(0.01)
        elif self.nonlin_type == 'tanh':
            self.non_lin = nn.Tanh()
        else:
            raise NotImplementedError()

        # Assert that this loaded policy is a "GaussianPolicy"
        policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]
        assert policy_type == 'GaussianPolicy', (
            'Policy type {} not supported'.format(policy_type)
        )
        self.policy_params = data[policy_type]

        # The loaded policy's parameters are stored in policy_params,
        # a dictionary with exactly these four entries.
        assert set(self.policy_params.keys()) == {
            'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'
        }

        # Build the policy. First, observation normalization.
        # Under the loaded policy, the observations are (approx) distributed as
        # N(obsnorm_mean, obsnorm_stdev)
        assert list(self.policy_params['obsnorm'].keys()) == ['Standardizer']
        obsnorm_mean = self.policy_params['obsnorm']['Standardizer']['mean_1_D']
        obsnorm_meansq = self.policy_params['obsnorm']['Standardizer'][
            'meansq_1_D']
        obsnorm_stdev = np.sqrt(
            np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
        print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)

        self.obs_norm_mean = nn.Parameter(ptu.from_numpy(obsnorm_mean))
        self.obs_norm_std = nn.Parameter(ptu.from_numpy(obsnorm_stdev))

        # Reconstruct the hidden layers from the loaded data.
        self.hidden_layers = nn.ModuleList()

        # The 'hidden' layers must be "FeedforwardNet" type
        # The layers are kept in `layer_params` dictionary, ordered by the keys.
        # They are read out, made into PyTorch layers, then appended, in order.
        assert list(self.policy_params['hidden'].keys()) == ['FeedforwardNet']
        layer_params = self.policy_params['hidden']['FeedforwardNet']
        for layer_name in sorted(layer_params.keys()):
            l = layer_params[layer_name]
            W, b = read_layer(l)
            linear_layer = create_linear_layer(W, b)
            self.hidden_layers.append(linear_layer)

        # Output layer (does not have an activation function).
        W, b = read_layer(self.policy_params['out'])
        self.output_layer = create_linear_layer(W, b)
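
Example #1 calls two helpers, read_layer and create_linear_layer, that are not shown. A minimal sketch of what they might look like, assuming each pickled layer is a dict of the form {'AffineLayer': {'W': ..., 'b': ...}} (hypothetical, not the code used above):

import numpy as np
import torch
import torch.nn as nn

def read_layer(l):
    # Assumption: each pickled layer is {'AffineLayer': {'W': ..., 'b': ...}}.
    assert list(l.keys()) == ['AffineLayer']
    return (l['AffineLayer']['W'].astype(np.float32),
            l['AffineLayer']['b'].astype(np.float32))

def create_linear_layer(W, b) -> nn.Linear:
    # Assumption: W is stored as (in_features, out_features), while nn.Linear
    # keeps its weight as (out_features, in_features), hence the transpose.
    out_features, in_features = W.shape[1], W.shape[0]
    layer = nn.Linear(in_features, out_features)
    layer.weight.data = torch.as_tensor(W.T, dtype=torch.float32)
    layer.bias.data = torch.as_tensor(b.reshape(-1), dtype=torch.float32)
    return layer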
Example #2
    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        ob_no = ptu.from_numpy(ob_no)
        next_ob_no = ptu.from_numpy(next_ob_no)
        terminal_n = ptu.from_numpy(terminal_n)
        re_n = ptu.from_numpy(re_n)

        ac_na = ptu.from_numpy(ac_na)

        loss_critic = 0.
        for i in range(
                self.agent_params['num_critic_updates_per_agent_update']):
            loss_critic += self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                              terminal_n)

        # advantage = estimate_advantage(...)
        adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n,
                                        terminal_n)  # a tensor is returned
        loss_actor = 0.
        for i in range(
                self.agent_params['num_actor_updates_per_agent_update']):
            loss_actor += self.actor.update(ob_no, ac_na, adv_n)

        loss = OrderedDict()
        loss['Critic_Loss'] = loss_critic
        # In TensorBoard, loss_actor appears to increase because we minimize -loss_actor.
        loss['Actor_Loss'] = loss_actor

        return loss
Example #3
    def update(self,
               observations,
               actions,
               adv_n=None,
               acs_labels_na=None,
               qvals=None):
        # TODO: update the policy and return the loss
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)

        observations.requires_grad_(True)
        actions.requires_grad_(True)

        nn_acs = self.forward(observations).rsample()

        loss = self.loss(nn_acs, actions)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            # You can add extra logging information here, but keep this line
            'Training Loss': ptu.to_numpy(loss),
        }
Example #4
    def update(self, observations, actions, adv_n=None):
        # TODO: update the policy and return the loss
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        adv_n = ptu.from_numpy(adv_n)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
            # is the expectation over collected trajectories of:
            # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
            # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        action_distributions = self.forward(observations)
        log_prob_actions = action_distributions.log_prob(actions)
        if self.discrete:
            assert log_prob_actions.shape == adv_n.shape
        else:
            # Need to sum the log prob over the action dimension.
            assert log_prob_actions.shape[:-1] == adv_n.shape
            log_prob_actions = log_prob_actions.sum(dim=-1)
        losses = -log_prob_actions * adv_n
        loss = losses.sum()

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()
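
The hint comments above describe the quantity being maximized, sum_t log pi(a_t|s_t) * (Q_t - b_t). A minimal, self-contained toy illustration of that surrogate loss with a categorical policy (an illustration only, not the course's MLPPolicy):

import torch

logits = torch.zeros(5, 3, requires_grad=True)           # batch of 5 states, 3 discrete actions
actions = torch.tensor([0, 2, 1, 0, 2])                   # actions that were taken
advantages = torch.tensor([1.0, -0.5, 0.3, 2.0, -1.0])    # estimates of Q_t - b_t

dist = torch.distributions.Categorical(logits=logits)
log_probs = dist.log_prob(actions)                        # log pi(a_t | s_t)
loss = -(log_probs * advantages).sum()                    # negate: the optimizer minimizes
loss.backward()                                           # gradients w.r.t. the logits carry the policy-gradient signal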
Example #5
    def update(self, observations_np, actions_np, advantages_np=None):
        observations = ptu.from_numpy(observations_np)
        actions = ptu.from_numpy(actions_np)
        advantages = ptu.from_numpy(advantages_np)

        # Compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
            # is the expectation over collected trajectories of:
            # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
            # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        actions_distribution = self.forward(observations)
        log_probs: torch.Tensor = actions_distribution.log_prob(actions)
        if not self.discrete:
            log_probs = log_probs.sum(1)
        assert log_probs.size() == advantages.size()
        loss = -(log_probs * advantages).sum()

        # Optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        # NOTE: train_log is built for logging, but this implementation returns
        # only the scalar loss value.

        return loss.item()
Example #6
    def update(self,
               observations,
               actions,
               adv_n=None,
               acs_labels_na=None,
               qvals=None):
        # TODO: update the policy and return the loss
        if isinstance(observations, np.ndarray):
            observations = ptu.from_numpy(observations)

        if isinstance(actions, np.ndarray):
            actions = ptu.from_numpy(actions)

        #print(observations.shape, actions.shape)

        loss = self.loss(self.forward(observations), actions)
        # print(loss)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            # You can add extra logging information here, but keep this line
            'Training Loss': ptu.to_numpy(loss)
        }
Example #7
    def get_prediction(self, obs, acs, data_statistics):
        """
        :param obs: numpy array of observations (s_t)
        :param acs: numpy array of actions (a_t)
        :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
             - 'obs_mean'
             - 'obs_std'
             - 'acs_mean'
             - 'acs_std'
             - 'delta_mean'
             - 'delta_std'
        :return: a numpy array of the predicted next-states (s_t+1)
        """
        self.update_statistics(data_statistics['obs_mean'],
                               data_statistics['obs_std'],
                               data_statistics['acs_mean'],
                               data_statistics['acs_std'],
                               data_statistics['delta_mean'],
                               data_statistics['delta_std'])

        # print ('pred', obs.shape, acs.shape)
        prediction, _ = self.forward(
            ptu.from_numpy(obs) if type(obs) == np.ndarray else obs,
            ptu.from_numpy(acs), self.obs_mean, self.obs_std, self.acs_mean,
            self.acs_std, self.delta_mean, self.delta_std)
        # print (prediction.shape)

        # TODO(Q1) get numpy array of the predicted next-states (s_t+1)
        # Hint: `self(...)` returns a tuple, but you only need to use one of the
        # outputs.
        return ptu.to_numpy(prediction)
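
Example #7 (and the later get_prediction examples) unpacks a (prediction, normalized delta) tuple from the model's forward pass, which is not shown. A plausible sketch of that interface, assuming the network predicts a normalized state delta (hypothetical, not the actual model code):

import torch
import torch.nn as nn

def dynamics_forward(net: nn.Module, obs, acs,
                     obs_mean, obs_std, acs_mean, acs_std,
                     delta_mean, delta_std, eps=1e-8):
    # Normalize the inputs, predict a normalized state delta, then unnormalize
    # the delta and add it to the current observation.
    obs_norm = (obs - obs_mean) / (obs_std + eps)
    acs_norm = (acs - acs_mean) / (acs_std + eps)
    delta_pred_normalized = net(torch.cat([obs_norm, acs_norm], dim=-1))
    next_obs_pred = obs + (delta_pred_normalized * delta_std + delta_mean)
    return next_obs_pred, delta_pred_normalized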
Example #8
    def compute_loss(self, observations, gradients, actions):
        # if self.siren:
        if self.supervision_mode in ['gradient', 'gv']:

            def net(x):
                action_distribution, obs = self(x)
                return action_distribution.rsample()

            prediction_gradients = jacobian(net=net,
                                            x=observations,
                                            ac_dim=self.ac_dim)
            if self.supervision_mode == 'gradient':
                loss = self.loss(prediction_gradients,
                                 ptu.from_numpy(gradients))
        else:  # supervision_mode == 'gv': weighted sum of gradient and action-value losses
                action_value_loss = nn.MSELoss()
                predicted_actions = self(observations)[0].rsample()
                loss = self.gradient_loss_scale * self.loss(
                    prediction_gradients,
                    ptu.from_numpy(gradients)) + action_value_loss(
                        predicted_actions, ptu.from_numpy(actions))
        else:
            assert self.supervision_mode == 'value'
            predicted_actions = self(observations)[0].rsample()
            loss = self.loss(predicted_actions, ptu.from_numpy(actions))
        return loss
Example #9
    def update(self, ob_no, targets):
        """
            Update the parameters of the critic.

            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                targets: shape: (sum_of_path_lengths,)

            returns:
                training loss
        """

        targets = ptu.from_numpy(targets).detach()

        for _ in range(self.num_target_updates):

            rand_indices = torch.randperm(targets.shape[0])
            v_ts = self(ptu.from_numpy(ob_no))[rand_indices]
            v_targets = targets[rand_indices]

            value_loss = self.loss(v_ts, v_targets)

            for param in self.critic_network.parameters():
                value_loss += param.pow(2).sum() * self.l2_reg

            self.optimizer.zero_grad()
            value_loss.backward()
            self.optimizer.step()

        return value_loss.item()
Example #10
 def get_prediction(self, obs, acs, data_statistics):
     """
     :param obs: numpy array of observations (s_t)
     :param acs: numpy array of actions (a_t)
     :param data_statistics: A dictionary with the following keys (each with
     a numpy array as the value):
          - 'obs_mean'
          - 'obs_std'
          - 'acs_mean'
          - 'acs_std'
          - 'delta_mean'
          - 'delta_std'
     :return: a numpy array of the predicted next-states (s_t+1)
     """
     obs = ptu.from_numpy(obs)
     acs = ptu.from_numpy(acs)
     # obs_mean = ptu.from_numpy(data_statistics['obs_mean'])
     # obs_std = ptu.from_numpy(data_statistics['obs_std'])
     # acs_mean = ptu.from_numpy(data_statistics['acs_mean'])
     # acs_std = ptu.from_numpy(data_statistics['acs_std'])
     # delta_mean = ptu.from_numpy(data_statistics['delta_mean'])
     # delta_std = ptu.from_numpy(data_statistics['delta_std'])
     self.update_statistics(*data_statistics.values())
     prediction = self(
         obs, acs, self.obs_mean, self.obs_std, self.acs_mean, self.acs_std,
         self.delta_mean, self.delta_std
     )[0]  # TODO(Q1) get numpy array of the predicted next-states (s_t+1)
     prediction = ptu.to_numpy(prediction)
     # Hint: `self(...)` returns a tuple, but you only need to use one of the
     # outputs.
     return prediction
Example #11
 def get_prediction(self, obs, acs, data_statistics):
     """
     :param obs: numpy array of observations (s_t)
     :param acs: numpy array of actions (a_t)
     :param data_statistics: A dictionary with the following keys (each with
     a numpy array as the value):
          - 'obs_mean'
          - 'obs_std'
          - 'acs_mean'
          - 'acs_std'
          - 'delta_mean'
          - 'delta_std'
     :return: a numpy array of the predicted next-states (s_t+1)
     """
     # TODO(Q1) get numpy array of the predicted next-states (s_t+1)
     obs = ptu.from_numpy(obs)
     acs = ptu.from_numpy(acs)
     torch_data_statistics = {k: ptu.from_numpy(v) for k, v in data_statistics.items()}
     prediction = self.forward(obs,
                               acs,
                               torch_data_statistics['obs_mean'],
                               torch_data_statistics['obs_std'],
                               torch_data_statistics['acs_mean'],
                               torch_data_statistics['acs_std'],
                               torch_data_statistics['delta_mean'],
                               torch_data_statistics['delta_std'])[0]
     # Hint: `self(...)` returns a tuple, but you only need to use one of the
     # outputs.
     return ptu.to_numpy(prediction)
Example #12
    def get_prediction(self, obs, acs, data_statistics):
        """
        :param obs: numpy array of observations (s_t)
        :param acs: numpy array of actions (a_t)
        :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
             - 'obs_mean'
             - 'obs_std'
             - 'acs_mean'
             - 'acs_std'
             - 'delta_mean'
             - 'delta_std'
        :return: a numpy array of the predicted next-states (s_t+1)
        """

        obs = ptu.from_numpy(obs)
        acs = ptu.from_numpy(acs)
        data_statistics = {key: ptu.from_numpy(value) for key, value in
                           data_statistics.items()}

        # get numpy array of the predicted next-states (s_t+1)
        # Hint: `self(...)` returns a tuple, but you only need to use one of the
        # outputs.
        prediction, _ = self(
            obs,
            acs,
            data_statistics['obs_mean'],
            data_statistics['obs_std'],
            data_statistics['acs_mean'],
            data_statistics['acs_std'],
            data_statistics['delta_mean'],
            data_statistics['delta_std'],
        )
        return prediction.cpu().detach().numpy()
Example #13
    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # TODO Implement the following pseudocode:
        # for agent_params['num_critic_updates_per_agent_update'] steps,
        #     update the critic

        ob_no = ptu.from_numpy(ob_no)
        next_ob_no = ptu.from_numpy(next_ob_no)
        re_n = ptu.from_numpy(re_n)
        ac_na = ptu.from_numpy(ac_na)
        terminal_n = ptu.from_numpy(terminal_n)
        for _ in range(
                self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                             terminal_n)
        # advantage = estimate_advantage(...)
        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n,
                                            terminal_n)

        # for agent_params['num_actor_updates_per_agent_update'] steps,
        #     update the actor
        for _ in range(
                self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss
        loss['Actor_Loss'] = actor_loss

        return loss
Example #14
    def train_agent(self):
        """
        Sample self.params['train_batch_size'] frames from the agent's replay
        buffer and train the agent on them.
        Repeat this for self.params['num_agent_train_steps_per_iter'] steps.

        Returns
            - all_logs: the entire training log from this training.
        """
        print('\nTraining agent using sampled data from replay buffer...')
        all_logs = []
        for train_step in range(self.params['num_agent_train_steps_per_iter']):

            # Sample a batch of size self.params['train_batch_size'] from the
            # agent's replay buffer.
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch \
                = self.agent.sample(self.params['train_batch_size'])

            # Use the sampled data to train an agent

            train_log = self.agent.train(
                ptu.from_numpy(ob_batch),
                ptu.from_numpy(ac_batch),
                ptu.from_numpy(re_batch),
                ptu.from_numpy(next_ob_batch),
                ptu.from_numpy(terminal_batch))
            all_logs.append(train_log) # training log for debugging
        return all_logs
Example #15
    def update(self,
               observations,
               actions,
               adv_n=None,
               acs_labels_na=None,
               qvals=None):
        # TODO: update the policy and return the loss

        #action_predicted = self.get_action(observations)
        #loss = self.loss(action_predicted, actions)

        observations = ptu.from_numpy(observations.astype(np.float32))
        actions = ptu.from_numpy(actions.astype(np.float32))

        pred_action_distribution = self(observations)
        pred_acs = pred_action_distribution.rsample()
        loss = self.loss(pred_acs, actions)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return {
            # You can add extra logging information here, but keep this line
            'Training Loss': ptu.to_numpy(loss),
        }
Example #16
    def update(self, observations, actions, adv_n=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        adv_n = ptu.from_numpy(adv_n)

        if self.discrete:
            action_distribution = self.forward(observations)

        else:
            raise NotImplementedError()
        # Recall that the expression that we want to MAXIMIZE
        # is the expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # you will want to use the `log_prob` method on the distribution returned
        # by the `forward` method
        # don't forget that `optimizer.step()` MINIMIZES a loss

        loss = action_distribution.log_prob(actions) * adv_n
        loss = -loss.sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()
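
Example #16 raises NotImplementedError for continuous actions. A toy sketch of the missing branch, assuming a factorized Normal whose log_prob is per action dimension and therefore needs to be summed (compare Examples #4 and #5):

import torch

mean = torch.zeros(4, 2, requires_grad=True)              # batch of 4 states, 2-D actions
std = torch.ones(2)
actions = torch.randn(4, 2)
advantages = torch.tensor([0.5, -1.0, 2.0, 0.1])

dist = torch.distributions.Normal(mean, std)
log_probs = dist.log_prob(actions).sum(dim=-1)            # sum over the action dimension
loss = -(log_probs * advantages).sum()
loss.backward()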
Example #17
 def get_prediction(self, obs, acs, data_statistics):
     """
     :param obs: numpy array of observations (s_t)
     :param acs: numpy array of actions (a_t)
     :param data_statistics: A dictionary with the following keys (each with
     a numpy array as the value):
          - 'obs_mean'
          - 'obs_std'
          - 'acs_mean'
          - 'acs_std'
          - 'delta_mean'
          - 'delta_std'
     :return: a numpy array of the predicted next-states (s_t+1)
     """
     # TODO(Q1) done get numpy array of the predicted next-states (s_t+1)
     obs = ptu.from_numpy(obs)
     acs = ptu.from_numpy(acs)
     data_statistics = {
         k: ptu.from_numpy(v)
         for k, v in data_statistics.items()
     }
     prediction, delta_pred_normalized = \
       self.forward(obs, acs, data_statistics['obs_mean'], data_statistics['obs_std'],
                    data_statistics['acs_mean'], data_statistics['acs_std'],
                    data_statistics['delta_mean'], data_statistics['delta_std'])
     return ptu.to_numpy(prediction)
Example #18
    def update(self, observations, actions, adv_n=None):

        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(adv_n)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
            # is the expectation over collected trajectories of:
            # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
            # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        action_distribution = self(observations)
        log_probability = action_distribution.log_prob(actions)

        m = torch.mul(log_probability, advantages)
        loss = torch.sum(m)
        loss = -loss  # negate because we want to maximize, but self.optimizer minimizes

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()
Example #19
    def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
        ob_no = ptu.from_numpy(ob_no)
        ac_na = ptu.from_numpy(ac_na).to(torch.long)
        next_ob_no = ptu.from_numpy(next_ob_no)
        reward_n = ptu.from_numpy(reward_n)
        terminal_n = ptu.from_numpy(terminal_n)

        qa_t_values = self.q_net(ob_no)
        q_t_values = torch.gather(qa_t_values, 1,
                                  ac_na.unsqueeze(1)).squeeze(1)
        qa_tp1_values = self.q_net_target(next_ob_no)

        if self.double_q:
            next_actions = self.q_net(next_ob_no).argmax(dim=1)
            q_tp1 = torch.gather(qa_tp1_values, 1,
                                 next_actions.unsqueeze(1)).squeeze(1)
        else:
            q_tp1, _ = qa_tp1_values.max(dim=1)

        target = reward_n + self.gamma * q_tp1 * (1 - terminal_n)
        target = target.detach()
        loss = self.loss(q_t_values, target)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_value_(self.q_net.parameters(),
                               self.grad_norm_clipping)
        self.optimizer.step()

        return {'Training Loss': ptu.to_numpy(loss)}
Example #20
    def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
        """
            Update the parameters of the critic.
            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories
            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
                reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing
                    the reward for each timestep
                terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
                    at that timestep or 0 if the episode did not end
            returns:
                nothing
        """
        ob_no = ptu.from_numpy(ob_no)
        ac_na = ptu.from_numpy(ac_na).to(torch.long)
        next_ob_no = ptu.from_numpy(next_ob_no)
        reward_n = ptu.from_numpy(reward_n)
        terminal_n = ptu.from_numpy(terminal_n)

        #print(ob_no)

        qa_t_values = self.q_net(ob_no)
        q_t_values = torch.gather(qa_t_values, 1,
                                  ac_na.unsqueeze(1)).squeeze(1)

        # TODO compute the Q-values from the target network
        qa_tp1_values = self.q_net_target(next_ob_no)

        if self.double_q:
            # You must fill this part for Q2 of the Q-learning portion of the homework.
            # In double Q-learning, the best action is selected using the Q-network that
            # is being updated, but the Q-value for this action is obtained from the
            # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details.
            next_actions = self.q_net(next_ob_no).argmax(dim=1)
            q_tp1 = torch.gather(qa_tp1_values, 1,
                                 next_actions.unsqueeze(1)).squeeze(1)
        else:
            q_tp1, _ = qa_tp1_values.max(dim=1)

        # TODO compute targets for minimizing Bellman error
        # HINT: as you saw in lecture, this would be:
        #currentReward + self.gamma * qValuesOfNextTimestep * (not terminal)
        target = reward_n + self.gamma * (q_tp1 * (1 - terminal_n))
        target = target.detach()

        assert q_t_values.shape == target.shape
        loss = self.loss(q_t_values, target)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_value_(self.q_net.parameters(),
                               self.grad_norm_clipping)
        self.optimizer.step()

        return {
            'Training Loss': ptu.to_numpy(loss),
        }
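
The double-Q branch above selects the next action with the online q_net but evaluates it with q_net_target. A tiny self-contained illustration of that gather pattern with toy tensors (not the actual networks):

import torch

qa_tp1_online = torch.tensor([[1.0, 3.0], [2.0, 0.5]])    # stand-in for self.q_net(next_ob_no)
qa_tp1_target = torch.tensor([[0.9, 2.0], [1.5, 0.7]])    # stand-in for self.q_net_target(next_ob_no)

next_actions = qa_tp1_online.argmax(dim=1)                 # select with the online network
q_tp1 = torch.gather(qa_tp1_target, 1,
                     next_actions.unsqueeze(1)).squeeze(1)  # evaluate with the target network
# next_actions -> tensor([1, 0]); q_tp1 -> tensor([2.0, 1.5])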
Example #21
    def update(self, observations, actions, advantages, q_values=None):
        """
           TRPO policy update fucntion

        """
        self.observations = ptu.from_numpy(observations)
        self.actions = ptu.from_numpy(actions)
        self.advantages = ptu.from_numpy(advantages)

        # computes the loss that should be optimized when training with policy gradient

        log_probs = self.logprobs(self.observations, self.actions)

        with torch.no_grad():
            old_log_probs = self.logprobs(self.observations, self.actions)

        loss = self.surrogate_reward(log_probs, old_log_probs)

        # find policy gradient with surrogate objective of TRPO
        grads = torch.autograd.grad(loss, self.policy_parameters())
        policy_grad = torch.cat([grad.view(-1) for grad in grads]).detach()
        step_dir = self.conjugate_gradient(-policy_grad)

        max_step = torch.sqrt(
            2 * self.max_kl /
            torch.dot(step_dir, self.fisher_vector_product(step_dir)))
        full_step = max_step * step_dir
        expected_improve = torch.dot(-policy_grad, full_step)

        prev_params = ptu.flatten_params(self.policy_parameters()).clone()
        success, new_params = self.line_search(old_log_probs, prev_params,
                                               full_step, expected_improve)
        ptu.assign_params_to(self.policy_parameters(), new_params)

        return loss.item()
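
Example #21 relies on helpers such as conjugate_gradient and fisher_vector_product that are not shown. As a generic sketch, a standard conjugate-gradient solver for H x = g given only a Hessian-vector product could look like this (an assumption about the helper's shape, not the author's code):

import torch

def conjugate_gradient(hvp, g, n_iters=10, residual_tol=1e-10):
    # Solve H x = g using only matrix-vector products hvp(v) = H v.
    x = torch.zeros_like(g)
    r = g.clone()                 # residual g - H x (x starts at 0)
    p = g.clone()                 # search direction
    rdotr = torch.dot(r, r)
    for _ in range(n_iters):
        Hp = hvp(p)
        alpha = rdotr / torch.dot(p, Hp)
        x += alpha * p
        r -= alpha * Hp
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

In Example #21, hvp would correspond to self.fisher_vector_product and g to the negated policy gradient.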
Example #22
    def update(self, observations, actions, advantages=None):
        # update the policy and return the loss
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
        # is the expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
        # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        action_dists = self(observations)
        log_probs = action_dists.log_prob(actions)
        loss = -torch.sum(log_probs * advantages)

        # optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        assert not self.nn_baseline

        return loss.item()
Example #23
    def td_error(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
        # calculate temporal difference 
        ob_no = ptu.from_numpy(ob_no)
        ac_na = ptu.from_numpy(ac_na).to(torch.long)
        next_ob_no = ptu.from_numpy(next_ob_no)
        reward_n = ptu.from_numpy(reward_n)
        terminal_n = ptu.from_numpy(terminal_n)

        qa_t_values = self.q_net(ob_no)
        q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1)
        
        # TODO compute the Q-values from the target network 
        qa_tp1_values = self.q_net_target(next_ob_no)

        if self.double_q:
            # You must fill this part for Q2 of the Q-learning portion of the homework.
            # In double Q-learning, the best action is selected using the Q-network that
            # is being updated, but the Q-value for this action is obtained from the
            # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details.
            _, selected_action = self.q_net(next_ob_no).max(1)
            selected_action = selected_action.unsqueeze(1)
            q_tp1 = qa_tp1_values.gather(1, selected_action).squeeze()
        else:
            q_tp1, _ = qa_tp1_values.max(dim=1)

        # TODO compute targets for minimizing Bellman error
        # HINT: as you saw in lecture, this would be:
            #currentReward + self.gamma * qValuesOfNextTimestep * (not terminal)
        target = reward_n + self.gamma * q_tp1 * (1 - terminal_n)
        target = target.detach()

        assert q_t_values.shape == target.shape
        difference = q_t_values - target
        return ptu.to_numpy(difference)
Example #24
    def update(self,
               observations,
               actions,
               adv_n=None,
               acs_labels_na=None,
               qvals=None):
        # TODO: update the policy and return the loss

        # Zero out the gradients so they do not accumulate across updates.
        self.optimizer.zero_grad()

        # Forward pass: map observations to the policy's predicted actions.
        pred_actions = self.forward(ptu.from_numpy(observations))

        # Q: why not use self._get_action(observations)? (Action sampling is for
        #    rollouts; training needs the differentiable forward pass.)
        # Note: self.loss.forward(y_hat, y) is equivalent to self.loss(y_hat, y),
        #       since nn.Module.__call__ dispatches to forward.
        loss = self.loss.forward(pred_actions,
                                 ptu.from_numpy(actions))  # loss(y_hat, y)

        loss.backward()        # backpropagation: compute gradients
        self.optimizer.step()  # update the parameters

        return {
            # You can add extra logging information here, but keep this line
            'Training Loss': ptu.to_numpy(loss),
        }
Example #25
    def update(self, observations, actions, advantages, q_values=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
        # is the expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
        # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        N = observations.shape[0]
        self.optimizer.zero_grad()
        loss = 0

        for i in range(N):  # include every sample in the batch
            log_prob = self(observations[i]).log_prob(
                actions[i]) if self.discrete or self.ac_dim == 1 else self(
                    observations[i]).log_prob(actions[i]).sum()
            adv = advantages[i]
            loss += adv * log_prob

        loss = -loss / N

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline:
            ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            targets = normalize(q_values, q_values.mean(), q_values.std())
            targets = ptu.from_numpy(targets)

            ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
            baseline_predictions = self.baseline(observations)

            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            baseline_predictions.squeeze_()
            assert baseline_predictions.shape == targets.shape

            # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
            # HINT: use `F.mse_loss`
            baseline_loss = F.mse_loss(baseline_predictions, targets)

            # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
            # HINT: remember to `zero_grad` first
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        return train_log
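
Example #25 calls a normalize helper (and Example #27 below uses utils.normalize) that is not defined here. Presumably it is something along these lines (an assumption, not the actual infrastructure code):

import numpy as np

def normalize(data, mean, std, eps=1e-8):
    # Shift by `mean` and scale by `std` so the result is roughly zero-mean, unit-std.
    return (data - mean) / (std + eps)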
Example #26
    def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
        """
            Update the parameters of the critic.

            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
                reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing
                    the reward for each timestep
                terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
                    at that timestep or 0 if the episode did not end

            returns:
                training loss
        """
        # TODO: Implement the pseudocode below: do the following (
        # self.num_grad_steps_per_target_update * self.num_target_updates)
        # times:
        # every self.num_grad_steps_per_target_update steps (which includes the
        # first step), recompute the target values by
        #     a) calculating V(s') by querying the critic with next_ob_no
        #     b) and computing the target values as r(s, a) + gamma * V(s')
        # every time, update this critic using the observations and targets
        #

        # HINT: don't forget to use terminal_n to cut off the V(s') (ie set it
        #       to 0) when a terminal state is reached

        for k in range(self.num_grad_steps_per_target_update *
                       self.num_target_updates):

            self.optimizer.zero_grad()
            if k % self.num_grad_steps_per_target_update == 0:
                # Recompute the bootstrapped targets: r(s, a) + gamma * V(s'),
                # with V(s') cut off (set to 0) at terminal states.
                vs_prime = ptu.to_numpy(
                    self.forward(ptu.from_numpy(next_ob_no)).squeeze())
                target_values = reward_n + self.gamma * vs_prime * (1 - terminal_n)
                # ptu.from_numpy yields a tensor that does not require grad, so
                # the targets are already detached from the critic's graph.
                target_values = ptu.from_numpy(target_values)

            # Squeeze the critic output so its shape matches the
            # (sum_of_path_lengths,) targets.
            preds = self.forward(ptu.from_numpy(ob_no)).squeeze()
            loss = self.loss(preds, target_values)

            loss.backward()
            self.optimizer.step()

        return loss.item()
Example #27
    def update(self, observations, actions, advantages, q_values=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
        # is the expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
        # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        action_dist = self.forward(observations)
        log_pi = action_dist.log_prob(actions)
        print(observations.shape)
        loss = -torch.sum(log_pi * advantages)

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline:
            ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            if q_values is None:
                # No Q-value targets were provided; fall back to the advantages tensor.
                targets = advantages
            else:
                targets = ptu.from_numpy(
                    utils.normalize(q_values, np.mean(q_values), np.std(q_values)))

            ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
            baseline_predictions = self.baseline.forward(observations).squeeze(
                1)

            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            assert baseline_predictions.shape == targets.shape, f"shapes do not match, pred_shape: " \
                                                                f" {baseline_predictions.shape} \t target shape {targets.shape}"

            # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
            # HINT: use `F.mse_loss`
            baseline_loss = F.mse_loss(baseline_predictions, targets)

            # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
            # HINT: remember to `zero_grad` first
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        return train_log
Example #28
 def get_action(self, obs: np.ndarray) -> np.ndarray:
     # get this from Piazza
     if len(obs.shape) > 1:
         observation = ptu.from_numpy(obs)
     else:
         observation = ptu.from_numpy(obs[None])
     #  return the action that the policy prescribes
     return ptu.to_numpy(self(observation).sample())
Example #29
    def __init__(self,
                 ac_dim,
                 ob_dim,
                 n_layers,
                 size,
                 discrete=False,
                 learning_rate=1e-4,
                 training=True,
                 nn_baseline=False,
                 **kwargs
                 ):
        super().__init__(**kwargs)

        # init vars
        self.ac_dim = ac_dim
        self.ob_dim = ob_dim
        self.n_layers = n_layers
        self.discrete = discrete
        self.size = size
        self.learning_rate = learning_rate
        self.training = training
        self.nn_baseline = nn_baseline

        if self.discrete:
            self.logits_na = ptu.build_mlp(
                input_size=self.ob_dim,
                output_size=self.ac_dim,
                n_layers=self.n_layers,
                size=self.size,
            )
            self.logits_na.to(ptu.device)
            self.mean_net = None
            self.logstd = None
            self.optimizer = optim.Adam(
                self.logits_na.parameters(),
                self.learning_rate
            )
        else:
            self.logits_na = None
            self.mean_net = ptu.build_mlp(
                input_size=self.ob_dim,
                output_size=self.ac_dim,
                n_layers=self.n_layers,
                size=self.size,
            )
            self.mean_net.to(ptu.device)
            # TODO: shouldn't logstd also be a NN?
            self.logstd = nn.Parameter(torch.zeros(
                self.ac_dim, dtype=torch.float32, device=ptu.device
            ))
            self.optimizer = optim.Adam(
                itertools.chain([self.logstd], self.mean_net.parameters()),
                self.learning_rate
            )
            # Wrap the scalars in numpy arrays in case ptu.from_numpy only
            # accepts ndarrays (it typically wraps torch.from_numpy).
            self.normal_dist = distributions.Normal(
                ptu.from_numpy(np.array(0.0)),
                ptu.from_numpy(np.array(1.0))
            )
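
The constructor in Example #29 builds either a logits network (discrete) or a mean network plus a learned logstd (continuous), but the matching forward pass is not shown. A hypothetical sketch of how the action distribution could be built from those pieces:

import torch
from torch import distributions, nn

def action_distribution(obs: torch.Tensor, discrete: bool,
                        logits_na: nn.Module = None,
                        mean_net: nn.Module = None,
                        logstd: torch.Tensor = None):
    if discrete:
        # Categorical distribution over the logits produced by the policy net.
        return distributions.Categorical(logits=logits_na(obs))
    # Gaussian with a state-dependent mean and a learned, state-independent std.
    return distributions.Normal(mean_net(obs), torch.exp(logstd))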
Example #30
 def update(self, observations, actions, adv_n=None):
     # TODO: update the policy and return the loss
     dist = self(ptu.from_numpy(observations))
     logp = dist.log_prob(ptu.from_numpy(actions))
     loss = -(logp * ptu.from_numpy(adv_n)).sum()
     self.optimizer.zero_grad()
     loss.backward()
     self.optimizer.step()
     return loss.item()