Example No. 1
    def update(self,
               observations,
               actions,
               adv_n=None,
               acs_labels_na=None,
               qvals=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        """
        TODO: compute the behavior cloning loss by maximizing the log likelihood
        expert actions under the policy.
        Hint: look at the documentation for torch.distributions 
        """
        m = self.forward(observations)
        loss = -torch.mean(m.log_prob(actions))
        """
        END CODE
        """

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            # You can add extra logging information here, but keep this line
            'Training Loss': ptu.to_numpy(loss),
        }
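For `m.log_prob(actions)` to work in the update above, `forward` has to return a `torch.distributions` object rather than a raw action tensor. A minimal sketch of such a policy, assuming a Gaussian with a state-independent log-std parameter (the class name `MLPPolicySketch`, the hidden size, and the layer layout are illustrative, not taken from the examples):

import torch
from torch import nn, distributions

class MLPPolicySketch(nn.Module):
    """Hypothetical policy whose forward() returns a distribution."""

    def __init__(self, ob_dim, ac_dim, hidden_size=64):
        super().__init__()
        self.mean_net = nn.Sequential(
            nn.Linear(ob_dim, hidden_size), nn.Tanh(),
            nn.Linear(hidden_size, ac_dim))
        # state-independent log standard deviation, one entry per action dim
        self.logstd = nn.Parameter(torch.zeros(ac_dim))

    def forward(self, observations):
        mean = self.mean_net(observations)
        # a Normal distribution supports .log_prob(), .sample(), and .rsample()
        return distributions.Normal(mean, self.logstd.exp())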
Example No. 2
 def get_action(self, obs):
     if len(obs.shape) > 1:
         observation = obs
     else:
         observation = obs[None, :]
     observation = ptu.from_numpy(observation.astype(np.float32))
     action = self(observation)
     return ptu.to_numpy(action)
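A short usage sketch for `get_action`, assuming `policy` is an instance of a class with this method and an observation dimension of 3 (both are illustrative assumptions):

import numpy as np

obs_single = np.array([0.1, -0.2, 0.3], dtype=np.float32)  # shape (3,)
obs_batch = np.stack([obs_single] * 5)                      # shape (5, 3)

# `policy` is a hypothetical instance of the class defining get_action above
act_single = policy.get_action(obs_single)  # promoted to (1, 3) via obs[None, :]
act_batch = policy.get_action(obs_batch)    # already batched, passed through as-is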
Example No. 3
    def update(self,
               observations,
               acs_na,
               adv_n=None,
               acs_labels_na=None,
               qvals=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(acs_na)
        adv_n = ptu.from_numpy(adv_n)
        """
        TODO: compute the policy gradient given the already computed advantages adv_n
        """
        m = self.forward(observations)
        loss = -torch.mean(adv_n * m.log_prob(actions))
        """
        END CODE
        """
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline:
            # first standardize qvals to make this easier to train
            targets_n = (qvals - np.mean(qvals)) / (np.std(qvals) + 1e-8)
            targets_n = ptu.from_numpy(targets_n)
            """
            TODO: update the baseline value function by regressing to the values
            Hint: see self.baseline_loss for the appropriate loss
            """
            # baseline predictions have shape (N, 1); flatten to (N) to match targets_n
            baseline_predictions = self.baseline(observations).squeeze(1)
            baseline_loss = self.baseline_loss(baseline_predictions, targets_n)
            """
            END CODE
            """
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()
        else:
            baseline_loss = None
        return {
            'Training Loss': ptu.to_numpy(loss),
            'Baseline Loss':
            ptu.to_numpy(baseline_loss) if baseline_loss is not None else 0,
        }
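The advantages `adv_n` above are computed before `update` is called. A hedged sketch of one common construction, discounted reward-to-go with an optional baseline subtraction (the helper names `discounted_reward_to_go` and `estimate_advantages` are illustrative; the rescaling of baseline predictions mirrors the standardized targets used when training the baseline above):

import numpy as np

def discounted_reward_to_go(rewards, gamma):
    # rtg[t] = sum over t' >= t of gamma**(t' - t) * rewards[t']
    rtg = np.zeros_like(rewards, dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        rtg[t] = running
    return rtg

def estimate_advantages(policy, observations, rewards, gamma):
    q_vals = discounted_reward_to_go(rewards, gamma)
    if policy.nn_baseline:
        # un-standardize the baseline predictions so they live on the same
        # scale as q_vals before subtracting them
        baselines = policy.run_baseline_prediction(observations)
        baselines = baselines * np.std(q_vals) + np.mean(q_vals)
        return q_vals - baselines
    return q_vals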
Example No. 4
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        if len(obs.shape) > 1:
            observation = obs
        else:
            observation = obs[None]

        observation = ptu.from_numpy(observation)
        # action = self(observation)
        action_distribution = self(observation)
        action = action_distribution.sample()  # don't bother with rsample
        return ptu.to_numpy(action)
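The comment above distinguishes `sample` from `rsample`: both draw from the action distribution, but only `rsample` is reparameterized, so gradients can flow from the drawn value back to the distribution parameters. A small illustration:

import torch
from torch import distributions

mean = torch.zeros(2, requires_grad=True)
dist = distributions.Normal(mean, torch.ones(2))

a = dist.sample()   # detached draw: fine for acting in the environment
b = dist.rsample()  # reparameterized draw (mean + std * noise): differentiable

b.sum().backward()
print(mean.grad)    # tensor([1., 1.])
# a.sum().backward() would raise: `a` is not connected to the graph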
Example No. 5
 def run_baseline_prediction(self, observations):
     """
         Helper function that converts `observations` to a tensor,
         calls the forward method of the baseline MLP,
         and returns a np array
         Input: `observations`: np.ndarray of size [N, 1]
         Output: np.ndarray of size [N]
     """
     observations = ptu.from_numpy(observations)
     pred = self.baseline(observations)
     return ptu.to_numpy(pred.squeeze())
Example No. 6
    def update(self, obs, acts, next_obs, rewards, terminals, actor):
        """
            Update the parameters of the critic.

            arguments:
                obs: shape: (batch_size, ob_dim)
                acts: shape: (batch_size, ac_dim)
                next_obs: shape: (batch_size, ob_dim). The observation after taking one step forward
                rewards: length: batch_size. Each element in rewards is a scalar containing
                    the reward for each timestep
                terminals: length: batch_size. Each element in terminals is either 1 if the episode ended
                    at that timestep or 0 if the episode did not end
                actor: the current policy, used to choose actions at next_obs when computing the target value

            returns:
                training loss
        """
        obs = ptu.from_numpy(obs)
        acts = ptu.from_numpy(acts)
        next_obs = ptu.from_numpy(next_obs)
        rewards = ptu.from_numpy(rewards)
        terminals = ptu.from_numpy(terminals)

        q_pred = self.critic_network(torch.cat((obs, acts), dim=-1)).squeeze(1)
        target_value = self.compute_target_value(next_obs, rewards, terminals,
                                                 actor)
        loss = self.loss(q_pred, target_value.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target Q function with exponential moving average
        self.update_target_network_ema()

        return {
            'Critic Training Loss': ptu.to_numpy(loss),
            'Critic Mean': ptu.to_numpy(q_pred.mean()),
        }
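`compute_target_value` and `update_target_network_ema` are referenced above but not shown. A hedged sketch of what they typically look like for a critic with a target Q-network (the attribute names `critic_target_network`, `gamma`, and `ema_tau` are assumptions):

import torch

def compute_target_value(self, next_obs, rewards, terminals, actor):
    # bootstrap from the target network; the caller detaches the result
    with torch.no_grad():
        next_acts = actor(next_obs).sample()
        next_q = self.critic_target_network(
            torch.cat((next_obs, next_acts), dim=-1)).squeeze(1)
        # r + gamma * Q_target(s', a') for non-terminal transitions
        return rewards + self.gamma * (1 - terminals) * next_q

def update_target_network_ema(self):
    # Polyak averaging: target <- tau * online + (1 - tau) * target
    for p_target, p in zip(self.critic_target_network.parameters(),
                           self.critic_network.parameters()):
        p_target.data.mul_(1 - self.ema_tau).add_(self.ema_tau * p.data)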
Example No. 7
 def update(self, observations, critic):
     observations = ptu.from_numpy(observations)
     """
     TODO: implement policy loss with learned critic. Update the policy 
     to maximize the expected Q value of actions, using a single sample
     from the policy for each state.
     Hint: assuming we are in continuous action spaces and using Gaussian
     distributions, look at the rsample function to differentiate through
     samples from the action distribution.
     """
     # rsample keeps the sampled action differentiable w.r.t. the policy
     # parameters, so maximizing Q(s, a) trains the policy directly.
     # NOTE: assumes the critic exposes `critic_network` taking concatenated
     # (obs, action) inputs, as in Example No. 6 above.
     action_distribution = self(observations)
     actions = action_distribution.rsample()
     q_values = critic.critic_network(
         torch.cat((observations, actions), dim=-1)).squeeze(1)
     loss = -q_values.mean()
     """
     END CODE
     """
     self.optimizer.zero_grad()
     loss.backward()
     self.optimizer.step()
     return {
         'Actor Training Loss': ptu.to_numpy(loss),
     }
Example No. 8
 def forward_np(self, obs, acts):
     obs = ptu.from_numpy(obs)
     acts = ptu.from_numpy(acts)
     predictions = self(obs, acts)
     return ptu.to_numpy(predictions)
Example No. 9
    def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
        """
            Update the parameters of the critic.
            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories
            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
                reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing
                    the reward for each timestep
                terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
                    at that timestep or 0 if the episode did not end
            returns:
                nothing
        """
        ob_no = ptu.from_numpy(ob_no)
        ac_na = ptu.from_numpy(ac_na).to(torch.long)
        next_ob_no = ptu.from_numpy(next_ob_no)
        reward_n = ptu.from_numpy(reward_n)
        terminal_n = ptu.from_numpy(terminal_n)

        qa_t_values = self.q_net(ob_no)
        q_t_values = torch.gather(qa_t_values, 1,
                                  ac_na.unsqueeze(1)).squeeze(1)

        if self.double_q:
            """
            TODO: In double Q-learning, the best action is selected using the 
            current Q-network, but the Q-value for this action 
            is obtained from the target Q-network. See page 5 of 
            https://arxiv.org/pdf/1509.06461.pdf for more details.
            """
            # your code here
            # double Q: select the argmax action with the online network,
            # but evaluate it with the target network
            ac_ind = self.q_net(next_ob_no).argmax(dim=-1)
            next_values = self.q_net_target(next_ob_no)
            maxed_values = torch.gather(next_values, 1,
                                        ac_ind.unsqueeze(1)).squeeze(1)
            # zero out the bootstrapped value for terminal transitions
            maxed_values = maxed_values * (1 - terminal_n)
            """
            END CODE
            """
        else:
            """
            TODO: compute the value of the next state
            """

            # your code here
            next_values = self.q_net_target(next_ob_no)
            maxed_values = next_values.max(dim=-1).values
            # zero out the bootstrapped value for terminal transitions
            maxed_values = maxed_values * (1 - terminal_n)
            """
            END CODE
            """
        """
        TODO: Compute the target values; remember to make sure no gradients
        are passed through the target values.
        Hint: Use torch.no_grad or .detach() to ensure no gradients are passed.
        """
        target = (self.gamma * maxed_values + reward_n).detach()
        """
        END CODE
        """

        assert q_t_values.shape == target.shape
        loss = self.loss(q_t_values, target)

        # Updates Q function to minimize bellman error
        # Includes gradient clipping for stability
        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_value_(self.q_net.parameters(),
                               self.grad_norm_clipping)
        self.optimizer.step()

        return {
            'Training Loss': ptu.to_numpy(loss),
        }
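Example No. 9 reads from `self.q_net_target` but does not show how it is kept in sync with `self.q_net`. In DQN this is usually a periodic hard copy rather than an exponential moving average; a hedged sketch (the method name `update_target_network` is an assumption):

def update_target_network(self):
    # copy the online network's weights into the target network; typically
    # called every few thousand gradient steps, not after every update
    for target_param, param in zip(self.q_net_target.parameters(),
                                   self.q_net.parameters()):
        target_param.data.copy_(param.data)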
Example No. 10
 def qa_values(self, obs):
     obs = ptu.from_numpy(obs)
     qa_values = self.q_net(obs)
     return ptu.to_numpy(qa_values)
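A usage sketch that turns `qa_values` into an epsilon-greedy action for discrete control (the `eps` value and the standalone helper are illustrative):

import numpy as np

def select_action(critic, obs, eps=0.05):
    qa = critic.qa_values(obs[None])             # shape (1, num_actions)
    if np.random.rand() < eps:
        return np.random.randint(qa.shape[-1])   # explore: random action
    return int(qa.argmax(axis=-1)[0])            # exploit: greedy action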