def update(self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    """
    TODO: compute the behavior cloning loss by maximizing the log likelihood
    of expert actions under the policy.
    Hint: look at the documentation for torch.distributions
    """
    m = self.forward(observations)
    loss = -torch.mean(m.log_prob(actions))
    """ END CODE """

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        # You can add extra logging information here, but keep this line
        'Training Loss': ptu.to_numpy(loss),
    }
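# A minimal, standalone sketch (not part of the class above) of what the
# maximum-likelihood loss does with torch.distributions. The diagonal Gaussian
# head and the 5x2 batch shape are illustrative assumptions.
import torch
from torch import distributions

mean = torch.zeros(5, 2, requires_grad=True)   # stand-in for the policy's predicted means
dist = distributions.Normal(mean, torch.ones(5, 2))
expert_actions = torch.randn(5, 2)             # stand-in for a batch of expert actions
# Normal.log_prob is per-dimension; summing the last axis gives the joint
# log-likelihood of each action vector, and its negative mean is the BC loss.
loss = -dist.log_prob(expert_actions).sum(-1).mean()
loss.backward()                                # gradients flow into `mean`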
def get_action(self, obs):
    if len(obs.shape) > 1:
        observation = obs
    else:
        observation = obs[None, :]
    observation = ptu.from_numpy(observation.astype(np.float32))
    # forward returns an action distribution (see update above), so sample
    # from it rather than converting the distribution object to numpy
    action_distribution = self(observation)
    action = action_distribution.sample()
    return ptu.to_numpy(action)
def update(self, observations, acs_na, adv_n=None, acs_labels_na=None, qvals=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(acs_na)
    adv_n = ptu.from_numpy(adv_n)
    """
    TODO: compute the policy gradient given the already computed advantages adv_n
    """
    m = self.forward(observations)
    loss = -torch.mean(adv_n * m.log_prob(actions))
    """ END CODE """

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        # first standardize qvals to make this easier to train
        targets_n = (qvals - np.mean(qvals)) / (np.std(qvals) + 1e-8)
        targets_n = ptu.from_numpy(targets_n)
        """
        TODO: update the baseline value function by regressing to the values
        Hint: see self.baseline_loss for the appropriate loss
        """
        baseline_preds = self.baseline.forward(observations).squeeze(1)
        baseline_loss = self.baseline_loss(baseline_preds, targets_n)
        """ END CODE """
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()
    else:
        baseline_loss = None

    return {
        'Training Loss': ptu.to_numpy(loss),
        'Baseline Loss': ptu.to_numpy(baseline_loss) if baseline_loss is not None else 0,
    }
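# A small, standalone illustration (assumed shapes and made-up numbers, not
# code from the class) of how the surrogate loss above behaves: actions with
# positive advantage get their log-probability pushed up, negative pushed down.
import torch

mean = torch.zeros(2, requires_grad=True)
dist = torch.distributions.Normal(mean, torch.ones(2))
actions = torch.tensor([0.5, 0.5])
adv = torch.tensor([1.0, -1.0])
loss = -(adv * dist.log_prob(actions)).mean()
loss.backward()
print(mean.grad)  # opposite signs: a gradient step reinforces the first action, suppresses the second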
def get_action(self, obs: np.ndarray) -> np.ndarray:
    if len(obs.shape) > 1:
        observation = obs
    else:
        observation = obs[None]
    observation = ptu.from_numpy(observation)
    action_distribution = self(observation)
    # sample() is enough here; rsample() is only needed when gradients must
    # flow through the sampled action
    action = action_distribution.sample()
    return ptu.to_numpy(action)
def run_baseline_prediction(self, observations):
    """
    Helper function that converts `observations` to a tensor, calls the
    forward method of the baseline MLP, and returns a numpy array.

    Input: `observations`: np.ndarray of size [N, 1]
    Output: np.ndarray of size [N]
    """
    observations = ptu.from_numpy(observations)
    pred = self.baseline(observations)
    return ptu.to_numpy(pred.squeeze())
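# Hypothetical caller-side sketch: because the baseline above is regressed onto
# standardized targets in update(), its raw predictions live on a roughly
# zero-mean, unit-std scale and need rescaling back to the return scale before
# forming advantages. `policy` and the toy arrays below are assumed names/data.
import numpy as np

observations = np.random.randn(4, 3).astype(np.float32)   # toy batch, ob_dim = 3
q_values = np.array([10.0, 12.0, 8.0, 11.0])              # toy reward-to-go estimates

baseline_preds = policy.run_baseline_prediction(observations)            # normalized scale
baseline_unnormalized = baseline_preds * q_values.std() + q_values.mean()
advantages = q_values - baseline_unnormalized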
def update(self, obs, acts, next_obs, rewards, terminals, actor):
    """
    Update the parameters of the critic.

    arguments:
        obs: shape: (batch_size, ob_dim)
        acts: shape: (batch_size, ac_dim)
        next_obs: shape: (batch_size, ob_dim). The observation after taking one step forward
        rewards: length: batch_size. Each element is a scalar containing the reward for that timestep
        terminals: length: batch_size. Each element is either 1 if the episode ended at that timestep or 0 if it did not
        actor: the current policy, used when computing the target value

    returns:
        training loss
    """
    obs = ptu.from_numpy(obs)
    acts = ptu.from_numpy(acts)
    next_obs = ptu.from_numpy(next_obs)
    rewards = ptu.from_numpy(rewards)
    terminals = ptu.from_numpy(terminals)

    q_pred = self.critic_network(torch.cat((obs, acts), dim=-1)).squeeze(1)
    target_value = self.compute_target_value(next_obs, rewards, terminals, actor)

    loss = self.loss(q_pred, target_value.detach())

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    # update target Q function with exponential moving average
    self.update_target_network_ema()

    return {
        'Critic Training Loss': ptu.to_numpy(loss),
        'Critic Mean': ptu.to_numpy(q_pred.mean()),
    }
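# The two helpers called above are not shown in this section. What follows is
# an assumed sketch of what they might look like, not the class's actual code:
# self.target_network, self.gamma, and self.tau are hypothetical attribute names.
def compute_target_value(self, next_obs, rewards, terminals, actor):
    # Bootstrapped target r + gamma * (1 - done) * Q_target(s', a'), with a'
    # sampled from the current actor at the next state.
    with torch.no_grad():
        next_acts = actor(next_obs).sample()
        next_q = self.target_network(torch.cat((next_obs, next_acts), dim=-1)).squeeze(1)
        return rewards + self.gamma * (1 - terminals) * next_q

def update_target_network_ema(self):
    # Polyak / exponential moving average: target <- (1 - tau) * target + tau * online.
    for p_target, p_online in zip(self.target_network.parameters(),
                                  self.critic_network.parameters()):
        p_target.data.mul_(1 - self.tau).add_(self.tau * p_online.data)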
def update(self, observations, critic):
    observations = ptu.from_numpy(observations)
    """
    TODO: implement policy loss with learned critic. Update the policy to
    maximize the expected Q value of actions, using a single sample from
    the policy for each state.

    Hint: assuming we are in continuous action spaces and using Gaussian
    distributions, look at the rsample function to differentiate through
    samples from the action distribution.
    """
    action_distribution = self.forward(observations)
    # rsample (reparameterized sampling) keeps the sampled actions
    # differentiable with respect to the policy parameters
    sampled_actions = action_distribution.rsample()
    # assumes the critic's forward takes (obs, acts), as in forward_np below
    q_values = critic(observations, sampled_actions)
    loss = -torch.mean(q_values)
    """ END CODE """

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        'Actor Training Loss': ptu.to_numpy(loss),
    }
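# Toy standalone check of why rsample matters here: reparameterized samples
# carry gradients back into the distribution parameters, whereas plain
# sample() cuts the graph. The 3-dimensional Gaussian is an assumption.
import torch

mean = torch.zeros(3, requires_grad=True)
dist = torch.distributions.Normal(mean, torch.ones(3))
y = dist.rsample().sum()
y.backward()
print(mean.grad)  # populated; with dist.sample() no gradient would reach `mean`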
def forward_np(self, obs, acts):
    obs = ptu.from_numpy(obs)
    acts = ptu.from_numpy(acts)
    predictions = self(obs, acts)
    return ptu.to_numpy(predictions)
def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
    """
    Update the parameters of the critic.

    let sum_of_path_lengths be the sum of the lengths of the paths sampled from
        Agent.sample_trajectories
    let num_paths be the number of paths sampled from Agent.sample_trajectories

    arguments:
        ob_no: shape: (sum_of_path_lengths, ob_dim)
        ac_na: length: sum_of_path_lengths. The (discrete) action taken at each timestep
        next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
        reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing the reward for each timestep
        terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended at that timestep or 0 if the episode did not end

    returns:
        nothing
    """
    ob_no = ptu.from_numpy(ob_no)
    ac_na = ptu.from_numpy(ac_na).to(torch.long)
    next_ob_no = ptu.from_numpy(next_ob_no)
    reward_n = ptu.from_numpy(reward_n)
    terminal_n = ptu.from_numpy(terminal_n)

    qa_t_values = self.q_net(ob_no)
    q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1)

    if self.double_q:
        """
        TODO: In double Q-learning, the best action is selected using the
        current Q-network, but the Q-value for this action is obtained from
        the target Q-network. See page 5 of
        https://arxiv.org/pdf/1509.06461.pdf for more details.
        """
        # your code here
        # select the greedy action with the online network, evaluate it with the
        # target network; gather keeps everything on the same device and dtype
        next_actions = self.q_net(next_ob_no).argmax(dim=1)
        qa_tp1_values = self.q_net_target(next_ob_no)
        maxed_values = torch.gather(qa_tp1_values, 1, next_actions.unsqueeze(1)).squeeze(1)
        maxed_values = maxed_values * (1 - terminal_n)
        """ END CODE """
    else:
        """
        TODO: compute the value of the next state
        """
        # your code here
        qa_tp1_values = self.q_net_target(next_ob_no)
        maxed_values = qa_tp1_values.max(dim=1).values * (1 - terminal_n)
        """ END CODE """

    """
    TODO: Compute the target values, remember to make sure no gradients are
    passed through the target values.
    Hint: Use torch.no_grad or .detach() to ensure no gradients are passed.
    """
    target = (self.gamma * maxed_values + reward_n).detach()
    """ END CODE """

    assert q_t_values.shape == target.shape
    loss = self.loss(q_t_values, target)

    # Updates Q function to minimize bellman error
    # Includes gradient clipping for stability
    self.optimizer.zero_grad()
    loss.backward()
    utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping)
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
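# Tiny standalone illustration (made-up numbers) of how double Q-learning
# decouples action selection from evaluation: the online network picks the
# action, the target network scores it.
import torch

q_online = torch.tensor([[1.0, 5.0, 2.0]])   # online Q(s', .)
q_target = torch.tensor([[3.0, 0.5, 4.0]])   # target Q(s', .)
a_star = q_online.argmax(dim=1)              # online net picks action 1
double_q = torch.gather(q_target, 1, a_star.unsqueeze(1)).squeeze(1)  # -> 0.5
vanilla_q = q_target.max(dim=1).values       # -> 4.0, prone to overestimation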
def qa_values(self, obs):
    obs = ptu.from_numpy(obs)
    qa_values = self.q_net(obs)
    return ptu.to_numpy(qa_values)