def __init__(self, filename, **kwargs):
    super().__init__(**kwargs)

    # Open the pickled expert policy.
    with open(filename, 'rb') as f:
        data = pickle.loads(f.read())

    # Define the hidden layer activation functions.
    self.nonlin_type = data['nonlin_type']
    if self.nonlin_type == 'lrelu':
        self.non_lin = nn.LeakyReLU(0.01)
    elif self.nonlin_type == 'tanh':
        self.non_lin = nn.Tanh()
    else:
        raise NotImplementedError()

    # Assert that this loaded policy is a "GaussianPolicy".
    policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]
    assert policy_type == 'GaussianPolicy', (
        'Policy type {} not supported'.format(policy_type)
    )
    self.policy_params = data[policy_type]

    # The loaded policy has policy_params.
    # policy_params is a dictionary with these 4 entries.
    assert set(self.policy_params.keys()) == {
        'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'
    }

    # Build the policy. First, observation normalization.
    # Under the loaded policy, the observations are (approximately) distributed as
    # N(obsnorm_mean, obsnorm_stdev).
    assert list(self.policy_params['obsnorm'].keys()) == ['Standardizer']
    obsnorm_mean = self.policy_params['obsnorm']['Standardizer']['mean_1_D']
    obsnorm_meansq = self.policy_params['obsnorm']['Standardizer']['meansq_1_D']
    obsnorm_stdev = np.sqrt(
        np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
    print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)
    self.obs_norm_mean = nn.Parameter(ptu.from_numpy(obsnorm_mean))
    self.obs_norm_std = nn.Parameter(ptu.from_numpy(obsnorm_stdev))

    # Reconstruct the hidden layers from the loaded data.
    self.hidden_layers = nn.ModuleList()

    # The 'hidden' layers must be of "FeedforwardNet" type.
    # The layers are kept in the `layer_params` dictionary, ordered by the keys.
    # They are read out, made into PyTorch layers, then appended, in order.
    assert list(self.policy_params['hidden'].keys()) == ['FeedforwardNet']
    layer_params = self.policy_params['hidden']['FeedforwardNet']
    for layer_name in sorted(layer_params.keys()):
        l = layer_params[layer_name]
        W, b = read_layer(l)
        linear_layer = create_linear_layer(W, b)
        self.hidden_layers.append(linear_layer)

    # Output layer (does not have an activation function).
    W, b = read_layer(self.policy_params['out'])
    self.output_layer = create_linear_layer(W, b)
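# The loader above relies on two helpers, `read_layer` and `create_linear_layer`,
# which are not shown here. A minimal sketch of what they could look like, assuming
# each pickled layer stores its weights under an 'AffineLayer' key with 'W' and 'b'
# entries (that storage layout is an assumption, not confirmed by the snippet above).
def read_layer(l):
    assert list(l.keys()) == ['AffineLayer']
    assert sorted(l['AffineLayer'].keys()) == ['W', 'b']
    return (l['AffineLayer']['W'].astype(np.float32),
            l['AffineLayer']['b'].astype(np.float32))

def create_linear_layer(W, b) -> nn.Linear:
    # W is stored as (in_features, out_features); nn.Linear wants the transpose.
    out_features, in_features = W.shape[1], W.shape[0]
    linear_layer = nn.Linear(in_features, out_features)
    linear_layer.weight.data = ptu.from_numpy(W.T)
    linear_layer.bias.data = ptu.from_numpy(b[0])
    return linear_layer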
def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
    ob_no = ptu.from_numpy(ob_no)
    next_ob_no = ptu.from_numpy(next_ob_no)
    terminal_n = ptu.from_numpy(terminal_n)
    re_n = ptu.from_numpy(re_n)
    ac_na = ptu.from_numpy(ac_na)

    loss_critic = 0.
    for i in range(
            self.agent_params['num_critic_updates_per_agent_update']):
        loss_critic += self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                          terminal_n)

    # advantage = estimate_advantage(...); a tensor is returned.
    adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

    loss_actor = 0.
    for i in range(
            self.agent_params['num_actor_updates_per_agent_update']):
        loss_actor += self.actor.update(ob_no, ac_na, adv_n)

    loss = OrderedDict()
    loss['Critic_Loss'] = loss_critic
    # In TensorBoard, Actor_Loss tends to increase over training because the
    # actor actually minimizes -loss_actor.
    loss['Actor_Loss'] = loss_actor
    return loss
def update(self, observations, actions, adv_n=None, acs_labels_na=None,
           qvals=None):
    # TODO: update the policy and return the loss
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    # Note: the inputs do not need requires_grad; gradients only flow into the
    # network parameters.

    nn_acs = self.forward(observations).rsample()
    loss = self.loss(nn_acs, actions)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        # You can add extra logging information here, but keep this line
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, adv_n=None):
    # TODO: update the policy and return the loss
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    adv_n = ptu.from_numpy(adv_n)

    # TODO: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    #   is the expectation over collected trajectories of:
    #   sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    #   by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    action_distributions = self.forward(observations)
    log_prob_actions = action_distributions.log_prob(actions)

    if self.discrete:
        assert log_prob_actions.shape == adv_n.shape
    else:
        # Need to sum the log prob over the action dimension.
        assert log_prob_actions.shape[:-1] == adv_n.shape
        log_prob_actions = log_prob_actions.sum(dim=-1)

    losses = -log_prob_actions * adv_n
    loss = losses.sum()

    # TODO: optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return loss.item()
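# The `adv_n` argument above is typically the discounted reward-to-go of each
# timestep minus a baseline. A minimal sketch of the reward-to-go computation for a
# single trajectory; the helper name `_discounted_cumsum` and the per-trajectory
# shape are illustrative assumptions, not part of the snippet above.
def _discounted_cumsum(rewards, gamma):
    # rtg[t] = sum_{t'=t}^{T-1} gamma^(t'-t) * r[t']
    rtg = np.zeros_like(rewards, dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        rtg[t] = running
    return rtg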
def update(self, observations_np, actions_np, advantages_np=None):
    observations = ptu.from_numpy(observations_np)
    actions = ptu.from_numpy(actions_np)
    advantages = ptu.from_numpy(advantages_np)

    # Compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    #   is the expectation over collected trajectories of:
    #   sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    #   by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    actions_distribution = self.forward(observations)
    log_probs: torch.Tensor = actions_distribution.log_prob(actions)
    if not self.discrete:
        log_probs = log_probs.sum(1)
    assert log_probs.size() == advantages.size()
    loss = -(log_probs * advantages).sum()

    # Optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return loss.item()
def update(self, observations, actions, adv_n=None, acs_labels_na=None,
           qvals=None):
    # TODO: update the policy and return the loss
    if isinstance(observations, np.ndarray):
        observations = ptu.from_numpy(observations)
    if isinstance(actions, np.ndarray):
        actions = ptu.from_numpy(actions)
    # print(observations.shape, actions.shape)

    loss = self.loss(self.forward(observations), actions)
    # print(loss)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        # You can add extra logging information here, but keep this line
        'Training Loss': ptu.to_numpy(loss)
    }
def get_prediction(self, obs, acs, data_statistics):
    """
    :param obs: numpy array of observations (s_t)
    :param acs: numpy array of actions (a_t)
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
        - 'obs_mean'
        - 'obs_std'
        - 'acs_mean'
        - 'acs_std'
        - 'delta_mean'
        - 'delta_std'
    :return: a numpy array of the predicted next-states (s_t+1)
    """
    self.update_statistics(data_statistics['obs_mean'],
                           data_statistics['obs_std'],
                           data_statistics['acs_mean'],
                           data_statistics['acs_std'],
                           data_statistics['delta_mean'],
                           data_statistics['delta_std'])

    # print('pred', obs.shape, acs.shape)
    prediction, _ = self.forward(
        ptu.from_numpy(obs) if isinstance(obs, np.ndarray) else obs,
        ptu.from_numpy(acs),
        self.obs_mean, self.obs_std,
        self.acs_mean, self.acs_std,
        self.delta_mean, self.delta_std)
    # print(prediction.shape)

    # TODO(Q1) get numpy array of the predicted next-states (s_t+1)
    # Hint: `self(...)` returns a tuple, but you only need to use one of the
    # outputs.
    return ptu.to_numpy(prediction)
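# A minimal sketch of how a model-based controller might roll candidate action
# sequences through `get_prediction` above to score them by predicted reward. The
# `env.get_reward` call, the shapes, and the variable names are illustrative
# assumptions, not part of the snippet itself.
def evaluate_candidate_sequences(model, env, obs, candidate_action_sequences,
                                 data_statistics):
    # candidate_action_sequences: (num_sequences, horizon, ac_dim)
    num_sequences, horizon, _ = candidate_action_sequences.shape
    predicted_obs = np.tile(obs, (num_sequences, 1))  # (num_sequences, ob_dim)
    total_rewards = np.zeros(num_sequences)
    for t in range(horizon):
        acs = candidate_action_sequences[:, t, :]
        rewards, _ = env.get_reward(predicted_obs, acs)
        total_rewards += rewards
        # Step every candidate forward with the learned dynamics model.
        predicted_obs = model.get_prediction(predicted_obs, acs, data_statistics)
    return total_rewards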
def compute_loss(self, observations, gradients, actions):
    # if self.siren:
    if self.supervision_mode in ['gradient', 'gv']:
        def net(x):
            action_distribution, obs = self(x)
            return action_distribution.rsample()

        prediction_gradients = jacobian(net=net, x=observations,
                                        ac_dim=self.ac_dim)
        if self.supervision_mode == 'gradient':
            loss = self.loss(prediction_gradients,
                             ptu.from_numpy(gradients))
        else:
            # supervision_mode == 'gv': weighted gradient loss plus an
            # action-value (regression) loss.
            action_value_loss = nn.MSELoss()
            predicted_actions = self(observations)[0].rsample()
            loss = self.gradient_loss_scale * self.loss(
                prediction_gradients, ptu.from_numpy(gradients)
            ) + action_value_loss(predicted_actions,
                                  ptu.from_numpy(actions))
    else:
        assert self.supervision_mode == 'value'
        predicted_actions = self(observations)[0].rsample()
        loss = self.loss(predicted_actions, ptu.from_numpy(actions))
    return loss
def update(self, ob_no, targets):
    """
    Update the parameters of the critic.

    let sum_of_path_lengths be the sum of the lengths of the paths sampled
        from Agent.sample_trajectories
    let num_paths be the number of paths sampled from
        Agent.sample_trajectories

    arguments:
        ob_no: shape: (sum_of_path_lengths, ob_dim)
        targets: shape: (sum_of_path_lengths,)

    returns:
        training loss
    """
    targets = ptu.from_numpy(targets).detach()
    for _ in range(self.num_target_updates):
        rand_indices = torch.randperm(targets.shape[0])
        v_ts = self(ptu.from_numpy(ob_no))[rand_indices]
        v_targets = targets[rand_indices]
        value_loss = self.loss(v_ts, v_targets)
        for param in self.critic_network.parameters():
            value_loss += param.pow(2).sum() * self.l2_reg
        self.optimizer.zero_grad()
        value_loss.backward()
        self.optimizer.step()
    return value_loss.item()
def get_prediction(self, obs, acs, data_statistics):
    """
    :param obs: numpy array of observations (s_t)
    :param acs: numpy array of actions (a_t)
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
        - 'obs_mean'
        - 'obs_std'
        - 'acs_mean'
        - 'acs_std'
        - 'delta_mean'
        - 'delta_std'
    :return: a numpy array of the predicted next-states (s_t+1)
    """
    obs = ptu.from_numpy(obs)
    acs = ptu.from_numpy(acs)

    # obs_mean = ptu.from_numpy(data_statistics['obs_mean'])
    # obs_std = ptu.from_numpy(data_statistics['obs_std'])
    # acs_mean = ptu.from_numpy(data_statistics['acs_mean'])
    # acs_std = ptu.from_numpy(data_statistics['acs_std'])
    # delta_mean = ptu.from_numpy(data_statistics['delta_mean'])
    # delta_std = ptu.from_numpy(data_statistics['delta_std'])

    # Note: this relies on `data_statistics` being insertion-ordered exactly as
    # documented in the docstring above.
    self.update_statistics(*data_statistics.values())

    prediction = self(
        obs, acs,
        self.obs_mean, self.obs_std,
        self.acs_mean, self.acs_std,
        self.delta_mean, self.delta_std
    )[0]

    # TODO(Q1) get numpy array of the predicted next-states (s_t+1)
    # Hint: `self(...)` returns a tuple, but you only need to use one of the
    # outputs.
    prediction = ptu.to_numpy(prediction)
    return prediction
def get_prediction(self, obs, acs, data_statistics):
    """
    :param obs: numpy array of observations (s_t)
    :param acs: numpy array of actions (a_t)
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
        - 'obs_mean'
        - 'obs_std'
        - 'acs_mean'
        - 'acs_std'
        - 'delta_mean'
        - 'delta_std'
    :return: a numpy array of the predicted next-states (s_t+1)
    """
    # TODO(Q1) get numpy array of the predicted next-states (s_t+1)
    obs = ptu.from_numpy(obs)
    acs = ptu.from_numpy(acs)
    torch_data_statistics = {k: ptu.from_numpy(v)
                             for k, v in data_statistics.items()}

    prediction = self.forward(obs, acs,
                              torch_data_statistics['obs_mean'],
                              torch_data_statistics['obs_std'],
                              torch_data_statistics['acs_mean'],
                              torch_data_statistics['acs_std'],
                              torch_data_statistics['delta_mean'],
                              torch_data_statistics['delta_std'])[0]

    # Hint: `self(...)` returns a tuple, but you only need to use one of the
    # outputs.
    return ptu.to_numpy(prediction)
def get_prediction(self, obs, acs, data_statistics):
    """
    :param obs: numpy array of observations (s_t)
    :param acs: numpy array of actions (a_t)
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
        - 'obs_mean'
        - 'obs_std'
        - 'acs_mean'
        - 'acs_std'
        - 'delta_mean'
        - 'delta_std'
    :return: a numpy array of the predicted next-states (s_t+1)
    """
    obs = ptu.from_numpy(obs)
    acs = ptu.from_numpy(acs)
    data_statistics = {key: ptu.from_numpy(value)
                       for key, value in data_statistics.items()}

    # get numpy array of the predicted next-states (s_t+1)
    # Hint: `self(...)` returns a tuple, but you only need to use one of the
    # outputs.
    prediction, _ = self(
        obs, acs,
        data_statistics['obs_mean'],
        data_statistics['obs_std'],
        data_statistics['acs_mean'],
        data_statistics['acs_std'],
        data_statistics['delta_mean'],
        data_statistics['delta_std'],
    )
    return prediction.cpu().detach().numpy()
def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
    # TODO Implement the following pseudocode:
    # for agent_params['num_critic_updates_per_agent_update'] steps,
    #     update the critic
    ob_no = ptu.from_numpy(ob_no)
    next_ob_no = ptu.from_numpy(next_ob_no)
    re_n = ptu.from_numpy(re_n)
    ac_na = ptu.from_numpy(ac_na)
    terminal_n = ptu.from_numpy(terminal_n)

    for _ in range(
            self.agent_params['num_critic_updates_per_agent_update']):
        critic_loss = self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                         terminal_n)

    # advantage = estimate_advantage(...)
    advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

    # for agent_params['num_actor_updates_per_agent_update'] steps,
    #     update the actor
    for _ in range(
            self.agent_params['num_actor_updates_per_agent_update']):
        actor_loss = self.actor.update(ob_no, ac_na, advantage)

    loss = OrderedDict()
    loss['Critic_Loss'] = critic_loss
    loss['Actor_Loss'] = actor_loss
    return loss
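# `estimate_advantage` is called above but not shown. A minimal sketch of the usual
# bootstrapped estimate A(s, a) ~= r + gamma * V(s') - V(s), assuming the critic maps
# observation batches to state values; `self.gamma` and the
# `standardize_advantages` flag are assumptions for illustration.
def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
    v_s = self.critic(ob_no).squeeze()
    v_s_prime = self.critic(next_ob_no).squeeze()
    # Cut the bootstrap off at terminal states.
    q_n = re_n + self.gamma * v_s_prime * (1 - terminal_n)
    adv_n = q_n - v_s
    if self.standardize_advantages:
        adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
    return adv_n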
def train_agent(self):
    """
    Sample self.params['train_batch_size'] frames from the agent's replay
    buffer, then train the agent on them. Repeat this for
    self.params['num_agent_train_steps_per_iter'] steps.

    Returns
        - all_logs: the entire training log from this training.
    """
    print('\nTraining agent using sampled data from replay buffer...')
    all_logs = []
    for train_step in range(self.params['num_agent_train_steps_per_iter']):
        # Sample some data from the agent's replay buffer.
        # Sample size is self.params['train_batch_size'].
        ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch \
            = self.agent.sample(self.params['train_batch_size'])

        # Use the sampled data to train the agent.
        train_log = self.agent.train(ptu.from_numpy(ob_batch),
                                     ptu.from_numpy(ac_batch),
                                     ptu.from_numpy(re_batch),
                                     ptu.from_numpy(next_ob_batch),
                                     ptu.from_numpy(terminal_batch))
        all_logs.append(train_log)  # training log for debugging
    return all_logs
def update(self, observations, actions, adv_n=None, acs_labels_na=None,
           qvals=None):
    # TODO: update the policy and return the loss
    # action_predicted = self.get_action(observations)
    # loss = self.loss(action_predicted, actions)
    observations = ptu.from_numpy(observations.astype(np.float32))
    actions = ptu.from_numpy(actions.astype(np.float32))

    pred_action_distribution = self(observations)
    pred_acs = pred_action_distribution.rsample()
    loss = self.loss(pred_acs, actions)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        # You can add extra logging information here, but keep this line
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, adv_n=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    adv_n = ptu.from_numpy(adv_n)

    if self.discrete:
        action_distribution = self.forward(observations)
    else:
        raise NotImplementedError()

    # Recall that the expression that we want to MAXIMIZE
    #   is the expectation over collected trajectories of:
    #   sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # you will want to use the `log_prob` method on the distribution returned
    #   by the `forward` method
    # don't forget that `optimizer.step()` MINIMIZES a loss
    loss = action_distribution.log_prob(actions) * adv_n
    loss = -loss.sum()

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()
def get_prediction(self, obs, acs, data_statistics):
    """
    :param obs: numpy array of observations (s_t)
    :param acs: numpy array of actions (a_t)
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
        - 'obs_mean'
        - 'obs_std'
        - 'acs_mean'
        - 'acs_std'
        - 'delta_mean'
        - 'delta_std'
    :return: a numpy array of the predicted next-states (s_t+1)
    """
    # TODO(Q1) done: get numpy array of the predicted next-states (s_t+1)
    obs = ptu.from_numpy(obs)
    acs = ptu.from_numpy(acs)
    data_statistics = {
        k: ptu.from_numpy(v) for k, v in data_statistics.items()
    }

    prediction, delta_pred_normalized = \
        self.forward(obs, acs,
                     data_statistics['obs_mean'],
                     data_statistics['obs_std'],
                     data_statistics['acs_mean'],
                     data_statistics['acs_std'],
                     data_statistics['delta_mean'],
                     data_statistics['delta_std'])
    return ptu.to_numpy(prediction)
def update(self, observations, actions, adv_n=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(adv_n)

    # TODO: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    #   is the expectation over collected trajectories of:
    #   sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    #   by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    action_distribution = self(observations)
    log_probability = action_distribution.log_prob(actions)
    m = torch.mul(log_probability, advantages)
    loss = torch.sum(m)
    loss = -loss  # because we want to maximize, but self.optimizer minimizes

    # TODO: optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()
def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
    ob_no = ptu.from_numpy(ob_no)
    ac_na = ptu.from_numpy(ac_na).to(torch.long)
    next_ob_no = ptu.from_numpy(next_ob_no)
    reward_n = ptu.from_numpy(reward_n)
    terminal_n = ptu.from_numpy(terminal_n)

    qa_t_values = self.q_net(ob_no)
    q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1)

    qa_tp1_values = self.q_net_target(next_ob_no)
    if self.double_q:
        next_actions = self.q_net(next_ob_no).argmax(dim=1)
        q_tp1 = torch.gather(qa_tp1_values, 1,
                             next_actions.unsqueeze(1)).squeeze(1)
    else:
        q_tp1, _ = qa_tp1_values.max(dim=1)

    target = reward_n + self.gamma * q_tp1 * (1 - terminal_n)
    target = target.detach()

    loss = self.loss(q_t_values, target)
    self.optimizer.zero_grad()
    loss.backward()
    utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping)
    self.optimizer.step()
    return {'Training Loss': ptu.to_numpy(loss)}
def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
    """
    Update the parameters of the critic.

    let sum_of_path_lengths be the sum of the lengths of the paths sampled
        from Agent.sample_trajectories
    let num_paths be the number of paths sampled from
        Agent.sample_trajectories

    arguments:
        ob_no: shape: (sum_of_path_lengths, ob_dim)
        next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation
            after taking one step forward
        reward_n: length: sum_of_path_lengths. Each element in reward_n is a
            scalar containing the reward for each timestep
        terminal_n: length: sum_of_path_lengths. Each element in terminal_n is
            either 1 if the episode ended at that timestep or 0 if the episode
            did not end

    returns:
        a dict containing the training loss
    """
    ob_no = ptu.from_numpy(ob_no)
    ac_na = ptu.from_numpy(ac_na).to(torch.long)
    next_ob_no = ptu.from_numpy(next_ob_no)
    reward_n = ptu.from_numpy(reward_n)
    terminal_n = ptu.from_numpy(terminal_n)
    # print(ob_no)

    qa_t_values = self.q_net(ob_no)
    q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1)

    # TODO compute the Q-values from the target network
    qa_tp1_values = self.q_net_target(next_ob_no)

    if self.double_q:
        # You must fill this part for Q2 of the Q-learning portion of the homework.
        # In double Q-learning, the best action is selected using the Q-network that
        # is being updated, but the Q-value for this action is obtained from the
        # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details.
        next_actions = self.q_net(next_ob_no).argmax(dim=1)
        q_tp1 = torch.gather(qa_tp1_values, 1,
                             next_actions.unsqueeze(1)).squeeze(1)
    else:
        q_tp1, _ = qa_tp1_values.max(dim=1)

    # TODO compute targets for minimizing Bellman error
    # HINT: as you saw in lecture, this would be:
    #   currentReward + self.gamma * qValuesOfNextTimestep * (not terminal)
    target = reward_n + self.gamma * (q_tp1 * (1 - terminal_n))
    target = target.detach()

    assert q_t_values.shape == target.shape
    loss = self.loss(q_t_values, target)

    self.optimizer.zero_grad()
    loss.backward()
    utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping)
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
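# The target network used above has to be synchronized with the online Q-network
# periodically; that step is not shown in the snippet. A minimal sketch, assuming a
# hard (copy) update that the agent triggers every fixed number of training steps.
def update_target_network(self):
    for target_param, param in zip(self.q_net_target.parameters(),
                                   self.q_net.parameters()):
        target_param.data.copy_(param.data)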
def update(self, observations, actions, advantages, q_values=None):
    """ TRPO policy update function """
    self.observations = ptu.from_numpy(observations)
    self.actions = ptu.from_numpy(actions)
    self.advantages = ptu.from_numpy(advantages)

    # Compute the loss that should be optimized when training with policy gradient.
    log_probs = self.logprobs(self.observations, self.actions)
    with torch.no_grad():
        old_log_probs = self.logprobs(self.observations, self.actions)
    loss = self.surrogate_reward(log_probs, old_log_probs)

    # Find the policy gradient with the surrogate objective of TRPO.
    grads = torch.autograd.grad(loss, self.policy_parameters())
    policy_grad = torch.cat([grad.view(-1) for grad in grads]).detach()
    step_dir = self.conjugate_gradient(-policy_grad)
    max_step = torch.sqrt(
        2 * self.max_kl /
        torch.dot(step_dir, self.fisher_vector_product(step_dir)))
    full_step = max_step * step_dir
    expected_improve = torch.dot(-policy_grad, full_step)

    prev_params = ptu.flatten_params(self.policy_parameters()).clone()
    success, new_params = self.line_search(old_log_probs, prev_params,
                                           full_step, expected_improve)
    ptu.assign_params_to(self.policy_parameters(), new_params)
    return loss.item()
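# `self.conjugate_gradient` above approximately solves H x = g, where H-vector
# products come from `self.fisher_vector_product`. A minimal textbook sketch,
# assuming that method exists with this signature; the iteration count and tolerance
# are illustrative choices.
def conjugate_gradient(self, b, n_iters=10, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()
    p = b.clone()
    rdotr = torch.dot(r, r)
    for _ in range(n_iters):
        Ap = self.fisher_vector_product(p)
        alpha = rdotr / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x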
def update(self, observations, actions, advantages=None):
    # update the policy and return the loss
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    #   is the expectation over collected trajectories of:
    #   sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    #   by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    action_dists = self(observations)
    log_probs = action_dists.log_prob(actions)
    loss = -torch.sum(log_probs * advantages)

    # optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    assert not self.nn_baseline
    return loss.item()
def td_error(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
    # Calculate the temporal-difference error.
    ob_no = ptu.from_numpy(ob_no)
    ac_na = ptu.from_numpy(ac_na).to(torch.long)
    next_ob_no = ptu.from_numpy(next_ob_no)
    reward_n = ptu.from_numpy(reward_n)
    terminal_n = ptu.from_numpy(terminal_n)

    qa_t_values = self.q_net(ob_no)
    q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1)

    # TODO compute the Q-values from the target network
    qa_tp1_values = self.q_net_target(next_ob_no)

    if self.double_q:
        # You must fill this part for Q2 of the Q-learning portion of the homework.
        # In double Q-learning, the best action is selected using the Q-network that
        # is being updated, but the Q-value for this action is obtained from the
        # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details.
        _, selected_action = self.q_net(next_ob_no).max(1)
        selected_action = selected_action.unsqueeze(1)
        q_tp1 = qa_tp1_values.gather(1, selected_action).squeeze()
    else:
        q_tp1, _ = qa_tp1_values.max(dim=1)

    # TODO compute targets for minimizing Bellman error
    # HINT: as you saw in lecture, this would be:
    #   currentReward + self.gamma * qValuesOfNextTimestep * (not terminal)
    target = reward_n + self.gamma * q_tp1 * (1 - terminal_n)
    target = target.detach()

    assert q_t_values.shape == target.shape
    difference = q_t_values - target
    return ptu.to_numpy(difference)
def update(self, observations, actions, adv_n=None, acs_labels_na=None,
           qvals=None):
    # TODO: update the policy and return the loss

    # Zero out the gradients to prevent gradient accumulation before calling
    # loss.backward().
    self.optimizer.zero_grad()

    # Forward propagation of the NN, i.e. observation -> action taken by the policy.
    # Q: why not use self._get_action(observations)?
    pred_actions = self.forward(ptu.from_numpy(observations))

    # Q: how to understand loss.forward?
    loss = self.loss.forward(pred_actions, ptu.from_numpy(actions))  # loss(y_hat, y)
    loss.backward()  # back-propagation: compute gradients
    self.optimizer.step()  # update parameters

    return {
        # You can add extra logging information here, but keep this line
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, advantages, q_values=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # TODO: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    #   is the expectation over collected trajectories of:
    #   sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    #   by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    N = observations.shape[0]
    self.optimizer.zero_grad()
    loss = 0
    for i in range(N):
        if self.discrete or self.ac_dim == 1:
            log_prob = self(observations[i]).log_prob(actions[i])
        else:
            log_prob = self(observations[i]).log_prob(actions[i]).sum()
        adv = advantages[i]
        loss += adv * log_prob
    loss = -loss / N

    # TODO: optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
        ## HINT: there is a `normalize` function in `infrastructure.utils`
        targets = normalize(q_values, q_values.mean(), q_values.std())
        targets = ptu.from_numpy(targets)

        ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
        baseline_predictions = self.baseline(observations)

        ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
        ## [ N ] versus shape [ N x 1 ]
        ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
        baseline_predictions.squeeze_()
        assert baseline_predictions.shape == targets.shape

        # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
        # HINT: use `F.mse_loss`
        baseline_loss = F.mse_loss(baseline_predictions, targets)

        # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
        # HINT: remember to `zero_grad` first
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
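# The `normalize` helper used for the baseline targets above is not shown. A
# plausible implementation consistent with how it is called here (shift and rescale
# by the supplied statistics); the epsilon and exact signature are assumptions.
def normalize(data, mean, std, eps=1e-8):
    # Map `data` to zero mean and unit standard deviation.
    return (data - mean) / (std + eps)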
def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
    """
    Update the parameters of the critic.

    let sum_of_path_lengths be the sum of the lengths of the paths sampled
        from Agent.sample_trajectories
    let num_paths be the number of paths sampled from
        Agent.sample_trajectories

    arguments:
        ob_no: shape: (sum_of_path_lengths, ob_dim)
        next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation
            after taking one step forward
        reward_n: length: sum_of_path_lengths. Each element in reward_n is a
            scalar containing the reward for each timestep
        terminal_n: length: sum_of_path_lengths. Each element in terminal_n is
            either 1 if the episode ended at that timestep or 0 if the episode
            did not end

    returns:
        training loss
    """
    # TODO: Implement the pseudocode below: do the following
    # (self.num_grad_steps_per_target_update * self.num_target_updates) times:
    #     every self.num_grad_steps_per_target_update steps (which includes the
    #     first step), recompute the target values by
    #         a) calculating V(s') by querying the critic with next_ob_no
    #         b) and computing the target values as r(s, a) + gamma * V(s')
    #     every time, update this critic using the observations and targets
    #
    # HINT: don't forget to use terminal_n to cut off the V(s') (i.e. set it
    # to 0) when a terminal state is reached
    for k in range(self.num_grad_steps_per_target_update *
                   self.num_target_updates):
        self.optimizer.zero_grad()
        if k % self.num_grad_steps_per_target_update == 0:
            # Calculate V(s').
            vs_prime = self.forward(ptu.from_numpy(next_ob_no))
            vs_prime = ptu.to_numpy(vs_prime)
            # print('reward', type(reward_n))
            # print('vs_prime', type(vs_prime))
            # print('term', type(terminal_n))
            target_values = reward_n + self.gamma * vs_prime * (1 - terminal_n)
            target_values = ptu.from_numpy(target_values)
            # The targets should not carry gradients.
            target_values = target_values.detach()
        preds = self.forward(ptu.from_numpy(ob_no))
        loss = self.loss(target_values, preds)
        loss.backward()
        self.optimizer.step()

    # HINT: make sure to squeeze the output of the critic_network to ensure
    # that its dimensions match the reward
    return loss.item()
def update(self, observations, actions, advantages, q_values=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # TODO: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    #   is the expectation over collected trajectories of:
    #   sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    #   by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    action_dist = self.forward(observations)
    log_pi = action_dist.log_prob(actions)
    # print(observations.shape)  # debug
    loss = -torch.sum(log_pi * advantages)

    # TODO: optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
        ## HINT: there is a `normalize` function in `infrastructure.utils`
        if q_values is None:
            # Fall back to the advantages if no Q-value targets were passed in.
            q_values = ptu.to_numpy(advantages)
        targets = utils.normalize(q_values, np.mean(q_values), np.std(q_values))
        targets = ptu.from_numpy(targets)

        ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
        baseline_predictions = self.baseline.forward(observations).squeeze(1)

        ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
        ## [ N ] versus shape [ N x 1 ]
        ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
        assert baseline_predictions.shape == targets.shape, \
            f"shapes do not match, pred_shape: {baseline_predictions.shape} \t " \
            f"target shape {targets.shape}"

        # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
        # HINT: use `F.mse_loss`
        baseline_loss = F.mse_loss(baseline_predictions, targets)

        # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
        # HINT: remember to `zero_grad` first
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
def get_action(self, obs: np.ndarray) -> np.ndarray:
    # get this from Piazza
    if len(obs.shape) > 1:
        observation = ptu.from_numpy(obs)
    else:
        observation = ptu.from_numpy(obs[None])
    # Return the action that the policy prescribes.
    return ptu.to_numpy(self(observation).sample())
def __init__(self,
             ac_dim,
             ob_dim,
             n_layers,
             size,
             discrete=False,
             learning_rate=1e-4,
             training=True,
             nn_baseline=False,
             **kwargs):
    super().__init__(**kwargs)

    # init vars
    self.ac_dim = ac_dim
    self.ob_dim = ob_dim
    self.n_layers = n_layers
    self.discrete = discrete
    self.size = size
    self.learning_rate = learning_rate
    self.training = training
    self.nn_baseline = nn_baseline

    if self.discrete:
        self.logits_na = ptu.build_mlp(
            input_size=self.ob_dim,
            output_size=self.ac_dim,
            n_layers=self.n_layers,
            size=self.size,
        )
        self.logits_na.to(ptu.device)
        self.mean_net = None
        self.logstd = None
        self.optimizer = optim.Adam(
            self.logits_na.parameters(),
            self.learning_rate
        )
    else:
        self.logits_na = None
        self.mean_net = ptu.build_mlp(
            input_size=self.ob_dim,
            output_size=self.ac_dim,
            n_layers=self.n_layers,
            size=self.size,
        )
        self.mean_net.to(ptu.device)
        # TODO: shouldn't logstd also be a NN?
        self.logstd = nn.Parameter(torch.zeros(
            self.ac_dim, dtype=torch.float32, device=ptu.device
        ))
        self.optimizer = optim.Adam(
            itertools.chain([self.logstd], self.mean_net.parameters()),
            self.learning_rate
        )

    self.normal_dist = distributions.Normal(
        ptu.from_numpy(0.0),
        ptu.from_numpy(1.0)
    )
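# The update methods earlier in this file call `self.forward(observations)` and
# expect a torch distribution back (with `log_prob`, `sample`, `rsample`). A minimal
# sketch of a matching `forward`, assuming only the attributes defined in the
# constructor above; using an independent Normal per action dimension is an
# assumption for illustration.
def forward(self, observation: torch.FloatTensor):
    if self.discrete:
        # Categorical over the discrete action set.
        logits = self.logits_na(observation)
        return distributions.Categorical(logits=logits)
    else:
        # Diagonal Gaussian with a state-independent log-std parameter.
        mean = self.mean_net(observation)
        std = torch.exp(self.logstd)
        return distributions.Normal(mean, std)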
def update(self, observations, actions, adv_n=None):
    # TODO: update the policy and return the loss
    dist = self(ptu.from_numpy(observations))
    logp = dist.log_prob(ptu.from_numpy(actions))
    loss = -(logp * ptu.from_numpy(adv_n)).sum()

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()