def update(self, ob_no, ac_na, reward_n, next_ob_no, terminal_n):
    # Everything should still be numpy arrays at this point; convert to
    # tensors and move to the critic's device.
    ob_no = np.array(ob_no)
    next_ob_no = np.array(next_ob_no)
    ac_na = ptu.from_numpy(ac_na).to(torch.long).to(self.device)
    reward_n = ptu.from_numpy(reward_n).to(self.device)
    terminal_n = ptu.from_numpy(terminal_n).to(self.device)

    # Q(s, a): pick out the Q-value of the action actually taken.
    q = torch.gather(self.q_net(ob_no), 1, ac_na.unsqueeze(1)).squeeze()

    # Double-DQN target: the online network selects the greedy next action,
    # the frozen target network evaluates it.
    ac_qmax = torch.argmax(self.q_net(next_ob_no), dim=1).unsqueeze(1)
    q_target = self.q_net_target(next_ob_no)
    q_target_plug_in = q_target.gather(1, ac_qmax).squeeze()

    # Bootstrap only on non-terminal transitions. There is no explicit
    # discount factor here (gamma is effectively 1). Detach the whole target
    # so no gradient flows into it.
    target = (reward_n + q_target_plug_in * torch.logical_not(terminal_n)).detach()

    loss = self.loss(q, target)
    self.optimizer.zero_grad()
    loss.backward()
    # Note: clip_grad_value_ clips by value, despite the attribute's name.
    utils.clip_grad_value_(self.q_decoder.parameters(), self.grad_norm_clipping)
    self.optimizer.step()
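# The target above follows the double-DQN recipe: the online network picks the
# greedy next action and the frozen target network scores it. A minimal
# standalone sketch with dummy tensors (the function and variable names below
# are illustrative, not from this codebase), again with an implicit gamma of 1:
import torch

def double_dqn_target(q_online_next, q_target_next, reward, terminal):
    # q_online_next, q_target_next: (N, num_actions); reward, terminal: (N,)
    greedy_a = q_online_next.argmax(dim=1, keepdim=True)      # chosen by online net
    bootstrap = q_target_next.gather(1, greedy_a).squeeze(1)  # scored by target net
    return (reward + bootstrap * torch.logical_not(terminal)).detach()

# Two transitions, three actions; the second transition is terminal.
q_on = torch.tensor([[1.0, 2.0, 0.5], [0.1, 0.2, 0.3]])
q_tg = torch.tensor([[0.9, 1.5, 0.4], [0.0, 0.1, 0.2]])
r = torch.tensor([1.0, -1.0])
done = torch.tensor([False, True])
print(double_dqn_target(q_on, q_tg, r, done))  # tensor([ 2.5000, -1.0000])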
def updateActors(self, trajectories):
    self.decoder.optimizer.zero_grad()
    self.encoder.optimizer.zero_grad()
    loss = torch.zeros(1)
    for t in trajectories:
        obs = t.observations
        decoder_input = ptu.from_numpy(obs[0]).long()[:, :, 0]
        decoder_hidden = ptu.from_numpy(obs[1])
        encoder_padded = ptu.from_numpy(obs[2]).squeeze()

        # Actions may arrive squeezed; reshape to 1 x N so the Categorical
        # distribution computes log-probs element-wise instead of
        # broadcasting into a matrix. (A normal batch would be N x 1.)
        acs = ptu.from_numpy(t.actions)
        acs = torch.reshape(acs, (1, -1))

        action_distribution, _, _, _ = self.decoder(
            decoder_input, decoder_hidden, encoder_padded)
        neg_log_prob = torch.squeeze(-action_distribution.log_prob(acs))

        # Causality trick: weight each step's log-prob by the reward-to-go,
        # i.e. a reversed cumulative sum of the trajectory's rewards.
        causality_cumsum = np.flip(np.cumsum(np.flip(t.rewards))).copy()
        traj_reward = ptu.from_numpy(causality_cumsum)
        loss += torch.dot(neg_log_prob, traj_reward)

    loss.backward()
    self.decoder.optimizer.step()
    self.encoder.optimizer.step()
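# The flipped cumulative sum above is the reward-to-go: step t is weighted only
# by rewards from t onward, not by the full trajectory return. A minimal numpy
# sketch with illustrative values:
import numpy as np

rewards = np.array([1.0, 0.0, 2.0, 3.0])
reward_to_go = np.flip(np.cumsum(np.flip(rewards))).copy()
print(reward_to_go)  # [6. 5. 5. 3.]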
def q_net(self, ob):
    # ob is an object array whose rows hold per-step observation tuples;
    # unpack the encoder output, decoder hidden state, and decoder input.
    encoder_padded = ptu.from_numpy(np.array(ob[:, 1].tolist()).astype(np.float32))[:, 0, :, :]
    decoder_hidden = ptu.from_numpy(np.array(ob[:, 2].tolist()).astype(np.float32))[:, 0, :, :]
    decoder_input = ptu.from_numpy(np.array(ob[:, 3].tolist()).astype(np.float32)).long()

    decoder_input = decoder_input.to(self.device)
    decoder_hidden = decoder_hidden.to(self.device)
    encoder_padded = encoder_padded.to(self.device)

    output, _, _ = self.q_decoder(decoder_input, decoder_hidden, encoder_padded)
    return output.squeeze()
def get_action_distribution(self, ob):
    # Rows of ob are 5-field observation tuples; unpack fields 1-3.
    ob = np.array(ob, dtype=object).reshape(-1, 5)
    encoder_padded = ptu.from_numpy(np.array(ob[:, 1].tolist()).astype(np.float32))[:, 0, :, :]
    decoder_hidden = ptu.from_numpy(np.array(ob[:, 2].tolist()).astype(np.float32))[:, 0, :, :]
    decoder_input = ptu.from_numpy(np.array(ob[:, 3].tolist()).astype(np.float32)).long()

    encoder_padded = encoder_padded.to(device)
    decoder_hidden = decoder_hidden.to(device)
    decoder_input = decoder_input.to(device)

    output, _, _ = self.action_decoder(decoder_input, decoder_hidden, encoder_padded)
    # Softmax over the logits of the first decoding step.
    prob = F.softmax(output[:, 0, :], dim=1)
    action_distribution = torch.distributions.Categorical(probs=prob)
    return action_distribution, prob
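# Categorical wraps the first decoding step's softmax so the actor can sample
# a discrete action and score it; a minimal sketch with a dummy probability row:
import torch

prob = torch.tensor([[0.1, 0.7, 0.2]])
dist = torch.distributions.Categorical(probs=prob)
action = dist.sample()        # e.g. tensor([1])
print(dist.log_prob(action))  # log pi(a | s), reused in the actor update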
def get_baseline(self, ob):
    # Only the decoder hidden state (field 2) feeds the baseline network.
    ob = np.array(ob, dtype=object).reshape(-1, 5)
    decoder_hidden = ptu.from_numpy(np.array(ob[:, 2].tolist()).astype(np.float32))[:, 0, :, :]
    decoder_hidden = decoder_hidden.to(device)
    value = self.baseline_decoder(decoder_hidden).squeeze()
    return value
def update(self, observations, actions, advantages, q_values=None):
    actions = ptu.from_numpy(actions)
    advantages = torch.squeeze(ptu.from_numpy(advantages))

    # Policy loss: sum over the batch of -log pi(a|s) weighted by the advantage.
    action_distribution, probs = self.get_action_distribution(observations)
    negative_loglikelihood_predicted = -action_distribution.log_prob(actions)
    loss = torch.dot(negative_loglikelihood_predicted.squeeze(), advantages)

    self.action_decoder.optimizer.zero_grad()
    loss.backward()
    self.action_decoder.optimizer.step()

    # Baseline regression: fit V(s) to the standardized Q-value targets.
    targets = ptu.normalize(q_values, np.mean(q_values), np.std(q_values))
    targets = ptu.from_numpy(targets)
    baseline_predictions = self.get_baseline(observations)
    baseline_loss = self.baseline_loss(baseline_predictions, targets)

    self.baseline_optimizer.zero_grad()
    baseline_loss.backward()
    self.baseline_optimizer.step()
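# The baseline targets above are the Q-values standardized to zero mean and
# unit std; a minimal numpy sketch of what ptu.normalize is assumed to do
# (the epsilon guarding against a zero std is an illustrative detail):
import numpy as np

q_values = np.array([10.0, 12.0, 8.0, 14.0])
targets = (q_values - np.mean(q_values)) / (np.std(q_values) + 1e-8)
print(targets)  # roughly [-0.45, 0.45, -1.34, 1.34]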