def update_target(self):
    if 'hard' in self.target_update_mode:
        util.hard_update(self.target_actor, self.actor)
        util.hard_update(self.target_critic, self.critic)
        # pop-art: copy the running return statistics along with the weights
        self.target_y_mean = self.y_mean
        self.target_y_square_mean = self.y_square_mean
    else:
        util.soft_update(self.target_actor, self.actor, self.tau)
        util.soft_update(self.target_critic, self.critic, self.tau)
        # not sure how to update pop-art w.r.t. soft update; here the
        # statistics are blended with the same tau as the weights
        self.target_y_mean = self.target_y_mean * (
            1.0 - self.tau) + self.y_mean * self.tau
        self.target_y_square_mean = self.target_y_square_mean * (
            1.0 - self.tau) + self.y_square_mean * self.tau
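# util.hard_update and util.soft_update are not shown in this section. A
# minimal sketch of what they are assumed to do, with the argument order
# matching the calls above (target network first, then the online network):
import torch


def hard_update(target, source):
    # overwrite every target parameter with the corresponding source parameter
    with torch.no_grad():
        for t, s in zip(target.parameters(), source.parameters()):
            t.copy_(s)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    with torch.no_grad():
        for t, s in zip(target.parameters(), source.parameters()):
            t.mul_(1.0 - tau).add_(tau * s)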
def optimize_model():
    '''
    the actual reinforcement learning stuff, adapted from:
    https://discuss.pytorch.org/t/correct-way-to-do-backpropagation-through-time/11701/2
    https://github.com/fshamshirdar/pytorch-rdpg/blob/master/rdpg.py
    https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
    https://github.com/seungeunrho/minimalRL/blob/master/ddpg.py

    modified to support models with multiple inputs (really, it's specific to
    this problem with an image input and vector input).
    '''
    if len(memory) < nb_warmup_steps and len(memory) != memory_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    # setting up tensors
    state_batch = torch.from_numpy(np.concatenate(batch.state)).to(device)
    action_batch = torch.from_numpy(np.stack(batch.action)).to(device)
    reward_batch = torch.stack(batch.reward).to(device).double()
    non_final_next_state_batch = torch.from_numpy(
        np.concatenate(batch.next_state)).to(device)

    with torch.no_grad():
        # no grad because these are target networks
        target_actions = mu_target(non_final_next_state_batch)
        next_state_values = Q_target(non_final_next_state_batch, target_actions)

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # critic update
    Q_optim.zero_grad()
    state_action_values = Q(state_batch, action_batch)
    Q_loss = F.smooth_l1_loss(state_action_values,
                              expected_state_action_values.detach())
    writer.add_scalar('Q_loss', Q_loss.item(), global_step=steps_done)
    Q_loss.backward()
    Q_optim.step()
    del Q_loss

    # actor update
    mu_optim.zero_grad()
    # mu_loss and state_action_values should be nearly identical since we
    # are using the same Q function for both, and actions were selected
    # with the same policy. for RSVG specifically, they will be different
    # because actions are randomly rather than deterministically sampled
    mu_loss = -Q(state_batch, mu(state_batch)).mean()
    writer.add_scalar('mu_loss', mu_loss.item(), global_step=steps_done)
    mu_loss.backward()
    mu_optim.step()
    del mu_loss

    # for m, (name, param) in enumerate(mu.named_parameters()):
    #     if m == 0:
    #         print('name: ', name)
    #         param_scale = np.linalg.norm(param.data.cpu().view(-1))
    #         update = param.grad.data * lr_mu
    #         update_scale = np.linalg.norm(update.cpu().view(-1))
    #         print('param_scale: ', param_scale)
    #         print('update_scale: ', update_scale)
    #         print('ratio: ', update_scale / param_scale)

    soft_update(mu, mu_target, target_update)
    soft_update(Q, Q_target, target_update)
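# The replay buffer used above (memory, Transition, memory.sample) is not
# defined in this section. A minimal sketch, assuming it follows the
# namedtuple-based buffer from the PyTorch DQN tutorial (the class and method
# names here are illustrative, not necessarily the original ones):
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state'))


class ReplayMemory:
    def __init__(self, capacity):
        # oldest transitions are discarded once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state):
        self.buffer.append(Transition(state, action, reward, next_state))

    def sample(self, batch_size):
        # uniform random minibatch, consumed above via Transition(*zip(*transitions))
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)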
def optimize_model():
    '''
    the actual reinforcement learning stuff, adapted from:
    https://discuss.pytorch.org/t/correct-way-to-do-backpropagation-through-time/11701/2
    https://github.com/fshamshirdar/pytorch-rdpg/blob/master/rdpg.py
    https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
    https://github.com/seungeunrho/minimalRL/blob/master/ddpg.py
    '''
    if len(memory) < nb_warmup_steps:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    # setting up tensors
    state_batch = torch.from_numpy(np.concatenate(batch.state)).to(device)
    action_batch = torch.from_numpy(
        np.concatenate(batch.action)).unsqueeze(1).double().to(device)
    reward_batch = torch.from_numpy(
        np.concatenate(batch.reward)).unsqueeze(1).double().to(device)
    next_state_batch = torch.from_numpy(np.concatenate(batch.next_state)).to(device)

    # print('state_batch: ', state_batch.shape)
    # print('action_batch: ', action_batch.shape)
    # print('reward_batch: ', reward_batch.shape)
    # print('next_state_batch: ', next_state_batch.shape)
    # print('state_batch: ', state_batch)
    # print('action_batch: ', action_batch)
    # print('reward_batch: ', reward_batch)
    # print('next_state_batch: ', next_state_batch)
    # print('diff: ', next_state_batch - state_batch)

    if use_double_dqn:
        # double DQN: select the next action with the policy net,
        # evaluate it with the target net
        next_state_actions = policy_net(next_state_batch).max(1)[1].unsqueeze(1)
        next_state_values = target_net(next_state_batch).gather(
            1, next_state_actions).detach()
    else:
        next_state_values = target_net(next_state_batch).max(1)[0].unsqueeze(1).detach()
    # print('next_state_values: ', next_state_values.shape)

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch
    # print('expected_state_action_values: ', expected_state_action_values.shape)

    state_action_values = policy_net(state_batch).gather(1, action_batch.long())
    # print('state_action_values: ', state_action_values.shape)

    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.detach())
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)  # is gradient clamping applicable to this problem?
    optimizer.step()

    writer.add_scalar('loss', loss.item(), global_step=steps_done)
    writer.add_scalar('avg_q', state_action_values.mean().item(),
                      global_step=steps_done)
    del loss

    soft_update(policy_net, target_net, target_update)
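# Regarding the "is gradient clamping applicable to this problem?" question in
# optimize_model above: the element-wise clamp_ loop follows the PyTorch DQN
# tutorial. A common alternative (a sketch, not part of the original code; the
# max_norm of 1.0 is an arbitrary choice) is to clip the global gradient norm
# instead, which rescales rather than truncates and so preserves the gradient
# direction:
def clip_policy_gradients(net, max_norm=1.0):
    # rescale the gradients of `net` so their combined L2 norm is at most max_norm
    torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm)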
def optimize_model():
    '''
    the actual reinforcement learning stuff, adapted from:
    https://discuss.pytorch.org/t/correct-way-to-do-backpropagation-through-time/11701/2
    https://github.com/fshamshirdar/pytorch-rdpg/blob/master/rdpg.py
    https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
    https://github.com/seungeunrho/minimalRL/blob/master/ddpg.py
    '''
    if len(memory) < nb_warmup_steps and len(memory) != memory_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    # setting up tensors
    state_batch = torch.from_numpy(np.concatenate(batch.state)).to(device)
    action_batch = torch.from_numpy(np.stack(batch.action)).to(device)
    reward_batch = torch.from_numpy(np.stack(batch.reward)).to(device).double()
    non_final_next_state_batch = torch.from_numpy(
        np.concatenate(batch.next_state)).to(device)

    # print('shapes!')
    # print('state_batch: ', state_batch.shape)
    # print('action_batch: ', action_batch.shape)
    # print('reward_batch: ', reward_batch.shape)
    # print('non_final_next_state_batch: ', non_final_next_state_batch.shape)

    with torch.no_grad():
        # no grad because these are target networks
        target_actions = mu_target(non_final_next_state_batch)
        next_state_values = Q_target(non_final_next_state_batch, target_actions)

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # critic update
    Q_optim.zero_grad()
    state_action_values = Q(state_batch, action_batch)
    Q_loss = F.smooth_l1_loss(state_action_values,
                              expected_state_action_values.detach())
    writer.add_scalar('Q_loss', Q_loss.item(), global_step=steps_done)
    Q_loss.backward()
    Q_optim.step()
    del Q_loss

    # actor update
    mu_optim.zero_grad()
    # mu_loss and state_action_values should be nearly identical since we
    # are using the same Q function for both, and actions were selected
    # with the same policy.
    mu_loss = -Q(state_batch, mu(state_batch)).mean()
    writer.add_scalar('mu_loss', mu_loss.item(), global_step=steps_done)
    mu_loss.backward()
    mu_optim.step()
    del mu_loss

    soft_update(mu, mu_target, target_update)
    soft_update(Q, Q_target, target_update)
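# The target networks used throughout (mu_target, Q_target, and likewise
# target_net for the DQN variant) are assumed to start as exact copies of
# their online counterparts. A minimal sketch of that setup, assuming mu and Q
# are already-constructed nn.Module instances:
import copy

mu_target = copy.deepcopy(mu)
Q_target = copy.deepcopy(Q)
# the target networks are only evaluated inside torch.no_grad(), never trained
for p in list(mu_target.parameters()) + list(Q_target.parameters()):
    p.requires_grad_(False)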