Example #1
def update_target(self):
    if 'hard' in self.target_update_mode:
        util.hard_update(self.target_actor, self.actor)
        util.hard_update(self.target_critic, self.critic)
        # pop-art
        self.target_y_mean = self.y_mean
        self.target_y_square_mean = self.y_square_mean
    else:
        util.soft_update(self.target_actor, self.actor, self.tau)
        util.soft_update(self.target_critic, self.critic, self.tau)
        # not sure how to update pop-art w.r.t. soft update
        self.target_y_mean = self.target_y_mean * (
            1.0 - self.tau) + self.y_mean * self.tau
        self.target_y_square_mean = self.target_y_square_mean * (
            1.0 - self.tau) + self.y_square_mean * self.tau
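
util.hard_update and util.soft_update are not shown in this example. The sketch below is an assumption about what they do, following the usual DDPG convention (full copy vs. Polyak averaging toward the online network); the function bodies are not from the original code.

import torch

def hard_update(target, source):
    # Assumed helper: copy source parameters into the target network.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(s_param)

def soft_update(target, source, tau):
    # Assumed helper: Polyak averaging, target <- (1 - tau) * target + tau * source.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)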
Example #2
def optimize_model():
    '''
    the actual reinforcement learning stuff, adapted from:
        https://discuss.pytorch.org/t/correct-way-to-do-backpropagation-through-time/11701/2
        https://github.com/fshamshirdar/pytorch-rdpg/blob/master/rdpg.py
        https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
        https://github.com/seungeunrho/minimalRL/blob/master/ddpg.py

    modified to support models with multiple inputs (really, it's specific to this
    problem with an image input and vector input).
    '''
    if len(memory) < nb_warmup_steps and len(memory) != memory_size:
        return

    transitions = memory.sample(batch_size)

    batch = Transition(*zip(*transitions))

    # setting up tensors
    state_batch = torch.from_numpy(np.concatenate(batch.state)).to(device)

    action_batch = torch.from_numpy(np.stack(batch.action)).to(device)
    reward_batch = torch.stack(batch.reward).to(device).double()

    non_final_next_state_batch = torch.from_numpy(
        np.concatenate(batch.next_state)).to(device)

    with torch.no_grad():  # no grad because these are target networks
        target_actions = mu_target(non_final_next_state_batch)
        next_state_values = Q_target(non_final_next_state_batch,
                                     target_actions)

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # critic update
    Q_optim.zero_grad()

    state_action_values = Q(state_batch, action_batch)

    Q_loss = F.smooth_l1_loss(state_action_values,
                              expected_state_action_values.detach())
    writer.add_scalar('Q_loss', Q_loss.item(), global_step=steps_done)

    Q_loss.backward()
    Q_optim.step()

    del Q_loss

    # actor update
    mu_optim.zero_grad()

    # mu_loss and state_action_values should be nearly identical since we
    # are using the same Q function for both, and actions were selected
    # with the same policy. for RSVG specifically, they will be different
    # because actions are randomly rather than deterministically sampled
    mu_loss = -Q(state_batch, mu(state_batch)).mean()
    writer.add_scalar('mu_loss', mu_loss.item(), global_step=steps_done)

    mu_loss.backward()
    mu_optim.step()

    del mu_loss

    # for m, (name, param) in enumerate(mu.named_parameters()):
    #     if m == 0:
    #         print('name: ', name)
    #         param_scale = np.linalg.norm(param.data.cpu().view(-1))
    #         update = param.grad.data * lr_mu
    #         update_scale = np.linalg.norm(update.cpu().view(-1))
    #         print('param_scale: ', param_scale)
    #         print('update_scale: ', update_scale)
    #         print('ratio: ', update_scale / param_scale)

    soft_update(mu, mu_target, target_update)
    soft_update(Q, Q_target, target_update)
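
The Transition namedtuple and the memory object sampled above are not shown. Below is a minimal sketch of the replay buffer they are assumed to correspond to, modeled on the PyTorch DQN tutorial linked in the docstring; the field order and the capacity handling are assumptions, chosen to match the batch.state / batch.action / batch.reward / batch.next_state accesses in the code.

import random
from collections import namedtuple, deque

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

class ReplayMemory:
    def __init__(self, capacity):
        # Oldest transitions are dropped automatically once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)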
Example #3
def optimize_model():
    '''
    the actual reinforcement learning stuff, adapted from:
        https://discuss.pytorch.org/t/correct-way-to-do-backpropagation-through-time/11701/2
        https://github.com/fshamshirdar/pytorch-rdpg/blob/master/rdpg.py
        https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
        https://github.com/seungeunrho/minimalRL/blob/master/ddpg.py
    '''
    if len(memory) < nb_warmup_steps:
        return

    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))


    # setting up tensors
    state_batch = torch.from_numpy(np.concatenate(batch.state)).to(device)

    action_batch = torch.from_numpy(np.concatenate(batch.action)).unsqueeze(1).double().to(device)
    reward_batch = torch.from_numpy(np.concatenate(batch.reward)).unsqueeze(1).double().to(device)

    next_state_batch = torch.from_numpy(np.concatenate(batch.next_state)).to(device)

    # print('state_batch: ', state_batch.shape)
    # print('action_batch: ', action_batch.shape)
    # print('reward_batch: ', reward_batch.shape)
    # print('next_state_batch: ', next_state_batch.shape)

    # print('state_batch: ', state_batch)
    # print('action_batch: ', action_batch)
    # print('reward_batch: ', reward_batch)
    # print('next_state_batch: ', next_state_batch)
    # print('diff: ', next_state_batch - state_batch)

    if use_double_dqn:
        next_state_actions = policy_net(next_state_batch).max(1)[1].unsqueeze(1)
        next_state_values = target_net(next_state_batch).gather(1, next_state_actions).detach()
    else:
        next_state_values = target_net(next_state_batch).max(1)[0].unsqueeze(1).detach()

    # print('next_state_values: ', next_state_values.shape)
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch
    # print('expected_state_action_values: ', expected_state_action_values.shape)

    state_action_values = policy_net(state_batch).gather(1, action_batch.long())
    # print('state_action_values: ', state_action_values.shape)

    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.detach())

    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1) # is gradient clamping applicable to this problem?
    optimizer.step()

    writer.add_scalar('loss', loss.item(), global_step=steps_done)
    writer.add_scalar('avg_q', state_action_values.mean().item(), global_step=steps_done)

    del loss

    soft_update(policy_net, target_net, target_update)
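
The per-element clamp above follows the linked DQN tutorial. A commonly used alternative is norm-based clipping, which rescales the whole gradient vector instead of clipping each component independently; a sketch of what that would look like here (the max_norm value of 1.0 is an arbitrary assumption):

optimizer.zero_grad()
loss.backward()
# Scale the full gradient so its L2 norm is at most max_norm,
# rather than clamping each element to [-1, 1].
torch.nn.utils.clip_grad_norm_(policy_net.parameters(), max_norm=1.0)
optimizer.step()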
Example #4
def optimize_model():
    '''
    the actual reinforcement learning stuff, adapted from:
        https://discuss.pytorch.org/t/correct-way-to-do-backpropagation-through-time/11701/2
        https://github.com/fshamshirdar/pytorch-rdpg/blob/master/rdpg.py
        https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
        https://github.com/seungeunrho/minimalRL/blob/master/ddpg.py
    '''
    if len(memory) < nb_warmup_steps and len(memory) != memory_size:
        return

    transitions = memory.sample(batch_size)

    batch = Transition(*zip(*transitions))

    # setting up tensors
    state_batch = torch.from_numpy(np.concatenate(batch.state)).to(device)

    action_batch = torch.from_numpy(np.stack(batch.action)).to(device)
    reward_batch = torch.from_numpy(np.stack(batch.reward)).to(device).double()

    non_final_next_state_batch = torch.from_numpy(np.concatenate(batch.next_state)).to(device)

    # print('shapes!')
    # print('state_batch: ', state_batch.shape)
    # print('action_batch: ', action_batch.shape)
    # print('reward_batch: ', reward_batch.shape)
    # print('non_final_next_state_batch: ', non_final_next_state_batch.shape)

    with torch.no_grad(): # no grad because these are target networks
        target_actions = mu_target(non_final_next_state_batch)
        next_state_values = Q_target(non_final_next_state_batch, target_actions)

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # critic update
    Q_optim.zero_grad()

    state_action_values = Q(state_batch, action_batch)

    Q_loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.detach())
    writer.add_scalar('Q_loss', Q_loss.item(), global_step=steps_done)

    Q_loss.backward()
    Q_optim.step()

    del Q_loss

    # actor update
    mu_optim.zero_grad()

    # mu_loss and state_action_values should be nearly identical since we
    # are using the same Q function for both, and actions were selected
    # with the same policy.
    mu_loss = -Q(state_batch, mu(state_batch)).mean()
    writer.add_scalar('mu_loss', mu_loss.item(), global_step=steps_done)

    mu_loss.backward()
    mu_optim.step()

    del mu_loss

    soft_update(mu, mu_target)
    soft_update(Q, Q_target)
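
The mu and Q networks are not defined in this example. Below is a minimal sketch of the actor/critic interfaces implied by the calls mu(state_batch) and Q(state_batch, action_batch); the class names, layer sizes, and activations are assumptions, not the original architecture.

import torch
import torch.nn as nn

class Actor(nn.Module):
    # Assumed actor: maps a state to a deterministic action, mu(s).
    def __init__(self, state_dim, action_dim, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, action_dim), nn.Tanh(),  # actions assumed bounded in [-1, 1]
        )

    def forward(self, state):
        return self.net(state)

class Critic(nn.Module):
    # Assumed critic: maps a (state, action) pair to a scalar Q-value.
    def __init__(self, state_dim, action_dim, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, state, action):
        return self.net(torch.cat([state, action], dim=1))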