Example #1
import math

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# PARAM (hyperparameters) and AuxNetwork (the actor-critic model with
# auxiliary task heads) are project-specific modules assumed to be
# importable in this repository.


class A2C():
    def __init__(self, episode_buffer, replay_buffer, action_space=3):
        self.lr = PARAM.LEARNING_RATE
        self.episode_buffer = episode_buffer
        self.replay_buffer = replay_buffer
        self.N = PARAM.N
        self.gamma = PARAM.gamma
        self.seq_len = PARAM.A2C_SEQUENCE_LENGTH
        self.aux_batch_size = PARAM.AUX_TASK_BATCH_SIZE
        self.vfr_weight = PARAM.VFR_LOSS_WEIGHT
        self.rp_weight = PARAM.RP_LOSS_WEIGHT
        self.pc_weight = PARAM.PC_LOSS_WEIGHT

        # A2C network
        self.A = AuxNetwork(state_size=PARAM.STATE_SIZE,
                            action_space=action_space,
                            seq_len=self.seq_len)

        # GPU availability
        self.gpu = torch.cuda.is_available()
        if self.gpu:
            print("Using GPU")
            self.A = self.A.cuda()
        else:
            print("Using CPU")

        # Loss Function and Optimizer
        self.optimizer = optim.Adam(self.A.parameters(),
                                    lr=self.lr,
                                    weight_decay=1e-6)
        self.vfr_criterion = nn.MSELoss()  # Value Function Replay loss
        self.rp_criterion = nn.CrossEntropyLoss()  # Reward Prediction loss
        self.pc_criterion = nn.MSELoss()  # Pixel Control loss

    def reduce_learning_rate(self):
        for pgroups in self.optimizer.param_groups:
            pgroups['lr'] = pgroups['lr'] / 10.0

    def train(self, episode_len):
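        """Runs one optimization step on the combined loss.

        The total loss is the A2C actor-critic loss plus weighted auxiliary
        losses: value function replay, reward prediction (only when the
        replay buffer contains rewarded transitions) and pixel control.
        Gradients are value-clipped before the optimizer step.
        """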
        self.optimizer.zero_grad()
        loss = self.compute_A2C_loss(episode_len)
        loss += self.vfr_weight * self.compute_vfr_loss()
        if self.replay_buffer.any_reward_instances():
            loss += self.rp_weight * self.compute_rp_loss()
        loss += self.pc_weight * self.compute_pc_loss()
        loss.backward()
        torch.nn.utils.clip_grad_value_(self.A.parameters(),
                                        PARAM.GRAD_CLIP_VAL)
        self.optimizer.step()

        if math.isnan(loss.item()):
            print('Loss Exploded!')

    def compute_A2C_loss(self, episode_len):
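        """Computes the n-step advantage actor-critic loss for one episode.

        For each step t the n-step return is
        R_t = gamma^n * V(s_{t+n}) + sum_{k=0}^{n-1} gamma^k * r_{t+k},
        with V(s_{t+n}) = 0 when t + n runs past the end of the episode.
        The policy loss weights log pi(a_t|s_t) by the advantage
        (R_t - V(s_t)), the value loss is the squared advantage, and both
        are averaged over the episode length.
        """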
        T = episode_len
        n = self.N
        for t in range(T - 1, -1, -1):
            val = self.episode_buffer[t][-1]
            if t + n >= T:
                Vend = 0
            else:
                Vend = self.episode_buffer[t + n][-1]
            sum_ = 0.0
            for k in range(n):
                if t + k < T:
                    tk_reward = self.episode_buffer[t + k][2]
                    sum_ += tk_reward * (self.gamma**k)
            rew = Vend * (self.gamma**n) + float(sum_)
            if t == T - 1:
                ploss = (rew - val) * torch.log(
                    self.episode_buffer[t][4][self.episode_buffer[t][1]])
                vloss = (rew - val)**2
            else:
                ploss += (rew - val) * torch.log(
                    self.episode_buffer[t][4][self.episode_buffer[t][1]])
                vloss += (rew - val)**2

        ploss = -1.0 * ploss / float(T)
        vloss = vloss / float(T)

        return ploss + vloss

    def compute_vfr_loss(self):
        """ Computes Value Function Replay Loss. """
        idxs = self.replay_buffer.sample_idxs(self.aux_batch_size)
        vision, scent, state, reward = self.get_io_from_replay_buffer(
            idxs, batch_size=self.aux_batch_size, seq_len=self.seq_len)
        val, _ = self.A.forward(vision, scent, state)

        return self.vfr_criterion(val.view(-1, 1), reward)

    def compute_rp_loss(self):
        """ Computes Reward Prediction Loss. """
        vision, ground_truth = self.get_io_from_skewed_replay_buffer(
            batch_size=self.aux_batch_size, seq_len=3)
        pred = self.A.predict_rewards(vision)

        return self.rp_criterion(pred, ground_truth)

    def compute_pc_loss(self):
        """ Computes Pixel Control Loss. """
        idxs = self.replay_buffer.sample_idxs(self.aux_batch_size)
        vision, aux_rew, actions = self.get_pc_io_from_replay_buffer(
            idxs, batch_size=self.aux_batch_size, seq_len=1)
        pred = self.A.pixel_control(vision)
        for i in range(self.aux_batch_size):
            if i == 0:
                pc_loss = self.pc_criterion(aux_rew[i], pred[i, actions[i]])
            else:
                pc_loss += self.pc_criterion(aux_rew[i], pred[i, actions[i]])

        return pc_loss

    def get_output(self, index, batch_size=1, seq_len=1, no_grad=False):
        ''' Returns output from the A network. '''
        vision, scent, state = self.get_input_tensor(index, batch_size,
                                                     seq_len)
        if no_grad:
            with torch.no_grad():
                val, softmax = self.A.forward(vision, scent, state)
        else:
            val, softmax = self.A.forward(vision, scent, state)

        action = np.random.choice(np.arange(3),
                                  1,
                                  p=np.squeeze(
                                      softmax.clone().cpu().detach().numpy()))
        return val, softmax.view(3), action

    def get_input_tensor(self, idxs, batch_size=1, seq_len=1):
        ''' Returns an input tensor from the observation. '''
        vision = np.zeros((batch_size, seq_len, 3, 11, 11))
        scent = np.zeros((batch_size, seq_len, 3))
        state = np.zeros((batch_size, seq_len, 4))

        for k, idx in enumerate(idxs):
            for j in range(seq_len):
                if idx - j < 0:
                    continue
                obs, action, rew, _, _, tong_count, _ = self.episode_buffer[
                    idx - j]
                vision[k, j] = np.moveaxis(obs['vision'], -1, 0)
                scent[k, j] = obs['scent']
                state[k, j] = np.array(
                    [action, rew, int(obs['moved']), tong_count])

        vision, scent, state = torch.from_numpy(vision).float(
        ), torch.from_numpy(scent).float(), torch.from_numpy(state).float()
        if self.gpu:
            vision, scent, state = vision.cuda(), scent.cuda(), state.cuda()

        return vision, scent, state

    def get_io_from_replay_buffer(self, idxs, batch_size=1, seq_len=1):
        ''' Returns an input tensor from the observation. '''
        vision = np.zeros((batch_size, seq_len, 3, 11, 11))
        scent = np.zeros((batch_size, seq_len, 3))
        state = np.zeros((batch_size, seq_len, 4))
        reward = np.zeros((batch_size, 1))

        for k, idx in enumerate(idxs):
            for j in range(seq_len):
                obs, action, rew, _, _, tong_count = self.replay_buffer.get_single_sample(
                    idx - j)
                vision[k, j] = np.moveaxis(obs['vision'], -1, 0)
                scent[k, j] = obs['scent']
                state[k, j] = np.array(
                    [action, rew, int(obs['moved']), tong_count])
                if j == 0:
                    reward[k] = rew

        vision, scent, state, reward = torch.from_numpy(
            vision).float(), torch.from_numpy(scent).float(), torch.from_numpy(
                state).float(), torch.from_numpy(reward).float()
        if self.gpu:
            vision, scent, state, reward = vision.cuda(), scent.cuda(
            ), state.cuda(), reward.cuda()

        return vision, scent, state, reward

    def get_io_from_skewed_replay_buffer(self, batch_size=1, seq_len=1):
        ''' Returns an input tensor from the observation. '''
        vision, reward_class = self.replay_buffer.skewed_samples(
            batch_size, seq_len)
        vision, reward_class = torch.from_numpy(
            vision).float(), torch.from_numpy(reward_class).long()
        if self.gpu:
            vision, reward_class = vision.cuda(), reward_class.cuda()

        return vision, reward_class

    def get_pc_io_from_replay_buffer(self, idxs, batch_size=1, seq_len=1):
        ''' Returns an input tensor from the observation. '''
        vision = np.zeros((batch_size, seq_len, 3, 11, 11))
        aux_rew = np.zeros((batch_size, 11, 11))
        actions = [[]] * batch_size

        for k, idx in enumerate(idxs):
            for j in range(seq_len):
                obs, action, _, next_obs, _, _ = self.replay_buffer.get_single_sample(
                    idx - j)
                vision[k, j] = np.moveaxis(obs['vision'], -1, 0)
                if j == 0:
                    if next_obs['moved']:
                        aux_rew[k] = np.mean(np.abs(obs['vision'] -
                                                    next_obs['vision']),
                                             axis=2)
                    actions[k] = action

        vision, aux_rew = torch.from_numpy(vision).float(), torch.from_numpy(
            aux_rew).float()
        if self.gpu:
            vision, aux_rew = vision.cuda(), aux_rew.cuda()

        return vision, aux_rew, actions

    def set_train(self):
        self.A.train()

    def set_eval(self):
        self.A.eval()

    def save_model_weights(self, suffix, path='./'):
        # Helper function to save your model / weights.
        state = {
            'epoch': suffix,
            'state_dict': self.A.state_dict(),
            'optimizer': self.optimizer.state_dict(),
        }
        torch.save(state, path + str(suffix) + '.dat')

    def load_model(self, model_file):
        # Helper function to load an existing model.
        state = torch.load(model_file)
        self.A.load_state_dict(state['state_dict'])
        self.optimizer.load_state_dict(state['optimizer'])
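
For reference, below is a minimal standalone sketch of the n-step return target that compute_A2C_loss builds for each step t. The helper name n_step_return and the example numbers are illustrative only; rewards and value estimates are assumed to be plain Python floats rather than buffer entries.

def n_step_return(rewards, values, t, n, gamma):
    """R_t = gamma^n * V(s_{t+n}) + sum_{k=0}^{n-1} gamma^k * r_{t+k}."""
    T = len(rewards)
    # Bootstrap with the value estimate n steps ahead, or 0 past episode end.
    v_end = values[t + n] if t + n < T else 0.0
    ret = v_end * gamma**n
    for k in range(n):
        if t + k < T:
            ret += rewards[t + k] * gamma**k
    return ret


# 3-step return at t=0 with gamma=0.9 over a 5-step episode:
# 1 + 0.9*0 + 0.81*2 + 0.729*V(s_3) = 2.62 + 0.729*0.5 = 2.9845
print(n_step_return([1, 0, 2, 0, 1], [0.1, 0.2, 0.3, 0.5, 0.4], 0, 3, 0.9))
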
Example #2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# PARAM (hyperparameters) and AuxNetwork (the actor-critic model with
# auxiliary task heads) are project-specific modules assumed to be
# importable in this repository.


class PPO():
  def __init__(self, episode_buffer, replay_buffer, action_space=3):
    self.lr = PARAM.LEARNING_RATE
    self.episode_buffer = episode_buffer
    self.replay_buffer = replay_buffer
    self.N = PARAM.N
    self.gamma = PARAM.gamma
    self.seq_len = PARAM.A2C_SEQUENCE_LENGTH
    self.aux_batch_size = PARAM.AUX_TASK_BATCH_SIZE
    self.vfr_weight = PARAM.VFR_LOSS_WEIGHT
    self.rp_weight = PARAM.RP_LOSS_WEIGHT
    self.pc_weight = PARAM.PC_LOSS_WEIGHT

    self.ppo_epochs = 10 #PARAM.PPO_EPOCHS
    self.num_mini_batch = 12 #PARAM.PPO_NUM_MINI_BATCH
    self.clip_param = 0.2

    #self.max_grad_norm = PARAM.MAX_GRAD_NORM
    #self.use_clipped_value_loss = PARAM.USE_CLIPPED_VALUE_LOSS

    # Actor-critic network with auxiliary task heads
    self.A = AuxNetwork(state_size=PARAM.STATE_SIZE, action_space=action_space, seq_len=self.seq_len)

    # GPU availability
    self.gpu = torch.cuda.is_available()
    if self.gpu:
      print("Using GPU")
      self.A = self.A.cuda()
    else:
      print("Using CPU")

    # Loss Function and Optimizer
    self.optimizer = optim.Adam(self.A.parameters(), lr=self.lr, weight_decay=1e-6)
    self.vfr_criterion = nn.MSELoss()           # Value Function Replay loss
    self.rp_criterion = nn.CrossEntropyLoss()   # Reward Prediction loss
    self.pc_criterion = nn.MSELoss()            # Pixel Control loss


  def reduce_learning_rate(self):
    for pgroups in self.optimizer.param_groups:
      pgroups['lr'] = pgroups['lr']/10.0

  def train(self, episode_len):
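    """Runs PPO updates on the finished episode with the clipped objective.

    n-step returns and advantages are computed for every step, advantages
    are normalized, and for each PPO epoch the network is re-evaluated on
    randomly sampled indices. The policy loss is the clipped surrogate
    -min(r * A, clip(r, 1 - eps, 1 + eps) * A) with
    r = exp(log pi_new - log pi_old); a value loss and the weighted
    auxiliary losses (value function replay, reward prediction, pixel
    control) are added before each backward pass.
    """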
    T = episode_len
    n = self.N
    advantages = []
    rewards = []
    for t in range(T-1, -1, -1):
      val = self.episode_buffer[t][-1]
      if t + n >= T:
        Vend = 0
      else:
        Vend = self.episode_buffer[t+n][-1]
      sum_ = 0.0
 
      for k in range(n):
        if t + k < T:
          tk_reward = self.episode_buffer[t+k][2]
          sum_ += tk_reward * (self.gamma**k)
      rew = Vend*(self.gamma**n) + float(sum_)
      rewards.append(rew)
 
      advantages.append(rew - val)

    advantages = list(reversed(advantages))
    advantages = torch.tensor(advantages)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

    rewards = list(reversed(rewards))
    rewards = torch.tensor(rewards)

    # NOTE: these overrides replace the values set in __init__
    # (originally PARAM.PPO_EPOCHS, PARAM.PPO_NUM_MINI_BATCH and
    # PARAM.A2C_SEQUENCE_LENGTH).
    self.ppo_epochs = 10
    #self.clip_param = PARAM.PPO_CLIP_PARAM
    self.num_mini_batch = 1
    self.seq_len = 4

    for e in range(self.ppo_epochs):
        random_indices = np.random.randint(T, size=self.num_mini_batch)
        new_values = []
        new_softmax = []
        for index in random_indices:
            val, softmax, action = self.get_output([index], self.num_mini_batch, self.seq_len)
            new_values.append(val)
            new_softmax.append(softmax)
 
        for k, index in enumerate(random_indices):
            action_log_probs = torch.log(new_softmax[k])
            old_action_log_probs = torch.log(self.episode_buffer[index][4])

            advantage_target = advantages[index]

            ratio = torch.exp(action_log_probs - old_action_log_probs)
            surr1 = ratio * advantage_target 
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                       1.0 + self.clip_param) * advantage_target
            action_loss = -torch.min(surr1, surr2).mean()

            value_loss = 0.8 * F.mse_loss(rewards[index], new_values[k])

            self.optimizer.zero_grad()
            
            loss = action_loss + value_loss
            loss += self.vfr_weight * self.compute_vfr_loss()
            if self.replay_buffer.any_reward_instances():
                loss += self.rp_weight * self.compute_rp_loss()
            loss += self.pc_weight * self.compute_pc_loss()
            
            loss.backward(retain_graph=True)
            
            torch.nn.utils.clip_grad_value_(self.A.parameters(), PARAM.GRAD_CLIP_VAL)
            self.optimizer.step()


  def compute_vfr_loss(self):
    """ Computes Value Function Replay Loss. """
    idxs = self.replay_buffer.sample_idxs(self.aux_batch_size)
    vision, scent, state, reward = self.get_io_from_replay_buffer(idxs, batch_size=self.aux_batch_size, seq_len=self.seq_len)
    val, _ = self.A.forward(vision, scent, state)

    return self.vfr_criterion(val.view(-1, 1), reward)

  def compute_rp_loss(self):
    """ Computes Reward Prediction Loss. """
    vision, ground_truth = self.get_io_from_skewed_replay_buffer(batch_size=self.aux_batch_size, seq_len=3)
    pred = self.A.predict_rewards(vision)

    return self.rp_criterion(pred, ground_truth)

  def compute_pc_loss(self):
    """ Computes Pixel Control Loss. """
    idxs = self.replay_buffer.sample_idxs(self.aux_batch_size)
    vision, aux_rew, actions = self.get_pc_io_from_replay_buffer(idxs, batch_size=self.aux_batch_size, seq_len=1)
    pred = self.A.pixel_control(vision)
    for i in range(self.aux_batch_size):
      if i == 0:
        pc_loss = self.pc_criterion(aux_rew[i], pred[i, actions[i]])
      else:
        pc_loss += self.pc_criterion(aux_rew[i], pred[i, actions[i]])

    return pc_loss

  def get_output(self, index, batch_size=1, seq_len=1, no_grad=False):
    ''' Returns output from the A network. '''
    vision, scent, state = self.get_input_tensor(index, batch_size, seq_len)
    if no_grad:
      with torch.no_grad():
        val, softmax = self.A.forward(vision, scent, state)
    else:
      val, softmax = self.A.forward(vision, scent, state)

    action = np.random.choice(np.arange(3), 1, p=np.squeeze(softmax.clone().cpu().detach().numpy()))
    return val, softmax.view(3), action

  def get_input_tensor(self, idxs, batch_size=1, seq_len=1):
    ''' Returns an input tensor from the observation. '''
    vision = np.zeros((batch_size, seq_len, 3, 11, 11))
    scent = np.zeros((batch_size, seq_len, 3))
    state = np.zeros((batch_size, seq_len, 4))

    for k, idx in enumerate(idxs):
      for j in range(seq_len):
        if idx - j < 0:
          continue
        obs, action, rew, _, _, tong_count, _ = self.episode_buffer[idx-j]
        vision[k, j] = np.moveaxis(obs['vision'], -1, 0)
        scent[k, j] = obs['scent']
        state[k, j] = np.array([action, rew, int(obs['moved']), tong_count])

    vision, scent, state = torch.from_numpy(vision).float(), torch.from_numpy(scent).float(), torch.from_numpy(state).float()
    if self.gpu:
      vision, scent, state = vision.cuda(), scent.cuda(), state.cuda()

    return vision, scent, state

  def get_io_from_replay_buffer(self, idxs, batch_size=1, seq_len=1):
    ''' Returns an input tensor from the observation. '''
    vision = np.zeros((batch_size, seq_len, 3, 11, 11))
    scent = np.zeros((batch_size, seq_len, 3))
    state = np.zeros((batch_size, seq_len, 4))
    reward = np.zeros((batch_size, 1))

    for k, idx in enumerate(idxs):
      for j in range(seq_len):
        obs, action, rew, _, _, tong_count = self.replay_buffer.get_single_sample(idx-j)
        vision[k, j] = np.moveaxis(obs['vision'], -1, 0)
        scent[k, j] = obs['scent']
        state[k, j] = np.array([action, rew, int(obs['moved']), tong_count])
        if j == 0:
          reward[k] = rew

    vision, scent, state, reward = torch.from_numpy(vision).float(), torch.from_numpy(scent).float(), torch.from_numpy(state).float(), torch.from_numpy(reward).float()
    if self.gpu:
      vision, scent, state, reward = vision.cuda(), scent.cuda(), state.cuda(), reward.cuda()

    return vision, scent, state, reward

  def get_io_from_skewed_replay_buffer(self, batch_size=1, seq_len=1):
    ''' Returns an input tensor from the observation. '''
    vision, reward_class = self.replay_buffer.skewed_samples(batch_size, seq_len)
    vision, reward_class = torch.from_numpy(vision).float(), torch.from_numpy(reward_class).long()
    if self.gpu:
      vision, reward_class = vision.cuda(), reward_class.cuda()

    return vision, reward_class

  def get_pc_io_from_replay_buffer(self, idxs, batch_size=1, seq_len=1):
    ''' Returns an input tensor from the observation. '''
    vision = np.zeros((batch_size, seq_len, 3, 11, 11))
    aux_rew = np.zeros((batch_size, 11, 11))
    actions = [[]]*batch_size

    for k, idx in enumerate(idxs):
      for j in range(seq_len):
        obs, action, _, next_obs, _, _ = self.replay_buffer.get_single_sample(idx-j)
        vision[k, j] = np.moveaxis(obs['vision'], -1, 0)
        if j == 0:
          if next_obs['moved']:
            aux_rew[k] = np.mean(np.abs(obs['vision'] - next_obs['vision']), axis=2)
          actions[k] = action

    vision, aux_rew = torch.from_numpy(vision).float(), torch.from_numpy(aux_rew).float()
    if self.gpu:
      vision, aux_rew = vision.cuda(), aux_rew.cuda()

    return vision, aux_rew, actions

  def set_train(self):
    self.A.train()

  def set_eval(self):
    self.A.eval()

  def save_model_weights(self, suffix, path='./'):
    # Helper function to save your model / weights.
    state = {
              'epoch': suffix,
              'state_dict': self.A.state_dict(),
              'optimizer': self.optimizer.state_dict(),
            }
    torch.save(state, path + str(suffix) + '.dat')

  def load_model(self, model_file):
    # Helper function to load an existing model.
    state = torch.load(model_file)
    self.A.load_state_dict(state['state_dict'])
    self.optimizer.load_state_dict(state['optimizer'])
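
For reference, below is a minimal standalone sketch of the clipped surrogate objective that train applies to each sampled index, with synthetic tensors standing in for the episode data. The helper name clipped_policy_loss and the example values are illustrative only.

import torch


def clipped_policy_loss(new_log_probs, old_log_probs, advantages,
                        clip_param=0.2):
    """PPO clipped surrogate: -mean(min(r * A, clip(r, 1-eps, 1+eps) * A))."""
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -torch.min(surr1, surr2).mean()


# A ratio of 1.5 on a positive advantage is clipped to 1.2, so the
# loss is -(1.2 * 1.0 + 1.0 * -0.5) / 2 ≈ -0.35.
new_lp = torch.log(torch.tensor([0.6, 0.3]))
old_lp = torch.log(torch.tensor([0.4, 0.3]))
adv = torch.tensor([1.0, -0.5])
print(clipped_policy_loss(new_lp, old_lp, adv))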