Example #1
class FnApproxHIR:
  """Learner that executes Hindsight Instruction Relabeling.

  Attributes:
    cfg: configuration of this learner
    step: current training step
    epsilon: probability of taking a random action (epsilon-greedy exploration)
    vocab_list: vocabulary list used for the instruction labeler
    encode_fn: function that encodes an instruction into token ids
    decode_fn: function that converts an encoded instruction back to text
    labeler: object that generates labels for transitions
  """

  def __init__(self, cfg):
    # basic configuration and training state
    self.cfg = cfg
    self.step = 0
    self.epsilon = 1.0
    self._use_labeler_as_reward = cfg.use_labeler_as_reward
    self._use_oracle_instruction = cfg.use_oracle_instruction
    self._use_synonym_for_rollout = cfg.use_synonym_for_rollout

    # Vocab loading
    vocab_path = get_vocab_path(cfg)
    self.vocab_list = wv.load_vocab_list(vocab_path)
    self.vocab_list = ['eos', 'sos', 'nothing'] + self.vocab_list[1:]
    v2i, i2v = wv.create_look_up_table(self.vocab_list)
    self.encode_fn = wv.encode_text_with_lookup_table(
        v2i, max_sequence_length=self.cfg.max_sequence_length)
    self.decode_fn = wv.decode_with_lookup_table(i2v)

    labeler_config = get_labeler_config(cfg, self.vocab_list)

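    # The learned labeler is only needed when it supplies the reward signal
    # or when oracle instructions are unavailable for relabeling.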
    if self._use_labeler_as_reward or not self._use_oracle_instruction:
      self.labeler = Labeler(labeler_config=labeler_config)
      self.labeler.set_captioning_model(
          labeler_config, labeler_config['captioning_weight_path'])
      self.labeler.set_answering_model(labeler_config,
                                       labeler_config['answering_weight_path'])

  def learn(self, env, agent, replay_buffer):
    """Run learning for 1 cycle with consists of num_episode of episodes.

    Args:
      env: the RL environment
      agent: the RL agent
      replay_buffer: the experience replay buffer

    Returns:
      statistics of the training episode
    """
    average_per_ep_reward = []
    average_per_ep_achieved_n = []
    average_per_ep_relabel_n = []
    average_batch_loss = []

    curr_step = agent.get_global_step()
    self.update_epsilon(curr_step)
    tic = time.time()
    for _ in range(self.cfg.num_episode):
      curr_step = agent.increase_global_step()

      sample_new_scene = random.uniform(0, 1) < self.cfg.sample_new_scene_prob
      s = env.reset(sample_new_scene)
      episode_experience = []
      episode_reward = 0
      episode_achieved_n = 0
      episode_relabel_n = 0

      # rollout
      g_text, p = env.sample_goal()
      if env.all_goals_satisfied:
        s = env.reset(True)
        g_text, p = env.sample_goal()
      g = self.encode_fn(g_text)
      g = np.squeeze(pad_to_max_length([g], self.cfg.max_sequence_length)[0])
      _ = agent.step(s, g, env, 0.0)  # taking a step to create weights

      for t in range(self.cfg.max_episode_length):
        a = agent.step(s, g, env, self.epsilon)
        s_tp1, r, _, _ = env.step(
            a,
            record_achieved_goal=self._use_oracle_instruction,
            goal=p,
            atomic_goal=self.cfg.record_atomic_instruction)
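        # Optionally replace the environment reward with the answering
        # model's verdict on whether the current instruction is satisfied.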
        if self._use_labeler_as_reward:
          labeler_answer = self.labeler.verify_instruction(
              env.convert_order_invariant_to_direct(s_tp1), g)
          r = float(labeler_answer > 0.5)
        if self._use_oracle_instruction:
          ag = env.get_achieved_goals()
        else:
          ag = [None]
        episode_experience.append((s, a, r, s_tp1, g, ag))
        episode_reward += r
        s = s_tp1
        if r > env.shape_val:
          episode_achieved_n += 1
          g_text, p = env.sample_goal()
          if env.all_goals_satisfied:
            break
          g = self.encode_fn(g_text)
          g = np.squeeze(
              pad_to_max_length([g], self.cfg.max_sequence_length)[0])

      average_per_ep_reward.append(episode_reward)
      average_per_ep_achieved_n.append(episode_achieved_n)

      # processing trajectory
      episode_length = len(episode_experience)

      if not self._use_oracle_instruction:  # generate instructions from traj
        transition_pair = []
        if self.cfg.obs_type == 'order_invariant':
          for t in episode_experience:
            transition_pair.append([
                env.convert_order_invariant_to_direct(t[0]),
                env.convert_order_invariant_to_direct(t[3])
            ])
          transition_pair = np.stack(transition_pair)
        else:
          for t in episode_experience:
            transition_pair.append([t[0], t[3]])

        all_achieved_goals = self.labeler.label_trajectory(
            transition_pair, null_token=2)
        for i in range(len(episode_experience)):
          s, a, r, s_tp1, g, ag = episode_experience[i]
          step_i_text = []
          for inst in all_achieved_goals[i]:
            decoded_inst = self.decode_fn(inst)
            step_i_text.append(decoded_inst)
          episode_experience[i] = [s, a, r, s_tp1, g, step_i_text]

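      # For each time step, collect the indices of later steps at which at
      # least one goal was achieved (candidates for future relabeling).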
      non_null_future_idx = [[] for _ in range(episode_length)]
      for t in range(episode_length):
        _, _, _, _, _, ag = episode_experience[t]
        if ag:
          for u in range(t):
            non_null_future_idx[u].append(t)

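      # Store the original (optionally paraphrased) goal transition and, if
      # enabled, add relabeled copies of it to the replay buffer.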
      for t in range(episode_length):
        s, a, r, s_tp1, g, ag = episode_experience[t]
        episode_relabel_n += float(len(ag) > 0)
        g_text = self.decode_fn(g)
        if self.cfg.paraphrase:
          g_text = paraphrase_sentence(
              g_text, delete_color=self.cfg.diverse_scene_content)
        g = self.encode_fn(g_text)
        replay_buffer.add((s, a, r, s_tp1, g))
        if self.cfg.relabeling:
          self.hir_relabel(non_null_future_idx, episode_experience, t,
                           replay_buffer, env)

      average_per_ep_relabel_n.append(episode_relabel_n / float(episode_length))

      # training
      if not self.is_warming_up(curr_step):
        batch_loss = 0
        for _ in range(self.cfg.optimization_steps):
          experience = replay_buffer.sample(self.cfg.batchsize)
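          # Each buffer entry is (s, a, r, s_tp1, g); split the sampled
          # array back into per-field columns.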
          s, a, r, s_tp1, g = [
              np.squeeze(elem, axis=1) for elem in np.split(experience, 5, 1)
          ]
          s = np.stack(s)
          s_tp1 = np.stack(s_tp1)
          g = np.array(list(g))
          if self.cfg.instruction_repr == 'language':
            g = np.array(pad_to_max_length(g, self.cfg.max_sequence_length))
          batch = {
              'obs': np.asarray(s),
              'action': np.asarray(a),
              'reward': np.asarray(r),
              'obs_next': np.asarray(s_tp1),
              'g': np.asarray(g)
          }
          loss_dict = agent.train(batch)
          batch_loss += loss_dict['loss']
        average_batch_loss.append(batch_loss / self.cfg.optimization_steps)

    time_per_episode = (time.time() - tic) / self.cfg.num_episode

    # Update the target network
    agent.update_target_network()

    ################## Debug ##################
    sample = replay_buffer.sample(min(10000, len(replay_buffer.buffer)))
    _, _, sample_r, _, _ = [
        np.squeeze(elem, axis=1) for elem in np.split(sample, 5, 1)
    ]
    print('n one:', np.sum(np.float32(sample_r == 1.0)), 'n zero',
          np.sum(np.float32(sample_r == 0.0)), 'n buff',
          len(replay_buffer.buffer))
    ################## Debug ##################
    stats = {
        'loss': np.mean(average_batch_loss) if average_batch_loss else 0,
        'reward': np.mean(average_per_ep_reward),
        'achieved_goal': np.mean(average_per_ep_achieved_n),
        'average_relabel_goal': np.mean(average_per_ep_relabel_n),
        'epsilon': self.epsilon,
        'global_step': curr_step,
        'time_per_episode': time_per_episode,
        'replay_buffer_reward_avg': np.mean(sample_r),
        'replay_buffer_reward_var': np.var(sample_r)
    }
    return stats

  def hir_relabel(self, non_null_future_idx, episode_experience, current_t,
                  replay_buffer, env):
    """Relabeling trajectories.

    Args:
      non_null_future_idx: list of time step where something happens
      episode_experience: the RL environment
      current_t: time time step at which the experience is relabeled
      replay_buffer:  the experience replay buffer
      env: the RL environment

    Returns:
      the reset state of the environment
    """
    ep_len = len(episode_experience)
    s, a, _, s_tp1, _, ag = episode_experience[current_t]
    if ag:
      # TODO(ydjiang): k_immediate logic needs improvement
      for _ in range(self.cfg.k_immediate):
        ag_text_single = random.choice(ag)
        g_type = instruction_type(ag_text_single)
        if self.cfg.paraphrase and g_type != 'unary':
          ag_text_single = paraphrase_sentence(
              ag_text_single, delete_color=self.cfg.diverse_scene_content)
        replay_buffer.add(
            (s, a, env.reward_scale, s_tp1, self.encode_fn(ag_text_single)))
        if g_type == 'unary' and self.cfg.negate_unary:
          negative_ag = negate_unary_sentence(ag_text_single)
          if negative_ag:
            replay_buffer.add((s, a, 0.0, s_tp1, self.encode_fn(negative_ag)))
    # TODO(ydjiang): repeat logic needs improvement
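    # HER-style future relabeling: sample later time steps and reuse their
    # achieved goals, discounting the reward by the temporal gap.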
    goal_count, repeat = 0, 0
    while goal_count < self.cfg.future_k and repeat < (ep_len - current_t) * 4:
      repeat += 1
      future = np.random.randint(current_t, ep_len)
      _, _, _, _, _, ag_future = episode_experience[future]
      if not ag_future:
        continue
      random.shuffle(ag_future)
      for single_g in ag_future:
        if instruction_type(single_g) != 'unary':
          discount = self.cfg.discount**(future - current_t)
          if self.cfg.paraphrase:
            single_g = paraphrase_sentence(
                single_g, delete_color=self.cfg.diverse_scene_content)
          replay_buffer.add((s, a, discount * env.reward_scale, s_tp1,
                             self.encode_fn(single_g)))
          goal_count += 1
          break

  def update_epsilon(self, step):
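    """Decay epsilon geometrically per training cycle, floored at min_epsilon."""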
    new_epsilon = self.cfg.epsilon_decay**(step // self.cfg.num_episode)
    self.epsilon = max(new_epsilon, self.cfg.min_epsilon)

  def is_warming_up(self, step):
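    """Return True while still collecting initial experience before training."""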
    return step <= self.cfg.collect_cycle * self.cfg.num_episode

  def rollout(self,
              env,
              agent,
              directory,
              record_video=False,
              timeout=8,
              num_episode=10,
              record_trajectory=False):
    """Rollout and save.

    Args:
      env: the RL environment
      agent: the RL agent
      directory: directory where the output of the rollout is saved
      record_video: whether to record a video of the rollout
      timeout: number of steps after which the current goal is abandoned
      num_episode: number of rollout episodes
      record_trajectory: whether to record the ground-truth trajectory

    Returns:
      fraction of sampled instructions completed before timing out
    """
    print('\n#######################################')
    print('Rolling out...')
    print('#######################################')

    # randomly change a subset of the word embeddings
    if self._use_synonym_for_rollout and self.cfg.embedding_type == 'random':
      original_embedding = agent.randomize_partial_word_embedding(10)

    all_frames = []
    ep_observation, ep_action, ep_agn = [], [], []
    black_frame = pad_image(env.render(mode='rgb_array')) * 0.0
    goal_sampled = 0
    timeout_count, success = 0, 0
    for ep in range(num_episode):
      s = env.reset(self.cfg.diverse_scene_content)
      all_frames += [black_frame] * 10
      g_text, p = env.sample_goal()
      if env.all_goals_satisfied:
        s = env.reset(True)
        g_text, p = env.sample_goal()
      goal_sampled += 1
      g = self.encode_fn(g_text)
      g = np.squeeze(pad_to_max_length([g], self.cfg.max_sequence_length)[0])
      if self._use_synonym_for_rollout and self.cfg.embedding_type != 'random':
        # use unseen lexicons for test
        g = paraphrase_sentence(
            self.decode_fn(g), synonym_tables=_SYNONYM_TABLES)
      current_goal_repetition = 0
      for t in range(self.cfg.max_episode_length):
        prob = self.epsilon if record_trajectory else 0.0
        action = agent.step(s, g, env, explore_prob=prob)
        s_tp1, r, _, _ = env.step(
            action,
            record_achieved_goal=False,
            goal=p,
            atomic_goal=self.cfg.record_atomic_instruction)
        s = s_tp1
        all_frames.append(
            add_text(pad_image(env.render(mode='rgb_array')), g_text))
        current_goal_repetition += 1

        if record_trajectory:
          ep_observation.append(env.get_direct_obs().tolist())
          ep_action.append(action)

        sample_new_goal = False
        if r > env.shape_val:
          img = pad_image(env.render(mode='rgb_array'))
          for _ in range(5):
            all_frames.append(add_text(img, g_text, color='green'))
          success += 1
          sample_new_goal = True

        if current_goal_repetition >= timeout:
          all_frames.append(
              add_text(pad_image(env.render(mode='rgb_array')), 'time out :('))
          timeout_count += 1
          sample_new_goal = True

        if sample_new_goal:
          g, p = env.sample_goal()
          if env.all_goals_satisfied:
            break
          g_text = g
          g = self.encode_fn(g_text)
          g = np.squeeze(
              pad_to_max_length([g], self.cfg.max_sequence_length)[0])
          if self._use_synonym_for_rollout and self.cfg.embedding_type != 'random':
            g = paraphrase_sentence(
                self.decode_fn(g), synonym_tables=_SYNONYM_TABLES)
          current_goal_repetition = 0
          goal_sampled += 1

    # restore the original embedding
    if self._use_synonym_for_rollout and self.cfg.embedding_type == 'random':
      agent.set_embedding(original_embedding)

    print('Rollout finished')
    print('{} instructions attempted'.format(goal_sampled))
    print('{} instructions timed out'.format(timeout_count))
    print('{} success rate\n'.format(1 - float(timeout_count) / goal_sampled))
    if record_video:
      save_video(np.uint8(all_frames), directory, fps=5)
      print('Video saved...')
    if record_trajectory:
      print('Recording trajectory...')
      datum = {
          'obs': ep_observation,
          'action': ep_action,
          'achieved goal': ep_agn,
      }
      save_json(datum, directory[:-4] + '_trajectory.json')
    return 1 - float(timeout_count) / goal_sampled
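
The relabeling in hir_relabel combines two kinds of hindsight goals: instructions achieved at the current step, stored with the full reward scale, and instructions achieved at randomly sampled future steps, stored with a reward discounted by the temporal gap. Below is a minimal, self-contained sketch of that scheme on a toy trajectory; the trajectory, goal strings, reward_scale, discount, and future_k values are all invented for illustration and none of them come from this repository.

import random

# toy trajectory: (state, action, achieved_goals) per time step
trajectory = [
    ('s0', 0, []),
    ('s1', 1, ['red block on blue block']),
    ('s2', 0, []),
    ('s3', 2, ['green block on red block']),
]
reward_scale, discount, future_k = 1.0, 0.9, 2
relabeled = []  # stands in for the replay buffer

for t, (s, a, achieved) in enumerate(trajectory):
  # immediate relabeling: goals achieved at this step become positive examples
  for g in achieved:
    relabeled.append((s, a, reward_scale, g))
  # future relabeling: sample later steps and reuse their achieved goals,
  # discounting the reward by how far in the future they were achieved
  for _ in range(future_k):
    future = random.randint(t, len(trajectory) - 1)
    future_goals = trajectory[future][2]
    if future_goals:
      g = random.choice(future_goals)
      relabeled.append((s, a, discount ** (future - t) * reward_scale, g))

for entry in relabeled:
  print(entry)

Discounting the future relabels by discount ** (future - t) keeps the relabeled reward consistent with the discounted return a Q-learner would observe for a goal that is only completed future - t steps later.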