def max_ent_relabel(self, experience, agent, env):
        """Perform maximum entropy relabeling.

    Args:
      experience: experience to be re-labeled
      agent: the RL agent
      env: the RL environment

    Returns:
      relabeled experience
    """
        relabel_proportion = self.cfg.relabel_proportion
        exp_o = experience[int(len(experience) * relabel_proportion):]
        exp_n = experience[:int(len(experience) * relabel_proportion)]
        s_o, a_o, r_o, s_tp1_o, g_o, ag_o = [
            np.squeeze(elem, axis=1) for elem in np.split(exp_o, 6, 1)
        ]
        s_n, a_n, r_n, s_tp1_n, g_n, ag_n = [
            np.squeeze(elem, axis=1) for elem in np.split(exp_n, 6, 1)
        ]
        chosen_q_idx = np.random.choice(np.arange(len(env.all_questions)),
                                        self.cfg.irl_sample_goal_n)
        g_candidate = np.array(env.all_questions)[chosen_q_idx]
        g_candidate = [q for q, p in g_candidate]
        if self.cfg.instruction_repr == 'language':
            g_o = np.array(pad_to_max_length(g_o,
                                             self.cfg.max_sequence_length))
            if self.cfg.paraphrase:
                for i, g_text in enumerate(g_candidate):
                    g_candidate[i] = paraphrase_sentence(
                        g_text, delete_color=self.cfg.diverse_scene_content)
            g_candidate = [self.encode_fn(g) for g in g_candidate]
            g_candidate = np.array(
                pad_to_max_length(g_candidate, self.cfg.max_sequence_length))
        soft_q = agent.compute_q_over_all_g(s_n, a_n, g_candidate)
        normalized_soft_q = tf.nn.softmax(soft_q, axis=-1).numpy()
        chosen_g = []
        for sq in normalized_soft_q:
            chosen_g.append(
                np.random.choice(np.arange(sq.shape[0]), 1, p=sq)[0])
        g_n = g_candidate[chosen_g]
        s = np.concatenate([np.stack(s_o), np.stack(s_n)], axis=0)
        a = np.concatenate([a_o, a_n], axis=0)
        r = np.concatenate([r_o, r_n], axis=0)
        s_tp1 = np.concatenate([np.stack(s_tp1_o), np.stack(s_tp1_n)], axis=0)
        g = np.concatenate([g_o, g_n])
        if self.cfg.instruction_repr == 'language':
            g = np.array(pad_to_max_length(g, self.cfg.max_sequence_length))
        return s, a, r, s_tp1, g
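The core of the method above is sampling a relabeled goal in proportion to the softmax of the Q-values over candidate goals. A minimal, self-contained sketch of just that step, with toy Q-values standing in for the agent's actual network (numpy only; the names are illustrative):

import numpy as np

def sample_goals_from_q(soft_q, seed=None):
    """Sample one goal index per row, proportional to softmax(Q)."""
    rng = np.random.default_rng(seed)
    # Subtract the row-wise max before exponentiating for stability.
    z = soft_q - soft_q.max(axis=-1, keepdims=True)
    p = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
    return np.array([rng.choice(p.shape[-1], p=row) for row in p])

soft_q = np.array([[1.0, 2.0, 0.5],    # Q-values of 3 candidate goals
                   [0.1, 0.1, 3.0]])   # for 2 relabeled transitions
print(sample_goals_from_q(soft_q, seed=0))  # e.g. [1 2]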
Example #2
def main(_):
    tf.enable_v2_behavior()
    ##############################################################################
    ######################### Data loading and processing ########################
    ##############################################################################
    print('Loading data')
    # with gfile.GFile(transition_path, 'r') as f:
    #   transitions = np.load(f)

    with gfile.GFile(transition_state_path, 'r') as f:
        states = np.load(f)
    states = np.float32(states)

    with gfile.GFile(transition_label_path, 'r') as f:
        captions = pickle.load(f)

    with gfile.GFile(answer_path, 'r') as f:
        answers = pickle.load(f)

    with gfile.GFile(vocab_path, 'r') as f:
        vocab_list = f.readlines()

    vocab_list = [w[:-1].decode('utf-8') for w in vocab_list]
    vocab_list = ['eos', 'sos', 'nothing'] + vocab_list
    vocab_list[-1] = 'to'

    v2i, i2v = wv.create_look_up_table(vocab_list)
    encode_fn = wv.encode_text_with_lookup_table(v2i)
    decode_fn = wv.decode_with_lookup_table(i2v)

    caption_decoding_map = {v: k for k, v in captions[0].items()}
    decompressed_captions = []
    for caption in captions[1:]:
        new_caption = []
        for c in caption:
            new_caption.append(caption_decoding_map[c])
        decompressed_captions.append(new_caption)
    captions = decompressed_captions

    encoded_captions = []
    new_answers = []
    for i, all_cp in enumerate(captions):
        for cp in all_cp:
            encoded_captions.append(np.array(encode_fn(cp)))
        for a in answers[i]:
            new_answers.append(float(a))
    all_caption_n = len(encoded_captions)
    encoded_captions = np.array(encoded_captions)
    encoded_captions = pad_to_max_length(encoded_captions)
    answers = np.float32(new_answers)

    obs_idx, caption_idx = [], []
    curr_caption_idx = 0
    for i, _ in enumerate(states):
        for cp in captions[i]:
            obs_idx.append(i)
            caption_idx.append(curr_caption_idx)
            curr_caption_idx += 1
    assert curr_caption_idx == all_caption_n

    obs_idx = np.array(obs_idx)
    caption_idx = np.array(caption_idx)
    all_idx = np.arange(len(caption_idx))
    train_idx = all_idx[:int(len(all_idx) * 0.7)]
    test_idx = all_idx[int(len(all_idx) * 0.7):]
    print('Number of training examples: {}'.format(len(train_idx)))
    print('Number of test examples: {}\n'.format(len(test_idx)))

    ##############################################################################
    ############################# Training Setup #################################
    ##############################################################################
    embedding_dim = 32
    units = 64
    vocab_size = len(vocab_list)
    batch_size = 128
    max_sequence_length = 21

    encoder_config = {'name': 'state', 'embedding_dim': 64}
    decoder_config = {
        'name': 'state',
        'word_embedding_dim': 64,
        'hidden_units': 512,
        'vocab_size': len(vocab_list),
    }

    encoder = get_answering_encoder(encoder_config)
    decoder = get_answering_decoder(decoder_config)
    projection_layer = tf.keras.layers.Dense(1,
                                             activation='sigmoid',
                                             name='answering_projection')

    optimizer = tf.keras.optimizers.Adam(1e-4)
    bce = tf.keras.losses.BinaryCrossentropy()

    @tf.function
    def compute_loss(obs, instruction, target):
        instruction = tf.expand_dims(instruction, axis=-1)
        hidden = decoder.reset_state(batch_size=target.shape[0])
        features = encoder(obs)
        for i in tf.range(max_sequence_length):
            _, hidden, _ = decoder(instruction[:, i], features, hidden)
        projection = tf.squeeze(projection_layer(hidden), axis=1)
        loss = bce(target, projection)
        return loss, projection

    @tf.function
    def train_step(obs, instruction, target):
        with tf.GradientTape() as tape:
            loss, _ = compute_loss(obs, instruction, target)
        trainable_variables = (encoder.trainable_variables +
                               decoder.trainable_variables +
                               projection_layer.trainable_variables)
        gradients = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))
        return loss

    ##############################################################################
    ############################# Training Loop ##################################
    ##############################################################################
    print('Start training...\n')
    start_epoch = 0
    if FLAGS.save_dir:
        checkpoint_path = FLAGS.save_dir
        ckpt = tf.train.Checkpoint(encoder=encoder,
                                   decoder=decoder,
                                   projection_layer=projection_layer,
                                   optimizer=optimizer)
        ckpt_manager = tf.train.CheckpointManager(ckpt,
                                                  checkpoint_path,
                                                  max_to_keep=5)
        if ckpt_manager.latest_checkpoint:
            start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])

    epochs = 400
    step_per_epoch = int(all_caption_n / batch_size)

    previous_best, previous_best_accuracy = 100., 0.0

    for epoch in range(start_epoch, epochs):
        start = time.time()
        total_loss = 0
        for batch in range(step_per_epoch):
            batch_idx = np.random.choice(train_idx, size=batch_size)
            input_tensor = tf.convert_to_tensor(states[obs_idx[batch_idx], :])
            instruction = tf.convert_to_tensor(
                encoded_captions[caption_idx[batch_idx]])
            target = tf.convert_to_tensor(answers[caption_idx[batch_idx]])
            batch_loss = train_step(input_tensor, instruction, target)
            total_loss += batch_loss

            if batch % 1000 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch, batch, batch_loss.numpy()))

        if epoch % 5 == 0 and FLAGS.save_dir:
            test_total_loss = 0
            accuracy = 0
            for batch in range(10):
                batch_idx = np.arange(batch_size) + batch * batch_size
                idx = test_idx[batch_idx]
                input_tensor = tf.convert_to_tensor(states[obs_idx[idx], :])
                instruction = tf.convert_to_tensor(
                    encoded_captions[caption_idx[idx]])
                target = tf.convert_to_tensor(answers[caption_idx[idx]])
                t_loss, prediction = compute_loss(input_tensor, instruction,
                                                  target)
                test_total_loss += t_loss
                accuracy += np.mean(
                    np.float32(np.float32(prediction > 0.5) == target))
            test_total_loss /= 10.
            accuracy /= 10.
            if accuracy > previous_best_accuracy:
                previous_best_accuracy, previous_best = accuracy, test_total_loss
                ckpt_manager.save(checkpoint_number=epoch)

        print('\nEpoch {} | Loss {:.6f} | Val loss {:.6f} | Accuracy {:.3f}'.
              format(epoch + 1, total_loss / step_per_epoch, previous_best,
                     previous_best_accuracy))
        print('Time taken for 1 epoch {:.6f} sec\n'.format(time.time() -
                                                           start))

        if epoch % 10 == 0:
            test_total_loss = 0
            accuracy = 0
            for batch in range(len(test_idx) // batch_size):
                batch_idx = np.arange(batch_size) + batch * batch_size
                idx = test_idx[batch_idx]
                input_tensor = tf.convert_to_tensor(states[obs_idx[idx], :])
                instruction = tf.convert_to_tensor(
                    encoded_captions[caption_idx[idx]])
                target = tf.convert_to_tensor(answers[caption_idx[idx]])
                t_loss, prediction = compute_loss(input_tensor, instruction,
                                                  target)
                test_total_loss += t_loss
                accuracy += np.mean(
                    np.float32(np.float32(prediction > 0.5) == target))
            test_total_loss /= (len(test_idx) // batch_size)
            accuracy /= (len(test_idx) // batch_size)
            if accuracy > previous_best_accuracy and FLAGS.save_dir:
                previous_best_accuracy, previous_best = accuracy, test_total_loss
                ckpt_manager.save(checkpoint_number=epoch)
            print('\n====================================================')
            print('Test Loss {:.6f} | Test Accuracy {:.3f}'.format(
                test_total_loss, accuracy))
            print('====================================================\n')
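pad_to_max_length is used throughout these examples but never defined in them. A plausible minimal reconstruction, assuming right-padding with index 0 (the position of 'eos' in the vocabulary built above); this is a hypothetical helper, not the repository's actual implementation:

import numpy as np

def pad_to_max_length(sequences, max_l=None):
    """Right-pad integer sequences with 0 to a common length."""
    max_l = max_l or max(len(s) for s in sequences)
    padded = np.zeros((len(sequences), max_l), dtype=np.int64)
    for i, s in enumerate(sequences):
        n = min(len(s), max_l)  # truncate sequences longer than max_l
        padded[i, :n] = np.asarray(s)[:n]
    return padded

print(pad_to_max_length([[1, 4, 2], [3]], max_l=5))
# [[1 4 2 0 0]
#  [3 0 0 0 0]]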
Example #3
  def learn(self, env, agent, replay_buffer, **kwargs):
    """Run learning for 1 cycle with consists of num_episode of episodes.

    Args:
      env: the RL environment
      agent: the RL agent
      replay_buffer: the experience replay buffer
      **kwargs: other potential arguments

    Returns:
      statistics of the training episode
    """
    average_per_ep_reward = []
    average_per_ep_achieved_n = []
    average_per_ep_relabel_n = []
    average_batch_loss = []
    curiosity_loss = 0

    curr_step = agent.get_global_step()
    self.update_epsilon(curr_step)
    tic = time.time()
    time_rolling_out, time_training = 0.0, 0.0
    for _ in range(self.cfg.num_episode):
      curr_step = agent.increase_global_step()

      sample_new_scene = random.uniform(0, 1) < self.cfg.sample_new_scene_prob
      s = self.reset(env, agent, sample_new_scene)
      episode_experience = []
      episode_reward = 0
      episode_achieved_n = 0
      episode_relabel_n = 0

      # rollout
      rollout_tic = time.time()
      g_text, p = env.sample_goal()
      if env.all_goals_satisfied:
        s = self.reset(env, agent, True)
        g_text, p = env.sample_goal()
      g = np.squeeze(self.encode_fn(g_text))

      for t in range(self.cfg.max_episode_length):
        a = agent.step(s, g, env, self.epsilon)
        s_tp1, r, _, _ = env.step(
            a,
            record_achieved_goal=True,
            goal=p,
            atomic_goal=self.cfg.record_atomic_instruction)
        ag = env.get_achieved_goals()
        ag_text = env.get_achieved_goal_programs()
        ag_total = ag  # TODO(ydjiang): more can be stored in ag
        episode_experience.append((s, a, r, s_tp1, g, ag_total))
        episode_reward += r
        s = s_tp1
        if r > env.shape_val:
          episode_achieved_n += 1
          g_text, p = env.sample_goal()
          if env.all_goals_satisfied:
            break
          g = np.squeeze(self.encode_fn(g_text))
      time_rolling_out += time.time() - rollout_tic

      average_per_ep_reward.append(episode_reward)
      average_per_ep_achieved_n.append(episode_achieved_n)

      # processing trajectory
      train_tic = time.time()
      episode_length = len(episode_experience)
      for t in range(episode_length):
        s, a, r, s_tp1, g, ag = episode_experience[t]
        episode_relabel_n += float(len(ag) > 0)
        g_text = self.decode_fn(g)
        if self.cfg.paraphrase:
          g_text = paraphrase_sentence(
              g_text, delete_color=self.cfg.diverse_scene_content)
        g = self.encode_fn(g_text)
        replay_buffer.add((s, a, r, s_tp1, g))
        if self.cfg.relabeling:
          self.hir_relabel(episode_experience, t, replay_buffer, env)

      average_per_ep_relabel_n.append(episode_relabel_n / float(episode_length))

      # training
      if not self.is_warming_up(curr_step):
        batch_loss = 0
        for _ in range(self.cfg.optimization_steps):
          experience = replay_buffer.sample(self.cfg.batchsize)
          s, a, r, s_tp1, g = [
              np.squeeze(elem, axis=1) for elem in np.split(experience, 5, 1)
          ]
          s = np.stack(s)
          s_tp1 = np.stack(s_tp1)
          g = np.array(list(g))
          if self.cfg.instruction_repr == 'language':
            g = np.array(pad_to_max_length(g, self.cfg.max_sequence_length))
          batch = {
              'obs': np.asarray(s),
              'action': np.asarray(a),
              'reward': np.asarray(r),
              'obs_next': np.asarray(s_tp1),
              'g': np.asarray(g)
          }
          loss_dict = agent.train(batch)
          batch_loss += loss_dict['loss']
          if 'prediction_loss' in loss_dict:
            curiosity_loss += loss_dict['prediction_loss']
        average_batch_loss.append(batch_loss / self.cfg.optimization_steps)
      time_training += time.time()-train_tic

    time_per_episode = (time.time() - tic) / self.cfg.num_episode
    time_training_per_episode = time_training / self.cfg.num_episode
    time_rolling_out_per_episode = time_rolling_out / self.cfg.num_episode

    # Update the target network
    agent.update_target_network()
    ################## Debug ##################
    sample = replay_buffer.sample(min(10000, len(replay_buffer.buffer)))
    _, _, sample_r, _, _ = [
        np.squeeze(elem, axis=1) for elem in np.split(sample, 5, 1)
    ]
    print('n one:', np.sum(np.float32(sample_r == 1.0)), 'n zero',
          np.sum(np.float32(sample_r == 0.0)), 'n buff',
          len(replay_buffer.buffer))
    ################## Debug ##################
    stats = {
        'loss': np.mean(average_batch_loss) if average_batch_loss else 0,
        'reward': np.mean(average_per_ep_reward),
        'achieved_goal': np.mean(average_per_ep_achieved_n),
        'average_relabel_goal': np.mean(average_per_ep_relabel_n),
        'epsilon': self.epsilon,
        'global_step': curr_step,
        'time_per_episode': time_per_episode,
        'time_training_per_episode': time_training_per_episode,
        'time_rolling_out_per_episode': time_rolling_out_per_episode,
        'replay_buffer_reward_avg': np.mean(sample_r),
        'replay_buffer_reward_var': np.var(sample_r)
    }
    return stats
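hir_relabel is called above but not shown. In the spirit of hindsight experience replay, the idea is to store a copy of a transition with its goal replaced by an instruction the agent actually achieved. A minimal sketch under that assumption; the function name, signature, and the success reward of 1.0 are hypothetical:

import random

def hindsight_relabel(episode_experience, t, replay_buffer, encode_fn):
    """Re-store transition t with one of its achieved goals substituted in."""
    s, a, _, s_tp1, _, achieved = episode_experience[t]
    if not achieved:
        return
    # Pick one achieved instruction and relabel the reward as success.
    ag = random.choice(achieved)
    replay_buffer.add((s, a, 1.0, s_tp1, encode_fn(ag)))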
Example #4
def main(_):
    tf.enable_v2_behavior()
    ##############################################################################
    ######################### Data loading and processing ########################
    ##############################################################################
    print('Loading data')

    with gfile.GFile(transition_path, 'r') as f:
        transitions = np.load(f)
    if np.max(transitions) > 1.0:
        transitions = transitions / 255.0
    with gfile.GFile(synthetic_transition_path, 'r') as f:
        synthetic_transitions = np.load(f)
    if np.max(synthetic_transitions) > 1.0:
        synthetic_transitions = synthetic_transitions / 255.0

    with gfile.GFile(transition_label_path, 'r') as f:
        captions = pickle.load(f)
    with gfile.GFile(synthetic_transition_label_path, 'r') as f:
        synthetic_captions = pickle.load(f)

    with gfile.GFile(vocab_path, 'r') as f:
        vocab_list = f.readlines()

    vocab_list = [w[:-1].decode('utf-8') for w in vocab_list]
    vocab_list = ['eos', 'sos'] + vocab_list

    v2i, i2v = wv.create_look_up_table(vocab_list)
    encode_fn = wv.encode_text_with_lookup_table(v2i)
    decode_fn = wv.decode_with_lookup_table(i2v)

    encoded_captions = []
    for all_cp in captions:
        for cp in all_cp:
            cp = 'sos ' + cp + ' eos'
            encoded_captions.append(np.array(encode_fn(cp)))

    synthetic_encoded_captions = []
    for all_cp in synthetic_captions:
        for cp in all_cp:
            cp = 'sos ' + cp + ' eos'
            synthetic_encoded_captions.append(np.array(encode_fn(cp)))

    all_caption_n = len(encoded_captions)
    all_synthetic_caption_n = len(synthetic_encoded_captions)

    encoded_captions = np.array(encoded_captions)
    encoded_captions = pad_to_max_length(encoded_captions, max_l=15)

    synthetic_encoded_captions = np.array(synthetic_encoded_captions)
    synthetic_encoded_captions = pad_to_max_length(synthetic_encoded_captions,
                                                   max_l=15)

    obs_idx, caption_idx, negative_caption_idx = [], [], []
    curr_caption_idx = 0
    for i, _ in enumerate(transitions):
        for cp in captions[i]:
            obs_idx.append(i)
            if 'nothing' not in cp:
                caption_idx.append(curr_caption_idx)
            else:
                negative_caption_idx.append(curr_caption_idx)
            curr_caption_idx += 1
    assert curr_caption_idx == all_caption_n

    synthetic_obs_idx, synthetic_caption_idx = [], []
    synthetic_negative_caption_idx = []
    curr_caption_idx = 0
    for i, _ in enumerate(synthetic_transitions):
        for cp in synthetic_captions[i]:
            synthetic_obs_idx.append(i)
            if 'nothing' not in cp:
                synthetic_caption_idx.append(curr_caption_idx)
            else:
                synthetic_negative_caption_idx.append(curr_caption_idx)
            curr_caption_idx += 1
    assert curr_caption_idx == all_synthetic_caption_n

    obs_idx = np.array(obs_idx)
    caption_idx = np.array(caption_idx)
    negative_caption_idx = np.array(negative_caption_idx)
    all_idx = np.arange(len(caption_idx))
    train_idx = all_idx[:int(len(all_idx) * 0.8)]
    test_idx = all_idx[int(len(all_idx) * 0.8):]
    print('Number of training examples: {}'.format(len(train_idx)))
    print('Number of test examples: {}\n'.format(len(test_idx)))

    synthetic_obs_idx = np.array(synthetic_obs_idx)
    synthetic_caption_idx = np.array(synthetic_caption_idx)
    synthetic_negative_caption_idx = np.array(synthetic_negative_caption_idx)
    synthetic_all_idx = np.arange(len(synthetic_caption_idx))
    synthetic_train_idx = synthetic_all_idx[:int(len(synthetic_all_idx) * 0.8)]
    synthetic_test_idx = synthetic_all_idx[int(len(synthetic_all_idx) * 0.8):]
    print('Number of synthetic training examples: {}'.format(
        len(synthetic_train_idx)))
    print('Number of synthetic test examples: {}\n'.format(
        len(synthetic_test_idx)))

    def sample_batch(data_type, batch_size, mode='train'):
        is_synthetic = data_type == 'synthetic'
        transitions_s = synthetic_transitions if is_synthetic else transitions
        encoded_captions_s = synthetic_encoded_captions if is_synthetic else encoded_captions
        obs_idx_s = synthetic_obs_idx if is_synthetic else obs_idx
        caption_idx_s = synthetic_caption_idx if is_synthetic else caption_idx
        all_idx_s = synthetic_all_idx if is_synthetic else all_idx
        train_idx_s = synthetic_train_idx if is_synthetic else train_idx
        test_idx_s = synthetic_test_idx if is_synthetic else test_idx
        if mode == 'train':
            batch_idx_s = np.random.choice(train_idx_s, size=batch_size)
        else:
            batch_idx_s = np.random.choice(test_idx_s, size=batch_size)
        # Each observation appears twice: once paired with its true caption
        # (label 1) and once with a randomly drawn caption (label 0).
        input_tensor = tf.convert_to_tensor(
            np.concatenate([
                transitions_s[obs_idx_s[batch_idx_s], 1, :],
                transitions_s[obs_idx_s[batch_idx_s], 1, :]
            ]))
        positive_idx = caption_idx_s[batch_idx_s]
        negative_idx = caption_idx_s[np.random.choice(train_idx_s,
                                                      size=batch_size)]
        caption_tensor = tf.convert_to_tensor(
            np.concatenate([
                encoded_captions_s[positive_idx],
                encoded_captions_s[negative_idx]
            ],
                           axis=0))
        target_tensor = tf.convert_to_tensor(
            np.float32(
                np.concatenate([np.ones(batch_size),
                                np.zeros(batch_size)],
                               axis=0)))
        return input_tensor, caption_tensor, target_tensor

    ##############################################################################
    ############################# Training Setup #################################
    ##############################################################################
    embedding_dim = 32
    units = 64
    vocab_size = len(vocab_list)
    batch_size = 64
    max_sequence_length = 15

    encoder_config = {'name': 'image', 'embedding_dim': 64}
    decoder_config = {
        'name': 'attention',
        'word_embedding_dim': 64,
        'hidden_units': 256,
        'vocab_size': len(vocab_list),
    }

    encoder = get_answering_encoder(encoder_config)
    decoder = get_answering_decoder(decoder_config)
    projection_layer = tf.keras.layers.Dense(1,
                                             activation='sigmoid',
                                             name='answering_projection')

    optimizer = tf.keras.optimizers.Adam(1e-4)
    bce = tf.keras.losses.BinaryCrossentropy()

    @tf.function
    def compute_loss(obs, instruction, target, training):
        print('Build compute loss...')
        instruction = tf.expand_dims(instruction, axis=-1)
        hidden = decoder.reset_state(batch_size=target.shape[0])
        features = encoder(obs, training=training)
        for i in tf.range(max_sequence_length):
            _, hidden, _ = decoder(instruction[:, i],
                                   features,
                                   hidden,
                                   training=training)
        projection = tf.squeeze(projection_layer(hidden), axis=1)
        loss = bce(target, projection)
        return loss, projection

    @tf.function
    def train_step(obs, instruction, target):
        print('Build train step...')
        with tf.GradientTape() as tape:
            loss, _ = compute_loss(obs, instruction, target, True)
        trainable_variables = (encoder.trainable_variables +
                               decoder.trainable_variables +
                               projection_layer.trainable_variables)
        print('num trainable: ', len(trainable_variables))
        gradients = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))
        return loss

    ##############################################################################
    ############################# Training Loop ##################################
    ##############################################################################
    print('Start training...\n')
    start_epoch = 0
    if FLAGS.save_dir:
        checkpoint_path = FLAGS.save_dir
        ckpt = tf.train.Checkpoint(encoder=encoder,
                                   decoder=decoder,
                                   projection_layer=projection_layer,
                                   optimizer=optimizer)
        ckpt_manager = tf.train.CheckpointManager(ckpt,
                                                  checkpoint_path,
                                                  max_to_keep=5)
        if ckpt_manager.latest_checkpoint:
            start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])

    epochs = 400
    step_per_epoch = int(all_caption_n / batch_size)

    previous_best, previous_best_accuracy = 100., 0.0
    # input_tensor, instruction, target = sample_batch('synthetic', batch_size,
    #                                                  'train')
    for epoch in range(start_epoch, epochs):
        start = time.time()
        total_loss = 0
        for batch in range(step_per_epoch):
            input_tensor, instruction, target = sample_batch(
                'synthetic', batch_size, 'train')
            batch_loss = train_step(input_tensor, instruction, target)
            total_loss += batch_loss
            # print(batch, batch_loss)
            # print(instruction[0])
            # print(encode_fn('nothing'))
            # print('====================================')

            if batch % 1000 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch, batch, batch_loss.numpy()))

        if epoch % 5 == 0 and FLAGS.save_dir:
            test_total_loss = 0
            accuracy = 0
            for batch in range(10):
                input_tensor, instruction, target = sample_batch(
                    'synthetic', batch_size, 'test')
                t_loss, prediction = compute_loss(input_tensor, instruction,
                                                  target, False)
                test_total_loss += t_loss
                accuracy += np.mean(
                    np.float32(np.float32(prediction > 0.5) == target))
            test_total_loss /= 10.
            accuracy /= 10.
            if accuracy > previous_best_accuracy:
                previous_best_accuracy, previous_best = accuracy, test_total_loss
                ckpt_manager.save(checkpoint_number=epoch)

        print('\nEpoch {} | Loss {:.6f} | Val loss {:.6f} | Accuracy {:.3f}'.
              format(epoch + 1, total_loss / step_per_epoch, previous_best,
                     previous_best_accuracy))
        print('Time taken for 1 epoch {:.6f} sec\n'.format(time.time() -
                                                           start))

        if epoch % 10 == 0:
            test_total_loss = 0
            accuracy = 0
            for batch in range(len(test_idx) // batch_size):
                input_tensor, instruction, target = sample_batch(
                    'synthetic', batch_size, 'test')
                t_loss, prediction = compute_loss(input_tensor,
                                                  instruction,
                                                  target,
                                                  training=False)
                test_total_loss += t_loss
                accuracy += np.mean(
                    np.float32(np.float32(prediction > 0.5) == target))
            test_total_loss /= (len(test_idx) // batch_size)
            accuracy /= (len(test_idx) // batch_size)
            if accuracy > previous_best_accuracy and FLAGS.save_dir:
                previous_best_accuracy, previous_best = accuracy, test_total_loss
                ckpt_manager.save(checkpoint_number=epoch)
            print('\n====================================================')
            print('Test Loss {:.6f} | Test Accuracy {:.3f}'.format(
                test_total_loss, accuracy))
            print('====================================================\n')
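sample_batch builds a balanced binary-classification batch by pairing each observation twice: once with its true caption (target 1) and once with a caption drawn from a random other transition (target 0). A stripped-down numpy reconstruction of that pairing with toy arrays:

import numpy as np

rng = np.random.default_rng(0)
obs = rng.normal(size=(100, 8))             # toy observations
caps = rng.integers(0, 50, size=(100, 15))  # toy encoded captions

idx = rng.choice(100, size=4)               # sampled transitions
neg_idx = rng.choice(100, size=4)           # random captions act as negatives
inputs = np.concatenate([obs[idx], obs[idx]], axis=0)
captions = np.concatenate([caps[idx], caps[neg_idx]], axis=0)
targets = np.concatenate([np.ones(4), np.zeros(4)])
print(inputs.shape, captions.shape, targets.shape)  # (8, 8) (8, 15) (8,)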
Example #5
def main(_):
    tf.enable_v2_behavior()
    ##############################################################################
    ######################### Data loading and processing ########################
    ##############################################################################
    print('Loading data')

    with gfile.GFile(_TRANSITION_PATH, 'r') as f:
        transitions = np.load(f)
    if np.max(transitions) > 1.0:
        transitions = transitions / 255.0
    with gfile.GFile(_SYNTHETIC_TRANSITION_PATH, 'r') as f:
        synthetic_transitions = np.load(f)
    if np.max(synthetic_transitions) > 1.0:
        synthetic_transitions = synthetic_transitions / 255.0

    with gfile.GFile(_TRANSITION_LABEL_PATH, 'r') as f:
        captions = pickle.load(f)
    with gfile.GFile(_SYNTHETIC_TRANSITION_LABEL_PATH, 'r') as f:
        synthetic_captions = pickle.load(f)

    with gfile.GFile(_VOCAB_PATH, 'r') as f:
        vocab_list = f.readlines()

    vocab_list = [w[:-1].decode('utf-8') for w in vocab_list]
    vocab_list = ['eos', 'sos'] + vocab_list

    v2i, i2v = wv.create_look_up_table(vocab_list)
    encode_fn = wv.encode_text_with_lookup_table(v2i)
    decode_fn = wv.decode_with_lookup_table(i2v)

    encoded_captions = []
    for all_cp in captions:
        for cp in all_cp:
            cp = 'sos ' + cp + ' eos'
            encoded_captions.append(np.array(encode_fn(cp)))

    synthetic_encoded_captions = []
    for all_cp in synthetic_captions:
        for cp in all_cp:
            cp = 'sos ' + cp + ' eos'
            synthetic_encoded_captions.append(np.array(encode_fn(cp)))

    all_caption_n = len(encoded_captions)
    all_synthetic_caption_n = len(synthetic_encoded_captions)

    encoded_captions = np.array(encoded_captions)
    encoded_captions = pad_to_max_length(encoded_captions, max_l=15)

    synthetic_encoded_captions = np.array(synthetic_encoded_captions)
    synthetic_encoded_captions = pad_to_max_length(synthetic_encoded_captions,
                                                   max_l=15)

    obs_idx, caption_idx = [], []
    curr_caption_idx = 0
    for i, _ in enumerate(transitions):
        for cp in captions[i]:
            obs_idx.append(i)
            caption_idx.append(curr_caption_idx)
            curr_caption_idx += 1
    assert curr_caption_idx == all_caption_n

    synthetic_obs_idx, synthetic_caption_idx = [], []
    curr_caption_idx = 0
    for i, _ in enumerate(synthetic_transitions):
        for cp in synthetic_captions[i]:
            synthetic_obs_idx.append(i)
            synthetic_caption_idx.append(curr_caption_idx)
            curr_caption_idx += 1
    assert curr_caption_idx == all_synthetic_caption_n

    obs_idx = np.array(obs_idx)
    caption_idx = np.array(caption_idx)
    all_idx = np.arange(len(caption_idx))
    train_idx = all_idx[:int(len(all_idx) * 0.8)]
    test_idx = all_idx[int(len(all_idx) * 0.8):]
    print('Number of training examples: {}'.format(len(train_idx)))
    print('Number of test examples: {}\n'.format(len(test_idx)))

    synthetic_obs_idx = np.array(synthetic_obs_idx)
    synthetic_caption_idx = np.array(synthetic_caption_idx)
    synthetic_all_idx = np.arange(len(synthetic_caption_idx))
    synthetic_train_idx = synthetic_all_idx[:int(len(synthetic_all_idx) * 0.8)]
    synthetic_test_idx = synthetic_all_idx[int(len(synthetic_all_idx) * 0.8):]
    print('Number of synthetic training examples: {}'.format(
        len(synthetic_train_idx)))
    print('Number of synthetic test examples: {}\n'.format(
        len(synthetic_test_idx)))

    ##############################################################################
    ############################# Training Setup #################################
    ##############################################################################
    embedding_dim = 32
    units = 64
    vocab_size = len(vocab_list)
    batch_size = 64
    max_sequence_length = 15

    encoder_config = {'name': 'image', 'embedding_dim': 32}
    decoder_config = {
        'name': 'attention',
        'word_embedding_dim': 64,
        'hidden_units': 256,
        'vocab_size': len(vocab_list),
    }

    encoder = get_captioning_encoder(encoder_config)
    decoder = get_captioning_decoder(decoder_config)

    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    def loss_function(real, pred, sos_symbol=1):
        mask = tf.math.logical_not(tf.math.equal(real, sos_symbol))
        loss_ = loss_object(real, pred)
        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask
        return tf.reduce_mean(loss_)

    @tf.function
    def train_step(input_tensor, target):
        """Traing on a batch of data."""
        loss = 0
        # initializing the hidden state for each batch
        # because the captions are not related from image to image
        hidden = decoder.reset_state(batch_size=target.shape[0])

        dec_input = tf.expand_dims([1] * target.shape[0], 1)

        with tf.GradientTape() as tape:
            features = encoder(input_tensor, training=True)
            for i in range(1, target.shape[1]):
                # passing the features through the decoder
                predictions, hidden, _ = decoder(dec_input,
                                                 features,
                                                 hidden,
                                                 training=True)
                loss += loss_function(target[:, i], predictions)
                # using teacher forcing
                dec_input = tf.expand_dims(target[:, i], 1)

        total_loss = (loss / int(target.shape[1]))
        trainable_variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))

        return loss, total_loss

    @tf.function
    def evaluate_batch(input_tensor, target):
        """Evaluate loss on a batch of data."""
        loss = 0
        # initializing the hidden state for each batch
        # because the captions are not related from image to image
        hidden = decoder.reset_state(batch_size=target.shape[0])
        dec_input = tf.expand_dims([1] * target.shape[0], 1)
        features = encoder(input_tensor, training=False)

        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input,
                                             features,
                                             hidden,
                                             training=False)
            loss += loss_function(target[:, i], predictions)
            # using teacher forcing
            dec_input = tf.expand_dims(target[:, i], 1)
        total_loss = (loss / int(target.shape[1]))
        return total_loss

    ##############################################################################
    ############################# Training Loop ##################################
    ##############################################################################
    print('Start training...\n')
    start_epoch = 0
    if FLAGS.save_dir:
        checkpoint_path = FLAGS.save_dir
        ckpt = tf.train.Checkpoint(encoder=encoder,
                                   decoder=decoder,
                                   optimizer=optimizer)
        ckpt_manager = tf.train.CheckpointManager(ckpt,
                                                  checkpoint_path,
                                                  max_to_keep=5)
        if ckpt_manager.latest_checkpoint:
            start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])

    epochs = 400
    step_per_epoch = int(len(captions) / batch_size) * 10

    previous_best = 100.

    mixing_ratio = 0.4
    syn_bs = int(batch_size * 2 * mixing_ratio)
    true_bs = int(batch_size * 2 * (1 - mixing_ratio))

    for epoch in range(start_epoch, epochs):
        start = time.time()
        total_loss = 0

        for batch in range(step_per_epoch):
            batch_idx = np.random.choice(train_idx, size=true_bs)
            synthetic_batch_idx = np.random.choice(synthetic_train_idx,
                                                   size=syn_bs)
            input_tensor = transitions[obs_idx[batch_idx], :]
            synthetic_input_tensor = synthetic_transitions[
                synthetic_obs_idx[synthetic_batch_idx], :]
            input_tensor = np.concatenate(
                [input_tensor, synthetic_input_tensor], axis=0)
            input_tensor = encoder.preprocess(input_tensor)
            target = encoded_captions[caption_idx[batch_idx]]
            synthetic_target = synthetic_encoded_captions[
                synthetic_caption_idx[synthetic_batch_idx]]
            target = np.concatenate([target, synthetic_target], axis=0)
            batch_loss, t_loss = train_step(input_tensor, target)
            total_loss += t_loss

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch,
                    batch_loss.numpy() / int(target.shape[1])))

        if epoch % 5 == 0 and FLAGS.save_dir:
            test_total_loss = 0
            for batch in range(3):
                batch_idx = np.clip(
                    np.arange(true_bs) + batch * true_bs, 0, 196)
                idx = test_idx[batch_idx]
                input_tensor = transitions[obs_idx[idx], :]
                target = encoded_captions[caption_idx[idx]]
                t_loss = evaluate_batch(input_tensor, target)
                test_total_loss += t_loss
                batch_idx = np.arange(syn_bs) + batch * syn_bs
                idx = synthetic_test_idx[batch_idx]
                input_tensor = synthetic_transitions[synthetic_obs_idx[idx], :]
                target = synthetic_encoded_captions[synthetic_caption_idx[idx]]
                t_loss = evaluate_batch(input_tensor, target)
                test_total_loss += t_loss
            test_total_loss /= 6.
            if test_total_loss < previous_best:
                previous_best = test_total_loss
                ckpt_manager.save(checkpoint_number=epoch)

        print('Epoch {} | Loss {:.6f} | Val loss {:.6f}'.format(
            epoch + 1, total_loss / step_per_epoch, previous_best))
        print('Time taken for 1 epoch {:.6f} sec\n'.format(time.time() -
                                                           start))

        if epoch % 20 == 0:
            total_loss = 0
            for batch in range(len(test_idx) // batch_size):
                batch_idx = np.arange(batch_size) + batch * batch_size
                idx = test_idx[batch_idx]
                input_tensor = transitions[obs_idx[idx], :]
                target = encoded_captions[caption_idx[idx]]
                # input_tensor = input_tensor[:, 0] - input_tensor[:, 1]
                t_loss = evaluate_batch(input_tensor, target)
                total_loss += t_loss

            print('====================================================')
            print('Test Loss {:.6f}'.format(total_loss /
                                            (len(test_idx) // batch_size)))
            print('====================================================\n')
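loss_function above zeroes out the loss at positions holding the masked symbol (index 1) before averaging. A standalone check of that masking with toy logits, assuming TF2 eager execution. Note that tf.reduce_mean still divides by all positions, masked ones included, so masked tokens lower the average rather than being excluded from it; dividing by the mask sum would exclude them exactly:

import tensorflow as tf

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

real = tf.constant([2, 1, 1])        # positions holding symbol 1 get masked
pred = tf.random.normal((3, 5))      # toy logits over a 5-word vocabulary
mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 1)), pred.dtype)
loss_ = loss_object(real, pred) * mask  # per-position loss, zeroed where masked
print(tf.reduce_mean(loss_).numpy())    # mean over all 3 positions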
Example #6
  def rollout(self,
              env,
              agent,
              directory,
              record_video=False,
              timeout=8,
              num_episode=10,
              record_trajectory=False):
    """Rollout and save.

    Args:
      env: the RL environment
      agent: the RL agent
      directory: directory where the output of the rollout is saved
      record_video: whether to record a video of the rollout
      timeout: number of steps after which the current goal times out
      num_episode: number of rollout episodes
      record_trajectory: whether to record the ground truth trajectory

    Returns:
      percentage of success during this rollout
    """
    print('\n#######################################')
    print('Rolling out...')
    print('#######################################')

    # randomly change subset of embedding
    if self._use_synonym_for_rollout and self.cfg.embedding_type == 'random':
      original_embedding = agent.randomize_partial_word_embedding(10)

    all_frames = []
    ep_observation, ep_action, ep_agn = [], [], []
    black_frame = pad_image(env.render(mode='rgb_array')) * 0.0
    goal_sampled = 0
    timeout_count, success = 0, 0
    for ep in range(num_episode):
      s = env.reset(self.cfg.diverse_scene_content)
      all_frames += [black_frame] * 10
      g_text, p = env.sample_goal()
      if env.all_goals_satisfied:
        s = env.reset(True)
        g_text, p = env.sample_goal()
      goal_sampled += 1
      g = self.encode_fn(g_text)
      g = np.squeeze(pad_to_max_length([g], self.cfg.max_sequence_length)[0])
      if self._use_synonym_for_rollout and self.cfg.embedding_type != 'random':
        # use unseen lexicons for test
        g = paraphrase_sentence(
            self.decode_fn(g), synonym_tables=_SYNONYM_TABLES)
      current_goal_repetition = 0
      for t in range(self.cfg.max_episode_length):
        prob = self.epsilon if record_trajectory else 0.0
        action = agent.step(s, g, env, explore_prob=prob)
        s_tp1, r, _, _ = env.step(
            action,
            record_achieved_goal=False,
            goal=p,
            atomic_goal=self.cfg.record_atomic_instruction)
        s = s_tp1
        all_frames.append(
            add_text(pad_image(env.render(mode='rgb_array')), g_text))
        current_goal_repetition += 1

        if record_trajectory:
          ep_observation.append(env.get_direct_obs().tolist())
          ep_action.append(action)

        sample_new_goal = False
        if r > env.shape_val:
          img = pad_image(env.render(mode='rgb_array'))
          for _ in range(5):
            all_frames.append(add_text(img, g_text, color='green'))
          success += 1
          sample_new_goal = True

        if current_goal_repetition >= timeout:
          all_frames.append(
              add_text(pad_image(env.render(mode='rgb_array')), 'time out :('))
          timeout_count += 1
          sample_new_goal = True

        if sample_new_goal:
          g, p = env.sample_goal()
          if env.all_goals_satisfied:
            break
          g_text = g
          g = self.encode_fn(g_text)
          g = np.squeeze(
              pad_to_max_length([g], self.cfg.max_sequence_length)[0])
          if self._use_synonym_for_rollout and self.cfg.embedding_type != 'random':
            g = paraphrase_sentence(
                self.decode_fn(g), synonym_tables=_SYNONYM_TABLES)
          current_goal_repetition = 0
          goal_sampled += 1

    # restore the original embedding
    if self._use_synonym_for_rollout and self.cfg.embedding_type == 'random':
      agent.set_embedding(original_embedding)

    print('Rollout finished')
    print('{} instructions tried'.format(goal_sampled))
    print('{} instructions timed out'.format(timeout_count))
    print('{} success rate\n'.format(1 - float(timeout_count) / goal_sampled))
    if record_video:
      save_video(np.uint8(all_frames), directory, fps=5)
      print('Video saved...')
    if record_trajectory:
      print('Recording trajectory...')
      datum = {
          'obs': ep_observation,
          'action': ep_action,
          'achieved goal': ep_agn,
      }
      save_json(datum, directory[:-4] + '_trajectory.json')
    return 1 - float(timeout_count) / goal_sampled
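paraphrase_sentence with a synonym table, used above to test generalization to unseen lexicons, is not defined in this snippet. A toy sketch of the idea; the synonym table and function name here are hypothetical:

import random

# Hypothetical synonym table in the spirit of _SYNONYM_TABLES; the real
# mapping is not shown in this snippet.
_TOY_SYNONYMS = {'move': ['push', 'shift'], 'ball': ['sphere']}

def paraphrase_with_synonyms(sentence, tables=_TOY_SYNONYMS, rng=random):
    """Replace each word that has listed synonyms with a random choice."""
    words = [rng.choice(tables[w]) if w in tables else w
             for w in sentence.split()]
    return ' '.join(words)

print(paraphrase_with_synonyms('move the red ball'))
# e.g. 'push the red sphere'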
Example #7
  def learn(self, env, agent, replay_buffer):
    """Run learning for 1 cycle with consists of num_episode of episodes.

    Args:
      env: the RL environment
      agent: the RL agent
      replay_buffer: the experience replay buffer

    Returns:
      statistics of the training episode
    """
    average_per_ep_reward = []
    average_per_ep_achieved_n = []
    average_per_ep_relabel_n = []
    average_batch_loss = []

    curr_step = agent.get_global_step()
    self.update_epsilon(curr_step)
    tic = time.time()
    for _ in range(self.cfg.num_episode):
      curr_step = agent.increase_global_step()

      sample_new_scene = random.uniform(0, 1) < self.cfg.sample_new_scene_prob
      s = env.reset(sample_new_scene)
      episode_experience = []
      episode_reward = 0
      episode_achieved_n = 0
      episode_relabel_n = 0

      # rollout
      g_text, p = env.sample_goal()
      if env.all_goals_satisfied:
        s = env.reset(True)
        g_text, p = env.sample_goal()
      g = self.encode_fn(g_text)
      g = np.squeeze(pad_to_max_length([g], self.cfg.max_sequence_length)[0])
      _ = agent.step(s, g, env, 0.0)  # taking a step to create weights

      for t in range(self.cfg.max_episode_length):
        a = agent.step(s, g, env, self.epsilon)
        s_tp1, r, _, _ = env.step(
            a,
            record_achieved_goal=self._use_oracle_instruction,
            goal=p,
            atomic_goal=self.cfg.record_atomic_instruction)
        if self._use_labeler_as_reward:
          labeler_answer = self.labeler.verify_instruction(
              env.convert_order_invariant_to_direct(s_tp1), g)
          r = float(labeler_answer > 0.5)
        if self._use_oracle_instruction:
          ag = env.get_achieved_goals()
        else:
          ag = [None]
        episode_experience.append((s, a, r, s_tp1, g, ag))
        episode_reward += r
        s = s_tp1
        if r > env.shape_val:
          episode_achieved_n += 1
          g_text, p = env.sample_goal()
          if env.all_goals_satisfied:
            break
          g = self.encode_fn(g_text)
          g = np.squeeze(
              pad_to_max_length([g], self.cfg.max_sequence_length)[0])

      average_per_ep_reward.append(episode_reward)
      average_per_ep_achieved_n.append(episode_achieved_n)

      # processing trajectory
      episode_length = len(episode_experience)

      if not self._use_oracle_instruction:  # generate instructions from traj
        transition_pair = []
        if self.cfg.obs_type == 'order_invariant':
          for t in episode_experience:
            transition_pair.append([
                env.convert_order_invariant_to_direct(t[0]),
                env.convert_order_invariant_to_direct(t[3])
            ])
          transition_pair = np.stack(transition_pair)
        else:
          for t in episode_experience:
            transition_pair.append([t[0], t[3]])

        all_achieved_goals = self.labeler.label_trajectory(
            transition_pair, null_token=2)
        for i in range(len(episode_experience)):
          s, a, r, s_tp1, g, ag = episode_experience[i]
          step_i_text = []
          for inst in all_achieved_goals[i]:
            decoded_inst = self.decode_fn(inst)
            step_i_text.append(decoded_inst)
          episode_experience[i] = [s, a, r, s_tp1, g, step_i_text]

      # For each step u, record the indices of later steps t whose
      # achieved-goal list is non-empty, so relabeling can draw from
      # goals achieved in the future.
      non_null_future_idx = [[] for _ in range(episode_length)]
      for t in range(episode_length):
        _, _, _, _, _, ag = episode_experience[t]
        if ag:
          for u in range(t):
            non_null_future_idx[u].append(t)

      for t in range(episode_length):
        s, a, r, s_tp1, g, ag = episode_experience[t]
        episode_relabel_n += float(len(ag) > 0)
        g_text = self.decode_fn(g)
        if self.cfg.paraphrase:
          g_text = paraphrase_sentence(
              g_text, delete_color=self.cfg.diverse_scene_content)
        g = self.encode_fn(g_text)
        replay_buffer.add((s, a, r, s_tp1, g))
        if self.cfg.relabeling:
          self.hir_relabel(non_null_future_idx, episode_experience, t,
                           replay_buffer, env)

      average_per_ep_relabel_n.append(episode_relabel_n / float(episode_length))

      # training
      if not self.is_warming_up(curr_step):
        batch_loss = 0
        for _ in range(self.cfg.optimization_steps):
          experience = replay_buffer.sample(self.cfg.batchsize)
          s, a, r, s_tp1, g = [
              np.squeeze(elem, axis=1) for elem in np.split(experience, 5, 1)
          ]
          s = np.stack(s)
          s_tp1 = np.stack(s_tp1)
          g = np.array(list(g))
          if self.cfg.instruction_repr == 'language':
            g = np.array(pad_to_max_length(g, self.cfg.max_sequence_length))
          batch = {
              'obs': np.asarray(s),
              'action': np.asarray(a),
              'reward': np.asarray(r),
              'obs_next': np.asarray(s_tp1),
              'g': np.asarray(g)
          }
          loss_dict = agent.train(batch)
          batch_loss += loss_dict['loss']
        average_batch_loss.append(batch_loss / self.cfg.optimization_steps)

    time_per_episode = (time.time() - tic) / self.cfg.num_episode

    # Update the target network
    agent.update_target_network()

    ################## Debug ##################
    sample = replay_buffer.sample(min(10000, len(replay_buffer.buffer)))
    _, _, sample_r, _, _ = [
        np.squeeze(elem, axis=1) for elem in np.split(sample, 5, 1)
    ]
    print('n one:', np.sum(np.float32(sample_r == 1.0)), 'n zero',
          np.sum(np.float32(sample_r == 0.0)), 'n buff',
          len(replay_buffer.buffer))
    ################## Debug ##################
    stats = {
        'loss': np.mean(average_batch_loss) if average_batch_loss else 0,
        'reward': np.mean(average_per_ep_reward),
        'achieved_goal': np.mean(average_per_ep_achieved_n),
        'average_relabel_goal': np.mean(average_per_ep_relabel_n),
        'epsilon': self.epsilon,
        'global_step': curr_step,
        'time_per_episode': time_per_episode,
        'replay_buffer_reward_avg': np.mean(sample_r),
        'replay_buffer_reward_var': np.var(sample_r)
    }
    return stats
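Both learn methods unpack replay samples with the same np.split / np.squeeze idiom over an object array whose rows are (s, a, r, s_tp1, g) tuples. A toy reconstruction showing why the squeeze and stack are needed:

import numpy as np

# Toy replay sample: 3 rows of (s, a, r, s_tp1, g) stored as objects.
experience = np.empty((3, 5), dtype=object)
for i in range(3):
    row = (np.zeros(4), i, float(i % 2), np.ones(4), [1, 2, 0])
    for j, elem in enumerate(row):
        experience[i, j] = elem

# np.split yields five (3, 1) object columns; squeeze flattens each to (3,).
s, a, r, s_tp1, g = [
    np.squeeze(col, axis=1) for col in np.split(experience, 5, 1)
]
s = np.stack(s)       # object column of (4,) arrays -> (3, 4) float array
print(s.shape, a, r)  # (3, 4) [0 1 2] [0.0 1.0 0.0]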