Example #1
def evaluate(
    loop_hparams, planner_hparams, policy_dir, model_dir, eval_metrics_dir,
    agent_type, eval_mode, eval_with_learner, log_every_steps, debug_video_path,
    num_debug_videos=1, random_starts_step_limit=None,
    report_fn=None, report_metric=None
):
  """Evaluate."""
  if eval_with_learner:
    assert agent_type == "policy"

  if report_fn:
    assert report_metric is not None

  eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)
  video_writers = ()
  kwargs = {}
  if eval_mode in ["agent_real", "agent_simulated"]:
    if not eval_with_learner:
      if debug_video_path:
        tf.gfile.MakeDirs(debug_video_path)
        video_writers = [
            common_video.WholeVideoWriter(  # pylint: disable=g-complex-comprehension
                fps=10,
                output_path=os.path.join(debug_video_path, "{}.avi".format(i)),
                file_format="avi",
            )
            for i in range(num_debug_videos)
        ]
      kwargs["eval_fn"] = make_eval_fn_with_agent(
          agent_type, eval_mode, planner_hparams, model_dir,
          log_every_steps=log_every_steps,
          video_writers=video_writers,
          random_starts_step_limit=random_starts_step_limit
      )
    eval_metrics = rl_utils.evaluate_all_configs(
        loop_hparams, policy_dir, **kwargs
    )
  else:
    eval_metrics = evaluate_world_model(
        agent_type, loop_hparams, planner_hparams, model_dir, policy_dir,
        random_starts_step_limit, debug_video_path, log_every_steps
    )
  rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, 0)

  for video_writer in video_writers:
    video_writer.finish_to_disk()

  # Report metrics
  if report_fn:
    if report_metric == "mean_reward":
      metric_name = rl_utils.get_metric_name(
          sampling_temp=loop_hparams.eval_sampling_temps[0],
          max_num_noops=loop_hparams.eval_max_num_noops,
          clipped=False
      )
      report_fn(eval_metrics[metric_name], 0)
    else:
      report_fn(eval_metrics[report_metric], 0)
  return eval_metrics
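
Every example on this page revolves around common_video.WholeVideoWriter. The following is a minimal, self-contained sketch of the lifecycle the examples rely on, using only the calls that appear on this page (the constructor, write, and finish_to_disk); the import path, the output location, and the dummy frames are assumptions for illustration.

import os

import numpy as np
from tensor2tensor.utils import common_video  # assumed import path

# A minimal sketch: construct the writer with fps/output_path/file_format,
# feed uint8 RGB frames through write(), then flush the encoded video to
# disk with finish_to_disk(), exactly as the examples above do.
writer = common_video.WholeVideoWriter(
    fps=10,
    output_path=os.path.join("/tmp", "debug.avi"),  # hypothetical path
    file_format="avi",
)
for _ in range(100):
  frame = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)
  writer.write(frame)
writer.finish_to_disk()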
Example #2
def debug_video_writer_factory(output_dir):
    """Creates a VideoWriter for debug videos."""
    if FLAGS.disable_ffmpeg:
        return common_video.IndividualFrameWriter(output_dir)
    else:
        output_path = os.path.join(output_dir, "video.avi")
        return common_video.WholeVideoWriter(fps=10,
                                             output_path=output_path,
                                             file_format="avi")
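
A hedged usage sketch for the factory in Example #2. It assumes FLAGS.disable_ffmpeg is configured by the surrounding program, that the output directory is writable, and that IndividualFrameWriter shares the write()/finish_to_disk() interface WholeVideoWriter demonstrates in the other examples.

import numpy as np

# Usage sketch only: the output directory is hypothetical, and the frames
# are placeholders for whatever the caller wants to dump.
writer = debug_video_writer_factory("/tmp/debug_videos")
for _ in range(30):
  writer.write(np.zeros((64, 64, 3), dtype=np.uint8))
writer.finish_to_disk()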
Example #3
  def _video_dump_frame(self, obs, rews):
    if self._video_writer is None:
      self._video_counter += 1
      self._video_writer = common_video.WholeVideoWriter(
          fps=10,
          output_path=os.path.join(self._video_dir,
                                   "{}.avi".format(self._video_counter)),
          file_format="avi")
    img = PIL_Image().new("RGB", (obs.shape[-2], 11),)
    draw = PIL_ImageDraw().Draw(img)
    draw.text((0, 0), "r:{:3}".format(int(rews[0])), fill=(255, 0, 0))
    self._video_writer.write(np.concatenate([np.asarray(img), obs[0]], axis=0))
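
Example #3 stacks an 11-pixel text header, carrying the current reward, on top of each observation before writing it. The same idea as a standalone sketch, with plain PIL imports in place of the lazy-import helpers PIL_Image()/PIL_ImageDraw() and a dummy observation standing in for obs[0].

import numpy as np
from PIL import Image, ImageDraw

frame = np.zeros((210, 160, 3), dtype=np.uint8)  # dummy single observation
reward = 1.0

# Header strip: same width as the frame, 11 pixels tall, reward drawn in red.
header = Image.new("RGB", (frame.shape[1], 11))
draw = ImageDraw.Draw(header)
draw.text((0, 0), "r:{:3}".format(int(reward)), fill=(255, 0, 0))

# Stack the header above the frame; this is the array the example passes to
# WholeVideoWriter.write().
annotated = np.concatenate([np.asarray(header), frame], axis=0)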
Example #4
def evaluate(loop_hparams,
             planner_hparams,
             policy_dir,
             model_dir,
             eval_metrics_dir,
             agent_type,
             eval_with_learner,
             log_every_steps,
             debug_video_path,
             report_fn=None,
             report_metric=None):
    """Evaluate."""
    if eval_with_learner:
        assert agent_type == "policy"

    if report_fn:
        assert report_metric is not None

    eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)
    kwargs = {}
    # Define video_writer up front so the finish_to_disk() check further down
    # cannot raise a NameError when eval_with_learner is True and this branch
    # is skipped.
    video_writer = None
    if not eval_with_learner:
        if debug_video_path:
            video_writer = common_video.WholeVideoWriter(
                fps=10, output_path=debug_video_path, file_format="avi")
        kwargs["eval_fn"] = make_eval_fn_with_agent(
            agent_type,
            planner_hparams,
            model_dir,
            log_every_steps=log_every_steps,
            video_writer=video_writer)
    eval_metrics = rl_utils.evaluate_all_configs(loop_hparams, policy_dir,
                                                 **kwargs)
    rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, 0)

    if video_writer is not None:
        video_writer.finish_to_disk()

    # Report metrics
    if report_fn:
        if report_metric == "mean_reward":
            metric_name = rl_utils.get_metric_name(
                sampling_temp=loop_hparams.eval_sampling_temps[0],
                max_num_noops=loop_hparams.eval_max_num_noops,
                clipped=False)
            report_fn(eval_metrics[metric_name], 0)
        else:
            report_fn(eval_metrics[report_metric], 0)
    return eval_metrics
Example #5
def evaluate_world_model(real_env, hparams, world_model_dir, debug_video_path):
    """Evaluate the world model (reward accuracy)."""
    environment_spec = make_simulated_env_spec(real_env, hparams)
    environment_spec.wrappers = []

    num_input_frames = environment_spec.video_num_input_frames
    rollout_subsequences = []

    def initial_frame_chooser(batch_size):
        assert batch_size == len(rollout_subsequences)
        return np.stack([[
            frame.observation.decode()
            for frame in subsequence[:num_input_frames]
        ] for subsequence in rollout_subsequences])

    environment_spec.add_hparam("initial_frame_chooser", initial_frame_chooser)

    sim_env = SimulatedBatchGymEnv(environment_spec,
                                   hparams.wm_eval_batch_size, world_model_dir)
    subsequence_length = int(
        max(hparams.wm_eval_rollout_ratios) * hparams.ppo_epoch_length)
    rollouts = real_env.current_epoch_rollouts(
        split=tf.contrib.learn.ModeKeys.EVAL,
        minimal_rollout_frames=(subsequence_length + num_input_frames))

    video_writer = common_video.WholeVideoWriter(fps=10,
                                                 output_path=debug_video_path,
                                                 file_format="avi")

    reward_accuracies_by_length = {
        int(ratio * hparams.ppo_epoch_length): []
        for ratio in hparams.wm_eval_rollout_ratios
    }
    for _ in range(hparams.wm_eval_epochs_num):
        rollout_subsequences[:] = random_rollout_subsequences(
            rollouts, hparams.wm_eval_batch_size,
            subsequence_length + num_input_frames)

        eval_subsequences = [
            subsequence[(num_input_frames - 1):]
            for subsequence in rollout_subsequences
        ]

        # Check that the initial observation is the same in the real and simulated
        # rollout.
        sim_init_obs = sim_env.reset()

        def decode_real_obs(index):
            return np.stack([
                subsequence[index].observation.decode()
                for subsequence in eval_subsequences  # pylint: disable=cell-var-from-loop
            ])

        real_init_obs = decode_real_obs(0)
        assert np.all(sim_init_obs == real_init_obs)

        debug_frame_batches = []

        def append_debug_frame_batch(sim_obs, real_obs):
            errs = np.maximum(
                np.abs(sim_obs.astype(np.int) - real_obs, dtype=np.int) - 10,
                0).astype(np.uint8)
            debug_frame_batches.append(  # pylint: disable=cell-var-from-loop
                np.concatenate([sim_obs, real_obs, errs], axis=2))

        append_debug_frame_batch(sim_init_obs, real_init_obs)

        (sim_cum_rewards,
         real_cum_rewards) = (np.zeros(hparams.wm_eval_batch_size)
                              for _ in range(2))
        for i in range(subsequence_length):
            actions = [
                subsequence[i].action for subsequence in eval_subsequences
            ]
            (sim_obs, sim_rewards, _) = sim_env.step(actions)
            sim_cum_rewards += sim_rewards

            real_cum_rewards += [
                subsequence[i + 1].reward for subsequence in eval_subsequences
            ]
            for (length, reward_accuracies
                 ) in six.iteritems(reward_accuracies_by_length):
                if i + 1 == length:
                    reward_accuracies.append(
                        np.sum(sim_cum_rewards == real_cum_rewards) /
                        len(real_cum_rewards))

            real_obs = decode_real_obs(i + 1)
            append_debug_frame_batch(sim_obs, real_obs)

        for debug_frames in np.stack(debug_frame_batches, axis=1):
            for debug_frame in debug_frames:
                video_writer.write(debug_frame)

    video_writer.finish_to_disk()

    return {
        "reward_accuracy/at_{}".format(length): np.mean(reward_accuracies)
        for (length,
             reward_accuracies) in six.iteritems(reward_accuracies_by_length)
    }
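
Both world-model evaluation examples reduce to the same reward-accuracy metric: the fraction of batch elements whose simulated cumulative reward exactly equals the real one at a given rollout length. A tiny numpy illustration with made-up values:

import numpy as np

# Made-up cumulative rewards for a batch of four rollouts at one length.
sim_cum_rewards = np.array([3.0, 5.0, 0.0, 2.0])
real_cum_rewards = np.array([3.0, 4.0, 0.0, 2.0])

# The same expression the examples append to reward_accuracies_by_length.
reward_accuracy = (
    np.sum(sim_cum_rewards == real_cum_rewards) / len(real_cum_rewards))
# reward_accuracy == 0.75: three of the four rollouts match exactly.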
Example #6
def evaluate_world_model(real_env, hparams, world_model_dir, debug_video_path):
  """Evaluate the world model (reward accuracy)."""
  frame_stack_size = hparams.frame_stack_size
  rollout_subsequences = []
  def initial_frame_chooser(batch_size):
    assert batch_size == len(rollout_subsequences)
    return np.stack([
        [frame.observation.decode() for frame in subsequence[:frame_stack_size]]
        for subsequence in rollout_subsequences
    ])

  env_fn = make_simulated_env_fn_from_hparams(
      real_env, hparams, batch_size=hparams.wm_eval_batch_size,
      initial_frame_chooser=initial_frame_chooser, model_dir=world_model_dir
  )
  sim_env = env_fn(in_graph=False)
  subsequence_length = int(
      max(hparams.wm_eval_rollout_ratios) * hparams.simulated_rollout_length
  )
  rollouts = real_env.current_epoch_rollouts(
      split=tf.estimator.ModeKeys.EVAL,
      minimal_rollout_frames=(subsequence_length + frame_stack_size)
  )

  video_writer = common_video.WholeVideoWriter(
      fps=10, output_path=debug_video_path, file_format="avi"
  )

  reward_accuracies_by_length = {
      int(ratio * hparams.simulated_rollout_length): []
      for ratio in hparams.wm_eval_rollout_ratios
  }
  for _ in range(hparams.wm_eval_num_batches):
    rollout_subsequences[:] = random_rollout_subsequences(
        rollouts, hparams.wm_eval_batch_size,
        subsequence_length + frame_stack_size
    )

    eval_subsequences = [
        subsequence[(frame_stack_size - 1):]
        for subsequence in rollout_subsequences
    ]

    # Check that the initial observation is the same in the real and simulated
    # rollout.
    sim_init_obs = sim_env.reset()
    def decode_real_obs(index):
      return np.stack([
          subsequence[index].observation.decode()
          for subsequence in eval_subsequences  # pylint: disable=cell-var-from-loop
      ])
    real_init_obs = decode_real_obs(0)
    assert np.all(sim_init_obs == real_init_obs)

    debug_frame_batches = []
    def append_debug_frame_batch(sim_obs, real_obs, sim_cum_rews,
                                 real_cum_rews, sim_rews, real_rews):
      """Add a debug frame."""
      rews = [[sim_cum_rews, sim_rews], [real_cum_rews, real_rews]]
      headers = []
      for j in range(len(sim_obs)):
        local_nps = []
        for i in range(2):
          img = PIL_Image().new("RGB", (sim_obs.shape[-2], 11),)
          draw = PIL_ImageDraw().Draw(img)
          draw.text((0, 0), "c:{:3}, r:{:3}".format(int(rews[i][0][j]),
                                                    int(rews[i][1][j])),
                    fill=(255, 0, 0))
          local_nps.append(np.asarray(img))
        local_nps.append(np.zeros_like(local_nps[0]))
        headers.append(np.concatenate(local_nps, axis=1))
      errs = absolute_hinge_difference(sim_obs, real_obs)
      headers = np.stack(headers)
      debug_frame_batches.append(  # pylint: disable=cell-var-from-loop
          np.concatenate([headers,
                          np.concatenate([sim_obs, real_obs, errs], axis=2)],
                         axis=1)
      )
    append_debug_frame_batch(sim_init_obs, real_init_obs,
                             np.zeros(hparams.wm_eval_batch_size),
                             np.zeros(hparams.wm_eval_batch_size),
                             np.zeros(hparams.wm_eval_batch_size),
                             np.zeros(hparams.wm_eval_batch_size))

    (sim_cum_rewards, real_cum_rewards) = (
        np.zeros(hparams.wm_eval_batch_size) for _ in range(2)
    )
    for i in range(subsequence_length):
      actions = [subsequence[i].action for subsequence in eval_subsequences]
      (sim_obs, sim_rewards, _) = sim_env.step(actions)
      sim_cum_rewards += sim_rewards

      real_rewards = np.array([
          subsequence[i + 1].reward for subsequence in eval_subsequences
      ])
      real_cum_rewards += real_rewards
      for (length, reward_accuracies) in six.iteritems(
          reward_accuracies_by_length
      ):
        if i + 1 == length:
          reward_accuracies.append(
              np.sum(sim_cum_rewards == real_cum_rewards) /
              len(real_cum_rewards)
          )

      real_obs = decode_real_obs(i + 1)
      append_debug_frame_batch(sim_obs, real_obs, sim_cum_rewards,
                               real_cum_rewards, sim_rewards, real_rewards)

    for debug_frames in np.stack(debug_frame_batches, axis=1):
      for debug_frame in debug_frames:
        video_writer.write(debug_frame)

  video_writer.finish_to_disk()

  return {
      "reward_accuracy/at_{}".format(length): np.mean(reward_accuracies)
      for (length, reward_accuracies) in six.iteritems(
          reward_accuracies_by_length
      )
  }
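
Example #6 delegates the error panel to absolute_hinge_difference, while Example #5 inlines the computation (absolute pixel difference, reduced by a small gap and clipped at zero). Below is a hypothetical stand-in that mirrors the inlined version; the real helper's signature and default gap are not shown on this page.

import numpy as np

def absolute_hinge_difference_sketch(arr1, arr2, gap=10):
  """Hypothetical stand-in mirroring the inlined error term in Example #5."""
  diff = np.abs(arr1.astype(np.int64) - arr2.astype(np.int64))
  return np.maximum(diff - gap, 0).astype(np.uint8)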
Example #7
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    trainer_lib.set_random_seed(FLAGS.random_seed)
    usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

    # Create hparams
    hparams = create_hparams()
    hparams.force_full_predict = True
    hparams.scheduled_sampling_k = -1

    # Params
    num_agents = 1  # TODO(mbz): fix the code for more agents
    num_steps = FLAGS.num_steps
    num_actions = hparams.problem.num_actions
    frame_shape = hparams.problem.frame_shape
    resized_frame = hparams.preprocess_resize_frames is not None
    if resized_frame:
        frame_shape = hparams.preprocess_resize_frames
        frame_shape += [hparams.problem.num_channels]

    dataset = registry.problem(FLAGS.problem).dataset(
        tf.estimator.ModeKeys.TRAIN, shuffle_files=True, hparams=hparams)

    dataset = dataset.apply(
        tf.contrib.data.batch_and_drop_remainder(num_agents))
    data = dataset.make_one_shot_iterator().get_next()
    # Setup input placeholders
    input_size = [num_agents, hparams.video_num_input_frames]
    placeholders = {
        "inputs": tf.placeholder(tf.float32, input_size + frame_shape),
        "input_action": tf.placeholder(tf.int64, input_size + [1]),
        "input_reward": tf.placeholder(tf.int64, input_size + [1]),
    }
    # Create model.
    model_cls = registry.model(FLAGS.model)
    model = model_cls(hparams, tf.estimator.ModeKeys.PREDICT)
    prediction_ops = model.infer(placeholders)

    states_q = Queue(maxsize=hparams.video_num_input_frames)
    actions_q = Queue(maxsize=hparams.video_num_input_frames)
    rewards_q = Queue(maxsize=hparams.video_num_input_frames)
    all_qs = (states_q, actions_q, rewards_q)

    writer = common_video.WholeVideoWriter(fps=10,
                                           output_path=FLAGS.output_gif)

    saver = tf.train.Saver()
    with tf.train.SingularMonitoredSession() as sess:
        # Load latest checkpoint
        ckpt = tf.train.get_checkpoint_state(
            FLAGS.output_dir).model_checkpoint_path
        saver.restore(sess.raw_session(), ckpt)

        # get init frames from the dataset
        data_np = sess.run(data)

        frames = np.split(data_np["inputs"], hparams.video_num_input_frames, 1)
        for frame in frames:
            frame = np.squeeze(frame, 1)
            states_q.put(frame)
            writer.write(frame[0].astype(np.uint8))

        actions = np.split(data_np["input_action"],
                           hparams.video_num_input_frames, 1)
        for action in actions:
            actions_q.put(np.squeeze(action, 1))

        rewards = np.split(data_np["input_reward"],
                           hparams.video_num_input_frames, 1)
        for reward in rewards:
            rewards_q.put(np.squeeze(reward, 1))

        for step in range(num_steps):
            print(">>>>>>> ", step)

            random_actions = np.random.randint(num_actions - 1)
            random_actions = np.expand_dims(random_actions, 0)
            random_actions = np.tile(random_actions, (num_agents, 1))

            # Shape inputs and targets
            inputs, input_action, input_reward = (np.stack(list(q.queue),
                                                           axis=1)
                                                  for q in all_qs)

            # Predict next frames
            feed = {
                placeholders["inputs"]: inputs,
                placeholders["input_action"]: input_action,
                placeholders["input_reward"]: input_reward,
            }
            predictions = sess.run(prediction_ops, feed_dict=feed)

            predicted_states = predictions["targets"][:, 0]
            predicted_reward = predictions["target_reward"][:, 0]

            # Update queues
            new_data = (predicted_states, random_actions, predicted_reward)
            for q, d in zip(all_qs, new_data):
                q.get()
                q.put(d.copy())

            writer.write(np.round(predicted_states[0]).astype(np.uint8))

        video = writer.finish()
        writer.save_to_disk(video)
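
Example #7 closes the writer with an explicit finish()/save_to_disk() pair, while the other examples use the single finish_to_disk() call; judging purely from how they are used on this page, the two endings appear interchangeable. A sketch of the two-step variant, with a hypothetical output path and a dummy frame:

import numpy as np
from tensor2tensor.utils import common_video  # assumed import path

writer = common_video.WholeVideoWriter(
    fps=10, output_path="/tmp/out.avi", file_format="avi")  # hypothetical
writer.write(np.zeros((64, 64, 3), dtype=np.uint8))

# Example #7 style: encode the whole video first, then persist it explicitly.
video = writer.finish()
writer.save_to_disk(video)
# The other examples collapse these two steps into writer.finish_to_disk().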
Example #8
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    trainer_lib.set_random_seed(FLAGS.random_seed)
    usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

    # Create hparams
    hparams = trainer_lib.create_hparams(FLAGS.hparams_set,
                                         FLAGS.hparams,
                                         data_dir=os.path.expanduser(
                                             FLAGS.data_dir),
                                         problem_name=FLAGS.problem)
    hparams.force_full_predict = True
    hparams.scheduled_sampling_k = -1

    # Params
    num_agents = 1  # TODO(mbz): fix the code for more agents
    num_steps = FLAGS.num_steps
    if hasattr(hparams.problem, "num_actions"):
        num_actions = hparams.problem.num_actions
    else:
        num_actions = None
    frame_shape = hparams.problem.frame_shape
    resized_frame = hparams.preprocess_resize_frames is not None
    if resized_frame:
        frame_shape = hparams.preprocess_resize_frames
        frame_shape += [hparams.problem.num_channels]

    dataset = registry.problem(FLAGS.problem).dataset(
        tf.estimator.ModeKeys.TRAIN,
        shuffle_files=True,
        data_dir=os.path.expanduser(FLAGS.data_dir),
        hparams=hparams)

    dataset = dataset.batch(num_agents, drop_remainder=True)
    data = dataset.make_one_shot_iterator().get_next()
    # Setup input placeholders
    input_size = [num_agents, hparams.video_num_input_frames]
    if num_actions is None:
        placeholders = {
            "inputs": tf.placeholder(tf.float32, input_size + frame_shape)
        }
    else:
        placeholders = {
            "inputs": tf.placeholder(tf.float32, input_size + frame_shape),
            "input_action": tf.placeholder(tf.int64, input_size + [1]),
            "input_reward": tf.placeholder(tf.int64, input_size + [1]),
            "reset_internal_states": tf.placeholder(tf.float32, []),
        }
    # Create model.
    model_cls = registry.model(FLAGS.model)
    model = model_cls(hparams, tf.estimator.ModeKeys.PREDICT)
    prediction_ops = model.infer(placeholders)

    states_q = Queue(maxsize=hparams.video_num_input_frames)
    actions_q = Queue(maxsize=hparams.video_num_input_frames)
    rewards_q = Queue(maxsize=hparams.video_num_input_frames)
    if num_actions is not None:
        all_qs = [states_q, actions_q, rewards_q]
    else:
        all_qs = [states_q]

    writer = common_video.WholeVideoWriter(fps=FLAGS.fps,
                                           output_path=FLAGS.output_gif)

    saver = tf.train.Saver(tf.trainable_variables())
    with tf.train.SingularMonitoredSession() as sess:
        # Load latest checkpoint
        ckpt = tf.train.get_checkpoint_state(
            FLAGS.output_dir).model_checkpoint_path
        saver.restore(sess.raw_session(), ckpt)

        # get init frames from the dataset
        data_np = sess.run(data)

        frames = np.split(data_np["inputs"], hparams.video_num_input_frames, 1)
        for frame in frames:
            frame = np.squeeze(frame, 1)
            states_q.put(frame)
            writer.write(frame[0].astype(np.uint8))

        if num_actions is not None:
            actions = np.split(data_np["input_action"],
                               hparams.video_num_input_frames, 1)
            for action in actions:
                actions_q.put(np.squeeze(action, 1))

            rewards = np.split(data_np["input_reward"],
                               hparams.video_num_input_frames, 1)
            for reward in rewards:
                rewards_q.put(np.squeeze(reward, 1))

        for step in range(num_steps):
            print(">>>>>>> ", step)

            if num_actions is not None:
                random_actions = np.random.randint(num_actions - 1)
                random_actions = np.expand_dims(random_actions, 0)
                random_actions = np.tile(random_actions, (num_agents, 1))

                # Shape inputs and targets
                inputs, input_action, input_reward = (np.stack(list(q.queue),
                                                               axis=1)
                                                      for q in all_qs)
            else:
                assert len(all_qs) == 1
                q = all_qs[0]
                elems = list(q.queue)
                # Need to adjust shapes sometimes.
                for i, e in enumerate(elems):
                    if len(e.shape) < 4:
                        elems[i] = np.expand_dims(e, axis=0)
                inputs = np.stack(elems, axis=1)

            # Predict next frames
            if num_actions is None:
                feed = {placeholders["inputs"]: inputs}
            else:
                feed = {
                    placeholders["inputs"]: inputs,
                    placeholders["input_action"]: input_action,
                    placeholders["input_reward"]: input_reward,
                    placeholders["reset_internal_states"]: float(step == 0),
                }
            predictions = sess.run(prediction_ops, feed_dict=feed)

            if num_actions is None:
                predicted_states = predictions[:, 0]
            else:
                predicted_states = predictions["targets"][:, 0]
                predicted_reward = predictions["target_reward"][:, 0]

            # Update queues
            if num_actions is None:
                new_data = (predicted_states)
            else:
                new_data = (predicted_states, random_actions, predicted_reward)
            for q, d in zip(all_qs, new_data):
                q.get()
                q.put(d.copy())

            writer.write(np.round(predicted_states[0]).astype(np.uint8))

        writer.finish_to_disk()
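
Examples #7 and #8 use Queue(maxsize=video_num_input_frames) as a fixed-length sliding window over the most recent input frames: the window is pre-filled from the dataset, then each prediction step drops the oldest entry with get() and appends the newest prediction with put(). A minimal, self-contained sketch of that pattern with dummy frames; the standard-library Queue import is an assumption, since the examples use the name unqualified.

import numpy as np
from queue import Queue  # assumed source of the unqualified Queue name

num_input_frames = 4
states_q = Queue(maxsize=num_input_frames)

# Pre-fill the window, as the examples do with frames taken from the dataset.
for _ in range(num_input_frames):
  states_q.put(np.zeros((1, 64, 64, 3), dtype=np.uint8))

for step in range(10):
  # Stack the window along the time axis to form the model input:
  # shape (batch, num_input_frames, height, width, channels).
  inputs = np.stack(list(states_q.queue), axis=1)
  predicted = np.zeros((1, 64, 64, 3), dtype=np.uint8)  # stand-in prediction
  states_q.get()           # drop the oldest frame...
  states_q.put(predicted)  # ...and append the newest one.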