def train(hparams, output_dir, report_fn=None):
    """Train."""
    hparams = initialize_env_specs(hparams)
    learner = rl_utils.LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                                   FLAGS.output_dir,
                                                   output_dir)
    policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
    rl_utils.update_hparams_from_hparams(policy_hparams, hparams,
                                         hparams.base_algo + "_")
    total_steps = policy_hparams.epochs_num
    eval_every_epochs = policy_hparams.eval_every_epochs
    if eval_every_epochs == 0:
        eval_every_epochs = total_steps
    policy_hparams.eval_every_epochs = 0

    steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
    if not steps or steps[-1] < eval_every_epochs:
        steps.append(eval_every_epochs)
    metric_name = rl_utils.get_metric_name(
        sampling_temp=hparams.eval_sampling_temps[0],
        max_num_noops=hparams.eval_max_num_noops,
        clipped=False)
    for step in steps:
        policy_hparams.epochs_num = step
        learner.train(hparams.env_fn,
                      policy_hparams,
                      simulated=False,
                      save_continuously=True,
                      epoch=0)
        eval_metrics = rl_utils.evaluate_all_configs(hparams, output_dir)
        tf.logging.info("Agent eval metrics:\n{}".format(
            pprint.pformat(eval_metrics)))
        if report_fn:
            report_fn(eval_metrics[metric_name], step)
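The schedule the loop above iterates over can be inspected in isolation; the sketch below mirrors the construction of `steps` in plain Python (the helper name `eval_schedule` is invented for illustration):

def eval_schedule(total_steps, eval_every_epochs):
    # Mirror of the schedule built above: train in chunks of
    # `eval_every_epochs` epochs, with 0 meaning "evaluate once at the end".
    if eval_every_epochs == 0:
        eval_every_epochs = total_steps
    steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
    if not steps or steps[-1] < eval_every_epochs:
        steps.append(eval_every_epochs)
    return steps

print(eval_schedule(100, 30))  # [30, 60, 90]
print(eval_schedule(100, 0))   # [100]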
Example #2
def evaluate(
    loop_hparams, planner_hparams, policy_dir, model_dir, eval_metrics_dir,
    agent_type, eval_with_learner, log_every_steps, report_fn=None,
    report_metric=None
):
  """Evaluate."""
  if eval_with_learner:
    assert agent_type == "policy"

  if report_fn:
    assert report_metric is not None

  eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)
  kwargs = {}
  if not eval_with_learner:
    kwargs["eval_fn"] = make_eval_fn_with_agent(
        agent_type, planner_hparams, model_dir, log_every_steps=log_every_steps
    )
  eval_metrics = rl_utils.evaluate_all_configs(
      loop_hparams, policy_dir, **kwargs
  )
  rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, 0)

  # Report metrics
  if report_fn:
    if report_metric == "mean_reward":
      metric_name = rl_utils.get_metric_name(
          sampling_temp=loop_hparams.eval_sampling_temps[0],
          max_num_noops=loop_hparams.eval_max_num_noops,
          clipped=False
      )
      report_fn(eval_metrics[metric_name], 0)
    else:
      report_fn(eval_metrics[report_metric], 0)
  return eval_metrics
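This evaluate function only ever calls report_fn(metric_value, step), so any two-argument callable can be plugged in; a minimal sketch (the function name here is illustrative):

def print_report_fn(metric_value, step):
  # Matches the report_fn(eval_metrics[...], 0) call made by evaluate() above.
  print("eval step {}: metric = {}".format(step, metric_value))

# e.g. evaluate(..., report_fn=print_report_fn, report_metric="mean_reward")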
Example #3
def evaluate(
    loop_hparams, planner_hparams, policy_dir, model_dir, eval_metrics_dir,
    agent_type, eval_mode, eval_with_learner, log_every_steps, debug_video_path,
    num_debug_videos=1, random_starts_step_limit=None,
    report_fn=None, report_metric=None
):
  """Evaluate."""
  if eval_with_learner:
    assert agent_type == "policy"

  if report_fn:
    assert report_metric is not None

  eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)
  video_writers = ()
  kwargs = {}
  if eval_mode in ["agent_real", "agent_simulated"]:
    if not eval_with_learner:
      if debug_video_path:
        tf.gfile.MakeDirs(debug_video_path)
        video_writers = [
            common_video.WholeVideoWriter(  # pylint: disable=g-complex-comprehension
                fps=10,
                output_path=os.path.join(debug_video_path, "{}.avi".format(i)),
                file_format="avi",
            )
            for i in range(num_debug_videos)
        ]
      kwargs["eval_fn"] = make_eval_fn_with_agent(
          agent_type, eval_mode, planner_hparams, model_dir,
          log_every_steps=log_every_steps,
          video_writers=video_writers,
          random_starts_step_limit=random_starts_step_limit
      )
    eval_metrics = rl_utils.evaluate_all_configs(
        loop_hparams, policy_dir, **kwargs
    )
  else:
    eval_metrics = evaluate_world_model(
        agent_type, loop_hparams, planner_hparams, model_dir, policy_dir,
        random_starts_step_limit, debug_video_path, log_every_steps
    )
  rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, 0)

  for video_writer in video_writers:
    video_writer.finish_to_disk()

  # Report metrics
  if report_fn:
    if report_metric == "mean_reward":
      metric_name = rl_utils.get_metric_name(
          sampling_temp=loop_hparams.eval_sampling_temps[0],
          max_num_noops=loop_hparams.eval_max_num_noops,
          clipped=False
      )
      report_fn(eval_metrics[metric_name], 0)
    else:
      report_fn(eval_metrics[report_metric], 0)
  return eval_metrics
Example #4
def evaluate(loop_hparams,
             planner_hparams,
             policy_dir,
             model_dir,
             eval_metrics_dir,
             agent_type,
             eval_with_learner,
             log_every_steps,
             debug_video_path,
             report_fn=None,
             report_metric=None):
    """Evaluate."""
    if eval_with_learner:
        assert agent_type == "policy"

    if report_fn:
        assert report_metric is not None

    eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)
    # Initialize here so the later finish_to_disk() check works even when
    # eval_with_learner is True and no writer is created.
    video_writer = None
    kwargs = {}
    if not eval_with_learner:
        if debug_video_path:
            video_writer = common_video.WholeVideoWriter(
                fps=10, output_path=debug_video_path, file_format="avi")
        kwargs["eval_fn"] = make_eval_fn_with_agent(
            agent_type,
            planner_hparams,
            model_dir,
            log_every_steps=log_every_steps,
            video_writer=video_writer)
    eval_metrics = rl_utils.evaluate_all_configs(loop_hparams, policy_dir,
                                                 **kwargs)
    rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, 0)

    if video_writer is not None:
        video_writer.finish_to_disk()

    # Report metrics
    if report_fn:
        if report_metric == "mean_reward":
            metric_name = rl_utils.get_metric_name(
                sampling_temp=loop_hparams.eval_sampling_temps[0],
                max_num_noops=loop_hparams.eval_max_num_noops,
                clipped=False)
            report_fn(eval_metrics[metric_name], 0)
        else:
            report_fn(eval_metrics[report_metric], 0)
    return eval_metrics
Example #5
def train(hparams, output_dir, env_problem_name, report_fn=None):
    """Train."""
    env_fn = initialize_env_specs(hparams, env_problem_name)

    tf.logging.vlog(1, "HParams in trainer_model_free.train : %s",
                    misc_utils.pprint_hparams(hparams))

    tf.logging.vlog(1, "Using hparams.base_algo: %s", hparams.base_algo)
    learner = rl_utils.LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                                   output_dir,
                                                   output_dir,
                                                   total_num_epochs=1)

    policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)

    rl_utils.update_hparams_from_hparams(policy_hparams, hparams,
                                         hparams.base_algo + "_")

    tf.logging.vlog(1, "Policy HParams : %s",
                    misc_utils.pprint_hparams(policy_hparams))

    total_steps = policy_hparams.epochs_num
    tf.logging.vlog(2, "total_steps: %d", total_steps)

    eval_every_epochs = policy_hparams.eval_every_epochs
    tf.logging.vlog(2, "eval_every_epochs: %d", eval_every_epochs)

    if eval_every_epochs == 0:
        eval_every_epochs = total_steps
    policy_hparams.eval_every_epochs = 0

    metric_name = rl_utils.get_metric_name(
        sampling_temp=hparams.eval_sampling_temps[0],
        max_num_noops=hparams.eval_max_num_noops,
        clipped=False)

    tf.logging.vlog(1, "metric_name: %s", metric_name)

    eval_metrics_dir = os.path.join(output_dir, "eval_metrics")
    eval_metrics_dir = os.path.expanduser(eval_metrics_dir)
    tf.gfile.MakeDirs(eval_metrics_dir)
    eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)

    def evaluate_on_new_model(model_dir_path):
        global step
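        # NOTE: "step" is assumed to be a module-level counter initialized outside this snippet.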
        eval_metrics = rl_utils.evaluate_all_configs(hparams, model_dir_path)
        tf.logging.info("Agent eval metrics:\n{}".format(
            pprint.pformat(eval_metrics)))
        rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, step)
        if report_fn:
            report_fn(eval_metrics[metric_name], step)
        step += 1

    policy_hparams.epochs_num = total_steps
    policy_hparams.save_models_every_epochs = eval_every_epochs
    learner.train(env_fn,
                  policy_hparams,
                  simulated=False,
                  save_continuously=True,
                  epoch=0,
                  model_save_fn=evaluate_on_new_model)
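The model_save_fn hook used above is a one-argument callback that receives the directory of the newly saved model; a minimal stand-in (names invented here) could simply record the checkpoint paths:

saved_model_dirs = []

def record_checkpoint(model_dir_path):
    # Invoked by learner.train(...) whenever a new model directory is saved.
    saved_model_dirs.append(model_dir_path)

# e.g. learner.train(env_fn, policy_hparams, simulated=False,
#                    save_continuously=True, epoch=0,
#                    model_save_fn=record_checkpoint)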
Example #6
def training_loop(hparams, output_dir, report_fn=None, report_metric=None):
  """Run the main training loop."""
  if report_fn:
    assert report_metric is not None

  # Directories
  subdirectories = [
      "data", "tmp", "world_model", ("world_model", "debug_videos"),
      "policy", "eval_metrics"
  ]
  directories = setup_directories(output_dir, subdirectories)

  epoch = -1
  data_dir = directories["data"]
  env = rl_utils.setup_env(
      hparams, batch_size=hparams.real_batch_size,
      max_num_noops=hparams.max_num_noops,
      rl_env_max_episode_steps=hparams.rl_env_max_episode_steps
  )
  env.start_new_epoch(epoch, data_dir)

  if hparams.wm_policy_param_sharing:
    policy_model_dir = directories["world_model"]
  else:
    policy_model_dir = directories["policy"]
  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, policy_model_dir,
      policy_model_dir, hparams.epochs
  )

  # Timing log function
  log_relative_time = make_relative_timing_fn()

  # Per-epoch state
  epoch_metrics = []
  metrics = {}

  # Collect data from the real environment.
  tf.logging.info("Initial training of the policy in real environment.")
  train_agent_real_env(env, learner, hparams, epoch)
  metrics["mean_reward/train/clipped"] = rl_utils.compute_mean_reward(
      env.current_epoch_rollouts(), clipped=True
  )
  tf.logging.info("Mean training reward (initial): {}".format(
      metrics["mean_reward/train/clipped"]
  ))
  env.generate_data(data_dir)

  eval_metrics_writer = tf.summary.FileWriter(
      directories["eval_metrics"]
  )

  world_model_steps_num = 0

  for epoch in range(hparams.epochs):
    log = make_log_fn(epoch, log_relative_time)

    # Train world model
    log("Training world model")
    world_model_steps_num = train_world_model(
        env, data_dir, directories["world_model"], hparams,
        world_model_steps_num, epoch
    )

    # Train agent
    log("Training policy in simulated environment.")
    train_agent(env, learner, directories["world_model"], hparams, epoch)

    env.start_new_epoch(epoch, data_dir)

    # Train agent on real env (short)
    log("Training policy in real environment.")
    train_agent_real_env(env, learner, hparams, epoch)

    if hparams.stop_loop_early:
      return 0.0

    env.generate_data(data_dir)

    metrics = load_metrics(directories["eval_metrics"], epoch)
    if metrics:
      # Skip eval if metrics have already been written for this epoch. Otherwise
      # we'd overwrite them with wrong data.
      log("Metrics found for this epoch, skipping evaluation.")
    else:
      metrics["mean_reward/train/clipped"] = rl_utils.compute_mean_reward(
          env.current_epoch_rollouts(), clipped=True
      )
      log("Mean training reward: {}".format(
          metrics["mean_reward/train/clipped"]
      ))

      eval_metrics = rl_utils.evaluate_all_configs(hparams, policy_model_dir)
      log("Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics)))
      metrics.update(eval_metrics)

      if hparams.eval_world_model:
        debug_video_path = os.path.join(
            directories["world_model", "debug_videos"],
            "{}.avi".format(env.current_epoch)
        )
        wm_metrics = evaluate_world_model(
            env, hparams, directories["world_model"], debug_video_path
        )
        log("World model eval metrics:\n{}".format(pprint.pformat(wm_metrics)))
        metrics.update(wm_metrics)

      rl_utils.summarize_metrics(eval_metrics_writer, metrics, epoch)

      # Report metrics
      if report_fn:
        if report_metric == "mean_reward":
          metric_name = rl_utils.get_metric_name(
              sampling_temp=hparams.eval_sampling_temps[0],
              max_num_noops=hparams.eval_max_num_noops,
              clipped=False
          )
          report_fn(eval_metrics[metric_name], epoch)
        else:
          report_fn(eval_metrics[report_metric], epoch)

    epoch_metrics.append(metrics)

  # Return the evaluation metrics from the final epoch
  return epoch_metrics[-1]
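For reference, the directories dict above is indexed both by plain names ("data") and by tuples of nested names (("world_model", "debug_videos")). Below is a sketch of a setup_directories helper consistent with that usage, assuming it simply creates and returns one path per entry (an assumption, not the library implementation):

import os
import tensorflow as tf

def setup_directories_sketch(base_dir, subdirectories):
  """Create base_dir/<entry...> for each entry and key the result by entry."""
  base_dir = os.path.expanduser(base_dir)
  tf.gfile.MakeDirs(base_dir)
  dirs = {}
  for subdir in subdirectories:
    parts = subdir if isinstance(subdir, tuple) else (subdir,)
    path = os.path.join(base_dir, *parts)
    tf.gfile.MakeDirs(path)
    dirs[subdir] = path
  return dirs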
def train(hparams, output_dir, env_problem_name, report_fn=None):
  """Train."""
  env_fn = initialize_env_specs(hparams, env_problem_name)

  tf.logging.vlog(1, "HParams in trainer_model_free.train : %s",
                  misc_utils.pprint_hparams(hparams))
  tf.logging.vlog(1, "Using hparams.base_algo: %s", hparams.base_algo)
  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, output_dir, output_dir, total_num_epochs=1,
      distributional_size=hparams.get("distributional_size", 1),
      distributional_subscale=hparams.get("distributional_subscale", 0.04),
      distributional_threshold=hparams.get("distributional_threshold", 0.0),
  )

  policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      policy_hparams, hparams, hparams.base_algo + "_"
  )

  tf.logging.vlog(1, "Policy HParams : %s",
                  misc_utils.pprint_hparams(policy_hparams))

  # TODO(konradczechowski): remove the base_algo dependence once the evaluation
  # method is decided.
  if hparams.base_algo == "ppo":
    total_steps = policy_hparams.epochs_num
    tf.logging.vlog(2, "total_steps: %d", total_steps)

    eval_every_epochs = policy_hparams.eval_every_epochs
    tf.logging.vlog(2, "eval_every_epochs: %d", eval_every_epochs)

    if eval_every_epochs == 0:
      eval_every_epochs = total_steps
    policy_hparams.eval_every_epochs = 0

    metric_name = rl_utils.get_metric_name(
        sampling_temp=hparams.eval_sampling_temps[0],
        max_num_noops=hparams.eval_max_num_noops,
        clipped=False
    )

    tf.logging.vlog(1, "metric_name: %s", metric_name)

    eval_metrics_dir = os.path.join(output_dir, "eval_metrics")
    eval_metrics_dir = os.path.expanduser(eval_metrics_dir)
    tf.gfile.MakeDirs(eval_metrics_dir)
    eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)

    def evaluate_on_new_model(model_dir_path):
      global step
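      # NOTE: "step" is assumed to be a module-level counter initialized outside this snippet.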
      eval_metrics = rl_utils.evaluate_all_configs(hparams, model_dir_path)
      tf.logging.info(
          "Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics)))
      rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, step)
      if report_fn:
        report_fn(eval_metrics[metric_name], step)
      step += 1

    policy_hparams.epochs_num = total_steps
    policy_hparams.save_models_every_epochs = eval_every_epochs
  else:
    def evaluate_on_new_model(model_dir_path):
      del model_dir_path
      raise NotImplementedError(
          "This function is currently implemented only for ppo")

  learner.train(env_fn,
                policy_hparams,
                simulated=False,
                save_continuously=True,
                epoch=0,
                model_save_fn=evaluate_on_new_model)
def train(hparams, output_dir, report_fn=None):
    """Train."""
    hparams = initialize_env_specs(hparams)

    tf.logging.vlog(1, "HParams in trainer_model_free.train : %s",
                    misc_utils.pprint_hparams(hparams))

    tf.logging.vlog(1, "Using hparams.base_algo: %s", hparams.base_algo)
    learner = rl_utils.LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                                   output_dir,
                                                   output_dir,
                                                   total_num_epochs=1)

    policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)

    rl_utils.update_hparams_from_hparams(policy_hparams, hparams,
                                         hparams.base_algo + "_")

    tf.logging.vlog(1, "Policy HParams : %s",
                    misc_utils.pprint_hparams(policy_hparams))

    total_steps = policy_hparams.epochs_num
    tf.logging.vlog(2, "total_steps: %d", total_steps)

    eval_every_epochs = policy_hparams.eval_every_epochs
    tf.logging.vlog(2, "eval_every_epochs: %d", eval_every_epochs)

    if eval_every_epochs == 0:
        eval_every_epochs = total_steps
    policy_hparams.eval_every_epochs = 0

    steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
    if not steps or steps[-1] < eval_every_epochs:
        steps.append(eval_every_epochs)

    tf.logging.vlog(1, "steps: [%s]", ",".join([str(s) for s in steps]))

    metric_name = rl_utils.get_metric_name(
        sampling_temp=hparams.eval_sampling_temps[0],
        max_num_noops=hparams.eval_max_num_noops,
        clipped=False)

    tf.logging.vlog(1, "metric_name: %s", metric_name)

    for i, step in enumerate(steps):
        tf.logging.info("Starting training iteration [%d] for [%d] steps.", i,
                        step)

        policy_hparams.epochs_num = step
        learner.train(hparams.env_fn,
                      policy_hparams,
                      simulated=False,
                      save_continuously=True,
                      epoch=0)

        tf.logging.info("Ended training iteration [%d] for [%d] steps.", i,
                        step)

        eval_metrics = rl_utils.evaluate_all_configs(hparams, output_dir)

        tf.logging.info("Agent eval metrics:\n{}".format(
            pprint.pformat(eval_metrics)))

        if report_fn:
            report_fn(eval_metrics[metric_name], step)