def train(hparams, output_dir, report_fn=None):
    """Train."""
    hparams = initialize_env_specs(hparams)
    learner = rl_utils.LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                                   FLAGS.output_dir,
                                                   output_dir)
    policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
    rl_utils.update_hparams_from_hparams(policy_hparams, hparams,
                                         hparams.base_algo + "_")
    total_steps = policy_hparams.epochs_num
    eval_every_epochs = policy_hparams.eval_every_epochs
    if eval_every_epochs == 0:
        eval_every_epochs = total_steps
    policy_hparams.eval_every_epochs = 0

    steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
    if not steps or steps[-1] < eval_every_epochs:
        steps.append(eval_every_epochs)
    metric_name = rl_utils.get_metric_name(
        sampling_temp=hparams.eval_sampling_temps[0],
        max_num_noops=hparams.eval_max_num_noops,
        clipped=False)
    for step in steps:
        policy_hparams.epochs_num = step
        learner.train(hparams.env_fn,
                      policy_hparams,
                      simulated=False,
                      save_continuously=True,
                      epoch=0)
        eval_metrics = rl_utils.evaluate_all_configs(hparams, output_dir)
        tf.logging.info("Agent eval metrics:\n{}".format(
            pprint.pformat(eval_metrics)))
        if report_fn:
            report_fn(eval_metrics[metric_name], step)
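The loop above folds the evaluation schedule into the training call: policy_hparams.eval_every_epochs is zeroed out, and the learner is instead re-run up to growing cumulative epoch counts, with an evaluation after each run. A minimal standalone sketch of how those evaluation points are derived (same arithmetic as above; the helper name is hypothetical):

def eval_step_schedule(total_steps, eval_every_epochs):
    """Cumulative epoch counts at which to pause training and evaluate."""
    if eval_every_epochs == 0:
        # No interval configured: evaluate once, at the very end.
        eval_every_epochs = total_steps
    steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
    if not steps or steps[-1] < eval_every_epochs:
        steps.append(eval_every_epochs)
    return steps

# eval_step_schedule(100, 25) -> [25, 50, 75, 100]
# eval_step_schedule(100, 30) -> [30, 60, 90]
# eval_step_schedule(100, 0)  -> [100]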
Example #2
def train_agent(real_env, learner, world_model_dir, hparams, epoch):
  """Train the PPO agent in the simulated environment."""
  initial_frame_chooser = rl_utils.make_initial_frame_chooser(
      real_env, hparams.frame_stack_size, hparams.simulation_random_starts,
      hparams.simulation_flip_first_random_for_beginning
  )
  env_fn = make_simulated_env_fn_from_hparams(
      real_env, hparams, batch_size=hparams.simulated_batch_size,
      initial_frame_chooser=initial_frame_chooser, model_dir=world_model_dir,
      sim_video_dir=os.path.join(
          learner.agent_model_dir, "sim_videos_{}".format(epoch)
      )
  )
  base_algo_str = hparams.base_algo
  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  if hparams.wm_policy_param_sharing:
    train_hparams.optimizer_zero_grads = True

  rl_utils.update_hparams_from_hparams(
      train_hparams, hparams, base_algo_str + "_"
  )

  final_epoch = hparams.epochs - 1
  is_special_epoch = (epoch + 3) == final_epoch or (epoch + 7) == final_epoch
  is_final_epoch = epoch == final_epoch
  env_step_multiplier = 3 if is_final_epoch else 2 if is_special_epoch else 1
  learner.train(
      env_fn, train_hparams, simulated=True, save_continuously=True,
      epoch=epoch, env_step_multiplier=env_step_multiplier
  )
Example #3
def train_agent(real_env, learner, world_model_dir, hparams, epoch):
    """Train the PPO agent in the simulated environment."""
    frame_stack_size = hparams.frame_stack_size
    initial_frame_rollouts = real_env.current_epoch_rollouts(
        split=tf.contrib.learn.ModeKeys.TRAIN,
        minimal_rollout_frames=frame_stack_size,
    )

    # TODO(koz4k): Move this to a different module.
    def initial_frame_chooser(batch_size):
        """Frame chooser."""

        deterministic_initial_frames = (
            initial_frame_rollouts[0][:frame_stack_size])
        if not hparams.simulation_random_starts:
            # Deterministic starts: repeat first frames from the first rollout.
            initial_frames = [deterministic_initial_frames] * batch_size
        else:
            # Random starts: choose random initial frames from random rollouts.
            initial_frames = random_rollout_subsequences(
                initial_frame_rollouts, batch_size, frame_stack_size)
            if hparams.simulation_flip_first_random_for_beginning:
                # Flip first entry in the batch for deterministic initial frames.
                initial_frames[0] = deterministic_initial_frames

        return np.stack(
            [[frame.observation.decode() for frame in initial_frame_stack]
             for initial_frame_stack in initial_frames])

    env_fn = make_simulated_env_fn(
        real_env, hparams, hparams.simulated_batch_size, initial_frame_chooser,
        world_model_dir,
        os.path.join(learner.agent_model_dir, "sim_videos_{}".format(epoch)))
    base_algo_str = hparams.base_algo
    train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
    if hparams.wm_policy_param_sharing:
        train_hparams.optimizer_zero_grads = True

    rl_utils.update_hparams_from_hparams(train_hparams, hparams,
                                         base_algo_str + "_")

    final_epoch = hparams.epochs - 1
    is_special_epoch = (epoch + 3) == final_epoch or (epoch + 7) == final_epoch
    is_final_epoch = epoch == final_epoch
    env_step_multiplier = 3 if is_final_epoch else 2 if is_special_epoch else 1
    learner.train(env_fn,
                  train_hparams,
                  simulated=True,
                  save_continuously=True,
                  epoch=epoch,
                  env_step_multiplier=env_step_multiplier)
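This older variant of train_agent inlines the frame chooser that Example #2 obtains from rl_utils.make_initial_frame_chooser. Its random-starts branch calls random_rollout_subsequences, which is not shown in the snippet; a minimal sketch of what such a helper could look like (the actual library implementation may differ):

import random

def random_rollout_subsequences(rollouts, count, subsequence_length):
    """Pick `count` random windows of `subsequence_length` consecutive
    frames, each taken from a randomly chosen, long-enough rollout."""
    usable = [r for r in rollouts if len(r) >= subsequence_length]
    def choose():
        rollout = random.choice(usable)
        start = random.randrange(len(rollout) - subsequence_length + 1)
        return rollout[start:start + subsequence_length]
    return [choose() for _ in range(count)]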
Example #4
def train_agent_real_env(env, learner, hparams, epoch):
    """Train the PPO agent in the real environment."""
    base_algo_str = hparams.base_algo

    train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
    rl_utils.update_hparams_from_hparams(train_hparams, hparams,
                                         "real_" + base_algo_str + "_")

    env_fn = rl.make_real_env_fn(env)
    num_env_steps = real_env_step_increment(hparams)
    learner.train(env_fn,
                  train_hparams,
                  simulated=False,
                  save_continuously=False,
                  epoch=epoch,
                  num_env_steps=num_env_steps)
    # Save unfinished rollouts to history.
    env.reset()
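train_agent_real_env sizes the run with real_env_step_increment(hparams), which is not part of the snippet. A plausible sketch, assuming the hparams carry a total real-frame budget (num_real_env_frames) and the number of model-based epochs (epochs); treat both names as assumptions:

import math

def real_env_step_increment(hparams):
    """Real env steps to collect per epoch: spread the total frame
    budget evenly over the epochs, rounding up."""
    return int(math.ceil(hparams.num_real_env_frames / hparams.epochs))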
Example #5
def train_agent_real_env(env, learner, hparams, epoch):
    """Train the PPO agent in the real environment."""
    base_algo_str = hparams.base_algo

    train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
    rl_utils.update_hparams_from_hparams(train_hparams, hparams,
                                         "real_" + base_algo_str + "_")
    if hparams.wm_policy_param_sharing:
        train_hparams.optimizer_zero_grads = True

    env_fn = rl.make_real_env_fn(env)
    num_env_steps = real_env_step_increment(hparams)
    learner.train(
        env_fn,
        train_hparams,
        simulated=False,
        save_continuously=False,
        epoch=epoch,
        sampling_temp=hparams.real_sampling_temp,
        num_env_steps=num_env_steps,
    )
    # Save unfinished rollouts to history.
    env.reset()
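Compared with Example #4, this variant additionally zeroes optimizer gradients when the world model and policy share parameters (wm_policy_param_sharing) and passes an explicit sampling temperature (real_sampling_temp) for acting in the real environment.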
Example #6
def train(hparams, output_dir, env_problem_name, report_fn=None):
    """Train."""
    env_fn = initialize_env_specs(hparams, env_problem_name)

    tf.logging.vlog(1, "HParams in trainer_model_free.train : %s",
                    misc_utils.pprint_hparams(hparams))

    tf.logging.vlog(1, "Using hparams.base_algo: %s", hparams.base_algo)
    learner = rl_utils.LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                                   output_dir,
                                                   output_dir,
                                                   total_num_epochs=1)

    policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)

    rl_utils.update_hparams_from_hparams(policy_hparams, hparams,
                                         hparams.base_algo + "_")

    tf.logging.vlog(1, "Policy HParams : %s",
                    misc_utils.pprint_hparams(policy_hparams))

    total_steps = policy_hparams.epochs_num
    tf.logging.vlog(2, "total_steps: %d", total_steps)

    eval_every_epochs = policy_hparams.eval_every_epochs
    tf.logging.vlog(2, "eval_every_epochs: %d", eval_every_epochs)

    if eval_every_epochs == 0:
        eval_every_epochs = total_steps
    policy_hparams.eval_every_epochs = 0

    metric_name = rl_utils.get_metric_name(
        sampling_temp=hparams.eval_sampling_temps[0],
        max_num_noops=hparams.eval_max_num_noops,
        clipped=False)

    tf.logging.vlog(1, "metric_name: %s", metric_name)

    eval_metrics_dir = os.path.join(output_dir, "eval_metrics")
    eval_metrics_dir = os.path.expanduser(eval_metrics_dir)
    tf.gfile.MakeDirs(eval_metrics_dir)
    eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)

    def evaluate_on_new_model(model_dir_path):
        # `step` is a module-level counter, defined elsewhere in the file,
        # used to index successive evaluations.
        global step
        eval_metrics = rl_utils.evaluate_all_configs(hparams, model_dir_path)
        tf.logging.info("Agent eval metrics:\n{}".format(
            pprint.pformat(eval_metrics)))
        rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, step)
        if report_fn:
            report_fn(eval_metrics[metric_name], step)
        step += 1

    policy_hparams.epochs_num = total_steps
    policy_hparams.save_models_every_epochs = eval_every_epochs
    learner.train(env_fn,
                  policy_hparams,
                  simulated=False,
                  save_continuously=True,
                  epoch=0,
                  model_save_fn=evaluate_on_new_model)
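evaluate_on_new_model mutates a module-level step counter through `global step`; the counter itself is declared elsewhere in the original file. A self-contained illustration of the same model_save_fn callback pattern, with hypothetical names:

step = 1  # module-level counter of completed evaluations

def make_eval_callback(evaluate_fn, report_fn=None):
    """Build a model_save_fn-style callback that evaluates every saved
    checkpoint and tags the results with an increasing step index."""
    def callback(model_dir_path):
        global step
        metrics = evaluate_fn(model_dir_path)
        if report_fn:
            report_fn(metrics, step)
        step += 1
    return callback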
Example #7
def train(hparams, output_dir, env_problem_name, report_fn=None):
  """Train."""
  env_fn = initialize_env_specs(hparams, env_problem_name)

  tf.logging.vlog(1, "HParams in trainer_model_free.train : %s",
                  misc_utils.pprint_hparams(hparams))
  tf.logging.vlog(1, "Using hparams.base_algo: %s", hparams.base_algo)
  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, output_dir, output_dir, total_num_epochs=1,
      distributional_size=hparams.get("distributional_size", 1),
      distributional_subscale=hparams.get("distributional_subscale", 0.04),
      distributional_threshold=hparams.get("distributional_threshold", 0.0),
  )

  policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      policy_hparams, hparams, hparams.base_algo + "_"
  )

  tf.logging.vlog(1, "Policy HParams : %s",
                  misc_utils.pprint_hparams(policy_hparams))

  # TODO(konradczechowski): remove base_algo dependence once the evaluation
  # method is decided.
  if hparams.base_algo == "ppo":
    total_steps = policy_hparams.epochs_num
    tf.logging.vlog(2, "total_steps: %d", total_steps)

    eval_every_epochs = policy_hparams.eval_every_epochs
    tf.logging.vlog(2, "eval_every_epochs: %d", eval_every_epochs)

    if eval_every_epochs == 0:
      eval_every_epochs = total_steps
    policy_hparams.eval_every_epochs = 0

    metric_name = rl_utils.get_metric_name(
        sampling_temp=hparams.eval_sampling_temps[0],
        max_num_noops=hparams.eval_max_num_noops,
        clipped=False
    )

    tf.logging.vlog(1, "metric_name: %s", metric_name)

    eval_metrics_dir = os.path.join(output_dir, "eval_metrics")
    eval_metrics_dir = os.path.expanduser(eval_metrics_dir)
    tf.gfile.MakeDirs(eval_metrics_dir)
    eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)

    def evaluate_on_new_model(model_dir_path):
      global step
      eval_metrics = rl_utils.evaluate_all_configs(hparams, model_dir_path)
      tf.logging.info(
          "Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics)))
      rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, step)
      if report_fn:
        report_fn(eval_metrics[metric_name], step)
      step += 1

    policy_hparams.epochs_num = total_steps
    policy_hparams.save_models_every_epochs = eval_every_epochs
  else:
    def evaluate_on_new_model(model_dir_path):
      del model_dir_path
      raise NotImplementedError(
          "This function is currently implemented only for ppo")

  learner.train(env_fn,
                policy_hparams,
                simulated=False,
                save_continuously=True,
                epoch=0,
                model_save_fn=evaluate_on_new_model)
Example #8
def train(hparams, output_dir, report_fn=None):
    """Train."""
    hparams = initialize_env_specs(hparams)

    tf.logging.vlog(1, "HParams in trainer_model_free.train : %s",
                    misc_utils.pprint_hparams(hparams))

    tf.logging.vlog(1, "Using hparams.base_algo: %s", hparams.base_algo)
    learner = rl_utils.LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                                   output_dir,
                                                   output_dir,
                                                   total_num_epochs=1)

    policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)

    rl_utils.update_hparams_from_hparams(policy_hparams, hparams,
                                         hparams.base_algo + "_")

    tf.logging.vlog(1, "Policy HParams : %s",
                    misc_utils.pprint_hparams(policy_hparams))

    total_steps = policy_hparams.epochs_num
    tf.logging.vlog(2, "total_steps: %d", total_steps)

    eval_every_epochs = policy_hparams.eval_every_epochs
    tf.logging.vlog(2, "eval_every_epochs: %d", eval_every_epochs)

    if eval_every_epochs == 0:
        eval_every_epochs = total_steps
    policy_hparams.eval_every_epochs = 0

    steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
    if not steps or steps[-1] < eval_every_epochs:
        steps.append(eval_every_epochs)

    tf.logging.vlog(1, "steps: [%s]", ",".join([str(s) for s in steps]))

    metric_name = rl_utils.get_metric_name(
        sampling_temp=hparams.eval_sampling_temps[0],
        max_num_noops=hparams.eval_max_num_noops,
        clipped=False)

    tf.logging.vlog(1, "metric_name: %s", metric_name)

    for i, step in enumerate(steps):
        tf.logging.info("Starting training iteration [%d] for [%d] steps.", i,
                        step)

        policy_hparams.epochs_num = step
        learner.train(hparams.env_fn,
                      policy_hparams,
                      simulated=False,
                      save_continuously=True,
                      epoch=0)

        tf.logging.info("Ended training iteration [%d] for [%d] steps.", i,
                        step)

        eval_metrics = rl_utils.evaluate_all_configs(hparams, output_dir)

        tf.logging.info("Agent eval metrics:\n{}".format(
            pprint.pformat(eval_metrics)))

        if report_fn:
            report_fn(eval_metrics[metric_name], step)
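A hedged sketch of how this last variant might be wired into a main entry point; FLAGS.hparams_set and FLAGS.output_dir are assumed flag names (typical of the library's trainers, not defined in the snippet), and the chosen hparams set must carry the environment settings that initialize_env_specs expects:

def main(_):
    # Assumed flags; the snippet itself only defines train().
    hparams = trainer_lib.create_hparams(FLAGS.hparams_set)
    train(hparams, FLAGS.output_dir)

if __name__ == "__main__":
    tf.app.run(main)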