Example #1
def train_agent_real_env(
    env, agent_model_dir, event_dir, epoch_data_dir,
    hparams, epoch=0, is_final_epoch=False):
  """Train the PPO agent in the real environment."""
  del epoch_data_dir
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = ["epochs_num", "epoch_length",
                      "learning_rate", "num_agents", "eval_every_epochs",
                      "optimization_epochs", "effective_num_agents"]

  # This should be overridden.
  ppo_hparams.add_hparam("effective_num_agents", None)
  for param_name in ppo_params_names:
    ppo_param_name = "real_ppo_"+ param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  ppo_hparams.epochs_num = _ppo_training_epochs(hparams, epoch,
                                                is_final_epoch, True)
  # We do not save model, as that resets frames that we need at restarts.
  # But we need to save at the last step, so we set it very high.
  ppo_hparams.save_models_every_epochs = 1000000

  environment_spec = rl.standard_atari_env_spec(
      batch_env=env, include_clipping=False
  )

  ppo_hparams.add_hparam("environment_spec", environment_spec)

  rl_trainer_lib.train(ppo_hparams, event_dir + "real", agent_model_dir,
                       name_scope="ppo_real%d" % (epoch + 1))

  # Save unfinished rollouts to history.
  env.reset()
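
The loop above copies any hparam named "real_ppo_<name>" from the experiment-level hparams onto the matching "<name>" entry of the PPO hparams. A dependency-free sketch of that prefixed-override pattern, using plain dicts instead of the HParams API (illustration only, not tensor2tensor code):

# Plain dicts stand in for tf.contrib.training.HParams in this illustration.
ppo_config = {"epochs_num": 100, "learning_rate": 1e-4, "num_agents": 16}
experiment_config = {"real_ppo_epochs_num": 3, "real_ppo_num_agents": 1}

for name in list(ppo_config):
  prefixed = "real_ppo_" + name
  if prefixed in experiment_config:
    ppo_config[name] = experiment_config[prefixed]

print(ppo_config)  # {'epochs_num': 3, 'learning_rate': 0.0001, 'num_agents': 1}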
  def test_train_pong(self):
    hparams = tf.contrib.training.HParams(
        epochs_num=4,
        eval_every_epochs=2,
        num_agents=10,
        optimization_epochs=3,
        epoch_length=30,
        entropy_loss_coef=0.003,
        learning_rate=8e-05,
        optimizer="Adam",
        policy_network=rl_models.feed_forward_cnn_small_categorical_fun,
        gae_lambda=0.985,
        num_eval_agents=1,
        max_gradients_norm=0.5,
        gae_gamma=0.985,
        optimization_batch_size=4,
        clipping_coef=0.2,
        value_loss_coef=1,
        save_models_every_epochs=False)
    hparams.add_hparam(
        "environment_spec",
        gym_problems.standard_atari_env_spec("PongNoFrameskip-v4"))
    hparams.add_hparam(
        "environment_eval_spec",
        gym_problems.standard_atari_env_eval_spec("PongNoFrameskip-v4"))
    rl_trainer_lib.train(hparams)
  def test_no_crash_pendulum(self):
    hparams = trainer_lib.create_hparams(
        "ppo_continuous_action_base",
        TrainTest.test_config)

    hparams.add_hparam("environment_spec", simple_gym_spec("Pendulum-v0"))
    rl_trainer_lib.train(hparams)
Example #4
    def train(self, env_fn, hparams, target_num_epochs, simulated, epoch):
        hparams.set_hparam("epochs_num", target_num_epochs)

        if simulated:
            simulated_str = "sim"
            hparams.save_models_every_epochs = 10
        else:
            # TODO(konradczechowski): refactor ppo
            assert hparams.num_agents == 1
            # We do not save model, as that resets frames that we need at restarts.
            # But we need to save at the last step, so we set it very high.
            hparams.save_models_every_epochs = 1000000
            simulated_str = "real"

        # TODO(konradczechowski) refactor ppo, pass these as arguments
        # (not inside hparams). Do the same in evaluate()
        hparams.add_hparam("force_beginning_resets", simulated)
        hparams.add_hparam("env_fn", env_fn)
        hparams.add_hparam("frame_stack_size", self.frame_stack_size)
        name_scope = "ppo_{}{}".format(simulated_str, epoch + 1)

        rl_trainer_lib.train(hparams,
                             self.event_dir + simulated_str,
                             self.agent_model_dir,
                             name_scope=name_scope)
    def test_no_crash_pendulum(self):
        hparams = trainer_lib.create_hparams("ppo_continuous_action_base",
                                             TrainTest.test_config)

        hparams.add_hparam("environment_spec",
                           rl_models.simple_gym_spec("Pendulum-v0"))
        rl_trainer_lib.train(hparams)
Example #6
    def test_no_crash_cartpole(self):
        hparams = trainer_lib.create_hparams("ppo_discrete_action_base",
                                             TrainTest.test_config)

        hparams.add_hparam("environment_spec",
                           standard_atari_env_spec("CartPole-v0"))
        rl_trainer_lib.train(hparams)
    def test_no_crash_cartpole(self):
        hparams = trainer_lib.create_hparams("ppo_discrete_action_base",
                                             TrainTest.test_config)

        hparams.add_hparam("environment_spec",
                           rl_models.simple_gym_spec("CartPole-v0"))
        rl_trainer_lib.train(hparams)
def train_agent(real_env, agent_model_dir, event_dir, world_model_dir, data_dir,
                hparams, completed_ppo_epochs_num, epoch=0,
                is_final_epoch=False):
  """Train the PPO agent in the simulated environment."""
  del data_dir
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = ["epochs_num", "epoch_length",
                      "learning_rate", "num_agents",
                      "optimization_epochs", "eval_every_epochs"]

  for param_name in ppo_params_names:
    ppo_param_name = "ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  completed_ppo_epochs_num += sim_ppo_epoch_increment(hparams, is_final_epoch)
  ppo_hparams.epochs_num = completed_ppo_epochs_num

  ppo_hparams.save_models_every_epochs = 10
  ppo_hparams.world_model_dir = world_model_dir

  environment_spec = make_simulated_env_spec(real_env, hparams)

  num_input_frames = environment_spec.video_num_input_frames
  initial_frame_rollouts = real_env.current_epoch_rollouts(
      split=tf.contrib.learn.ModeKeys.TRAIN,
      minimal_rollout_frames=num_input_frames,
  )
  # TODO(koz4k): Move this to a different module.
  def initial_frame_chooser(batch_size):
    """Frame chooser."""

    deterministic_initial_frames = (
        initial_frame_rollouts[0][:num_input_frames])
    if not hparams.simulation_random_starts:
      # Deterministic starts: repeat first frames from the first rollout.
      initial_frames = [deterministic_initial_frames] * batch_size
    else:
      # Random starts: choose random initial frames from random rollouts.
      initial_frames = random_rollout_subsequences(
          initial_frame_rollouts, batch_size, num_input_frames
      )
      if hparams.simulation_flip_first_random_for_beginning:
        # Flip first entry in the batch for deterministic initial frames.
        initial_frames[0] = deterministic_initial_frames

    return np.stack([
        [frame.observation.decode() for frame in initial_frame_stack]
        for initial_frame_stack in initial_frames
    ])

  environment_spec.add_hparam("initial_frame_chooser", initial_frame_chooser)

  ppo_hparams.add_hparam("environment_spec", environment_spec)

  rl_trainer_lib.train(ppo_hparams, event_dir + "sim", agent_model_dir,
                       name_scope="ppo_sim%d" % (epoch + 1))

  return completed_ppo_epochs_num
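
initial_frame_chooser above builds a batch of shape [batch_size, num_input_frames, ...]: either the first frames of the first rollout repeated, or random subsequences of random rollouts, optionally with the first batch entry forced back to the deterministic prefix. A toy NumPy version of that branching, with a hypothetical random_subsequence helper standing in for random_rollout_subsequences:

import random

import numpy as np


def random_subsequence(rollout, length):
  # Hypothetical stand-in for random_rollout_subsequences: one random window
  # of `length` consecutive frames from a single rollout.
  start = random.randrange(len(rollout) - length + 1)
  return rollout[start:start + length]


def choose_initial_frames(rollouts, batch_size, num_input_frames,
                          random_starts=True, flip_first=True):
  deterministic = rollouts[0][:num_input_frames]
  if not random_starts:
    batch = [deterministic] * batch_size
  else:
    batch = [random_subsequence(random.choice(rollouts), num_input_frames)
             for _ in range(batch_size)]
    if flip_first:
      batch[0] = deterministic
  return np.stack(batch)  # [batch_size, num_input_frames, height, width, channels]


rollouts = [np.zeros((20, 210, 160, 3), dtype=np.uint8) for _ in range(3)]
print(choose_initial_frames(rollouts, batch_size=2, num_input_frames=4).shape)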
  def test_no_crash_cartpole(self):
    hparams = trainer_lib.create_hparams(
        "ppo_discrete_action_base",
        TrainTest.test_config)

    hparams.add_hparam("environment_spec",
                       standard_atari_env_spec("CartPole-v0"))
    rl_trainer_lib.train(hparams)
def train(hparams, output_dir):
  prefix = output_dir
  #remove trash
  # prefix = "~/trash/loop_{}".format(random.randint(10000, 99999))
  data_dir = os.path.expanduser(prefix + "/data")
  tmp_dir = os.path.expanduser(prefix + "/tmp")
  output_dir = os.path.expanduser(prefix + "/output")
  tf.gfile.MakeDirs(data_dir)
  tf.gfile.MakeDirs(tmp_dir)
  tf.gfile.MakeDirs(output_dir)
  last_model = ""
  start_time = time.time()
  line = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>    "
  for iloop in range(hparams.epochs):
      time_delta = time.time() - start_time
      print(line+"Step {}.1. - generate data from policy. "
            "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta))))
      FLAGS.problems = "gym_discrete_problem"
      FLAGS.agent_policy_path = last_model
      gym_problem = problems.problem(FLAGS.problems)
      gym_problem.num_steps = hparams.true_env_generator_num_steps
      iter_data_dir = os.path.join(data_dir, str(iloop))
      tf.gfile.MakeDirs(iter_data_dir)
      gym_problem.generate_data(iter_data_dir, tmp_dir)

      time_delta = time.time() - start_time
      print(line+"Step {}.2. - generate env model. "
            "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta))))
      # 2. generate env model
      FLAGS.data_dir = iter_data_dir
      FLAGS.output_dir = output_dir
      FLAGS.model = hparams.generative_model
      FLAGS.hparams_set = hparams.generative_model_params
      FLAGS.train_steps = hparams.model_train_steps
      FLAGS.eval_steps = 1
      t2t_trainer.main([])

      time_delta = time.time() - start_time
      print(line+"Step {}.3. - evalue env model. "
            "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta))))
      gym_simulated_problem = problems.problem("gym_simulated_discrete_problem")
      gym_simulated_problem.num_steps = hparams.simulated_env_generator_num_steps
      gym_simulated_problem.generate_data(iter_data_dir, tmp_dir)

      time_delta = time.time() - start_time
      print(line+"Step {}.4. - train PPO in model env."
            " Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta))))
      ppo_epochs_num = hparams.ppo_epochs_num
      ppo_hparams = trainer_lib.create_hparams(
          "atari_base",
          "epochs_num={},simulated_environment=True,eval_every_epochs=0,"
          "save_models_every_epochs={}".format(ppo_epochs_num + 1,
                                               ppo_epochs_num),
          data_dir=output_dir)
      ppo_hparams.epoch_length = hparams.ppo_epoch_length
      ppo_dir = tempfile.mkdtemp(dir=data_dir, prefix="ppo_")
      in_graph_wrappers = [
          (TimeLimitWrapper, {"timelimit": 150}),
          (PongT2TGeneratorHackWrapper, {"add_value": -2})
      ] + gym_problem.in_graph_wrappers
      ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)
      rl_trainer_lib.train(ppo_hparams, "PongNoFrameskip-v4", ppo_dir)

      last_model = ppo_dir + "/model{}.ckpt".format(ppo_epochs_num)
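
The loop above is one model-based RL iteration: collect real-environment data with the current policy, fit the world model on it, then train PPO inside the learned model and feed the resulting checkpoint into the next data-collection step. A runnable schematic of that control flow; every helper below is a stub, not a tensor2tensor API:

def collect_real_env_data(policy_checkpoint):
  # Stub for step *.1: generate rollouts with the current (or random) policy.
  return "rollouts({})".format(policy_checkpoint or "random policy")

def train_world_model(real_data):
  # Stub for step *.2: fit the generative environment model on real rollouts.
  return "world_model({})".format(real_data)

def train_ppo_in_world_model(world_model, iteration):
  # Stub for step *.4: train PPO inside the learned model, return a checkpoint.
  return "ppo_checkpoint_{}".format(iteration)

def model_based_loop(num_iterations):
  policy_checkpoint = ""
  for i in range(num_iterations):
    real_data = collect_real_env_data(policy_checkpoint)
    world_model = train_world_model(real_data)
    policy_checkpoint = train_ppo_in_world_model(world_model, i)
  return policy_checkpoint

print(model_based_loop(2))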
Example #11
def train_agent(problem_name,
                agent_model_dir,
                event_dir,
                world_model_dir,
                epoch_data_dir,
                hparams,
                autoencoder_path=None,
                epoch=0):
    """Train the PPO agent in the simulated environment."""
    gym_problem = registry.problem(problem_name)
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_epochs_num = hparams.ppo_epochs_num
    ppo_hparams.epochs_num = ppo_epochs_num
    ppo_hparams.eval_every_epochs = 50
    ppo_hparams.save_models_every_epochs = ppo_epochs_num
    ppo_hparams.epoch_length = hparams.ppo_epoch_length
    ppo_hparams.num_agents = hparams.ppo_num_agents
    ppo_hparams.world_model_dir = world_model_dir
    if hparams.ppo_learning_rate:
        ppo_hparams.learning_rate = hparams.ppo_learning_rate

    # Adding model hparams for model specific adjustments
    model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
    ppo_hparams.add_hparam("model_hparams", model_hparams)

    environment_spec = copy.copy(gym_problem.environment_spec)
    environment_spec.simulated_env = True
    environment_spec.add_hparam("simulation_random_starts",
                                hparams.simulation_random_starts)
    environment_spec.add_hparam("intrinsic_reward_scale",
                                hparams.intrinsic_reward_scale)
    environment_spec.add_hparam("initial_frames_problem", gym_problem)

    # 4x for the StackAndSkipWrapper minus one to always finish for reporting.
    ppo_time_limit = ppo_hparams.epoch_length - 1
    ppo_time_limit *= model_hparams.video_num_input_frames

    wrappers = environment_spec.wrappers + [
        [TimeLimitWrapper, {"timelimit": ppo_time_limit}]]
    environment_spec.wrappers = wrappers

    ppo_hparams.add_hparam("environment_spec", environment_spec)

    with temporary_flags({
            "problem": problem_name,
            "model": hparams.generative_model,
            "hparams_set": hparams.generative_model_params,
            "output_dir": world_model_dir,
            "data_dir": epoch_data_dir,
            "autoencoder_path": autoencoder_path,
    }):
        rl_trainer_lib.train(ppo_hparams,
                             event_dir,
                             agent_model_dir,
                             epoch=epoch)
def train_agent(environment_spec,
                agent_model_dir,
                event_dir,
                world_model_dir,
                epoch_data_dir,
                hparams,
                epoch=0,
                is_final_epoch=False):
    """Train the PPO agent in the simulated environment."""
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_params_names = [
        "epochs_num", "epoch_length", "learning_rate", "num_agents",
        "optimization_epochs", "eval_every_epochs"
    ]

    for param_name in ppo_params_names:
        ppo_param_name = "ppo_" + param_name
        if ppo_param_name in hparams:
            ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

    ppo_hparams.epochs_num = _ppo_training_epochs(hparams, epoch,
                                                  is_final_epoch, False)
    ppo_hparams.save_models_every_epochs = 10
    ppo_hparams.world_model_dir = world_model_dir
    ppo_hparams.add_hparam("force_beginning_resets", True)

    # Adding model hparams for model specific adjustments
    model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
    ppo_hparams.add_hparam("model_hparams", model_hparams)

    environment_spec = copy.copy(environment_spec)
    environment_spec_param_names = [
        "simulation_random_starts",
        "simulation_flip_first_random_for_beginning", "intrinsic_reward_scale"
    ]
    for param_name in environment_spec_param_names:
        environment_spec.set_hparam(param_name, hparams.get(param_name))
    ppo_hparams.add_hparam("environment_spec", environment_spec)

    ppo_hparams.add_hparam(
        "initial_frame_chooser",
        InitialFrameChooser(environment_spec, mode=tf.estimator.ModeKeys.EVAL))

    # TODO(koz4k): Pass by arguments.
    with temporary_flags({
            "problem": environment_spec.initial_frames_problem,
            "model": hparams.generative_model,
            "hparams_set": hparams.generative_model_params,
            "output_dir": world_model_dir,
            "data_dir": epoch_data_dir,
    }):
        rl_trainer_lib.train(ppo_hparams,
                             event_dir + "sim",
                             agent_model_dir,
                             name_scope="ppo_sim%d" % (epoch + 1))
Example #13
def train_agent_real_env(problem_name,
                         agent_model_dir,
                         event_dir,
                         world_model_dir,
                         epoch_data_dir,
                         hparams,
                         epoch=0,
                         is_final_epoch=False):
    """Train the PPO agent in the real environment."""
    global dumper_path, ppo_data_dumper_counter

    gym_problem = registry.problem(problem_name)
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_params_names = [
        "epochs_num", "epoch_length", "learning_rate", "num_agents",
        "eval_every_epochs", "optimization_epochs", "effective_num_agents"
    ]

    # This should be overridden.
    ppo_hparams.add_hparam("effective_num_agents", None)
    for param_name in ppo_params_names:
        ppo_param_name = "real_ppo_" + param_name
        if ppo_param_name in hparams:
            ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

    ppo_hparams.epochs_num = _ppo_training_epochs(hparams, epoch,
                                                  is_final_epoch, True)
    # We do not save model, as that resets frames that we need at restarts.
    # But we need to save at the last step, so we set it very high.
    ppo_hparams.save_models_every_epochs = 1000000

    environment_spec = copy.copy(gym_problem.environment_spec)

    if hparams.gather_ppo_real_env_data:
        # TODO(piotrmilos):This should be refactored
        assert hparams.real_ppo_num_agents == 1, (
            "It is required to use collect with pyfunc_wrapper")

        ppo_data_dumper_counter = 0
        dumper_path = os.path.join(epoch_data_dir, "dumper")
        tf.gfile.MakeDirs(dumper_path)
        dumper_spec = [PyFuncWrapper, {"process_fun": ppo_data_dumper}]
        environment_spec.wrappers.insert(2, dumper_spec)

    ppo_hparams.add_hparam("environment_spec", environment_spec)

    with temporary_flags({
            "problem": problem_name,
            "output_dir": world_model_dir,
            "data_dir": epoch_data_dir,
    }):
        rl_trainer_lib.train(ppo_hparams,
                             event_dir + "real",
                             agent_model_dir,
                             name_scope="ppo_real%d" % (epoch + 1))
Example #14
def train_agent(problem_name,
                agent_model_dir,
                event_dir,
                world_model_dir,
                epoch_data_dir,
                hparams,
                epoch=0,
                is_final_epoch=False):
    """Train the PPO agent in the simulated environment."""
    gym_problem = registry.problem(problem_name)
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_params_names = [
        "epochs_num", "epoch_length", "learning_rate", "num_agents",
        "optimization_epochs"
    ]

    for param_name in ppo_params_names:
        ppo_param_name = "ppo_" + param_name
        if ppo_param_name in hparams:
            ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

    ppo_epochs_num = hparams.ppo_epochs_num
    if is_final_epoch:
        ppo_epochs_num *= 2
        ppo_hparams.epoch_length *= 2
    ppo_hparams.save_models_every_epochs = ppo_epochs_num
    ppo_hparams.world_model_dir = world_model_dir
    ppo_hparams.add_hparam("force_beginning_resets", True)

    # Adding model hparams for model specific adjustments
    model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
    ppo_hparams.add_hparam("model_hparams", model_hparams)

    environment_spec = copy.copy(gym_problem.environment_spec)
    environment_spec.simulation_random_starts = hparams.simulation_random_starts
    do_flip = hparams.simulation_flip_first_random_for_beginning
    environment_spec.simulation_flip_first_random_for_beginning = do_flip
    environment_spec.intrinsic_reward_scale = hparams.intrinsic_reward_scale

    ppo_hparams.add_hparam("environment_spec", environment_spec)

    with temporary_flags({
            "problem": problem_name,
            "model": hparams.generative_model,
            "hparams_set": hparams.generative_model_params,
            "output_dir": world_model_dir,
            "data_dir": epoch_data_dir,
    }):
        rl_trainer_lib.train(ppo_hparams,
                             event_dir,
                             agent_model_dir,
                             epoch=epoch,
                             name_scope="ppo_sim")
Example #15
def train_agent(problem_name,
                agent_model_dir,
                event_dir,
                world_model_dir,
                epoch_data_dir,
                hparams,
                autoencoder_path=None,
                epoch=0):
    """Train the PPO agent in the simulated environment."""
    gym_problem = registry.problem(problem_name)
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_epochs_num = hparams.ppo_epochs_num
    ppo_hparams.epochs_num = ppo_epochs_num
    ppo_hparams.simulated_environment = True
    ppo_hparams.simulation_random_starts = hparams.simulation_random_starts
    ppo_hparams.intrinsic_reward_scale = hparams.intrinsic_reward_scale
    ppo_hparams.eval_every_epochs = 50
    ppo_hparams.save_models_every_epochs = ppo_epochs_num
    ppo_hparams.epoch_length = hparams.ppo_epoch_length
    ppo_hparams.num_agents = hparams.ppo_num_agents
    ppo_hparams.problem = gym_problem
    ppo_hparams.world_model_dir = world_model_dir
    if hparams.ppo_learning_rate:
        ppo_hparams.learning_rate = hparams.ppo_learning_rate
    # 4x for the StackAndSkipWrapper minus one to always finish for reporting.
    ppo_time_limit = (ppo_hparams.epoch_length - 1) * 4

    in_graph_wrappers = [(TimeLimitWrapper, {
        "timelimit": ppo_time_limit
    }), (StackAndSkipWrapper, {
        "skip": 4
    })]
    in_graph_wrappers += gym_problem.in_graph_wrappers
    ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)

    with temporary_flags({
            "problem": problem_name,
            "model": hparams.generative_model,
            "hparams_set": hparams.generative_model_params,
            "output_dir": world_model_dir,
            "data_dir": epoch_data_dir,
            "autoencoder_path": autoencoder_path,
    }):
        rl_trainer_lib.train(ppo_hparams,
                             gym_problem.env_name,
                             event_dir,
                             agent_model_dir,
                             epoch=epoch)
Example #16
def train_agent(problem_name,
                agent_model_dir,
                event_dir,
                world_model_dir,
                epoch_data_dir,
                hparams,
                autoencoder_path=None,
                epoch=0):
    """Train the PPO agent in the simulated environment."""
    gym_problem = registry.problem(problem_name)
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_epochs_num = hparams.ppo_epochs_num
    ppo_hparams.epochs_num = ppo_epochs_num
    ppo_hparams.eval_every_epochs = 50
    ppo_hparams.save_models_every_epochs = ppo_epochs_num
    ppo_hparams.epoch_length = hparams.ppo_epoch_length
    ppo_hparams.num_agents = hparams.ppo_num_agents
    ppo_hparams.world_model_dir = world_model_dir
    ppo_hparams.add_hparam("force_beginning_resets", True)
    if hparams.ppo_learning_rate:
        ppo_hparams.learning_rate = hparams.ppo_learning_rate

    # Adding model hparams for model specific adjustments
    model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
    ppo_hparams.add_hparam("model_hparams", model_hparams)

    environment_spec = copy.copy(gym_problem.environment_spec)
    environment_spec.simulation_random_starts = hparams.simulation_random_starts
    environment_spec.intrinsic_reward_scale = hparams.intrinsic_reward_scale

    ppo_hparams.add_hparam("environment_spec", environment_spec)

    with temporary_flags({
            "problem": problem_name,
            "model": hparams.generative_model,
            "hparams_set": hparams.generative_model_params,
            "output_dir": world_model_dir,
            "data_dir": epoch_data_dir,
            "autoencoder_path": autoencoder_path,
    }):
        rl_trainer_lib.train(ppo_hparams,
                             event_dir,
                             agent_model_dir,
                             epoch=epoch)
Example #17
def train_agent_real_env(problem_name,
                         agent_model_dir,
                         event_dir,
                         world_model_dir,
                         epoch_data_dir,
                         hparams,
                         epoch=0,
                         is_final_epoch=False):
    """Train the PPO agent in the real environment."""
    del epoch, is_final_epoch
    gym_problem = registry.problem(problem_name)
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_params_names = [
        "epochs_num", "epoch_length", "learning_rate", "num_agents",
        "optimization_epochs"
    ]

    for param_name in ppo_params_names:
        ppo_param_name = "real_ppo_" + param_name
        if ppo_param_name in hparams:
            ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

    ppo_epochs_num = hparams.real_ppo_epochs_num
    if ppo_epochs_num == 0:
        return

    ppo_hparams.save_models_every_epochs = ppo_epochs_num

    environment_spec = copy.copy(gym_problem.environment_spec)

    ppo_hparams.add_hparam("environment_spec", environment_spec)

    with temporary_flags({
            "problem": problem_name,
            "output_dir": world_model_dir,
            "data_dir": epoch_data_dir,
    }):
        # epoch = 10**20 is a hackish way to avoid skipping training
        rl_trainer_lib.train(ppo_hparams,
                             event_dir,
                             agent_model_dir,
                             epoch=10**20,
                             name_scope="ppo_real")
def train_agent_real_env(env,
                         agent_model_dir,
                         event_dir,
                         data_dir,
                         hparams,
                         completed_ppo_epochs_num,
                         epoch=0,
                         is_final_epoch=False):
    """Train the PPO agent in the real environment."""
    del is_final_epoch, data_dir
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_params_names = [
        "epochs_num", "epoch_length", "learning_rate", "num_agents",
        "eval_every_epochs", "optimization_epochs", "effective_num_agents"
    ]

    # This should be overridden.
    ppo_hparams.add_hparam("effective_num_agents", None)
    for param_name in ppo_params_names:
        ppo_param_name = "real_ppo_" + param_name
        if ppo_param_name in hparams:
            ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

    completed_ppo_epochs_num += real_ppo_epoch_increment(hparams)
    ppo_hparams.epochs_num = completed_ppo_epochs_num
    # We do not save model, as that resets frames that we need at restarts.
    # But we need to save at the last step, so we set it very high.
    ppo_hparams.save_models_every_epochs = 1000000

    ppo_hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
    ppo_hparams.add_hparam("force_beginning_resets", False)
    ppo_hparams.add_hparam("frame_stack_size", hparams.frame_stack_size)

    rl_trainer_lib.train(ppo_hparams,
                         event_dir + "real",
                         agent_model_dir,
                         name_scope="ppo_real%d" % (epoch + 1))

    # Save unfinished rollouts to history.
    env.reset()

    return completed_ppo_epochs_num
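
Note that in these variants epochs_num is a cumulative target rather than a per-call count: each call adds an increment to completed_ppo_epochs_num, presumably so that the PPO trainer, resuming from its checkpoint, runs up to the new total. A tiny sketch of that bookkeeping with hypothetical increments:

# The increments are hypothetical; in the snippets above they come from
# sim_ppo_epoch_increment() and real_ppo_epoch_increment().
completed_ppo_epochs_num = 0
for epoch in range(3):
  completed_ppo_epochs_num += 1000  # simulated-env PPO epochs this iteration
  print("simulated training up to PPO epoch", completed_ppo_epochs_num)
  completed_ppo_epochs_num += 100   # real-env PPO epochs this iteration
  print("real-env training up to PPO epoch", completed_ppo_epochs_num)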
Example #19
def train_agent(problem_name, agent_model_dir,
                event_dir, world_model_dir, epoch_data_dir, hparams,
                autoencoder_path=None, epoch=0):
  """Train the PPO agent in the simulated environment."""
  gym_problem = registry.problem(problem_name)
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = ["epochs_num", "epoch_length",
                      "learning_rate", "num_agents",
                      "optimization_epochs"]

  for param_name in ppo_params_names:
    ppo_param_name = "ppo_"+ param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  ppo_epochs_num = hparams.ppo_epochs_num
  ppo_hparams.save_models_every_epochs = ppo_epochs_num
  ppo_hparams.world_model_dir = world_model_dir
  ppo_hparams.add_hparam("force_beginning_resets", True)

  # Adding model hparams for model specific adjustments
  model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
  ppo_hparams.add_hparam("model_hparams", model_hparams)

  environment_spec = copy.copy(gym_problem.environment_spec)
  environment_spec.simulation_random_starts = hparams.simulation_random_starts
  environment_spec.intrinsic_reward_scale = hparams.intrinsic_reward_scale

  ppo_hparams.add_hparam("environment_spec", environment_spec)

  with temporary_flags({
      "problem": problem_name,
      "model": hparams.generative_model,
      "hparams_set": hparams.generative_model_params,
      "output_dir": world_model_dir,
      "data_dir": epoch_data_dir,
      "autoencoder_path": autoencoder_path,
  }):
    rl_trainer_lib.train(ppo_hparams, event_dir, agent_model_dir, epoch=epoch)
  def test_no_crash_pendulum(self):
    hparams = trainer_lib.create_hparams(
        "continuous_action_base", "epochs_num=11,video_during_eval=False")
    rl_trainer_lib.train(hparams, "Pendulum-v0")
  def test_no_crash_cartpole(self):
    hparams = trainer_lib.create_hparams(
        "discrete_action_base", "epochs_num=11,video_during_eval=False")
    rl_trainer_lib.train(hparams, "CartPole-v0")
Example #23
def main(_):
    hparams = trainer_lib.create_hparams(FLAGS.hparams_set, FLAGS.hparams)
    rl_trainer_lib.train(hparams, FLAGS.output_dir, FLAGS.output_dir)
def train(hparams, output_dir, report_fn=None):
  hparams = initialize_env_specs(hparams)
  rl_trainer_lib.train(hparams, output_dir, output_dir, report_fn=report_fn)
Example #25
def train_agent(real_env,
                agent_model_dir,
                event_dir,
                world_model_dir,
                data_dir,
                hparams,
                ppo_epochs_num,
                epoch=0,
                is_final_epoch=False):
    """Train the PPO agent in the simulated environment."""
    del data_dir
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_params_names = [
        "epochs_num", "epoch_length", "learning_rate", "num_agents",
        "optimization_epochs", "eval_every_epochs"
    ]

    for param_name in ppo_params_names:
        ppo_param_name = "ppo_" + param_name
        if ppo_param_name in hparams:
            ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

    ppo_epochs_num += sim_ppo_epoch_increment(hparams, is_final_epoch)
    ppo_hparams.epochs_num = ppo_epochs_num

    ppo_hparams.save_models_every_epochs = 10
    ppo_hparams.world_model_dir = world_model_dir

    environment_spec_params = {
        param_name: hparams.get(param_name)
        for param_name in [
            "intrinsic_reward_scale", "simulation_random_starts",
            "simulation_flip_first_random_for_beginning"
        ]
    }
    environment_spec_params.update({
        "model_name":
        hparams.generative_model,
        "model_hparams":
        trainer_lib.create_hparams(hparams.generative_model_params),
        # Hardcoded for now. TODO(koz4k): Make it a hparam.
        "video_num_input_frames":
        4,
        "video_num_target_frames":
        1
    })
    environment_spec = rl.standard_atari_env_simulated_spec(
        real_env, **environment_spec_params)

    with tf.Session() as sess:
        encoded_png_p = tf.placeholder(tf.string)
        decoded_png_t = tf.image.decode_png(encoded_png_p)

        def decode_png(encoded_png):
            return sess.run(decoded_png_t,
                            feed_dict={encoded_png_p: encoded_png})

        num_input_frames = environment_spec.video_num_input_frames
        initial_frame_rollouts = real_env.current_epoch_rollouts(
            split=tf.contrib.learn.ModeKeys.TRAIN,
            minimal_rollout_frames=num_input_frames,
        )

        # TODO(koz4k): Move this to a different module.
        def initial_frame_chooser(batch_size):
            """Frame chooser."""

            deterministic_initial_frames = (
                initial_frame_rollouts[0][:num_input_frames])
            if not environment_spec.simulation_random_starts:
                # Deterministic starts: repeat first frames from the first rollout.
                initial_frames = [deterministic_initial_frames] * batch_size
            else:
                # Random starts: choose random initial frames from random rollouts.
                # TODO(koz4k): Weigh rollouts by their lengths so sampling is uniform
                # over frames and not rollouts.
                def choose_initial_frames():
                    try:
                        rollout = random.choice(initial_frame_rollouts)
                        from_index = random.randrange(
                            len(rollout) - num_input_frames + 1)
                        return rollout[from_index:(from_index +
                                                   num_input_frames)]
                    except ValueError:
                        # Rollout too short; repeat.
                        return choose_initial_frames()

                initial_frames = [
                    choose_initial_frames() for _ in range(batch_size)
                ]
                if environment_spec.simulation_flip_first_random_for_beginning:
                    # Flip first entry in the batch for deterministic initial frames.
                    initial_frames[0] = deterministic_initial_frames

            return np.stack([[
                decode_png(frame.observation) for frame in initial_frame_stack
            ] for initial_frame_stack in initial_frames])

        environment_spec.add_hparam("initial_frame_chooser",
                                    initial_frame_chooser)

        ppo_hparams.add_hparam("environment_spec", environment_spec)

        rl_trainer_lib.train(ppo_hparams,
                             event_dir + "sim",
                             agent_model_dir,
                             name_scope="ppo_sim%d" % (epoch + 1))

    return ppo_epochs_num
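
choose_initial_frames above retries whenever random.randrange raises ValueError, i.e. when the sampled rollout is shorter than num_input_frames. An iterative, dependency-free sketch of the same behaviour on toy rollouts:

import random


def sample_window(rollouts, window):
  # Keep sampling rollouts until one is long enough to hold the window;
  # mirrors the try/except ValueError retry in the snippet above.
  while True:
    rollout = random.choice(rollouts)
    if len(rollout) >= window:
      start = random.randrange(len(rollout) - window + 1)
      return rollout[start:start + window]


rollouts = [list(range(2)), list(range(10)), list(range(7))]
print(sample_window(rollouts, window=4))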
  def test_no_crash_pendulum(self):
    hparams = trainer_lib.create_hparams("pendulum_base", "epochs_num=10")
    rl_trainer_lib.train(hparams, "Pendulum-v0")
  def test_no_crash_cartpole(self):
    hparams = trainer_lib.create_hparams("cartpole_base", "epochs_num=10")
    rl_trainer_lib.train(hparams, "CartPole-v0")
def train(hparams, output_dir):
    """Training function."""
    prefix = output_dir
    data_dir = os.path.expanduser(prefix + "/data")
    tmp_dir = os.path.expanduser(prefix + "/tmp")
    output_dir = os.path.expanduser(prefix + "/output")
    tf.gfile.MakeDirs(data_dir)
    tf.gfile.MakeDirs(tmp_dir)
    tf.gfile.MakeDirs(output_dir)
    last_model = ""
    start_time = time.time()
    line = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>    "
    for iloop in range(hparams.epochs):
        time_delta = time.time() - start_time
        print(line + "Step {}.1. - generate data from policy. "
              "Time: {}".format(iloop,
                                str(datetime.timedelta(seconds=time_delta))))
        FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
        FLAGS.agent_policy_path = last_model
        gym_problem = registry.problem(FLAGS.problem)
        gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
        iter_data_dir = os.path.join(data_dir, str(iloop))
        tf.gfile.MakeDirs(iter_data_dir)
        gym_problem.generate_data(iter_data_dir, tmp_dir)

        time_delta = time.time() - start_time
        print(line + "Step {}.2. - generate env model. "
              "Time: {}".format(iloop,
                                str(datetime.timedelta(seconds=time_delta))))
        # 2. generate env model
        FLAGS.data_dir = iter_data_dir
        FLAGS.output_dir = output_dir
        FLAGS.model = hparams.generative_model
        FLAGS.hparams_set = hparams.generative_model_params
        FLAGS.train_steps = hparams.model_train_steps * (iloop + 2)
        FLAGS.eval_steps = 10
        t2t_trainer.main([])

        # Dump frames from env model.
        time_delta = time.time() - start_time
        print(line + "Step {}.3. - evaluate env model. "
              "Time: {}".format(iloop,
                                str(datetime.timedelta(seconds=time_delta))))
        gym_simulated_problem = registry.problem(
            "gym_simulated_discrete_problem_with_agent_on_%s" % hparams.game)
        sim_steps = hparams.simulated_env_generator_num_steps
        gym_simulated_problem.settable_num_steps = sim_steps
        gym_simulated_problem.generate_data(iter_data_dir, tmp_dir)

        # PPO.
        time_delta = time.time() - start_time
        print(line + "Step {}.4. - train PPO in model env."
              " Time: {}".format(iloop,
                                 str(datetime.timedelta(seconds=time_delta))))
        ppo_epochs_num = hparams.ppo_epochs_num
        ppo_hparams = trainer_lib.create_hparams(
            "atari_base",
            "epochs_num={},simulated_environment=True,eval_every_epochs=0,"
            "save_models_every_epochs={}".format(ppo_epochs_num + 1,
                                                 ppo_epochs_num),
            data_dir=output_dir)
        ppo_hparams.epoch_length = hparams.ppo_epoch_length
        ppo_dir = tempfile.mkdtemp(dir=data_dir, prefix="ppo_")
        in_graph_wrappers = [(TimeLimitWrapper, {
            "timelimit": hparams.ppo_time_limit
        }), (MaxAndSkipWrapper, {
            "skip": 4
        })]
        in_graph_wrappers += gym_problem.in_graph_wrappers
        ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)
        ppo_hparams.num_agents = hparams.ppo_num_agents
        rl_trainer_lib.train(ppo_hparams, gym_simulated_problem.env_name,
                             ppo_dir)

        last_model = ppo_dir + "/model{}.ckpt".format(ppo_epochs_num)
  def test_train_pong(self):
    hparams = registry.hparams("pong_model_free")
    hparams.epochs_num = 2
    hparams.num_agents = 2
    hparams.epoch_length = 3
    rl_trainer_lib.train(hparams)
Example #30
  def test_no_crash_cartpole(self):
    hparams = trainer_lib.create_hparams(
        "discrete_action_base",
        TrainTest.test_config)
    rl_trainer_lib.train(hparams, "CartPole-v0")
Example #31
  def test_no_crash_pendulum(self):
    hparams = trainer_lib.create_hparams(
        "continuous_action_base",
        TrainTest.test_config)
    rl_trainer_lib.train(hparams, "Pendulum-v0")
Example #32
def train(hparams, output_dir):
  """Training function."""
  prefix = output_dir
  data_dir = os.path.expanduser(prefix + "/data")
  tmp_dir = os.path.expanduser(prefix + "/tmp")
  output_dir = os.path.expanduser(prefix + "/output")
  autoencoder_dir = os.path.expanduser(prefix + "/autoencoder")
  tf.gfile.MakeDirs(data_dir)
  tf.gfile.MakeDirs(tmp_dir)
  tf.gfile.MakeDirs(output_dir)
  tf.gfile.MakeDirs(autoencoder_dir)
  last_model = ""
  start_time = time.time()
  line = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>    "
  epoch_metrics = []
  iter_data_dirs = []
  ae_data_dirs = []
  orig_autoencoder_path = FLAGS.autoencoder_path
  for iloop in range(hparams.epochs):
    # Train autoencoder if needed.
    if (hparams.autoencoder_train_steps > 0 and iloop == 0 and
        not orig_autoencoder_path):
      time_delta = time.time() - start_time
      tf.logging.info("%s Step AE - train autoencoder. Time: %s",
                      line, str(datetime.timedelta(seconds=time_delta)))
      with tf.Graph().as_default():
        # Generate data.
        FLAGS.autoencoder_path = ""
        FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
        FLAGS.agent_policy_path = ""
        gym_problem = registry.problem(FLAGS.problem)
        gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
        ae_data_dir = os.path.join(data_dir, "ae%d" % iloop)
        ae_data_dirs.append(ae_data_dir)
        tf.gfile.MakeDirs(ae_data_dir)
        gym_problem.generate_data(ae_data_dir, tmp_dir)
        if ae_data_dirs[:-1]:
          combine_world_model_train_data(gym_problem,
                                         ae_data_dir,
                                         ae_data_dirs[:-1])
        # Train AE.
        FLAGS.data_dir = ae_data_dir
        FLAGS.output_dir = autoencoder_dir
        # TODO(lukaszkaiser): make non-hardcoded here and in gym_problems.py.
        FLAGS.model = "autoencoder_ordered_discrete"
        FLAGS.hparams_set = "autoencoder_discrete_pong"
        FLAGS.train_steps = hparams.autoencoder_train_steps * (iloop + 2)
        FLAGS.eval_steps = 100
        t2t_trainer.main([])
        FLAGS.autoencoder_path = autoencoder_dir

    # Generate random frames.
    if iloop == 0:
      time_delta = time.time() - start_time
      tf.logging.info("%s Step %d.0 - generate random data. Time: %s",
                      line, iloop, str(datetime.timedelta(seconds=time_delta)))
      FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
      FLAGS.agent_policy_path = ""
      gym_problem = registry.problem(FLAGS.problem)
      gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
      iter_data_dir = os.path.join(data_dir, "0random")
      iter_data_dirs.append(iter_data_dir)
      tf.gfile.MakeDirs(iter_data_dir)
      gym_problem.generate_data(iter_data_dir, tmp_dir)
      mean_reward = gym_problem.sum_of_rewards / max(1.0, gym_problem.dones)
      tf.logging.info("%s Step 0.0 random reward: %.4f" % (line, mean_reward))

    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.1 - generate env model. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))

    # Train env model
    FLAGS.data_dir = iter_data_dir
    FLAGS.output_dir = output_dir
    FLAGS.model = hparams.generative_model
    FLAGS.hparams_set = hparams.generative_model_params
    FLAGS.train_steps = hparams.model_train_steps * (iloop + 2)
    FLAGS.eval_steps = 100
    t2t_trainer.main([])

    # Evaluate and dump frames from env model
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.1a - evaluate env model. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    gym_simulated_problem = registry.problem(
        "gym_simulated_discrete_problem_with_agent_on_%s" % hparams.game)
    sim_steps = hparams.simulated_env_generator_num_steps
    gym_simulated_problem.settable_num_steps = sim_steps
    gym_simulated_problem.real_env_problem = gym_problem
    gym_simulated_problem.simulation_random_starts = False
    gym_simulated_problem.intrinsic_reward_scale = 0.
    gym_simulated_problem.generate_data(iter_data_dir, tmp_dir)
    model_reward_accuracy = 0.0
    if gym_simulated_problem.dones != 0:
      n = float(gym_simulated_problem.dones)
      model_reward_accuracy = (
          gym_simulated_problem.successful_episode_reward_predictions / n)
    tf.logging.info("%s Step %d.1a env model reward accuracy: %.4f" % (
        line, iloop, model_reward_accuracy))

    # Train PPO agent
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.2 - train PPO in model env. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))

    # Setup PPO hparams
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params,
                                             data_dir=output_dir)
    ppo_epochs_num = hparams.ppo_epochs_num
    ppo_hparams.epochs_num = ppo_epochs_num
    ppo_hparams.simulated_environment = True
    ppo_hparams.simulation_random_starts = hparams.simulation_random_starts
    ppo_hparams.intrinsic_reward_scale = hparams.intrinsic_reward_scale
    ppo_hparams.eval_every_epochs = 0
    ppo_hparams.save_models_every_epochs = ppo_epochs_num
    ppo_hparams.epoch_length = hparams.ppo_epoch_length
    ppo_hparams.num_agents = hparams.ppo_num_agents
    ppo_hparams.problem = gym_problem

    in_graph_wrappers = [
        (TimeLimitWrapper, {"timelimit": hparams.ppo_time_limit}),
        (MaxAndSkipWrapper, {"skip": 4})]
    in_graph_wrappers += gym_problem.in_graph_wrappers
    ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)

    ppo_dir = generator_utils.make_tmp_dir(dir=data_dir, prefix="ppo_")
    rl_trainer_lib.train(ppo_hparams, gym_simulated_problem.env_name, ppo_dir)
    last_model = ppo_dir

    # Evaluate agent.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.3 - evaluate agent. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
    FLAGS.agent_policy_path = last_model
    eval_gym_problem = registry.problem(FLAGS.problem)
    eval_gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
    eval_gym_problem.eval_runs = 5
    eval_data_dir = os.path.join(data_dir, str(iloop)+"eval")
    iter_data_dirs.append(eval_data_dir)
    tf.gfile.MakeDirs(eval_data_dir)
    eval_gym_problem.generate_data(eval_data_dir, tmp_dir)

    # Generate environment frames.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.4 - generate environment data. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    gym_problem = registry.problem(FLAGS.problem)
    gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
    iter_data_dir = os.path.join(data_dir, str(iloop))
    iter_data_dirs.append(iter_data_dir)
    tf.gfile.MakeDirs(iter_data_dir)
    gym_problem.generate_data(iter_data_dir, tmp_dir)
    combine_world_model_train_data(gym_problem,
                                   iter_data_dir,
                                   iter_data_dirs[:-1])

    mean_reward = 0.0
    if eval_gym_problem.dones != 0:
      mean_reward = eval_gym_problem.sum_of_rewards / float(eval_gym_problem.dones)
    tf.logging.info("%s Step %d mean reward: %.4f" % (line, iloop, mean_reward))

    # Report metrics.
    eval_metrics = {"model_reward_accuracy": model_reward_accuracy,
                    "mean_reward": mean_reward}
    epoch_metrics.append(eval_metrics)

  # Report the evaluation metrics from the final epoch
  return epoch_metrics[-1]
Example #33
def main(_):
  hparams = trainer_lib.create_hparams(FLAGS.hparams_set, FLAGS.hparams)
  rl_trainer_lib.train(hparams, FLAGS.problem, FLAGS.output_dir)
Example #34
def main(_):
    rl_trainer_lib.train(rl_trainer_lib.example_params())
Example #35
def train(hparams, output_dir):
  """Training function."""
  prefix = output_dir
  data_dir = os.path.expanduser(prefix + "/data")
  tmp_dir = os.path.expanduser(prefix + "/tmp")
  output_dir = os.path.expanduser(prefix + "/output")
  tf.gfile.MakeDirs(data_dir)
  tf.gfile.MakeDirs(tmp_dir)
  tf.gfile.MakeDirs(output_dir)
  last_model = ""
  start_time = time.time()
  line = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>    "
  epoch_metrics = []
  for iloop in range(hparams.epochs):
    # Generate random frames.
    if iloop == 0:
      time_delta = time.time() - start_time
      tf.logging.info("%s Step %d.0 - generate random data. Time: %s",
                      line, iloop, str(datetime.timedelta(seconds=time_delta)))
      FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
      FLAGS.agent_policy_path = ""
      gym_problem = registry.problem(FLAGS.problem)
      gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
      iter_data_dir = os.path.join(data_dir, "0random")
      tf.gfile.MakeDirs(iter_data_dir)
      gym_problem.generate_data(iter_data_dir, tmp_dir)
      mean_reward = gym_problem.sum_of_rewards / max(1.0, gym_problem.dones)
      tf.logging.info("%s Step 0.0 random reward: %.4f" % (line, mean_reward))

    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.1 - generate env model. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))

    # Train env model
    FLAGS.data_dir = iter_data_dir
    FLAGS.output_dir = output_dir
    FLAGS.model = hparams.generative_model
    FLAGS.hparams_set = hparams.generative_model_params
    FLAGS.train_steps = hparams.model_train_steps * (iloop + 2)
    FLAGS.eval_steps = 10
    t2t_trainer.main([])

    # Evaluate and dump frames from env model
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.1a - evaluate env model. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    gym_simulated_problem = registry.problem(
        "gym_simulated_discrete_problem_with_agent_on_%s" % hparams.game)
    sim_steps = hparams.simulated_env_generator_num_steps
    gym_simulated_problem.settable_num_steps = sim_steps
    gym_simulated_problem.real_env_problem = gym_problem
    gym_simulated_problem.generate_data(iter_data_dir, tmp_dir)
    model_reward_accuracy = 0.0
    if gym_simulated_problem.dones != 0:
      n = float(gym_simulated_problem.dones)
      model_reward_accuracy = (
          gym_simulated_problem.successful_episode_reward_predictions / n)

    # Train PPO agent
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.2 - train PPO in model env. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))

    # Setup PPO hparams
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params,
                                             data_dir=output_dir)
    ppo_epochs_num = hparams.ppo_epochs_num
    ppo_hparams.epochs_num = ppo_epochs_num
    ppo_hparams.simulated_environment = True
    ppo_hparams.eval_every_epochs = 0
    ppo_hparams.save_models_every_epochs = ppo_epochs_num
    ppo_hparams.epoch_length = hparams.ppo_epoch_length
    ppo_hparams.num_agents = hparams.ppo_num_agents
    ppo_hparams.problem = gym_problem

    in_graph_wrappers = [
        (TimeLimitWrapper, {"timelimit": hparams.ppo_time_limit}),
        (MaxAndSkipWrapper, {"skip": 4})]
    in_graph_wrappers += gym_problem.in_graph_wrappers
    ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)

    ppo_dir = generator_utils.make_tmp_dir(dir=data_dir, prefix="ppo_")
    rl_trainer_lib.train(ppo_hparams, gym_simulated_problem.env_name, ppo_dir)
    last_model = ppo_dir

    # Generate environment frames.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.3 - generate environment data. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
    FLAGS.agent_policy_path = last_model
    gym_problem = registry.problem(FLAGS.problem)
    gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
    iter_data_dir = os.path.join(data_dir, str(iloop))
    tf.gfile.MakeDirs(iter_data_dir)
    gym_problem.generate_data(iter_data_dir, tmp_dir)
    mean_reward = 0.0
    if gym_problem.dones != 0:
      mean_reward = gym_problem.sum_of_rewards / float(gym_problem.dones)
    tf.logging.info("%s Step %d mean reward: %.4f" % (line, iloop, mean_reward))

    # Report metrics.
    eval_metrics = {"model_reward_accuracy": model_reward_accuracy,
                    "mean_reward": mean_reward}
    epoch_metrics.append(eval_metrics)

  # Report the evaluation metrics from the final epoch
  return epoch_metrics[-1]
  def test_no_crash_cartpole(self):
    hparams = trainer_lib.create_hparams(
        "discrete_action_base", "epochs_num=11,video_during_eval=False")
    rl_trainer_lib.train(hparams, "CartPole-v0")