def train_agent_real_env(
    env, agent_model_dir, event_dir, epoch_data_dir, hparams,
    epoch=0, is_final_epoch=False):
  """Train the PPO agent in the real environment."""
  del epoch_data_dir

  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = ["epochs_num", "epoch_length", "learning_rate",
                      "num_agents", "eval_every_epochs",
                      "optimization_epochs", "effective_num_agents"]

  # This should be overridden.
  ppo_hparams.add_hparam("effective_num_agents", None)
  for param_name in ppo_params_names:
    ppo_param_name = "real_ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  ppo_hparams.epochs_num = _ppo_training_epochs(hparams, epoch,
                                                is_final_epoch, True)
  # We do not save the model, as that resets frames that we need at restarts.
  # But we need to save at the last step, so we set it very high.
  ppo_hparams.save_models_every_epochs = 1000000

  environment_spec = rl.standard_atari_env_spec(
      batch_env=env, include_clipping=False
  )
  ppo_hparams.add_hparam("environment_spec", environment_spec)

  rl_trainer_lib.train(ppo_hparams, event_dir + "real", agent_model_dir,
                       name_scope="ppo_real%d" % (epoch + 1))

  # Save unfinished rollouts to history.
  env.reset()
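# `_ppo_training_epochs` is used above but not defined in these snippets. A
# minimal sketch of one plausible implementation, assuming `epochs_num` is a
# cumulative target (PPO resumes from the last checkpoint each loop epoch, so
# the target grows rather than resets) and that the final epoch trains the
# simulated-env agent longer, matching the `*= 2` logic in a later variant of
# `train_agent`:
def _ppo_training_epochs(hparams, epoch, is_final_epoch, real_env):
  """Cumulative number of PPO epochs to train up to at this loop epoch."""
  per_epoch = (hparams.real_ppo_epochs_num if real_env
               else hparams.ppo_epochs_num)
  total = per_epoch * (epoch + 1)
  if is_final_epoch and not real_env:
    total += per_epoch  # Train the simulated-env agent longer at the end.
  return total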
def test_train_pong(self):
  hparams = tf.contrib.training.HParams(
      epochs_num=4,
      eval_every_epochs=2,
      num_agents=10,
      optimization_epochs=3,
      epoch_length=30,
      entropy_loss_coef=0.003,
      learning_rate=8e-05,
      optimizer="Adam",
      policy_network=rl_models.feed_forward_cnn_small_categorical_fun,
      gae_lambda=0.985,
      num_eval_agents=1,
      max_gradients_norm=0.5,
      gae_gamma=0.985,
      optimization_batch_size=4,
      clipping_coef=0.2,
      value_loss_coef=1,
      save_models_every_epochs=False)
  hparams.add_hparam(
      "environment_spec",
      gym_problems.standard_atari_env_spec("PongNoFrameskip-v4"))
  hparams.add_hparam(
      "environment_eval_spec",
      gym_problems.standard_atari_env_eval_spec("PongNoFrameskip-v4"))
  rl_trainer_lib.train(hparams)
def test_no_crash_pendulum(self):
  hparams = trainer_lib.create_hparams(
      "ppo_continuous_action_base", TrainTest.test_config)
  hparams.add_hparam("environment_spec", simple_gym_spec("Pendulum-v0"))
  rl_trainer_lib.train(hparams)
def train(self, env_fn, hparams, target_num_epochs, simulated, epoch):
  hparams.set_hparam("epochs_num", target_num_epochs)
  if simulated:
    simulated_str = "sim"
    hparams.save_models_every_epochs = 10
  else:
    # TODO(konradczechowski): Refactor PPO.
    assert hparams.num_agents == 1
    # We do not save the model, as that resets frames that we need at
    # restarts. But we need to save at the last step, so we set it very high.
    hparams.save_models_every_epochs = 1000000
    simulated_str = "real"

  # TODO(konradczechowski): Refactor PPO, pass these as arguments
  # (not inside hparams). Do the same in evaluate().
  hparams.add_hparam("force_beginning_resets", simulated)
  hparams.add_hparam("env_fn", env_fn)
  hparams.add_hparam("frame_stack_size", self.frame_stack_size)
  name_scope = "ppo_{}{}".format(simulated_str, epoch + 1)
  rl_trainer_lib.train(hparams, self.event_dir + simulated_str,
                       self.agent_model_dir, name_scope=name_scope)
def test_no_crash_pendulum(self):
  hparams = trainer_lib.create_hparams("ppo_continuous_action_base",
                                       TrainTest.test_config)
  hparams.add_hparam("environment_spec",
                     rl_models.simple_gym_spec("Pendulum-v0"))
  rl_trainer_lib.train(hparams)
def test_no_crash_cartpole(self):
  hparams = trainer_lib.create_hparams("ppo_discrete_action_base",
                                       TrainTest.test_config)
  hparams.add_hparam("environment_spec",
                     standard_atari_env_spec("CartPole-v0"))
  rl_trainer_lib.train(hparams)
def test_no_crash_cartpole(self):
  hparams = trainer_lib.create_hparams("ppo_discrete_action_base",
                                       TrainTest.test_config)
  hparams.add_hparam("environment_spec",
                     rl_models.simple_gym_spec("CartPole-v0"))
  rl_trainer_lib.train(hparams)
def train_agent(real_env, agent_model_dir, event_dir, world_model_dir,
                data_dir, hparams, completed_ppo_epochs_num, epoch=0,
                is_final_epoch=False):
  """Train the PPO agent in the simulated environment."""
  del data_dir

  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = ["epochs_num", "epoch_length", "learning_rate",
                      "num_agents", "optimization_epochs", "eval_every_epochs"]
  for param_name in ppo_params_names:
    ppo_param_name = "ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  completed_ppo_epochs_num += sim_ppo_epoch_increment(hparams, is_final_epoch)
  ppo_hparams.epochs_num = completed_ppo_epochs_num
  ppo_hparams.save_models_every_epochs = 10
  ppo_hparams.world_model_dir = world_model_dir

  environment_spec = make_simulated_env_spec(real_env, hparams)

  num_input_frames = environment_spec.video_num_input_frames
  initial_frame_rollouts = real_env.current_epoch_rollouts(
      split=tf.contrib.learn.ModeKeys.TRAIN,
      minimal_rollout_frames=num_input_frames,
  )

  # TODO(koz4k): Move this to a different module.
  def initial_frame_chooser(batch_size):
    """Frame chooser."""
    deterministic_initial_frames = (
        initial_frame_rollouts[0][:num_input_frames])
    if not hparams.simulation_random_starts:
      # Deterministic starts: repeat first frames from the first rollout.
      initial_frames = [deterministic_initial_frames] * batch_size
    else:
      # Random starts: choose random initial frames from random rollouts.
      initial_frames = random_rollout_subsequences(
          initial_frame_rollouts, batch_size, num_input_frames
      )
      if hparams.simulation_flip_first_random_for_beginning:
        # Flip first entry in the batch for deterministic initial frames.
        initial_frames[0] = deterministic_initial_frames

    return np.stack([
        [frame.observation.decode() for frame in initial_frame_stack]
        for initial_frame_stack in initial_frames
    ])

  environment_spec.add_hparam("initial_frame_chooser", initial_frame_chooser)
  ppo_hparams.add_hparam("environment_spec", environment_spec)

  rl_trainer_lib.train(ppo_hparams, event_dir + "sim", agent_model_dir,
                       name_scope="ppo_sim%d" % (epoch + 1))

  return completed_ppo_epochs_num
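# `random_rollout_subsequences` is used by the frame chooser above but not
# shown in these snippets. A sketch mirroring the inline
# `choose_initial_frames` logic from a later `train_agent` variant; the
# signature is assumed:
import random

def random_rollout_subsequences(rollouts, count, subsequence_length):
  """Chooses `count` random frame subsequences from the given rollouts."""
  def choose_subsequence():
    rollout = random.choice(rollouts)
    try:
      from_index = random.randrange(len(rollout) - subsequence_length + 1)
    except ValueError:
      # Rollout too short; retry with another one.
      return choose_subsequence()
    return rollout[from_index:(from_index + subsequence_length)]
  return [choose_subsequence() for _ in range(count)]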
def test_no_crash_cartpole(self):
  hparams = trainer_lib.create_hparams(
      "ppo_discrete_action_base", TrainTest.test_config)
  hparams.add_hparam("environment_spec",
                     standard_atari_env_spec("CartPole-v0"))
  rl_trainer_lib.train(hparams)
def train(hparams, output_dir):
  prefix = output_dir
  data_dir = os.path.expanduser(prefix + "/data")
  tmp_dir = os.path.expanduser(prefix + "/tmp")
  output_dir = os.path.expanduser(prefix + "/output")
  tf.gfile.MakeDirs(data_dir)
  tf.gfile.MakeDirs(tmp_dir)
  tf.gfile.MakeDirs(output_dir)

  last_model = ""
  start_time = time.time()
  line = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> "
  for iloop in range(hparams.epochs):
    # 1. Generate data from the policy.
    time_delta = time.time() - start_time
    print(line + "Step {}.1. - generate data from policy. "
          "Time: {}".format(iloop,
                            str(datetime.timedelta(seconds=time_delta))))
    FLAGS.problems = "gym_discrete_problem"
    FLAGS.agent_policy_path = last_model
    gym_problem = problems.problem(FLAGS.problems)
    gym_problem.num_steps = hparams.true_env_generator_num_steps
    iter_data_dir = os.path.join(data_dir, str(iloop))
    tf.gfile.MakeDirs(iter_data_dir)
    gym_problem.generate_data(iter_data_dir, tmp_dir)

    # 2. Generate env model.
    time_delta = time.time() - start_time
    print(line + "Step {}.2. - generate env model. "
          "Time: {}".format(iloop,
                            str(datetime.timedelta(seconds=time_delta))))
    FLAGS.data_dir = iter_data_dir
    FLAGS.output_dir = output_dir
    FLAGS.model = hparams.generative_model
    FLAGS.hparams_set = hparams.generative_model_params
    FLAGS.train_steps = hparams.model_train_steps
    FLAGS.eval_steps = 1
    t2t_trainer.main([])

    # 3. Evaluate env model.
    time_delta = time.time() - start_time
    print(line + "Step {}.3. - evaluate env model. "
          "Time: {}".format(iloop,
                            str(datetime.timedelta(seconds=time_delta))))
    gym_simulated_problem = problems.problem("gym_simulated_discrete_problem")
    gym_simulated_problem.num_steps = hparams.simulated_env_generator_num_steps
    gym_simulated_problem.generate_data(iter_data_dir, tmp_dir)

    # 4. Train PPO in the model env.
    time_delta = time.time() - start_time
    print(line + "Step {}.4. - train PPO in model env. "
          "Time: {}".format(iloop,
                            str(datetime.timedelta(seconds=time_delta))))
    ppo_epochs_num = hparams.ppo_epochs_num
    ppo_hparams = trainer_lib.create_hparams(
        "atari_base",
        "epochs_num={},simulated_environment=True,eval_every_epochs=0,"
        "save_models_every_epochs={}".format(ppo_epochs_num + 1,
                                             ppo_epochs_num),
        data_dir=output_dir)
    ppo_hparams.epoch_length = hparams.ppo_epoch_length
    ppo_dir = tempfile.mkdtemp(dir=data_dir, prefix="ppo_")
    in_graph_wrappers = [
        (TimeLimitWrapper, {"timelimit": 150}),
        (PongT2TGeneratorHackWrapper, {"add_value": -2})
    ] + gym_problem.in_graph_wrappers
    ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)
    rl_trainer_lib.train(ppo_hparams, "PongNoFrameskip-v4", ppo_dir)
    last_model = ppo_dir + "/model{}.ckpt".format(ppo_epochs_num)
def train_agent(problem_name, agent_model_dir, event_dir, world_model_dir,
                epoch_data_dir, hparams, autoencoder_path=None, epoch=0):
  """Train the PPO agent in the simulated environment."""
  gym_problem = registry.problem(problem_name)
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_epochs_num = hparams.ppo_epochs_num
  ppo_hparams.epochs_num = ppo_epochs_num
  ppo_hparams.eval_every_epochs = 50
  ppo_hparams.save_models_every_epochs = ppo_epochs_num
  ppo_hparams.epoch_length = hparams.ppo_epoch_length
  ppo_hparams.num_agents = hparams.ppo_num_agents
  ppo_hparams.world_model_dir = world_model_dir
  if hparams.ppo_learning_rate:
    ppo_hparams.learning_rate = hparams.ppo_learning_rate

  # Add model hparams for model-specific adjustments.
  model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
  ppo_hparams.add_hparam("model_hparams", model_hparams)

  environment_spec = copy.copy(gym_problem.environment_spec)
  environment_spec.simulated_env = True
  environment_spec.add_hparam("simulation_random_starts",
                              hparams.simulation_random_starts)
  environment_spec.add_hparam("intrinsic_reward_scale",
                              hparams.intrinsic_reward_scale)
  environment_spec.add_hparam("initial_frames_problem", gym_problem)

  # 4x for the StackAndSkipWrapper minus one to always finish for reporting.
  ppo_time_limit = ppo_hparams.epoch_length - 1
  ppo_time_limit *= model_hparams.video_num_input_frames
  wrappers = environment_spec.wrappers + [
      [TimeLimitWrapper, {"timelimit": ppo_time_limit}]]
  environment_spec.wrappers = wrappers

  ppo_hparams.add_hparam("environment_spec", environment_spec)

  with temporary_flags({
      "problem": problem_name,
      "model": hparams.generative_model,
      "hparams_set": hparams.generative_model_params,
      "output_dir": world_model_dir,
      "data_dir": epoch_data_dir,
      "autoencoder_path": autoencoder_path,
  }):
    rl_trainer_lib.train(ppo_hparams, event_dir, agent_model_dir, epoch=epoch)
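# This and several other variants below run `rl_trainer_lib.train` under a
# `temporary_flags` helper that is not defined in these snippets. A minimal
# sketch, assuming FLAGS is the usual TF1-style global flags object: set the
# given flags for the duration of the block, then restore the old values.
import contextlib

import tensorflow as tf

FLAGS = tf.flags.FLAGS

@contextlib.contextmanager
def temporary_flags(flag_settings):
  old_values = {name: getattr(FLAGS, name) for name in flag_settings}
  for name, value in flag_settings.items():
    setattr(FLAGS, name, value)
  try:
    yield
  finally:
    for name, value in old_values.items():
      setattr(FLAGS, name, value)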
def train_agent(environment_spec, agent_model_dir, event_dir, world_model_dir,
                epoch_data_dir, hparams, epoch=0, is_final_epoch=False):
  """Train the PPO agent in the simulated environment."""
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = [
      "epochs_num", "epoch_length", "learning_rate", "num_agents",
      "optimization_epochs", "eval_every_epochs"
  ]
  for param_name in ppo_params_names:
    ppo_param_name = "ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  ppo_hparams.epochs_num = _ppo_training_epochs(hparams, epoch,
                                                is_final_epoch, False)
  ppo_hparams.save_models_every_epochs = 10
  ppo_hparams.world_model_dir = world_model_dir
  ppo_hparams.add_hparam("force_beginning_resets", True)

  # Add model hparams for model-specific adjustments.
  model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
  ppo_hparams.add_hparam("model_hparams", model_hparams)

  environment_spec = copy.copy(environment_spec)
  environment_spec_param_names = [
      "simulation_random_starts", "simulation_flip_first_random_for_beginning",
      "intrinsic_reward_scale"
  ]
  for param_name in environment_spec_param_names:
    environment_spec.set_hparam(param_name, hparams.get(param_name))

  ppo_hparams.add_hparam("environment_spec", environment_spec)
  ppo_hparams.add_hparam(
      "initial_frame_chooser",
      InitialFrameChooser(environment_spec, mode=tf.estimator.ModeKeys.EVAL))

  # TODO(koz4k): Pass by arguments.
  with temporary_flags({
      "problem": environment_spec.initial_frames_problem,
      "model": hparams.generative_model,
      "hparams_set": hparams.generative_model_params,
      "output_dir": world_model_dir,
      "data_dir": epoch_data_dir,
  }):
    rl_trainer_lib.train(ppo_hparams, event_dir + "sim", agent_model_dir,
                         name_scope="ppo_sim%d" % (epoch + 1))
def train_agent_real_env(problem_name, agent_model_dir, event_dir,
                         world_model_dir, epoch_data_dir, hparams,
                         epoch=0, is_final_epoch=False):
  """Train the PPO agent in the real environment."""
  global dumper_path, ppo_data_dumper_counter

  gym_problem = registry.problem(problem_name)
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = [
      "epochs_num", "epoch_length", "learning_rate", "num_agents",
      "eval_every_epochs", "optimization_epochs", "effective_num_agents"
  ]

  # This should be overridden.
  ppo_hparams.add_hparam("effective_num_agents", None)
  for param_name in ppo_params_names:
    ppo_param_name = "real_ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  ppo_hparams.epochs_num = _ppo_training_epochs(hparams, epoch,
                                                is_final_epoch, True)
  # We do not save the model, as that resets frames that we need at restarts.
  # But we need to save at the last step, so we set it very high.
  ppo_hparams.save_models_every_epochs = 1000000

  environment_spec = copy.copy(gym_problem.environment_spec)

  if hparams.gather_ppo_real_env_data:
    # TODO(piotrmilos): This should be refactored.
    assert hparams.real_ppo_num_agents == 1, (
        "It is required to use collect with pyfunc_wrapper")
    ppo_data_dumper_counter = 0
    dumper_path = os.path.join(epoch_data_dir, "dumper")
    tf.gfile.MakeDirs(dumper_path)
    dumper_spec = [PyFuncWrapper, {"process_fun": ppo_data_dumper}]
    environment_spec.wrappers.insert(2, dumper_spec)

  ppo_hparams.add_hparam("environment_spec", environment_spec)

  with temporary_flags({
      "problem": problem_name,
      "output_dir": world_model_dir,
      "data_dir": epoch_data_dir,
  }):
    rl_trainer_lib.train(ppo_hparams, event_dir + "real", agent_model_dir,
                         name_scope="ppo_real%d" % (epoch + 1))
def train_agent(problem_name, agent_model_dir, event_dir, world_model_dir,
                epoch_data_dir, hparams, epoch=0, is_final_epoch=False):
  """Train the PPO agent in the simulated environment."""
  gym_problem = registry.problem(problem_name)
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = [
      "epochs_num", "epoch_length", "learning_rate", "num_agents",
      "optimization_epochs"
  ]
  for param_name in ppo_params_names:
    ppo_param_name = "ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  ppo_epochs_num = hparams.ppo_epochs_num
  if is_final_epoch:
    # Train the agent for twice as long in the final epoch.
    ppo_epochs_num *= 2
    ppo_hparams.epoch_length *= 2
  ppo_hparams.save_models_every_epochs = ppo_epochs_num
  ppo_hparams.world_model_dir = world_model_dir
  ppo_hparams.add_hparam("force_beginning_resets", True)

  # Add model hparams for model-specific adjustments.
  model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
  ppo_hparams.add_hparam("model_hparams", model_hparams)

  environment_spec = copy.copy(gym_problem.environment_spec)
  environment_spec.simulation_random_starts = hparams.simulation_random_starts
  do_flip = hparams.simulation_flip_first_random_for_beginning
  environment_spec.simulation_flip_first_random_for_beginning = do_flip
  environment_spec.intrinsic_reward_scale = hparams.intrinsic_reward_scale

  ppo_hparams.add_hparam("environment_spec", environment_spec)

  with temporary_flags({
      "problem": problem_name,
      "model": hparams.generative_model,
      "hparams_set": hparams.generative_model_params,
      "output_dir": world_model_dir,
      "data_dir": epoch_data_dir,
  }):
    rl_trainer_lib.train(ppo_hparams, event_dir, agent_model_dir, epoch=epoch,
                         name_scope="ppo_sim")
def train_agent(problem_name, agent_model_dir, event_dir, world_model_dir,
                epoch_data_dir, hparams, autoencoder_path=None, epoch=0):
  """Train the PPO agent in the simulated environment."""
  gym_problem = registry.problem(problem_name)
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_epochs_num = hparams.ppo_epochs_num
  ppo_hparams.epochs_num = ppo_epochs_num
  ppo_hparams.simulated_environment = True
  ppo_hparams.simulation_random_starts = hparams.simulation_random_starts
  ppo_hparams.intrinsic_reward_scale = hparams.intrinsic_reward_scale
  ppo_hparams.eval_every_epochs = 50
  ppo_hparams.save_models_every_epochs = ppo_epochs_num
  ppo_hparams.epoch_length = hparams.ppo_epoch_length
  ppo_hparams.num_agents = hparams.ppo_num_agents
  ppo_hparams.problem = gym_problem
  ppo_hparams.world_model_dir = world_model_dir
  if hparams.ppo_learning_rate:
    ppo_hparams.learning_rate = hparams.ppo_learning_rate

  # 4x for the StackAndSkipWrapper minus one to always finish for reporting.
  ppo_time_limit = (ppo_hparams.epoch_length - 1) * 4

  in_graph_wrappers = [(TimeLimitWrapper, {"timelimit": ppo_time_limit}),
                       (StackAndSkipWrapper, {"skip": 4})]
  in_graph_wrappers += gym_problem.in_graph_wrappers
  ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)

  with temporary_flags({
      "problem": problem_name,
      "model": hparams.generative_model,
      "hparams_set": hparams.generative_model_params,
      "output_dir": world_model_dir,
      "data_dir": epoch_data_dir,
      "autoencoder_path": autoencoder_path,
  }):
    rl_trainer_lib.train(ppo_hparams, gym_problem.env_name, event_dir,
                         agent_model_dir, epoch=epoch)
def train_agent(problem_name, agent_model_dir, event_dir, world_model_dir,
                epoch_data_dir, hparams, autoencoder_path=None, epoch=0):
  """Train the PPO agent in the simulated environment."""
  gym_problem = registry.problem(problem_name)
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_epochs_num = hparams.ppo_epochs_num
  ppo_hparams.epochs_num = ppo_epochs_num
  ppo_hparams.eval_every_epochs = 50
  ppo_hparams.save_models_every_epochs = ppo_epochs_num
  ppo_hparams.epoch_length = hparams.ppo_epoch_length
  ppo_hparams.num_agents = hparams.ppo_num_agents
  ppo_hparams.world_model_dir = world_model_dir
  ppo_hparams.add_hparam("force_beginning_resets", True)
  if hparams.ppo_learning_rate:
    ppo_hparams.learning_rate = hparams.ppo_learning_rate

  # Add model hparams for model-specific adjustments.
  model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
  ppo_hparams.add_hparam("model_hparams", model_hparams)

  environment_spec = copy.copy(gym_problem.environment_spec)
  environment_spec.simulation_random_starts = hparams.simulation_random_starts
  environment_spec.intrinsic_reward_scale = hparams.intrinsic_reward_scale

  ppo_hparams.add_hparam("environment_spec", environment_spec)

  with temporary_flags({
      "problem": problem_name,
      "model": hparams.generative_model,
      "hparams_set": hparams.generative_model_params,
      "output_dir": world_model_dir,
      "data_dir": epoch_data_dir,
      "autoencoder_path": autoencoder_path,
  }):
    rl_trainer_lib.train(ppo_hparams, event_dir, agent_model_dir, epoch=epoch)
def train_agent_real_env(problem_name, agent_model_dir, event_dir,
                         world_model_dir, epoch_data_dir, hparams,
                         epoch=0, is_final_epoch=False):
  """Train the PPO agent in the real environment."""
  del epoch, is_final_epoch

  gym_problem = registry.problem(problem_name)
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = [
      "epochs_num", "epoch_length", "learning_rate", "num_agents",
      "optimization_epochs"
  ]
  for param_name in ppo_params_names:
    ppo_param_name = "real_ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  ppo_epochs_num = hparams.real_ppo_epochs_num
  if ppo_epochs_num == 0:
    return
  ppo_hparams.save_models_every_epochs = ppo_epochs_num

  environment_spec = copy.copy(gym_problem.environment_spec)
  ppo_hparams.add_hparam("environment_spec", environment_spec)

  with temporary_flags({
      "problem": problem_name,
      "output_dir": world_model_dir,
      "data_dir": epoch_data_dir,
  }):
    # epoch = 10**20 is a hackish way to avoid skipping training.
    rl_trainer_lib.train(ppo_hparams, event_dir, agent_model_dir,
                         epoch=10**20, name_scope="ppo_real")
def train_agent_real_env(env, agent_model_dir, event_dir, data_dir, hparams,
                         completed_ppo_epochs_num, epoch=0,
                         is_final_epoch=False):
  """Train the PPO agent in the real environment."""
  del is_final_epoch, data_dir

  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = [
      "epochs_num", "epoch_length", "learning_rate", "num_agents",
      "eval_every_epochs", "optimization_epochs", "effective_num_agents"
  ]

  # This should be overridden.
  ppo_hparams.add_hparam("effective_num_agents", None)
  for param_name in ppo_params_names:
    ppo_param_name = "real_ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  completed_ppo_epochs_num += real_ppo_epoch_increment(hparams)
  ppo_hparams.epochs_num = completed_ppo_epochs_num
  # We do not save the model, as that resets frames that we need at restarts.
  # But we need to save at the last step, so we set it very high.
  ppo_hparams.save_models_every_epochs = 1000000

  ppo_hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
  ppo_hparams.add_hparam("force_beginning_resets", False)
  ppo_hparams.add_hparam("frame_stack_size", hparams.frame_stack_size)

  rl_trainer_lib.train(ppo_hparams, event_dir + "real", agent_model_dir,
                       name_scope="ppo_real%d" % (epoch + 1))

  # Save unfinished rollouts to history.
  env.reset()

  return completed_ppo_epochs_num
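# `real_ppo_epoch_increment` above (and `sim_ppo_epoch_increment` in the
# simulated counterpart) are not defined in these snippets. A plausible
# sketch, assuming they return how many PPO epochs to add to the cumulative
# `epochs_num` target, with the simulated variant doubled in the final loop
# epoch as in the `*= 2` logic of an earlier `train_agent` variant:
def sim_ppo_epoch_increment(hparams, is_final_epoch):
  increment = hparams.ppo_epochs_num
  if is_final_epoch:
    increment *= 2
  return increment

def real_ppo_epoch_increment(hparams):
  return hparams.real_ppo_epochs_num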
def train_agent(problem_name, agent_model_dir, event_dir, world_model_dir,
                epoch_data_dir, hparams, autoencoder_path=None, epoch=0):
  """Train the PPO agent in the simulated environment."""
  gym_problem = registry.problem(problem_name)
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = ["epochs_num", "epoch_length", "learning_rate",
                      "num_agents", "optimization_epochs"]
  for param_name in ppo_params_names:
    ppo_param_name = "ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  ppo_epochs_num = hparams.ppo_epochs_num
  ppo_hparams.save_models_every_epochs = ppo_epochs_num
  ppo_hparams.world_model_dir = world_model_dir
  ppo_hparams.add_hparam("force_beginning_resets", True)

  # Add model hparams for model-specific adjustments.
  model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
  ppo_hparams.add_hparam("model_hparams", model_hparams)

  environment_spec = copy.copy(gym_problem.environment_spec)
  environment_spec.simulation_random_starts = hparams.simulation_random_starts
  environment_spec.intrinsic_reward_scale = hparams.intrinsic_reward_scale

  ppo_hparams.add_hparam("environment_spec", environment_spec)

  with temporary_flags({
      "problem": problem_name,
      "model": hparams.generative_model,
      "hparams_set": hparams.generative_model_params,
      "output_dir": world_model_dir,
      "data_dir": epoch_data_dir,
      "autoencoder_path": autoencoder_path,
  }):
    rl_trainer_lib.train(ppo_hparams, event_dir, agent_model_dir, epoch=epoch)
def test_no_crash_pendulum(self):
  hparams = trainer_lib.create_hparams(
      "continuous_action_base", "epochs_num=11,video_during_eval=False")
  rl_trainer_lib.train(hparams, "Pendulum-v0")
def test_no_crash_cartpole(self):
  hparams = trainer_lib.create_hparams(
      "discrete_action_base", "epochs_num=11,video_during_eval=False")
  rl_trainer_lib.train(hparams, "CartPole-v0")
def main(_):
  hparams = trainer_lib.create_hparams(FLAGS.hparams_set, FLAGS.hparams)
  rl_trainer_lib.train(hparams, FLAGS.output_dir, FLAGS.output_dir)
def train(hparams, output_dir, report_fn=None):
  hparams = initialize_env_specs(hparams)
  rl_trainer_lib.train(hparams, output_dir, output_dir, report_fn=report_fn)
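# A minimal invocation sketch for the wrapper above; the hparams-set name is
# taken from the `pong_model_free` test below, and the output path is
# illustrative:
hparams = registry.hparams("pong_model_free")
train(hparams, "/tmp/ppo_pong")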
def train_agent(real_env, agent_model_dir, event_dir, world_model_dir,
                data_dir, hparams, ppo_epochs_num, epoch=0,
                is_final_epoch=False):
  """Train the PPO agent in the simulated environment."""
  del data_dir

  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = [
      "epochs_num", "epoch_length", "learning_rate", "num_agents",
      "optimization_epochs", "eval_every_epochs"
  ]
  for param_name in ppo_params_names:
    ppo_param_name = "ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  ppo_epochs_num += sim_ppo_epoch_increment(hparams, is_final_epoch)
  ppo_hparams.epochs_num = ppo_epochs_num
  ppo_hparams.save_models_every_epochs = 10
  ppo_hparams.world_model_dir = world_model_dir

  environment_spec_params = {
      param_name: hparams.get(param_name)
      for param_name in [
          "intrinsic_reward_scale", "simulation_random_starts",
          "simulation_flip_first_random_for_beginning"
      ]
  }
  environment_spec_params.update({
      "model_name": hparams.generative_model,
      "model_hparams":
          trainer_lib.create_hparams(hparams.generative_model_params),
      # Hardcoded for now. TODO(koz4k): Make it a hparam.
      "video_num_input_frames": 4,
      "video_num_target_frames": 1
  })
  environment_spec = rl.standard_atari_env_simulated_spec(
      real_env, **environment_spec_params)

  with tf.Session() as sess:
    encoded_png_p = tf.placeholder(tf.string)
    decoded_png_t = tf.image.decode_png(encoded_png_p)

    def decode_png(encoded_png):
      return sess.run(decoded_png_t, feed_dict={encoded_png_p: encoded_png})

    num_input_frames = environment_spec.video_num_input_frames
    initial_frame_rollouts = real_env.current_epoch_rollouts(
        split=tf.contrib.learn.ModeKeys.TRAIN,
        minimal_rollout_frames=num_input_frames,
    )

    # TODO(koz4k): Move this to a different module.
    def initial_frame_chooser(batch_size):
      """Frame chooser."""
      deterministic_initial_frames = (
          initial_frame_rollouts[0][:num_input_frames])
      if not environment_spec.simulation_random_starts:
        # Deterministic starts: repeat first frames from the first rollout.
        initial_frames = [deterministic_initial_frames] * batch_size
      else:
        # Random starts: choose random initial frames from random rollouts.
        # TODO(koz4k): Weigh rollouts by their lengths so sampling is uniform
        # over frames and not rollouts.
        def choose_initial_frames():
          try:
            rollout = random.choice(initial_frame_rollouts)
            from_index = random.randrange(len(rollout) - num_input_frames + 1)
            return rollout[from_index:(from_index + num_input_frames)]
          except ValueError:
            # Rollout too short; repeat.
            return choose_initial_frames()

        initial_frames = [choose_initial_frames() for _ in range(batch_size)]
        if environment_spec.simulation_flip_first_random_for_beginning:
          # Flip first entry in the batch for deterministic initial frames.
          initial_frames[0] = deterministic_initial_frames

      return np.stack([
          [decode_png(frame.observation) for frame in initial_frame_stack]
          for initial_frame_stack in initial_frames
      ])

    environment_spec.add_hparam("initial_frame_chooser", initial_frame_chooser)
    ppo_hparams.add_hparam("environment_spec", environment_spec)

    rl_trainer_lib.train(ppo_hparams, event_dir + "sim", agent_model_dir,
                         name_scope="ppo_sim%d" % (epoch + 1))

  return ppo_epochs_num
def test_no_crash_pendulum(self):
  hparams = trainer_lib.create_hparams("pendulum_base", "epochs_num=10")
  rl_trainer_lib.train(hparams, "Pendulum-v0")
def test_no_crash_cartpole(self):
  hparams = trainer_lib.create_hparams("cartpole_base", "epochs_num=10")
  rl_trainer_lib.train(hparams, "CartPole-v0")
def train(hparams, output_dir):
  """Training function."""
  prefix = output_dir
  data_dir = os.path.expanduser(prefix + "/data")
  tmp_dir = os.path.expanduser(prefix + "/tmp")
  output_dir = os.path.expanduser(prefix + "/output")
  tf.gfile.MakeDirs(data_dir)
  tf.gfile.MakeDirs(tmp_dir)
  tf.gfile.MakeDirs(output_dir)

  last_model = ""
  start_time = time.time()
  line = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> "
  for iloop in range(hparams.epochs):
    # 1. Generate data from the policy.
    time_delta = time.time() - start_time
    print(line + "Step {}.1. - generate data from policy. "
          "Time: {}".format(iloop,
                            str(datetime.timedelta(seconds=time_delta))))
    FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
    FLAGS.agent_policy_path = last_model
    gym_problem = registry.problem(FLAGS.problem)
    gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
    iter_data_dir = os.path.join(data_dir, str(iloop))
    tf.gfile.MakeDirs(iter_data_dir)
    gym_problem.generate_data(iter_data_dir, tmp_dir)

    # 2. Generate env model.
    time_delta = time.time() - start_time
    print(line + "Step {}.2. - generate env model. "
          "Time: {}".format(iloop,
                            str(datetime.timedelta(seconds=time_delta))))
    FLAGS.data_dir = iter_data_dir
    FLAGS.output_dir = output_dir
    FLAGS.model = hparams.generative_model
    FLAGS.hparams_set = hparams.generative_model_params
    FLAGS.train_steps = hparams.model_train_steps * (iloop + 2)
    FLAGS.eval_steps = 10
    t2t_trainer.main([])

    # 3. Evaluate env model; dump frames from it.
    time_delta = time.time() - start_time
    print(line + "Step {}.3. - evaluate env model. "
          "Time: {}".format(iloop,
                            str(datetime.timedelta(seconds=time_delta))))
    gym_simulated_problem = registry.problem(
        "gym_simulated_discrete_problem_with_agent_on_%s" % hparams.game)
    sim_steps = hparams.simulated_env_generator_num_steps
    gym_simulated_problem.settable_num_steps = sim_steps
    gym_simulated_problem.generate_data(iter_data_dir, tmp_dir)

    # 4. Train PPO in the model env.
    time_delta = time.time() - start_time
    print(line + "Step {}.4. - train PPO in model env. "
          "Time: {}".format(iloop,
                            str(datetime.timedelta(seconds=time_delta))))
    ppo_epochs_num = hparams.ppo_epochs_num
    ppo_hparams = trainer_lib.create_hparams(
        "atari_base",
        "epochs_num={},simulated_environment=True,eval_every_epochs=0,"
        "save_models_every_epochs={}".format(ppo_epochs_num + 1,
                                             ppo_epochs_num),
        data_dir=output_dir)
    ppo_hparams.epoch_length = hparams.ppo_epoch_length
    ppo_dir = tempfile.mkdtemp(dir=data_dir, prefix="ppo_")
    in_graph_wrappers = [
        (TimeLimitWrapper, {"timelimit": hparams.ppo_time_limit}),
        (MaxAndSkipWrapper, {"skip": 4})]
    in_graph_wrappers += gym_problem.in_graph_wrappers
    ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)
    ppo_hparams.num_agents = hparams.ppo_num_agents
    rl_trainer_lib.train(ppo_hparams, gym_simulated_problem.env_name, ppo_dir)
    last_model = ppo_dir + "/model{}.ckpt".format(ppo_epochs_num)
def test_train_pong(self):
  hparams = registry.hparams("pong_model_free")
  hparams.epochs_num = 2
  hparams.num_agents = 2
  hparams.epoch_length = 3
  rl_trainer_lib.train(hparams)
def test_no_crash_cartpole(self):
  hparams = trainer_lib.create_hparams(
      "discrete_action_base", TrainTest.test_config)
  rl_trainer_lib.train(hparams, "CartPole-v0")
def test_no_crash_pendulum(self):
  hparams = trainer_lib.create_hparams(
      "continuous_action_base", TrainTest.test_config)
  rl_trainer_lib.train(hparams, "Pendulum-v0")
def train(hparams, output_dir):
  """Training function."""
  prefix = output_dir
  data_dir = os.path.expanduser(prefix + "/data")
  tmp_dir = os.path.expanduser(prefix + "/tmp")
  output_dir = os.path.expanduser(prefix + "/output")
  autoencoder_dir = os.path.expanduser(prefix + "/autoencoder")
  tf.gfile.MakeDirs(data_dir)
  tf.gfile.MakeDirs(tmp_dir)
  tf.gfile.MakeDirs(output_dir)
  tf.gfile.MakeDirs(autoencoder_dir)

  last_model = ""
  start_time = time.time()
  line = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> "
  epoch_metrics = []
  iter_data_dirs = []
  ae_data_dirs = []
  orig_autoencoder_path = FLAGS.autoencoder_path
  for iloop in range(hparams.epochs):
    # Train autoencoder if needed.
    if (hparams.autoencoder_train_steps > 0 and iloop == 0 and
        not orig_autoencoder_path):
      time_delta = time.time() - start_time
      tf.logging.info("%s Step AE - train autoencoder. Time: %s",
                      line, str(datetime.timedelta(seconds=time_delta)))
      with tf.Graph().as_default():
        # Generate data.
        FLAGS.autoencoder_path = ""
        FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
        FLAGS.agent_policy_path = ""
        gym_problem = registry.problem(FLAGS.problem)
        gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
        ae_data_dir = os.path.join(data_dir, "ae%d" % iloop)
        ae_data_dirs.append(ae_data_dir)
        tf.gfile.MakeDirs(ae_data_dir)
        gym_problem.generate_data(ae_data_dir, tmp_dir)
        if ae_data_dirs[:-1]:
          combine_world_model_train_data(gym_problem, ae_data_dir,
                                         ae_data_dirs[:-1])
        # Train AE.
        FLAGS.data_dir = ae_data_dir
        FLAGS.output_dir = autoencoder_dir
        # TODO(lukaszkaiser): make non-hardcoded here and in gym_problems.py.
        FLAGS.model = "autoencoder_ordered_discrete"
        FLAGS.hparams_set = "autoencoder_discrete_pong"
        FLAGS.train_steps = hparams.autoencoder_train_steps * (iloop + 2)
        FLAGS.eval_steps = 100
        t2t_trainer.main([])
        FLAGS.autoencoder_path = autoencoder_dir

    # Generate random frames.
    if iloop == 0:
      time_delta = time.time() - start_time
      tf.logging.info("%s Step %d.0 - generate random data. Time: %s",
                      line, iloop,
                      str(datetime.timedelta(seconds=time_delta)))
      FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
      FLAGS.agent_policy_path = ""
      gym_problem = registry.problem(FLAGS.problem)
      gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
      iter_data_dir = os.path.join(data_dir, "0random")
      iter_data_dirs.append(iter_data_dir)
      tf.gfile.MakeDirs(iter_data_dir)
      gym_problem.generate_data(iter_data_dir, tmp_dir)
      mean_reward = gym_problem.sum_of_rewards / max(1.0, gym_problem.dones)
      tf.logging.info("%s Step 0.0 random reward: %.4f" % (line, mean_reward))

    # Train env model.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.1 - generate env model. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    FLAGS.data_dir = iter_data_dir
    FLAGS.output_dir = output_dir
    FLAGS.model = hparams.generative_model
    FLAGS.hparams_set = hparams.generative_model_params
    FLAGS.train_steps = hparams.model_train_steps * (iloop + 2)
    FLAGS.eval_steps = 100
    t2t_trainer.main([])

    # Evaluate and dump frames from env model.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.1a - evaluate env model. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    gym_simulated_problem = registry.problem(
        "gym_simulated_discrete_problem_with_agent_on_%s" % hparams.game)
    sim_steps = hparams.simulated_env_generator_num_steps
    gym_simulated_problem.settable_num_steps = sim_steps
    gym_simulated_problem.real_env_problem = gym_problem
    gym_simulated_problem.simulation_random_starts = False
    gym_simulated_problem.intrinsic_reward_scale = 0.
    gym_simulated_problem.generate_data(iter_data_dir, tmp_dir)
    model_reward_accuracy = 0.0
    if gym_simulated_problem.dones != 0:
      n = float(gym_simulated_problem.dones)
      model_reward_accuracy = (
          gym_simulated_problem.successful_episode_reward_predictions / n)
    tf.logging.info("%s Step %d.1a env model reward accuracy: %.4f" %
                    (line, iloop, model_reward_accuracy))

    # Train PPO agent.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.2 - train PPO in model env. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    # Set up PPO hparams.
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params,
                                             data_dir=output_dir)
    ppo_epochs_num = hparams.ppo_epochs_num
    ppo_hparams.epochs_num = ppo_epochs_num
    ppo_hparams.simulated_environment = True
    ppo_hparams.simulation_random_starts = hparams.simulation_random_starts
    ppo_hparams.intrinsic_reward_scale = hparams.intrinsic_reward_scale
    ppo_hparams.eval_every_epochs = 0
    ppo_hparams.save_models_every_epochs = ppo_epochs_num
    ppo_hparams.epoch_length = hparams.ppo_epoch_length
    ppo_hparams.num_agents = hparams.ppo_num_agents
    ppo_hparams.problem = gym_problem
    in_graph_wrappers = [
        (TimeLimitWrapper, {"timelimit": hparams.ppo_time_limit}),
        (MaxAndSkipWrapper, {"skip": 4})]
    in_graph_wrappers += gym_problem.in_graph_wrappers
    ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)
    ppo_dir = generator_utils.make_tmp_dir(dir=data_dir, prefix="ppo_")
    rl_trainer_lib.train(ppo_hparams, gym_simulated_problem.env_name, ppo_dir)
    last_model = ppo_dir

    # Evaluate agent.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.3 - evaluate agent. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
    FLAGS.agent_policy_path = last_model
    eval_gym_problem = registry.problem(FLAGS.problem)
    eval_gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
    eval_gym_problem.eval_runs = 5
    eval_data_dir = os.path.join(data_dir, str(iloop) + "eval")
    iter_data_dirs.append(eval_data_dir)
    tf.gfile.MakeDirs(eval_data_dir)
    eval_gym_problem.generate_data(eval_data_dir, tmp_dir)

    # Generate environment frames.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.4 - generate environment data. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    gym_problem = registry.problem(FLAGS.problem)
    gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
    iter_data_dir = os.path.join(data_dir, str(iloop))
    iter_data_dirs.append(iter_data_dir)
    tf.gfile.MakeDirs(iter_data_dir)
    gym_problem.generate_data(iter_data_dir, tmp_dir)
    combine_world_model_train_data(gym_problem, iter_data_dir,
                                   iter_data_dirs[:-1])

    mean_reward = 0.0
    if eval_gym_problem.dones != 0:
      mean_reward = eval_gym_problem.sum_of_rewards / float(
          eval_gym_problem.dones)
    tf.logging.info("%s Step %d mean reward: %.4f" %
                    (line, iloop, mean_reward))

    # Report metrics.
    eval_metrics = {"model_reward_accuracy": model_reward_accuracy,
                    "mean_reward": mean_reward}
    epoch_metrics.append(eval_metrics)

  # Report the evaluation metrics from the final epoch.
  return epoch_metrics[-1]
def main(_):
  hparams = trainer_lib.create_hparams(FLAGS.hparams_set, FLAGS.hparams)
  rl_trainer_lib.train(hparams, FLAGS.problem, FLAGS.output_dir)
def main(_):
  rl_trainer_lib.train(rl_trainer_lib.example_params())
def train(hparams, output_dir):
  """Training function."""
  prefix = output_dir
  data_dir = os.path.expanduser(prefix + "/data")
  tmp_dir = os.path.expanduser(prefix + "/tmp")
  output_dir = os.path.expanduser(prefix + "/output")
  tf.gfile.MakeDirs(data_dir)
  tf.gfile.MakeDirs(tmp_dir)
  tf.gfile.MakeDirs(output_dir)

  last_model = ""
  start_time = time.time()
  line = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> "
  epoch_metrics = []
  for iloop in range(hparams.epochs):
    # Generate random frames.
    if iloop == 0:
      time_delta = time.time() - start_time
      tf.logging.info("%s Step %d.0 - generate random data. Time: %s",
                      line, iloop,
                      str(datetime.timedelta(seconds=time_delta)))
      FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
      FLAGS.agent_policy_path = ""
      gym_problem = registry.problem(FLAGS.problem)
      gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
      iter_data_dir = os.path.join(data_dir, "0random")
      tf.gfile.MakeDirs(iter_data_dir)
      gym_problem.generate_data(iter_data_dir, tmp_dir)
      mean_reward = gym_problem.sum_of_rewards / max(1.0, gym_problem.dones)
      tf.logging.info("%s Step 0.0 random reward: %.4f" % (line, mean_reward))

    # Train env model.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.1 - generate env model. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    FLAGS.data_dir = iter_data_dir
    FLAGS.output_dir = output_dir
    FLAGS.model = hparams.generative_model
    FLAGS.hparams_set = hparams.generative_model_params
    FLAGS.train_steps = hparams.model_train_steps * (iloop + 2)
    FLAGS.eval_steps = 10
    t2t_trainer.main([])

    # Evaluate and dump frames from env model.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.1a - evaluate env model. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    gym_simulated_problem = registry.problem(
        "gym_simulated_discrete_problem_with_agent_on_%s" % hparams.game)
    sim_steps = hparams.simulated_env_generator_num_steps
    gym_simulated_problem.settable_num_steps = sim_steps
    gym_simulated_problem.real_env_problem = gym_problem
    gym_simulated_problem.generate_data(iter_data_dir, tmp_dir)
    model_reward_accuracy = 0.0
    if gym_simulated_problem.dones != 0:
      n = float(gym_simulated_problem.dones)
      model_reward_accuracy = (
          gym_simulated_problem.successful_episode_reward_predictions / n)

    # Train PPO agent.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.2 - train PPO in model env. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    # Set up PPO hparams.
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params,
                                             data_dir=output_dir)
    ppo_epochs_num = hparams.ppo_epochs_num
    ppo_hparams.epochs_num = ppo_epochs_num
    ppo_hparams.simulated_environment = True
    ppo_hparams.eval_every_epochs = 0
    ppo_hparams.save_models_every_epochs = ppo_epochs_num
    ppo_hparams.epoch_length = hparams.ppo_epoch_length
    ppo_hparams.num_agents = hparams.ppo_num_agents
    ppo_hparams.problem = gym_problem
    in_graph_wrappers = [
        (TimeLimitWrapper, {"timelimit": hparams.ppo_time_limit}),
        (MaxAndSkipWrapper, {"skip": 4})]
    in_graph_wrappers += gym_problem.in_graph_wrappers
    ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)
    ppo_dir = generator_utils.make_tmp_dir(dir=data_dir, prefix="ppo_")
    rl_trainer_lib.train(ppo_hparams, gym_simulated_problem.env_name, ppo_dir)
    last_model = ppo_dir

    # Generate environment frames.
    time_delta = time.time() - start_time
    tf.logging.info("%s Step %d.3 - generate environment data. Time: %s",
                    line, iloop, str(datetime.timedelta(seconds=time_delta)))
    FLAGS.problem = "gym_discrete_problem_with_agent_on_%s" % hparams.game
    FLAGS.agent_policy_path = last_model
    gym_problem = registry.problem(FLAGS.problem)
    gym_problem.settable_num_steps = hparams.true_env_generator_num_steps
    iter_data_dir = os.path.join(data_dir, str(iloop))
    tf.gfile.MakeDirs(iter_data_dir)
    gym_problem.generate_data(iter_data_dir, tmp_dir)

    mean_reward = 0.0
    if gym_problem.dones != 0:
      mean_reward = gym_problem.sum_of_rewards / float(gym_problem.dones)
    tf.logging.info("%s Step %d mean reward: %.4f" %
                    (line, iloop, mean_reward))

    # Report metrics.
    eval_metrics = {"model_reward_accuracy": model_reward_accuracy,
                    "mean_reward": mean_reward}
    epoch_metrics.append(eval_metrics)

  # Report the evaluation metrics from the final epoch.
  return epoch_metrics[-1]