def initialize_env_specs(hparams):
  """Initializes env_specs using T2TGymEnvs."""
  if getattr(hparams, "game", None):
    game_name = gym_env.camel_case_name(hparams.game)
    env = gym_env.T2TGymEnv("{}Deterministic-v4".format(game_name),
                            batch_size=hparams.batch_size)
    env.start_new_epoch(0)
    hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
    eval_env = gym_env.T2TGymEnv("{}Deterministic-v4".format(game_name),
                                 batch_size=hparams.eval_batch_size)
    eval_env.start_new_epoch(0)
    hparams.add_hparam("eval_env_fn", rl.make_real_env_fn(eval_env))
  return hparams


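# The variant above builds the Gym environment id from a snake_case game name
# ("pong" -> "PongDeterministic-v4"). A minimal, self-contained sketch of that
# naming step; camel_case_name below is a local stand-in used only to
# illustrate the convention, not the gym_env implementation itself.
def camel_case_name(snake_case_name):
  """E.g. "kung_fu_master" -> "KungFuMaster"."""
  return "".join(word.capitalize() for word in snake_case_name.split("_"))


assert camel_case_name("pong") == "Pong"
assert "{}Deterministic-v4".format(
    camel_case_name("kung_fu_master")) == "KungFuMasterDeterministic-v4"

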
def initialize_env_specs(hparams):
  """Initializes env_specs using T2TGymEnvs."""
  env = rl_utils.setup_env(hparams, hparams.batch_size,
                           hparams.eval_max_num_noops)
  env.start_new_epoch(0)
  hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
  return hparams


def _eval_fn_with_learner(env, hparams, policy_hparams, policy_dir,
                          sampling_temp):
  """Evaluates the policy in env using the learner for hparams.base_algo."""
  env_fn = rl.make_real_env_fn(env)
  learner = LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, base_event_dir=None,
      agent_model_dir=policy_dir, total_num_epochs=1
  )
  learner.evaluate(env_fn, policy_hparams, sampling_temp)


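# _eval_fn_with_learner and the evaluate_single_config variants below resolve
# a learner class through a LEARNERS mapping keyed by hparams.base_algo. A
# self-contained sketch of that lookup pattern; the stub class and the
# "ppo"/"dqn" keys are illustrative assumptions, not the actual learner
# implementations or their full interface.
class _StubLearner(object):
  """Stand-in exposing the constructor/evaluate shape used above."""

  def __init__(self, frame_stack_size, base_event_dir=None,
               agent_model_dir=None, total_num_epochs=1):
    self.frame_stack_size = frame_stack_size
    self.base_event_dir = base_event_dir
    self.agent_model_dir = agent_model_dir
    self.total_num_epochs = total_num_epochs

  def evaluate(self, env_fn, policy_hparams, sampling_temp):
    del env_fn, policy_hparams, sampling_temp  # No-op in this sketch.


EXAMPLE_LEARNERS = {"ppo": _StubLearner, "dqn": _StubLearner}

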
def initialize_env_specs(hparams):
  """Initializes env_specs using T2TGymEnvs."""
  env = rl_utils.setup_env(hparams, hparams.batch_size,
                           hparams.eval_max_num_noops)
  env.start_new_epoch(0)
  # TODO(afrozm): Decouple env_fn from hparams and return both, is there
  # even a need to return hparams? Just return the env_fn?
  hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
  return hparams


def initialize_env_specs(hparams):
  """Initializes env_specs using T2TGymEnvs."""
  env = rl_utils.setup_env(hparams, hparams.batch_size,
                           hparams.eval_max_num_noops,
                           hparams.rl_env_max_episode_steps,
                           env_name=hparams.rl_env_name)
  env.start_new_epoch(0)
  return rl.make_real_env_fn(env)


def initialize_env_specs(hparams, env_problem_name):
  """Initializes env_specs using the appropriate env."""
  if env_problem_name:
    env = registry.env_problem(env_problem_name,
                               batch_size=hparams.batch_size)
  else:
    env = rl_utils.setup_env(hparams, hparams.batch_size,
                             hparams.eval_max_num_noops,
                             hparams.rl_env_max_episode_steps,
                             env_name=hparams.rl_env_name)
    env.start_new_epoch(0)
  return rl.make_real_env_fn(env)


def train_agent_real_env(env, learner, hparams, epoch):
  """Train the PPO agent in the real environment."""
  base_algo_str = hparams.base_algo

  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  update_hparams_from_hparams(
      train_hparams, hparams, "real_" + base_algo_str + "_"
  )

  env_fn = rl.make_real_env_fn(env)
  num_env_steps = real_env_step_increment(hparams)
  learner.train(
      env_fn, train_hparams, simulated=False, save_continuously=False,
      epoch=epoch, num_env_steps=num_env_steps
  )
  # Save unfinished rollouts to history.
  env.reset()


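# update_hparams_from_hparams copies "real_<algo>_"-prefixed hyperparameters
# from the model-based training hparams onto the algorithm's own hparams
# (e.g. real_ppo_learning_rate overriding learning_rate), as the per-param
# loop in the older variant further below also shows. A minimal sketch of
# that prefix convention over plain dicts; the real helper works on HParams
# objects, so this illustrates the idea only, not the actual function.
def copy_prefixed_params(target, source, prefix):
  """Copy source["<prefix><name>"] into target["<name>"] when present."""
  for key, value in source.items():
    if key.startswith(prefix):
      target[key[len(prefix):]] = value
  return target


ppo_params = {"learning_rate": 1e-4, "epochs_num": 10000}
loop_params = {"real_ppo_learning_rate": 2e-4, "game": "pong"}
copy_prefixed_params(ppo_params, loop_params, "real_ppo_")
assert ppo_params["learning_rate"] == 2e-4

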
def evaluate_single_config(hparams, stochastic, max_num_noops,
                           agent_model_dir):
  """Evaluate the PPO agent in the real environment."""
  eval_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  env = setup_env(hparams, batch_size=hparams.eval_batch_size,
                  max_num_noops=max_num_noops)
  env.start_new_epoch(0)
  env_fn = rl.make_real_env_fn(env)
  learner = LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, base_event_dir=None,
      agent_model_dir=agent_model_dir
  )
  learner.evaluate(env_fn, eval_hparams, stochastic)
  rollouts = env.current_epoch_rollouts()
  env.close()

  return tuple(
      compute_mean_reward(rollouts, clipped) for clipped in (True, False))


def evaluate_single_config(hparams, stochastic, max_num_noops,
                           agent_model_dir):
  """Evaluate the PPO agent in the real environment."""
  eval_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  env = setup_env(
      hparams, batch_size=hparams.eval_batch_size, max_num_noops=max_num_noops
  )
  env.start_new_epoch(0)
  env_fn = rl.make_real_env_fn(env)
  learner = LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, base_event_dir=None,
      agent_model_dir=agent_model_dir
  )
  learner.evaluate(env_fn, eval_hparams, stochastic)
  rollouts = env.current_epoch_rollouts()
  env.close()

  return tuple(
      compute_mean_reward(rollouts, clipped) for clipped in (True, False)
  )


def evaluate_single_config(hparams, agent_model_dir):
  """Evaluate the PPO agent in the real environment."""
  eval_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  eval_hparams.num_agents = hparams.num_agents
  eval_hparams.add_hparam("stochastic", hparams.stochastic)
  env = setup_env(hparams, batch_size=hparams.num_agents)
  env.start_new_epoch(0)
  env_fn = rl.make_real_env_fn(env)
  learner = LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, event_dir=None,
      agent_model_dir=agent_model_dir
  )
  learner.evaluate(env_fn, eval_hparams, eval_hparams.stochastic)
  rollouts = env.current_epoch_rollouts()[:hparams.num_agents]
  env.close()

  assert len(rollouts) == hparams.num_agents, "{} {}".format(
      len(rollouts), hparams.num_agents)
  return tuple(
      compute_mean_reward(rollouts, clipped) for clipped in (True, False))


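# The evaluate_single_config variants reduce the epoch's rollouts to a pair
# of mean returns (clipped, unclipped). A self-contained sketch of that
# reduction, assuming each rollout is a sequence of frames carrying per-step
# reward, unclipped_reward and done fields; these field names are an
# assumption for illustration, not the exact rollout format.
import collections

Frame = collections.namedtuple("Frame", ["reward", "unclipped_reward", "done"])


def mean_rollout_reward(rollouts, clipped):
  """Mean total reward over finished rollouts, clipped or unclipped."""
  field = "reward" if clipped else "unclipped_reward"
  totals = [
      sum(getattr(frame, field) for frame in rollout)
      for rollout in rollouts if rollout and rollout[-1].done
  ]
  return sum(totals) / len(totals) if totals else 0.0


rollout = [Frame(reward=1.0, unclipped_reward=3.0, done=False),
           Frame(reward=1.0, unclipped_reward=2.0, done=True)]
assert mean_rollout_reward([rollout], clipped=True) == 2.0
assert mean_rollout_reward([rollout], clipped=False) == 5.0

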
def train_agent_real_env(env, agent_model_dir, event_dir, data_dir, hparams,
                         completed_ppo_epochs_num, epoch=0,
                         is_final_epoch=False):
  """Train the PPO agent in the real environment."""
  del is_final_epoch, data_dir

  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = [
      "epochs_num", "epoch_length", "learning_rate", "num_agents",
      "eval_every_epochs", "optimization_epochs", "effective_num_agents"
  ]

  # This should be overridden.
  ppo_hparams.add_hparam("effective_num_agents", None)
  for param_name in ppo_params_names:
    ppo_param_name = "real_ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  completed_ppo_epochs_num += real_ppo_epoch_increment(hparams)
  ppo_hparams.epochs_num = completed_ppo_epochs_num
  # We do not save model, as that resets frames that we need at restarts.
  # But we need to save at the last step, so we set it very high.
  ppo_hparams.save_models_every_epochs = 1000000

  ppo_hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
  ppo_hparams.add_hparam("force_beginning_resets", False)
  ppo_hparams.add_hparam("frame_stack_size", hparams.frame_stack_size)

  rl_trainer_lib.train(ppo_hparams, event_dir + "real", agent_model_dir,
                       name_scope="ppo_real%d" % (epoch + 1))

  # Save unfinished rollouts to history.
  env.reset()

  return completed_ppo_epochs_num


def evaluate_single_config(hparams, agent_model_dir):
  """Evaluate the PPO agent in the real environment."""
  eval_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  eval_hparams.num_agents = hparams.num_agents
  env = setup_env(hparams, batch_size=hparams.num_agents)
  env_fn = rl.make_real_env_fn(env)
  eval_hparams.add_hparam("env_fn", env_fn)
  eval_hparams.add_hparam("policy_to_actions_lambda",
                          hparams.policy_to_actions_lambda)
  eval_hparams.add_hparam("frame_stack_size", hparams.frame_stack_size)
  eval_hparams.add_hparam("force_beginning_resets", False)
  env.start_new_epoch(0)
  rl_trainer_lib.evaluate(eval_hparams, agent_model_dir)
  rollouts = env.current_epoch_rollouts()[:hparams.num_agents]
  env.close()

  assert len(rollouts) == hparams.num_agents
  return tuple(
      compute_mean_reward(rollouts, clipped) for clipped in (True, False))


def train_agent_real_env(env, learner, hparams, epoch):
  """Train the PPO agent in the real environment."""
  base_algo_str = hparams.base_algo

  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      train_hparams, hparams, "real_" + base_algo_str + "_"
  )
  if hparams.wm_policy_param_sharing:
    train_hparams.optimizer_zero_grads = True

  env_fn = rl.make_real_env_fn(env)
  num_env_steps = real_env_step_increment(hparams)
  learner.train(
      env_fn, train_hparams, simulated=False, save_continuously=False,
      epoch=epoch, sampling_temp=hparams.real_sampling_temp,
      num_env_steps=num_env_steps,
  )
  # Save unfinished rollouts to history.
  env.reset()


def train_agent_real_env(env, agent_model_dir, event_dir, data_dir, hparams,
                         completed_epochs_num, epoch=0, is_final_epoch=False):
  """Train the PPO agent in the real environment."""
  del is_final_epoch, data_dir

  base_algo_str = hparams.base_algo

  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  _update_hparams_from_hparams(train_hparams, hparams,
                               "real_" + base_algo_str + "_")
  # TODO(konradczechowski): add effective_num_agents to ppo_atari_base etc.
  # this requires refactoring ppo.
  # This should be overridden.
  train_hparams.add_hparam("effective_num_agents",
                           hparams.real_ppo_effective_num_agents)

  completed_epochs_num += real_ppo_epoch_increment(hparams)

  env_fn = rl.make_real_env_fn(env)
  learner = LEARNERS[base_algo_str](hparams.frame_stack_size, event_dir,
                                    agent_model_dir)
  learner.train(env_fn, train_hparams, completed_epochs_num, simulated=False,
                epoch=epoch)
  # Save unfinished rollouts to history.
  env.reset()

  return completed_epochs_num