# NOTE: These functions are excerpts from a larger module and assume its
# module-level context: imports such as os, math, pprint, numpy as np and
# tensorflow as tf, plus the tensor2tensor helpers (rl_utils, trainer_lib,
# misc_utils, rl) and locally defined utilities like initialize_env_specs.


def train(hparams, output_dir, report_fn=None):
  """Train."""
  hparams = initialize_env_specs(hparams)
  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, output_dir, output_dir
  )
  policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      policy_hparams, hparams, hparams.base_algo + "_"
  )
  total_steps = policy_hparams.epochs_num
  eval_every_epochs = policy_hparams.eval_every_epochs
  if eval_every_epochs == 0:
    eval_every_epochs = total_steps
  # Evaluation is driven by the loop below, so disable the learner's own
  # periodic evaluation.
  policy_hparams.eval_every_epochs = 0
  steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
  # Always train up to total_steps, even when it is not a multiple of
  # eval_every_epochs.
  if not steps or steps[-1] < total_steps:
    steps.append(total_steps)
  metric_name = rl_utils.get_metric_name(
      sampling_temp=hparams.eval_sampling_temps[0],
      max_num_noops=hparams.eval_max_num_noops,
      clipped=False
  )
  for step in steps:
    policy_hparams.epochs_num = step
    learner.train(
        hparams.env_fn, policy_hparams, simulated=False,
        save_continuously=True, epoch=0
    )
    eval_metrics = rl_utils.evaluate_all_configs(hparams, output_dir)
    tf.logging.info(
        "Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics)))
    if report_fn:
      report_fn(eval_metrics[metric_name], step)
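
# A standalone sketch of the evaluation schedule computed in train() above.
# The function and the example values below are illustrative only.


def _example_eval_schedule(total_steps, eval_every_epochs):
  """Returns the step counts at which train() stops to run evaluation."""
  if eval_every_epochs == 0:
    eval_every_epochs = total_steps
  steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
  if not steps or steps[-1] < total_steps:
    steps.append(total_steps)
  return steps


# _example_eval_schedule(10, 4) == [4, 8, 10]
# _example_eval_schedule(10, 0) == [10]
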
def train_agent(real_env, learner, world_model_dir, hparams, epoch):
  """Train the PPO agent in the simulated environment."""
  initial_frame_chooser = rl_utils.make_initial_frame_chooser(
      real_env, hparams.frame_stack_size, hparams.simulation_random_starts,
      hparams.simulation_flip_first_random_for_beginning
  )
  env_fn = make_simulated_env_fn_from_hparams(
      real_env, hparams, batch_size=hparams.simulated_batch_size,
      initial_frame_chooser=initial_frame_chooser, model_dir=world_model_dir,
      sim_video_dir=os.path.join(
          learner.agent_model_dir, "sim_videos_{}".format(epoch)
      )
  )
  base_algo_str = hparams.base_algo
  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  if hparams.wm_policy_param_sharing:
    train_hparams.optimizer_zero_grads = True
  rl_utils.update_hparams_from_hparams(
      train_hparams, hparams, base_algo_str + "_"
  )
  final_epoch = hparams.epochs - 1
  is_special_epoch = (epoch + 3) == final_epoch or (epoch + 7) == final_epoch
  is_final_epoch = epoch == final_epoch
  env_step_multiplier = 3 if is_final_epoch else 2 if is_special_epoch else 1
  learner.train(
      env_fn, train_hparams, simulated=True, save_continuously=True,
      epoch=epoch, env_step_multiplier=env_step_multiplier
  )
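
# A standalone sketch of the env_step_multiplier schedule used in
# train_agent() above, with a hypothetical hparams.epochs of 15: the two
# "special" epochs (3 and 7 before the last) get twice the simulated steps,
# and the final epoch gets three times.


def _example_step_multiplier(epoch, num_epochs):
  final_epoch = num_epochs - 1
  is_special_epoch = (epoch + 3) == final_epoch or (epoch + 7) == final_epoch
  is_final_epoch = epoch == final_epoch
  return 3 if is_final_epoch else 2 if is_special_epoch else 1


# [_example_step_multiplier(e, 15) for e in range(15)]
# == [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 3]
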
def train_agent(real_env, learner, world_model_dir, hparams, epoch):
  """Train the PPO agent in the simulated environment."""
  frame_stack_size = hparams.frame_stack_size
  initial_frame_rollouts = real_env.current_epoch_rollouts(
      split=tf.contrib.learn.ModeKeys.TRAIN,
      minimal_rollout_frames=frame_stack_size,
  )

  # TODO(koz4k): Move this to a different module.
  def initial_frame_chooser(batch_size):
    """Frame chooser."""
    deterministic_initial_frames = (
        initial_frame_rollouts[0][:frame_stack_size])
    if not hparams.simulation_random_starts:
      # Deterministic starts: repeat first frames from the first rollout.
      initial_frames = [deterministic_initial_frames] * batch_size
    else:
      # Random starts: choose random initial frames from random rollouts.
      initial_frames = random_rollout_subsequences(
          initial_frame_rollouts, batch_size, frame_stack_size)
      if hparams.simulation_flip_first_random_for_beginning:
        # Flip first entry in the batch for deterministic initial frames.
        initial_frames[0] = deterministic_initial_frames

    return np.stack([
        [frame.observation.decode() for frame in initial_frame_stack]
        for initial_frame_stack in initial_frames
    ])

  env_fn = make_simulated_env_fn(
      real_env, hparams, hparams.simulated_batch_size, initial_frame_chooser,
      world_model_dir,
      os.path.join(learner.agent_model_dir, "sim_videos_{}".format(epoch)))
  base_algo_str = hparams.base_algo
  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  if hparams.wm_policy_param_sharing:
    train_hparams.optimizer_zero_grads = True
  rl_utils.update_hparams_from_hparams(
      train_hparams, hparams, base_algo_str + "_")
  final_epoch = hparams.epochs - 1
  is_special_epoch = (epoch + 3) == final_epoch or (epoch + 7) == final_epoch
  is_final_epoch = epoch == final_epoch
  env_step_multiplier = 3 if is_final_epoch else 2 if is_special_epoch else 1
  learner.train(
      env_fn, train_hparams, simulated=True, save_continuously=True,
      epoch=epoch, env_step_multiplier=env_step_multiplier)
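
# A self-contained sketch of the frame-chooser contract above, using plain
# numpy arrays in place of decoded rollout frames; the shapes and helper name
# are hypothetical, not part of the library API.


def _example_frame_chooser(rollout, frame_stack_size, random_starts):
  deterministic = rollout[:frame_stack_size]

  def chooser(batch_size):
    if not random_starts:
      stacks = [deterministic] * batch_size
    else:
      starts = np.random.randint(
          0, len(rollout) - frame_stack_size + 1, size=batch_size)
      stacks = [rollout[s:s + frame_stack_size] for s in starts]
    # Shape: (batch_size, frame_stack_size, height, width, channels).
    return np.stack(stacks)

  return chooser


# chooser = _example_frame_chooser(
#     np.zeros((100, 84, 84, 3), np.uint8), frame_stack_size=4,
#     random_starts=True)
# chooser(8).shape == (8, 4, 84, 84, 3)
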
def train_agent_real_env(env, learner, hparams, epoch):
  """Train the PPO agent in the real environment."""
  base_algo_str = hparams.base_algo
  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      train_hparams, hparams, "real_" + base_algo_str + "_")
  env_fn = rl.make_real_env_fn(env)
  num_env_steps = real_env_step_increment(hparams)
  learner.train(
      env_fn, train_hparams, simulated=False, save_continuously=False,
      epoch=epoch, num_env_steps=num_env_steps)
  # Save unfinished rollouts to history.
  env.reset()
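
# real_env_step_increment is defined elsewhere in the module. A plausible
# sketch, assuming it simply spreads the total real-frame budget evenly over
# the training epochs (the argument names here are illustrative):


def _example_real_env_step_increment(num_real_env_frames, epochs):
  return int(math.ceil(num_real_env_frames / float(epochs)))


# _example_real_env_step_increment(96000, 15) == 6400
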
def train_agent_real_env(env, learner, hparams, epoch):
  """Train the PPO agent in the real environment."""
  base_algo_str = hparams.base_algo
  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      train_hparams, hparams, "real_" + base_algo_str + "_")
  if hparams.wm_policy_param_sharing:
    train_hparams.optimizer_zero_grads = True
  env_fn = rl.make_real_env_fn(env)
  num_env_steps = real_env_step_increment(hparams)
  learner.train(
      env_fn,
      train_hparams,
      simulated=False,
      save_continuously=False,
      epoch=epoch,
      sampling_temp=hparams.real_sampling_temp,
      num_env_steps=num_env_steps,
  )
  # Save unfinished rollouts to history.
  env.reset()
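
# A toy sketch of the prefixed-override pattern used above: keys such as
# "real_ppo_learning_rate" in the loop hparams override "learning_rate" in
# the policy hparams. Plain dicts stand in for HParams objects; all names and
# values are hypothetical.


def _example_update_from_prefixed(target, source, prefix):
  for key, value in source.items():
    if key.startswith(prefix):
      target[key[len(prefix):]] = value
  return target


# _example_update_from_prefixed(
#     {"learning_rate": 1e-4, "epochs_num": 1000},
#     {"real_ppo_learning_rate": 3e-4, "real_ppo_epochs_num": 50},
#     "real_ppo_") == {"learning_rate": 3e-4, "epochs_num": 50}
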
def train(hparams, output_dir, env_problem_name, report_fn=None):
  """Train."""
  env_fn = initialize_env_specs(hparams, env_problem_name)
  tf.logging.vlog(1, "HParams in trainer_model_free.train : %s",
                  misc_utils.pprint_hparams(hparams))
  tf.logging.vlog(1, "Using hparams.base_algo: %s", hparams.base_algo)
  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, output_dir, output_dir, total_num_epochs=1)
  policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      policy_hparams, hparams, hparams.base_algo + "_")
  tf.logging.vlog(1, "Policy HParams : %s",
                  misc_utils.pprint_hparams(policy_hparams))
  total_steps = policy_hparams.epochs_num
  tf.logging.vlog(2, "total_steps: %d", total_steps)
  eval_every_epochs = policy_hparams.eval_every_epochs
  tf.logging.vlog(2, "eval_every_epochs: %d", eval_every_epochs)
  if eval_every_epochs == 0:
    eval_every_epochs = total_steps
  # Evaluation is driven by the model_save_fn callback below, so disable the
  # learner's own periodic evaluation.
  policy_hparams.eval_every_epochs = 0
  metric_name = rl_utils.get_metric_name(
      sampling_temp=hparams.eval_sampling_temps[0],
      max_num_noops=hparams.eval_max_num_noops,
      clipped=False)
  tf.logging.vlog(1, "metric_name: %s", metric_name)
  eval_metrics_dir = os.path.join(output_dir, "eval_metrics")
  eval_metrics_dir = os.path.expanduser(eval_metrics_dir)
  tf.gfile.MakeDirs(eval_metrics_dir)
  eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)

  # Mutable evaluation-step counter; the callback below closes over it.
  step = [0]

  def evaluate_on_new_model(model_dir_path):
    eval_metrics = rl_utils.evaluate_all_configs(hparams, model_dir_path)
    tf.logging.info(
        "Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics)))
    rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, step[0])
    if report_fn:
      report_fn(eval_metrics[metric_name], step[0])
    step[0] += 1

  policy_hparams.epochs_num = total_steps
  policy_hparams.save_models_every_epochs = eval_every_epochs
  learner.train(
      env_fn, policy_hparams, simulated=False, save_continuously=True,
      epoch=0, model_save_fn=evaluate_on_new_model)
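
# A minimal sketch of what rl_utils.summarize_metrics presumably does with
# the FileWriter created above, using the TF1 summary protobuf API (this body
# is an assumption, not the library's implementation):


def _example_summarize_metrics(writer, metrics, step):
  summary = tf.Summary()
  for name, value in metrics.items():
    summary.value.add(tag=name, simple_value=value)
  writer.add_summary(summary, step)
  writer.flush()
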
def train(hparams, output_dir, env_problem_name, report_fn=None):
  """Train."""
  env_fn = initialize_env_specs(hparams, env_problem_name)

  tf.logging.vlog(1, "HParams in trainer_model_free.train : %s",
                  misc_utils.pprint_hparams(hparams))
  tf.logging.vlog(1, "Using hparams.base_algo: %s", hparams.base_algo)

  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, output_dir, output_dir, total_num_epochs=1,
      distributional_size=hparams.get("distributional_size", 1),
      distributional_subscale=hparams.get("distributional_subscale", 0.04),
      distributional_threshold=hparams.get("distributional_threshold", 0.0),
  )

  policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      policy_hparams, hparams, hparams.base_algo + "_"
  )
  tf.logging.vlog(1, "Policy HParams : %s",
                  misc_utils.pprint_hparams(policy_hparams))

  # TODO(konradczechowski): Remove the base_algo dependence once the
  # evaluation method is decided.
  if hparams.base_algo == "ppo":
    total_steps = policy_hparams.epochs_num
    tf.logging.vlog(2, "total_steps: %d", total_steps)

    eval_every_epochs = policy_hparams.eval_every_epochs
    tf.logging.vlog(2, "eval_every_epochs: %d", eval_every_epochs)
    if eval_every_epochs == 0:
      eval_every_epochs = total_steps
    # Evaluation is driven by the model_save_fn callback below, so disable
    # the learner's own periodic evaluation.
    policy_hparams.eval_every_epochs = 0

    metric_name = rl_utils.get_metric_name(
        sampling_temp=hparams.eval_sampling_temps[0],
        max_num_noops=hparams.eval_max_num_noops,
        clipped=False
    )
    tf.logging.vlog(1, "metric_name: %s", metric_name)

    eval_metrics_dir = os.path.join(output_dir, "eval_metrics")
    eval_metrics_dir = os.path.expanduser(eval_metrics_dir)
    tf.gfile.MakeDirs(eval_metrics_dir)
    eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)

    # Mutable evaluation-step counter; the callback below closes over it.
    step = [0]

    def evaluate_on_new_model(model_dir_path):
      eval_metrics = rl_utils.evaluate_all_configs(hparams, model_dir_path)
      tf.logging.info(
          "Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics)))
      rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, step[0])
      if report_fn:
        report_fn(eval_metrics[metric_name], step[0])
      step[0] += 1

    policy_hparams.epochs_num = total_steps
    policy_hparams.save_models_every_epochs = eval_every_epochs
  else:
    def evaluate_on_new_model(model_dir_path):
      del model_dir_path
      raise NotImplementedError(
          "This function is currently implemented only for ppo")

  learner.train(
      env_fn, policy_hparams, simulated=False, save_continuously=True,
      epoch=0, model_save_fn=evaluate_on_new_model)
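
# A standalone sketch of the save-callback pattern above: the learner calls
# model_save_fn after each checkpoint, and the closure keeps its own
# evaluation-step counter. All names here are illustrative.


def _example_make_eval_callback(evaluate_fn):
  state = {"step": 0}

  def on_model_saved(model_dir_path):
    evaluate_fn(model_dir_path, state["step"])
    state["step"] += 1

  return on_model_saved


# callback = _example_make_eval_callback(
#     lambda path, step: print("evaluating %s at step %d" % (path, step)))
# callback("/tmp/model")  # evaluating /tmp/model at step 0
# callback("/tmp/model")  # evaluating /tmp/model at step 1
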
def train(hparams, output_dir, report_fn=None):
  """Train."""
  hparams = initialize_env_specs(hparams)
  tf.logging.vlog(1, "HParams in trainer_model_free.train : %s",
                  misc_utils.pprint_hparams(hparams))
  tf.logging.vlog(1, "Using hparams.base_algo: %s", hparams.base_algo)
  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, output_dir, output_dir, total_num_epochs=1)
  policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      policy_hparams, hparams, hparams.base_algo + "_")
  tf.logging.vlog(1, "Policy HParams : %s",
                  misc_utils.pprint_hparams(policy_hparams))
  total_steps = policy_hparams.epochs_num
  tf.logging.vlog(2, "total_steps: %d", total_steps)
  eval_every_epochs = policy_hparams.eval_every_epochs
  tf.logging.vlog(2, "eval_every_epochs: %d", eval_every_epochs)
  if eval_every_epochs == 0:
    eval_every_epochs = total_steps
  # Evaluation is driven by the loop below, so disable the learner's own
  # periodic evaluation.
  policy_hparams.eval_every_epochs = 0
  steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
  # Always train up to total_steps, even when it is not a multiple of
  # eval_every_epochs.
  if not steps or steps[-1] < total_steps:
    steps.append(total_steps)
  tf.logging.vlog(1, "steps: [%s]", ",".join([str(s) for s in steps]))
  metric_name = rl_utils.get_metric_name(
      sampling_temp=hparams.eval_sampling_temps[0],
      max_num_noops=hparams.eval_max_num_noops,
      clipped=False)
  tf.logging.vlog(1, "metric_name: %s", metric_name)
  for i, step in enumerate(steps):
    tf.logging.info("Starting training iteration [%d] for [%d] steps.",
                    i, step)
    policy_hparams.epochs_num = step
    learner.train(hparams.env_fn, policy_hparams, simulated=False,
                  save_continuously=True, epoch=0)
    tf.logging.info("Ended training iteration [%d] for [%d] steps.", i, step)
    eval_metrics = rl_utils.evaluate_all_configs(hparams, output_dir)
    tf.logging.info("Agent eval metrics:\n{}".format(
        pprint.pformat(eval_metrics)))
    if report_fn:
      report_fn(eval_metrics[metric_name], step)
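
# A hypothetical report_fn for the loop above: a tuner callback that simply
# records the chosen eval metric after each evaluation round. The wrapper
# name is illustrative.


def _example_report_fn_usage(hparams, output_dir):
  history = []

  def report_fn(metric_value, step):
    history.append((step, metric_value))

  train(hparams, output_dir, report_fn=report_fn)
  return history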