def _setup(self, data_dir):
  trajectory_length = self.num_testing_steps
  if self.num_steps < 1200:
    # Decrease the trajectory length for tiny experiments, otherwise we don't
    # have enough data to run the evaluation.
    trajectory_length = 2
  self._initial_frame_chooser = InitialFrameChooser(
      self.environment_spec, mode=tf.estimator.ModeKeys.EVAL,
      trajectory_length=trajectory_length)
  frame_index = tf.Variable(0, trainable=False)

  def fixed_action_policy_fun(action_space, unused_config, observations):
    """Policy which replays actions from a trajectory."""
    action = self._initial_frame_chooser.trajectory[
        "action"].read_value()[:, frame_index.read_value(), :]
    inc_frame_index = frame_index.assign(
        (frame_index.read_value() + 1) % trajectory_length)
    with tf.control_dependencies([inc_frame_index]):
      action = tf.identity(action)
    obs_shape = observations.shape.as_list()
    with tf.variable_scope("network_parameters"):
      probs = tf.one_hot(tf.transpose(action), depth=action_space.n)
      policy = tf.distributions.Categorical(probs=probs)
      value = tf.zeros(obs_shape[:2])
    return rl.NetworkOutput(policy, value, lambda a: a)

  super(GymSimulatedDiscreteProblemForWorldModelEval, self)._setup(
      data_dir, override_collect_hparams={
          "policy_network": fixed_action_policy_fun
      })
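# A minimal, self-contained sketch of the action-replay pattern used by
# fixed_action_policy_fun above, assuming TF 1.x graph mode as in the
# surrounding code. `make_replay_op` and the example `actions` tensor are
# illustrative stand-ins, not part of the original class.
import tensorflow as tf


def make_replay_op(actions, trajectory_length):
  """Returns an op yielding one action per session call, cycling in order."""
  frame_index = tf.Variable(0, trainable=False)
  action = actions[frame_index.read_value()]
  # Tie the index increment to every read via a control dependency, mirroring
  # the structure of fixed_action_policy_fun.
  inc_frame_index = frame_index.assign(
      (frame_index.read_value() + 1) % trajectory_length)
  with tf.control_dependencies([inc_frame_index]):
    return tf.identity(action)


# Usage sketch:
# actions = tf.constant([0, 2, 1, 3])
# replay = make_replay_op(actions, trajectory_length=4)
# with tf.Session() as sess:
#   sess.run(tf.global_variables_initializer())
#   print([sess.run(replay) for _ in range(6)])  # cycles through 0, 2, 1, 3, ...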
def train_agent(environment_spec, agent_model_dir,
                event_dir, world_model_dir, epoch_data_dir, hparams,
                epoch=0, is_final_epoch=False):
  """Train the PPO agent in the simulated environment."""
  ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
  ppo_params_names = [
      "epochs_num", "epoch_length", "learning_rate", "num_agents",
      "optimization_epochs", "eval_every_epochs"
  ]
  for param_name in ppo_params_names:
    ppo_param_name = "ppo_" + param_name
    if ppo_param_name in hparams:
      ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

  ppo_hparams.epochs_num = _ppo_training_epochs(
      hparams, epoch, is_final_epoch, False)
  ppo_hparams.save_models_every_epochs = 10
  ppo_hparams.world_model_dir = world_model_dir
  ppo_hparams.add_hparam("force_beginning_resets", True)

  # Add model hparams for model-specific adjustments.
  model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
  ppo_hparams.add_hparam("model_hparams", model_hparams)

  environment_spec = copy.copy(environment_spec)
  environment_spec_param_names = [
      "simulation_random_starts", "simulation_flip_first_random_for_beginning",
      "intrinsic_reward_scale"
  ]
  for param_name in environment_spec_param_names:
    environment_spec.set_hparam(param_name, hparams.get(param_name))

  ppo_hparams.add_hparam("environment_spec", environment_spec)
  ppo_hparams.add_hparam(
      "initial_frame_chooser",
      InitialFrameChooser(environment_spec, mode=tf.estimator.ModeKeys.EVAL))

  # TODO(koz4k): Pass by arguments.
  with temporary_flags({
      "problem": environment_spec.initial_frames_problem,
      "model": hparams.generative_model,
      "hparams_set": hparams.generative_model_params,
      "output_dir": world_model_dir,
      "data_dir": epoch_data_dir,
  }):
    rl_trainer_lib.train(ppo_hparams, event_dir + "sim", agent_model_dir,
                         name_scope="ppo_sim%d" % (epoch + 1))
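# A self-contained illustration of the prefixed-override pattern used in
# train_agent above: entries named "ppo_<name>" in the top-level hparams
# replace "<name>" in the PPO hparams. HParams is the TF 1.x
# tf.contrib.training.HParams; all names and values below are illustrative.
from tensorflow.contrib.training import HParams

top_level = HParams(ppo_learning_rate=1e-4, ppo_num_agents=16, batch_size=8)
ppo = HParams(learning_rate=3e-4, num_agents=32, epoch_length=50)

for name in ["learning_rate", "num_agents", "epoch_length"]:
  prefixed = "ppo_" + name
  if prefixed in top_level:
    ppo.set_hparam(name, top_level.get(prefixed))

assert ppo.learning_rate == 1e-4  # Overridden from ppo_learning_rate.
assert ppo.epoch_length == 50  # Kept: there is no ppo_epoch_length.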
def _setup(self, data_dir, extra_collect_hparams=None,
           override_collect_hparams=None):
  """Sets up the problem, injecting the initial frame chooser for collection."""
  if extra_collect_hparams is None:
    extra_collect_hparams = {}

  # Cache the frame chooser so repeated _setup calls reuse the same one.
  if self._initial_frame_chooser is None:
    self._initial_frame_chooser = InitialFrameChooser(
        self.environment_spec, mode=tf.estimator.ModeKeys.EVAL
    )
  extra_collect_hparams["initial_frame_chooser"] = self._initial_frame_chooser

  super(GymSimulatedDiscreteProblem, self)._setup(
      data_dir, extra_collect_hparams, override_collect_hparams
  )
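# Hedged sketch of how the two dicts are assumed to combine downstream (the
# actual merge happens in the superclass _setup, which is not shown here):
# extra_collect_hparams adds new entries to the collect config, while
# override_collect_hparams replaces entries the config already defines, e.g.
# swapping "policy_network" for the fixed-action replay policy above.
def merge_collect_hparams(base, extra=None, override=None):
  """Illustrative merge: extras add new keys, overrides replace existing ones."""
  merged = dict(base)
  merged.update(extra or {})
  for key, value in (override or {}).items():
    assert key in merged, "override for unknown collect hparam: %s" % key
    merged[key] = value
  return merged


# Usage sketch:
# merge_collect_hparams(
#     {"policy_network": default_policy_fun},
#     extra={"initial_frame_chooser": chooser},
#     override={"policy_network": fixed_action_policy_fun})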