Example #1
    def _setup(self, data_dir):
        trajectory_length = self.num_testing_steps
        if self.num_steps < 1200:
            # Decrease the trajectory length for tiny experiments, otherwise we don't
            # have enough data to run the evaluation.
            trajectory_length = 2
        self._initial_frame_chooser = InitialFrameChooser(
            self.environment_spec,
            mode=tf.estimator.ModeKeys.EVAL,
            trajectory_length=trajectory_length)

        # Position in the replayed trajectory; advanced once per policy call.
        frame_index = tf.Variable(0, trainable=False)

        def fixed_action_policy_fun(action_space, unused_config, observations):
            """Policy which replays actions from a trajectory."""
            action = self._initial_frame_chooser.trajectory[
                "action"].read_value()[:, frame_index.read_value(), :]
            inc_frame_index = frame_index.assign(
                (frame_index.read_value() + 1) % trajectory_length)
            with tf.control_dependencies([inc_frame_index]):
                action = tf.identity(action)

            obs_shape = observations.shape.as_list()
            with tf.variable_scope("network_parameters"):
                # Deterministic policy: all probability mass on the replayed
                # action; the value head is a zero placeholder.
                probs = tf.one_hot(tf.transpose(action), depth=action_space.n)
                policy = tf.distributions.Categorical(probs=probs)
                value = tf.zeros(obs_shape[:2])
            return rl.NetworkOutput(policy, value, lambda a: a)

        super(GymSimulatedDiscreteProblemForWorldModelEval,
              self)._setup(data_dir,
                           override_collect_hparams={
                               "policy_network": fixed_action_policy_fun
                           })
Example #2
def train_agent(environment_spec,
                agent_model_dir,
                event_dir,
                world_model_dir,
                epoch_data_dir,
                hparams,
                epoch=0,
                is_final_epoch=False):
    """Train the PPO agent in the simulated environment."""
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_params_names = [
        "epochs_num", "epoch_length", "learning_rate", "num_agents",
        "optimization_epochs", "eval_every_epochs"
    ]

    for param_name in ppo_params_names:
        ppo_param_name = "ppo_" + param_name
        if ppo_param_name in hparams:
            ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

    ppo_hparams.epochs_num = _ppo_training_epochs(hparams, epoch,
                                                  is_final_epoch, False)
    ppo_hparams.save_models_every_epochs = 10
    ppo_hparams.world_model_dir = world_model_dir
    ppo_hparams.add_hparam("force_beginning_resets", True)

    # Add model hparams for model-specific adjustments.
    model_hparams = trainer_lib.create_hparams(hparams.generative_model_params)
    ppo_hparams.add_hparam("model_hparams", model_hparams)

    environment_spec = copy.copy(environment_spec)
    environment_spec_param_names = [
        "simulation_random_starts",
        "simulation_flip_first_random_for_beginning", "intrinsic_reward_scale"
    ]
    for param_name in environment_spec_param_names:
        environment_spec.set_hparam(param_name, hparams.get(param_name))
    ppo_hparams.add_hparam("environment_spec", environment_spec)

    ppo_hparams.add_hparam(
        "initial_frame_chooser",
        InitialFrameChooser(environment_spec, mode=tf.estimator.ModeKeys.EVAL))

    # TODO(koz4k): Pass by arguments.
    with temporary_flags({
            "problem": environment_spec.initial_frames_problem,
            "model": hparams.generative_model,
            "hparams_set": hparams.generative_model_params,
            "output_dir": world_model_dir,
            "data_dir": epoch_data_dir,
    }):
        rl_trainer_lib.train(ppo_hparams,
                             event_dir + "sim",
                             agent_model_dir,
                             name_scope="ppo_sim%d" % (epoch + 1))
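
train_agent temporarily points several global flags (problem, model, hparams
set, output and data directories) at the current world model while PPO trains
in the simulated environment. The definition of temporary_flags is not part of
these examples; the sketch below is one plausible implementation of such a
context manager, assumed rather than taken from the library, which restores
the previous flag values on exit.

import contextlib
import tensorflow as tf

@contextlib.contextmanager
def temporary_flags(flag_settings):
    """Temporarily overrides tf.flags.FLAGS values inside a `with` block."""
    flags = tf.flags.FLAGS
    old_values = {name: getattr(flags, name) for name in flag_settings}
    try:
        for name, value in flag_settings.items():
            setattr(flags, name, value)
        yield
    finally:
        for name, value in old_values.items():
            setattr(flags, name, value)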
Example #3
  def _setup(self, data_dir, extra_collect_hparams=None,
             override_collect_hparams=None):
    """Sets up the problem, passing an initial frame chooser to collection."""
    if extra_collect_hparams is None:
      extra_collect_hparams = {}

    if self._initial_frame_chooser is None:
      self._initial_frame_chooser = InitialFrameChooser(
          self.environment_spec, mode=tf.estimator.ModeKeys.EVAL
      )
    extra_collect_hparams["initial_frame_chooser"] = self._initial_frame_chooser

    super(GymSimulatedDiscreteProblem, self)._setup(
        data_dir, extra_collect_hparams, override_collect_hparams
    )