Example #1
    def test_rl2_trpo_pendulum(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = RL2TRPO(
                rl2_max_path_length=self.max_path_length,
                meta_batch_size=self.meta_batch_size,
                task_sampler=self.tasks,
                env_spec=self.env_spec,
                policy=self.policy,
                baseline=self.baseline,
                max_path_length=self.max_path_length * self.episode_per_task,
                discount=0.99,
                max_kl_step=0.01,
                optimizer=ConjugateGradientOptimizer,
                optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                    base_eps=1e-5)))

            runner.setup(algo,
                         self.tasks.sample(self.meta_batch_size),
                         sampler_cls=LocalSampler,
                         n_workers=self.meta_batch_size,
                         worker_class=RL2Worker)

            last_avg_ret = runner.train(n_epochs=1,
                                        batch_size=self.episode_per_task *
                                        self.max_path_length *
                                        self.meta_batch_size)
            assert last_avg_ret > -40
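The test methods in Examples #1, #3, #4, #5 and #7 rely on fixture attributes (self.tasks, self.env_spec, self.policy, self.baseline, self.meta_batch_size, self.episode_per_task, self.max_path_length / self.max_episode_length, self.sess) that are set up elsewhere in the test class. The following is a minimal fixture sketch only, built from the pieces shown in Examples #2 and #6; the class name, setup_method body and module paths are illustrative and may differ from the actual test suite.

# Illustrative fixture: attribute names mirror those referenced in the tests;
# module paths assume a recent garage release and may differ by version.
# self.sess would come from a TF-graph test base class, and self.sampler
# (used in Example #7) from a sampler built for the chosen API; both are
# omitted here.
from garage.envs import GymEnv
from garage.envs.mujoco import HalfCheetahVelEnv
from garage.experiment import task_sampler
from garage.np.baselines import LinearFeatureBaseline
from garage.tf.algos.rl2 import RL2Env
from garage.tf.policies import GaussianGRUPolicy


class TestRL2TRPO:  # hypothetical class name

    def setup_method(self):
        self.meta_batch_size = 10
        self.episode_per_task = 4
        self.max_episode_length = 100  # called max_path_length in older APIs
        self.max_path_length = self.max_episode_length
        self.tasks = task_sampler.SetTaskSampler(
            HalfCheetahVelEnv,
            wrapper=lambda env, _: RL2Env(
                GymEnv(env, max_episode_length=self.max_episode_length)))
        self.env_spec = RL2Env(
            GymEnv(HalfCheetahVelEnv(),
                   max_episode_length=self.max_episode_length)).spec
        self.policy = GaussianGRUPolicy(name='policy',
                                        hidden_dim=64,
                                        env_spec=self.env_spec,
                                        state_include_action=False)
        self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)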
Example #2
def rl2_trpo_halfcheetah(ctxt, seed, max_episode_length, meta_batch_size,
                         n_epochs, episode_per_task):
    """Train TRPO with HalfCheetah environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        tasks = task_sampler.SetTaskSampler(
            HalfCheetahVelEnv,
            wrapper=lambda env, _: RL2Env(
                GymEnv(env, max_episode_length=max_episode_length)))

        env_spec = RL2Env(
            GymEnv(HalfCheetahVelEnv(),
                   max_episode_length=max_episode_length)).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2TRPO(meta_batch_size=meta_batch_size,
                       task_sampler=tasks,
                       env_spec=env_spec,
                       policy=policy,
                       baseline=baseline,
                       episodes_per_trial=episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01,
                       optimizer=ConjugateGradientOptimizer,
                       optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                           base_eps=1e-5)))

        trainer.setup(algo,
                      tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task * max_episode_length *
                      meta_batch_size)
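Example #2 is a standalone launcher rather than a test; it assumes module-level imports along the lines of the block below and, in garage's example scripts, is decorated with @wrap_experiment (optionally behind a click CLI) before being called, which supplies the ctxt argument automatically. This is a sketch assuming a recent garage release; exact module paths may differ between versions.

from garage import wrap_experiment
from garage.envs import GymEnv
from garage.envs.mujoco import HalfCheetahVelEnv
from garage.experiment import task_sampler
from garage.experiment.deterministic import set_seed
from garage.np.baselines import LinearFeatureBaseline
from garage.sampler import LocalSampler
from garage.tf.algos import RL2TRPO
from garage.tf.algos.rl2 import RL2Env, RL2Worker
from garage.tf.optimizers import (ConjugateGradientOptimizer,
                                  FiniteDifferenceHVP)
from garage.tf.policies import GaussianGRUPolicy
from garage.trainer import TFTrainer

# Typical invocation pattern (argument values are illustrative):
#
#     @wrap_experiment
#     def rl2_trpo_halfcheetah(ctxt, seed, max_episode_length, ...):
#         ...
#
#     rl2_trpo_halfcheetah(seed=1, max_episode_length=100,
#                          meta_batch_size=10, n_epochs=10,
#                          episode_per_task=4)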
Example #3
    def test_rl2_trpo_pendulum_invalid_kl_constraint(self):
        with LocalTFRunner(snapshot_config, sess=self.sess):
            with pytest.raises(ValueError):
                RL2TRPO(meta_batch_size=self.meta_batch_size,
                        task_sampler=self.tasks,
                        env_spec=self.env_spec,
                        policy=self.policy,
                        baseline=self.baseline,
                        kl_constraint='xyz',
                        episodes_per_trial=self.episode_per_task,
                        discount=0.99,
                        max_kl_step=0.01)
Example #4
    def test_ppo_pendulum_default_optimizer2(self):
        with LocalTFRunner(snapshot_config, sess=self.sess):
            algo = RL2TRPO(meta_batch_size=self.meta_batch_size,
                           task_sampler=self.tasks,
                           env_spec=self.env_spec,
                           policy=self.policy,
                           baseline=self.baseline,
                           kl_constraint='soft',
                           episodes_per_trial=self.episode_per_task,
                           discount=0.99,
                           max_kl_step=0.01)
            assert isinstance(algo._inner_algo._optimizer,
                              PenaltyLbfgsOptimizer)
Example #5
    def test_rl2_trpo_pendulum_default_optimizer(self):
        with TFTrainer(snapshot_config, sess=self.sess):
            algo = RL2TRPO(meta_batch_size=self.meta_batch_size,
                           task_sampler=self.tasks,
                           env_spec=self.env_spec,
                           policy=self.policy,
                           baseline=self.baseline,
                           kl_constraint='hard',
                           episodes_per_trial=self.episode_per_task,
                           discount=0.99,
                           max_kl_step=0.01)
            assert isinstance(algo._inner_algo._optimizer,
                              ConjugateGradientOptimizer)
Example #6
def rl2_trpo_halfcheetah(ctxt=None, seed=1):
    """Train TRPO with HalfCheetah environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_path_length = 100
        meta_batch_size = 10
        n_epochs = 50
        episode_per_task = 4

        tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
            env=HalfCheetahVelEnv()))

        env_spec = RL2Env(env=HalfCheetahVelEnv()).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2TRPO(rl2_max_path_length=max_path_length,
                       meta_batch_size=meta_batch_size,
                       task_sampler=tasks,
                       env_spec=env_spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_path_length * episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01,
                       optimizer=ConjugateGradientOptimizer,
                       optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                           base_eps=1e-5)))

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker)

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
Example #7
    def test_rl2_trpo_pendulum(self):
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            algo = RL2TRPO(
                meta_batch_size=self.meta_batch_size,
                task_sampler=self.tasks,
                env_spec=self.env_spec,
                policy=self.policy,
                baseline=self.baseline,
                sampler=self.sampler,
                episodes_per_trial=self.episode_per_task,
                discount=0.99,
                max_kl_step=0.01,
                optimizer=ConjugateGradientOptimizer,
                optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                    base_eps=1e-5)))

            trainer.setup(algo, self.tasks.sample(self.meta_batch_size))

            last_avg_ret = trainer.train(n_epochs=1,
                                         batch_size=self.episode_per_task *
                                         self.max_episode_length *
                                         self.meta_batch_size)
            assert last_avg_ret > -40