Example #1
File: test_trpo.py Project: edrya/garage
    def test_trpo_unknown_kl_constraint(self):
        """Test TRPO with unkown KL constraints."""
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        with self.assertRaises(NotImplementedError) as context:
            TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=2048,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                gae_lambda=0.98,
                policy_ent_coeff=0.0,
                plot=False,
                kl_constraint="random kl_constraint",
            )
        assert "Unknown KLConstraint" in str(context.exception)

        env.close()
Example #2
 def test_npo_pendulum(self):
     """Test NPO with Pendulum environment."""
     logger.reset()
     env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
     policy = GaussianMLPPolicy(
         env_spec=env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     baseline = GaussianMLPBaseline(
         env_spec=env.spec,
         regressor_args=dict(hidden_sizes=(32, 32)),
     )
     algo = NPO(
         env=env,
         policy=policy,
         baseline=baseline,
         batch_size=2048,
         max_path_length=100,
         n_itr=10,
         discount=0.99,
         gae_lambda=0.98,
         policy_ent_coeff=0.0,
         plot=False,
     )
     last_avg_ret = algo.train(sess=self.sess)
     assert last_avg_ret > 20
Example #3
File: test_trpo.py Project: edrya/garage
    def test_trpo_pendulum(self):
        """Test TRPO with Pendulum environment."""
        with LocalRunner(self.sess) as runner:
            logger.reset()
            env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = TRPO(env=env,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        gae_lambda=0.98,
                        policy_ent_coeff=0.0)
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 50

            env.close()
Example #4
File: test_reps.py Project: edrya/garage
    def test_reps_cartpole(self):
        """Test REPS with gym Cartpole environment."""
        with LocalRunner(self.sess) as runner:
            logger.reset()
            env = TfEnv(gym.make("CartPole-v0"))

            policy = CategoricalMLPPolicy(env_spec=env.spec,
                                          hidden_sizes=[32, 32])

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = REPS(env=env,
                        policy=policy,
                        baseline=baseline,
                        batch_size=4000,
                        max_path_length=100,
                        n_itr=10,
                        discount=0.99,
                        max_kl_step=1e6)

            runner.setup(algo, env)

            last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
            assert last_avg_ret > 5

            env.close()
Example #5
    def test_ppo_pendulum_with_model(self):
        """Test PPO with model, with Pendulum environment."""
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=False,
        )
        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > 40

        env.close()
Example #6
    def test_gaussian_policies(self, policy_cls):
        with LocalRunner(self.sess) as runner:
            logger.reset()
            env = TfEnv(normalize(gym.make("Pendulum-v0")))

            policy = policy_cls(name="policy", env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                step_size=0.01,
                plot=True,
                optimizer=ConjugateGradientOptimizer,
                optimizer_args=dict(
                    hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
            )

            runner.setup(algo, env)
            runner.train(n_epochs=1, batch_size=4000)
            env.close()
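The test above receives the policy class as a parameter, so it is presumably invoked once per Gaussian policy variant. A minimal driver sketch (the class list here is an assumption, not taken from this listing):

    def test_all_gaussian_policies(self):
        # Hypothetical loop over the Gaussian policy classes exercised by the test.
        for policy_cls in (GaussianMLPPolicy, GaussianGRUPolicy, GaussianLSTMPolicy):
            self.test_gaussian_policies(policy_cls)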
Example #7
 def test_ddpg_pendulum(self):
     """Test PPO with Pendulum environment."""
     logger.reset()
     env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
     action_noise = OUStrategy(env.spec, sigma=0.2)
     policy = ContinuousMLPPolicy(env_spec=env.spec,
                                  hidden_sizes=[64, 64],
                                  hidden_nonlinearity=tf.nn.relu,
                                  output_nonlinearity=tf.nn.tanh)
     qf = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=tf.nn.relu)
     replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100)
     algo = DDPG(
         env,
         policy=policy,
         policy_lr=1e-4,
         qf_lr=1e-3,
         qf=qf,
         replay_buffer=replay_buffer,
         plot=False,
         target_update_tau=1e-2,
         n_epochs=10,
         n_epoch_cycles=20,
         max_path_length=100,
         n_train_steps=50,
         discount=0.9,
         min_buffer_size=int(1e4),
         exploration_strategy=action_noise,
     )
     last_avg_ret = algo.train(sess=self.sess)
     assert last_avg_ret > 60
Example #8
    def test_ppo_pendulum_recurrent(self):
        """Test PPO with Pendulum environment and recurrent policy."""
        with LocalRunner() as runner:
            logger.reset()
            env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
            policy = GaussianLSTMPolicy(env_spec=env.spec)
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(
                env=env,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                lr_clip_range=0.01,
                optimizer_args=dict(batch_size=32, max_epochs=10),
                plot=False,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 40

            env.close()
Example #9
    def test_erwr_cartpole(self):
        """Test ERWR with Cartpole environment."""
        logger.reset()
        env = TfEnv(normalize(CartpoleEnv()))

        policy = GaussianMLPPolicy(
            name="policy", env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = ERWR(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=10000,
            max_path_length=100,
            n_itr=10,
            discount=0.99)

        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > 100
Example #10
 def test_npo_unknown_pg_loss(self):
     """Test NPO with unkown policy gradient loss."""
     logger.reset()
     env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
     policy = GaussianMLPPolicy(
         env_spec=env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     baseline = GaussianMLPBaseline(
         env_spec=env.spec,
         regressor_args=dict(hidden_sizes=(32, 32)),
     )
     with self.assertRaises(NotImplementedError) as context:
         NPO(
             env=env,
             policy=policy,
             baseline=baseline,
             pg_loss="random pg_loss",
         )
     assert "Unknown PGLoss" in str(context.exception)
Example #11
    def test_tnpg_cartpole(self):
        """Test TNPG with Cartpole environment."""
        logger.reset()
        env = TfEnv(normalize(CartpoleEnv()))

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TNPG(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=10000,
                    max_path_length=100,
                    n_itr=10,
                    discount=0.99,
                    optimizer_args=dict(reg_coeff=5e-2))

        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > 40
Example #12
    def test_gaussian_policies(self, policy_cls):
        logger.reset()
        env = TfEnv(normalize(CartpoleEnv()))

        policy = policy_cls(name="policy", env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
            plot=True,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                base_eps=1e-5)),
        )
        algo.train(sess=self.sess)
Example #13
File: fixtures.py Project: edrya/garage
 def setUp(self):
     self.graph = tf.Graph()
     self.sess = tf.Session(graph=self.graph)
     self.sess.__enter__()
     logger.reset()
     deterministic.set_seed(1)
Example #14
 def setUp(self):
     self.graph = tf.Graph()
     self.sess = tf.Session(graph=self.graph)
     self.sess.__enter__()
     logger.reset()
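Both setUp fixtures enter a TensorFlow session, but this listing does not show the matching teardown. A minimal companion sketch (an assumption, not code taken from the project) would exit and close that session:

 def tearDown(self):
     # Leave the session context entered in setUp and release its resources.
     self.sess.__exit__(None, None, None)
     self.sess.close()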