Example #1
 def test_npo_pendulum(self):
     """Test NPO with Pendulum environment."""
     logger.reset()
     env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
     policy = GaussianMLPPolicy(
         env_spec=env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     baseline = GaussianMLPBaseline(
         env_spec=env.spec,
         regressor_args=dict(hidden_sizes=(32, 32)),
     )
     algo = NPO(
         env=env,
         policy=policy,
         baseline=baseline,
         batch_size=2048,
         max_path_length=100,
         n_itr=10,
         discount=0.99,
         gae_lambda=0.98,
         policy_ent_coeff=0.0,
         plot=False,
     )
     last_avg_ret = algo.train(sess=self.sess)
     assert last_avg_ret > 20
Example #2
    def test_npo_pendulum(self):
        """Test NPO with Pendulum environment."""
        with LocalRunner(self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = NPO(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=100,
                       discount=0.99,
                       gae_lambda=0.98,
                       policy_ent_coeff=0.0)
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 20

            env.close()
Example #3
 def test_npo_with_invalid_entropy_method(self):
     """Test NPO with invalid entropy method."""
     with pytest.raises(ValueError, match='Invalid entropy_method'):
         NPO(
             env_spec=self.env.spec,
             policy=self.policy,
             baseline=self.baseline,
             entropy_method=None,
         )
Example #4
 def test_npo_with_unknown_pg_loss(self):
     """Test NPO with unkown pg loss."""
     with pytest.raises(ValueError, match='Invalid pg_loss'):
         NPO(
             env_spec=self.env.spec,
             policy=self.policy,
             baseline=self.baseline,
             pg_loss='random pg_loss',
         )
Example #5
 def test_npo_with_invalid_no_entropy_configuration(self):
     """Test NPO with invalid no entropy configuration."""
     with pytest.raises(ValueError):
         NPO(
             env_spec=self.env.spec,
             policy=self.policy,
             baseline=self.baseline,
             entropy_method='no_entropy',
             policy_ent_coeff=0.02,
         )
Example #6
 def test_npo_with_max_entropy_and_no_stop_entropy_gradient(self):
     """Test NPO with max entropy and false stop_entropy_gradient."""
     with pytest.raises(ValueError):
         NPO(
             env_spec=self.env.spec,
             policy=self.policy,
             baseline=self.baseline,
             entropy_method='max',
             stop_entropy_gradient=False,
         )
Example #7
 def test_npo_with_max_entropy_and_center_adv(self):
     """Test NPO with max entropy and center_adv."""
     with pytest.raises(ValueError):
         NPO(
             env_spec=self.env.spec,
             policy=self.policy,
             baseline=self.baseline,
             entropy_method='max',
             center_adv=True,
         )
Example #8
 def test_npo_pendulum(self):
     """Test NPO with Pendulum environment."""
     with TFTrainer(snapshot_config, sess=self.sess) as trainer:
         algo = NPO(env_spec=self.env.spec,
                    policy=self.policy,
                    baseline=self.baseline,
                    discount=0.99,
                    gae_lambda=0.98,
                    policy_ent_coeff=0.0)
         trainer.setup(algo, self.env, sampler_cls=LocalSampler)
         last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
         assert last_avg_ret > 20
Example #9
 def test_npo_pendulum(self):
     """Test NPO with Pendulum environment."""
     with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
         algo = NPO(env_spec=self.env.spec,
                    policy=self.policy,
                    baseline=self.baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.98,
                    policy_ent_coeff=0.0)
         runner.setup(algo, self.env)
         last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
         assert last_avg_ret > 20
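
Examples #3 through #9 refer to self.env, self.policy, self.baseline, self.sess and snapshot_config, which are built in a test fixture outside the snippets. The sketch below is a hypothetical reconstruction of such a fixture, assuming garage's TfGraphTestCase and snapshot_config test helpers; the environment and hyperparameters are illustrative, not taken from the source.

import tensorflow as tf

from garage.envs import GymEnv, normalize
from garage.tf.baselines import GaussianMLPBaseline
from garage.tf.policies import GaussianMLPPolicy
from tests.fixtures import snapshot_config, TfGraphTestCase  # assumed helpers


class TestNPO(TfGraphTestCase):
    """Hypothetical fixture providing self.env, self.policy and self.baseline."""

    def setup_method(self):
        # TfGraphTestCase is assumed to open self.sess on a fresh default graph.
        super().setup_method()
        self.env = normalize(GymEnv('InvertedDoublePendulum-v2',
                                    max_episode_length=100))
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        self.baseline = GaussianMLPBaseline(
            env_spec=self.env.spec,
            hidden_sizes=(32, 32),
        )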
Example #10
def tf_gym_music(ctxt=None, seed=1):
    """Train Policy Gradient LSTM with Music-v0 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
            created by @wrap_experiment
        seed (int): Used to seed the random number generator to produce
            determinism.

    """

    set_seed(seed)

    with TFTrainer(snapshot_config=ctxt) as trainer:

        env = GymEnv(MusicEnv(monitor=HeartMonitor('DC:39:39:66:26:1F')),
                     max_episode_length=35)

        policy = GaussianLSTMPolicy(name='policy',
                                    env_spec=env.spec,
                                    hidden_dim=32)

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        sampler = LocalSampler(agents=policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=False,
                               n_workers=1)

        algo = NPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler)

        trainer.setup(algo, env)

        trainer.train(n_epochs=120, batch_size=1, store_episodes=True)
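
Example #10 is shown without its imports or the @wrap_experiment decorator its docstring refers to. The preamble below is a hedged reconstruction assuming garage's public API; MusicEnv and HeartMonitor are project-specific classes, and the module paths they are imported from here are placeholders.

from garage import wrap_experiment
from garage.envs import GymEnv
from garage.experiment.deterministic import set_seed
from garage.sampler import LocalSampler
from garage.tf.algos import NPO
from garage.tf.baselines import GaussianMLPBaseline
from garage.tf.policies import GaussianLSTMPolicy
from garage.trainer import TFTrainer

# Project-specific pieces; module paths are hypothetical.
from music_env import MusicEnv
from heart_monitor import HeartMonitor

# The function above would then be decorated and launched, e.g.:
# @wrap_experiment
# def tf_gym_music(ctxt=None, seed=1): ...
# tf_gym_music(seed=1)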
Example #11
 def test_npo_unknown_pg_loss(self):
     """Test NPO with unkown policy gradient loss."""
     logger.reset()
     env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
     policy = GaussianMLPPolicy(
         env_spec=env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     baseline = GaussianMLPBaseline(
         env_spec=env.spec,
         regressor_args=dict(hidden_sizes=(32, 32)),
     )
     with self.assertRaises(NotImplementedError) as context:
         NPO(
             env=env,
             policy=policy,
             baseline=baseline,
             pg_loss="random pg_loss",
         )
     assert "Unknown PGLoss" in str(context.exception)
Example #12
seed = 2021

set_seed(seed)

trainer = TFTrainer(snapshot_config=ctxt)
env = GymEnv(MusicEnv(monitor=HeartMonitor('DC:39:39:66:26:1F')),
             max_episode_length=25)
policy = GaussianLSTMPolicy(name='policy', env_spec=env.spec, hidden_dim=32)

baseline = GaussianMLPBaseline(
    env_spec=env.spec,
    hidden_sizes=(32, 32),
)
sampler = LocalSampler(
    agents=policy,
    envs=env,
    max_episode_length=env.spec.max_episode_length,
    is_tf_worker=False,
    n_workers=1,
)

algo = NPO(
    env_spec=env.spec,
    policy=policy,
    baseline=baseline,
    sampler=sampler,
)

trainer.setup(algo, env)
print("trainer.train(n_epochs=120, batch_size=1)")