示例#1
0
def trpo_garage_tf(ctxt, env_id, seed):
    """Build a garage TensorFlow TRPO agent and train it on a gym task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    # Seeding happens before any environment or network is constructed.
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        task_env = normalize(GymEnv(env_id))

        mlp_policy = GaussianMLPPolicy(
            env_spec=task_env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        linear_baseline = LinearFeatureBaseline(env_spec=task_env.spec)

        # The sampler is handed to the algorithm rather than to the trainer.
        ray_sampler = RaySampler(
            agents=mlp_policy,
            envs=task_env,
            max_episode_length=task_env.spec.max_episode_length,
            is_tf_worker=True,
        )

        trpo = TRPO(
            env_spec=task_env.spec,
            policy=mlp_policy,
            baseline=linear_baseline,
            sampler=ray_sampler,
            discount=hyper_parameters['discount'],
            gae_lambda=hyper_parameters['gae_lambda'],
            max_kl_step=hyper_parameters['max_kl'],
        )

        trainer.setup(trpo, task_env)
        trainer.train(
            n_epochs=hyper_parameters['n_epochs'],
            batch_size=hyper_parameters['batch_size'],
        )
示例#2
0
 def setup_method(self):
     """Create the env, policies, baseline and sampler shared by the tests."""
     super().setup_method()

     pendulum = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)
     self.env = normalize(pendulum)

     # Feed-forward Gaussian policy; this is the one the sampler uses.
     mlp_policy_kwargs = {
         'env_spec': self.env.spec,
         'hidden_sizes': (64, 64),
         'hidden_nonlinearity': tf.nn.tanh,
         'output_nonlinearity': None,
     }
     self.policy = GaussianMLPPolicy(**mlp_policy_kwargs)

     # Recurrent policies built with their default settings.
     self.lstm_policy = GaussianLSTMPolicy(env_spec=self.env.spec)
     self.gru_policy = GaussianGRUPolicy(env_spec=self.env.spec)

     self.baseline = GaussianMLPBaseline(env_spec=self.env.spec,
                                         hidden_sizes=(32, 32))

     episode_limit = self.env.spec.max_episode_length
     self.sampler = LocalSampler(agents=self.policy,
                                 envs=self.env,
                                 max_episode_length=episode_limit,
                                 is_tf_worker=True)
def run_task(snapshot_config, *_):
    """Train TRPO on Swimmer-v2 using the RaySamplerTF sampler.

    Args:
        snapshot_config: Snapshot configuration forwarded to LocalTFRunner.
        *_: Ignored positional arguments.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        swimmer = TfEnv(gym.make('Swimmer-v2'))

        mlp_policy = GaussianMLPPolicy(env_spec=swimmer.spec,
                                       hidden_sizes=(32, 32))
        linear_baseline = LinearFeatureBaseline(env_spec=swimmer.spec)

        trpo = TRPO(
            env_spec=swimmer.spec,
            policy=mlp_policy,
            baseline=linear_baseline,
            max_path_length=500,
            discount=0.99,
            max_kl_step=0.01,
        )

        # NOTE(review): `seed` is not a parameter of this function; it is
        # presumably a module-level name defined outside this snippet.
        ray_sampler_args = {'seed': seed}
        runner.setup(trpo,
                     swimmer,
                     sampler_cls=RaySamplerTF,
                     sampler_args=ray_sampler_args)
        runner.train(n_epochs=40, batch_size=4000)
示例#4
0
    def test_erwr_cartpole(self):
        """Train ERWR on Cartpole and check the final average return."""
        logger.reset()
        cartpole = TfEnv(normalize(CartpoleEnv()))

        mlp_policy = GaussianMLPPolicy(name="policy",
                                       env_spec=cartpole.spec,
                                       hidden_sizes=(32, 32))
        linear_baseline = LinearFeatureBaseline(env_spec=cartpole.spec)

        erwr_kwargs = {
            'env': cartpole,
            'policy': mlp_policy,
            'baseline': linear_baseline,
            'batch_size': 10000,
            'max_path_length': 100,
            'n_itr': 10,
            'discount': 0.99,
        }
        erwr = ERWR(**erwr_kwargs)

        # train() returns the value compared against the threshold below.
        final_avg_return = erwr.train(sess=self.sess)
        assert final_avg_return > 100
def run_task(snapshot_config, *_):
    """Train TRPO on InvertedPendulum-v2 with the ISSampler.

    Args:
        snapshot_config: Snapshot configuration forwarded to LocalTFRunner.
        *_: Ignored positional arguments.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        pendulum = TfEnv(normalize(gym.make('InvertedPendulum-v2')))

        mlp_policy = GaussianMLPPolicy(env_spec=pendulum.spec,
                                       hidden_sizes=(32, 32))
        linear_baseline = LinearFeatureBaseline(env_spec=pendulum.spec)

        trpo = TRPO(
            env_spec=pendulum.spec,
            policy=mlp_policy,
            baseline=linear_baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(trpo,
                     pendulum,
                     sampler_cls=ISSampler,
                     sampler_args={'n_backtrack': 1})
        runner.train(n_epochs=200, batch_size=4000)
示例#6
0
def run_task(snapshot_config, *_):
    """Train TRPO on Pusher3DOF-v1 with a wide initial policy std.

    Args:
        snapshot_config: Snapshot configuration forwarded to LocalTFRunner.
        *_: Ignored positional arguments.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        pusher = TfEnv(env_name='Pusher3DOF-v1')

        policy_kwargs = {
            'name': 'policy',
            'env_spec': pusher.spec,
            'hidden_sizes': (32, 32),
            'init_std': 10,
        }
        mlp_policy = GaussianMLPPolicy(**policy_kwargs)
        linear_baseline = LinearFeatureBaseline(env_spec=pusher.spec)

        trpo = TRPO(
            env_spec=pusher.spec,
            policy=mlp_policy,
            baseline=linear_baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(trpo, pusher)
        runner.train(n_epochs=200, batch_size=50 * 250)
def trpo_swimmer_ray_sampler(ctxt=None, seed=1):
    """Train TRPO on Swimmer-v2, collecting episodes with RaySampler.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    # This is only an example, so ray is started with reduced resources.
    # Drop this call to let ray run at full capacity.
    reduced_ray_options = {
        '_memory': 52428800,
        'object_store_memory': 78643200,
        'ignore_reinit_error': True,
        'log_to_driver': False,
        'include_dashboard': False,
    }
    ray.init(**reduced_ray_options)

    with TFTrainer(snapshot_config=ctxt) as trainer:
        set_seed(seed)
        swimmer = GymEnv('Swimmer-v2')

        mlp_policy = GaussianMLPPolicy(env_spec=swimmer.spec,
                                       hidden_sizes=(32, 32))
        linear_baseline = LinearFeatureBaseline(env_spec=swimmer.spec)

        ray_sampler = RaySampler(
            agents=mlp_policy,
            envs=swimmer,
            max_episode_length=swimmer.spec.max_episode_length,
            is_tf_worker=True,
        )

        trpo = TRPO(
            env_spec=swimmer.spec,
            policy=mlp_policy,
            baseline=linear_baseline,
            sampler=ray_sampler,
            discount=0.99,
            max_kl_step=0.01,
        )

        trainer.setup(trpo, swimmer)
        trainer.train(n_epochs=40, batch_size=4000)
示例#8
0
def run_task(v):
    """Train TRPO on a fixed-goal FlatTorqueReacher task.

    Args:
        v (dict): Variant mapping; the keys ``policy_init_std``,
            ``max_path_length`` and ``batch_size`` are read below.

    """
    # Attribute-style access to the variant dictionary.
    variant = SimpleNamespace(**v)

    with LocalRunner() as runner:
        # Environment. Other options that were left disabled here:
        # hand_distance_completion_bonus=0., torque_limit_pct=0.2,
        # velocity_penalty_coeff=0.01, hide_goal_pos=True.
        reacher = FlatTorqueReacher(
            fix_goal=True,
            fixed_goal=GOALS[0],
            reward_type="hand_distance",
            indicator_threshold=0.03,
            action_scale=10.0,
        )
        reacher = TfEnv(normalize(reacher))

        # Policy
        mlp_policy = GaussianMLPPolicy(
            name="policy",
            env_spec=reacher.spec,
            hidden_sizes=(64, 32),
            init_std=variant.policy_init_std,
        )

        mlp_baseline = GaussianMLPBaseline(env_spec=reacher.spec)

        # optimizer_args=dict(max_grad_norm=0.5) was left disabled as well.
        trpo = TRPO(
            env=reacher,
            policy=mlp_policy,
            baseline=mlp_baseline,
            max_path_length=variant.max_path_length,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(trpo, reacher)
        runner.train(n_epochs=1000,
                     batch_size=variant.batch_size,
                     plot=True)
示例#9
0
    def test_tnpg_cartpole(self):
        """Train TNPG on Cartpole and check the final average return."""
        logger.reset()
        cartpole = TfEnv(normalize(CartpoleEnv()))

        mlp_policy = GaussianMLPPolicy(
            name="policy",
            env_spec=cartpole.spec,
            hidden_sizes=(32, 32),
        )
        linear_baseline = LinearFeatureBaseline(env_spec=cartpole.spec)

        tnpg = TNPG(
            env=cartpole,
            policy=mlp_policy,
            baseline=linear_baseline,
            batch_size=10000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            optimizer_args={'reg_coeff': 5e-2},
        )

        final_avg_return = tnpg.train(sess=self.sess)
        assert final_avg_return > 40
示例#10
0
 def test_npo_unknown_pg_loss(self):
     """Check that NPO rejects an unknown policy gradient loss."""
     logger.reset()
     pendulum = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

     policy_kwargs = {
         'env_spec': pendulum.spec,
         'hidden_sizes': (64, 64),
         'hidden_nonlinearity': tf.nn.tanh,
         'output_nonlinearity': None,
     }
     mlp_policy = GaussianMLPPolicy(**policy_kwargs)

     mlp_baseline = GaussianMLPBaseline(
         env_spec=pendulum.spec,
         regressor_args={'hidden_sizes': (32, 32)},
     )

     # Constructing NPO is expected to raise for this pg_loss value.
     with self.assertRaises(NotImplementedError) as raised:
         NPO(env=pendulum,
             policy=mlp_policy,
             baseline=mlp_baseline,
             pg_loss="random pg_loss")

     assert "Unknown PGLoss" in str(raised.exception)
示例#11
0
def run_task(*_):
    """Train TRPO on a PointEnv whose goal is at (-1, 0).

    Args:
        *_: Ignored positional arguments.

    """
    point_env = TfEnv(normalize(PointEnv(goal=(-1, 0))))

    mlp_policy = GaussianMLPPolicy(
        name="policy",
        env_spec=point_env.spec,
        hidden_sizes=(32, 32),
    )
    linear_baseline = LinearFeatureBaseline(env_spec=point_env.spec)

    trpo_kwargs = {
        'env': point_env,
        'policy': mlp_policy,
        'baseline': linear_baseline,
        'batch_size': 4000,
        'max_path_length': 100,
        'n_itr': 100,
        'discount': 0.99,
        'step_size': 0.01,
        'plot': False,
        'force_batch_sampler': True,
    }
    TRPO(**trpo_kwargs).train()
示例#12
0
def trpo_swimmer_ray_sampler(ctxt=None, seed=1):
    """Train TRPO on Swimmer-v2 with RaySampler as the runner's sampler.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    # This is only an example, so ray is started with reduced resources.
    # Drop this call to let ray run at full capacity.
    reduced_ray_options = {
        'memory': 52428800,
        'object_store_memory': 78643200,
        'ignore_reinit_error': True,
        'log_to_driver': False,
        'include_webui': False,
    }
    ray.init(**reduced_ray_options)

    with LocalTFRunner(snapshot_config=ctxt) as runner:
        set_seed(seed)
        swimmer = TfEnv(gym.make('Swimmer-v2'))

        mlp_policy = GaussianMLPPolicy(env_spec=swimmer.spec,
                                       hidden_sizes=(32, 32))
        linear_baseline = LinearFeatureBaseline(env_spec=swimmer.spec)

        trpo = TRPO(
            env_spec=swimmer.spec,
            policy=mlp_policy,
            baseline=linear_baseline,
            max_path_length=500,
            discount=0.99,
            max_kl_step=0.01,
        )

        # The same seed is forwarded to the sampler.
        runner.setup(trpo,
                     swimmer,
                     sampler_cls=RaySampler,
                     sampler_args={'seed': seed})
        runner.train(n_epochs=40, batch_size=4000)
示例#13
0
def run_task(*_):
    """Train PPO on InvertedDoublePendulum-v2 inside a LocalRunner.

    Args:
        *_: Ignored positional arguments.

    """
    with LocalRunner() as runner:
        pendulum = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

        mlp_policy = GaussianMLPPolicy(env_spec=pendulum.spec,
                                       hidden_sizes=(64, 64),
                                       hidden_nonlinearity=tf.nn.tanh,
                                       output_nonlinearity=None)

        regressor_settings = {
            'hidden_sizes': (32, 32),
            'use_trust_region': True,
        }
        mlp_baseline = GaussianMLPBaseline(env_spec=pendulum.spec,
                                           regressor_args=regressor_settings)

        optimizer_settings = {'batch_size': 32, 'max_epochs': 10}
        ppo = PPO(env=pendulum,
                  policy=mlp_policy,
                  baseline=mlp_baseline,
                  max_path_length=100,
                  discount=0.99,
                  gae_lambda=0.95,
                  lr_clip_range=0.2,
                  policy_ent_coeff=0.0,
                  optimizer_args=optimizer_settings,
                  plot=False)

        runner.setup(ppo, pendulum)
        runner.train(n_epochs=120, batch_size=2048, plot=False)
def run_task(snapshot_config, *_):
    """Train CEM on Swimmer-v2.

    Args:
        snapshot_config: Snapshot configuration forwarded to LocalTFRunner.
        *_: Ignored positional arguments.

    """
    # Shared by the algorithm and by n_epoch_cycles in train() below.
    n_samples = 20

    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        swimmer = TfEnv(env_name='Swimmer-v2')

        mlp_policy = GaussianMLPPolicy(
            name='policy',
            env_spec=swimmer.spec,
            hidden_sizes=(32, 32),
        )
        linear_baseline = LinearFeatureBaseline(env_spec=swimmer.spec)

        cem = CEM(
            env_spec=swimmer.spec,
            policy=mlp_policy,
            baseline=linear_baseline,
            best_frac=0.05,
            max_path_length=100,
            n_samples=n_samples,
        )

        runner.setup(cem, swimmer, sampler_cls=OnPolicyVectorizedSampler)
        # NOTE: n_epoch_cycles has to stay equal to n_samples.
        runner.train(n_epochs=100,
                     batch_size=1000,
                     n_epoch_cycles=n_samples)
示例#15
0
    def test_tnpg_inverted_pendulum(self):
        """Train TNPG on InvertedPendulum-v2 and check the average return."""
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            pendulum = normalize(GymEnv('InvertedPendulum-v2'))

            mlp_policy = GaussianMLPPolicy(
                name='policy',
                env_spec=pendulum.spec,
                hidden_sizes=(32, 32),
            )
            linear_baseline = LinearFeatureBaseline(env_spec=pendulum.spec)

            tnpg = TNPG(
                env_spec=pendulum.spec,
                policy=mlp_policy,
                baseline=linear_baseline,
                discount=0.99,
                optimizer_args={'reg_coeff': 5e-1},
            )

            trainer.setup(tnpg, pendulum, sampler_cls=LocalSampler)

            final_avg_return = trainer.train(n_epochs=10, batch_size=10000)
            assert final_avg_return > 15

            pendulum.close()
示例#16
0
    def test_rl2_sampler_invalid_num_of_env_again(self):
        """Expect a ValueError when n_envs is meta_batch_size + 1."""
        # One more env than meta_batch_size, as in the sampler_args below.
        bad_n_envs = self.meta_batch_size + 1

        with pytest.raises(
                ValueError,
                match='n_envs must be a multiple of meta_batch_size'), \
                LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            mlp_policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                           hidden_sizes=[32, 32])
            linear_baseline = LinearFeatureBaseline(env_spec=self.env.spec)

            ppo = PPO(
                env_spec=self.env.spec,
                policy=mlp_policy,
                baseline=linear_baseline,
                max_path_length=self.max_path_length,
                discount=0.99,
            )

            rl2_sampler_args = {
                'meta_batch_size': self.meta_batch_size,
                'n_envs': bad_n_envs,
            }
            runner.setup(ppo,
                         env=self.env,
                         sampler_cls=RL2Sampler,
                         sampler_args=rl2_sampler_args)
            runner._start_worker()
            runner._sampler.obtain_samples(0)
示例#17
0
    def test_tnpg_inverted_pendulum(self):
        """Train TNPG on InvertedPendulum-v2 and check the average return."""
        pendulum = TfEnv(normalize(gym.make("InvertedPendulum-v2")))

        mlp_policy = GaussianMLPPolicy(
            name="policy",
            env_spec=pendulum.spec,
            hidden_sizes=(32, 32),
        )
        linear_baseline = LinearFeatureBaseline(env_spec=pendulum.spec)

        tnpg_kwargs = {
            'env': pendulum,
            'policy': mlp_policy,
            'baseline': linear_baseline,
            'batch_size': 10000,
            'max_path_length': 100,
            'n_itr': 10,
            'discount': 0.99,
            'optimizer_args': {'reg_coeff': 5e-1},
        }
        tnpg = TNPG(**tnpg_kwargs)

        final_avg_return = tnpg.train(sess=self.sess)
        assert final_avg_return > 30

        pendulum.close()
示例#18
0
    def test_ppo_pendulum_continuous_baseline(self):
        """Train PPO with a ContinuousMLPBaseline on InvertedDoublePendulum-v2."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            pendulum = GarageEnv(
                normalize(gym.make('InvertedDoublePendulum-v2')))

            mlp_policy = GaussianMLPPolicy(env_spec=pendulum.spec,
                                           hidden_sizes=(64, 64),
                                           hidden_nonlinearity=tf.nn.tanh,
                                           output_nonlinearity=None)
            continuous_baseline = ContinuousMLPBaseline(
                env_spec=pendulum.spec,
                regressor_args={'hidden_sizes': (32, 32)},
            )

            ppo_kwargs = {
                'env_spec': pendulum.spec,
                'policy': mlp_policy,
                'baseline': continuous_baseline,
                'max_path_length': 100,
                'discount': 0.99,
                'gae_lambda': 0.95,
                'lr_clip_range': 0.2,
                'optimizer_args': {'batch_size': 32, 'max_epochs': 10},
                'stop_entropy_gradient': True,
                'entropy_method': 'max',
                'policy_ent_coeff': 0.02,
                'center_adv': False,
            }
            ppo = PPO(**ppo_kwargs)

            runner.setup(ppo, pendulum, sampler_cls=LocalSampler)
            final_avg_return = runner.train(n_epochs=10, batch_size=2048)
            assert final_avg_return > 100

            pendulum.close()
示例#19
0
    def test_tnpg_inverted_pendulum(self):
        """Train TNPG on InvertedPendulum-v2 and check the average return."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            pendulum = GarageEnv(normalize(gym.make('InvertedPendulum-v2')))

            mlp_policy = GaussianMLPPolicy(
                name='policy',
                env_spec=pendulum.spec,
                hidden_sizes=(32, 32),
            )
            linear_baseline = LinearFeatureBaseline(env_spec=pendulum.spec)

            tnpg = TNPG(
                env_spec=pendulum.spec,
                policy=mlp_policy,
                baseline=linear_baseline,
                max_path_length=100,
                discount=0.99,
                optimizer_args={'reg_coeff': 5e-1},
            )

            runner.setup(tnpg, pendulum)

            final_avg_return = runner.train(n_epochs=10, batch_size=10000)
            assert final_avg_return > 15

            pendulum.close()
示例#20
0
def multi_env_trpo(ctxt=None, seed=1):
    """Train one TRPO agent on two PointEnv instances with opposite goals.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        # Two point environments whose goals sit at x = -1 and x = +1.
        left_env = normalize(PointEnv(goal=(-1., 0.), max_episode_length=100))
        right_env = normalize(PointEnv(goal=(1., 0.), max_episode_length=100))
        multi_env = MultiEnvWrapper([left_env, right_env])

        mlp_policy = GaussianMLPPolicy(env_spec=multi_env.spec)
        linear_baseline = LinearFeatureBaseline(env_spec=multi_env.spec)

        ray_sampler = RaySampler(
            agents=mlp_policy,
            envs=multi_env,
            max_episode_length=multi_env.spec.max_episode_length,
            is_tf_worker=True,
        )

        trpo = TRPO(
            env_spec=multi_env.spec,
            policy=mlp_policy,
            baseline=linear_baseline,
            sampler=ray_sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
        )

        trainer.setup(trpo, multi_env)
        trainer.train(n_epochs=40, batch_size=2048, plot=False)
示例#21
0
def run_task(vv):
    """Train TRPO on HalfCheetah-v1 with a variant-supplied step size.

    Args:
        vv (dict): Variant mapping; only ``vv["step_size"]`` is read.

    """
    cheetah = TfEnv(normalize(gym.make('HalfCheetah-v1')))

    mlp_policy = GaussianMLPPolicy(
        env_spec=cheetah.spec,
        hidden_sizes=(32, 32),
        name="policy",
    )
    linear_baseline = LinearFeatureBaseline(env_spec=cheetah.spec)

    # Plotting is left disabled; passing plot=True to TRPO would enable it.
    trpo = TRPO(
        env=cheetah,
        policy=mlp_policy,
        baseline=linear_baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=vv["step_size"],
    )
    trpo.train()
    def test_dm_control_tf_policy(self):
        """Run one short TRPO iteration on the first dm_control task."""
        first_task = ALL_TASKS[0]

        with self.graph.as_default():
            # Each task entry holds the domain name, then the task name.
            dm_env = TfEnv(
                DmControlEnv(domain_name=first_task[0],
                             task_name=first_task[1]))

            mlp_policy = GaussianMLPPolicy(env_spec=dm_env.spec,
                                           hidden_sizes=(32, 32))
            linear_baseline = LinearFeatureBaseline(env_spec=dm_env.spec)

            trpo_kwargs = {
                'env': dm_env,
                'policy': mlp_policy,
                'baseline': linear_baseline,
                'batch_size': 10,
                'max_path_length': 5,
                'n_itr': 1,
                'discount': 0.99,
                'step_size': 0.01,
            }
            TRPO(**trpo_kwargs).train()
示例#23
0
def run_task(*_):
    """
    Train PPO on InvertedDoublePendulum-v2 inside a run_task function.

    :param _: Ignored positional arguments.
    :return: None
    """
    pendulum = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

    mlp_policy = GaussianMLPPolicy(env_spec=pendulum.spec,
                                   hidden_sizes=(64, 64))
    mlp_baseline = GaussianMLPBaseline(env_spec=pendulum.spec)

    ppo_kwargs = {
        'env': pendulum,
        'policy': mlp_policy,
        'baseline': mlp_baseline,
        'batch_size': 2048,
        'max_path_length': 100,
        'n_itr': 488,
        'discount': 0.99,
        'step_size': 0.01,
        'optimizer_args': {'batch_size': 32, 'max_epochs': 10},
        'plot': False,
    }
    PPO(**ppo_kwargs).train()
示例#24
0
def trpo_garage_tf(ctxt, env_id, seed):
    """Build a garage TensorFlow TRPO agent and train it on a gym task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    # Seeding happens before any environment or network is constructed.
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        task_env = GarageEnv(normalize(gym.make(env_id)))

        mlp_policy = GaussianMLPPolicy(
            env_spec=task_env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        linear_baseline = LinearFeatureBaseline(env_spec=task_env.spec)

        trpo = TRPO(
            env_spec=task_env.spec,
            policy=mlp_policy,
            baseline=linear_baseline,
            max_path_length=hyper_parameters['max_path_length'],
            discount=hyper_parameters['discount'],
            gae_lambda=hyper_parameters['gae_lambda'],
            max_kl_step=hyper_parameters['max_kl'],
        )

        runner.setup(trpo, task_env)
        runner.train(
            n_epochs=hyper_parameters['n_epochs'],
            batch_size=hyper_parameters['batch_size'],
        )
    def test_get_action(self, obs_dim, action_dim):
        """Check that single and batched sampled actions lie in the space."""
        box_env = GarageEnv(
            DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        flat_obs_dim = box_env.observation_space.flat_dim
        obs_input = tf.compat.v1.placeholder(tf.float32,
                                             shape=[None, None, flat_obs_dim],
                                             name='obs')
        mlp_policy = GaussianMLPPolicy(env_spec=box_env.spec)
        mlp_policy.build(obs_input)

        box_env.reset()
        observation, _, _, _ = box_env.step(1)

        # Single observation.
        single_action, _ = mlp_policy.get_action(observation.flatten())
        assert box_env.action_space.contains(single_action)

        # Batch of three flattened copies of the same observation.
        obs_batch = [observation.flatten() for _ in range(3)]
        batch_actions, _ = mlp_policy.get_actions(obs_batch)
        for sampled in batch_actions:
            assert box_env.action_space.contains(sampled)
示例#26
0
def run_garage(env, seed, log_dir):
    '''
    Build a garage PPO model, train it, and log progress to log_dir.

    Swap PPO for whichever algorithm should be run instead.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular progress CSV file inside log_dir.
    '''
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        wrapped_env = TfEnv(normalize(env))

        mlp_policy = GaussianMLPPolicy(env_spec=wrapped_env.spec,
                                       hidden_sizes=(64, 64),
                                       hidden_nonlinearity=tf.nn.tanh,
                                       output_nonlinearity=None)

        # The baseline regressor is fit with a first-order optimizer
        # rather than a trust region.
        regressor_settings = {
            'hidden_sizes': (64, 64),
            'use_trust_region': False,
            'optimizer': FirstOrderOptimizer,
            'optimizer_args': {
                'batch_size': 32,
                'max_epochs': 10,
                'tf_optimizer_args': {'learning_rate': 1e-3},
            },
        }
        mlp_baseline = GaussianMLPBaseline(env_spec=wrapped_env.spec,
                                           regressor_args=regressor_settings)

        ppo_optimizer_settings = {
            'batch_size': 32,
            'max_epochs': 10,
            'tf_optimizer_args': {'learning_rate': 1e-3},
        }
        ppo = PPO(env_spec=wrapped_env.spec,
                  policy=mlp_policy,
                  baseline=mlp_baseline,
                  max_path_length=100,
                  discount=0.99,
                  gae_lambda=0.95,
                  lr_clip_range=0.2,
                  policy_ent_coeff=0.0,
                  optimizer_args=ppo_optimizer_settings,
                  plot=False)

        # run_experiment is not used here, so the logger outputs are
        # attached by hand.
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        log_outputs = (
            StdOutput(),
            CsvOutput(tabular_log_file),
            TensorBoardOutput(log_dir),
        )
        for output in log_outputs:
            garage_logger.add_output(output)

        runner.setup(ppo, wrapped_env)
        runner.train(n_epochs=488, batch_size=2048)

        garage_logger.remove_all()

        return tabular_log_file
示例#27
0
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.misc.instrument import stub
from garage.misc.instrument import run_experiment

from garage.tf.algos import TRPO
from garage.tf.policies import GaussianMLPPolicy
from garage.tf.envs import TfEnv

from sandbox.embed2learn.envs.mujoco import PR2ArmEnv

env = TfEnv(normalize(PR2ArmEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(32, 32),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=100,
    discount=0.99,
    step_size=0.01,
    plot=True,
示例#28
0
class TestGaussianMLPPolicyWithModelTransit(TfGraphTestCase):
    """Compare GaussianMLPPolicyWithModel against GaussianMLPPolicy.

    ``policy1``/``policy2`` are ``GaussianMLPPolicy`` instances and
    ``policy3``/``policy4`` are ``GaussianMLPPolicyWithModel`` instances
    built with the same ``init_std`` values (1.0 and 1.2).  In
    ``setup_method`` the parameters of policy3/policy4 are assigned into
    policy1/policy2, and each test then asserts that the paired policies
    produce equal outputs.
    """

    def setup_method(self):
        """Build the four policies, copy parameters, and create symbols."""
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            super().setup_method()
            self.box_env = TfEnv(DummyBoxEnv())
            self.policy1 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.0,
                                             name='P1')
            self.policy2 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.2,
                                             name='P2')
            self.policy3 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.0,
                                                      name='P3')
            self.policy4 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.2,
                                                      name='P4')

            self.sess.run(tf.global_variables_initializer())

            # Overwrite the plain policies' parameters with those of the
            # model-based policies so each pair starts from equal weights.
            for a, b in zip(self.policy3.get_params(),
                            self.policy1.get_params()):
                self.sess.run(tf.assign(b, a))
            for a, b in zip(self.policy4.get_params(),
                            self.policy2.get_params()):
                self.sess.run(tf.assign(b, a))

            self.obs = [self.box_env.reset()]
            self.obs_ph = tf.placeholder(
                tf.float32,
                shape=(None, self.box_env.observation_space.flat_dim))
            self.action_ph = tf.placeholder(
                tf.float32, shape=(None, self.box_env.action_space.flat_dim))

            self.dist1_sym = self.policy1.dist_info_sym(self.obs_ph,
                                                        name='p1_sym')
            self.dist2_sym = self.policy2.dist_info_sym(self.obs_ph,
                                                        name='p2_sym')
            self.dist3_sym = self.policy3.dist_info_sym(self.obs_ph,
                                                        name='p3_sym')
            self.dist4_sym = self.policy4.dist_info_sym(self.obs_ph,
                                                        name='p4_sym')

            assert self.policy1.vectorized == self.policy2.vectorized
            assert self.policy3.vectorized == self.policy4.vectorized

    def test_dist_info_sym_output(self):
        """Paired policies yield equal mean and log_std for the same obs."""
        feed = {self.obs_ph: self.obs}
        dist1 = self.sess.run(self.dist1_sym, feed_dict=feed)
        dist2 = self.sess.run(self.dist2_sym, feed_dict=feed)
        dist3 = self.sess.run(self.dist3_sym, feed_dict=feed)
        dist4 = self.sess.run(self.dist4_sym, feed_dict=feed)

        assert np.array_equal(dist1['mean'], dist3['mean'])
        assert np.array_equal(dist1['log_std'], dist3['log_std'])
        assert np.array_equal(dist2['mean'], dist4['mean'])
        assert np.array_equal(dist2['log_std'], dist4['log_std'])

    @mock.patch('numpy.random.normal')
    def test_get_action(self, mock_rand):
        """Paired policies sample equal actions when the noise is mocked."""
        mock_rand.return_value = 0.5
        action1, _ = self.policy1.get_action(self.obs)
        action2, _ = self.policy2.get_action(self.obs)
        action3, _ = self.policy3.get_action(self.obs)
        action4, _ = self.policy4.get_action(self.obs)

        assert np.array_equal(action1, action3)
        assert np.array_equal(action2, action4)

        actions1, dist_info1 = self.policy1.get_actions([self.obs])
        actions2, dist_info2 = self.policy2.get_actions([self.obs])
        actions3, dist_info3 = self.policy3.get_actions([self.obs])
        actions4, dist_info4 = self.policy4.get_actions([self.obs])

        assert np.array_equal(actions1, actions3)
        assert np.array_equal(actions2, actions4)

        assert np.array_equal(dist_info1['mean'], dist_info3['mean'])
        assert np.array_equal(dist_info1['log_std'], dist_info3['log_std'])
        assert np.array_equal(dist_info2['mean'], dist_info4['mean'])
        assert np.array_equal(dist_info2['log_std'], dist_info4['log_std'])

    def test_kl_sym(self):
        """KL(policy1 || policy2) equals KL(policy3 || policy4)."""
        kl_diff_sym1 = self.policy1.distribution.kl_sym(
            self.dist1_sym, self.dist2_sym)
        objective1 = tf.reduce_mean(kl_diff_sym1)

        # The function is compiled with a single input placeholder, so it
        # is called with exactly one value.
        kl_func = tensor_utils.compile_function([self.obs_ph], objective1)
        kl1 = kl_func(self.obs)

        kl_diff_sym2 = self.policy3.distribution.kl_sym(
            self.dist3_sym, self.dist4_sym)
        objective2 = tf.reduce_mean(kl_diff_sym2)

        kl_func = tensor_utils.compile_function([self.obs_ph], objective2)
        kl2 = kl_func(self.obs)

        # Exact equality is required; an additional approximate comparison
        # would be implied by it.
        assert np.array_equal(kl1, kl2)

    def test_log_likehihood_sym(self):
        """Log-likelihood symbols agree for both policy pairs."""
        pairs = (
            (self.policy1, self.dist1_sym, self.policy3, self.dist3_sym),
            (self.policy2, self.dist2_sym, self.policy4, self.dist4_sym),
        )
        for plain, plain_sym, modeled, modeled_sym in pairs:
            log_prob_sym1 = plain.distribution.log_likelihood_sym(
                self.action_ph, plain_sym)
            log_prob_func = tensor_utils.compile_function(
                [self.obs_ph, self.action_ph], log_prob_sym1)
            log_prob1 = log_prob_func(self.obs, [[1, 1]])

            # The model-based policy's distribution is reached through its
            # 'default' network.
            log_prob_sym2 = modeled.model.networks[
                'default'].dist.log_likelihood_sym(self.action_ph,
                                                   modeled_sym)
            log_prob_func2 = tensor_utils.compile_function(
                [self.obs_ph, self.action_ph], log_prob_sym2)
            log_prob2 = log_prob_func2(self.obs, [[1, 1]])
            assert log_prob1 == log_prob2

    def test_policy_entropy_sym(self):
        """Entropy symbols of policy1 and policy3 evaluate equally."""
        entropy_sym1 = self.policy1.distribution.entropy_sym(
            self.dist1_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym1)
        entropy1 = entropy_func(self.obs)

        # Distinct op name so the two symbols are not confused in the graph.
        entropy_sym2 = self.policy3.distribution.entropy_sym(
            self.dist3_sym, name='entropy_sym2')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym2)
        entropy2 = entropy_func(self.obs)
        assert entropy1 == entropy2

    def test_likelihood_ratio_sym(self):
        """Likelihood-ratio symbols agree between the two policy pairs."""
        likelihood_ratio_sym1 = self.policy1.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist1_sym,
            self.dist2_sym,
            name='li_ratio_sym1')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym1)
        likelihood_ratio1 = likelihood_ratio_func([[1, 1]], self.obs)

        likelihood_ratio_sym2 = self.policy3.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist3_sym,
            self.dist4_sym,
            name='li_ratio_sym2')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym2)
        likelihood_ratio2 = likelihood_ratio_func([[1, 1]], self.obs)

        assert likelihood_ratio1 == likelihood_ratio2
示例#29
0
"""
Example using TRPO with ISSampler, iterations alternate between live and
importance sampled iterations.
"""
import gym

from garage.baselines import LinearFeatureBaseline
from garage.contrib.alexbeloi.is_sampler import ISSampler
from garage.envs import normalize
from garage.tf.algos import TRPO
from garage.tf.policies import GaussianMLPPolicy

env = normalize(gym.make('InvertedPendulum-v2'))

policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

optimizer_args = dict(
    # debug_nan=True,
    # reg_coeff=0.1,
    # cg_iters=2
)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=200,
            discount=0.99,
 def test_clone(self):
     """Check that a cloned policy keeps the original policy's env_spec."""
     env = GarageEnv(DummyBoxEnv(obs_dim=(10, ), action_dim=(4, )))
     policy = GaussianMLPPolicy(env_spec=env.spec)
     # The string argument is passed to clone() exactly as spelled here.
     policy_clone = policy.clone('GaussnaMLPPolicyClone')
     assert policy.env_spec == policy_clone.env_spec