Example #1
def test_in_local_sampler(policy, envs):
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_TRAJ,
                                 max_path_length=MAX_PATH_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=VecWorker,
                                worker_args=dict(n_envs=N_TRAJ),
                                max_path_length=MAX_PATH_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, policy, [envs])
    n_samples = 100

    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)
    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    # Test start_rollout optimization

    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)
    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
Example #2
def test_in_local_sampler(policy, envs, other_envs, timesteps_per_call):
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_EPS,
                                 max_episode_length=MAX_EPISODE_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    worker_args = dict(n_envs=N_EPS, timesteps_per_call=timesteps_per_call)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=FragmentWorker,
                                worker_args=worker_args,
                                max_episode_length=MAX_EPISODE_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, policy, [envs])
    n_samples = 400

    true_eps = true_sampler.obtain_samples(0, n_samples, None)
    sliced_true_eps = slice_episodes(true_eps, timesteps_per_call)

    vec_eps = vec_sampler.obtain_samples(0, 50, None)
    for test_eps in vec_eps.split():
        assert any(eps_eq(true_eps, test_eps) for true_eps in sliced_true_eps)

    true_eps = true_sampler.obtain_samples(0,
                                           n_samples,
                                           None,
                                           env_update=other_envs)
    sliced_true_eps = slice_episodes(true_eps, timesteps_per_call)

    vec_eps = vec_sampler.obtain_samples(0, 50, None, env_update=[other_envs])
    for test_eps in vec_eps.split():
        assert any(eps_eq(true_eps, test_eps) for true_eps in sliced_true_eps)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
Example #3
def test_onehots_consistent_with_task_sampler():
    # Import and construct environments here to avoid using up too many
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld
    mt10 = metaworld.MT10()
    env = MetaWorldSetTaskEnv(mt10, 'train', add_env_onehot=True)
    policy = RandomPolicy(env.action_space)
    workers = WorkerFactory(seed=100, max_episode_length=1, n_workers=10)
    sampler1 = LocalSampler.from_worker_factory(workers, policy, env)
    env_ups = [
        SetTaskUpdate(MetaWorldSetTaskEnv, task, None)
        for task in env.sample_tasks(10)
    ]
    samples1 = sampler1.obtain_exact_episodes(1, policy, env_ups)
    task_sampler = MetaWorldTaskSampler(mt10, 'train', add_env_onehot=True)
    env_ups = task_sampler.sample(10)
    sampler2 = LocalSampler.from_worker_factory(workers, policy, env_ups)
    samples2 = sampler2.obtain_exact_episodes(1, policy, env_ups)
    name_to_obs1 = {}
    for obs1, name1 in zip(samples1.observations,
                           samples1.env_infos['task_name']):
        name_to_obs1[name1] = obs1
    for obs2, name2 in zip(samples2.observations,
                           samples2.env_infos['task_name']):
        assert (name_to_obs1[name2][-10:] == obs2[-10:]).all()
Example #4
    def test_dm_control_tf_policy(self):
        task = ALL_TASKS[0]

        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            env = DMControlEnv.from_suite(*task)

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            sampler = LocalSampler(
                agents=policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True)

            algo = TRPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                sampler=sampler,
                discount=0.99,
                max_kl_step=0.01,
            )

            trainer.setup(algo, env)
            trainer.train(n_epochs=1, batch_size=10)

            env.close()
Example #5
        def train_ppo(ctxt=None):
            set_seed(seed)
            with TFTrainer(ctxt) as trainer:
                env = MyGymEnv(gym_env, max_episode_length=100)
                policy = CategoricalGRUPolicy(name='policy',
                                              env_spec=env.spec,
                                              state_include_action=False)
                baseline = LinearFeatureBaseline(env_spec=env.spec)
                sampler = LocalSampler(
                    agents=policy,
                    envs=env,
                    max_episode_length=env.spec.max_episode_length,
                    worker_class=FragmentWorker,
                    is_tf_worker=True,
                )
                self.algo = LoggedPPO(
                    env=env,
                    env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    center_adv=False,
                    optimizer_args=dict(max_optimization_epochs=8))

                trainer.setup(self.algo, env)
                trainer.train(n_epochs=n_eps, batch_size=4000)
                return self.algo.rew_chkpts
Example #6
def trpo_cartpole(ctxt=None, seed=1):
    """Train TRPO with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(agents=policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=4000)
Example #7
    def test_categorical_policies(self, policy_cls):
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            env = normalize(GymEnv('CartPole-v0', max_episode_length=100))

            policy = policy_cls(name='policy', env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            sampler = LocalSampler(
                agents=policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True)

            algo = TRPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                sampler=sampler,
                discount=0.99,
                max_kl_step=0.01,
                optimizer=ConjugateGradientOptimizer,
                optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                    base_eps=1e-5)),
            )

            trainer.setup(algo, env)
            trainer.train(n_epochs=1, batch_size=4000)

            env.close()
Example #8
def test_obtain_exact_episodes():
    max_episode_length = 15
    n_workers = 8
    env = PointEnv()
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_episode_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_eps_per_worker = 3
    episodes = sampler.obtain_exact_episodes(n_eps_per_worker,
                                             agent_update=policies)
    # At least one action per episode.
    assert sum(episodes.lengths) >= n_workers * n_eps_per_worker
    # All of the episodes.
    assert len(episodes.lengths) == n_workers * n_eps_per_worker
    worker = -1
    for count, eps in enumerate(episodes.split()):
        if count % n_eps_per_worker == 0:
            worker += 1
        assert (eps.actions == per_worker_actions[worker]).all()
Example #9
def sac_half_cheetah_batch(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(snapshot_config=ctxt)
    env = normalize(GymEnv('HalfCheetah-v2'))

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=1000,
              max_episode_length_eval=1000,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=256,
              reward_scale=1.,
              steps_per_epoch=1)

    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    trainer.setup(algo=sac, env=env)
    trainer.train(n_epochs=1000, batch_size=1000)
Example #10
    def test_set_plot(self):
        deterministic.set_seed(1)
        with TFTrainer(snapshot_config) as trainer:
            env = GymEnv('CartPole-v1')

            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=(8, 8))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            sampler = LocalSampler(
                agents=policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True)

            algo = VPG(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       sampler=sampler,
                       discount=0.99,
                       optimizer_args=dict(learning_rate=0.01, ))

            trainer.setup(algo, env)
            trainer.train(n_epochs=1, batch_size=100, plot=True)

            assert isinstance(trainer._plotter, Plotter), (
                'self.plotter in TFTrainer should be set to Plotter.')
Example #11
def test_obtain_exact_trajectories():
    max_path_length = 15
    n_workers = 8
    env = GarageEnv(PointEnv())
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_path_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_traj_per_worker = 3
    rollouts = sampler.obtain_exact_trajectories(n_traj_per_worker,
                                                 agent_update=policies)
    # At least one action per trajectory.
    assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
    # All of the trajectories.
    assert len(rollouts.lengths) == n_workers * n_traj_per_worker
    worker = -1
    for count, rollout in enumerate(rollouts.split()):
        if count % n_traj_per_worker == 0:
            worker += 1
        assert (rollout.actions == per_worker_actions[worker]).all()
Example #12
    def test_rl2_ppo_pendulum_wrong_worker(self):
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            with pytest.raises(ValueError):
                sampler = LocalSampler(
                    agents=self.policy,
                    envs=self.tasks.sample(self.meta_batch_size),
                    max_episode_length=self.env_spec.max_episode_length,
                    is_tf_worker=True,
                    n_workers=self.meta_batch_size)
                algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                              task_sampler=self.tasks,
                              env_spec=self.env_spec,
                              policy=self.policy,
                              baseline=self.baseline,
                              sampler=sampler,
                              discount=0.99,
                              gae_lambda=0.95,
                              lr_clip_range=0.2,
                              optimizer_args=dict(
                                  batch_size=32,
                                  max_optimization_epochs=10,
                              ),
                              stop_entropy_gradient=True,
                              entropy_method='max',
                              policy_ent_coeff=0.02,
                              center_adv=False,
                              episodes_per_trial=self.episode_per_task)

                trainer.setup(algo, self.tasks.sample(self.meta_batch_size))

                trainer.train(n_epochs=10,
                              batch_size=self.episode_per_task *
                              self.max_episode_length * self.meta_batch_size)
Example #13
    def test_trpo_lstm_cartpole(self):
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            env = normalize(GymEnv('CartPole-v1', max_episode_length=100))

            policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            sampler = LocalSampler(
                agents=policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        sampler=sampler,
                        discount=0.99,
                        max_kl_step=0.01,
                        optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                            base_eps=1e-5)))

            snapshotter.snapshot_dir = './'
            trainer.setup(algo, env)
            last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 60

            env.close()
Example #14
        def train_gru_trpo(ctxt=None):
            set_seed(seed)
            with TFTrainer(snapshot_config=ctxt) as trainer:
                env = MyGymEnv(gym_env, max_episode_length=100)
                policy = CategoricalGRUPolicy(name='policy',
                                              env_spec=env.spec,
                                              state_include_action=False)
                baseline = LinearFeatureBaseline(env_spec=env.spec)
                sampler = LocalSampler(
                    agents=policy,
                    envs=env,
                    max_episode_length=env.spec.max_episode_length,
                    worker_class=FragmentWorker,
                )
                self.algo = LoggedTRPO(
                    env=env,
                    env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                        base_eps=1e-5)))

                trainer.setup(self.algo, env)
                trainer.train(n_epochs=n_eps, batch_size=4000)
                return self.algo.rew_chkpts
Example #15
    def test_te_ppo(self):
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            sampler = LocalSampler(
                agents=self.policy,
                envs=self.env,
                max_episode_length=self.env.spec.max_episode_length,
                is_tf_worker=True,
                worker_class=TaskEmbeddingWorker)
            algo = TEPPO(env_spec=self.env.spec,
                         policy=self.policy,
                         baseline=self.baseline,
                         inference=self.inference,
                         sampler=sampler,
                         discount=0.99,
                         lr_clip_range=0.2,
                         policy_ent_coeff=self.policy_ent_coeff,
                         encoder_ent_coeff=self.encoder_ent_coeff,
                         inference_ce_coeff=self.inference_ce_coeff,
                         use_softplus_entropy=True,
                         optimizer_args=dict(
                             batch_size=32,
                             max_optimization_epochs=10,
                         ),
                         inference_optimizer_args=dict(
                             batch_size=32,
                             max_optimization_epochs=10,
                         ),
                         center_adv=True,
                         stop_ce_gradient=True)

            trainer.setup(algo, self.env)
            trainer.train(n_epochs=1, batch_size=self.batch_size, plot=False)
Example #16
 def train_sac(ctxt=None):
     trainer = Trainer(ctxt)
     env = MyGymEnv(gym_env, max_episode_length=100)
     policy = CategoricalGRUPolicy(name='policy',
                                   env_spec=env.spec,
                                   state_include_action=False).to(
                                       global_device())
     qf1 = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
     qf2 = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
     replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
     sampler = LocalSampler(
         agents=policy,
         envs=env,
         max_episode_length=env.spec.max_episode_length,
         worker_class=FragmentWorker)
     self.algo = LoggedSAC(env=env,
                           env_spec=env.spec,
                           policy=policy,
                           qf1=qf1,
                           qf2=qf2,
                           sampler=sampler,
                           gradient_steps_per_itr=1000,
                           max_episode_length_eval=100,
                           replay_buffer=replay_buffer,
                           min_buffer_size=1e4,
                           target_update_tau=5e-3,
                           discount=0.99,
                           buffer_batch_size=256,
                           reward_scale=1.,
                           steps_per_epoch=1)
     trainer.setup(self.algo, env)
     trainer.train(n_epochs=n_eps, batch_size=4000)
     return self.algo.rew_chkpts
Example #17
    def test_erwr_cartpole(self):
        """Test ERWR with Cartpole-v1 environment."""
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            deterministic.set_seed(1)
            env = GymEnv('CartPole-v1')

            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            sampler = LocalSampler(
                agents=policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True)

            algo = ERWR(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        sampler=sampler,
                        discount=0.99)

            trainer.setup(algo, env)

            last_avg_ret = trainer.train(n_epochs=10, batch_size=10000)
            assert last_avg_ret > 60

            env.close()
Example #18
    def test_reps_cartpole(self):
        """Test REPS with gym Cartpole environment."""
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            env = GymEnv('CartPole-v0')

            policy = CategoricalMLPPolicy(env_spec=env.spec,
                                          hidden_sizes=[32, 32])

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            sampler = LocalSampler(
                agents=policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True)

            algo = REPS(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        sampler=sampler,
                        discount=0.99)

            trainer.setup(algo, env)

            last_avg_ret = trainer.train(n_epochs=10, batch_size=4000)
            assert last_avg_ret > 5

            env.close()
Example #19
    def test_rl2_ppo_pendulum(self):
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            sampler = LocalSampler(
                agents=self.policy,
                envs=self.tasks.sample(self.meta_batch_size),
                max_episode_length=self.env_spec.max_episode_length,
                is_tf_worker=True,
                n_workers=self.meta_batch_size,
                worker_class=RL2Worker,
                worker_args=dict(n_episodes_per_trial=self.episode_per_task))
            algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                          task_sampler=self.tasks,
                          env_spec=self.env_spec,
                          policy=self.policy,
                          baseline=self.baseline,
                          sampler=sampler,
                          discount=0.99,
                          gae_lambda=0.95,
                          lr_clip_range=0.2,
                          stop_entropy_gradient=True,
                          entropy_method='max',
                          policy_ent_coeff=0.02,
                          center_adv=False,
                          episodes_per_trial=self.episode_per_task)

            trainer.setup(algo, self.tasks.sample(self.meta_batch_size))

            last_avg_ret = trainer.train(n_epochs=1,
                                         batch_size=self.episode_per_task *
                                         self.max_episode_length *
                                         self.meta_batch_size)
            assert last_avg_ret > -40
Example #20
    def test_train(self):
        with TFTrainer(snapshot_config) as trainer:
            env = GymEnv('CartPole-v1')

            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=(8, 8))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            sampler = LocalSampler(
                agents=policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True)

            algo = VPG(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       sampler=sampler,
                       discount=0.99,
                       optimizer_args=dict(learning_rate=0.01, ))

            trainer.setup(algo, env)
            trainer.train(n_epochs=1, batch_size=100)
Example #21
def expert_source(env, goal, max_episode_length, n_eps):
    expert = OptimalPolicy(env.spec, goal=goal)
    workers = WorkerFactory(seed=100, max_episode_length=max_episode_length)
    expert_sampler = LocalSampler.from_worker_factory(workers, expert, env)
    for _ in range(n_eps):
        eps_batch = expert_sampler.obtain_samples(0, max_episode_length, None)
        yield TimeStepBatch.from_episode_batch(eps_batch)
Example #22
 def __init__(self, env, max_episode_length):
     self.env = env
     self.policy = RandomPolicy(self.env.spec.action_space)
     self.max_episode_length = max_episode_length
     self.sampler = LocalSampler(agents=self.policy,
                                 envs=self.env,
                                 max_episode_length=self.max_episode_length)
Example #23
    def test_local_batch_sampler(self):
        workers = WorkerFactory(seed=100,
                                max_path_length=self.algo.max_path_length)
        sampler1 = LocalSampler.from_worker_factory(workers, self.policy,
                                                    self.env)
        sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
        sampler2.start_worker()
        trajs1 = sampler1.obtain_samples(
            0, 1000, tuple(self.algo.policy.get_param_values()))
        trajs2 = sampler2.obtain_samples(0, 1000)
        # pylint: disable=superfluous-parens
        assert trajs1.observations.shape[0] >= 1000
        assert trajs1.actions.shape[0] >= 1000
        assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
            trajs2[0]['rewards']) == 1)

        true_obs = np.array([0, 1, 2, 6, 10, 14])
        true_actions = np.array([2, 2, 1, 1, 1, 2])
        true_rewards = np.array([0, 0, 0, 0, 0, 1])
        start = 0
        for length in trajs1.lengths:
            observations = trajs1.observations[start:start + length]
            actions = trajs1.actions[start:start + length]
            rewards = trajs1.rewards[start:start + length]
            assert np.array_equal(observations, true_obs)
            assert np.array_equal(actions, true_actions)
            assert np.array_equal(rewards, true_rewards)
            start += length
        sampler1.shutdown_worker()
        sampler2.shutdown_worker()
Example #24
    def setup_method(self):
        super().setup_method()
        self.meta_batch_size = 10
        self.episode_per_task = 4
        self.max_episode_length = 100
        # Avoid pickling self
        max_episode_length = 100
        self.tasks = task_sampler.SetTaskSampler(
            HalfCheetahDirEnv,
            wrapper=lambda env, _: RL2Env(
                normalize(GymEnv(env, max_episode_length=max_episode_length))))
        self.env_spec = RL2Env(
            normalize(
                GymEnv(HalfCheetahDirEnv(),
                       max_episode_length=max_episode_length))).spec

        self.policy = GaussianGRUPolicy(env_spec=self.env_spec,
                                        hidden_dim=64,
                                        state_include_action=False)
        self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)
        self.sampler = LocalSampler(
            agents=self.policy,
            envs=self.tasks.sample(self.meta_batch_size),
            max_episode_length=self.env_spec.max_episode_length,
            is_tf_worker=True,
            n_workers=self.meta_batch_size,
            worker_class=RL2Worker)
Example #25
    def test_tnpg_inverted_pendulum(self):
        """Test TNPG with InvertedPendulum-v2 environment."""
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            env = normalize(GymEnv('InvertedPendulum-v2'))

            policy = GaussianMLPPolicy(name='policy',
                                       env_spec=env.spec,
                                       hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            sampler = LocalSampler(
                agents=policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True)

            algo = TNPG(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        sampler=sampler,
                        discount=0.99,
                        optimizer_args=dict(reg_coeff=5e-1))

            trainer.setup(algo, env)

            last_avg_ret = trainer.train(n_epochs=10, batch_size=10000)
            assert last_avg_ret > 15

            env.close()
Example #26
 def test_ppo_pendulum_gru(self):
     """Test PPO with Pendulum environment and recurrent policy."""
     with TFTrainer(snapshot_config) as trainer:
         env = normalize(
             GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
         gru_policy = GaussianGRUPolicy(env_spec=env.spec)
         baseline = GaussianMLPBaseline(
             env_spec=env.spec,
             hidden_sizes=(32, 32),
         )
         sampler = LocalSampler(
             agents=gru_policy,
             envs=env,
             max_episode_length=env.spec.max_episode_length,
             is_tf_worker=True)
         algo = PPO(
             env_spec=env.spec,
             policy=gru_policy,
             baseline=baseline,
             sampler=sampler,
             discount=0.99,
             gae_lambda=0.95,
             lr_clip_range=0.2,
             optimizer_args=dict(
                 batch_size=32,
                 max_optimization_epochs=10,
             ),
             stop_entropy_gradient=True,
             entropy_method='max',
             policy_ent_coeff=0.02,
             center_adv=False,
         )
         trainer.setup(algo, env)
         last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
         assert last_avg_ret > 80
Example #27
def test_update_envs_env_update():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    episodes = sampler.obtain_samples(0,
                                      161,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for eps in episodes.split():
        mean_rewards.append(eps.rewards.mean())
        goals.append(eps.env_infos['task'][0]['goal'])
    assert len(mean_rewards) == 11
    assert len(goals) == 11
    assert np.var(mean_rewards) > 1e-2
    assert np.var(goals) > 1e-2
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))
Example #28
    def test_cem_cartpole(self):
        """Test CEM with Cartpole-v1 environment."""
        with TFTrainer(snapshot_config) as trainer:
            env = GymEnv('CartPole-v1')

            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=(32, 32))

            n_samples = 10

            sampler = LocalSampler(
                agents=policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True)

            algo = CEM(env_spec=env.spec,
                       policy=policy,
                       sampler=sampler,
                       best_frac=0.1,
                       n_samples=n_samples)

            trainer.setup(algo, env)
            rtn = trainer.train(n_epochs=10, batch_size=2048)
            assert rtn > 40

            env.close()
Example #29
def cma_es_cartpole(ctxt=None, seed=1):
    """Train CMA_ES with Cartpole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        n_samples = 20

        sampler = LocalSampler(agents=policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True)

        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     sampler=sampler,
                     n_samples=n_samples)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=1000)
Example #30
        def train_trpo(ctxt=None):
            set_seed(seed)
            with TFTrainer(snapshot_config=ctxt) as trainer:
                env = MyGymEnv(gym_env, max_episode_length=100)
                policy = CategoricalMLPPolicy(name='policy',
                                              env_spec=env.spec)
                baseline = LinearFeatureBaseline(env_spec=env.spec)
                sampler = LocalSampler(
                    agents=policy,
                    envs=env,
                    max_episode_length=env.spec.max_episode_length,
                    worker_class=FragmentWorker,
                )
                self.algo = LoggedTRPO(
                    env=env,
                    env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01,
                )

                trainer.setup(self.algo, env)
                trainer.train(n_epochs=n_eps, batch_size=4000)
                return self.algo.rew_chkpts