Exemplo n.º 1
0
        def train_dqn(ctxt=None):
            set_seed(seed)
            trainer = Trainer(ctxt)
            env = MyGymEnv(gym_env, max_episode_length=100)
            steps_per_epoch = 10
            sampler_batch_size = 4000
            num_timesteps = n_eps * steps_per_epoch * sampler_batch_size
            replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
            qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
            policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
            exploration_policy = EpsilonGreedyPolicy(
                env_spec=env.spec,
                policy=policy,
                total_timesteps=num_timesteps,
                max_epsilon=1.0,
                min_epsilon=0.01,
                decay_ratio=0.4,
            )
            sampler = LocalSampler(
                agents=exploration_policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                worker_class=FragmentWorker,
            )
            self.algo = LoggedDQN(
                env=env,
                env_spec=env.spec,
                policy=policy,
                qf=qf,
                exploration_policy=exploration_policy,
                replay_buffer=replay_buffer,
                sampler=sampler,
                steps_per_epoch=steps_per_epoch,
                qf_lr=5e-5,
                discount=0.99,
                min_buffer_size=int(1e4),
                n_train_steps=500,
                target_update_freq=30,
                buffer_batch_size=64,
            )
            trainer.setup(self.algo, env)
            trainer.train(n_epochs=n_eps, batch_size=sampler_batch_size)

            return self.algo.rew_chkpts
Exemplo n.º 2
0
def test_mtsac_inverted_double_pendulum():
    """Performance regression test of MTSAC on 2 InvDoublePendulum envs."""
    env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2']
    task_envs = [GymEnv(name, max_episode_length=100) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    trainer = Trainer(snapshot_config=snapshot_config)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
    num_tasks = 2
    buffer_batch_size = 128
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=100,
                  eval_env=[test_envs],
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    trainer.setup(mtsac, env, sampler_cls=LocalSampler)
    ret = trainer.train(n_epochs=8, batch_size=128, plot=False)
    assert ret > 0
Exemplo n.º 3
0
def dqn_cartpole(ctxt=None, seed=1):
    """Train TRPO with CubeCrash-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = GymEnv('CartPole-v0')
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
                                                 policy=policy,
                                                 total_timesteps=num_timesteps,
                                                 max_epsilon=1.0,
                                                 min_epsilon=0.02,
                                                 decay_ratio=0.1)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=exploration_policy,
                   max_episode_length=100,
                   replay_buffer=replay_buffer,
                   steps_per_epoch=steps_per_epoch,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=True,
                   n_train_steps=500,
                   target_network_update_freq=1,
                   buffer_batch_size=32)

        runner.setup(algo, env)
        runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
Exemplo n.º 4
0
def dqn_cartpole(ctxt=None, seed=24):
    """Train DQN with CartPole-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    set_seed(seed)
    runner = Trainer(ctxt)

    n_epochs = 100
    steps_per_epoch = 10
    sampler_batch_size = 512
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
    env = GymEnv('CartPole-v0')
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
    exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
                                             policy=policy,
                                             total_timesteps=num_timesteps,
                                             max_epsilon=1.0,
                                             min_epsilon=0.01,
                                             decay_ratio=0.4)
    algo = DQN(env_spec=env.spec,
               policy=policy,
               qf=qf,
               exploration_policy=exploration_policy,
               replay_buffer=replay_buffer,
               steps_per_epoch=steps_per_epoch,
               qf_lr=5e-5,
               discount=0.9,
               min_buffer_size=int(1e4),
               n_train_steps=500,
               target_update_freq=30,
               buffer_batch_size=64)

    runner.setup(algo, env, sampler_cls=LocalSampler)
    runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size)

    env.close()
Exemplo n.º 5
0
    def test_pickling(self):
        """Test pickle and unpickle."""

        deterministic.set_seed(0)
        n_epochs = 10
        steps_per_epoch = 20
        sampler_batch_size = 100
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
        policy = DeterministicMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=F.relu,
                                        output_nonlinearity=None)
        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              total_timesteps=num_timesteps,
                                              max_sigma=0.1,
                                              min_sigma=0.1)
        qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=[256, 256],
                                     hidden_nonlinearity=F.relu)
        qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=[256, 256],
                                     hidden_nonlinearity=F.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  exploration_policy=exploration_policy,
                  steps_per_epoch=steps_per_epoch,
                  grad_steps_per_env_step=1,
                  num_evaluation_episodes=1,
                  discount=0.99)
        prefer_gpu()
        td3.to()

        pickled = pickle.dumps(td3)
        unpickled = pickle.loads(pickled)
        assert unpickled
Exemplo n.º 6
0
def test_fixed_alpha():
    """Test if using fixed_alpha ensures that alpha is non differentiable."""
    # pylint: disable=unexpected-keyword-arg
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
    runner = LocalRunner(snapshot_config=snapshot_config)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=100,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=1,
              fixed_alpha=np.exp(0.5))
    runner.setup(sac, env, sampler_cls=LocalSampler)
    sac.to()
    runner.train(n_epochs=1, batch_size=100, plot=False)
    assert torch.allclose(torch.Tensor([0.5]), sac._log_alpha.cpu())
    assert not sac._use_automatic_entropy_tuning
Exemplo n.º 7
0
    def test_ddpg_double_pendulum(self):
        """Test DDPG with Pendulum environment."""
        deterministic.set_seed(0)
        trainer = Trainer(snapshot_config)
        env = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)
        policy = DeterministicMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=F.relu,
                                        output_nonlinearity=torch.tanh)

        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu)

        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

        sampler = LocalSampler(agents=exploration_policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               worker_class=FragmentWorker)

        algo = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    sampler=sampler,
                    steps_per_epoch=20,
                    n_train_steps=50,
                    min_buffer_size=int(1e4),
                    exploration_policy=exploration_policy,
                    target_update_tau=1e-2,
                    discount=0.9)

        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 45

        env.close()
Exemplo n.º 8
0
def run_exp(*_):
    with LocalRunner() as runner:
        env = GarageEnv(HalfCheetahEnv())
        # q-functions
        qf1 = ContinuousMLPQFunction(env_spec=env.spec)
        qf2 = ContinuousMLPQFunction(env_spec=env.spec)
        # replay buffer
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
        # policy
        policy = GaussianMLPPolicy(env_spec=env.spec)
        # algorithm
        algo = SAC(
            env_spec=env.spec,
            policy=policy,
            qfs=[qf1, qf2],
            replay_buffer=replay_buffer,
        )
        # setup and train
        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=1000)
Exemplo n.º 9
0
    def test_ddpg_pendulum(self):
        """Test DDPG with Pendulum environment.

        This environment has a [-3, 3] action_space bound.
        """
        deterministic.set_seed(0)
        runner = LocalRunner(snapshot_config)
        env = GarageEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = DeterministicMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=F.relu,
                                        output_nonlinearity=torch.tanh)

        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu)

        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

        algo = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    max_path_length=100,
                    steps_per_epoch=20,
                    n_train_steps=50,
                    min_buffer_size=int(1e4),
                    exploration_policy=exploration_policy,
                    target_update_tau=1e-2,
                    discount=0.9)

        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 10

        env.close()
Exemplo n.º 10
0
    def test_ddpg_double_pendulum(self):
        """Test DDPG with Pendulum environment."""
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            env = GymEnv('InvertedDoublePendulum-v2')
            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)
            exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                           policy,
                                                           sigma=0.2)
            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)
            replay_buffer = PathBuffer(capacity_in_transitions=int(1e5))
            sampler = LocalSampler(
                agents=exploration_policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True,
                worker_class=FragmentWorker)
            algo = DDPG(
                env_spec=env.spec,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                sampler=sampler,
                steps_per_epoch=20,
                target_update_tau=1e-2,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(5e3),
                exploration_policy=exploration_policy,
            )
            trainer.setup(algo, env)
            last_avg_ret = trainer.train(n_epochs=10, batch_size=100)
            assert last_avg_ret > 60

            env.close()
Exemplo n.º 11
0
    def test_dqn_cartpole_grad_clip(self):
        """Test DQN with CartPole environment."""
        deterministic.set_seed(100)
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            n_epochs = 10
            steps_per_epoch = 10
            sampler_batch_size = 500
            num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
            env = GarageEnv(gym.make('CartPole-v0'))
            replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
            qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
            policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
            epilson_greedy_policy = EpsilonGreedyPolicy(
                env_spec=env.spec,
                policy=policy,
                total_timesteps=num_timesteps,
                max_epsilon=1.0,
                min_epsilon=0.02,
                decay_ratio=0.1)
            algo = DQN(env_spec=env.spec,
                       policy=policy,
                       qf=qf,
                       exploration_policy=epilson_greedy_policy,
                       replay_buffer=replay_buffer,
                       max_path_length=100,
                       qf_lr=1e-4,
                       discount=1.0,
                       min_buffer_size=int(1e3),
                       double_q=False,
                       n_train_steps=500,
                       grad_norm_clipping=5.0,
                       steps_per_epoch=steps_per_epoch,
                       target_network_update_freq=1,
                       buffer_batch_size=32)

            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=n_epochs,
                                        batch_size=sampler_batch_size)
            assert last_avg_ret > 9

            env.close()
Exemplo n.º 12
0
    def test_td3_inverted_double_pendulum(self):
        deterministic.set_seed(0)
        n_epochs = 10
        steps_per_epoch = 20
        sampler_batch_size = 100
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        trainer = Trainer(snapshot_config=snapshot_config)
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
        policy = DeterministicMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=F.relu,
                                        output_nonlinearity=None)
        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              total_timesteps=num_timesteps,
                                              max_sigma=0.1,
                                              min_sigma=0.1)
        qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=[256, 256],
                                     hidden_nonlinearity=F.relu)
        qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=[256, 256],
                                     hidden_nonlinearity=F.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  exploration_policy=exploration_policy,
                  steps_per_epoch=steps_per_epoch,
                  grad_steps_per_env_step=1,
                  num_evaluation_episodes=1,
                  discount=0.99)

        prefer_gpu()
        td3.to()
        trainer.setup(td3, env, sampler_cls=LocalSampler)
        trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
Exemplo n.º 13
0
    def test_ddpg_pendulum_with_decayed_weights(self):
        """Test DDPG with Pendulum environment and decayed weights.

        This environment has a [-3, 3] action_space bound.
        """
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = GarageEnv(normalize(gym.make('InvertedPendulum-v2')))
            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)
            exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                           policy,
                                                           sigma=0.2)
            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)
            replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
            algo = DDPG(
                env_spec=env.spec,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                max_path_length=100,
                steps_per_epoch=20,
                target_update_tau=1e-2,
                n_train_steps=50,
                discount=0.9,
                policy_weight_decay=0.01,
                qf_weight_decay=0.01,
                min_buffer_size=int(5e3),
                exploration_policy=exploration_policy,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=100)
            assert last_avg_ret > 10

            env.close()
Exemplo n.º 14
0
    def test_dqn_cartpole_double_q(self):
        """Test DQN with CartPole environment."""
        deterministic.set_seed(100)
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            n_epochs = 10
            steps_per_epoch = 10
            sampler_batch_size = 500
            num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
            env = GymEnv('CartPole-v0')
            replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
            qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
            policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
            epilson_greedy_policy = EpsilonGreedyPolicy(
                env_spec=env.spec,
                policy=policy,
                total_timesteps=num_timesteps,
                max_epsilon=1.0,
                min_epsilon=0.02,
                decay_ratio=0.1)
            algo = DQN(env_spec=env.spec,
                       policy=policy,
                       qf=qf,
                       exploration_policy=epilson_greedy_policy,
                       replay_buffer=replay_buffer,
                       qf_lr=1e-4,
                       discount=1.0,
                       min_buffer_size=int(1e3),
                       double_q=True,
                       n_train_steps=500,
                       steps_per_epoch=steps_per_epoch,
                       target_network_update_freq=1,
                       buffer_batch_size=32)

            trainer.setup(algo, env)
            last_avg_ret = trainer.train(n_epochs=n_epochs,
                                         batch_size=sampler_batch_size)
            assert last_avg_ret > 8.8

            env.close()
Exemplo n.º 15
0
    def test_eviction_policy(self):
        obs = np.array([[1], [1]])
        replay_buffer = PathBuffer(capacity_in_transitions=3)
        replay_buffer.add_path(dict(obs=obs))

        sampled_obs = replay_buffer.sample_transitions(3)['obs']
        assert (sampled_obs == np.array([[1], [1], [1]])).all()

        sampled_path_obs = replay_buffer.sample_path()['obs']
        assert (sampled_path_obs == np.array([[1], [1]])).all()

        obs2 = np.array([[2], [3]])
        replay_buffer.add_path(dict(obs=obs2))

        # Can still sample from old path
        new_sampled_obs = replay_buffer.sample_transitions(1000)['obs']
        assert set(new_sampled_obs.flatten()) == {1, 2, 3}

        # Can't sample complete old path
        for _ in range(100):
            new_sampled_path_obs = replay_buffer.sample_path()['obs']
            assert (new_sampled_path_obs == np.array([[2], [3]])).all()
Exemplo n.º 16
0
def setup():
    set_seed(24)
    n_epochs = 11
    steps_per_epoch = 10
    sampler_batch_size = 512
    num_timesteps = 100 * steps_per_epoch * sampler_batch_size
    env = GymEnv('CartPole-v0')

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))

    policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
    exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
                                             policy=policy,
                                             total_timesteps=num_timesteps,
                                             max_epsilon=1.0,
                                             min_epsilon=0.01,
                                             decay_ratio=0.4)
    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    algo = DQN(env_spec=env.spec,
               policy=policy,
               qf=qf,
               exploration_policy=exploration_policy,
               replay_buffer=replay_buffer,
               sampler=sampler,
               steps_per_epoch=steps_per_epoch,
               qf_lr=5e-5,
               double_q=False,
               discount=0.9,
               min_buffer_size=int(1e4),
               n_train_steps=500,
               target_update_freq=30,
               buffer_batch_size=64)

    return algo, env, replay_buffer, n_epochs, sampler_batch_size
Exemplo n.º 17
0
def load_sac(env_name="MountainCarContinuous-v0"):
    """Return an instance of the SAC algorithm."""
    env = GarageEnv(env_name=env_name)
    policy = DeterministicMLPPolicy(name='policy',
                                    env_spec=env.spec,
                                    hidden_sizes=[64, 64])

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    algo = SAC(env_spec=env.spec,
               policy=policy,
               qf1=qf1,
               qf2=qf2,
               gradient_steps_per_itr=1000,
               max_path_length=500,
               replay_buffer=replay_buffer)
    return algo
Exemplo n.º 18
0
    def test_eviction_policy(self):
        obs = np.array([[1], [1]])
        replay_buffer = PathBuffer(capacity_in_transitions=3)
        replay_buffer.add_path(dict(obs=obs))

        sampled_obs = replay_buffer.sample_transitions(3)['obs']
        assert (sampled_obs == np.array([[1], [1], [1]])).all()

        sampled_path_obs = replay_buffer.sample_path()['obs']
        assert (sampled_path_obs == np.array([[1], [1]])).all()

        obs2 = np.array([[2], [3]])
        replay_buffer.add_path(dict(obs=obs2))

        with pytest.raises(Exception):
            assert replay_buffer.add_path(dict(test_obs=obs2))

        obs3 = np.array([1])
        with pytest.raises(Exception):
            assert replay_buffer.add_path(dict(obs=obs3))

        obs4 = np.array([[4], [5], [6], [7]])
        with pytest.raises(Exception):
            assert replay_buffer.add_path(dict(obs=obs4))

        # Can still sample from old path
        new_sampled_obs = replay_buffer.sample_transitions(1000)['obs']
        assert set(new_sampled_obs.flatten()) == {1, 2, 3}

        # Can't sample complete old path
        for _ in range(100):
            new_sampled_path_obs = replay_buffer.sample_path()['obs']
            assert (new_sampled_path_obs == np.array([[2], [3]])).all()

        replay_buffer.clear()
        assert replay_buffer.n_transitions_stored == 0
        assert not replay_buffer._buffer
Exemplo n.º 19
0
def dqn_pong(ctxt=None, seed=1, buffer_size=int(5e4), max_episode_length=500):
    """Train DQN on PongNoFrameskip-v4 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        buffer_size (int): Number of timesteps to store in replay buffer.
        max_episode_length (int): Maximum length of an episode, after which an
            episode is considered complete. This is used during testing to
            minimize the memory required to store a single episode.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        n_epochs = 100
        steps_per_epoch = 20
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        env = gym.make('PongNoFrameskip-v4')
        env = env.unwrapped
        env = Noop(env, noop_max=30)
        env = MaxAndSkip(env, skip=4)
        env = EpisodicLife(env)
        if 'FIRE' in env.unwrapped.get_action_meanings():
            env = FireReset(env)
        env = Grayscale(env)
        env = Resize(env, 84, 84)
        env = ClipReward(env)
        env = StackFrames(env, 4)

        env = GymEnv(env, is_image=True, max_episode_length=max_episode_length)

        replay_buffer = PathBuffer(capacity_in_transitions=buffer_size)

        qf = DiscreteCNNQFunction(env_spec=env.spec,
                                  filters=(
                                              (32, (8, 8)),
                                              (64, (4, 4)),
                                              (64, (3, 3)),
                                          ),
                                  strides=(4, 2, 1),
                                  dueling=False)  # yapf: disable

        policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
        exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
                                                 policy=policy,
                                                 total_timesteps=num_timesteps,
                                                 max_epsilon=1.0,
                                                 min_epsilon=0.02,
                                                 decay_ratio=0.1)

        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=exploration_policy,
                   replay_buffer=replay_buffer,
                   qf_lr=1e-4,
                   discount=0.99,
                   min_buffer_size=int(1e4),
                   double_q=False,
                   n_train_steps=500,
                   steps_per_epoch=steps_per_epoch,
                   target_network_update_freq=2,
                   buffer_batch_size=32)

        trainer.setup(algo, env)
        trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
def sparse_mlp_mtsac_metaworld_mt1_pick_place(ctxt=None, *, seed, timesteps, _gpu):
    """Train MTSAC with the MT1 pick-place-v1 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        timesteps (int): Number of timesteps to run.
    """
    deterministic.set_seed(seed)
    mt1 = metaworld.MT1('pick-place-v1')
    mt1_test = metaworld.MT1('pick-place-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    test_task_sampler = MetaWorldTaskSampler(mt1_test, 'train',
                                             lambda env, _: normalize(env))
    n_tasks = 50
    train_envs = train_task_sampler.sample(n_tasks)
    env = train_envs[0]()
    test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    trainer = Trainer(ctxt)

    policy = TanhGaussianSparseMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        linear_activity_percent_on=(0.1, 0.1, 0.1),
        linear_weight_percent_on=(0.4, 0.4, 0.4),
        mean_nonlinearity=None,
        std_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousSparseMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(400, 400, 400),
        linear_activity_percent_on=(0.1, 0.1, 0.1,),
        linear_weight_percent_on=(0.4, 0.4, 0.4,),
    )

    qf2 = ContinuousSparseMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(400, 400, 400),
        linear_activity_percent_on=(0.1, 0.1, 0.1),
        linear_weight_percent_on=(0.4, 0.4, 0.4),
    )

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    sampler = LocalSampler(agents=policy,
                           envs=train_envs,
                           max_episode_length=env.spec.max_episode_length,
                           n_workers=n_tasks,
                           worker_class=FragmentWorker)

    batch_size = int(env.spec.max_episode_length * n_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = SparseWeightsMTSAC(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        sampler=sampler,
        gradient_steps_per_itr=150,
        eval_env=test_envs,
        env_spec=env.spec,
        num_tasks=1,
        steps_per_epoch=epoch_cycles,
        replay_buffer=replay_buffer,
        min_buffer_size=1500,
        target_update_tau=5e-3,
        discount=0.99,
        buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac, env=train_envs)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
Exemplo n.º 21
0
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps):
    """Train MTSAC with MT10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
        timesteps (int): Number of timesteps to run.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)
    mt10 = metaworld.MT10()  # pylint: disable=no-member
    mt10_test = metaworld.MT10()  # pylint: disable=no-member

    # pylint: disable=missing-return-doc, missing-return-type-doc
    def wrap(env, _):
        return normalize(env, normalize_reward=True)

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              wrap,
                                              add_env_onehot=True)
    test_task_sampler = MetaWorldTaskSampler(mt10_test,
                                             'train',
                                             add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    mt10_train_envs = train_task_sampler.sample(n_tasks)
    env = mt10_train_envs[0]()
    mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
    meta_batch_size = 10

    sampler = LocalSampler(
        agents=policy,
        envs=mt10_train_envs,
        max_episode_length=env.spec.max_episode_length,
        # 1 sampler worker for each environment
        n_workers=meta_batch_size,
        worker_class=FragmentWorker,
        # increasing n_envs increases the vectorization of a sampler worker
        # which improves runtime performance, but you will need to adjust this
        # depending on your memory constraints. For reference, each worker by
        # default uses n_envs=8. Each environment is approximately ~50mb large
        # so creating 50 envs with 8 copies comes out to 20gb of memory. Many
        # users want to be able to run multiple seeds on 1 machine, so I have
        # reduced this to n_envs = 2 for 2 copies in the meantime.
        worker_args=dict(n_envs=2))

    batch_size = int(env.spec.max_episode_length * meta_batch_size)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=env.spec.max_episode_length,
                  eval_env=mt10_test_envs,
                  env_spec=env.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac, env=mt10_train_envs)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
Exemplo n.º 22
0
def td3_half_cheetah(ctxt=None, seed=1):
    """Train TD3 with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
        determinism.
    """
    set_seed(seed)

    n_epochs = 500
    steps_per_epoch = 20
    sampler_batch_size = 250
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

    trainer = Trainer(ctxt)
    env = normalize(GymEnv('HalfCheetah-v2'))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[256, 256],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)

    uniform_random_policy = UniformRandomPolicy(env.spec)

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              policy_optimizer=torch.optim.Adam,
              qf_optimizer=torch.optim.Adam,
              exploration_policy=exploration_policy,
              uniform_random_policy=uniform_random_policy,
              target_update_tau=0.005,
              discount=0.99,
              policy_noise_clip=0.5,
              policy_noise=0.2,
              policy_lr=1e-3,
              qf_lr=1e-3,
              steps_per_epoch=40,
              start_steps=1000,
              grad_steps_per_env_step=50,
              min_buffer_size=1000,
              buffer_batch_size=100)

    trainer.setup(algo=td3, env=env)
    trainer.train(n_epochs=750, batch_size=100)
Exemplo n.º 23
0
def td3_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow TD3 model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddGaussianNoise(
            env.spec,
            policy,
            max_sigma=hyper_parameters['sigma'],
            min_sigma=hyper_parameters['sigma'])

        qf = ContinuousMLPQFunction(
            name='ContinuousMLPQFunction',
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            action_merge_layer=0,
            hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(
            name='ContinuousMLPQFunction2',
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            action_merge_layer=0,
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = PathBuffer(
            capacity_in_transitions=hyper_parameters['replay_buffer_size'])

        td3 = TD3(env.spec,
                  policy=policy,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  steps_per_epoch=hyper_parameters['steps_per_epoch'],
                  policy_lr=hyper_parameters['policy_lr'],
                  qf_lr=hyper_parameters['qf_lr'],
                  target_update_tau=hyper_parameters['tau'],
                  n_train_steps=hyper_parameters['n_train_steps'],
                  discount=hyper_parameters['discount'],
                  smooth_return=hyper_parameters['smooth_return'],
                  min_buffer_size=hyper_parameters['min_buffer_size'],
                  buffer_batch_size=hyper_parameters['buffer_batch_size'],
                  exploration_policy=exploration_policy,
                  policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                  qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        runner.setup(td3, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['n_rollout_steps'])
Exemplo n.º 24
0
    def __init__(
        self,
        env,
        skill_env,
        controller_policy,
        skill_actor,
        qf,
        vf,
        num_skills,
        num_train_tasks,
        num_test_tasks,
        latent_dim,
        encoder_hidden_sizes,
        controller_class=OpenContextConditionedControllerPolicy,
        encoder_class=GaussianContextEncoder,
        encoder_module_class=MLPEncoder,
        is_encoder_recurrent=False,
        controller_lr=3E-4,
        qf_lr=3E-4,
        vf_lr=3E-4,
        context_lr=3E-4,
        policy_mean_reg_coeff=1E-3,
        policy_std_reg_coeff=1E-3,
        policy_pre_activation_coeff=0.,
        soft_target_tau=0.005,
        kl_lambda=.1,
        optimizer_class=torch.optim.Adam,
        use_next_obs_in_context=False,
        meta_batch_size=64,
        num_steps_per_epoch=1000,
        num_skills_reason_steps=1000,
        num_skills_sample=10,
        num_initial_steps=1500,
        num_tasks_sample=5,
        num_steps_prior=400,
        num_steps_posterior=0,
        num_eval_trajs=50,
        num_extra_rl_steps_posterior=600,
        batch_size=1024,
        embedding_batch_size=1024,
        embedding_mini_batch_size=1024,
        max_path_length=1000,
        discount=0.99,
        replay_buffer_size=1000000,
        # TODO: the ratio needs to be tuned
        skills_reason_reward_scale=1,
        tasks_adapt_reward_scale=1.2,
        use_information_bottleneck=True,
        update_post_train=1,
    ):

        self._env = env
        self._skill_env = skill_env
        self._qf1 = qf
        self._qf2 = copy.deepcopy(qf)
        self._vf = vf
        self._skill_actor = skill_actor
        self._num_skills = num_skills
        self._num_train_tasks = num_train_tasks
        self._num_test_tasks = num_test_tasks
        self._latent_dim = latent_dim

        self._policy_mean_reg_coeff = policy_mean_reg_coeff
        self._policy_std_reg_coeff = policy_std_reg_coeff
        self._policy_pre_activation_coeff = policy_pre_activation_coeff
        self._soft_target_tau = soft_target_tau
        self._kl_lambda = kl_lambda
        self._use_next_obs_in_context = use_next_obs_in_context

        self._meta_batch_size = meta_batch_size
        self._num_skills_reason_steps = num_skills_reason_steps
        self._num_steps_per_epoch = num_steps_per_epoch
        self._num_initial_steps = num_initial_steps
        self._num_tasks_sample = num_tasks_sample
        self._num_skills_sample = num_skills_sample
        self._num_steps_prior = num_steps_prior
        self._num_steps_posterior = num_steps_posterior
        self._num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
        self._num_eval_trajs = num_eval_trajs
        self._batch_size = batch_size
        self._embedding_batch_size = embedding_batch_size
        self._embedding_mini_batch_size = embedding_mini_batch_size
        self.max_path_length = max_path_length
        self._discount = discount
        self._replay_buffer_size = replay_buffer_size

        self._is_encoder_recurrent = is_encoder_recurrent
        self._use_information_bottleneck = use_information_bottleneck
        self._skills_reason_reward_scale = skills_reason_reward_scale
        self._tasks_adapt_reward_scale = tasks_adapt_reward_scale
        self._update_post_train = update_post_train
        self._task_idx = None
        self._skill_idx = None  # do we really need it

        self._is_resuming = False

        self._average_rewards = []

        encoder_spec = self.get_env_spec(env[0](), latent_dim, num_skills,
                                         'encoder')
        encoder_in_dim = int(np.prod(encoder_spec.input_space.shape))
        encoder_out_dim = int(np.prod(encoder_spec.output_space.shape))

        encoder_module = encoder_module_class(
            input_dim=encoder_in_dim,
            output_dim=encoder_out_dim,
            hidden_sizes=encoder_hidden_sizes)

        context_encoder = encoder_class(encoder_module,
                                        use_information_bottleneck, latent_dim)

        self._controller = controller_class(
            latent_dim=latent_dim,
            context_encoder=context_encoder,
            controller_policy=controller_policy,
            num_skills=num_skills,
            sub_actor=skill_actor,
            use_information_bottleneck=use_information_bottleneck,
            use_next_obs=use_next_obs_in_context)

        self._skills_replay_buffer = PathBuffer(replay_buffer_size)

        self._replay_buffers = {
            i: PathBuffer(replay_buffer_size)
            for i in range(num_train_tasks)
        }

        self._context_replay_buffers = {
            i: PathBuffer(replay_buffer_size)
            for i in range(num_train_tasks)
        }

        self.target_vf = copy.deepcopy(self._vf)
        self.vf_criterion = torch.nn.MSELoss()

        self._controller_optimizer = optimizer_class(
            self._controller.networks[1].parameters(),
            lr=controller_lr,
        )
        self.qf1_optimizer = optimizer_class(
            self._qf1.parameters(),
            lr=qf_lr,
        )
        self.qf2_optimizer = optimizer_class(
            self._qf2.parameters(),
            lr=qf_lr,
        )
        self.vf_optimizer = optimizer_class(
            self._vf.parameters(),
            lr=vf_lr,
        )
        self.context_optimizer = optimizer_class(
            self._controller.networks[0].parameters(),
            lr=context_lr,
        )
Exemplo n.º 25
0
def diayn_half_cheetah_vel_batch_for_pearl(ctxt=None, seed=1):
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = GarageEnv(normalize(HalfCheetahVelEnv()))

    policy = TanhGaussianMLPSkillPolicy(
        env_spec=env.spec,
        skills_num=skills_num,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)

    discriminator = MLPDiscriminator(env_spec=env.spec,
                                     skills_num=skills_num,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    diayn = DIAYN(
        env_spec=env.spec,
        skills_num=skills_num,
        discriminator=discriminator,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        gradient_steps_per_itr=1000,
        max_path_length=300,
        replay_buffer=replay_buffer,
        min_buffer_size=1e4,
        recorded=True,  # enable the video recording func
        target_update_tau=5e-3,
        discount=0.99,
        buffer_batch_size=256,
        reward_scale=1.,
        steps_per_epoch=1)

    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    diayn.to()
    worker_args = {"skills_num": skills_num}
    runner.setup(algo=diayn,
                 env=env,
                 sampler_cls=LocalSkillSampler,
                 worker_class=SkillWorker,
                 worker_args=worker_args)
    runner.train(n_epochs=1000, batch_size=1000)  # 1000
    # runner.restore(from_dir=os.path.join(os.getcwd(), 'data/local/experiment/diayn_half_cheetah_batch_50'))
    # diayn = runner.get_algo()
    runner.save(999)  # saves the last episode

    return discriminator, diayn
    def test_no_reset(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            # This tests if off-policy sampler respect batch_size
            # when no_reset is set to True
            env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)
            exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                           policy,
                                                           sigma=0.2)
            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)
            replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
            algo = DDPG(
                env_spec=env.spec,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                target_update_tau=1e-2,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
            )

            sampler = OffPolicyVectorizedSampler(algo, env, 1, no_reset=True)
            sampler.start_worker()

            runner.initialize_tf_vars()

            paths1 = sampler.obtain_samples(0, 5)
            paths2 = sampler.obtain_samples(0, 5)

            len1 = sum([len(path['rewards']) for path in paths1])
            len2 = sum([len(path['rewards']) for path in paths2])

            assert len1 == 5 and len2 == 5, 'Sampler should respect batch_size'
            # yapf: disable
            # When done is False in 1st sampling, the next sampling should be
            # stacked with the last batch in 1st sampling
            case1 = (len(paths1[-1]['rewards']) + len(paths2[0]['rewards'])
                     == paths2[0]['running_length'])
            # When done is True in 1st sampling, the next sampling should be
            # separated
            case2 = len(paths2[0]['rewards']) == paths2[0]['running_length']
            done = paths1[-1]['dones'][-1]
            assert (
                (not done and case1) or (done and case2)
            ), 'Running length should be the length of full path'

            # yapf: enable
            case1 = np.isclose(
                paths1[-1]['rewards'].sum() + paths2[0]['rewards'].sum(),
                paths2[0]['undiscounted_return'])
            case2 = np.isclose(paths2[0]['rewards'].sum(),
                               paths2[0]['undiscounted_return'])
            assert (
                (not done and case1) or (done and case2)
            ), 'Undiscounted_return should be the sum of rewards of full path'
Exemplo n.º 27
0
 def test_episode_batch_to_timestep_batch(self, eps_data):
     t = EpisodeBatch(**eps_data)
     replay_buffer = PathBuffer(capacity_in_transitions=100)
     replay_buffer.add_episode_batch(t)
     timesteps = replay_buffer.sample_timesteps(10)
     assert len(timesteps.rewards) == 10
Exemplo n.º 28
0
    def __init__(
        self,
        experiment_name,
        use_gpu,
        trainer_args
    ):
        """Train MTSAC with metaworld_experiments environment.
        Args:
            experiment_name: expeirment name to be used for logging and checkpointing
            use_wandb: boolean, defines whether or not to log to wandb
            use_gpu: boolean, defines whether or not to use to GPU for training
            trainer_args: named tuple with args given by config
        """

        # Define log and checkpoint dir
        self.checkpoint_dir = os.path.join(
            trainer_args.log_dir,
            f"{experiment_name}-{trainer_args.project_id}"
        )
        print(f"Checkpoint dir: {self.checkpoint_dir}")
        self.state_path = os.path.join(self.checkpoint_dir, "experiment_state.p")
        self.env_state_path = os.path.join(self.checkpoint_dir, "env_state.p")
        self.config_path = os.path.join(self.checkpoint_dir, "config.json")
        self.experiment_name = experiment_name

        # Only define viz_save_path if required to save visualizations local
        self.viz_save_path = None
        if trainer_args.save_visualizations_local:
            self.viz_save_path = os.path.join(self.checkpoint_dir, "viz")

        # Check if loading from existing experiment
        self.loading_from_existing = os.path.exists(self.checkpoint_dir)
        os.makedirs(self.checkpoint_dir, exist_ok=True)

        # Save arguments for later retrieval
        self.init_config(trainer_args)

        num_tasks = trainer_args.num_tasks

        # TODO: do we have to fix which GPU to use? run distributed across multiGPUs
        if use_gpu:
            set_gpu_mode(True, 0)

        if trainer_args.seed is not None:
            deterministic.set_seed(trainer_args.seed)

        # Note: different classes whether it uses 10 or 50 tasks. Why?
        mt_env = (
            metaworld.MT10(seed=trainer_args.env_seed) if num_tasks <= 10
            else metaworld.MT50(seed=trainer_args.env_seed)
        )

        train_task_sampler = MetaWorldTaskSampler(
            mt_env, "train", add_env_onehot=True
        )

        # TODO: add some clarifying comments of why these asserts are required
        assert num_tasks % 10 == 0, "Number of tasks have to divisible by 10"
        assert num_tasks <= 500, "Number of tasks should be less or equal 500"

        # TODO: do we have guarantees that in case seed is set, the tasks being sampled
        # are the same?
        mt_train_envs = train_task_sampler.sample(num_tasks)
        env = mt_train_envs[0]()

        if trainer_args.params_seed is not None:
            torch.manual_seed(trainer_args.params_seed)

        policy = create_policy_net(env_spec=env.spec, net_params=trainer_args)
        qf1 = create_qf_net(env_spec=env.spec, net_params=trainer_args)
        qf2 = create_qf_net(env_spec=env.spec, net_params=trainer_args)

        if trainer_args.params_seed is not None:
            calculate_mean_param("policy", policy)
            calculate_mean_param("qf1", qf1)
            calculate_mean_param("qf2", qf2)

        if trainer_args.override_weight_initialization:
            logging.warn("Overriding dendritic layer weight initialization")
            self.override_weight_initialization([policy, qf1, qf2])

        replay_buffer = PathBuffer(
            capacity_in_transitions=trainer_args.num_buffer_transitions
        )
        max_episode_length = env.spec.max_episode_length
        self.env_steps_per_epoch = int(max_episode_length * num_tasks)
        self.num_epochs = trainer_args.timesteps // self.env_steps_per_epoch

        sampler = RaySampler(
            agent=policy,
            envs=mt_train_envs,
            max_episode_length=max_episode_length,
            cpus_per_worker=trainer_args.cpus_per_worker,
            gpus_per_worker=trainer_args.gpus_per_worker,
            workers_per_env=trainer_args.workers_per_env,
            seed=trainer_args.seed,
        )

        self._algo = CustomMTSAC(
            env_spec=env.spec,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            replay_buffer=replay_buffer,
            sampler=sampler,
            train_task_sampler=train_task_sampler,
            gradient_steps_per_itr=int(
                max_episode_length * trainer_args.num_grad_steps_scale
            ),
            task_update_frequency=trainer_args.task_update_frequency,
            num_tasks=num_tasks,
            min_buffer_size=max_episode_length * num_tasks,
            target_update_tau=trainer_args.target_update_tau,
            discount=trainer_args.discount,
            buffer_batch_size=trainer_args.buffer_batch_size,
            policy_lr=trainer_args.policy_lr,
            qf_lr=trainer_args.qf_lr,
            reward_scale=trainer_args.reward_scale,
            num_evaluation_episodes=trainer_args.eval_episodes,
            fp16=trainer_args.fp16 if use_gpu else False,
            log_per_task=trainer_args.log_per_task,
            share_train_eval_env=trainer_args.share_train_eval_env
        )

        # Override with loaded networks if existing experiment
        self.current_epoch = 0
        if self.loading_from_existing:
            self.load_experiment_state()

        # Move all networks within the model on device
        self._algo.to()
Exemplo n.º 29
0
    def __init__(self,
                 env,
                 inner_policy,
                 qf,
                 vf,
                 num_train_tasks,
                 num_test_tasks,
                 latent_dim,
                 encoder_hidden_sizes,
                 test_env_sampler,
                 policy_class=ContextConditionedPolicy,
                 encoder_class=MLPEncoder,
                 policy_lr=3E-4,
                 qf_lr=3E-4,
                 vf_lr=3E-4,
                 context_lr=3E-4,
                 policy_mean_reg_coeff=1E-3,
                 policy_std_reg_coeff=1E-3,
                 policy_pre_activation_coeff=0.,
                 soft_target_tau=0.005,
                 kl_lambda=.1,
                 optimizer_class=torch.optim.Adam,
                 use_information_bottleneck=True,
                 use_next_obs_in_context=False,
                 meta_batch_size=64,
                 num_steps_per_epoch=1000,
                 num_initial_steps=100,
                 num_tasks_sample=100,
                 num_steps_prior=100,
                 num_steps_posterior=0,
                 num_extra_rl_steps_posterior=100,
                 batch_size=1024,
                 embedding_batch_size=1024,
                 embedding_mini_batch_size=1024,
                 max_path_length=1000,
                 discount=0.99,
                 replay_buffer_size=1000000,
                 reward_scale=1,
                 update_post_train=1):

        self._env = env
        self._qf1 = qf
        self._qf2 = copy.deepcopy(qf)
        self._vf = vf
        self._num_train_tasks = num_train_tasks
        self._num_test_tasks = num_test_tasks
        self._latent_dim = latent_dim

        self._policy_mean_reg_coeff = policy_mean_reg_coeff
        self._policy_std_reg_coeff = policy_std_reg_coeff
        self._policy_pre_activation_coeff = policy_pre_activation_coeff
        self._soft_target_tau = soft_target_tau
        self._kl_lambda = kl_lambda
        self._use_information_bottleneck = use_information_bottleneck
        self._use_next_obs_in_context = use_next_obs_in_context

        self._meta_batch_size = meta_batch_size
        self._num_steps_per_epoch = num_steps_per_epoch
        self._num_initial_steps = num_initial_steps
        self._num_tasks_sample = num_tasks_sample
        self._num_steps_prior = num_steps_prior
        self._num_steps_posterior = num_steps_posterior
        self._num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
        self._batch_size = batch_size
        self._embedding_batch_size = embedding_batch_size
        self._embedding_mini_batch_size = embedding_mini_batch_size
        self.max_path_length = max_path_length
        self._discount = discount
        self._replay_buffer_size = replay_buffer_size
        self._reward_scale = reward_scale
        self._update_post_train = update_post_train
        self._task_idx = None

        self._is_resuming = False

        worker_args = dict(deterministic=True, accum_context=True)
        self._evaluator = MetaEvaluator(test_task_sampler=test_env_sampler,
                                        max_path_length=max_path_length,
                                        worker_class=PEARLWorker,
                                        worker_args=worker_args,
                                        n_test_tasks=num_test_tasks)

        encoder_spec = self.get_env_spec(env[0](), latent_dim, 'encoder')
        encoder_in_dim = int(np.prod(encoder_spec.input_space.shape))
        encoder_out_dim = int(np.prod(encoder_spec.output_space.shape))
        context_encoder = encoder_class(input_dim=encoder_in_dim,
                                        output_dim=encoder_out_dim,
                                        hidden_sizes=encoder_hidden_sizes)

        self._policy = policy_class(
            latent_dim=latent_dim,
            context_encoder=context_encoder,
            policy=inner_policy,
            use_information_bottleneck=use_information_bottleneck,
            use_next_obs=use_next_obs_in_context)

        # buffer for training RL update
        self._replay_buffers = {
            i: PathBuffer(replay_buffer_size)
            for i in range(num_train_tasks)
        }

        self._context_replay_buffers = {
            i: PathBuffer(replay_buffer_size)
            for i in range(num_train_tasks)
        }

        self.target_vf = copy.deepcopy(self._vf)
        self.vf_criterion = torch.nn.MSELoss()

        self._policy_optimizer = optimizer_class(
            self._policy.networks[1].parameters(),
            lr=policy_lr,
        )
        self.qf1_optimizer = optimizer_class(
            self._qf1.parameters(),
            lr=qf_lr,
        )
        self.qf2_optimizer = optimizer_class(
            self._qf2.parameters(),
            lr=qf_lr,
        )
        self.vf_optimizer = optimizer_class(
            self._vf.parameters(),
            lr=vf_lr,
        )
        self.context_optimizer = optimizer_class(
            self._policy.networks[0].parameters(),
            lr=context_lr,
        )
def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
    """Train MTSAC with the ML1 pick-place-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    train_envs = []
    test_envs = []
    env_names = []
    for i in range(50):
        train_env = normalize(
            GymEnv(mwb.ML1.get_train_tasks('pick-place-v1'),
                   normalize_reward=True))
        test_env = pickle.loads(pickle.dumps(train_env))
        env_names.append('pick_place_{}'.format(i))
        train_envs.append(train_env)
        test_envs.append(test_env)
    ml1_train_envs = MultiEnvWrapper(train_envs,
                                     sample_strategy=round_robin_strategy,
                                     env_names=env_names)
    ml1_test_envs = MultiEnvWrapper(test_envs,
                                    sample_strategy=round_robin_strategy,
                                    env_names=env_names)
    policy = TanhGaussianMLPPolicy(
        env_spec=ml1_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    timesteps = 10000000
    batch_size = int(150 * ml1_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_episode_length=150,
                  eval_env=ml1_test_envs,
                  env_spec=ml1_train_envs.spec,
                  num_tasks=50,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)