def test_dist_info_sym_include_action(self, obs_dim, action_dim,
                                      hidden_dim):
        """Check ``dist_info_sym`` outputs with ``state_include_action=True``.

        The policy is built while ``GaussianLSTMModel`` is patched with
        ``SimpleGaussianLSTMModel``. A batch of two identical observations is
        fed through the symbolic distribution, and both ``mean`` and
        ``log_std`` are expected to be arrays of shape
        ``(2, 1) + action_dim`` filled with 0.5.

        Args:
            obs_dim: Observation dimensions passed to ``DummyBoxEnv``.
            action_dim: Action dimensions passed to ``DummyBoxEnv``; it is
                concatenated to ``(2, 1)``, so it has to be a tuple.
            hidden_dim: Accepted but not used by this test.
        """
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))

        obs_ph = tf.placeholder(
            tf.float32, shape=(None, None, env.observation_space.flat_dim))

        model_target = ('garage.tf.policies.'
                        'gaussian_lstm_policy_with_model.GaussianLSTMModel')
        with mock.patch(model_target, new=SimpleGaussianLSTMModel):
            policy = GaussianLSTMPolicyWithModel(
                env_spec=env.spec, state_include_action=True)

            policy.reset()
            obs = env.reset()
            # Batch of 2, a single time step, one slot per action dimension.
            batch_shape = (2, 1) + action_dim
            prev_action = np.zeros(batch_shape)
            dist_sym = policy.dist_info_sym(
                obs_var=obs_ph,
                state_info_vars={'prev_action': prev_action},
                name='p2_sym')

        flat_obs = obs.flatten()
        dist = self.sess.run(
            dist_sym, feed_dict={obs_ph: [[flat_obs], [flat_obs]]})

        expected = np.full(batch_shape, 0.5)
        assert np.array_equal(dist['mean'], expected)
        assert np.array_equal(dist['log_std'], expected)
Пример #2
0
    def test_ppo_pendulum_lstm_with_model(self):
        """Test PPO with model, with Pendulum environment.

        Trains PPO with a ``GaussianLSTMPolicyWithModel`` policy and a
        ``GaussianMLPBaselineWithModel`` baseline on a normalized
        ``'InvertedDoublePendulum-v2'`` environment for 10 epochs and asserts
        that the value returned by ``runner.train`` is greater than 80.
        """
        with LocalTFRunner(sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            # Close the environment even if training raises or the assertion
            # fails, so a failing test does not leave the environment open.
            try:
                policy = GaussianLSTMPolicyWithModel(env_spec=env.spec)
                baseline = GaussianMLPBaselineWithModel(
                    env_spec=env.spec,
                    regressor_args=dict(hidden_sizes=(32, 32)),
                )
                algo = PPO(
                    env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    optimizer_args=dict(
                        batch_size=32,
                        max_epochs=10,
                    ),
                    stop_entropy_gradient=True,
                    entropy_method='max',
                    policy_ent_coeff=0.02,
                    center_adv=False,
                )
                runner.setup(algo, env)
                last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
                assert last_avg_ret > 80
            finally:
                env.close()
    def test_is_pickleable(self):
        """Check that a pickled-and-restored policy yields the same mean.

        The policy is built while ``GaussianLSTMModel`` is patched with
        ``SimpleGaussianLSTMModel``. The model's ``return_var`` variable is
        overwritten with ones, the ``mean`` output is evaluated, and the
        policy is round-tripped through ``pickle`` into a fresh graph and
        session. The restored policy must give an identical ``mean`` for the
        same input.
        """
        env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
        model_target = ('garage.tf.policies.'
                        'gaussian_lstm_policy_with_model.GaussianLSTMModel')
        with mock.patch(model_target, new=SimpleGaussianLSTMModel):
            policy = GaussianLSTMPolicyWithModel(
                env_spec=env.spec, state_include_action=False)

        env.reset()
        obs = env.reset()
        flat_obs = obs.flatten()
        # Two sequences of one time step each, both holding the same
        # flattened observation.
        obs_batch = [[flat_obs], [flat_obs]]

        scope_name = 'GaussianLSTMPolicyWithModel/GaussianLSTMModel'
        with tf.variable_scope(scope_name, reuse=True):
            return_var = tf.get_variable('return_var')
        # Set every element of return_var to one (presumably so that the
        # restored policy only matches if the variable value was pickled).
        all_ones = tf.ones_like(return_var).eval()
        return_var.load(all_ones)

        mean_before = self.sess.run(
            policy.model.networks['default'].mean,
            feed_dict={policy.model.input: obs_batch})

        serialized = pickle.dumps(policy)

        # Restore into a brand-new graph so nothing is shared with the
        # original policy's graph.
        with tf.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(serialized)
            mean_after = sess.run(
                policy_pickled.model.networks['default'].mean,
                feed_dict={policy_pickled.model.input: obs_batch})
            assert np.array_equal(mean_before, mean_after)
    def test_get_action(self, obs_dim, action_dim, hidden_dim):
        """Check ``get_action`` and ``get_actions`` against fixed outputs.

        The policy is built while ``GaussianLSTMModel`` is patched with
        ``SimpleGaussianLSTMModel``. Every returned action must lie in the
        environment's action space and equal an array of 0.75, while the
        reported ``mean`` and ``log_std`` must both equal arrays of 0.5.

        Args:
            obs_dim: Observation dimensions passed to ``DummyBoxEnv``.
            action_dim: Action dimensions passed to ``DummyBoxEnv``; also the
                shape of the expected arrays.
            hidden_dim: Accepted but not used by this test.
        """
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        model_target = ('garage.tf.policies.'
                        'gaussian_lstm_policy_with_model.GaussianLSTMModel')
        with mock.patch(model_target, new=SimpleGaussianLSTMModel):
            policy = GaussianLSTMPolicyWithModel(
                env_spec=env.spec, state_include_action=False)

        policy.reset()
        obs = env.reset()

        expected_action = np.full(action_dim, 0.75)
        expected_mean = np.full(action_dim, 0.5)
        expected_log_std = np.full(action_dim, 0.5)

        # Single-observation interface.
        action, agent_info = policy.get_action(obs)
        assert env.action_space.contains(action)
        assert np.array_equal(action, expected_action)
        assert np.array_equal(agent_info['mean'], expected_mean)
        assert np.array_equal(agent_info['log_std'], expected_log_std)

        # Batched interface, fed a batch containing the same observation.
        actions, agent_infos = policy.get_actions([obs])
        batched = zip(actions, agent_infos['mean'], agent_infos['log_std'])
        for act, mean, log_std in batched:
            assert env.action_space.contains(act)
            assert np.array_equal(act, expected_action)
            assert np.array_equal(mean, expected_mean)
            assert np.array_equal(log_std, expected_log_std)
Пример #5
0
    def test_dist_info_sym_wrong_input(self):
        """Check that mismatched batch sizes fail when the graph is run.

        ``dist_info_sym`` is built with a ``prev_action`` of batch size 3,
        and the resulting ``'p2_sym'`` network input is then evaluated with
        an observation batch of size 2. The run is expected to raise
        ``tf.errors.InvalidArgumentError``.
        """
        env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))

        obs_shape = (None, None, env.observation_space.flat_dim)
        obs_ph = tf.placeholder(tf.float32, shape=obs_shape)

        model_target = ('garage.tf.policies.'
                        'gaussian_lstm_policy_with_model.GaussianLSTMModel')
        with mock.patch(model_target, new=SimpleGaussianLSTMModel):
            policy = GaussianLSTMPolicyWithModel(env_spec=env.spec,
                                                 state_include_action=True)

            policy.reset()
            obs = env.reset()

            mismatched_prev_action = np.zeros((3, 1, 1))
            policy.dist_info_sym(
                obs_var=obs_ph,
                state_info_vars={'prev_action': mismatched_prev_action},
                name='p2_sym')

        flat_obs = obs.flatten()
        # observation batch size = 2 but prev_action batch size = 3
        with self.assertRaises(tf.errors.InvalidArgumentError):
            self.sess.run(
                policy.model.networks['p2_sym'].input,
                feed_dict={obs_ph: [[flat_obs], [flat_obs]]})
Пример #6
0
    def test_ppo_pendulum_lstm_with_model(self):
        """Test PPO with model, with Pendulum environment.

        Trains PPO with a ``GaussianLSTMPolicyWithModel`` policy and a
        ``GaussianMLPBaselineWithModel`` baseline on a normalized
        ``'InvertedDoublePendulum-v2'`` environment for 10 epochs and asserts
        that the value returned by ``runner.train`` is greater than 40.
        """
        with LocalRunner(self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            # Close the environment even if training raises or the assertion
            # fails, so a failing test does not leave the environment open.
            try:
                policy = GaussianLSTMPolicyWithModel(env_spec=env.spec)
                baseline = GaussianMLPBaselineWithModel(
                    env_spec=env.spec,
                    regressor_args=dict(hidden_sizes=(32, 32)),
                )
                algo = PPO(
                    env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    lr_clip_range=0.01,
                    optimizer_args=dict(batch_size=32, max_epochs=10),
                )
                runner.setup(algo, env)
                last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
                assert last_avg_ret > 40
            finally:
                env.close()
 def test_invalid_env(self):
     """Expect ValueError when built from a ``DummyDiscreteEnv`` spec."""
     discrete_env = TfEnv(DummyDiscreteEnv())
     with pytest.raises(ValueError):
         GaussianLSTMPolicyWithModel(env_spec=discrete_env.spec)
    def setup_method(self):
        """Build four LSTM policies and their symbolic distributions.

        ``policy1``/``policy2`` are ``GaussianLSTMPolicy`` instances and
        ``policy3``/``policy4`` are ``GaussianLSTMPolicyWithModel`` instances.
        Within each pair the only differing argument (besides ``name``) is
        ``output_w_init``: constant 1 for policy1/policy3 and constant 2 for
        policy2/policy4. All four are reset, and ``dist_info_sym`` is called
        on each with a shared observation placeholder; the results are stored
        as ``self.dist1_sym`` .. ``self.dist4_sym``.

        Everything after ``super().setup_method()`` runs while
        ``tensorflow.random.normal`` is patched to return 0.5.
        """
        super().setup_method()
        # Replace the random normal op with a mock that returns a constant.
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
            # Shared construction arguments for all four policies.
            self.default_initializer = tf.constant_initializer(1)
            self.default_hidden_nonlinearity = tf.nn.tanh
            self.default_recurrent_nonlinearity = tf.nn.sigmoid
            self.default_output_nonlinearity = None
            self.time_step = 1

            # GaussianLSTMPolicy pair: takes recurrent_w_x_init and
            # recurrent_w_h_init for the recurrent weights.
            self.policy1 = GaussianLSTMPolicy(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_x_init=self.default_initializer,
                recurrent_w_h_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=self.default_initializer,
                state_include_action=True,
                name='P1')
            self.policy2 = GaussianLSTMPolicy(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_x_init=self.default_initializer,
                recurrent_w_h_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=tf.constant_initializer(2),
                state_include_action=True,
                name='P2')

            # NOTE(review): this initializer runs before policy3/policy4 are
            # constructed, so it only covers variables that exist at this
            # point; presumably the WithModel policies initialize their own
            # variables -- confirm.
            self.sess.run(tf.global_variables_initializer())

            # GaussianLSTMPolicyWithModel pair: takes hidden_w_init and
            # recurrent_w_init instead of the two recurrent_w_*_init
            # arguments used above.
            self.policy3 = GaussianLSTMPolicyWithModel(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                hidden_w_init=self.default_initializer,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=self.default_initializer,
                state_include_action=True,
                name='P3')
            self.policy4 = GaussianLSTMPolicyWithModel(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                hidden_w_init=self.default_initializer,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=tf.constant_initializer(2),
                state_include_action=True,
                name='P4')

            self.policy1.reset()
            self.policy2.reset()
            self.policy3.reset()
            self.policy4.reset()
            # Wrap the reset observation in a list, then concatenate
            # time_step copies of that list along axis 0.
            self.obs = [env.reset()]
            self.obs = np.concatenate(
                [self.obs for _ in range(self.time_step)], axis=0)

            # Placeholders with two unspecified leading dimensions and the
            # flattened observation / action size as the last dimension.
            self.obs_ph = tf.placeholder(
                tf.float32, shape=(None, None, env.observation_space.flat_dim))
            # action_ph is created here but not used in the visible part of
            # this method.
            self.action_ph = tf.placeholder(tf.float32,
                                            shape=(None, None,
                                                   env.action_space.flat_dim))

            # Symbolic distribution info for each policy; every call gets an
            # all-zero prev_action of shape (2, time_step, 1).
            self.dist1_sym = self.policy1.dist_info_sym(
                obs_var=self.obs_ph,
                state_info_vars={
                    'prev_action': np.zeros((2, self.time_step, 1))
                },
                name='p1_sym')
            self.dist2_sym = self.policy2.dist_info_sym(
                obs_var=self.obs_ph,
                state_info_vars={
                    'prev_action': np.zeros((2, self.time_step, 1))
                },
                name='p2_sym')
            self.dist3_sym = self.policy3.dist_info_sym(
                obs_var=self.obs_ph,
                state_info_vars={
                    'prev_action': np.zeros((2, self.time_step, 1))
                },
                name='p3_sym')
            self.dist4_sym = self.policy4.dist_info_sym(
                obs_var=self.obs_ph,
                state_info_vars={
                    'prev_action': np.zeros((2, self.time_step, 1))
                },
                name='p4_sym')