예제 #1
0
    def setup_method(self):
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            super().setup_method()
            self.box_env = TfEnv(DummyBoxEnv())
            self.policy1 = ContinuousMLPPolicy(
                env_spec=self.box_env, hidden_sizes=(32, 32), name='P1')
            self.policy2 = ContinuousMLPPolicy(
                env_spec=self.box_env, hidden_sizes=(64, 64), name='P2')
            self.policy3 = ContinuousMLPPolicyWithModel(
                env_spec=self.box_env, hidden_sizes=(32, 32), name='P3')
            self.policy4 = ContinuousMLPPolicyWithModel(
                env_spec=self.box_env, hidden_sizes=(64, 64), name='P4')

            self.sess.run(tf.compat.v1.global_variables_initializer())
            for a, b in zip(self.policy3.get_params(),
                            self.policy1.get_params()):
                self.sess.run(a.assign(b))
            for a, b in zip(self.policy4.get_params(),
                            self.policy2.get_params()):
                self.sess.run(a.assign(b))

            self.obs = self.box_env.reset()
            self.action_bound = self.box_env.action_space.high
            assert self.policy1.vectorized == self.policy2.vectorized
            assert self.policy3.vectorized == self.policy4.vectorized
예제 #2
0
    def test_is_pickleable(self, obs_dim, action_dim):
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'continuous_mlp_policy_with_model.MLPModel'),
                        new=SimpleMLPModel):
            policy = ContinuousMLPPolicyWithModel(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)

        with tf.compat.v1.variable_scope('ContinuousMLPPolicy/MLPModel',
                                         reuse=True):
            return_var = tf.compat.v1.get_variable('return_var')
        # assign it to all one
        return_var.load(tf.ones_like(return_var).eval())
        output1 = self.sess.run(
            policy.model.outputs,
            feed_dict={policy.model.input: [obs.flatten()]})

        p = pickle.dumps(policy)
        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            output2 = sess.run(
                policy_pickled.model.outputs,
                feed_dict={policy_pickled.model.input: [obs.flatten()]})
            assert np.array_equal(output1, output2)
예제 #3
0
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.
    Replace the ddpg with the algorithm you want to run.
    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    '''
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))
        # Set up params for ddpg
        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=params['qf_hidden_sizes'],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    policy_lr=params['policy_lr'],
                    qf_lr=params['qf_lr'],
                    target_update_tau=params['tau'],
                    n_train_steps=params['n_train_steps'],
                    discount=params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        tensorboard_log_dir = osp.join(log_dir)
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))

        runner.setup(ddpg, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
예제 #4
0
    def test_no_reset(self):
        with LocalRunner(sess=self.sess) as runner:
            # This tests if off-policy sampler respect batch_size
            # when no_reset is set to True
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            action_noise = OUStrategy(env.spec, sigma=0.2)
            policy = ContinuousMLPPolicyWithModel(
                env_spec=env.spec,
                hidden_sizes=[64, 64],
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.tanh)
            qf = ContinuousMLPQFunction(
                env_spec=env.spec,
                hidden_sizes=[64, 64],
                hidden_nonlinearity=tf.nn.relu)
            replay_buffer = SimpleReplayBuffer(
                env_spec=env.spec,
                size_in_transitions=int(1e6),
                time_horizon=100)
            algo = DDPG(
                env_spec=env.spec,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                target_update_tau=1e-2,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
            )

            sampler = OffPolicyVectorizedSampler(algo, env, 1, no_reset=True)
            sampler.start_worker()

            runner.initialize_tf_vars()

            paths1 = sampler.obtain_samples(0, 5)
            paths2 = sampler.obtain_samples(0, 5)

            len1 = sum([len(path['rewards']) for path in paths1])
            len2 = sum([len(path['rewards']) for path in paths2])

            assert len1 == 5 and len2 == 5, 'Sampler should respect batch_size'

            # yapf: disable
            assert (len(paths1[0]['rewards']) + len(paths2[0]['rewards'])
                    == paths2[0]['running_length']), (
                'Running length should be the length of full path')
            # yapf: enable

            assert np.isclose(
                paths1[0]['rewards'].sum() + paths2[0]['rewards'].sum(),
                paths2[0]['undiscounted_return']
            ), 'Undiscounted_return should be the sum of rewards of full path'
예제 #5
0
    def test_get_action(self, obs_dim, action_dim):
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'continuous_mlp_policy_with_model.MLPModel'),
                        new=SimpleMLPModel):
            policy = ContinuousMLPPolicyWithModel(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)

        action, _ = policy.get_action(obs)

        expected_action = np.full(action_dim, 0.5)

        assert env.action_space.contains(action)
        assert np.array_equal(action, expected_action)

        actions, _ = policy.get_actions([obs, obs, obs])
        for action in actions:
            assert env.action_space.contains(action)
            assert np.array_equal(action, expected_action)
예제 #6
0
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            n_epoch_cycles=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, batch_size=100, n_epoch_cycles=20)
예제 #7
0
    def test_get_action_sym(self, obs_dim, action_dim):
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'continuous_mlp_policy_with_model.MLPModel'),
                        new=SimpleMLPModel):
            policy = ContinuousMLPPolicyWithModel(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)

        obs_dim = env.spec.observation_space.flat_dim
        state_input = tf.placeholder(tf.float32, shape=(None, obs_dim))
        action_sym = policy.get_action_sym(state_input, name='action_sym')

        expected_action = np.full(action_dim, 0.5)

        action = self.sess.run(action_sym,
                               feed_dict={state_input: [obs.flatten()]})
        action = policy.action_space.unflatten(action)

        assert np.array_equal(action, expected_action)
        assert env.action_space.contains(action)
예제 #8
0
    def test_ddpg_pendulum(self):
        """
        Test DDPG with Pendulum environment.

        This environment has a [-3, 3] action_space bound.
        """
        with LocalRunner(sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))
            action_noise = OUStrategy(env.spec, sigma=0.2)
            policy = ContinuousMLPPolicyWithModel(
                env_spec=env.spec,
                hidden_sizes=[64, 64],
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.tanh)
            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)
            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)
            algo = DDPG(
                env_spec=env.spec,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                target_update_tau=1e-2,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10,
                                        n_epoch_cycles=20,
                                        batch_size=100)
            assert last_avg_ret > 10

            env.close()
예제 #9
0
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicyWithModel(env_spec=env.spec,
                                              hidden_sizes=[64, 64],
                                              hidden_nonlinearity=tf.nn.relu,
                                              output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    policy_lr=1e-4,
                    qf_lr=1e-3,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    target_update_tau=1e-2,
                    n_train_steps=50,
                    discount=0.9,
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)
예제 #10
0
class TestContinuousMLPPolicyWithModelTransit(TfGraphTestCase):
    def setup_method(self):
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            super().setup_method()
            self.box_env = TfEnv(DummyBoxEnv())
            self.policy1 = ContinuousMLPPolicy(
                env_spec=self.box_env, hidden_sizes=(32, 32), name='P1')
            self.policy2 = ContinuousMLPPolicy(
                env_spec=self.box_env, hidden_sizes=(64, 64), name='P2')
            self.policy3 = ContinuousMLPPolicyWithModel(
                env_spec=self.box_env, hidden_sizes=(32, 32), name='P3')
            self.policy4 = ContinuousMLPPolicyWithModel(
                env_spec=self.box_env, hidden_sizes=(64, 64), name='P4')

            self.sess.run(tf.compat.v1.global_variables_initializer())
            for a, b in zip(self.policy3.get_params(),
                            self.policy1.get_params()):
                self.sess.run(a.assign(b))
            for a, b in zip(self.policy4.get_params(),
                            self.policy2.get_params()):
                self.sess.run(a.assign(b))

            self.obs = self.box_env.reset()
            self.action_bound = self.box_env.action_space.high
            assert self.policy1.vectorized == self.policy2.vectorized
            assert self.policy3.vectorized == self.policy4.vectorized

    @mock.patch('numpy.random.normal')
    def test_get_action(self, mock_rand):
        mock_rand.return_value = 0.5
        action1, _ = self.policy1.get_action(self.obs)
        action2, _ = self.policy2.get_action(self.obs)
        action3, _ = self.policy3.get_action(self.obs)
        action4, _ = self.policy4.get_action(self.obs)

        assert np.array_equal(action1, action3 * self.action_bound)
        assert np.array_equal(action2, action4 * self.action_bound)

        actions1, _ = self.policy1.get_actions([self.obs, self.obs])
        actions2, _ = self.policy2.get_actions([self.obs, self.obs])
        actions3, _ = self.policy3.get_actions([self.obs, self.obs])
        actions4, _ = self.policy4.get_actions([self.obs, self.obs])

        assert np.array_equal(actions1, actions3 * self.action_bound)
        assert np.array_equal(actions2, actions4 * self.action_bound)

    def test_get_action_sym(self):
        obs_dim = self.box_env.spec.observation_space.flat_dim
        state_input = tf.compat.v1.placeholder(
            tf.float32, shape=(None, obs_dim))

        action_sym1 = self.policy1.get_action_sym(
            state_input, name='action_sym')
        action_sym2 = self.policy2.get_action_sym(
            state_input, name='action_sym')
        action_sym3 = self.policy3.get_action_sym(
            state_input, name='action_sym')
        action_sym4 = self.policy4.get_action_sym(
            state_input, name='action_sym')

        action1 = self.sess.run(
            action_sym1, feed_dict={state_input: [self.obs]})
        action2 = self.sess.run(
            action_sym2, feed_dict={state_input: [self.obs]})
        action3 = self.sess.run(
            action_sym3, feed_dict={state_input: [self.obs]})
        action4 = self.sess.run(
            action_sym4, feed_dict={state_input: [self.obs]})

        assert np.array_equal(action1, action3 * self.action_bound)
        assert np.array_equal(action2, action4 * self.action_bound)
예제 #11
0
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace the ppo with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    '''
    deterministic.set_seed(seed)
    env.reset()

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params['policy_lr'],
            qf_lr=params['qf_lr'],
            target_update_tau=params['tau'],
            n_train_steps=params['n_train_steps'],
            discount=params['discount'],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        logger.add_output(dowel.StdOutput())
        logger.add_output(dowel.CsvOutput(tabular_log_file))
        logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        logger.remove_all()

        return tabular_log_file