    def test_target_params_copied(self):
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
        )
        target_qf = algo.target_qf
        target_policy = algo.target_policy
        qf = algo.qf
        qf_copy = algo.qf_with_action_input
        policy = algo.policy

        # Make sure they're different to start
        random_values = [
            np.random.rand(*values.shape) for values in qf.get_param_values()
        ]
        qf.set_param_values(random_values)
        random_values = [
            np.random.rand(*values.shape)
            for values in policy.get_param_values()
        ]
        policy.set_param_values(random_values)

        self.assertParamsNotEqual(target_qf, qf)
        self.assertParamsNotEqual(target_policy, policy)
        self.assertParamsEqual(qf_copy, qf)

        algo.train()
        self.assertParamsEqual(target_qf, qf)
        self.assertParamsEqual(target_policy, policy)
        self.assertParamsEqual(qf_copy, qf)
    def test_qf_targets(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        rewards = np.array([3., 4.])
        terminals = np.array([0., 0.])
        obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])
        actions = np.array([[-0.5], [-0.5]])
        next_obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])

        # target = reward + discount * target_qf(next_obs,
        #                                            target_policy(next_obs))
        # target1 = 3 + 0.5 * Q([1,1,1,1], u([1,1,1,1]))
        #         = 3 + 0.5 * Q([1,1,1,1], 4)
        #         = 3 + 0.5 * 8
        #         = 7
        # target2 = 8

        feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)
        self.assertNpEqual(np.array([[7.], [8.]]),
                           algo.sess.run(algo.ys, feed_dict=feed_dict))
    def test_qf_targets2(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        rewards = np.array([3.5])
        terminals = np.array([0.])
        obs = np.array([[1., 1., 1., 1.]])
        actions = np.array([[2.]])
        next_obs = np.array([[2., 2., 2., 2.]])

        # target = reward + discount * target_qf(next_obs,
        #                                            target_policy(next_obs))
        # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
        #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
        #        = 3.5 + 0.5 * 16
        #        = 11.5
        feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)
        self.assertNpEqual(np.array([[11.5]]),
                           algo.sess.run(algo.ys, feed_dict=feed_dict))
    def test_policy_gradient(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])

        # grad = -1/N sum_{i=1}^N dQ/da * da/dtheta
        #      = -1/2 * (1 * [1,1,1,1] + 1 * [1,1,1,1])
        #      = - [1,1,1,1]
        feed_dict = algo._policy_feed_dict(obs)
        loss_grad_ops = tf.gradients(algo.policy_surrogate_loss,
                                     algo.policy.get_params_internal())
        actual_loss_grads = algo.sess.run(loss_grad_ops, feed_dict=feed_dict)
        actual_loss_grads_flat = np.vstack(actual_loss_grads).flatten()
        expected = [
            -1 * np.ones_like(v) for v in algo.policy.get_param_values()
        ]
        self.assertTrue(
            are_np_array_iterables_equal(actual_loss_grads_flat, expected))
    def test_policy_gradient2(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        obs = np.array([[1., -10., 1., 2.], [1., 100., 1., 2.]])

        # grad = -1/N sum_{i=1}^N dQ/da * da/dtheta
        #      = -1/2 * (1 * [1,-10,1,2]
        #                + 1 * [1,100,1,2])
        #      = - [1., 45., 1., 2.]
        feed_dict = algo._policy_feed_dict(obs)
        loss_grad_ops = tf.gradients(algo.policy_surrogate_loss,
                                     algo.policy.get_params_internal())
        actual_loss_grads = algo.sess.run(loss_grad_ops, feed_dict=feed_dict)
        expected = [np.array([[-1.], [-45.], [-1.], [-2.]])]
        self.assertTrue(
            are_np_array_iterables_equal(actual_loss_grads, expected))
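# A minimal numpy sketch (not part of the original test class) of the
# quantities the tests above work out by hand. It assumes the "sum" policy
# (action = sum of the observation, i.e. a single linear layer whose (4, 1)
# weight matrix is all ones) and the "sum" critic (Q(s, a) = sum(s) + a)
# that these unit tests plug into DDPG; the helper names are illustrative.
import numpy as np


def expected_q_targets(rewards, terminals, next_obs, discount):
    """y_i = r_i + (1 - done_i) * discount * Q_target(s'_i, mu_target(s'_i))."""
    mu = next_obs.sum(axis=1, keepdims=True)              # sum policy
    q_target = next_obs.sum(axis=1, keepdims=True) + mu   # sum critic
    return rewards[:, None] + (1. - terminals[:, None]) * discount * q_target


def expected_policy_grad(obs):
    """-1/N sum_i dQ/da * da/dtheta, with dQ/da = 1 and da/dw_j = s_j."""
    return -obs.mean(axis=0, keepdims=True).T


# expected_q_targets(np.array([3., 4.]), np.zeros(2), np.ones((2, 4)), 0.5)
# -> [[7.], [8.]], matching test_qf_targets.
# expected_policy_grad(np.array([[1., -10., 1., 2.], [1., 100., 1., 2.]]))
# -> [[-1.], [-45.], [-1.], [-2.]], matching test_policy_gradient2.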
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )

    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
    def test_policy_surrogate_loss2(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        obs = np.array([[0., 1., 1., -11.], [5., 10., 10., -10.]])

        # loss = -1/N sum_i Q(s_i, u(s_i))
        #      = -1/2 * (Q([0,1,1,-11], u([0,1,1,-11]))
        #                + Q([5,10,10,-10], u([5,10,10,-10])))
        #      = -1/2 * (Q([0,1,1,-11], -9) + Q([5,10,10,-10], 15))
        #      = -1/2 * (-18 + 30)
        #      = -6
        feed_dict = algo._policy_feed_dict(obs)
        actual = algo.sess.run(algo.policy_surrogate_loss, feed_dict=feed_dict)
        self.assertEqual(actual, -6.)
        self.assertEqual(np.float32, type(actual))
def lstm_launcher(variant):
    """
    Run a simple LSTM on an environment.

    :param variant: Dictionary of dictionary with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(name_or_scope="critic",
                           env_spec=env.spec,
                           **variant.get('qf_params', {}))
    policy = FeedForwardPolicy(name_or_scope="actor",
                               env_spec=env.spec,
                               **variant.get('policy_params', {}))
    algorithm = MyDDPG(env, es, policy, qf, **variant['algo_params'])
    algorithm.train()
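# A hypothetical variant dictionary for the launcher above, only to illustrate
# the shape described in its docstring. The contents of env_params,
# algo_params, qf_params and policy_params depend on get_env_settings and the
# railrl constructors, so the values below are placeholders, not a known-good
# configuration.
example_lstm_variant = dict(
    env_params=dict(),       # forwarded to get_env_settings(**...)
    algo_params=dict(n_epochs=50, batch_size=128),
    qf_params=dict(),
    policy_params=dict(),
)
# lstm_launcher(example_lstm_variant)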
    def test_only_qf_values_change(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        old_qf_values = algo.qf.get_param_values()
        old_qf_copy_values = (algo.qf_with_action_input.get_param_values())
        old_policy_values = algo.policy.get_param_values()
        old_target_qf_values = algo.target_qf.get_param_values()
        old_target_policy_values = algo.target_policy.get_param_values()

        rewards = np.array([3.])
        terminals = np.array([0.])
        obs = np.array([[1., 1., 1., 1.]])
        actions = np.array([[-0.5]])
        next_obs = np.array([[1., 1., 1., 1.]])
        feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)
        algo.sess.run(algo.train_qf_op, feed_dict=feed_dict)

        new_qf_values = algo.qf.get_param_values()
        new_qf_copy_values = (algo.qf_with_action_input.get_param_values())
        new_policy_values = algo.policy.get_param_values()
        new_target_qf_values = algo.target_qf.get_param_values()
        new_target_policy_values = algo.target_policy.get_param_values()

        self.assertTrue(
            are_np_array_iterables_equal(old_policy_values, new_policy_values))
        self.assertFalse(
            are_np_array_iterables_equal(old_qf_values, new_qf_values))
        self.assertFalse(
            are_np_array_iterables_equal(old_qf_copy_values,
                                         new_qf_copy_values))
        self.assertTrue(
            are_np_array_iterables_equal(old_target_policy_values,
                                         new_target_policy_values))
        self.assertTrue(
            are_np_array_iterables_equal(old_target_qf_values,
                                         new_target_qf_values))
        self.assertParamsEqual(algo.qf_with_action_input, algo.qf)
    def test_target_params_hard_update(self):
        tau = 1
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            soft_target_tau=tau,
        )
        target_qf = algo.target_qf
        target_policy = algo.target_policy
        qf = algo.qf
        policy = algo.policy

        random_values = [
            np.random.rand(*values.shape) for values in qf.get_param_values()
        ]
        qf.set_param_values(random_values)
        random_values = [
            np.random.rand(*values.shape)
            for values in policy.get_param_values()
        ]
        policy.set_param_values(random_values)
        self.assertParamsNotEqual(target_qf, qf)
        self.assertParamsNotEqual(target_policy, policy)
        algo.sess.run(algo.update_target_policy_op)
        algo.sess.run(algo.update_target_qf_op)
        self.assertParamsEqual(target_qf, qf)
        self.assertParamsEqual(target_policy, policy)
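# A minimal numpy sketch of the target-network update rule the tests in this
# file exercise: theta_target <- tau * theta + (1 - tau) * theta_target.
# tau=1 reduces to the hard copy checked just above, while tau=0.2 and tau=0
# correspond to test_target_params_update and test_target_params_no_update
# later in this file. `soft_update` is an illustrative helper, not railrl API.
import numpy as np


def soft_update(target_params, params, tau):
    return [tau * p + (1. - tau) * tp for p, tp in zip(params, target_params)]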
def run_task(_):
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.gym_env import GymEnv

    def gym_env(name):
        return GymEnv(name,
                      record_video=False,
                      log_dir='/tmp/gym-test',  # Ignore gym log.
                      record_log=False)

    env = TfEnv(gym_env('AxeTwoDPoint-v0'))
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **ddpg_params
    )
    algorithm.train()
def my_ddpg_launcher(variant):
	"""
	Run DDPG
	:param variant: Dictionary of dictionary with the following keys:
		- algo_params
		- env_params
		- qf_params
		- policy_params
	:return:
	"""
	from railrl.algos.ddpg import DDPG as MyDDPG
	from railrl.policies.nn_policy import FeedForwardPolicy
	from railrl.qfunctions.nn_qfunction import FeedForwardCritic
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from railrl.launchers.launcher_util import get_env_settings
	from railrl.core.tf_util import BatchNormConfig
	if ('batch_norm_params' in variant
		and variant['batch_norm_params'] is not None):
		bn_config = BatchNormConfig(**variant['batch_norm_params'])
	else:
		bn_config = None
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	es = OUStrategy(env_spec=env.spec)
	qf = FeedForwardCritic(
		name_or_scope="critic",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant.get('qf_params', {})
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant.get('policy_params', {})
	)

	algorithm = MyDDPG(
		env,
		es,
		policy,
		qf,
		variant['tensorboard'],
		batch_norm_config=bn_config,
		**variant['algo_params'],
	)
	algorithm.train()
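# As with lstm_launcher, a hypothetical variant sketch. my_ddpg_launcher
# additionally reads variant['tensorboard'] (passed positionally to MyDDPG)
# and an optional 'batch_norm_params' entry; everything inside the nested
# dicts is a placeholder.
example_my_ddpg_variant = dict(
    env_params=dict(),       # forwarded to get_env_settings(**...)
    algo_params=dict(n_epochs=50, batch_size=128),
    qf_params=dict(),
    policy_params=dict(),
    tensorboard='/tmp/tfboard/my-ddpg',   # placeholder log path
    batch_norm_params=None,  # or a dict of BatchNormConfig kwargs
)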
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(env, vitchyr_es, vitchyr_policy, vitchyr_qf,
                            **ddpg_params)

        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(name="qf",
                                          env_spec=env.spec,
                                          hidden_sizes=(100, 100))
        shane_ddpg = ShaneDDPG(env, shane_policy, shane_qf, shane_es,
                               **ddpg_params)

        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
    def test_qf_gradient(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        rewards = np.array([3.5])
        terminals = np.array([0.])
        obs = np.array([[1., 1., 1., 1.]])
        actions = np.array([[1.]])
        next_obs = np.array([[2., 2., 2., 2.]])

        # target = reward + discount * target_qf(next_obs,
        #                                            target_policy(next_obs))
        # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
        #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
        #        = 3.5 + 0.5 * 16
        #        = 11.5
        #
        # dloss/dtheta = - 2 ( y - qf(obs, action)) *
        #                   d/dtheta (qf(obs, action))
        # dloss/dtheta = - 2 ( y - qf([1,1,1,1], 1)) *
        #                   d/dtheta (qf(obs, action))
        # dloss/dtheta = - 2 ( 11.5 - 5) *
        #                   d/dtheta (qf(obs, action))
        # dloss/dtheta = - 13 * d/dtheta (qf(obs, action))
        feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)
        grads = tf.gradients(algo.qf_loss, algo.qf.get_params_internal())
        # qf_grads = algo.sess.run(
        #         tf.gradients(algo.qf.output, algo.qf.get_vars()))
        expected = [-13. * np.ones_like(v) for v in algo.qf.get_param_values()]
        actual = algo.sess.run(grads, feed_dict=feed_dict)
        actual_flat = np.vstack(actual).flatten()
        self.assertTrue(are_np_array_iterables_equal(expected, actual_flat),
                        "Numpy arrays not equal")
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]},
    )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for ddpg_params in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            scale_reward=ddpg_params['scale_reward'],
            **default_ddpg_params,
        )

        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
def main():
    stub(globals())
    env = TfEnv(CartpoleEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)

    for seed in range(3):
        env.reset()
        run_experiment_lite(
            algorithm.train(),
            n_parallel=1,
            snapshot_mode="last",
            exp_prefix="test-qddpg-cartpole",
            seed=seed,
        )
def run_task(_):
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)

        algorithm.train()
    def test_target_params_update(self):
        tau = 0.2
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            soft_target_tau=tau,
        )
        target_qf = algo.target_qf
        target_policy = algo.target_policy
        qf = algo.qf
        policy = algo.policy

        algo.train()

        orig_tc_vals = target_qf.get_param_values()
        orig_ta_vals = target_policy.get_param_values()
        orig_c_vals = qf.get_param_values()
        orig_a_vals = policy.get_param_values()
        algo.sess.run(algo.update_target_policy_op)
        algo.sess.run(algo.update_target_qf_op)
        new_tc_vals = target_qf.get_param_values()
        new_ta_vals = target_policy.get_param_values()

        for orig_tc_val, orig_c_val, new_tc_val in zip(orig_tc_vals,
                                                       orig_c_vals,
                                                       new_tc_vals):
            self.assertTrue((new_tc_val == tau * orig_c_val +
                             (1 - tau) * orig_tc_val).all())

        for orig_ta_val, orig_a_val, new_ta_val in zip(orig_ta_vals,
                                                       orig_a_vals,
                                                       new_ta_vals):
            self.assertTrue((new_ta_val == tau * orig_a_val +
                             (1 - tau) * orig_ta_val).all())
    def test_qf_loss2(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        rewards = np.array([3.5])
        terminals = np.array([0.])
        obs = np.array([[1., 1., 1., 1.]])
        actions = np.array([[2.]])
        next_obs = np.array([[2., 2., 2., 2.]])

        # target = reward + discount * target_qf(next_obs,
        #                                            target_policy(next_obs))
        # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
        #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
        #        = 3.5 + 0.5 * 16
        #        = 11.5
        #
        # loss = (target - qf(obs, action))^2
        #      = (target - qf([1,1,1,1], 2))^2
        #      = (target - 6)^2
        #      = (11.5 - 6)^2
        #      = (5.5)^2
        #      = 30.25
        feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)
        actual = algo.sess.run(algo.qf_loss, feed_dict=feed_dict)
        self.assertEqual(30.25, actual)
        self.assertEqual(np.float32, type(actual))
    def test_sum_policy(self):
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        )
        obs = np.array([[1., 1., 1., 1.]])
        for policy in [algo.policy, algo.target_policy]:
            feed_dict = {
                policy.observation_input: obs,
            }
            self.assertEqual(np.sum(obs),
                             algo.sess.run(policy.output, feed_dict=feed_dict))

    def test_sum_qf(self):
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        )
        obs = np.array([[1., 1., 1., 1.]])
        actions = np.array([[-0.5]])
        for qf in [algo.qf, algo.target_qf]:
            feed_dict = {
                qf.action_input: actions,
                qf.observation_input: obs,
            }
            self.assertEqual(
                np.sum(obs) + actions,
                algo.sess.run(qf.output, feed_dict=feed_dict))
    def test_target_params_no_update(self):
        tau = 0
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            soft_target_tau=tau,
        )
        target_qf = algo.target_qf
        target_policy = algo.target_policy
        qf = algo.qf
        policy = algo.policy

        random_values = [
            np.random.rand(*values.shape) for values in qf.get_param_values()
        ]
        qf.set_param_values(random_values)
        random_values = [
            np.random.rand(*values.shape)
            for values in policy.get_param_values()
        ]
        policy.set_param_values(random_values)
        old_target_qf_values = target_qf.get_param_values()
        old_target_policy_values = target_policy.get_param_values()
        self.assertParamsNotEqual(target_qf, qf)
        self.assertParamsNotEqual(target_policy, policy)
        algo.sess.run(algo.update_target_policy_op)
        algo.sess.run(algo.update_target_qf_op)
        self.assertTrue(
            are_np_array_iterables_equal(old_target_qf_values,
                                         target_qf.get_param_values()))
        self.assertTrue(
            are_np_array_iterables_equal(old_target_policy_values,
                                         target_policy.get_param_values()))
        self.assertParamsNotEqual(target_qf, qf)
        self.assertParamsNotEqual(target_policy, policy)
def main():

    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)

    parser.add_argument('--qf_decay', type=float, default=.0)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)

    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)

    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=1.0)

    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv('Box3dReach-v11',
                                     record_video=False,
                                     log_dir='/tmp/gym_test',
                                     record_log=False)))

    name = 'ddpg-state-v11-plr{0}-qlr{1}-tau{2}-qfdecay{3}-ou_theta{4}-ou_sigma{5}'.format(
        args.policy_initlr, args.qf_initlr, args.qf_soft_tau, args.qf_decay,
        args.ou_theta, args.ou_sigma)

    es = OUStrategy(env_spec=env.spec,
                    theta=args.ou_theta,
                    sigma=args.ou_sigma)

    policy = FeedForwardPolicy(
        name_or_scope="actor",
        observation_hidden_sizes=(400, 300),
        env_spec=env.spec,
    )

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        embedded_hidden_sizes=(100, ),
        observation_hidden_sizes=(100, ),
    )

    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        tensorboard_path=os.path.join(args.tfboard_path, name,
                                      '_%d' % args.seed),
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )

    run_experiment_lite(algo.train(),
                        exp_prefix=name,
                        n_parallel=1,
                        snapshot_mode="last",
                        seed=args.seed,
                        mode="local")
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		hidden_nonlinearity=tf.nn.tanh,
	)

	algo = DDPG(
		env,
		es,
		policy,
		qf,
		"/data0/dianchen/box3d/ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
			policy_lr,
			qf_lr,
			gamma,
			tau,
		),
		qf_learning_rate=qf_lr,
		policy_learning_rate=policy_lr,
		discount=gamma,
		soft_target_tau=tau,
		gpu_ratio=0.25,
	)

	run_experiment_lite(
		algo.train(),
		exp_prefix="ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
			policy_lr,
			qf_lr,
			gamma,
def main():

    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)

    parser.add_argument('--qf_decay', type=float, default=0.01)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)

    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)

    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=0.95)

    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv('Box3dReachPixel-v15',
                                     record_video=False,
                                     log_dir='/tmp/gym_test',
                                     record_log=False)))

    name = 'ddpg-pixel-v15-plr{0}-qlr{1}-tau{2}-qfdecay{3}'.format(
        args.policy_initlr, args.qf_initlr, args.qf_soft_tau, args.qf_decay)

    es = OUStrategy(env_spec=env.spec,
                    theta=args.ou_theta,
                    sigma=args.ou_sigma)
    # import pdb; pdb.set_trace()

    qf = ConvNNCritic(
        name_or_scope="critic",
        input_shape=env.observation_space.shape,
        env_spec=env.spec,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        observation_hidden_sizes=(256, ),
        embedded_hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
    )

    policy = ConvNNPolicy(
        name_or_scope="actor",
        input_shape=env.observation_space.shape,
        env_spec=env.spec,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        observation_hidden_sizes=(256, 128),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )

    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        tensorboard_path=os.path.join(args.tfboard_path,
                                      name + '_%d' % args.seed),
        replay_pool_size=100000,
        obs_dtype='uint8',
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )

    run_experiment_lite(algo.train(),
                        exp_prefix=name,
                        n_parallel=1,
                        snapshot_mode="last",
                        seed=args.seed,
                        mode="local")