Example #1
    def _setup_actor_optimizer(self, scope):
        """Create the actor loss, gradient, and optimizer."""
        if self.verbose >= 2:
            print('setting up actor optimizer')

        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            actor_shapes = [
                var.get_shape().as_list()
                for var in get_trainable_vars(scope_name)
            ]
            actor_nb_params = sum(
                [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
            print('  actor shapes: {}'.format(actor_shapes))
            print('  actor params: {}'.format(actor_nb_params))

        # compute the actor loss
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf[0])

        # create an optimizer object
        optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.actor_optimizer = optimizer.minimize(
            self.actor_loss, var_list=get_trainable_vars(scope_name))
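Every example on this page passes the result of `get_trainable_vars` to an optimizer. As a point of reference, here is a minimal sketch of such a helper, assuming it is a thin wrapper around TensorFlow's trainable-variable collection (the actual implementation in the source repository may differ):

    import tensorflow as tf

    def get_trainable_vars(name=None):
        """Return the trainable variables whose scope starts with `name`.

        If `name` is None, all trainable variables in the default graph are
        returned.
        """
        return tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=name)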
Example #2
    def _setup_actor_update(self, all_obs_ph, combined_actors, scope):
        """Create the actor loss and optimization process.

        Parameters
        ----------
        all_obs_ph : tf.compat.v1.placeholder
            the placeholder for the full-state observation
        combined_actors : tf.Variable
            the output from all actors, as a function of the agent's policy
            parameters
        scope : str
            an outer scope term

        Returns
        -------
        tf.Operation
            the operation that returns the loss of the actor
        tf.Operation
            the operation that updates the trainable parameters of the actor
        """
        if self.verbose >= 2:
            print('setting up actor optimizer')

        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            actor_shapes = [
                var.get_shape().as_list()
                for var in get_trainable_vars(scope_name)
            ]
            actor_nb_params = sum(
                [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
            print('  actor shapes: {}'.format(actor_shapes))
            print('  actor params: {}'.format(actor_nb_params))

        # Create a differentiable form of the critic.
        with tf.compat.v1.variable_scope("model", reuse=False):
            critic_with_actor_tf = [
                self.make_critic(all_obs_ph,
                                 combined_actors,
                                 scope="centralized_qf_{}".format(i),
                                 reuse=True) for i in range(2)
            ]

        # compute the actor loss
        actor_loss = -tf.reduce_mean(critic_with_actor_tf[0])

        # Add a regularization penalty.
        actor_loss += self._l2_loss(self.l2_penalty, scope_name)

        # create an optimizer object
        optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        actor_optimizer = optimizer.minimize(
            loss=actor_loss, var_list=get_trainable_vars(scope_name))

        return actor_loss, actor_optimizer
Example #3
    def test_init_conv(self):
        """Check the functionality of the __init__() method with conv policies.

        This method tests that the proper structure graph was generated.
        """
        policy_params = self.policy_params.copy()
        policy_params["model_params"]["model_type"] = "conv"
        _ = TD3FeedForwardPolicy(**policy_params)

        print(sorted([var.name for var in get_trainable_vars()]))
        # test case 1
        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]), [
                'model/pi/conv0/bias:0', 'model/pi/conv0/kernel:0',
                'model/pi/conv1/bias:0', 'model/pi/conv1/kernel:0',
                'model/pi/conv2/bias:0', 'model/pi/conv2/kernel:0',
                'model/pi/fc0/bias:0', 'model/pi/fc0/kernel:0',
                'model/pi/fc1/bias:0', 'model/pi/fc1/kernel:0',
                'model/pi/output/bias:0', 'model/pi/output/kernel:0',
                'model/qf_0/conv0/bias:0', 'model/qf_0/conv0/kernel:0',
                'model/qf_0/conv1/bias:0', 'model/qf_0/conv1/kernel:0',
                'model/qf_0/conv2/bias:0', 'model/qf_0/conv2/kernel:0',
                'model/qf_0/fc0/bias:0', 'model/qf_0/fc0/kernel:0',
                'model/qf_0/fc1/bias:0', 'model/qf_0/fc1/kernel:0',
                'model/qf_0/qf_output/bias:0', 'model/qf_0/qf_output/kernel:0',
                'model/qf_1/conv0/bias:0', 'model/qf_1/conv0/kernel:0',
                'model/qf_1/conv1/bias:0', 'model/qf_1/conv1/kernel:0',
                'model/qf_1/conv2/bias:0', 'model/qf_1/conv2/kernel:0',
                'model/qf_1/fc0/bias:0', 'model/qf_1/fc0/kernel:0',
                'model/qf_1/fc1/bias:0', 'model/qf_1/fc1/kernel:0',
                'model/qf_1/qf_output/bias:0', 'model/qf_1/qf_output/kernel:0',
                'target/pi/conv0/bias:0', 'target/pi/conv0/kernel:0',
                'target/pi/conv1/bias:0', 'target/pi/conv1/kernel:0',
                'target/pi/conv2/bias:0', 'target/pi/conv2/kernel:0',
                'target/pi/fc0/bias:0', 'target/pi/fc0/kernel:0',
                'target/pi/fc1/bias:0', 'target/pi/fc1/kernel:0',
                'target/pi/output/bias:0', 'target/pi/output/kernel:0',
                'target/qf_0/conv0/bias:0', 'target/qf_0/conv0/kernel:0',
                'target/qf_0/conv1/bias:0', 'target/qf_0/conv1/kernel:0',
                'target/qf_0/conv2/bias:0', 'target/qf_0/conv2/kernel:0',
                'target/qf_0/fc0/bias:0', 'target/qf_0/fc0/kernel:0',
                'target/qf_0/fc1/bias:0', 'target/qf_0/fc1/kernel:0',
                'target/qf_0/qf_output/bias:0',
                'target/qf_0/qf_output/kernel:0', 'target/qf_1/conv0/bias:0',
                'target/qf_1/conv0/kernel:0', 'target/qf_1/conv1/bias:0',
                'target/qf_1/conv1/kernel:0', 'target/qf_1/conv2/bias:0',
                'target/qf_1/conv2/kernel:0', 'target/qf_1/fc0/bias:0',
                'target/qf_1/fc0/kernel:0', 'target/qf_1/fc1/bias:0',
                'target/qf_1/fc1/kernel:0', 'target/qf_1/qf_output/bias:0',
                'target/qf_1/qf_output/kernel:0'
            ])
Example #4
    def _setup_actor_optimizer(self, scope):
        """Create minimization operations for policy and entropy.

        Creates a `tf.optimizer.minimize` operations for updating policy and
        entropy with gradient descent.

        See Section 4.2 in [1], for further information of the policy update,
        and Section 5 in [1] for further information of the entropy update.
        """
        if self.verbose >= 2:
            print('setting up actor and alpha optimizers')

        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            actor_shapes = [
                var.get_shape().as_list()
                for var in get_trainable_vars(scope_name)
            ]
            actor_nb_params = sum(
                [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
            print('  actor shapes: {}'.format(actor_shapes))
            print('  actor params: {}'.format(actor_nb_params))

        # Take the min of the two Q-Values (Double-Q Learning)
        min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi)

        # Compute the entropy temperature loss.
        self.alpha_loss = -tf.reduce_mean(
            self.log_alpha *
            tf.stop_gradient(self.logp_pi + self.target_entropy))

        alpha_optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.alpha_optimizer = alpha_optimizer.minimize(
            self.alpha_loss, var_list=self.log_alpha)

        # Compute the policy loss
        self.actor_loss = tf.reduce_mean(self.alpha * self.logp_pi - min_qf_pi)

        # Policy train op (has to be separate from value train op, because
        # min_qf_pi appears in policy_loss)
        actor_optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.actor_optimizer = actor_optimizer.minimize(
            self.actor_loss, var_list=get_trainable_vars(scope_name))
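The temperature loss above drives `self.logp_pi` towards `-self.target_entropy`. When no target entropy is specified, SAC implementations commonly default it to the negative of the action dimensionality; a hedged sketch of that heuristic (the source repository may use a different default):

    import numpy as np

    def default_target_entropy(ac_space):
        """Common SAC heuristic: target entropy = -dim(A)."""
        return -np.prod(ac_space.shape).astype(np.float32)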
Example #5
    def _setup_critic_optimizer(self, critic_target, scope):
        """Create the critic loss, gradient, and optimizer."""
        if self.verbose >= 2:
            print('setting up critic optimizer')

        # compute the target critic term
        with tf.compat.v1.variable_scope("loss", reuse=False):
            q_obs1 = tf.minimum(critic_target[0], critic_target[1])
            target_q = tf.stop_gradient(self.rew_ph + (1. - self.terminals1) *
                                        self.gamma * q_obs1)

            tf.compat.v1.summary.scalar('critic_target',
                                        tf.reduce_mean(target_q))

        # choose the loss function
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        self.critic_loss = [loss_fn(q, target_q) for q in self.critic_tf]

        self.critic_optimizer = []

        for i, critic_loss in enumerate(self.critic_loss):
            scope_name = 'model/qf_{}/'.format(i)
            if scope is not None:
                scope_name = scope + '/' + scope_name

            if self.verbose >= 2:
                critic_shapes = [
                    var.get_shape().as_list()
                    for var in get_trainable_vars(scope_name)
                ]
                critic_nb_params = sum([
                    reduce(lambda x, y: x * y, shape)
                    for shape in critic_shapes
                ])
                print('  critic shapes: {}'.format(critic_shapes))
                print('  critic params: {}'.format(critic_nb_params))

            # create an optimizer object
            optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)

            # create the optimization operation
            self.critic_optimizer.append(
                optimizer.minimize(loss=critic_loss,
                                   var_list=get_trainable_vars(scope_name)))
Example #6
    def _l2_loss(l2_penalty, scope_name):
        """Compute the L2 regularization penalty.

        Parameters
        ----------
        l2_penalty : float
            L2 regularization penalty
        scope_name : str
            the scope of the trainable variables to regularize

        Returns
        -------
        float
            the overall regularization penalty
        """
        if l2_penalty > 0:
            print("regularizing policy network: L2 = {}".format(l2_penalty))
            regularizer = tf.contrib.layers.l2_regularizer(
                scale=l2_penalty, scope="{}/l2_regularize".format(scope_name))
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer,
                weights_list=get_trainable_vars(scope_name))
        else:
            # no regularization
            l2_loss = 0

        return l2_loss
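`tf.contrib` was removed in TensorFlow 2.x, so the branch above only runs on TF 1.x. A contrib-free sketch that produces the same penalty, assuming `l2_regularizer`'s convention of scale times half the sum of squared weights (which is exactly what `tf.nn.l2_loss` computes per variable):

    import tensorflow as tf

    def l2_loss_no_contrib(l2_penalty, weights_list):
        """Scale the summed 0.5 * ||w||^2 of the given variables by the penalty."""
        if l2_penalty <= 0 or not weights_list:
            return 0.
        return l2_penalty * tf.add_n(
            [tf.nn.l2_loss(w) for w in weights_list])

It would be called with the variables from `get_trainable_vars(scope_name)` in place of `weights_list`, mirroring the contrib-based branch above.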
Example #7
    def _setup_deterministic_optimizer(self, action, scope=None):
        """Create the loss and optimizer of a deterministic policy."""
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up optimizer')
            print_params_shape(scope_name, "policy")

        # Choose the loss function.
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        # Define the loss function.
        self.loss = loss_fn(action, self.policy)

        # Create an optimizer object.
        optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

        # Create the optimizer operation.
        self.optimizer = optimizer.minimize(
            loss=self.loss, var_list=get_trainable_vars(scope_name))
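Several of the later examples call `print_params_shape` instead of inlining the verbose block from Examples #1 and #5. A sketch of what that helper presumably does, reconstructed from those inlined blocks (formatting details in the source repository may differ):

    from functools import reduce

    import tensorflow as tf

    def print_params_shape(scope, name):
        """Print the shapes and parameter count of the variables in `scope`."""
        shapes = [
            var.get_shape().as_list()
            for var in tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
        ]
        nb_params = sum(
            reduce(lambda x, y: x * y, shape) for shape in shapes)
        print('  {} shapes: {}'.format(name, shapes))
        print('  {} params: {}'.format(name, nb_params))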
Example #8
    def test_init(self):
        """Check the functionality of the __init__() method.

        This tests that the proper structure graph and the proper loss
        function are generated for the following cases:

        1. stochastic policies
        2. deterministic policies
        """
        # test case 1
        policy_params = self.policy_params.copy()
        policy_params["stochastic"] = True
        _ = ImitationFeedForwardPolicy(**policy_params)

        # test the graph
        expected_vars = [
            '0:0', '1:0', 'model/pi/fc0/bias:0', 'model/pi/fc0/kernel:0',
            'model/pi/fc1/bias:0', 'model/pi/fc1/kernel:0',
            'model/pi/log_std/bias:0', 'model/pi/log_std/kernel:0',
            'model/pi/mean/bias:0', 'model/pi/mean/kernel:0'
        ]

        try:
            self.assertListEqual(
                sorted([var.name for var in get_trainable_vars()]),
                expected_vars)
        except AssertionError:
            # Seems to ignore the first two sometimes.
            self.assertListEqual(
                sorted([var.name for var in get_trainable_vars()]),
                expected_vars[2:])

        # Clear the graph.
        tf.compat.v1.reset_default_graph()

        # test case 2
        policy_params = self.policy_params.copy()
        policy_params["stochastic"] = False
        _ = ImitationFeedForwardPolicy(**policy_params)

        # test the graph
        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]), [
                'model/pi/fc0/bias:0', 'model/pi/fc0/kernel:0',
                'model/pi/fc1/bias:0', 'model/pi/fc1/kernel:0',
                'model/pi/output/bias:0', 'model/pi/output/kernel:0'
            ])
Example #9
    def test_setup_model_feedforward(self):
        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = FeedForwardPolicy
        policy_params['_init_setup_model'] = True
        alg = RLAlgorithm(**policy_params)

        # check the policy_kwargs term
        policy_kwargs = FEEDFORWARD_PARAMS.copy()
        policy_kwargs.update(TD3_PARAMS)
        policy_kwargs['verbose'] = self.init_parameters['verbose']
        policy_kwargs['num_envs'] = self.init_parameters['num_envs']
        self.assertDictEqual(alg.policy_kwargs, policy_kwargs)

        with alg.graph.as_default():
            expected_vars = sorted([var.name for var in get_trainable_vars()])

        # Check that all trainable variables have been created in the
        # TensorFlow graph.
        self.assertListEqual(
            expected_vars,
            ['model/pi/fc0/bias:0',
             'model/pi/fc0/kernel:0',
             'model/pi/fc1/bias:0',
             'model/pi/fc1/kernel:0',
             'model/pi/output/bias:0',
             'model/pi/output/kernel:0',
             'model/qf_0/fc0/bias:0',
             'model/qf_0/fc0/kernel:0',
             'model/qf_0/fc1/bias:0',
             'model/qf_0/fc1/kernel:0',
             'model/qf_0/qf_output/bias:0',
             'model/qf_0/qf_output/kernel:0',
             'model/qf_1/fc0/bias:0',
             'model/qf_1/fc0/kernel:0',
             'model/qf_1/fc1/bias:0',
             'model/qf_1/fc1/kernel:0',
             'model/qf_1/qf_output/bias:0',
             'model/qf_1/qf_output/kernel:0',
             'target/pi/fc0/bias:0',
             'target/pi/fc0/kernel:0',
             'target/pi/fc1/bias:0',
             'target/pi/fc1/kernel:0',
             'target/pi/output/bias:0',
             'target/pi/output/kernel:0',
             'target/qf_0/fc0/bias:0',
             'target/qf_0/fc0/kernel:0',
             'target/qf_0/fc1/bias:0',
             'target/qf_0/fc1/kernel:0',
             'target/qf_0/qf_output/bias:0',
             'target/qf_0/qf_output/kernel:0',
             'target/qf_1/fc0/bias:0',
             'target/qf_1/fc0/kernel:0',
             'target/qf_1/fc1/bias:0',
             'target/qf_1/fc1/kernel:0',
             'target/qf_1/qf_output/bias:0',
             'target/qf_1/qf_output/kernel:0']
        )
Example #10
    def test_init(self):
        """Check the functionality of the __init__() method.

        This tests that the proper structure graph and the proper loss
        function are generated for the following cases:

        1. stochastic policies
        2. deterministic policies
        """
        # test case 1
        policy_params = self.policy_params.copy()
        policy_params["stochastic"] = True
        policy = ImitationFeedForwardPolicy(**policy_params)

        # test the graph
        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]), [
                'model/pi/fc0/bias:0', 'model/pi/fc0/kernel:0',
                'model/pi/fc1/bias:0', 'model/pi/fc1/kernel:0',
                'model/pi/log_std/bias:0', 'model/pi/log_std/kernel:0',
                'model/pi/mean/bias:0', 'model/pi/mean/kernel:0'
            ])

        # test the loss function
        del policy  # TODO

        # Clear the graph.
        tf.compat.v1.reset_default_graph()

        # test case 2
        policy_params = self.policy_params.copy()
        policy_params["stochastic"] = False
        policy = ImitationFeedForwardPolicy(**policy_params)

        # test the graph
        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]), [
                'model/pi/fc0/bias:0', 'model/pi/fc0/kernel:0',
                'model/pi/fc1/bias:0', 'model/pi/fc1/kernel:0',
                'model/pi/output/bias:0', 'model/pi/output/kernel:0'
            ])

        # test the loss function
        del policy  # TODO
Example #11
    def _setup_connected_gradients(self):
        """Create the updated manager optimization with connected gradients."""
        goal_dim = self.manager.ac_space.shape[0]
        obs_shape = self.worker.ob_space.shape[0]
        obs = tf.concat(
            [self.worker.obs1_ph[:, :obs_shape],
             self.manager.obs_ph[:, obs_shape:]],
            axis=-1)

        if self.relative_goals:
            # The observation from the perspective of the manager can be
            # collected from the first goal_dim elements of the observation. We
            # use goal_dim in case the goal-specific observations are not the
            # entire observation space.
            obs_t = self.manager.obs_ph[:, :goal_dim]
            # We collect the observation of the worker in a similar fashion as
            # above.
            obs_tpi = self.worker.obs1_ph[:, :goal_dim]
            # Relative goal formulation as per HIRO.
            goal = obs_t + self.manager.action_ph - obs_tpi
        else:
            # Goal is the direct output from the manager in this case.
            goal = self.manager.action_ph

        with tf.compat.v1.variable_scope("Manager/model"):
            manager_with_worker_message = self.manager.make_critic(
                obs, goal, self.worker.message_tf, reuse=True, scope="qf_0")

        self.cg_loss = -tf.reduce_mean(manager_with_worker_message)

        kl_div_loss = 1 + self.worker.message_std - tf.square(
            self.worker.message_mean) - tf.exp(self.worker.message_std)
        kl_div_loss = -0.5 * tf.reduce_sum(kl_div_loss, 1)
        self.worker_message_loss = tf.reduce_mean(kl_div_loss)

        optimizer = tf.compat.v1.train.AdamOptimizer(self.worker.actor_lr)
        self.cg_optimizer = optimizer.minimize(
            0.1 * self.worker_message_loss + self.cg_weights * self.cg_loss +
            self.worker.actor_loss,
            var_list=get_trainable_vars("Worker/communication/") +
            get_trainable_vars("Worker/model/pi"),
        )
Example #12
    def _setup_critic_optimizer(self, scope):
        """Create minimization operation for critic Q-function.

        Create a `tf.optimizer.minimize` operation for updating critic
        Q-function with gradient descent.

        See Equations (5, 6) in [1], for further information of the Q-function
        update rule.
        """
        scope_name = 'model/value_fns'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up critic optimizer')
            for name in ['qf1', 'qf2', 'vf']:
                scope_i = '{}/{}'.format(scope_name, name)
                print_params_shape(scope_i, name)

        # Take the min of the two Q-Values (Double-Q Learning)
        min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi)

        # Target for Q value regression
        q_backup = tf.stop_gradient(
            self.rew_ph +
            (1 - self.terminals1) * self.gamma * self.value_target)

        # choose the loss function
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        # Compute Q-Function loss
        qf1_loss = loss_fn(q_backup, self.qf1)
        qf2_loss = loss_fn(q_backup, self.qf2)

        # Target for value fn regression
        # We update the vf towards the min of two Q-functions in order to
        # reduce overestimation bias from function approximation error.
        v_backup = tf.stop_gradient(min_qf_pi - self.alpha * self.logp_pi)
        value_loss = loss_fn(self.value_fn, v_backup)

        self.critic_loss = (qf1_loss, qf2_loss, value_loss)

        # Combine the loss functions for the optimizer.
        critic_loss = qf1_loss + qf2_loss + value_loss

        # Critic train op
        critic_optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)
        self.critic_optimizer = critic_optimizer.minimize(
            critic_loss,
            var_list=get_trainable_vars(scope_name))
Example #13
    def test_init(self):
        """Check the functionality of the __init__() method.

        This method is tested for the following features:

        1. The proper structure graph was generated.
        2. All input placeholders are correct.
        """
        policy_params = deepcopy(self.policy_params)
        policy_params['sess'] = tf.compat.v1.Session()
        policy = PPOFeedForwardPolicy(**policy_params)

        # test case 1
        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]),
            ['model/logstd:0',
             'model/pi/fc0/bias:0',
             'model/pi/fc0/kernel:0',
             'model/pi/fc1/bias:0',
             'model/pi/fc1/kernel:0',
             'model/pi/output/bias:0',
             'model/pi/output/kernel:0',
             'model/vf/fc0/bias:0',
             'model/vf/fc0/kernel:0',
             'model/vf/fc1/bias:0',
             'model/vf/fc1/kernel:0',
             'model/vf/output/bias:0',
             'model/vf/output/kernel:0']
        )

        # test case 2
        self.assertEqual(
            tuple(v.__int__() for v in policy.rew_ph.shape),
            (None,))
        self.assertEqual(
            tuple(v.__int__() for v in policy.action_ph.shape),
            (None, 1))
        self.assertEqual(
            tuple(v.__int__() for v in policy.obs_ph.shape),
            (None, 5))
        self.assertEqual(
            tuple(v.__int__() for v in policy.advs_ph.shape),
            (None,))
        self.assertEqual(
            tuple(v.__int__() for v in policy.old_neglog_pac_ph.shape),
            (None,))
        self.assertEqual(
            tuple(v.__int__() for v in policy.old_vpred_ph.shape),
            (None,))

        # Kill the session.
        policy_params['sess'].close()
Example #14
    def _setup_target_updates(model_scope, target_scope, scope, tau, verbose):
        """Create the soft and initial target updates.

        The initial model parameters are assumed to be stored under the scope
        name "model", while the target policy parameters are assumed to be
        under the scope name "target".

        If an additional outer scope was provided when creating the policies,
        they can be passed under the `scope` parameter.

        Parameters
        ----------
        model_scope : str
            the scope of the model parameters
        target_scope : str
            the scope of the target parameters
        scope : str or None
            the outer scope, set to None if not available
        tau : float
            target update rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug

        Returns
        -------
        tf.Operation
            initial target updates, to match the target with the model
        tf.Operation
            soft target update operations
        """
        if scope is not None:
            model_scope = scope + '/' + model_scope
            target_scope = scope + '/' + target_scope

        return get_target_updates(
            get_trainable_vars(model_scope),
            get_trainable_vars(target_scope),
            tau, verbose)
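The helper delegates to `get_target_updates`, which is not shown on this page. A minimal sketch of the usual soft/initial target-update construction it presumably performs (the actual implementation may differ, e.g. in logging):

    import tensorflow as tf

    def get_target_updates(model_vars, target_vars, tau, verbose=0):
        """Return (init_updates, soft_updates) for a target network.

        The init op copies the model parameters into the target network, and
        the soft op performs a Polyak averaging step with rate `tau`.
        """
        if verbose >= 2:
            print('setting up target updates ...')
        init_updates = []
        soft_updates = []
        for var, target_var in zip(model_vars, target_vars):
            init_updates.append(tf.compat.v1.assign(target_var, var))
            soft_updates.append(tf.compat.v1.assign(
                target_var, (1. - tau) * target_var + tau * var))
        return tf.group(*init_updates), tf.group(*soft_updates)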
Example #15
    def test_init(self):
        """Check the functionality of the __init__() method.

        This method is tested for the following features:

        1. The proper structure graph was generated.
        2. All input placeholders are correct.
        """
        policy = TD3FeedForwardPolicy(**self.policy_params)

        # test case 1
        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]), [
                'model/pi/fc0/bias:0', 'model/pi/fc0/kernel:0',
                'model/pi/fc1/bias:0', 'model/pi/fc1/kernel:0',
                'model/pi/output/bias:0', 'model/pi/output/kernel:0',
                'model/qf_0/fc0/bias:0', 'model/qf_0/fc0/kernel:0',
                'model/qf_0/fc1/bias:0', 'model/qf_0/fc1/kernel:0',
                'model/qf_0/qf_output/bias:0', 'model/qf_0/qf_output/kernel:0',
                'model/qf_1/fc0/bias:0', 'model/qf_1/fc0/kernel:0',
                'model/qf_1/fc1/bias:0', 'model/qf_1/fc1/kernel:0',
                'model/qf_1/qf_output/bias:0', 'model/qf_1/qf_output/kernel:0',
                'target/pi/fc0/bias:0', 'target/pi/fc0/kernel:0',
                'target/pi/fc1/bias:0', 'target/pi/fc1/kernel:0',
                'target/pi/output/bias:0', 'target/pi/output/kernel:0',
                'target/qf_0/fc0/bias:0', 'target/qf_0/fc0/kernel:0',
                'target/qf_0/fc1/bias:0', 'target/qf_0/fc1/kernel:0',
                'target/qf_0/qf_output/bias:0',
                'target/qf_0/qf_output/kernel:0', 'target/qf_1/fc0/bias:0',
                'target/qf_1/fc0/kernel:0', 'target/qf_1/fc1/bias:0',
                'target/qf_1/fc1/kernel:0', 'target/qf_1/qf_output/bias:0',
                'target/qf_1/qf_output/kernel:0'
            ])

        # test case 2
        self.assertEqual(tuple(v.__int__() for v in policy.terminals1.shape),
                         (None, 1))
        self.assertEqual(tuple(v.__int__() for v in policy.rew_ph.shape),
                         (None, 1))
        self.assertEqual(tuple(v.__int__() for v in policy.action_ph.shape),
                         (None, self.policy_params['ac_space'].shape[0]))
        self.assertEqual(tuple(v.__int__() for v in policy.obs_ph.shape),
                         (None, self.policy_params['ob_space'].shape[0] +
                          self.policy_params['co_space'].shape[0]))
        self.assertEqual(tuple(v.__int__() for v in policy.obs1_ph.shape),
                         (None, self.policy_params['ob_space'].shape[0] +
                          self.policy_params['co_space'].shape[0]))
Example #16
    def _setup_actor_optimizer(self, scope):
        """Create the actor loss, gradient, and optimizer."""
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up actor optimizer')
            print_params_shape(scope_name, "actor")

        # compute the actor loss
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf[0])

        # create an optimizer object
        optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.actor_optimizer = optimizer.minimize(
            self.actor_loss, var_list=get_trainable_vars(scope_name))
Example #17
    def _setup_stochastic_optimizer(self, scope):
        """Create the loss and optimizer of a stochastic policy."""
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up optimizer')
            print_params_shape(scope_name, "policy")

        # Define the loss function.
        self.loss = -tf.reduce_mean(self.logp_ac)

        # Create an optimizer object.
        optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

        # Create the optimizer operation.
        self.optimizer = optimizer.minimize(
            loss=self.loss, var_list=get_trainable_vars(scope_name))
Example #18
    def _setup_connected_gradients(self):
        """Create the updated manager optimization with connected gradients."""
        # Index relevant variables based on self.goal_indices
        manager_obs0 = self.crop_to_goal(self.manager.obs_ph)
        manager_obs1 = self.crop_to_goal(self.manager.obs1_ph)
        worker_obs0 = self.crop_to_goal(self.worker.obs_ph)
        worker_obs1 = self.crop_to_goal(self.worker.obs1_ph)

        if self.relative_goals:
            # Relative goal formulation as per HIRO.
            goal = manager_obs0 + self.manager.actor_tf - manager_obs1
        else:
            # Goal is the direct output from the manager in this case.
            goal = self.manager.actor_tf

        # concatenate the output from the manager with the worker policy.
        obs_shape = self.worker.ob_space.shape[0]
        obs = tf.concat([self.worker.obs_ph[:, :obs_shape], goal], axis=-1)

        # create the worker policy with inputs directly from the manager
        with tf.compat.v1.variable_scope("Worker/model"):
            worker_with_manager_obs = self.worker.make_critic(
                obs, self.worker.action_ph, reuse=True, scope="qf_0")

        # create a tensorflow operation that mimics the reward function that is
        # used to provide feedback to the worker
        if self.relative_goals:
            reward_fn = -tf.compat.v1.losses.mean_squared_error(
                worker_obs0 + goal, worker_obs1)
        else:
            reward_fn = -tf.compat.v1.losses.mean_squared_error(
                goal, worker_obs1)

        # compute the worker loss with respect to the manager actions
        self.cg_loss = - tf.reduce_mean(worker_with_manager_obs) - reward_fn

        # create the optimizer object
        optimizer = tf.compat.v1.train.AdamOptimizer(self.manager.actor_lr)
        self.cg_optimizer = optimizer.minimize(
            self.manager.actor_loss + self.cg_weights * self.cg_loss,
            var_list=get_trainable_vars("Manager/model/pi/"),
        )
Example #19
    def test_init_conv(self):
        """Check the functionality of the __init__() method with conv policies.

        This method tests that the proper structure graph was generated.
        """
        policy_params = deepcopy(self.policy_params)
        policy_params['sess'] = tf.compat.v1.Session()
        policy_params["model_params"]["model_type"] = "conv"
        _ = PPOFeedForwardPolicy(**policy_params)

        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]),
            ['model/logstd:0',
             'model/pi/conv0/bias:0',
             'model/pi/conv0/kernel:0',
             'model/pi/conv1/bias:0',
             'model/pi/conv1/kernel:0',
             'model/pi/conv2/bias:0',
             'model/pi/conv2/kernel:0',
             'model/pi/fc0/bias:0',
             'model/pi/fc0/kernel:0',
             'model/pi/fc1/bias:0',
             'model/pi/fc1/kernel:0',
             'model/pi/output/bias:0',
             'model/pi/output/kernel:0',
             'model/vf/conv0/bias:0',
             'model/vf/conv0/kernel:0',
             'model/vf/conv1/bias:0',
             'model/vf/conv1/kernel:0',
             'model/vf/conv2/bias:0',
             'model/vf/conv2/kernel:0',
             'model/vf/fc0/bias:0',
             'model/vf/fc0/kernel:0',
             'model/vf/fc1/bias:0',
             'model/vf/fc1/kernel:0',
             'model/vf/output/bias:0',
             'model/vf/output/kernel:0']
        )

        # Kill the session.
        policy_params['sess'].close()
Example #20
    def _setup_connected_gradients(self):
        """Create the connected gradients meta-policy optimizer."""
        # Index relevant variables based on self.goal_indices
        meta_obs0 = self.crop_to_goal(self.policy[0].obs_ph)
        meta_obs1 = self.crop_to_goal(self.policy[0].obs1_ph)
        worker_obs0 = self.crop_to_goal(self.policy[-1].obs_ph)
        worker_obs1 = self.crop_to_goal(self.policy[-1].obs1_ph)

        if self.relative_goals:
            # Relative goal formulation as per HIRO.
            goal = meta_obs0 + self.policy[0].actor_tf - meta_obs1
        else:
            # Goal is the direct output from the meta policy in this case.
            goal = self.policy[0].actor_tf

        # Concatenate the output from the manager with the worker policy.
        obs_shape = self.policy[-1].ob_space.shape[0]
        obs = tf.concat([self.policy[-1].obs_ph[:, :obs_shape], goal], axis=-1)

        # Create the worker policy with inputs directly from the manager.
        with tf.compat.v1.variable_scope("level_1/model"):
            worker_with_meta_obs = self.policy[-1].make_critic(
                obs, self.policy[-1].action_ph, reuse=True, scope="qf_0")

        # Create a tensorflow operation that mimics the reward function that is
        # used to provide feedback to the worker.
        if self.intrinsic_reward_type.startswith("scaled"):
            # Scale the observations/goals by the action space of the upper-
            # level policy if requested.
            ac_space = self.policy[0].ac_space
            scale = 0.5 * (ac_space.high - ac_space.low)
            worker_obs0 /= scale
            goal /= scale
            worker_obs1 /= scale

        if self.relative_goals:
            # Implement relative goals if requested.
            goal += worker_obs0

        if self.intrinsic_reward_type.endswith("exp_negative_distance"):
            reward_fn = tf.reduce_mean(
                tf.exp(-tf.reduce_sum(
                    tf.square(worker_obs0 + goal - worker_obs1), axis=1)))
        elif self.intrinsic_reward_type.endswith("negative_distance"):
            reward_fn = -tf.compat.v1.losses.mean_squared_error(
                worker_obs0 + goal, worker_obs1)
        else:
            raise ValueError("Unknown intrinsic reward type: {}".format(
                self.intrinsic_reward_type))

        # Scale by the worker reward scale.
        reward_fn *= self.intrinsic_reward_scale

        # Compute the worker loss with respect to the meta policy actions.
        self.cg_loss = -tf.reduce_mean(worker_with_meta_obs) - reward_fn

        # Create the optimizer object.
        optimizer = tf.compat.v1.train.AdamOptimizer(self.policy[0].actor_lr)
        self.cg_optimizer = optimizer.minimize(
            self.policy[0].actor_loss + self.cg_weights * self.cg_loss,
            var_list=get_trainable_vars("level_0/model/pi/"),
        )
Example #21
    def initialize(self):
        """See parent class.

        This method performs the following operations:

        - It calls the initialization methods of the policies at every level of
          the hierarchy to match the target value function parameters with the
          current policy parameters.
        - It also imports the lower-level policies from a pretrained checkpoint
          if a path to one is specified.
        """
        # Initialize the separate policies in the hierarchy.
        for i in range(self.num_levels):
            self.policy[i].initialize()

        if self.pretrain_path is not None:
            ckpt_path = os.path.join(self.pretrain_path, "checkpoints")

            # Get the checkpoint number.
            if self.pretrain_ckpt is None:
                filenames = os.listdir(ckpt_path)
                metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
                metanum = [int(f.split("-")[-1]) for f in metafiles]
                ckpt_num = max(metanum)
            else:
                ckpt_num = self.pretrain_ckpt

            # Extract the checkpoint path.
            ckpt_path = os.path.join(ckpt_path, "itr-{}".format(ckpt_num))
            var_list = tf.train.list_variables(ckpt_path)
            ckpt_reader = tf.train.load_checkpoint(ckpt_path)

            # Check that the number of levels match.
            assert var_list[-1][0].startswith(
                "level_{}".format(self.num_levels-1)), \
                "Number of levels between the checkpoint and current policy " \
                "do not match. Policy={}, Checkpoint={}".format(
                    self.num_levels,
                    int(var_list[-1][0].split("/")[0][6:]) + 1)

            # Check that the names and shapes of the lowest-level policy
            # parameters match the current policy.
            current_vars = {
                v.name: v.shape.as_list()
                for v in get_trainable_vars()
            }
            for var in var_list:
                var_name, var_shape = var
                var_name = "{}:0".format(var_name)
                # We only check the lower-level policies.
                if any(
                        var_name.startswith("level_{}".format(level))
                        for level in range(1, self.num_levels)):
                    assert var_name in current_vars.keys(), \
                        "{} not available in current policy.".format(var_name)
                    current_shape = current_vars[var_name]
                    assert current_shape == var_shape, \
                        "Shape mismatch for {}, {} != {}".format(
                            var_name, var_shape, current_shape)

            # Import the lower-level policy parameters.
            current_vars = {v.name: v for v in get_trainable_vars()}
            for var in var_list:
                var_name, var_shape = var
                if any(
                        var_name.startswith("level_{}".format(level))
                        for level in range(1, self.num_levels)):
                    value = ckpt_reader.get_tensor(var_name)
                    var_name = "{}:0".format(var_name)
                    self.sess.run(
                        tf.compat.v1.assign(current_vars[var_name], value))
Example #22
    def _setup_cooperative_gradients(self):
        """Create the cooperative gradients meta-policy optimizer."""
        self._n_train_steps = 0

        if self.cg_delta is not None:
            # trainable variable for the log of the lambda term.
            self.cg_weights = [
                tf.compat.v1.Variable(initial_value=-4.20, trainable=True)
                for _ in range(self.num_levels - 1)
            ]
        else:
            self.cg_weights = [
                self.cg_weights for _ in range(self.num_levels - 1)
            ]

        self.cg_loss = []
        self.cg_optimizer = []
        for level in range(self.num_levels - 1):
            # Index relevant variables based on self.goal_indices
            meta_obs0 = self.crop_to_goal(self.policy[level].obs_ph)
            meta_obs1 = self.crop_to_goal(self.policy[level].obs1_ph)
            worker_obs0 = self.crop_to_goal(self.policy[level + 1].obs_ph)
            worker_obs1 = self.crop_to_goal(self.policy[level + 1].obs1_ph)

            if self.relative_goals:
                # Relative goal formulation as per HIRO.
                goal = meta_obs0 + self.policy[level].actor_tf - meta_obs1
            else:
                # Goal is the direct output from the meta policy in this case.
                goal = self.policy[level].actor_tf

            # Concatenate the output from the manager with the worker policy.
            obs_shape = self.policy[level + 1].ob_space.shape[0]
            obs = tf.concat(
                [self.policy[level + 1].obs_ph[:, :obs_shape], goal], axis=-1)

            # Create the worker policy with inputs directly from the manager.
            with tf.compat.v1.variable_scope("level_{}/model".format(level +
                                                                     1)):
                worker_with_meta_obs = self.policy[level + 1].make_critic(
                    obs,
                    self.policy[level + 1].action_ph,
                    reuse=True,
                    scope="qf_0")

            # Create a tensorflow operation that mimics the reward function
            # that is used to provide feedback to the worker.
            if self.intrinsic_reward_type.startswith("scaled"):
                # Scale the observations/goals by the action space of the
                # upper-level policy if requested.
                ac_space = self.policy[level].ac_space
                scale = 0.5 * (ac_space.high - ac_space.low)
                worker_obs0 /= scale
                goal /= scale
                worker_obs1 /= scale

            if self.relative_goals:
                # Implement relative goals if requested.
                goal += worker_obs0

            if self.intrinsic_reward_type.endswith("exp_negative_distance"):
                reward_fn = tf.reduce_mean(
                    tf.exp(-tf.reduce_sum(
                        tf.square(worker_obs0 + goal - worker_obs1), axis=1)))
            elif self.intrinsic_reward_type.endswith("negative_distance"):
                reward_fn = -tf.compat.v1.losses.mean_squared_error(
                    worker_obs0 + goal, worker_obs1)
            else:
                raise ValueError("Unknown intrinsic reward type: {}".format(
                    self.intrinsic_reward_type))

            # Scale by the worker reward scale.
            reward_fn *= self.intrinsic_reward_scale

            # Compute the worker loss with respect to the meta policy actions.
            cg_loss = -(tf.reduce_mean(worker_with_meta_obs) + reward_fn)
            self.cg_loss.append(cg_loss)

            # Create the optimizer object.
            optimizer = tf.compat.v1.train.AdamOptimizer(
                self.policy[level].actor_lr)
            self.cg_optimizer.append(
                optimizer.minimize(
                    self.policy[level].actor_loss +
                    tf.exp(self.cg_weights[level]) * cg_loss,
                    var_list=get_trainable_vars(
                        "level_{}/model/pi/".format(level)),
                ))

            if self.cg_delta is not None:
                cg_weights_loss = \
                    tf.reduce_mean(
                        tf.exp(self.cg_weights[level]) * tf.stop_gradient(
                            worker_with_meta_obs + reward_fn - self.cg_delta
                        )
                    )
                optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)
                self.cg_weights_optimizer = optimizer.minimize(
                    cg_weights_loss, var_list=[self.cg_weights[level]])

                # Add to tensorboard.
                tf.compat.v1.summary.scalar(
                    "level_{}/cg_weights_log".format(level),
                    self.cg_weights[level])
                tf.compat.v1.summary.scalar(
                    "level_{}/cg_weights".format(level),
                    tf.exp(self.cg_weights[level]))
                tf.compat.v1.summary.scalar(
                    "level_{}/cg_weights_loss".format(level), cg_weights_loss)
                tf.compat.v1.summary.scalar(
                    "level_{}/worker_with_meta_obs".format(level),
                    tf.reduce_mean(worker_with_meta_obs))
Example #23
    def test_setup_model_goal_conditioned(self):
        # Create the algorithm object.
        policy_params = self.init_parameters.copy()
        policy_params['policy'] = GoalConditionedPolicy
        policy_params['_init_setup_model'] = True
        alg = OffPolicyRLAlgorithm(**policy_params)

        # check the policy_kwargs term
        policy_kwargs = GOAL_CONDITIONED_PARAMS.copy()
        policy_kwargs.update(TD3_PARAMS)
        policy_kwargs['verbose'] = self.init_parameters['verbose']
        policy_kwargs['env_name'] = self.init_parameters['env']
        policy_kwargs['num_envs'] = self.init_parameters['num_envs']
        self.assertDictEqual(alg.policy_kwargs, policy_kwargs)

        with alg.graph.as_default():
            expected_vars = sorted([var.name for var in get_trainable_vars()])

        # Check that all trainable variables have been created in the
        # TensorFlow graph.
        self.assertListEqual(
            expected_vars,
            ['level_0/model/pi/fc0/bias:0',
             'level_0/model/pi/fc0/kernel:0',
             'level_0/model/pi/fc1/bias:0',
             'level_0/model/pi/fc1/kernel:0',
             'level_0/model/pi/output/bias:0',
             'level_0/model/pi/output/kernel:0',
             'level_0/model/qf_0/fc0/bias:0',
             'level_0/model/qf_0/fc0/kernel:0',
             'level_0/model/qf_0/fc1/bias:0',
             'level_0/model/qf_0/fc1/kernel:0',
             'level_0/model/qf_0/qf_output/bias:0',
             'level_0/model/qf_0/qf_output/kernel:0',
             'level_0/model/qf_1/fc0/bias:0',
             'level_0/model/qf_1/fc0/kernel:0',
             'level_0/model/qf_1/fc1/bias:0',
             'level_0/model/qf_1/fc1/kernel:0',
             'level_0/model/qf_1/qf_output/bias:0',
             'level_0/model/qf_1/qf_output/kernel:0',
             'level_0/target/pi/fc0/bias:0',
             'level_0/target/pi/fc0/kernel:0',
             'level_0/target/pi/fc1/bias:0',
             'level_0/target/pi/fc1/kernel:0',
             'level_0/target/pi/output/bias:0',
             'level_0/target/pi/output/kernel:0',
             'level_0/target/qf_0/fc0/bias:0',
             'level_0/target/qf_0/fc0/kernel:0',
             'level_0/target/qf_0/fc1/bias:0',
             'level_0/target/qf_0/fc1/kernel:0',
             'level_0/target/qf_0/qf_output/bias:0',
             'level_0/target/qf_0/qf_output/kernel:0',
             'level_0/target/qf_1/fc0/bias:0',
             'level_0/target/qf_1/fc0/kernel:0',
             'level_0/target/qf_1/fc1/bias:0',
             'level_0/target/qf_1/fc1/kernel:0',
             'level_0/target/qf_1/qf_output/bias:0',
             'level_0/target/qf_1/qf_output/kernel:0',
             'level_1/model/pi/fc0/bias:0',
             'level_1/model/pi/fc0/kernel:0',
             'level_1/model/pi/fc1/bias:0',
             'level_1/model/pi/fc1/kernel:0',
             'level_1/model/pi/output/bias:0',
             'level_1/model/pi/output/kernel:0',
             'level_1/model/qf_0/fc0/bias:0',
             'level_1/model/qf_0/fc0/kernel:0',
             'level_1/model/qf_0/fc1/bias:0',
             'level_1/model/qf_0/fc1/kernel:0',
             'level_1/model/qf_0/qf_output/bias:0',
             'level_1/model/qf_0/qf_output/kernel:0',
             'level_1/model/qf_1/fc0/bias:0',
             'level_1/model/qf_1/fc0/kernel:0',
             'level_1/model/qf_1/fc1/bias:0',
             'level_1/model/qf_1/fc1/kernel:0',
             'level_1/model/qf_1/qf_output/bias:0',
             'level_1/model/qf_1/qf_output/kernel:0',
             'level_1/target/pi/fc0/bias:0',
             'level_1/target/pi/fc0/kernel:0',
             'level_1/target/pi/fc1/bias:0',
             'level_1/target/pi/fc1/kernel:0',
             'level_1/target/pi/output/bias:0',
             'level_1/target/pi/output/kernel:0',
             'level_1/target/qf_0/fc0/bias:0',
             'level_1/target/qf_0/fc0/kernel:0',
             'level_1/target/qf_0/fc1/bias:0',
             'level_1/target/qf_0/fc1/kernel:0',
             'level_1/target/qf_0/qf_output/bias:0',
             'level_1/target/qf_0/qf_output/kernel:0',
             'level_1/target/qf_1/fc0/bias:0',
             'level_1/target/qf_1/fc0/kernel:0',
             'level_1/target/qf_1/fc1/bias:0',
             'level_1/target/qf_1/fc1/kernel:0',
             'level_1/target/qf_1/qf_output/bias:0',
             'level_1/target/qf_1/qf_output/kernel:0']
        )
Example #24
    def _setup_critic_update(self, critic, all_obs1_ph, actor_target, rew_ph,
                             done1, scope):
        """Create the critic loss and optimization process.

        Parameters
        ----------
        critic : tf.Variable
            the output from the centralized critic of the agent
        all_obs1_ph : tf.compat.v1.placeholder
            the placeholder for the full-state observation
        actor_target : tf.Variable
            the output from the combined target actors of all agents
        rew_ph : tf.compat.v1.placeholder
            placeholder for the rewards of the agent
        done1 : tf.compat.v1.placeholder
            placeholder for the done mask of the agent
        scope : str
            an outer scope term

        Returns
        -------
        tf.Operation
            the operation that returns the loss of the critic
        tf.Operation
            the operation that updates the trainable parameters of the critic
        """
        if self.verbose >= 2:
            print('setting up critic optimizer')

        # Create the centralized target critic policy.
        with tf.compat.v1.variable_scope("target", reuse=False):
            critic_target = [
                self.make_critic(all_obs1_ph,
                                 actor_target,
                                 scope="centralized_qf_{}".format(i))
                for i in range(2)
            ]

        # compute the target critic term
        with tf.compat.v1.variable_scope("loss", reuse=False):
            q_obs1 = tf.minimum(critic_target[0], critic_target[1])
            target_q = tf.stop_gradient(rew_ph +
                                        (1. - done1) * self.gamma * q_obs1)

            tf.compat.v1.summary.scalar('critic_target',
                                        tf.reduce_mean(target_q))

        # choose the loss function
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        critic_loss = [loss_fn(q, target_q) for q in critic]

        critic_optimizer = []

        for i, loss in enumerate(critic_loss):
            scope_name = 'model/centralized_qf_{}'.format(i)
            if scope is not None:
                scope_name = scope + '/' + scope_name

            if self.verbose >= 2:
                critic_shapes = [
                    var.get_shape().as_list()
                    for var in get_trainable_vars(scope_name)
                ]
                critic_nb_params = sum([
                    reduce(lambda x, y: x * y, shape)
                    for shape in critic_shapes
                ])
                print('  critic shapes: {}'.format(critic_shapes))
                print('  critic params: {}'.format(critic_nb_params))

            # create an optimizer object
            optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)

            # create the optimization operation
            critic_optimizer.append(
                optimizer.minimize(loss=loss,
                                   var_list=get_trainable_vars(scope_name)))

        return critic_loss, critic_optimizer
Example #25
    def _setup_optimizers(self, scope):
        """Create the actor and critic optimizers."""
        scope_name = 'model/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up actor optimizer')
            print_params_shape("{}pi/".format(scope_name), "actor")
            print('setting up critic optimizer')
            print_params_shape("{}vf/".format(scope_name), "critic")

        neglogpac = self._neglogp(self.action_ph)
        self.entropy = tf.reduce_sum(tf.reshape(self.pi_logstd, [-1]) +
                                     .5 * np.log(2.0 * np.pi * np.e),
                                     axis=-1)

        # Value function clipping: not present in the original PPO
        if self.cliprange_vf is None:
            # Default behavior (legacy from OpenAI baselines):
            # use the same clipping as for the policy
            self.cliprange_vf = self.cliprange

        if self.cliprange_vf < 0:
            # Original PPO implementation: no value function clipping.
            vpred_clipped = self.value_flat
        else:
            # Clip the difference between the old and new value
            # NOTE: this depends on the reward scaling
            vpred_clipped = self.old_vpred_ph + tf.clip_by_value(
                self.value_flat - self.old_vpred_ph, -self.cliprange_vf,
                self.cliprange_vf)

        vf_losses1 = tf.square(self.value_flat - self.rew_ph)
        vf_losses2 = tf.square(vpred_clipped - self.rew_ph)
        self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
        pg_losses = -self.advs_ph * ratio
        pg_losses2 = -self.advs_ph * tf.clip_by_value(
            ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
        self.pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        self.approxkl = .5 * tf.reduce_mean(
            tf.square(neglogpac - self.old_neglog_pac_ph))
        self.clipfrac = tf.reduce_mean(
            tf.cast(tf.greater(tf.abs(ratio - 1.0), self.cliprange),
                    tf.float32))
        self.loss = self.pg_loss - self.entropy * self.ent_coef \
            + self.vf_loss * self.vf_coef

        # Compute the gradients of the loss.
        var_list = get_trainable_vars(scope_name)
        grads = tf.gradients(self.loss, var_list)

        # Perform gradient clipping if requested.
        if self.max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads,
                                                       self.max_grad_norm)
        grads = list(zip(grads, var_list))

        # Create the operation that applies the gradients.
        self.optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=self.learning_rate,
            epsilon=1e-5).apply_gradients(grads)
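The `self.entropy` term above is the closed-form differential entropy of a diagonal Gaussian policy, i.e. the sum over action dimensions of log(sigma_i) + 0.5 * log(2 * pi * e). A small standalone check of that identity against a Monte Carlo estimate (independent of the policy code; the values below are illustrative):

    import numpy as np

    log_std = np.array([0.0, -0.5, 1.0])
    analytic_entropy = np.sum(log_std + 0.5 * np.log(2.0 * np.pi * np.e))

    # Monte Carlo estimate of the entropy of N(0, diag(exp(log_std))^2).
    samples = np.random.randn(200000, 3) * np.exp(log_std)
    log_probs = -0.5 * np.sum(
        (samples / np.exp(log_std)) ** 2 + 2.0 * log_std + np.log(2.0 * np.pi),
        axis=1)
    mc_entropy = -log_probs.mean()

    print(analytic_entropy, mc_entropy)  # the two values should roughly agree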
Example #26
    def test_init_conv(self):
        """Check the functionality of the __init__() method with conv policies.

        This method tests that the proper structure graph was generated.
        """
        policy_params = self.policy_params.copy()
        policy_params["model_params"]["model_type"] = "conv"
        _ = SACFeedForwardPolicy(**policy_params)

        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]), [
                'model/log_alpha:0', 'model/pi/conv0/bias:0',
                'model/pi/conv0/kernel:0', 'model/pi/conv1/bias:0',
                'model/pi/conv1/kernel:0', 'model/pi/conv2/bias:0',
                'model/pi/conv2/kernel:0', 'model/pi/fc0/bias:0',
                'model/pi/fc0/kernel:0', 'model/pi/fc1/bias:0',
                'model/pi/fc1/kernel:0', 'model/pi/log_std/bias:0',
                'model/pi/log_std/kernel:0', 'model/pi/mean/bias:0',
                'model/pi/mean/kernel:0', 'model/value_fns/qf1/conv0/bias:0',
                'model/value_fns/qf1/conv0/kernel:0',
                'model/value_fns/qf1/conv1/bias:0',
                'model/value_fns/qf1/conv1/kernel:0',
                'model/value_fns/qf1/conv2/bias:0',
                'model/value_fns/qf1/conv2/kernel:0',
                'model/value_fns/qf1/fc0/bias:0',
                'model/value_fns/qf1/fc0/kernel:0',
                'model/value_fns/qf1/fc1/bias:0',
                'model/value_fns/qf1/fc1/kernel:0',
                'model/value_fns/qf1/qf_output/bias:0',
                'model/value_fns/qf1/qf_output/kernel:0',
                'model/value_fns/qf2/conv0/bias:0',
                'model/value_fns/qf2/conv0/kernel:0',
                'model/value_fns/qf2/conv1/bias:0',
                'model/value_fns/qf2/conv1/kernel:0',
                'model/value_fns/qf2/conv2/bias:0',
                'model/value_fns/qf2/conv2/kernel:0',
                'model/value_fns/qf2/fc0/bias:0',
                'model/value_fns/qf2/fc0/kernel:0',
                'model/value_fns/qf2/fc1/bias:0',
                'model/value_fns/qf2/fc1/kernel:0',
                'model/value_fns/qf2/qf_output/bias:0',
                'model/value_fns/qf2/qf_output/kernel:0',
                'model/value_fns/vf/conv0/bias:0',
                'model/value_fns/vf/conv0/kernel:0',
                'model/value_fns/vf/conv1/bias:0',
                'model/value_fns/vf/conv1/kernel:0',
                'model/value_fns/vf/conv2/bias:0',
                'model/value_fns/vf/conv2/kernel:0',
                'model/value_fns/vf/fc0/bias:0',
                'model/value_fns/vf/fc0/kernel:0',
                'model/value_fns/vf/fc1/bias:0',
                'model/value_fns/vf/fc1/kernel:0',
                'model/value_fns/vf/vf_output/bias:0',
                'model/value_fns/vf/vf_output/kernel:0',
                'target/value_fns/vf/conv0/bias:0',
                'target/value_fns/vf/conv0/kernel:0',
                'target/value_fns/vf/conv1/bias:0',
                'target/value_fns/vf/conv1/kernel:0',
                'target/value_fns/vf/conv2/bias:0',
                'target/value_fns/vf/conv2/kernel:0',
                'target/value_fns/vf/fc0/bias:0',
                'target/value_fns/vf/fc0/kernel:0',
                'target/value_fns/vf/fc1/bias:0',
                'target/value_fns/vf/fc1/kernel:0',
                'target/value_fns/vf/vf_output/bias:0',
                'target/value_fns/vf/vf_output/kernel:0'
            ])
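
These tests rely on get_trainable_vars() returning every trainable variable in the default graph, optionally filtered by a scope prefix. A minimal sketch of such a helper, assuming it is a thin wrapper around the TF1 variable collections (the repository's actual implementation may differ), looks like:

    import tensorflow as tf

    def get_trainable_vars(scope=None):
        """Return the trainable variables of the current graph.

        When ``scope`` is given, only variables whose names start with that
        prefix (e.g. 'model/pi/') are returned.
        """
        return tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=scope)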
Example No. 27
    def test_init(self):
        """Check the functionality of the __init__() method.

        This method is tested for the following features:

        1. The proper structure graph was generated.
        2. All input placeholders are correct.
        3. self.log_alpha is initialized to zero.
        4. self.target_entropy is initialized as specified, including the
           special case in which it is passed as None.
        """
        policy = SACFeedForwardPolicy(**self.policy_params)

        # test case 1
        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]), [
                'model/log_alpha:0', 'model/pi/fc0/bias:0',
                'model/pi/fc0/kernel:0', 'model/pi/fc1/bias:0',
                'model/pi/fc1/kernel:0', 'model/pi/log_std/bias:0',
                'model/pi/log_std/kernel:0', 'model/pi/mean/bias:0',
                'model/pi/mean/kernel:0', 'model/value_fns/qf1/fc0/bias:0',
                'model/value_fns/qf1/fc0/kernel:0',
                'model/value_fns/qf1/fc1/bias:0',
                'model/value_fns/qf1/fc1/kernel:0',
                'model/value_fns/qf1/qf_output/bias:0',
                'model/value_fns/qf1/qf_output/kernel:0',
                'model/value_fns/qf2/fc0/bias:0',
                'model/value_fns/qf2/fc0/kernel:0',
                'model/value_fns/qf2/fc1/bias:0',
                'model/value_fns/qf2/fc1/kernel:0',
                'model/value_fns/qf2/qf_output/bias:0',
                'model/value_fns/qf2/qf_output/kernel:0',
                'model/value_fns/vf/fc0/bias:0',
                'model/value_fns/vf/fc0/kernel:0',
                'model/value_fns/vf/fc1/bias:0',
                'model/value_fns/vf/fc1/kernel:0',
                'model/value_fns/vf/vf_output/bias:0',
                'model/value_fns/vf/vf_output/kernel:0',
                'target/value_fns/vf/fc0/bias:0',
                'target/value_fns/vf/fc0/kernel:0',
                'target/value_fns/vf/fc1/bias:0',
                'target/value_fns/vf/fc1/kernel:0',
                'target/value_fns/vf/vf_output/bias:0',
                'target/value_fns/vf/vf_output/kernel:0'
            ])

        # test case 2
        self.assertEqual(tuple(v.__int__() for v in policy.terminals1.shape),
                         (None, 1))
        self.assertEqual(tuple(v.__int__() for v in policy.rew_ph.shape),
                         (None, 1))
        self.assertEqual(tuple(v.__int__() for v in policy.action_ph.shape),
                         (None, self.policy_params['ac_space'].shape[0]))
        self.assertEqual(tuple(v.__int__() for v in policy.obs_ph.shape),
                         (None, self.policy_params['ob_space'].shape[0] +
                          self.policy_params['co_space'].shape[0]))
        self.assertEqual(tuple(v.__int__() for v in policy.obs1_ph.shape),
                         (None, self.policy_params['ob_space'].shape[0] +
                          self.policy_params['co_space'].shape[0]))

        # Initialize the variables of the policy.
        policy.sess.run(tf.compat.v1.global_variables_initializer())

        # test case 3
        self.assertEqual(policy.sess.run(policy.log_alpha), 0.0)

        # test case 4a
        self.assertEqual(policy.target_entropy,
                         -self.policy_params['ac_space'].shape[0])

        # Clear the graph.
        tf.compat.v1.reset_default_graph()

        # test case 4b
        self.policy_params['target_entropy'] = 5
        policy = SACFeedForwardPolicy(**self.policy_params)
        self.assertEqual(policy.target_entropy,
                         self.policy_params['target_entropy'])
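
Test case 4 exercises the usual SAC convention for the entropy target: when none is supplied, it defaults to the negative of the action dimension. A minimal sketch of that default (the real constructor resolves this alongside its other arguments; this helper is illustrative only):

    import numpy as np

    def resolve_target_entropy(ac_space, target_entropy=None):
        # Standard SAC heuristic: target entropy of -dim(action space).
        # For a 1-D Box space this equals -ac_space.shape[0], matching the
        # assertion in test case 4a above.
        if target_entropy is None:
            return -np.prod(ac_space.shape)
        return target_entropy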
Example No. 28
    def test_init(self):
        """Validate that the graph and variables are initialized properly."""
        policy = SACGoalConditionedPolicy(**self.policy_params)

        # Check that the abstract class has all the required attributes.
        self.assertEqual(policy.meta_period, self.policy_params['meta_period'])
        self.assertEqual(policy.relative_goals,
                         self.policy_params['relative_goals'])
        self.assertEqual(policy.off_policy_corrections,
                         self.policy_params['off_policy_corrections'])
        self.assertEqual(policy.use_fingerprints,
                         self.policy_params['use_fingerprints'])
        self.assertEqual(policy.centralized_value_functions,
                         self.policy_params['centralized_value_functions'])
        self.assertEqual(policy.connected_gradients,
                         self.policy_params['connected_gradients'])
        self.assertEqual(policy.cg_weights, self.policy_params['cg_weights'])

        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]), [
                'Manager/model/log_alpha:0',
                'Manager/model/pi/fc0/bias:0',
                'Manager/model/pi/fc0/kernel:0',
                'Manager/model/pi/fc1/bias:0',
                'Manager/model/pi/fc1/kernel:0',
                'Manager/model/pi/log_std/bias:0',
                'Manager/model/pi/log_std/kernel:0',
                'Manager/model/pi/mean/bias:0',
                'Manager/model/pi/mean/kernel:0',
                'Manager/model/value_fns/qf1/fc0/bias:0',
                'Manager/model/value_fns/qf1/fc0/kernel:0',
                'Manager/model/value_fns/qf1/fc1/bias:0',
                'Manager/model/value_fns/qf1/fc1/kernel:0',
                'Manager/model/value_fns/qf1/qf_output/bias:0',
                'Manager/model/value_fns/qf1/qf_output/kernel:0',
                'Manager/model/value_fns/qf2/fc0/bias:0',
                'Manager/model/value_fns/qf2/fc0/kernel:0',
                'Manager/model/value_fns/qf2/fc1/bias:0',
                'Manager/model/value_fns/qf2/fc1/kernel:0',
                'Manager/model/value_fns/qf2/qf_output/bias:0',
                'Manager/model/value_fns/qf2/qf_output/kernel:0',
                'Manager/model/value_fns/vf/fc0/bias:0',
                'Manager/model/value_fns/vf/fc0/kernel:0',
                'Manager/model/value_fns/vf/fc1/bias:0',
                'Manager/model/value_fns/vf/fc1/kernel:0',
                'Manager/model/value_fns/vf/vf_output/bias:0',
                'Manager/model/value_fns/vf/vf_output/kernel:0',
                'Manager/target/value_fns/vf/fc0/bias:0',
                'Manager/target/value_fns/vf/fc0/kernel:0',
                'Manager/target/value_fns/vf/fc1/bias:0',
                'Manager/target/value_fns/vf/fc1/kernel:0',
                'Manager/target/value_fns/vf/vf_output/bias:0',
                'Manager/target/value_fns/vf/vf_output/kernel:0',
                'Worker/model/log_alpha:0',
                'Worker/model/pi/fc0/bias:0',
                'Worker/model/pi/fc0/kernel:0',
                'Worker/model/pi/fc1/bias:0',
                'Worker/model/pi/fc1/kernel:0',
                'Worker/model/pi/log_std/bias:0',
                'Worker/model/pi/log_std/kernel:0',
                'Worker/model/pi/mean/bias:0',
                'Worker/model/pi/mean/kernel:0',
                'Worker/model/value_fns/qf1/fc0/bias:0',
                'Worker/model/value_fns/qf1/fc0/kernel:0',
                'Worker/model/value_fns/qf1/fc1/bias:0',
                'Worker/model/value_fns/qf1/fc1/kernel:0',
                'Worker/model/value_fns/qf1/qf_output/bias:0',
                'Worker/model/value_fns/qf1/qf_output/kernel:0',
                'Worker/model/value_fns/qf2/fc0/bias:0',
                'Worker/model/value_fns/qf2/fc0/kernel:0',
                'Worker/model/value_fns/qf2/fc1/bias:0',
                'Worker/model/value_fns/qf2/fc1/kernel:0',
                'Worker/model/value_fns/qf2/qf_output/bias:0',
                'Worker/model/value_fns/qf2/qf_output/kernel:0',
                'Worker/model/value_fns/vf/fc0/bias:0',
                'Worker/model/value_fns/vf/fc0/kernel:0',
                'Worker/model/value_fns/vf/fc1/bias:0',
                'Worker/model/value_fns/vf/fc1/kernel:0',
                'Worker/model/value_fns/vf/vf_output/bias:0',
                'Worker/model/value_fns/vf/vf_output/kernel:0',
                'Worker/target/value_fns/vf/fc0/bias:0',
                'Worker/target/value_fns/vf/fc0/kernel:0',
                'Worker/target/value_fns/vf/fc1/bias:0',
                'Worker/target/value_fns/vf/fc1/kernel:0',
                'Worker/target/value_fns/vf/vf_output/bias:0',
                'Worker/target/value_fns/vf/vf_output/kernel:0',
            ])
Example No. 29
    def test_init(self):
        """Validate that the graph and variables are initialized properly."""
        policy = TD3GoalConditionedPolicy(**self.policy_params)

        # Check that the abstract class has all the required attributes.
        self.assertEqual(policy.meta_period, self.policy_params['meta_period'])
        self.assertEqual(policy.relative_goals,
                         self.policy_params['relative_goals'])
        self.assertEqual(policy.off_policy_corrections,
                         self.policy_params['off_policy_corrections'])
        self.assertEqual(policy.use_fingerprints,
                         self.policy_params['use_fingerprints'])
        self.assertEqual(policy.centralized_value_functions,
                         self.policy_params['centralized_value_functions'])
        self.assertEqual(policy.connected_gradients,
                         self.policy_params['connected_gradients'])
        self.assertEqual(policy.cg_weights, self.policy_params['cg_weights'])

        # Check that all trainable variables have been created in the
        # TensorFlow graph.
        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]), [
                'Manager/model/pi/fc0/bias:0', 'Manager/model/pi/fc0/kernel:0',
                'Manager/model/pi/fc1/bias:0', 'Manager/model/pi/fc1/kernel:0',
                'Manager/model/pi/output/bias:0',
                'Manager/model/pi/output/kernel:0',
                'Manager/model/qf_0/fc0/bias:0',
                'Manager/model/qf_0/fc0/kernel:0',
                'Manager/model/qf_0/fc1/bias:0',
                'Manager/model/qf_0/fc1/kernel:0',
                'Manager/model/qf_0/qf_output/bias:0',
                'Manager/model/qf_0/qf_output/kernel:0',
                'Manager/model/qf_1/fc0/bias:0',
                'Manager/model/qf_1/fc0/kernel:0',
                'Manager/model/qf_1/fc1/bias:0',
                'Manager/model/qf_1/fc1/kernel:0',
                'Manager/model/qf_1/qf_output/bias:0',
                'Manager/model/qf_1/qf_output/kernel:0',
                'Manager/target/pi/fc0/bias:0',
                'Manager/target/pi/fc0/kernel:0',
                'Manager/target/pi/fc1/bias:0',
                'Manager/target/pi/fc1/kernel:0',
                'Manager/target/pi/output/bias:0',
                'Manager/target/pi/output/kernel:0',
                'Manager/target/qf_0/fc0/bias:0',
                'Manager/target/qf_0/fc0/kernel:0',
                'Manager/target/qf_0/fc1/bias:0',
                'Manager/target/qf_0/fc1/kernel:0',
                'Manager/target/qf_0/qf_output/bias:0',
                'Manager/target/qf_0/qf_output/kernel:0',
                'Manager/target/qf_1/fc0/bias:0',
                'Manager/target/qf_1/fc0/kernel:0',
                'Manager/target/qf_1/fc1/bias:0',
                'Manager/target/qf_1/fc1/kernel:0',
                'Manager/target/qf_1/qf_output/bias:0',
                'Manager/target/qf_1/qf_output/kernel:0',
                'Worker/model/pi/fc0/bias:0', 'Worker/model/pi/fc0/kernel:0',
                'Worker/model/pi/fc1/bias:0', 'Worker/model/pi/fc1/kernel:0',
                'Worker/model/pi/output/bias:0',
                'Worker/model/pi/output/kernel:0',
                'Worker/model/qf_0/fc0/bias:0',
                'Worker/model/qf_0/fc0/kernel:0',
                'Worker/model/qf_0/fc1/bias:0',
                'Worker/model/qf_0/fc1/kernel:0',
                'Worker/model/qf_0/qf_output/bias:0',
                'Worker/model/qf_0/qf_output/kernel:0',
                'Worker/model/qf_1/fc0/bias:0',
                'Worker/model/qf_1/fc0/kernel:0',
                'Worker/model/qf_1/fc1/bias:0',
                'Worker/model/qf_1/fc1/kernel:0',
                'Worker/model/qf_1/qf_output/bias:0',
                'Worker/model/qf_1/qf_output/kernel:0',
                'Worker/target/pi/fc0/bias:0', 'Worker/target/pi/fc0/kernel:0',
                'Worker/target/pi/fc1/bias:0', 'Worker/target/pi/fc1/kernel:0',
                'Worker/target/pi/output/bias:0',
                'Worker/target/pi/output/kernel:0',
                'Worker/target/qf_0/fc0/bias:0',
                'Worker/target/qf_0/fc0/kernel:0',
                'Worker/target/qf_0/fc1/bias:0',
                'Worker/target/qf_0/fc1/kernel:0',
                'Worker/target/qf_0/qf_output/bias:0',
                'Worker/target/qf_0/qf_output/kernel:0',
                'Worker/target/qf_1/fc0/bias:0',
                'Worker/target/qf_1/fc0/kernel:0',
                'Worker/target/qf_1/fc1/bias:0',
                'Worker/target/qf_1/fc1/kernel:0',
                'Worker/target/qf_1/qf_output/bias:0',
                'Worker/target/qf_1/qf_output/kernel:0'
            ])
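
Both goal-conditioned tests expect every Manager and Worker variable to live under its own top-level scope, with the full actor/critic/target structure duplicated inside each. A minimal sketch of how such a two-level policy might allocate its sub-policies (make_policy is a hypothetical factory, not the repository's API):

    import tensorflow as tf

    def build_hierarchy(make_policy):
        """Create Manager and Worker sub-policies in separate variable scopes."""
        with tf.compat.v1.variable_scope("Manager"):
            manager = make_policy()   # variables named 'Manager/...'
        with tf.compat.v1.variable_scope("Worker"):
            worker = make_policy()    # variables named 'Worker/...'
        return manager, worker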
Example No. 30
    def _setup_optimizers(self, scope):
        """Create the actor and critic optimizers."""
        scope_name = 'model/'
        old_scope_name = "oldpi/"
        if scope is not None:
            scope_name = scope + '/' + scope_name
            old_scope_name = scope + '/' + old_scope_name

        if self.verbose >= 2:
            print('setting up actor optimizer')
            print_params_shape("{}pi/".format(scope_name), "actor")
            print('setting up critic optimizer')
            print_params_shape("{}vf/".format(scope_name), "critic")

        # =================================================================== #
        # Create the policy loss and optimizers.                              #
        # =================================================================== #

        with tf.compat.v1.variable_scope("loss", reuse=False):
            # Compute the KL divergence.
            kloldnew = tf.reduce_sum(
                self.pi_logstd - self.old_pi_logstd +
                (tf.square(self.old_pi_std) +
                 tf.square(self.old_pi_mean - self.pi_mean)) /
                (2.0 * tf.square(self.pi_std)) - 0.5,
                axis=-1)
            meankl = tf.reduce_mean(kloldnew)

            # Compute the entropy bonus.
            entropy = tf.reduce_sum(self.pi_logstd +
                                    .5 * np.log(2.0 * np.pi * np.e),
                                    axis=-1)
            meanent = tf.reduce_mean(entropy)
            entbonus = self.ent_coef * meanent

            # advantage * pnew / pold
            ratio = tf.exp(
                self.logp(self.action_ph, old=False) -
                self.logp(self.action_ph, old=True))
            surrgain = tf.reduce_mean(ratio * self.advs_ph)

            optimgain = surrgain + entbonus
            self.losses = [optimgain, meankl, entbonus, surrgain, meanent]

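            # Split the trainable variables: the policy parameters are updated
            # through the natural-gradient machinery below, while the value
            # function parameters are handled by a separate Adam optimizer.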
            all_var_list = get_trainable_vars(scope_name)
            var_list = [
                v for v in all_var_list
                if "/vf" not in v.name and "/q/" not in v.name
            ]
            vf_var_list = [
                v for v in all_var_list
                if "/pi" not in v.name and "/logstd" not in v.name
            ]

            self.get_flat = GetFlat(var_list, sess=self.sess)
            self.set_from_flat = SetFromFlat(var_list, sess=self.sess)

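            # Fisher-vector product setup: rather than forming the Hessian of
            # the mean KL explicitly, compute grad(grad(KL) . tangent), which
            # equals the Fisher matrix applied to the tangent vector.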
            klgrads = tf.gradients(meankl, var_list)
            shapes = [var.get_shape().as_list() for var in var_list]
            start = 0
            tangents = []
            for shape in shapes:
                var_size = int(np.prod(shape))
                tangents.append(
                    tf.reshape(self.flat_tangent[start:start + var_size],
                               shape))
                start += var_size
            gvp = tf.add_n([
                tf.reduce_sum(grad * tangent)
                for (grad, tangent) in zip(klgrads, tangents)
            ])
            # Fisher vector products
            self.fvp = flatgrad(gvp, var_list)

        # =================================================================== #
        # Update the old model to match the new one.                          #
        # =================================================================== #

        self.assign_old_eq_new = tf.group(*[
            tf.compat.v1.assign(oldv, newv) for (oldv, newv) in zip(
                get_globals_vars(old_scope_name), get_globals_vars(scope_name))
        ])

        # =================================================================== #
        # Create the value function optimizer.                                #
        # =================================================================== #

        vferr = tf.reduce_mean(tf.square(self.value_flat - self.ret_ph))
        optimizer = tf.compat.v1.train.AdamOptimizer(self.vf_stepsize)
        self.vf_optimizer = optimizer.minimize(
            vferr,
            var_list=vf_var_list,
        )

        # Initialize the model parameters and optimizers.
        with self.sess.as_default():
            self.sess.run(tf.compat.v1.global_variables_initializer())

        th_init = self.get_flat()
        self.set_from_flat(th_init)

        self.grad = flatgrad(optimgain, var_list)
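
Downstream, self.fvp is typically evaluated inside a conjugate-gradient routine that solves F x = g for the natural-gradient step direction. A generic NumPy sketch of that routine, written against any fvp(vector) callable (a standard TRPO-style helper, not code from the repository):

    import numpy as np

    def conjugate_gradient(fvp, g, iters=10, residual_tol=1e-10):
        """Approximately solve F x = g using only Fisher-vector products."""
        x = np.zeros_like(g)
        r = g.copy()            # residual
        p = g.copy()            # search direction
        r_dot_r = r.dot(r)
        for _ in range(iters):
            z = fvp(p)
            alpha = r_dot_r / (p.dot(z) + 1e-8)
            x += alpha * p
            r -= alpha * z
            new_r_dot_r = r.dot(r)
            if new_r_dot_r < residual_tol:
                break
            p = r + (new_r_dot_r / r_dot_r) * p
            r_dot_r = new_r_dot_r
        return x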