Example #1
import unittest

import numpy as np

from hbaselines.fcnet.replay_buffer import ReplayBuffer


class TestReplayBuffer(unittest.TestCase):
    """Tests for the ReplayBuffer object."""

    def setUp(self):
        self.replay_buffer = ReplayBuffer(
            buffer_size=2, batch_size=1, obs_dim=1, ac_dim=1)

    def tearDown(self):
        del self.replay_buffer

    def test_buffer_size(self):
        """Validate the buffer_size output from the replay buffer."""
        self.assertEqual(self.replay_buffer.buffer_size, 2)

    def test_add_sample(self):
        """Test the `add` and `sample` methods the replay buffer."""
        # Add an element.
        self.replay_buffer.add(
            obs_t=np.array([0]),
            action=np.array([1]),
            reward=2,
            obs_tp1=np.array([3]),
            done=False
        )

        # Check is_full in the False case.
        self.assertEqual(self.replay_buffer.is_full(), False)

        # Add an element.
        self.replay_buffer.add(
            obs_t=np.array([0]),
            action=np.array([1]),
            reward=2,
            obs_tp1=np.array([3]),
            done=False
        )

        # Check is_full in the True case.
        self.assertEqual(self.replay_buffer.is_full(), True)

        # Check can_sample in the True case.
        self.assertEqual(self.replay_buffer.can_sample(), True)

        # Test the `sample` method.
        obs_t, actions_t, rewards, obs_tp1, done = self.replay_buffer.sample()
        np.testing.assert_array_almost_equal(obs_t, [[0]])
        np.testing.assert_array_almost_equal(actions_t, [[1]])
        np.testing.assert_array_almost_equal(rewards, [2])
        np.testing.assert_array_almost_equal(obs_tp1, [[3]])
        np.testing.assert_array_almost_equal(done, [False])
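
The test case above can be run directly with the standard unittest runner. A minimal sketch (the surrounding test module layout is assumed, not shown in the example):

if __name__ == '__main__':
    # Run only the replay buffer tests, with verbose output.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestReplayBuffer)
    unittest.TextTestRunner(verbosity=2).run(suite)
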
Example #2
class FeedForwardPolicy(ActorCriticPolicy):
    """Feed-forward neural network actor-critic policy.

    Attributes
    ----------
    sess : tf.compat.v1.Session
        the current TensorFlow session
    ob_space : gym.spaces.*
        the observation space of the environment
    ac_space : gym.spaces.*
        the action space of the environment
    co_space : gym.spaces.*
        the context space of the environment
    buffer_size : int
        the max number of transitions to store
    batch_size : int
        SGD batch size
    actor_lr : float
        actor learning rate
    critic_lr : float
        critic learning rate
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    layers : list of int
        the sizes of the hidden layers in the neural network for the policy
    tau : float
        target update rate
    gamma : float
        discount factor
    layer_norm : bool
        enable layer normalisation
    act_fun : tf.nn.*
        the activation function to use in the neural network
    use_huber : bool
        specifies whether to use the huber distance function as the loss for
        the critic. If set to False, the mean-squared error metric is used
        instead
    noise : float
        scaling term to the range of the action space, that is subsequently
        used as the standard deviation of Gaussian noise added to the action if
        `apply_noise` is set to True in `get_action`
    target_policy_noise : float
        standard deviation term to the noise from the output of the target
        actor policy. See TD3 paper for more.
    target_noise_clip : float
        clipping term for the noise injected in the target actor policy
    zero_fingerprint : bool
        whether to zero the last two elements of the observations for the actor
        and critic computations. Used for the worker policy when fingerprints
        are being implemented.
    fingerprint_dim : int
        the number of fingerprint elements in the observation. Used when trying
        to zero the fingerprint elements.
    replay_buffer : hbaselines.fcnet.replay_buffer.ReplayBuffer
        the replay buffer
    terminals1 : tf.compat.v1.placeholder
        placeholder for the next step terminals
    rew_ph : tf.compat.v1.placeholder
        placeholder for the rewards
    action_ph : tf.compat.v1.placeholder
        placeholder for the actions
    obs_ph : tf.compat.v1.placeholder
        placeholder for the observations
    obs1_ph : tf.compat.v1.placeholder
        placeholder for the next step observations
    actor_tf : tf.Variable
        the output from the actor network
    critic_tf : list of tf.Variable
        the output from the critic networks. Two networks are used to stabilize
        training.
    critic_with_actor_tf : list of tf.Variable
        the output from the critic networks with the action provided directly
        by the actor policy
    target_init_updates : tf.Operation
        an operation that sets the values of the trainable parameters of the
        target actor/critic to match those of the actual actor/critic
    target_soft_updates : tf.Operation
        soft target update function
    actor_loss : tf.Operation
        the operation that returns the loss of the actor
    actor_optimizer : tf.Operation
        the operation that updates the trainable parameters of the actor
    critic_loss : tf.Operation
        the operation that returns the loss of the critic
    critic_optimizer : tf.Operation
        the operation that updates the trainable parameters of the critic
    """
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 layer_norm,
                 layers,
                 act_fun,
                 use_huber,
                 noise,
                 target_policy_noise,
                 target_noise_clip,
                 scope=None,
                 zero_fingerprint=False,
                 fingerprint_dim=2):
        """Instantiate the feed-forward neural network policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        layer_norm : bool
            enable layer normalisation
        layers : list of int or None
            the sizes of the hidden layers in the neural network for the policy
        act_fun : tf.nn.*
            the activation function to use in the neural network
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        noise : float
            scaling term to the range of the action space, that is subsequently
            used as the standard deviation of Gaussian noise added to the
            action if `apply_noise` is set to True in `get_action`
        target_policy_noise : float
            standard deviation term to the noise from the output of the target
            actor policy. See TD3 paper for more.
        target_noise_clip : float
            clipping term for the noise injected in the target actor policy
        scope : str
            an upper-level scope term. Used by policies that call this one.
        zero_fingerprint : bool
            whether to zero the last two elements of the observations for the
            actor and critic computations. Used for the worker policy when
            fingerprints are being implemented.
        fingerprint_dim : int
            the number of fingerprint elements in the observation. Used when
            trying to zero the fingerprint elements.

        Raises
        ------
        AssertionError
            if `layers` is not a list containing at least one element
        """
        super(FeedForwardPolicy, self).__init__(sess=sess,
                                                ob_space=ob_space,
                                                ac_space=ac_space,
                                                co_space=co_space,
                                                buffer_size=buffer_size,
                                                batch_size=batch_size,
                                                actor_lr=actor_lr,
                                                critic_lr=critic_lr,
                                                verbose=verbose,
                                                tau=tau,
                                                gamma=gamma,
                                                layer_norm=layer_norm,
                                                layers=layers,
                                                act_fun=act_fun,
                                                use_huber=use_huber)

        # action magnitudes
        ac_mag = 0.5 * (ac_space.high - ac_space.low)

        self.noise = noise * ac_mag
        self.target_policy_noise = np.array([ac_mag * target_policy_noise])
        self.target_noise_clip = np.array([ac_mag * target_noise_clip])
        self.zero_fingerprint = zero_fingerprint
        self.fingerprint_dim = fingerprint_dim
        assert len(self.layers) >= 1, \
            "Error: must have at least one hidden layer for the policy."

        # Compute the shape of the input observation space, which may include
        # the contextual term.
        ob_dim = self._get_ob_dim(ob_space, co_space)

        # =================================================================== #
        # Step 1: Create a replay buffer object.                              #
        # =================================================================== #

        self.replay_buffer = ReplayBuffer(
            buffer_size=self.buffer_size,
            batch_size=self.batch_size,
            obs_dim=ob_dim[0],
            ac_dim=self.ac_space.shape[0],
        )

        # =================================================================== #
        # Step 2: Create input variables.                                     #
        # =================================================================== #

        with tf.compat.v1.variable_scope("input", reuse=False):
            self.terminals1 = tf.compat.v1.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals1')
            self.rew_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, 1),
                                                   name='rewards')
            self.action_ph = tf.compat.v1.placeholder(tf.float32,
                                                      shape=(None, ) +
                                                      ac_space.shape,
                                                      name='actions')
            self.obs_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, ) + ob_dim,
                                                   name='obs0')
            self.obs1_ph = tf.compat.v1.placeholder(tf.float32,
                                                    shape=(None, ) + ob_dim,
                                                    name='obs1')

        # logging of rewards to tensorboard
        with tf.compat.v1.variable_scope("input_info", reuse=False):
            tf.compat.v1.summary.scalar('rewards', tf.reduce_mean(self.rew_ph))

        # =================================================================== #
        # Step 3: Create actor and critic variables.                          #
        # =================================================================== #

        # Create networks and core TF parts that are shared across setup parts.
        with tf.compat.v1.variable_scope("model", reuse=False):
            self.actor_tf = self.make_actor(self.obs_ph)
            self.critic_tf = [
                self.make_critic(self.obs_ph,
                                 self.action_ph,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]
            self.critic_with_actor_tf = [
                self.make_critic(self.obs_ph,
                                 self.actor_tf,
                                 reuse=True,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]

        with tf.compat.v1.variable_scope("target", reuse=False):
            # create the target actor policy
            actor_target = self.make_actor(self.obs1_ph)

            # smooth target policy by adding clipped noise to target actions
            target_noise = tf.random.normal(tf.shape(actor_target),
                                            stddev=self.target_policy_noise)
            target_noise = tf.clip_by_value(target_noise,
                                            -self.target_noise_clip,
                                            self.target_noise_clip)

            # clip the noisy action to remain in the bounds
            noisy_actor_target = tf.clip_by_value(actor_target + target_noise,
                                                  self.ac_space.low,
                                                  self.ac_space.high)

            # create the target critic policies
            critic_target = [
                self.make_critic(self.obs1_ph,
                                 noisy_actor_target,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]

        # Create the target update operations.
        init, soft = self._setup_target_updates('model', 'target', scope, tau,
                                                verbose)
        self.target_init_updates = init
        self.target_soft_updates = soft

        # =================================================================== #
        # Step 4: Setup the optimizers for the actor and critic.              #
        # =================================================================== #

        with tf.compat.v1.variable_scope("Optimizer", reuse=False):
            self._setup_actor_optimizer(scope)
            self._setup_critic_optimizer(critic_target, scope)
            tf.compat.v1.summary.scalar('actor_loss', self.actor_loss)
            tf.compat.v1.summary.scalar('Q1_loss', self.critic_loss[0])
            tf.compat.v1.summary.scalar('Q2_loss', self.critic_loss[1])

        # =================================================================== #
        # Step 5: Setup the operations for computing model statistics.        #
        # =================================================================== #

        # Setup the running means and standard deviations of the model inputs
        # and outputs.
        self.stats_ops, self.stats_names = self._setup_stats(scope or "Model")

    def _setup_actor_optimizer(self, scope):
        """Create the actor loss, gradient, and optimizer."""
        if self.verbose >= 2:
            print('setting up actor optimizer')

        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            actor_shapes = [
                var.get_shape().as_list()
                for var in get_trainable_vars(scope_name)
            ]
            actor_nb_params = sum(
                [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
            print('  actor shapes: {}'.format(actor_shapes))
            print('  actor params: {}'.format(actor_nb_params))

        # compute the actor loss
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf[0])

        # create an optimizer object
        optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.actor_optimizer = optimizer.minimize(
            self.actor_loss, var_list=get_trainable_vars(scope_name))

    def _setup_critic_optimizer(self, critic_target, scope):
        """Create the critic loss, gradient, and optimizer."""
        if self.verbose >= 2:
            print('setting up critic optimizer')

        # compute the target critic term
        with tf.compat.v1.variable_scope("loss", reuse=False):
            q_obs1 = tf.minimum(critic_target[0], critic_target[1])
            target_q = tf.stop_gradient(self.rew_ph + (1. - self.terminals1) *
                                        self.gamma * q_obs1)

            tf.compat.v1.summary.scalar('critic_target',
                                        tf.reduce_mean(target_q))

        # choose the loss function
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        self.critic_loss = [loss_fn(q, target_q) for q in self.critic_tf]

        self.critic_optimizer = []

        for i, critic_loss in enumerate(self.critic_loss):
            scope_name = 'model/qf_{}/'.format(i)
            if scope is not None:
                scope_name = scope + '/' + scope_name

            if self.verbose >= 2:
                critic_shapes = [
                    var.get_shape().as_list()
                    for var in get_trainable_vars(scope_name)
                ]
                critic_nb_params = sum([
                    reduce(lambda x, y: x * y, shape)
                    for shape in critic_shapes
                ])
                print('  critic shapes: {}'.format(critic_shapes))
                print('  critic params: {}'.format(critic_nb_params))

            # create an optimizer object
            optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)

            # append the minimization operation for this critic
            self.critic_optimizer.append(
                optimizer.minimize(loss=critic_loss,
                                   var_list=get_trainable_vars(scope_name)))

    def make_actor(self, obs, reuse=False, scope="pi"):
        """Create an actor tensor.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the actor

        Returns
        -------
        tf.Variable
            the output from the actor
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            pi_h = obs

            # zero out the fingerprint observations for the worker policy
            if self.zero_fingerprint:
                pi_h = self._remove_fingerprint(pi_h, self.ob_space.shape[0],
                                                self.fingerprint_dim,
                                                self.co_space.shape[0])

            # create the hidden layers
            for i, layer_size in enumerate(self.layers):
                pi_h = self._layer(pi_h,
                                   layer_size,
                                   'fc{}'.format(i),
                                   act_fun=self.act_fun,
                                   layer_norm=self.layer_norm)

            # create the output layer
            policy = self._layer(
                pi_h,
                self.ac_space.shape[0],
                'output',
                act_fun=tf.nn.tanh,
                kernel_initializer=tf.random_uniform_initializer(minval=-3e-3,
                                                                 maxval=3e-3))

            # scaling terms to the output from the policy
            ac_means = (self.ac_space.high + self.ac_space.low) / 2.
            ac_magnitudes = (self.ac_space.high - self.ac_space.low) / 2.

            policy = ac_means + ac_magnitudes * tf.to_float(policy)

        return policy

    def make_critic(self, obs, action, reuse=False, scope="qf"):
        """Create a critic tensor.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the critic

        Returns
        -------
        tf.Variable
            the output from the critic
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # concatenate the observations and actions
            qf_h = tf.concat([obs, action], axis=-1)

            # zero out the fingerprint observations for the worker policy
            if self.zero_fingerprint:
                qf_h = self._remove_fingerprint(
                    qf_h, self.ob_space.shape[0], self.fingerprint_dim,
                    self.co_space.shape[0] + self.ac_space.shape[0])

            # create the hidden layers
            for i, layer_size in enumerate(self.layers):
                qf_h = self._layer(qf_h,
                                   layer_size,
                                   'fc{}'.format(i),
                                   act_fun=self.act_fun,
                                   layer_norm=self.layer_norm)

            # create the output layer
            qvalue_fn = self._layer(
                qf_h,
                1,
                'qf_output',
                kernel_initializer=tf.random_uniform_initializer(minval=-3e-3,
                                                                 maxval=3e-3))

        return qvalue_fn

    def update(self, update_actor=True, **kwargs):
        """Perform a gradient update step.

        **Note**: The target soft updates occur at the same frequency as the
        actor updates.

        Parameters
        ----------
        update_actor : bool
            specifies whether to update the actor policy. The critic policy is
            still updated if this value is set to False.

        Returns
        -------
        [float, float]
            Q1 loss, Q2 loss
        float
            actor loss
        """
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return [0, 0], 0

        # Get a batch
        obs0, actions, rewards, obs1, terminals1 = self.replay_buffer.sample()

        return self.update_from_batch(obs0,
                                      actions,
                                      rewards,
                                      obs1,
                                      terminals1,
                                      update_actor=update_actor)

    def update_from_batch(self,
                          obs0,
                          actions,
                          rewards,
                          obs1,
                          terminals1,
                          update_actor=True):
        """Perform gradient update step given a batch of data.

        Parameters
        ----------
        obs0 : np.ndarray
            batch of observations
        actions : numpy float
            batch of actions executed given obs_batch
        rewards : numpy float
            rewards received as results of executing act_batch
        obs1 : np.ndarray
            next set of observations seen after executing act_batch
        terminals1 : numpy bool
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        update_actor : bool, optional
            specifies whether to perform gradient update procedures to the
            actor policy. Default set to True. Note that the update procedure
            for the critic is always performed when calling this method.

        Returns
        -------
        [float, float]
            Q1 loss, Q2 loss
        float
            actor loss
        """
        # Reshape to match previous behavior and placeholder shape.
        rewards = rewards.reshape(-1, 1)
        terminals1 = terminals1.reshape(-1, 1)

        # Update operations for the critic networks.
        step_ops = [
            self.critic_loss, self.critic_optimizer[0],
            self.critic_optimizer[1]
        ]

        if update_actor:
            # Actor updates and target soft update operation.
            step_ops += [
                self.actor_loss, self.actor_optimizer, self.target_soft_updates
            ]

        # Perform the update operations and collect the critic loss.
        critic_loss, *_vals = self.sess.run(step_ops,
                                            feed_dict={
                                                self.obs_ph: obs0,
                                                self.action_ph: actions,
                                                self.rew_ph: rewards,
                                                self.obs1_ph: obs1,
                                                self.terminals1: terminals1
                                            })

        # Extract the actor loss.
        actor_loss = _vals[2] if update_actor else 0

        return critic_loss, actor_loss

    def get_action(self, obs, context, apply_noise, random_actions):
        """See parent class."""
        # Add the contextual observation, if applicable.
        obs = self._get_obs(obs, context, axis=1)

        if random_actions:
            action = np.array([self.ac_space.sample()])
        else:
            action = self.sess.run(self.actor_tf, {self.obs_ph: obs})

            if apply_noise:
                # compute noisy action
                action += np.random.normal(0, self.noise, action.shape)

                # clip by bounds
                action = np.clip(action, self.ac_space.low, self.ac_space.high)

        return action

    def value(self, obs, context, action):
        """See parent class."""
        # Add the contextual observation, if applicable.
        obs = self._get_obs(obs, context, axis=1)

        return self.sess.run(self.critic_tf,
                             feed_dict={
                                 self.obs_ph: obs,
                                 self.action_ph: action
                             })

    def store_transition(self,
                         obs0,
                         context0,
                         action,
                         reward,
                         obs1,
                         context1,
                         done,
                         is_final_step,
                         evaluate=False):
        """See parent class."""
        if not evaluate:
            # Add the contextual observation, if applicable.
            obs0 = self._get_obs(obs0, context0, axis=0)
            obs1 = self._get_obs(obs1, context1, axis=0)

            # Modify the done mask in accordance with the TD3 algorithm. Done
            # masks that correspond to the final step are set to False.
            done = done and not is_final_step

            self.replay_buffer.add(obs0, action, reward, obs1, float(done))

    def initialize(self):
        """See parent class.

        This method initializes the target parameters to match the model
        parameters.
        """
        self.sess.run(self.target_init_updates)

    def _setup_stats(self, base="Model"):
        """Create the running means and std of the model inputs and outputs.

        This method also adds the same running means and stds as scalars to
        tensorboard for additional storage.
        """
        ops = []
        names = []

        ops += [tf.reduce_mean(self.critic_tf[0])]
        names += ['{}/reference_Q1_mean'.format(base)]
        ops += [reduce_std(self.critic_tf[0])]
        names += ['{}/reference_Q1_std'.format(base)]

        ops += [tf.reduce_mean(self.critic_tf[1])]
        names += ['{}/reference_Q2_mean'.format(base)]
        ops += [reduce_std(self.critic_tf[1])]
        names += ['{}/reference_Q2_std'.format(base)]

        ops += [tf.reduce_mean(self.critic_with_actor_tf[0])]
        names += ['{}/reference_actor_Q1_mean'.format(base)]
        ops += [reduce_std(self.critic_with_actor_tf[0])]
        names += ['{}/reference_actor_Q1_std'.format(base)]

        ops += [tf.reduce_mean(self.critic_with_actor_tf[1])]
        names += ['{}/reference_actor_Q2_mean'.format(base)]
        ops += [reduce_std(self.critic_with_actor_tf[1])]
        names += ['{}/reference_actor_Q2_std'.format(base)]

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['{}/reference_action_mean'.format(base)]
        ops += [reduce_std(self.actor_tf)]
        names += ['{}/reference_action_std'.format(base)]

        # Add all names and ops to the tensorboard summary.
        for op, name in zip(ops, names):
            tf.compat.v1.summary.scalar(name, op)

        return ops, names

    def get_td_map(self):
        """See parent class."""
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return {}

        # Get a batch.
        obs0, actions, rewards, obs1, done1 = self.replay_buffer.sample()

        return self.get_td_map_from_batch(obs0, actions, rewards, obs1, done1)

    def get_td_map_from_batch(self, obs0, actions, rewards, obs1, terminals1):
        """Convert a batch to a td_map."""
        # Reshape to match previous behavior and placeholder shape.
        rewards = rewards.reshape(-1, 1)
        terminals1 = terminals1.reshape(-1, 1)

        td_map = {
            self.obs_ph: obs0,
            self.action_ph: actions,
            self.rew_ph: rewards,
            self.obs1_ph: obs1,
            self.terminals1: terminals1
        }

        return td_map
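
A minimal instantiation sketch for the TD3-style policy defined above. This is not part of the source; it assumes the surrounding hbaselines package (the ActorCriticPolicy base class and its helper utilities) is importable, and the toy gym.spaces.Box spaces and every hyperparameter value below are illustrative assumptions chosen only to show how the constructor arguments fit together.

import numpy as np
import tensorflow as tf
from gym.spaces import Box

# Placeholders require graph mode when running under TF 2.x.
tf.compat.v1.disable_eager_execution()

# Toy observation, action, and context spaces (illustrative only).
ob_space = Box(low=-1., high=1., shape=(2,), dtype=np.float32)
ac_space = Box(low=-1., high=1., shape=(1,), dtype=np.float32)
co_space = Box(low=-1., high=1., shape=(1,), dtype=np.float32)

sess = tf.compat.v1.Session()

policy = FeedForwardPolicy(
    sess=sess,
    ob_space=ob_space,
    ac_space=ac_space,
    co_space=co_space,
    buffer_size=200000,
    batch_size=128,
    actor_lr=3e-4,
    critic_lr=3e-4,
    verbose=2,
    tau=0.005,
    gamma=0.99,
    layer_norm=False,
    layers=[256, 256],
    act_fun=tf.nn.relu,
    use_huber=False,
    noise=0.1,
    target_policy_noise=0.2,
    target_noise_clip=0.5,
)

# Initialize all variables and sync the target networks with the model.
sess.run(tf.compat.v1.global_variables_initializer())
policy.initialize()
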
Example #3
class FeedForwardPolicy(ActorCriticPolicy):
    """SAC-compatible feedforward policy.

    Attributes
    ----------
    sess : tf.compat.v1.Session
        the current TensorFlow session
    ob_space : gym.spaces.*
        the observation space of the environment
    ac_space : gym.spaces.*
        the action space of the environment
    co_space : gym.spaces.*
        the context space of the environment
    buffer_size : int
        the max number of transitions to store
    batch_size : int
        SGD batch size
    actor_lr : float
        actor learning rate
    critic_lr : float
        critic learning rate
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    tau : float
        target update rate
    gamma : float
        discount factor
    use_huber : bool
        specifies whether to use the huber distance function as the loss for
        the critic. If set to False, the mean-squared error metric is used
        instead
    model_params : dict
        dictionary of model-specific parameters. See parent class.
    target_entropy : float
        target entropy used when learning the entropy coefficient
    replay_buffer : hbaselines.fcnet.replay_buffer.ReplayBuffer
        the replay buffer
    terminals1 : tf.compat.v1.placeholder
        placeholder for the next step terminals
    rew_ph : tf.compat.v1.placeholder
        placeholder for the rewards
    action_ph : tf.compat.v1.placeholder
        placeholder for the actions
    obs_ph : tf.compat.v1.placeholder
        placeholder for the observations
    obs1_ph : tf.compat.v1.placeholder
        placeholder for the next step observations
    deterministic_action : tf.Variable
        the output from the deterministic actor
    policy_out : tf.Variable
        the output from the stochastic actor
    logp_pi : tf.Variable
        the log-probability of the action sampled by the policy, given the
        input observation
    logp_action : tf.Variable
        the log-probability of a fixed (placeholder) action, given the input
        observation. Used by the hierarchical policy to perform off-policy
        corrections.
    qf1 : tf.Variable
        the output from the first Q-function
    qf2 : tf.Variable
        the output from the second Q-function
    value_fn : tf.Variable
        the output from the value function
    qf1_pi : tf.Variable
        the output from the first Q-function with the action provided directly
        by the actor policy
    qf2_pi : tf.Variable
        the output from the second Q-function with the action provided directly
        by the actor policy
    log_alpha : tf.Variable
        the log of the entropy coefficient
    alpha : tf.Variable
        the entropy coefficient
    value_target : tf.Variable
        the output from the target value function. Takes as input the next-step
        observations
    target_init_updates : tf.Operation
        an operation that sets the values of the trainable parameters of the
        target actor/critic to match those of the actual actor/critic
    target_soft_updates : tf.Operation
        soft target update function
    alpha_loss : tf.Operation
        the operation that returns the loss of the entropy term
    alpha_optimizer : tf.Operation
        the operation that updates the trainable parameters of the entropy term
    actor_loss : tf.Operation
        the operation that returns the loss of the actor
    actor_optimizer : tf.Operation
        the operation that updates the trainable parameters of the actor
    critic_loss : tf.Operation
        the operation that returns the loss of the critic
    critic_optimizer : tf.Operation
        the operation that updates the trainable parameters of the critic
    """
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 use_huber,
                 l2_penalty,
                 model_params,
                 target_entropy,
                 scope=None,
                 num_envs=1):
        """Instantiate the feed-forward neural network policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        l2_penalty : float
            L2 regularization penalty. This is applied to the policy network.
        model_params : dict
            dictionary of model-specific parameters. See parent class.
        target_entropy : float
            target entropy used when learning the entropy coefficient. If set
            to None, a heuristic value is used.
        scope : str
            an upper-level scope term. Used by policies that call this one.
        num_envs : int
            number of environments used to run simulations in parallel
        """
        super(FeedForwardPolicy, self).__init__(
            sess=sess,
            ob_space=ob_space,
            ac_space=ac_space,
            co_space=co_space,
            buffer_size=buffer_size,
            batch_size=batch_size,
            actor_lr=actor_lr,
            critic_lr=critic_lr,
            verbose=verbose,
            tau=tau,
            gamma=gamma,
            use_huber=use_huber,
            l2_penalty=l2_penalty,
            model_params=model_params,
            num_envs=num_envs,
        )

        if target_entropy is None:
            self.target_entropy = -np.prod(self.ac_space.shape)
        else:
            self.target_entropy = target_entropy

        self._ac_means = 0.5 * (ac_space.high + ac_space.low)
        self._ac_magnitudes = 0.5 * (ac_space.high - ac_space.low)

        # Compute the shape of the input observation space, which may include
        # the contextual term.
        ob_dim = self._get_ob_dim(ob_space, co_space)

        # =================================================================== #
        # Step 1: Create a replay buffer object.                              #
        # =================================================================== #

        self.replay_buffer = ReplayBuffer(
            buffer_size=self.buffer_size,
            batch_size=self.batch_size,
            obs_dim=ob_dim[0],
            ac_dim=self.ac_space.shape[0],
        )

        # =================================================================== #
        # Step 2: Create input variables.                                     #
        # =================================================================== #

        with tf.compat.v1.variable_scope("input", reuse=False):
            self.terminals1 = tf.compat.v1.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals1')
            self.rew_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, 1),
                                                   name='rewards')
            self.action_ph = tf.compat.v1.placeholder(tf.float32,
                                                      shape=(None, ) +
                                                      ac_space.shape,
                                                      name='actions')
            self.obs_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, ) + ob_dim,
                                                   name='obs0')
            self.obs1_ph = tf.compat.v1.placeholder(tf.float32,
                                                    shape=(None, ) + ob_dim,
                                                    name='obs1')

        # =================================================================== #
        # Step 3: Create actor and critic variables.                          #
        # =================================================================== #

        # Create networks and core TF parts that are shared across setup parts.
        with tf.compat.v1.variable_scope("model", reuse=False):
            self.deterministic_action, self.policy_out, self.logp_pi, \
                self.logp_action = self.make_actor(self.obs_ph, self.action_ph)
            self.qf1, self.qf2, self.value_fn = self.make_critic(
                self.obs_ph, self.action_ph, create_qf=True, create_vf=True)
            self.qf1_pi, self.qf2_pi, _ = self.make_critic(self.obs_ph,
                                                           self.policy_out,
                                                           create_qf=True,
                                                           create_vf=False,
                                                           reuse=True)

            # The entropy coefficient (temperature) can be learned
            # automatically; see the "Automating Entropy Adjustment for
            # Maximum Entropy RL" section of https://arxiv.org/abs/1812.05905.
            self.log_alpha = tf.compat.v1.get_variable('log_alpha',
                                                       dtype=tf.float32,
                                                       initializer=0.0)
            self.alpha = tf.exp(self.log_alpha)

        with tf.compat.v1.variable_scope("target", reuse=False):
            # Create the value network
            _, _, value_target = self.make_critic(self.obs1_ph,
                                                  create_qf=False,
                                                  create_vf=True)
            self.value_target = value_target

        # Create the target update operations.
        init, soft = self._setup_target_updates('model/value_fns/vf',
                                                'target/value_fns/vf', scope,
                                                tau, verbose)
        self.target_init_updates = init
        self.target_soft_updates = soft

        # =================================================================== #
        # Step 4: Setup the optimizers for the actor and critic.              #
        # =================================================================== #

        with tf.compat.v1.variable_scope("Optimizer", reuse=False):
            self._setup_actor_optimizer(scope)
            self._setup_critic_optimizer(scope)

        # =================================================================== #
        # Step 5: Setup the operations for computing model statistics.        #
        # =================================================================== #

        # Setup the running means and standard deviations of the model inputs
        # and outputs.
        self.stats_ops, self.stats_names = self._setup_stats(scope or "Model")

    def make_actor(self, obs, action, reuse=False, scope="pi"):
        """Create the actor variables.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the actor

        Returns
        -------
        tf.Variable
            the output from the deterministic actor
        tf.Variable
            the output from the stochastic actor
        tf.Variable
            the log-probability of the action sampled by the policy, given the
            input observation
        tf.Variable
            the log-probability of a fixed (placeholder) action, given the
            input observation
        """
        # Initial image pre-processing (for convolutional policies).
        if self.model_params["model_type"] == "conv":
            pi_h = create_conv(
                obs=obs,
                image_height=self.model_params["image_height"],
                image_width=self.model_params["image_width"],
                image_channels=self.model_params["image_channels"],
                ignore_flat_channels=self.model_params["ignore_flat_channels"],
                ignore_image=self.model_params["ignore_image"],
                filters=self.model_params["filters"],
                kernel_sizes=self.model_params["kernel_sizes"],
                strides=self.model_params["strides"],
                act_fun=self.model_params["act_fun"],
                layer_norm=self.model_params["layer_norm"],
                scope=scope,
                reuse=reuse,
            )
        else:
            pi_h = obs

        # Create the model.
        policy_mean, log_std = create_fcnet(
            obs=pi_h,
            layers=self.model_params["layers"],
            num_output=self.ac_space.shape[0],
            stochastic=True,
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            scope=scope,
            reuse=reuse,
        )

        # OpenAI Variation to cap the standard deviation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        std = tf.exp(log_std)

        # Reparameterization trick
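        # Sample an action as mean + std * eps with eps ~ N(0, I), which keeps
        # the sample differentiable with respect to the policy parameters.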
        policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std
        logp_pi = gaussian_likelihood(policy, policy_mean, log_std)
        logp_ac = gaussian_likelihood(action, policy_mean, log_std)

        # Apply squashing and account for it in the probability
        _, _, logp_ac = apply_squashing_func(policy_mean, action, logp_ac)
        deterministic_policy, policy, logp_pi = apply_squashing_func(
            policy_mean, policy, logp_pi)

        return deterministic_policy, policy, logp_pi, logp_ac

    def make_critic(self,
                    obs,
                    action=None,
                    reuse=False,
                    scope="value_fns",
                    create_qf=True,
                    create_vf=True):
        """Create the critic variables.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the critic
        create_qf : bool
            whether to create the Q-functions
        create_vf : bool
            whether to create the value function

        Returns
        -------
        tf.Variable
            the output from the first Q-function. Set to None if `create_qf` is
            False.
        tf.Variable
            the output from the second Q-function. Set to None if `create_qf`
            is False.
        tf.Variable
            the output from the value function. Set to None if `create_vf` is
            False.
        """
        conv_params = dict(
            image_height=self.model_params["image_height"],
            image_width=self.model_params["image_width"],
            image_channels=self.model_params["image_channels"],
            ignore_flat_channels=self.model_params["ignore_flat_channels"],
            ignore_image=self.model_params["ignore_image"],
            filters=self.model_params["filters"],
            kernel_sizes=self.model_params["kernel_sizes"],
            strides=self.model_params["strides"],
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            reuse=reuse,
        )

        fcnet_params = dict(
            layers=self.model_params["layers"],
            num_output=1,
            stochastic=False,
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            reuse=reuse,
        )

        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # Value function
            if create_vf:
                if self.model_params["model_type"] == "conv":
                    vf_h = create_conv(obs=obs, scope="vf", **conv_params)
                else:
                    vf_h = obs

                value_fn = create_fcnet(obs=vf_h,
                                        scope="vf",
                                        output_pre="vf_",
                                        **fcnet_params)
            else:
                value_fn = None

            # Double Q values to reduce overestimation
            if create_qf:
                # Concatenate the observations and actions.
                qf_h = tf.concat([obs, action], axis=-1)

                if self.model_params["model_type"] == "conv":
                    qf1_h = create_conv(obs=qf_h, scope="qf1", **conv_params)
                    qf2_h = create_conv(obs=qf_h, scope="qf2", **conv_params)
                else:
                    qf1_h = qf_h
                    qf2_h = qf_h

                qf1 = create_fcnet(obs=qf1_h,
                                   scope="qf1",
                                   output_pre="qf_",
                                   **fcnet_params)
                qf2 = create_fcnet(obs=qf2_h,
                                   scope="qf2",
                                   output_pre="qf_",
                                   **fcnet_params)
            else:
                qf1, qf2 = None, None

        return qf1, qf2, value_fn

    def update(self, **kwargs):
        """Perform a gradient update step."""
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return

        # Get a batch
        obs0, actions, rewards, obs1, done1 = self.replay_buffer.sample()

        return self.update_from_batch(obs0, actions, rewards, obs1, done1)

    def update_from_batch(self,
                          obs0,
                          actions,
                          rewards,
                          obs1,
                          terminals1,
                          update_actor=True):
        """Perform gradient update step given a batch of data.

        Parameters
        ----------
        obs0 : array_like
            batch of observations
        actions : array_like
            batch of actions executed given obs_batch
        rewards : array_like
            rewards received as results of executing act_batch
        obs1 : array_like
            next set of observations seen after executing act_batch
        terminals1 : numpy bool
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        update_actor : bool
            whether to update the actor policy. Unused by this method.
        """
        del update_actor  # unused by this method

        # Normalize the actions (bounded between [-1, 1]).
        actions = (actions - self._ac_means) / self._ac_magnitudes

        # Reshape to match previous behavior and placeholder shape.
        rewards = rewards.reshape(-1, 1)
        terminals1 = terminals1.reshape(-1, 1)

        # Collect all update and loss call operations.
        step_ops = [
            self.critic_optimizer,
            self.actor_optimizer,
            self.alpha_optimizer,
            self.target_soft_updates,
        ]

        # Prepare the feed_dict information.
        feed_dict = {
            self.obs_ph: obs0,
            self.action_ph: actions,
            self.rew_ph: rewards,
            self.obs1_ph: obs1,
            self.terminals1: terminals1
        }

        # Perform the update operations.
        self.sess.run(step_ops, feed_dict)

    def get_action(self, obs, context, apply_noise, random_actions, env_num=0):
        """See parent class."""
        # Add the contextual observation, if applicable.
        obs = self._get_obs(obs, context, axis=1)

        if random_actions:
            return np.array([self.ac_space.sample()])
        elif apply_noise:
            normalized_action = self.sess.run(self.policy_out,
                                              feed_dict={self.obs_ph: obs})
            return self._ac_magnitudes * normalized_action + self._ac_means
        else:
            normalized_action = self.sess.run(self.deterministic_action,
                                              feed_dict={self.obs_ph: obs})
            return self._ac_magnitudes * normalized_action + self._ac_means

    def _setup_critic_optimizer(self, scope):
        """Create minimization operation for critic Q-function.

        Create a `tf.optimizer.minimize` operation for updating critic
        Q-function with gradient descent.

        See Equations (5, 6) in [1], for further information of the Q-function
        update rule.
        """
        scope_name = 'model/value_fns'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up critic optimizer')
            for name in ['qf1', 'qf2', 'vf']:
                scope_i = '{}/{}'.format(scope_name, name)
                print_params_shape(scope_i, name)

        # Take the min of the two Q-Values (Double-Q Learning)
        min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi)

        # Target for Q value regression
        q_backup = tf.stop_gradient(self.rew_ph + (1 - self.terminals1) *
                                    self.gamma * self.value_target)

        # choose the loss function
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        # Compute Q-Function loss
        qf1_loss = loss_fn(q_backup, self.qf1)
        qf2_loss = loss_fn(q_backup, self.qf2)

        # Target for value fn regression
        # We update the vf towards the min of two Q-functions in order to
        # reduce overestimation bias from function approximation error.
        v_backup = tf.stop_gradient(min_qf_pi - self.alpha * self.logp_pi)
        value_loss = loss_fn(self.value_fn, v_backup)

        self.critic_loss = (qf1_loss, qf2_loss, value_loss)

        # Combine the loss functions for the optimizer.
        critic_loss = qf1_loss + qf2_loss + value_loss

        # Critic train op
        critic_optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)
        self.critic_optimizer = critic_optimizer.minimize(
            critic_loss, var_list=get_trainable_vars(scope_name))

    def _setup_actor_optimizer(self, scope):
        """Create minimization operations for policy and entropy.

        Creates optimizer `minimize` operations for updating the policy and
        the entropy coefficient with gradient descent.

        See Section 4.2 in [1] for further information on the policy update,
        and Section 5 in [1] for further information on the entropy update.
        """
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up actor and alpha optimizers')
            print_params_shape(scope_name, "actor")

        # Take the min of the two Q-Values (Double-Q Learning)
        min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi)

        # Compute the entropy temperature loss.
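        # The temperature alpha is trained so that the policy's entropy tracks
        # target_entropy (see https://arxiv.org/abs/1812.05905).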
        self.alpha_loss = -tf.reduce_mean(
            self.log_alpha *
            tf.stop_gradient(self.logp_pi + self.target_entropy))

        alpha_optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.alpha_optimizer = alpha_optimizer.minimize(
            self.alpha_loss, var_list=self.log_alpha)

        # Compute the policy loss
        self.actor_loss = tf.reduce_mean(self.alpha * self.logp_pi - min_qf_pi)

        # Add a regularization penalty.
        self.actor_loss += self._l2_loss(self.l2_penalty, scope_name)

        # Policy train op (has to be separate from value train op, because
        # min_qf_pi appears in policy_loss)
        actor_optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.actor_optimizer = actor_optimizer.minimize(
            self.actor_loss, var_list=get_trainable_vars(scope_name))

    def _setup_stats(self, base):
        """Create the running means and std of the model inputs and outputs.

        This method also adds the same running means and stds as scalars to
        tensorboard for additional storage.
        """
        ops = []
        names = []

        ops += [tf.reduce_mean(self.qf1)]
        names += ['{}/reference_Q1_mean'.format(base)]
        ops += [reduce_std(self.qf1)]
        names += ['{}/reference_Q1_std'.format(base)]

        ops += [tf.reduce_mean(self.qf2)]
        names += ['{}/reference_Q2_mean'.format(base)]
        ops += [reduce_std(self.qf2)]
        names += ['{}/reference_Q2_std'.format(base)]

        ops += [tf.reduce_mean(self.qf1_pi)]
        names += ['{}/reference_actor_Q1_mean'.format(base)]
        ops += [reduce_std(self.qf1_pi)]
        names += ['{}/reference_actor_Q1_std'.format(base)]

        ops += [tf.reduce_mean(self.qf2_pi)]
        names += ['{}/reference_actor_Q2_mean'.format(base)]
        ops += [reduce_std(self.qf2_pi)]
        names += ['{}/reference_actor_Q2_std'.format(base)]

        ops += [
            tf.reduce_mean(self._ac_magnitudes * self.policy_out +
                           self._ac_means)
        ]
        names += ['{}/reference_action_mean'.format(base)]
        ops += [
            reduce_std(self._ac_magnitudes * self.policy_out + self._ac_means)
        ]
        names += ['{}/reference_action_std'.format(base)]

        ops += [tf.reduce_mean(self.logp_pi)]
        names += ['{}/reference_log_probability_mean'.format(base)]
        ops += [reduce_std(self.logp_pi)]
        names += ['{}/reference_log_probability_std'.format(base)]

        ops += [tf.reduce_mean(self.rew_ph)]
        names += ['{}/rewards'.format(base)]

        ops += [self.alpha_loss]
        names += ['{}/alpha_loss'.format(base)]

        ops += [self.actor_loss]
        names += ['{}/actor_loss'.format(base)]

        ops += [self.critic_loss[0]]
        names += ['{}/Q1_loss'.format(base)]

        ops += [self.critic_loss[1]]
        names += ['{}/Q2_loss'.format(base)]

        ops += [self.critic_loss[2]]
        names += ['{}/value_loss'.format(base)]

        # Add all names and ops to the tensorboard summary.
        for op, name in zip(ops, names):
            tf.compat.v1.summary.scalar(name, op)

        return ops, names

    def initialize(self):
        """See parent class."""
        self.sess.run(self.target_init_updates)

    def store_transition(self,
                         obs0,
                         context0,
                         action,
                         reward,
                         obs1,
                         context1,
                         done,
                         is_final_step,
                         env_num=0,
                         evaluate=False):
        """See parent class."""
        if not evaluate:
            # Add the contextual observation, if applicable.
            obs0 = self._get_obs(obs0, context0, axis=0)
            obs1 = self._get_obs(obs1, context1, axis=0)

            self.replay_buffer.add(obs0, action, reward, obs1, float(done))

    def get_td_map(self):
        """See parent class."""
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return {}

        # Get a batch.
        obs0, actions, rewards, obs1, done1 = self.replay_buffer.sample()

        return self.get_td_map_from_batch(obs0, actions, rewards, obs1, done1)

    def get_td_map_from_batch(self, obs0, actions, rewards, obs1, terminals1):
        """Convert a batch to a td_map."""
        # Reshape to match previous behavior and placeholder shape.
        rewards = rewards.reshape(-1, 1)
        terminals1 = terminals1.reshape(-1, 1)

        td_map = {
            self.obs_ph: obs0,
            self.action_ph: actions,
            self.rew_ph: rewards,
            self.obs1_ph: obs1,
            self.terminals1: terminals1
        }

        return td_map
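
As a quick numeric check of the entropy-temperature objective defined in
`_setup_actor_optimizer` above, the standalone NumPy sketch below (made-up
`log_pi` values and a hypothetical `target_entropy`) shows how the gradient of
the alpha loss with respect to `log_alpha` depends only on how far the sampled
log-probabilities are from the entropy target:

import numpy as np

# Hypothetical batch of log-probabilities from the current policy.
log_pi = np.array([-0.2, -0.5, -0.1])
target_entropy = -1.0  # e.g. -dim(action space), a common SAC heuristic
log_alpha = 0.5

alpha_loss = -np.mean(log_alpha * (log_pi + target_entropy))
# d(alpha_loss) / d(log_alpha) = -mean(log_pi + target_entropy)
grad = -np.mean(log_pi + target_entropy)

# Here the policy entropy (-mean(log_pi) = 0.27) exceeds the target (-1.0),
# the gradient is positive, and gradient descent therefore lowers log_alpha,
# shrinking alpha and down-weighting the entropy bonus.
print(alpha_loss, grad)
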
Example #4
class FeedForwardPolicy(ActorCriticPolicy):
    """Feed-forward neural network actor-critic policy.

    Attributes
    ----------
    sess : tf.compat.v1.Session
        the current TensorFlow session
    ob_space : gym.spaces.*
        the observation space of the environment
    ac_space : gym.spaces.*
        the action space of the environment
    co_space : gym.spaces.*
        the context space of the environment
    buffer_size : int
        the max number of transitions to store
    batch_size : int
        SGD batch size
    actor_lr : float
        actor learning rate
    critic_lr : float
        critic learning rate
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    tau : float
        target update rate
    gamma : float
        discount factor
    use_huber : bool
        specifies whether to use the huber distance function as the loss for
        the critic. If set to False, the mean-squared error metric is used
        instead
    l2_penalty : float
        L2 regularization penalty. This is applied to the policy network.
    model_params : dict
        dictionary of model-specific parameters. See parent class.
    noise : float
        scaling term to the range of the action space, that is subsequently
        used as the standard deviation of Gaussian noise added to the action if
        `apply_noise` is set to True in `get_action`
    target_policy_noise : float
        standard deviation term to the noise from the output of the target
        actor policy. See TD3 paper for more.
    target_noise_clip : float
        clipping term for the noise injected in the target actor policy
    replay_buffer : hbaselines.fcnet.replay_buffer.ReplayBuffer
        the replay buffer
    terminals1 : tf.compat.v1.placeholder
        placeholder for the next step terminals
    rew_ph : tf.compat.v1.placeholder
        placeholder for the rewards
    action_ph : tf.compat.v1.placeholder
        placeholder for the actions
    obs_ph : tf.compat.v1.placeholder
        placeholder for the observations
    obs1_ph : tf.compat.v1.placeholder
        placeholder for the next step observations
    actor_tf : tf.Variable
        the output from the actor network
    critic_tf : list of tf.Variable
        the output from the critic networks. Two networks are used to stabilize
        training.
    critic_with_actor_tf : list of tf.Variable
        the output from the critic networks with the action provided directly
        by the actor policy
    target_init_updates : tf.Operation
        an operation that sets the values of the trainable parameters of the
        target actor/critic to match those of the actual actor/critic
    target_soft_updates : tf.Operation
        soft target update function
    actor_loss : tf.Operation
        the operation that returns the loss of the actor
    actor_optimizer : tf.Operation
        the operation that updates the trainable parameters of the actor
    critic_loss : tf.Operation
        the operation that returns the loss of the critic
    critic_optimizer : tf.Operation
        the operation that updates the trainable parameters of the critic
    """
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 use_huber,
                 l2_penalty,
                 model_params,
                 noise,
                 target_policy_noise,
                 target_noise_clip,
                 scope=None,
                 num_envs=1):
        """Instantiate the feed-forward neural network policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        l2_penalty : float
            L2 regularization penalty. This is applied to the policy network.
        model_params : dict
            dictionary of model-specific parameters. See parent class.
        noise : float
            scaling term to the range of the action space, that is subsequently
            used as the standard deviation of Gaussian noise added to the
            action if `apply_noise` is set to True in `get_action`
        target_policy_noise : float
            standard deviation term to the noise from the output of the target
            actor policy. See TD3 paper for more.
        target_noise_clip : float
            clipping term for the noise injected in the target actor policy
        scope : str
            an upper-level scope term. Used by policies that call this one.
        num_envs : int
            number of environments used to run simulations in parallel
        """
        super(FeedForwardPolicy, self).__init__(
            sess=sess,
            ob_space=ob_space,
            ac_space=ac_space,
            co_space=co_space,
            buffer_size=buffer_size,
            batch_size=batch_size,
            actor_lr=actor_lr,
            critic_lr=critic_lr,
            verbose=verbose,
            tau=tau,
            gamma=gamma,
            use_huber=use_huber,
            l2_penalty=l2_penalty,
            model_params=model_params,
            num_envs=num_envs,
        )

        # action magnitudes
        ac_mag = 0.5 * (ac_space.high - ac_space.low)

        self.noise = noise * ac_mag
        self.target_policy_noise = np.array([ac_mag * target_policy_noise])
        self.target_noise_clip = np.array([ac_mag * target_noise_clip])

        # Compute the shape of the input observation space, which may include
        # the contextual term.
        ob_dim = self._get_ob_dim(ob_space, co_space)

        # =================================================================== #
        # Step 1: Create a replay buffer object.                              #
        # =================================================================== #

        self.replay_buffer = ReplayBuffer(
            buffer_size=self.buffer_size,
            batch_size=self.batch_size,
            obs_dim=ob_dim[0],
            ac_dim=self.ac_space.shape[0],
        )

        # =================================================================== #
        # Step 2: Create input variables.                                     #
        # =================================================================== #

        with tf.compat.v1.variable_scope("input", reuse=False):
            self.terminals1 = tf.compat.v1.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals1')
            self.rew_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, 1),
                                                   name='rewards')
            self.action_ph = tf.compat.v1.placeholder(tf.float32,
                                                      shape=(None, ) +
                                                      ac_space.shape,
                                                      name='actions')
            self.obs_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, ) + ob_dim,
                                                   name='obs0')
            self.obs1_ph = tf.compat.v1.placeholder(tf.float32,
                                                    shape=(None, ) + ob_dim,
                                                    name='obs1')

        # =================================================================== #
        # Step 3: Create actor and critic variables.                          #
        # =================================================================== #

        # Create networks and core TF parts that are shared across setup parts.
        with tf.compat.v1.variable_scope("model", reuse=False):
            self.actor_tf = self.make_actor(self.obs_ph)
            self.critic_tf = [
                self.make_critic(self.obs_ph,
                                 self.action_ph,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]
            self.critic_with_actor_tf = [
                self.make_critic(self.obs_ph,
                                 self.actor_tf,
                                 reuse=True,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]

        with tf.compat.v1.variable_scope("target", reuse=False):
            # create the target actor policy
            actor_target = self.make_actor(self.obs1_ph)

            # smooth target policy by adding clipped noise to target actions
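            # (TD3 target-policy smoothing: eps ~ N(0, sigma) is clipped to
            # [-c, c] and added to the target action before computing the
            # target Q-values.)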
            target_noise = tf.random.normal(tf.shape(actor_target),
                                            stddev=self.target_policy_noise)
            target_noise = tf.clip_by_value(target_noise,
                                            -self.target_noise_clip,
                                            self.target_noise_clip)

            # clip the noisy action to remain in the bounds
            noisy_actor_target = tf.clip_by_value(actor_target + target_noise,
                                                  self.ac_space.low,
                                                  self.ac_space.high)

            # create the target critic policies
            critic_target = [
                self.make_critic(self.obs1_ph,
                                 noisy_actor_target,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]

        # Create the target update operations.
        init, soft = self._setup_target_updates('model', 'target', scope, tau,
                                                verbose)
        self.target_init_updates = init
        self.target_soft_updates = soft

        # =================================================================== #
        # Step 4: Setup the optimizers for the actor and critic.              #
        # =================================================================== #

        with tf.compat.v1.variable_scope("Optimizer", reuse=False):
            self._setup_actor_optimizer(scope)
            self._setup_critic_optimizer(critic_target, scope)

        # =================================================================== #
        # Step 5: Setup the operations for computing model statistics.        #
        # =================================================================== #

        # Setup the running means and standard deviations of the model inputs
        # and outputs.
        self.stats_ops, self.stats_names = self._setup_stats(scope or "Model")

    def _setup_actor_optimizer(self, scope):
        """Create the actor loss, gradient, and optimizer."""
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up actor optimizer')
            print_params_shape(scope_name, "actor")

        # Compute the actor loss.
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf[0])

        # Add a regularization penalty.
        self.actor_loss += self._l2_loss(self.l2_penalty, scope_name)

        # Create an optimizer object.
        optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.actor_optimizer = optimizer.minimize(
            self.actor_loss, var_list=get_trainable_vars(scope_name))

    def _setup_critic_optimizer(self, critic_target, scope):
        """Create the critic loss, gradient, and optimizer."""
        if self.verbose >= 2:
            print('setting up critic optimizer')

        # compute the target critic term
        with tf.compat.v1.variable_scope("loss", reuse=False):
            q_obs1 = tf.minimum(critic_target[0], critic_target[1])
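            # Bellman backup: y = r + (1 - done) * gamma * min(Q1', Q2').
            # stop_gradient keeps the target out of the critics' gradients.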
            target_q = tf.stop_gradient(self.rew_ph + (1. - self.terminals1) *
                                        self.gamma * q_obs1)

            tf.compat.v1.summary.scalar('critic_target',
                                        tf.reduce_mean(target_q))

        # choose the loss function
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        self.critic_loss = [loss_fn(q, target_q) for q in self.critic_tf]

        self.critic_optimizer = []

        for i, critic_loss in enumerate(self.critic_loss):
            scope_name = 'model/qf_{}/'.format(i)
            if scope is not None:
                scope_name = scope + '/' + scope_name

            if self.verbose >= 2:
                print_params_shape(scope_name, "critic {}".format(i))

            # create an optimizer object
            optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)

            # create the optimizer operation and append it to the list
            self.critic_optimizer.append(
                optimizer.minimize(loss=critic_loss,
                                   var_list=get_trainable_vars(scope_name)))

    def make_actor(self, obs, reuse=False, scope="pi"):
        """Create an actor tensor.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the actor

        Returns
        -------
        tf.Variable
            the output from the actor
        """
        # Initial image pre-processing (for convolutional policies).
        if self.model_params["model_type"] == "conv":
            pi_h = create_conv(
                obs=obs,
                image_height=self.model_params["image_height"],
                image_width=self.model_params["image_width"],
                image_channels=self.model_params["image_channels"],
                ignore_flat_channels=self.model_params["ignore_flat_channels"],
                ignore_image=self.model_params["ignore_image"],
                filters=self.model_params["filters"],
                kernel_sizes=self.model_params["kernel_sizes"],
                strides=self.model_params["strides"],
                act_fun=self.model_params["act_fun"],
                layer_norm=self.model_params["layer_norm"],
                scope=scope,
                reuse=reuse,
            )
        else:
            pi_h = obs

        # Create the model.
        policy = create_fcnet(
            obs=pi_h,
            layers=self.model_params["layers"],
            num_output=self.ac_space.shape[0],
            stochastic=False,
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            scope=scope,
            reuse=reuse,
        )

        # Scaling terms to the output from the policy.
        ac_means = (self.ac_space.high + self.ac_space.low) / 2.
        ac_magnitudes = (self.ac_space.high - self.ac_space.low) / 2.

        # Apply squashing and scale by action space.
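        # tanh maps the raw network output into (-1, 1); ac_means +
        # ac_magnitudes * tanh(x) then rescales it into [low, high].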
        return ac_means + ac_magnitudes * tf.nn.tanh(policy)

    def make_critic(self, obs, action, reuse=False, scope="qf"):
        """Create a critic tensor.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the actor

        Returns
        -------
        tf.Variable
            the output from the critic
        """
        # Concatenate the observations and actions.
        qf_h = tf.concat([obs, action], axis=-1)

        # Initial image pre-processing (for convolutional policies).
        if self.model_params["model_type"] == "conv":
            qf_h = create_conv(
                obs=qf_h,
                image_height=self.model_params["image_height"],
                image_width=self.model_params["image_width"],
                image_channels=self.model_params["image_channels"],
                ignore_flat_channels=self.model_params["ignore_flat_channels"],
                ignore_image=self.model_params["ignore_image"],
                filters=self.model_params["filters"],
                kernel_sizes=self.model_params["kernel_sizes"],
                strides=self.model_params["strides"],
                act_fun=self.model_params["act_fun"],
                layer_norm=self.model_params["layer_norm"],
                scope=scope,
                reuse=reuse,
            )

        return create_fcnet(
            obs=qf_h,
            layers=self.model_params["layers"],
            num_output=1,
            stochastic=False,
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            scope=scope,
            reuse=reuse,
            output_pre="qf_",
        )

    def update(self, update_actor=True, **kwargs):
        """Perform a gradient update step.

        **Note**: The target soft updates occur at the same frequency as the
        actor updates.

        Parameters
        ----------
        update_actor : bool
            specifies whether to update the actor policy. The critic policy is
            still updated if this value is set to False.
        """
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return

        # Get a batch
        obs0, actions, rewards, obs1, terminals1 = self.replay_buffer.sample()

        return self.update_from_batch(obs0, actions, rewards, obs1, terminals1,
                                      update_actor)

    def update_from_batch(self,
                          obs0,
                          actions,
                          rewards,
                          obs1,
                          terminals1,
                          update_actor=True):
        """Perform gradient update step given a batch of data.

        Parameters
        ----------
        obs0 : array_like
            batch of observations
        actions : array_like
            batch of actions executed given obs0
        rewards : array_like
            rewards received as a result of executing actions
        obs1 : array_like
            next set of observations seen after executing actions
        terminals1 : array_like of bool
            terminals1[i] = 1 if executing actions[i] resulted in the end of
            an episode and 0 otherwise
        update_actor : bool, optional
            specifies whether to perform gradient update procedures to the
            actor policy. Defaults to True. Note that the update procedure
            for the critic is always performed when calling this method.
        """
        # Reshape to match previous behavior and placeholder shape.
        rewards = rewards.reshape(-1, 1)
        terminals1 = terminals1.reshape(-1, 1)

        # Update operations for the critic networks.
        step_ops = [self.critic_optimizer[0], self.critic_optimizer[1]]

        if update_actor:
            # Actor updates and target soft update operation.
            step_ops += [self.actor_optimizer, self.target_soft_updates]

        # Perform the update operations.
        self.sess.run(step_ops,
                      feed_dict={
                          self.obs_ph: obs0,
                          self.action_ph: actions,
                          self.rew_ph: rewards,
                          self.obs1_ph: obs1,
                          self.terminals1: terminals1
                      })

    def get_action(self, obs, context, apply_noise, random_actions, env_num=0):
        """See parent class."""
        # Add the contextual observation, if applicable.
        obs = self._get_obs(obs, context, axis=1)

        if random_actions:
            action = np.array([self.ac_space.sample()])
        else:
            action = self.sess.run(self.actor_tf, {self.obs_ph: obs})

            if apply_noise:
                # compute the noisy action
                action += np.random.normal(0, self.noise, action.shape)

                # clip by the action-space bounds
                action = np.clip(action, self.ac_space.low, self.ac_space.high)

        return action

    def store_transition(self,
                         obs0,
                         context0,
                         action,
                         reward,
                         obs1,
                         context1,
                         done,
                         is_final_step,
                         env_num=0,
                         evaluate=False):
        """See parent class."""
        if not evaluate:
            # Add the contextual observation, if applicable.
            obs0 = self._get_obs(obs0, context0, axis=0)
            obs1 = self._get_obs(obs1, context1, axis=0)

            # Modify the done mask in accordance with the TD3 algorithm. Done
            # masks that correspond to the final step are set to False.
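            # This avoids treating artificial time-limit terminations as true
            # terminal states when bootstrapping the target Q-value.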
            done = done and not is_final_step

            self.replay_buffer.add(obs0, action, reward, obs1, float(done))

    def initialize(self):
        """See parent class.

        This method initializes the target parameters to match the model
        parameters.
        """
        self.sess.run(self.target_init_updates)

    def _setup_stats(self, base):
        """Create the running means and std of the model inputs and outputs.

        This method also adds the same running means and stds as scalars to
        tensorboard for additional storage.
        """
        ops = []
        names = []

        ops += [tf.reduce_mean(self.critic_tf[0])]
        names += ['{}/reference_Q1_mean'.format(base)]
        ops += [reduce_std(self.critic_tf[0])]
        names += ['{}/reference_Q1_std'.format(base)]

        ops += [tf.reduce_mean(self.critic_tf[1])]
        names += ['{}/reference_Q2_mean'.format(base)]
        ops += [reduce_std(self.critic_tf[1])]
        names += ['{}/reference_Q2_std'.format(base)]

        ops += [tf.reduce_mean(self.critic_with_actor_tf[0])]
        names += ['{}/reference_actor_Q1_mean'.format(base)]
        ops += [reduce_std(self.critic_with_actor_tf[0])]
        names += ['{}/reference_actor_Q1_std'.format(base)]

        ops += [tf.reduce_mean(self.critic_with_actor_tf[1])]
        names += ['{}/reference_actor_Q2_mean'.format(base)]
        ops += [reduce_std(self.critic_with_actor_tf[1])]
        names += ['{}/reference_actor_Q2_std'.format(base)]

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['{}/reference_action_mean'.format(base)]
        ops += [reduce_std(self.actor_tf)]
        names += ['{}/reference_action_std'.format(base)]

        ops += [tf.reduce_mean(self.rew_ph)]
        names += ['{}/rewards'.format(base)]

        ops += [self.actor_loss]
        names += ['{}/actor_loss'.format(base)]

        ops += [self.critic_loss[0]]
        names += ['{}/Q1_loss'.format(base)]

        ops += [self.critic_loss[1]]
        names += ['{}/Q2_loss'.format(base)]

        # Add all names and ops to the tensorboard summary.
        for op, name in zip(ops, names):
            tf.compat.v1.summary.scalar(name, op)

        return ops, names

    def get_td_map(self):
        """See parent class."""
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return {}

        # Get a batch.
        obs0, actions, rewards, obs1, done1 = self.replay_buffer.sample()

        return self.get_td_map_from_batch(obs0, actions, rewards, obs1, done1)

    def get_td_map_from_batch(self, obs0, actions, rewards, obs1, terminals1):
        """Convert a batch to a td_map."""
        # Reshape to match previous behavior and placeholder shape.
        rewards = rewards.reshape(-1, 1)
        terminals1 = terminals1.reshape(-1, 1)

        td_map = {
            self.obs_ph: obs0,
            self.action_ph: actions,
            self.rew_ph: rewards,
            self.obs1_ph: obs1,
            self.terminals1: terminals1
        }

        return td_map
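
To make the target-policy smoothing in Step 3 above concrete, here is a
standalone NumPy sketch (illustrative numbers only; in the policy these
quantities are derived from `ac_space`, `target_policy_noise`, and
`target_noise_clip`):

import numpy as np

# One-dimensional action space in [-1, 1]; noise scale and clip range are
# expressed as fractions of the half-range, mirroring the scaling above.
ac_low, ac_high = np.array([-1.0]), np.array([1.0])
ac_mag = 0.5 * (ac_high - ac_low)
noise_std = 0.2 * ac_mag    # hypothetical target_policy_noise = 0.2
noise_clip = 0.5 * ac_mag   # hypothetical target_noise_clip = 0.5

actor_target = np.array([0.95])  # made-up target-actor output

# Add clipped Gaussian noise, then clip the result back into the bounds.
noise = np.clip(np.random.normal(0.0, noise_std, actor_target.shape),
                -noise_clip, noise_clip)
noisy_actor_target = np.clip(actor_target + noise, ac_low, ac_high)
print(noisy_actor_target)
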
Example #5
class FeedForwardPolicy(ImitationLearningPolicy):
    """Fully-connected neural network imitation learning policy.

    Attributes
    ----------
    replay_buffer : hbaselines.fcnet.replay_buffer.ReplayBuffer
        the replay buffer
    action_ph : tf.compat.v1.placeholder
        placeholder for the actions
    obs_ph : tf.compat.v1.placeholder
        placeholder for the observations
    policy : tf.Variable
        the output from the imitation learning policy
    logp_ac : tf.Operation
        the operation that computes the log-probability of a given action. Only
        applies to stochastic policies.
    loss : tf.Operation
        the operation that computes the loss
    optimizer : tf.Operation
        the operation that updates the trainable parameters of the policy
    """
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 learning_rate,
                 verbose,
                 layer_norm,
                 layers,
                 act_fun,
                 use_huber,
                 stochastic,
                 scope=None):
        """Instantiate the policy object.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        learning_rate : float
            the learning rate for the policy
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        layer_norm : bool
            enable layer normalisation
        layers : list of int or None
            the size of the Neural network for the policy
        act_fun : tf.nn.*
            the activation function to use in the neural network
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            function. If set to False, the mean-squared error metric is used
            instead
        stochastic : bool
            specifies whether the policies are stochastic or deterministic
        scope : str
            an upper-level scope term. Used by policies that call this one.
        """
        super(FeedForwardPolicy, self).__init__(sess=sess,
                                                ob_space=ob_space,
                                                ac_space=ac_space,
                                                co_space=co_space,
                                                buffer_size=buffer_size,
                                                batch_size=batch_size,
                                                learning_rate=learning_rate,
                                                verbose=verbose,
                                                layer_norm=layer_norm,
                                                layers=layers,
                                                act_fun=act_fun,
                                                use_huber=use_huber,
                                                stochastic=stochastic)

        assert len(self.layers) >= 1, \
            "Error: must have at least one hidden layer for the policy."

        # Compute the shape of the input observation space, which may include
        # the contextual term.
        ob_dim = self._get_ob_dim(ob_space, co_space)

        # =================================================================== #
        # Step 1: Create a replay buffer object.                              #
        # =================================================================== #

        self.replay_buffer = ReplayBuffer(
            buffer_size=self.buffer_size,
            batch_size=self.batch_size,
            obs_dim=ob_dim[0],
            ac_dim=self.ac_space.shape[0],
        )

        # =================================================================== #
        # Step 2: Create input variables.                                     #
        # =================================================================== #

        with tf.compat.v1.variable_scope("input", reuse=False):
            self.action_ph = tf.compat.v1.placeholder(tf.float32,
                                                      shape=(None, ) +
                                                      ac_space.shape,
                                                      name='actions')
            self.obs_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, ) + ob_dim,
                                                   name='obs0')

        # =================================================================== #
        # Step 3: Create policy variables.                                    #
        # =================================================================== #

        self.policy = None
        self.logp_ac = None

        # Create networks and core TF parts that are shared across setup parts.
        with tf.compat.v1.variable_scope("model", reuse=False):
            if self.stochastic:
                self._setup_stochastic_policy(self.obs_ph, self.action_ph)
            else:
                self._setup_deterministic_policy(self.obs_ph)

        # =================================================================== #
        # Step 4: Setup the optimizer.                                        #
        # =================================================================== #

        self.loss = None
        self.optimizer = None

        with tf.compat.v1.variable_scope("Optimizer", reuse=False):
            if self.stochastic:
                self._setup_stochastic_optimizer(scope)
            else:
                self._setup_deterministic_optimizer(self.action_ph, scope)

        # =================================================================== #
        # Step 5: Setup the operations for computing model statistics.        #
        # =================================================================== #

        # Setup the running means and standard deviations of the model inputs
        # and outputs.
        self.stats_ops, self.stats_names = self._setup_stats(scope or "Model")

    def _setup_stochastic_policy(self, obs, action, reuse=False, scope="pi"):
        """Create the variables of a stochastic policy.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the policy
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            pi_h = obs

            # create the hidden layers
            for i, layer_size in enumerate(self.layers):
                pi_h = layer(pi_h,
                             layer_size,
                             'fc{}'.format(i),
                             act_fun=self.act_fun,
                             layer_norm=self.layer_norm)

            # create the output mean
            policy_mean = layer(
                pi_h,
                self.ac_space.shape[0],
                'mean',
                act_fun=None,
                kernel_initializer=tf.random_uniform_initializer(minval=-3e-3,
                                                                 maxval=3e-3))

            # create the output log_std
            log_std = layer(
                pi_h,
                self.ac_space.shape[0],
                'log_std',
                act_fun=None,
            )

        # OpenAI Variation to cap the standard deviation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        std = tf.exp(log_std)

        # Reparameterization trick
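        # a = mean + std * eps with eps ~ N(0, I), so gradients can flow
        # through policy_mean and log_std.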
        policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std
        logp_pi = gaussian_likelihood(policy, policy_mean, log_std)
        logp_ac = gaussian_likelihood(action, policy_mean, log_std)

        # Apply squashing and account for it in the probability
        _, _, logp_ac = apply_squashing_func(policy_mean, action, logp_ac)
        _, policy, _ = apply_squashing_func(policy_mean, policy, logp_pi)

        # Store the variables under their respective parameters.
        self.policy = policy
        self.logp_ac = logp_ac

    def _setup_stochastic_optimizer(self, scope):
        """Create the loss and optimizer of a stochastic policy."""
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up optimizer')
            print_params_shape(scope_name, "policy")

        # Define the loss function.
        self.loss = -tf.reduce_mean(self.logp_ac)

        # Create an optimizer object.
        optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

        # Create the optimizer operation.
        self.optimizer = optimizer.minimize(
            loss=self.loss, var_list=get_trainable_vars(scope_name))

    def _setup_deterministic_policy(self, obs, reuse=False, scope="pi"):
        """Create the variables of deterministic a policy.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the policy
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            pi_h = obs

            # create the hidden layers
            for i, layer_size in enumerate(self.layers):
                pi_h = layer(pi_h,
                             layer_size,
                             'fc{}'.format(i),
                             act_fun=self.act_fun,
                             layer_norm=self.layer_norm)

            # create the output layer
            policy = layer(pi_h,
                           self.ac_space.shape[0],
                           'output',
                           act_fun=tf.nn.tanh,
                           kernel_initializer=tf.random_uniform_initializer(
                               minval=-3e-3, maxval=3e-3))

            # scaling terms to the output from the policy
            ac_means = (self.ac_space.high + self.ac_space.low) / 2.
            ac_magnitudes = (self.ac_space.high - self.ac_space.low) / 2.

            policy = ac_means + ac_magnitudes * tf.compat.v1.to_float(policy)

        # Store the variables under their respective parameters.
        self.policy = policy

    def _setup_deterministic_optimizer(self, action, scope=None):
        """Create the loss and optimizer of a deterministic policy."""
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up optimizer')
            print_params_shape(scope_name, "policy")

        # Choose the loss function.
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        # Define the loss function.
        self.loss = loss_fn(action, self.policy)

        # Create an optimizer object.
        optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

        # Create the optimizer operation.
        self.optimizer = optimizer.minimize(
            loss=self.loss, var_list=get_trainable_vars(scope_name))

    def _setup_stats(self, base):
        """Create the running means and std of the model inputs and outputs.

        This method also adds the same running means and stds as scalars to
        tensorboard for additional storage.
        """
        ops = []
        names = []

        ops += [tf.reduce_mean(self.policy)]
        names += ['{}/reference_action_mean'.format(base)]
        ops += [reduce_std(self.policy)]
        names += ['{}/reference_action_std'.format(base)]

        ops += [tf.reduce_mean(self.loss)]
        names += ['{}/reference_loss_mean'.format(base)]
        ops += [reduce_std(self.loss)]
        names += ['{}/reference_loss_std'.format(base)]

        # Add all names and ops to the tensorboard summary.
        for op, name in zip(ops, names):
            tf.compat.v1.summary.scalar(name, op)

        return ops, names

    def update(self):
        """See parent class."""
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return 0

        # Get a batch.
        obs0, actions, _, _, _ = self.replay_buffer.sample()

        return self.update_from_batch(obs0, actions)

    def update_from_batch(self, obs0, actions):
        """Perform gradient update step given a batch of data.

        Parameters
        ----------
        obs0 : array_like
            batch of observations
        actions : array_like
            batch of actions executed given obs_batch

        Returns
        -------
        float
            policy loss
        """
        loss, *_ = self.sess.run([self.loss, self.optimizer],
                                 feed_dict={
                                     self.obs_ph: obs0,
                                     self.action_ph: actions,
                                 })

        return loss

    def get_action(self, obs, context):
        """See parent class."""
        # Add the contextual observation, if applicable.
        obs = self._get_obs(obs, context, axis=1)

        # Compute the action by the policy.
        action = self.sess.run(self.policy, {self.obs_ph: obs})

        if self.stochastic:
            # Scale the action by the action space of the environment.
            ac_means = 0.5 * (self.ac_space.high + self.ac_space.low)
            ac_magnitudes = 0.5 * (self.ac_space.high - self.ac_space.low)
            action = ac_magnitudes * action + ac_means

        return action

    def store_transition(self, obs0, context0, action, obs1, context1):
        """See parent class."""
        # Add the contextual observation, if applicable.
        obs0 = self._get_obs(obs0, context0, axis=0)
        obs1 = self._get_obs(obs1, context1, axis=0)
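        # Reward and done are unused for imitation learning, so placeholder
        # values (0 and False) are stored in the replay buffer below.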

        self.replay_buffer.add(obs0, action, 0, obs1, float(False))

    def get_td_map(self):
        """See parent class."""
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return {}

        # Get a batch.
        obs0, actions, _, _, _ = self.replay_buffer.sample()

        return self.get_td_map_from_batch(obs0, actions)

    def get_td_map_from_batch(self, obs0, actions):
        """Convert a batch to a td_map."""
        return {
            self.obs_ph: obs0,
            self.action_ph: actions,
        }
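
For intuition, the standalone NumPy sketch below mirrors the two imitation
losses defined above with made-up numbers (the tanh-squashing correction
applied by `apply_squashing_func` in the stochastic branch is omitted): the
deterministic branch regresses the policy output onto the expert action, while
the stochastic branch maximizes the Gaussian log-likelihood of the expert
action.

import numpy as np

expert_actions = np.array([[0.3], [-0.1]])

# Deterministic branch: regression loss (MSE shown; the policy above can also
# use the Huber loss when use_huber is True).
policy_out = np.array([[0.25], [-0.05]])
mse_loss = np.mean((expert_actions - policy_out) ** 2)

# Stochastic branch: negative mean Gaussian log-likelihood of expert actions.
mean = np.array([[0.2], [0.0]])
log_std = np.array([[-1.0], [-1.0]])
std = np.exp(log_std)
logp = -0.5 * (((expert_actions - mean) / std) ** 2
               + 2.0 * log_std + np.log(2.0 * np.pi))
nll_loss = -np.mean(np.sum(logp, axis=-1))
print(mse_loss, nll_loss)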