Example #1
    def setUp(self):
        self.replay_buffer = ReplayBuffer(buffer_size=2,
                                          batch_size=1,
                                          obs_dim=1,
                                          ac_dim=1)
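A minimal follow-up sketch of how a test could exercise this buffer, assuming numpy is imported as np and the class derives from unittest.TestCase. The add() and sample() calls, their argument names, and the shape checks are assumptions made for illustration; they are not taken from the example itself.

    def test_add_and_sample(self):
        # Fill the two-slot buffer with dummy transitions (hypothetical
        # add() signature, assumed for illustration).
        for i in range(2):
            self.replay_buffer.add(obs_t=np.array([float(i)]),
                                   action=np.array([0.]),
                                   reward=1.,
                                   obs_tp1=np.array([float(i + 1)]),
                                   done=False)

        # With batch_size=1, a sampled batch would be expected to have a
        # leading dimension of 1 (hypothetical sample() return layout).
        obs0, actions, rewards, obs1, done = self.replay_buffer.sample()
        self.assertEqual(obs0.shape, (1, 1))
        self.assertEqual(actions.shape, (1, 1))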
Example #2
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 layer_norm,
                 layers,
                 act_fun,
                 use_huber,
                 noise,
                 target_policy_noise,
                 target_noise_clip,
                 scope=None,
                 zero_fingerprint=False,
                 fingerprint_dim=2):
        """Instantiate the feed-forward neural network policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        layer_norm : bool
            enable layer normalisation
        layers : list of int or None
            the sizes of the hidden layers in the neural network for the policy
        act_fun : tf.nn.*
            the activation function to use in the neural network
        use_huber : bool
            specifies whether to use the Huber loss function for the critic. If
            set to False, the mean-squared error metric is used instead
        noise : float
            scaling term applied to the range of the action space, which is
            subsequently used as the standard deviation of the Gaussian noise
            added to the action if `apply_noise` is set to True in `get_action`
        target_policy_noise : float
            standard deviation of the noise added to the output of the target
            actor policy. See the TD3 paper for more.
        target_noise_clip : float
            clipping term for the noise injected in the target actor policy
        scope : str
            an upper-level scope term. Used by policies that call this one.
        zero_fingerprint : bool
            whether to zero the last two elements of the observations for the
            actor and critic computations. Used for the worker policy when
            fingerprints are being implemented.
        fingerprint_dim : int
            the number of fingerprint elements in the observation. Used when
            trying to zero the fingerprint elements.

        Raises
        ------
        AssertionError
            if `layers` is not a list with at least one element
        """
        super(FeedForwardPolicy, self).__init__(sess=sess,
                                                ob_space=ob_space,
                                                ac_space=ac_space,
                                                co_space=co_space,
                                                buffer_size=buffer_size,
                                                batch_size=batch_size,
                                                actor_lr=actor_lr,
                                                critic_lr=critic_lr,
                                                verbose=verbose,
                                                tau=tau,
                                                gamma=gamma,
                                                layer_norm=layer_norm,
                                                layers=layers,
                                                act_fun=act_fun,
                                                use_huber=use_huber)

        # action magnitudes
        ac_mag = 0.5 * (ac_space.high - ac_space.low)

        self.noise = noise * ac_mag
        self.target_policy_noise = np.array([ac_mag * target_policy_noise])
        self.target_noise_clip = np.array([ac_mag * target_noise_clip])
        self.zero_fingerprint = zero_fingerprint
        self.fingerprint_dim = fingerprint_dim
        assert len(self.layers) >= 1, \
            "Error: must have at least one hidden layer for the policy."

        # Compute the shape of the input observation space, which may include
        # the contextual term.
        ob_dim = self._get_ob_dim(ob_space, co_space)

        # =================================================================== #
        # Step 1: Create a replay buffer object.                              #
        # =================================================================== #

        self.replay_buffer = ReplayBuffer(
            buffer_size=self.buffer_size,
            batch_size=self.batch_size,
            obs_dim=ob_dim[0],
            ac_dim=self.ac_space.shape[0],
        )

        # =================================================================== #
        # Step 2: Create input variables.                                     #
        # =================================================================== #

        with tf.compat.v1.variable_scope("input", reuse=False):
            self.terminals1 = tf.compat.v1.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals1')
            self.rew_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, 1),
                                                   name='rewards')
            self.action_ph = tf.compat.v1.placeholder(tf.float32,
                                                      shape=(None, ) +
                                                      ac_space.shape,
                                                      name='actions')
            self.obs_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, ) + ob_dim,
                                                   name='obs0')
            self.obs1_ph = tf.compat.v1.placeholder(tf.float32,
                                                    shape=(None, ) + ob_dim,
                                                    name='obs1')

        # logging of rewards to tensorboard
        with tf.compat.v1.variable_scope("input_info", reuse=False):
            tf.compat.v1.summary.scalar('rewards', tf.reduce_mean(self.rew_ph))

        # =================================================================== #
        # Step 3: Create actor and critic variables.                          #
        # =================================================================== #

        # Create networks and core TF parts that are shared across setup parts.
        with tf.compat.v1.variable_scope("model", reuse=False):
            self.actor_tf = self.make_actor(self.obs_ph)
            self.critic_tf = [
                self.make_critic(self.obs_ph,
                                 self.action_ph,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]
            self.critic_with_actor_tf = [
                self.make_critic(self.obs_ph,
                                 self.actor_tf,
                                 reuse=True,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]

        with tf.compat.v1.variable_scope("target", reuse=False):
            # create the target actor policy
            actor_target = self.make_actor(self.obs1_ph)

            # smooth target policy by adding clipped noise to target actions
            target_noise = tf.random.normal(tf.shape(actor_target),
                                            stddev=self.target_policy_noise)
            target_noise = tf.clip_by_value(target_noise,
                                            -self.target_noise_clip,
                                            self.target_noise_clip)

            # clip the noisy action to remain in the bounds
            noisy_actor_target = tf.clip_by_value(actor_target + target_noise,
                                                  self.ac_space.low,
                                                  self.ac_space.high)

            # create the target critic policies
            critic_target = [
                self.make_critic(self.obs1_ph,
                                 noisy_actor_target,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]

        # Create the target update operations.
        init, soft = self._setup_target_updates('model', 'target', scope, tau,
                                                verbose)
        self.target_init_updates = init
        self.target_soft_updates = soft

        # =================================================================== #
        # Step 4: Setup the optimizers for the actor and critic.              #
        # =================================================================== #

        with tf.compat.v1.variable_scope("Optimizer", reuse=False):
            self._setup_actor_optimizer(scope)
            self._setup_critic_optimizer(critic_target, scope)
            tf.compat.v1.summary.scalar('actor_loss', self.actor_loss)
            tf.compat.v1.summary.scalar('Q1_loss', self.critic_loss[0])
            tf.compat.v1.summary.scalar('Q2_loss', self.critic_loss[1])

        # =================================================================== #
        # Step 5: Setup the operations for computing model statistics.        #
        # =================================================================== #

        # Setup the running means and standard deviations of the model inputs
        # and outputs.
        self.stats_ops, self.stats_names = self._setup_stats(scope or "Model")
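The "target" scope above implements TD3-style target policy smoothing. The following standalone NumPy sketch (written here purely for illustration, not part of the library) spells out the same clipped-noise computation outside the TensorFlow graph:

import numpy as np

def smooth_target_action(actor_target, noise_std, noise_clip, ac_low, ac_high):
    """Illustrative NumPy version of the clipped-noise smoothing above."""
    # Gaussian noise with the (action-magnitude-scaled) standard deviation,
    noise = np.random.normal(scale=noise_std, size=actor_target.shape)
    # clipped to the configured bound,
    noise = np.clip(noise, -noise_clip, noise_clip)
    # added to the target action, and clipped back into the action bounds.
    return np.clip(actor_target + noise, ac_low, ac_high)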
Example #3
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 use_huber,
                 l2_penalty,
                 model_params,
                 noise,
                 target_policy_noise,
                 target_noise_clip,
                 scope=None,
                 num_envs=1):
        """Instantiate the feed-forward neural network policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        use_huber : bool
            specifies whether to use the Huber loss function for the critic. If
            set to False, the mean-squared error metric is used instead
        l2_penalty : float
            L2 regularization penalty. This is applied to the policy network.
        model_params : dict
            dictionary of model-specific parameters. See parent class.
        noise : float
            scaling term applied to the range of the action space, which is
            subsequently used as the standard deviation of the Gaussian noise
            added to the action if `apply_noise` is set to True in `get_action`
        target_policy_noise : float
            standard deviation of the noise added to the output of the target
            actor policy. See the TD3 paper for more.
        target_noise_clip : float
            clipping term for the noise injected in the target actor policy
        scope : str
            an upper-level scope term. Used by policies that call this one.
        num_envs : int
            number of environments used to run simulations in parallel
        """
        super(FeedForwardPolicy, self).__init__(
            sess=sess,
            ob_space=ob_space,
            ac_space=ac_space,
            co_space=co_space,
            buffer_size=buffer_size,
            batch_size=batch_size,
            actor_lr=actor_lr,
            critic_lr=critic_lr,
            verbose=verbose,
            tau=tau,
            gamma=gamma,
            use_huber=use_huber,
            l2_penalty=l2_penalty,
            model_params=model_params,
            num_envs=num_envs,
        )

        # action magnitudes
        ac_mag = 0.5 * (ac_space.high - ac_space.low)

        self.noise = noise * ac_mag
        self.target_policy_noise = np.array([ac_mag * target_policy_noise])
        self.target_noise_clip = np.array([ac_mag * target_noise_clip])

        # Compute the shape of the input observation space, which may include
        # the contextual term.
        ob_dim = self._get_ob_dim(ob_space, co_space)

        # =================================================================== #
        # Step 1: Create a replay buffer object.                              #
        # =================================================================== #

        self.replay_buffer = ReplayBuffer(
            buffer_size=self.buffer_size,
            batch_size=self.batch_size,
            obs_dim=ob_dim[0],
            ac_dim=self.ac_space.shape[0],
        )

        # =================================================================== #
        # Step 2: Create input variables.                                     #
        # =================================================================== #

        with tf.compat.v1.variable_scope("input", reuse=False):
            self.terminals1 = tf.compat.v1.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals1')
            self.rew_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, 1),
                                                   name='rewards')
            self.action_ph = tf.compat.v1.placeholder(tf.float32,
                                                      shape=(None, ) +
                                                      ac_space.shape,
                                                      name='actions')
            self.obs_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, ) + ob_dim,
                                                   name='obs0')
            self.obs1_ph = tf.compat.v1.placeholder(tf.float32,
                                                    shape=(None, ) + ob_dim,
                                                    name='obs1')

        # =================================================================== #
        # Step 3: Create actor and critic variables.                          #
        # =================================================================== #

        # Create networks and core TF parts that are shared across setup parts.
        with tf.compat.v1.variable_scope("model", reuse=False):
            self.actor_tf = self.make_actor(self.obs_ph)
            self.critic_tf = [
                self.make_critic(self.obs_ph,
                                 self.action_ph,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]
            self.critic_with_actor_tf = [
                self.make_critic(self.obs_ph,
                                 self.actor_tf,
                                 reuse=True,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]

        with tf.compat.v1.variable_scope("target", reuse=False):
            # create the target actor policy
            actor_target = self.make_actor(self.obs1_ph)

            # smooth target policy by adding clipped noise to target actions
            target_noise = tf.random.normal(tf.shape(actor_target),
                                            stddev=self.target_policy_noise)
            target_noise = tf.clip_by_value(target_noise,
                                            -self.target_noise_clip,
                                            self.target_noise_clip)

            # clip the noisy action to remain in the bounds
            noisy_actor_target = tf.clip_by_value(actor_target + target_noise,
                                                  self.ac_space.low,
                                                  self.ac_space.high)

            # create the target critic policies
            critic_target = [
                self.make_critic(self.obs1_ph,
                                 noisy_actor_target,
                                 scope="qf_{}".format(i)) for i in range(2)
            ]

        # Create the target update operations.
        init, soft = self._setup_target_updates('model', 'target', scope, tau,
                                                verbose)
        self.target_init_updates = init
        self.target_soft_updates = soft

        # =================================================================== #
        # Step 4: Setup the optimizers for the actor and critic.              #
        # =================================================================== #

        with tf.compat.v1.variable_scope("Optimizer", reuse=False):
            self._setup_actor_optimizer(scope)
            self._setup_critic_optimizer(critic_target, scope)

        # =================================================================== #
        # Step 5: Setup the operations for computing model statistics.        #
        # =================================================================== #

        # Setup the running means and standard deviations of the model inputs
        # and outputs.
        self.stats_ops, self.stats_names = self._setup_stats(scope or "Model")
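The noise terms above are all scaled by half the width of the action space (ac_mag). A small numeric illustration of that scaling, using arbitrarily chosen values:

import numpy as np

# Suppose the action space is Box(low=-2, high=2, shape=(1,)) and the chosen
# scales are noise=0.1, target_policy_noise=0.2, target_noise_clip=0.5
# (arbitrary values, for illustration only).
low, high = np.array([-2.]), np.array([2.])
ac_mag = 0.5 * (high - low)           # -> [2.]

exploration_std = 0.1 * ac_mag        # self.noise               -> [0.2]
smoothing_std = ac_mag * 0.2          # self.target_policy_noise -> [0.4]
clip_bound = ac_mag * 0.5             # self.target_noise_clip   -> [1.0]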
Example #4
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 use_huber,
                 l2_penalty,
                 model_params,
                 target_entropy,
                 scope=None,
                 num_envs=1):
        """Instantiate the feed-forward neural network policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        use_huber : bool
            specifies whether to use the Huber loss function for the critic. If
            set to False, the mean-squared error metric is used instead
        l2_penalty : float
            L2 regularization penalty. This is applied to the policy network.
        model_params : dict
            dictionary of model-specific parameters. See parent class.
        target_entropy : float
            target entropy used when learning the entropy coefficient. If set
            to None, a heuristic value is used.
        scope : str
            an upper-level scope term. Used by policies that call this one.
        num_envs : int
            number of environments used to run simulations in parallel
        """
        super(FeedForwardPolicy, self).__init__(
            sess=sess,
            ob_space=ob_space,
            ac_space=ac_space,
            co_space=co_space,
            buffer_size=buffer_size,
            batch_size=batch_size,
            actor_lr=actor_lr,
            critic_lr=critic_lr,
            verbose=verbose,
            tau=tau,
            gamma=gamma,
            use_huber=use_huber,
            l2_penalty=l2_penalty,
            model_params=model_params,
            num_envs=num_envs,
        )

        if target_entropy is None:
            self.target_entropy = -np.prod(self.ac_space.shape)
        else:
            self.target_entropy = target_entropy

        self._ac_means = 0.5 * (ac_space.high + ac_space.low)
        self._ac_magnitudes = 0.5 * (ac_space.high - ac_space.low)

        # Compute the shape of the input observation space, which may include
        # the contextual term.
        ob_dim = self._get_ob_dim(ob_space, co_space)

        # =================================================================== #
        # Step 1: Create a replay buffer object.                              #
        # =================================================================== #

        self.replay_buffer = ReplayBuffer(
            buffer_size=self.buffer_size,
            batch_size=self.batch_size,
            obs_dim=ob_dim[0],
            ac_dim=self.ac_space.shape[0],
        )

        # =================================================================== #
        # Step 2: Create input variables.                                     #
        # =================================================================== #

        with tf.compat.v1.variable_scope("input", reuse=False):
            self.terminals1 = tf.compat.v1.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals1')
            self.rew_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, 1),
                                                   name='rewards')
            self.action_ph = tf.compat.v1.placeholder(tf.float32,
                                                      shape=(None, ) +
                                                      ac_space.shape,
                                                      name='actions')
            self.obs_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, ) + ob_dim,
                                                   name='obs0')
            self.obs1_ph = tf.compat.v1.placeholder(tf.float32,
                                                    shape=(None, ) + ob_dim,
                                                    name='obs1')

        # =================================================================== #
        # Step 3: Create actor and critic variables.                          #
        # =================================================================== #

        # Create networks and core TF parts that are shared across setup parts.
        with tf.compat.v1.variable_scope("model", reuse=False):
            self.deterministic_action, self.policy_out, self.logp_pi, \
                self.logp_action = self.make_actor(self.obs_ph, self.action_ph)
            self.qf1, self.qf2, self.value_fn = self.make_critic(
                self.obs_ph, self.action_ph, create_qf=True, create_vf=True)
            self.qf1_pi, self.qf2_pi, _ = self.make_critic(self.obs_ph,
                                                           self.policy_out,
                                                           create_qf=True,
                                                           create_vf=False,
                                                           reuse=True)

            # The entropy coefficient or entropy can be learned automatically,
            # see Automating Entropy Adjustment for Maximum Entropy RL section
            # of https://arxiv.org/abs/1812.05905
            self.log_alpha = tf.compat.v1.get_variable('log_alpha',
                                                       dtype=tf.float32,
                                                       initializer=0.0)
            self.alpha = tf.exp(self.log_alpha)

        with tf.compat.v1.variable_scope("target", reuse=False):
            # Create the value network
            _, _, value_target = self.make_critic(self.obs1_ph,
                                                  create_qf=False,
                                                  create_vf=True)
            self.value_target = value_target

        # Create the target update operations.
        init, soft = self._setup_target_updates('model/value_fns/vf',
                                                'target/value_fns/vf', scope,
                                                tau, verbose)
        self.target_init_updates = init
        self.target_soft_updates = soft

        # =================================================================== #
        # Step 4: Setup the optimizers for the actor and critic.              #
        # =================================================================== #

        with tf.compat.v1.variable_scope("Optimizer", reuse=False):
            self._setup_actor_optimizer(scope)
            self._setup_critic_optimizer(scope)

        # =================================================================== #
        # Step 5: Setup the operations for computing model statistics.        #
        # =================================================================== #

        # Setup the running means and standard deviations of the model inputs
        # and outputs.
        self.stats_ops, self.stats_names = self._setup_stats(scope or "Model")
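When target_entropy is None, the constructor falls back to the common SAC heuristic of minus the action dimensionality, and the entropy coefficient is kept positive by training log_alpha and exponentiating it. A short numeric illustration (values chosen arbitrarily):

import numpy as np

# Heuristic target entropy for a 3-dimensional action space.
ac_shape = (3,)
target_entropy = -np.prod(ac_shape)   # -> -3

# alpha is recovered from the trainable log_alpha, mirroring
# tf.exp(self.log_alpha) in the "model" scope above.
log_alpha = 0.0
alpha = np.exp(log_alpha)             # -> 1.0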
Example #5
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 learning_rate,
                 verbose,
                 layer_norm,
                 layers,
                 act_fun,
                 use_huber,
                 stochastic,
                 scope=None):
        """Instantiate the policy object.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        learning_rate : float
            the learning rate for the policy
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        layer_norm : bool
            enable layer normalisation
        layers : list of int or None
            the sizes of the hidden layers in the neural network for the policy
        act_fun : tf.nn.*
            the activation function to use in the neural network
        use_huber : bool
            specifies whether to use the Huber loss function. If set to False,
            the mean-squared error metric is used instead
        stochastic : bool
            specifies whether the policies are stochastic or deterministic
        scope : str
            an upper-level scope term. Used by policies that call this one.
        """
        super(FeedForwardPolicy, self).__init__(sess=sess,
                                                ob_space=ob_space,
                                                ac_space=ac_space,
                                                co_space=co_space,
                                                buffer_size=buffer_size,
                                                batch_size=batch_size,
                                                learning_rate=learning_rate,
                                                verbose=verbose,
                                                layer_norm=layer_norm,
                                                layers=layers,
                                                act_fun=act_fun,
                                                use_huber=use_huber,
                                                stochastic=stochastic)

        assert len(self.layers) >= 1, \
            "Error: must have at least one hidden layer for the policy."

        # Compute the shape of the input observation space, which may include
        # the contextual term.
        ob_dim = self._get_ob_dim(ob_space, co_space)

        # =================================================================== #
        # Step 1: Create a replay buffer object.                              #
        # =================================================================== #

        self.replay_buffer = ReplayBuffer(
            buffer_size=self.buffer_size,
            batch_size=self.batch_size,
            obs_dim=ob_dim[0],
            ac_dim=self.ac_space.shape[0],
        )

        # =================================================================== #
        # Step 2: Create input variables.                                     #
        # =================================================================== #

        with tf.compat.v1.variable_scope("input", reuse=False):
            self.action_ph = tf.compat.v1.placeholder(tf.float32,
                                                      shape=(None, ) +
                                                      ac_space.shape,
                                                      name='actions')
            self.obs_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, ) + ob_dim,
                                                   name='obs0')

        # =================================================================== #
        # Step 3: Create policy variables.                                    #
        # =================================================================== #

        self.policy = None
        self.logp_ac = None

        # Create networks and core TF parts that are shared across setup parts.
        with tf.compat.v1.variable_scope("model", reuse=False):
            if self.stochastic:
                self._setup_stochastic_policy(self.obs_ph, self.action_ph)
            else:
                self._setup_deterministic_policy(self.obs_ph)

        # =================================================================== #
        # Step 4: Setup the optimizer.                                        #
        # =================================================================== #

        self.loss = None
        self.optimizer = None

        with tf.compat.v1.variable_scope("Optimizer", reuse=False):
            if self.stochastic:
                self._setup_stochastic_optimizer(scope)
            else:
                self._setup_deterministic_optimizer(self.action_ph, scope)

        # =================================================================== #
        # Step 5: Setup the operations for computing model statistics.        #
        # =================================================================== #

        # Setup the running means and standard deviations of the model inputs
        # and outputs.
        self.stats_ops, self.stats_names = self._setup_stats(scope or "Model")
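_get_ob_dim is inherited from the parent class and is not shown in these examples. A plausible minimal sketch of what such a helper computes, inferred from how ob_dim is used above (this is an assumption, not the library's actual implementation):

def _get_ob_dim_sketch(ob_space, co_space):
    """Hypothetical re-implementation, for illustration only.

    Returns the observation shape, extended by the context dimension when a
    context space is provided, so that (None,) + ob_dim can serve as the
    placeholder shape for contextual observations.
    """
    ob_dim = ob_space.shape
    if co_space is not None:
        ob_dim = (ob_dim[0] + co_space.shape[0],)
    return ob_dim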