Example #1
    def actions_for(self,
                    observations,
                    latents=None,
                    name=None,
                    reuse=tf.AUTO_REUSE,
                    with_log_pis=False,
                    regularize=False):
        name = name or self.name

        with tf.variable_scope(name, reuse=reuse):
            self._distribution = Normal(
                hidden_layers_sizes=self._hidden_layers,
                Dx=self._Da,
                reparameterize=self._reparameterize,
                cond_t_lst=(observations, ),
                reg=self._reg)
        raw_actions = self._distribution.x_t
        actions = tf.tanh(raw_actions) if self._squash else raw_actions

        # TODO: should always return same shape out
        # Figure out how to make the interface for `log_pis` cleaner
        if with_log_pis:
            # TODO.code_consolidation: should come from log_pis_for
            log_pis = self._distribution.log_p_t
            if self._squash:
                log_pis -= self._squash_correction(raw_actions)
            return actions, log_pis

        return actions
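For context, the tensors returned by `actions_for` are typically consumed when wiring up a policy objective. A minimal, hypothetical usage sketch (the `policy`, `q_function`, `alpha`, and `observations_ph` names are illustrative assumptions, not part of the example above):

# Hypothetical sketch: build action/log-prob tensors on an observation
# placeholder, then form a SAC-style reparameterized policy objective.
actions, log_pis = policy.actions_for(observations_ph, with_log_pis=True)
policy_loss = tf.reduce_mean(alpha * log_pis - q_function(observations_ph, actions))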
Example #2
    def build(self):
        self._observations_ph = tf.placeholder(
            dtype=tf.float32,
            shape=(None, self._Ds),
            name='observations',
        )
        self._raw_actions_ph = tf.placeholder(
            dtype=tf.float32,
            shape=(None, self._Da),
            name='raw_actions',
        )

        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            self.distribution = Normal(
                hidden_layers_sizes=self._hidden_layers,
                Dx=self._Da,
                reparameterize=self._reparameterize,
                cond_t_lst=(self._observations_ph, ),
                reg=self._reg,
            )

        self._raw_actions = tf.stop_gradient(self.distribution.x_t)
        self._actions = (tf.tanh(self._raw_actions)
                         if self._squash else self._raw_actions)
        self._log_pis = self.log_pis_for(self._observations_ph,
                                         self._raw_actions_ph)
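Example #2 wraps the sampled `x_t` in `tf.stop_gradient`, so running `self._actions` for rollouts does not backpropagate into the network through the sampling path; the trainable `log_pis` come from `log_pis_for` and the separate `raw_actions` placeholder instead. A self-contained TF 1.x illustration of what `tf.stop_gradient` does:

import tensorflow as tf

x = tf.Variable(2.0)
y = tf.stop_gradient(x * x)     # treated as a constant during backprop
z = x * x                       # gradient flows normally
grad, = tf.gradients(y + z, x)  # only z contributes: d(x^2)/dx = 2x

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(grad))  # 4.0, not 8.0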
Example #3
    def get_distribution_for(self,
                             obs_pl: tf.Tensor,
                             reuse=tf.AUTO_REUSE) -> Normal:
        with tf.variable_scope(self.name, reuse=reuse):
            return Normal(hidden_layers_sizes=self._hidden_layers,
                          Dx=self._Da,
                          reparameterize=self._reparameterize,
                          cond_t_lst=[obs_pl],
                          reg=self._reg)
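These helpers construct a new `Normal` head on every call; `reuse=tf.AUTO_REUSE` is what makes that safe, because variables inside the named scope are created on the first call and reused on all later ones. A minimal standalone TF 1.x demonstration of the mechanism:

import tensorflow as tf

def linear(x):
    with tf.variable_scope('policy', reuse=tf.AUTO_REUSE):
        # Created on the first call, reused on every later call.
        w = tf.get_variable('w', shape=(3, 2))
    return tf.matmul(x, w)

a = linear(tf.zeros((1, 3)))
b = linear(tf.ones((1, 3)))
print(len(tf.trainable_variables()))  # 1 -- both calls share 'policy/w'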
Example #4
    def log_pis_for(self,
                    observations,
                    raw_actions,
                    name=None,
                    reuse=tf.AUTO_REUSE):
        name = name or self.name

        with tf.variable_scope(name, reuse=reuse):
            distribution = Normal(hidden_layers_sizes=self._hidden_layers,
                                  Dx=self._Da,
                                  reparameterize=self._reparameterize,
                                  cond_t_lst=(observations, ),
                                  reg=self._reg)

        log_pis = distribution.log_prob(raw_actions)
        if self._squash:
            log_pis -= self._squash_correction(raw_actions)

        return log_pis
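The subtraction in `log_pis_for` is the change-of-variables correction for the tanh squash. If a raw sample u from the Gaussian mu(u|s) is squashed as a = tanh(u), the density picks up the Jacobian of tanh:

    log pi(a|s) = log mu(u|s) - sum_i log(1 - tanh(u_i)^2)

which is exactly the quantity `_squash_correction` in Example #8 computes (its EPS term only guards against log(0) when tanh saturates).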
Example #5
    def build(self):
        self._observations_ph = tf.placeholder(
            dtype=tf.float32,
            shape=(None, self._Ds),
            name='observations',
        )

        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            self.distribution = Normal(hidden_layers_sizes=self._hidden_layers,
                                       Dx=self._Da,
                                       reparameterize=self._reparameterize,
                                       cond_t_lst=(self._observations_ph, ),
                                       reg=self._reg,
                                       todropoutpi=self.todropoutpi,
                                       dropoutpi=self.dropoutpi,
                                       batchnormpi=self.batchnormpi,
                                       isbnpitrainmode=self.isbnpitrainmode)

        raw_actions = tf.stop_gradient(self.distribution.x_t)
        self._actions = tf.tanh(raw_actions) if self._squash else raw_actions
        if self.todropoutpi:
            self.dropoutpi_placeholder = self.distribution.dropoutpi_placeholder
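Example #5 threads custom dropout and batch-norm switches into this particular `Normal` implementation, and exposes a `dropoutpi_placeholder` so the dropout rate can be fed at run time. Those keyword arguments are specific to that fork; a generic TF 1.x sketch of the feed-controlled dropout pattern they suggest (all names here illustrative, not the `Normal` internals):

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=(None, 4))
# Defaults to keep_prob=1.0 (no dropout) unless a value is fed at run time.
keep_prob = tf.placeholder_with_default(1.0, shape=(), name='dropout_keep')
h = tf.layers.dense(x, 32, activation=tf.nn.relu)
h = tf.nn.dropout(h, keep_prob=keep_prob)  # feed e.g. 0.9 during training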
Example #6
    def distribution_for(self, observations, latents=None,
                         name=None, reuse=tf.AUTO_REUSE,
                         with_log_pis=False, regularize=False):
        """
        CG: the function to obtain the normal distribution parameters for the Gaussian policy.
        :param observations:
        :param latents:
        :param name:
        :param reuse:
        :param with_log_pis:
        :param regularize:
        :return:
        """
        name = name or self.name
        with tf.variable_scope(name, reuse=reuse):
            distribution = Normal(
                hidden_layers_sizes=self._hidden_layers,
                Dx=self._Da,
                reparameterize=self._reparameterize,
                cond_t_lst=(observations,),
                reg=self._reg,
            )
        return distribution._mu_t, distribution._log_sig_t
Example #7
    def actions_for_rac(self, observations, latents=None,
                        name=None, reuse=tf.AUTO_REUSE,
                        with_log_pis=False, regularize=False):
        """
        CG: the function specifically used to work with RAC.
        :param observations:
        :param latents:
        :param name:
        :param reuse:
        :param with_log_pis:
        :param regularize:
        :return:
        """
        name = name or self.name

        with tf.variable_scope(name, reuse=reuse):
            distribution = Normal(
                hidden_layers_sizes=self._hidden_layers,
                Dx=self._Da,
                reparameterize=self._reparameterize,
                cond_t_lst=(observations,),
                reg=self._reg
            )
        raw_actions = distribution.x_t
        actions = tf.tanh(raw_actions) if self._squash else raw_actions

        # TODO: should always return same shape out
        # Figure out how to make the interface for `log_pis` cleaner
        if with_log_pis:
            # TODO.code_consolidation: should come from log_pis_for
            log_pis_raw = distribution.log_p_t
            log_pis = log_pis_raw
            if self._squash:
                log_pis -= self._squash_correction(raw_actions)
            return actions, log_pis, raw_actions, log_pis_raw

        return actions, raw_actions
Example #8
class GaussianPolicy(NNPolicy, Serializable):
    def __init__(self,
                 env_spec,
                 hidden_layer_sizes=(100, 100),
                 reg=1e-3,
                 squash=True,
                 reparameterize=True,
                 name='gaussian_policy'):
        """
        Args:
            env_spec (`rllab.EnvSpec`): Specification of the environment
                to create the policy for.
            hidden_layer_sizes (`list` of `int`): Sizes for the Multilayer
                perceptron hidden layers.
            reg (`float`): Regularization coeffiecient for the Gaussian parameters.
            squash (`bool`): If True, squash the Gaussian the gmm action samples
               between -1 and 1 with tanh.
            reparameterize ('bool'): If True, gradients will flow directly through
                the action samples.
        """
        Serializable.quick_init(self, locals())

        self._hidden_layers = hidden_layer_sizes
        self._Da = env_spec.action_space.flat_dim
        self._Ds = env_spec.observation_space.flat_dim
        self._is_deterministic = False
        self._fixed_h = None
        self._squash = squash
        self._reparameterize = reparameterize
        self._reg = reg

        self.name = name
        self.build()

        self._scope_name = (tf.get_variable_scope().name + "/" +
                            name).lstrip("/")

        super(NNPolicy, self).__init__(env_spec)

    def actions_for(self,
                    observations,
                    latents=None,
                    name=None,
                    reuse=tf.AUTO_REUSE,
                    with_log_pis=False,
                    regularize=False):
        name = name or self.name

        with tf.variable_scope(name, reuse=reuse):
            self._distribution = Normal(
                hidden_layers_sizes=self._hidden_layers,
                Dx=self._Da,
                reparameterize=self._reparameterize,
                cond_t_lst=(observations, ),
                reg=self._reg)
        raw_actions = self._distribution.x_t
        actions = tf.tanh(raw_actions) if self._squash else raw_actions

        # TODO: should always return same shape out
        # Figure out how to make the interface for `log_pis` cleaner
        if with_log_pis:
            # TODO.code_consolidation: should come from log_pis_for
            log_pis = self._distribution.log_p_t
            if self._squash:
                log_pis -= self._squash_correction(raw_actions)
            return actions, log_pis

        return actions

    def log_pis_for(self, actions):
        if self._squash:
            # Nudge actions off the +/-1 boundary so atanh stays finite,
            # then undo the tanh squash before evaluating the Gaussian.
            raw_actions = tf.atanh(actions - tf.sign(actions) * 1e-3)
            log_pis = self._distribution.log_p_action(raw_actions)
            log_pis -= self._squash_correction(raw_actions)
            return log_pis
        return self._distribution.log_p_action(actions)

    def log_pis_for_raw(self, raw_actions):
        if self._squash:
            log_pis = self._distribution.log_p_action(raw_actions)
            log_pis -= self._squash_correction(raw_actions)
            return log_pis
        return self._distribution.log_p_action(raw_actions)

    def build(self):
        self._observations_ph = tf.placeholder(
            dtype=tf.float32,
            shape=(None, self._Ds),
            name='observations',
        )

        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            self.distribution = Normal(
                hidden_layers_sizes=self._hidden_layers,
                Dx=self._Da,
                reparameterize=self._reparameterize,
                cond_t_lst=(self._observations_ph, ),
                reg=self._reg,
            )

        self._ractions = raw_actions = tf.stop_gradient(self.distribution.x_t)
        self._actions = tf.tanh(raw_actions) if self._squash else raw_actions

    @overrides
    def get_actions(self, observations):
        """Sample actions based on the observations.

        If `self._is_deterministic` is True, returns the mean action for the
        observations. If False, returns stochastically sampled actions.

        TODO.code_consolidation: This should be somewhat similar with
        `LatentSpacePolicy.get_actions`.
        """
        if self._is_deterministic:  # Handle the deterministic case separately.

            feed_dict = {self._observations_ph: observations}

            # TODO.code_consolidation: these shapes should be double checked
            # for case where `observations.shape[0] > 1`
            mu = tf.get_default_session().run(self.distribution.mu_t,
                                              feed_dict)  # 1 x Da
            if self._squash:
                mu = np.tanh(mu)

            return mu

        return super(GaussianPolicy, self).get_actions(observations)

    def get_action_with_raw(self, observation):
        """Sample actions based on the observations.

        If `self._is_deterministic` is True, returns the mean action for the
        observations. If False, return stochastically sampled action.

        TODO.code_consolidation: This should be somewhat similar with
        `LatentSpacePolicy.get_actions`.
        """
        feed_dict = {self._observations_ph: observation[None]}
        actions, raw_actions = tf.get_default_session().run(
            [self._actions, self._ractions], feed_dict)
        return actions[0], raw_actions[0]

    def _squash_correction(self, actions):
        if not self._squash: return 0
        # Jacobian term of the tanh squash; EPS keeps the log finite
        # when tanh saturates at +/-1.
        return tf.reduce_sum(tf.log(1 - tf.tanh(actions)**2 + EPS), axis=1)

    @contextmanager
    def deterministic(self, set_deterministic=True, latent=None):
        """Context manager for changing the determinism of the policy.

        See `self.get_action` for further information about the effect of
        self._is_deterministic.

        Args:
            set_deterministic (`bool`): Value to set the self._is_deterministic
                to during the context. The value will be reset back to the
                previous value when the context exits.
            latent (`Number`): Value to set the latent variable to over the
                deterministic context.
        """
        was_deterministic = self._is_deterministic
        self._is_deterministic = set_deterministic
        try:
            yield
        finally:
            # Restore the previous mode even if the context body raises.
            self._is_deterministic = was_deterministic

    def log_diagnostics(self, iteration, batch):
        """Record diagnostic information to the logger.

        Records the mean, min, max, and standard deviation of the GMM
        means, component weights, and covariances.
        """

        feeds = {self._observations_ph: batch['observations']}
        sess = tf_utils.get_default_session()
        mu, log_sig, log_pi = sess.run((
            self.distribution.mu_t,
            self.distribution.log_sig_t,
            self.distribution.log_p_t,
        ), feeds)

        logger.record_tabular('policy-mus-mean', np.mean(mu))
        logger.record_tabular('policy-mus-min', np.min(mu))
        logger.record_tabular('policy-mus-max', np.max(mu))
        logger.record_tabular('policy-mus-std', np.std(mu))
        logger.record_tabular('log-sigs-mean', np.mean(log_sig))
        logger.record_tabular('log-sigs-min', np.min(log_sig))
        logger.record_tabular('log-sigs-max', np.max(log_sig))
        logger.record_tabular('log-sigs-std', np.std(log_sig))
        logger.record_tabular('log-pi-mean', np.mean(log_pi))
        logger.record_tabular('log-pi-max', np.max(log_pi))
        logger.record_tabular('log-pi-min', np.min(log_pi))
        logger.record_tabular('log-pi-std', np.std(log_pi))
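As a sanity check of the tanh correction used by `_squash_correction` above, a small NumPy/SciPy sketch (independent of the TF graph) compares the corrected log-density of a squashed scalar Gaussian sample against a finite-difference estimate obtained through the CDF:

import numpy as np
from scipy.stats import norm

m, s = 0.3, 0.8          # pre-squash Gaussian parameters
u = 0.5                  # raw action
a = np.tanh(u)           # squashed action

# Change of variables: log p(a) = log N(u; m, s) - log(1 - tanh(u)^2)
log_p = norm.logpdf(u, m, s) - np.log(1.0 - np.tanh(u) ** 2)

# Finite-difference check via the CDF: P(A <= a) = P(U <= atanh(a))
eps = 1e-6
cdf = lambda t: norm.cdf(np.arctanh(t), m, s)
log_p_fd = np.log((cdf(a + eps) - cdf(a - eps)) / (2.0 * eps))

print(log_p, log_p_fd)   # agree to ~6 decimal places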