Example #1
    def __init__(self,
                 embedding_spec,
                 name='GaussianMLPEncoder',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        super().__init__(name)
        self._embedding_spec = embedding_spec
        self._latent_dim = embedding_spec.output_space.flat_dim
        self._input_dim = embedding_spec.input_space.flat_dim
        self._dist = None
        self._f_dist = None

        self.model = GaussianMLPModel(
            output_dim=self._latent_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')
Example #2
    def __init__(self,
                 env_spec,
                 name='GaussianMLPPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        assert isinstance(env_spec.action_space, akro.Box)
        super().__init__(name, env_spec)
        self.obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim

        self.model = GaussianMLPModel(
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

        self._initialize()
Example #3
    def test_softplus_min_std(self, output_dim, hidden_sizes):
        model = GaussianMLPModel(output_dim=output_dim,
                                 hidden_sizes=hidden_sizes,
                                 std_share_network=False,
                                 init_std=1,
                                 min_std=10,
                                 std_parameterization='softplus')
        dist = model.build(self.input_var).dist

        log_std = self.sess.run(tf.math.log(dist.stddev()),
                                feed_dict={self.input_var: self.obs})

        expected_log_std = np.full([1, 1, output_dim], np.log(10))

        assert np.allclose(log_std, expected_log_std)
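
The expected value above falls out of the softplus std pipeline: init_std is converted to a raw parameter via the inverse softplus, the parameter is clamped so the resulting std never drops below min_std, and the distribution's stddev is the softplus of the clamped parameter. A minimal NumPy sketch of that arithmetic, assuming this clamping scheme (the helper names are illustrative, not garage internals):

import numpy as np

def softplus(x):
    # softplus(x) = log(1 + exp(x))
    return np.log(1.0 + np.exp(x))

def softplus_inv(y):
    # inverse of softplus, defined for y > 0
    return np.log(np.exp(y) - 1.0)

init_std, min_std = 1.0, 10.0
std_param = softplus_inv(init_std)                 # raw parameter at init
std_param = max(std_param, softplus_inv(min_std))  # assumed min_std clamp
log_std = np.log(softplus(std_param))
assert np.isclose(log_std, np.log(10.0))           # the test's expected_log_std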
Example #4
    def test_without_std_share_network_shapes(self, output_dim, hidden_sizes):
        model = GaussianMLPModel(output_dim=output_dim,
                                 hidden_sizes=hidden_sizes,
                                 std_share_network=False,
                                 adaptive_std=False)
        model.build(self.input_var)
        with tf.variable_scope(model.name, reuse=True):
            mean_output_weights = tf.get_variable(
                'dist_params/mean_network/output/kernel')
            mean_output_bias = tf.get_variable(
                'dist_params/mean_network/output/bias')
            log_std_output_weights = tf.get_variable(
                'dist_params/log_std_network/parameter')
        assert mean_output_weights.shape[1] == output_dim
        assert mean_output_bias.shape == output_dim
        assert log_std_output_weights.shape == output_dim
Example #5
    def test_softplus_max_std(self, output_dim, hidden_sizes):
        model = GaussianMLPModel(output_dim=output_dim,
                                 hidden_sizes=hidden_sizes,
                                 std_share_network=False,
                                 init_std=10,
                                 max_std=1,
                                 std_parameterization='softplus')
        dist = model.build(self.input_var).dist

        log_std = self.sess.run(tf.math.log(dist.stddev()),
                                feed_dict={self.input_var: self.obs})

        expected_log_std = np.full([1, 1, output_dim], np.log(1))

        # This test fails just outside np.allclose's default absolute
        # tolerance (atol=1e-8), so loosen it slightly.
        assert np.allclose(log_std, expected_log_std, atol=1e-7)
Example #6
    def test_exp_max_std(self, output_dim, hidden_sizes):
        model = GaussianMLPModel(output_dim=output_dim,
                                 hidden_sizes=hidden_sizes,
                                 std_share_network=False,
                                 init_std=10,
                                 max_std=1,
                                 std_parameterization='exp')
        outputs = model.build(self.input_var)

        action, mean, log_std, std_param = self.sess.run(
            outputs[:-1], feed_dict={self.input_var: self.obs})

        expected_log_std = np.full([1, output_dim], np.log(1))
        expected_std_param = np.full([1, output_dim], np.log(10))
        assert np.allclose(log_std, expected_log_std)
        assert np.allclose(std_param, expected_std_param)
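
Note how std_param and log_std diverge above: under the 'exp' parameterization the raw parameter keeps log(init_std) untouched, while the log std actually used is clipped into [log(min_std), log(max_std)] on the way out. A NumPy sketch of that assumed clipping, using the model's default min_std:

import numpy as np

init_std, min_std, max_std = 10.0, 1e-6, 1.0
std_param = np.log(init_std)                # raw parameter stores log(init_std)
log_std = np.clip(std_param, np.log(min_std), np.log(max_std))
assert np.isclose(std_param, np.log(10.0))  # matches expected_std_param
assert np.isclose(log_std, np.log(1.0))     # matches expected_log_std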
Example #7
    def test_std_share_network_is_pickleable(self, output_dim, hidden_sizes,
                                             mock_normal):
        mock_normal.return_value = 0.5
        input_var = tf.placeholder(tf.float32, shape=(None, 5))
        model = GaussianMLPModel(output_dim=output_dim,
                                 hidden_sizes=hidden_sizes,
                                 std_share_network=True,
                                 hidden_nonlinearity=None,
                                 hidden_w_init=tf.ones_initializer(),
                                 output_w_init=tf.ones_initializer())
        outputs = model.build(input_var)
        output1 = self.sess.run(outputs[:-1], feed_dict={input_var: self.obs})
        with tf.Session(graph=tf.Graph()) as sess:
            input_var = tf.placeholder(tf.float32, shape=(None, 5))
            model_pickled = pickle.loads(pickle.dumps(model))
            outputs = model_pickled.build(input_var)
            output2 = sess.run(outputs[:-1], feed_dict={input_var: self.obs})

            assert np.array_equal(output1, output2)
Example #8
    def test_std_share_network_output_values(self, output_dim, hidden_sizes):
        model = GaussianMLPModel(output_dim=output_dim,
                                 hidden_sizes=hidden_sizes,
                                 std_share_network=True,
                                 hidden_nonlinearity=None,
                                 std_parameterization='exp',
                                 hidden_w_init=tf.ones_initializer(),
                                 output_w_init=tf.ones_initializer())
        dist = model.build(self.input_var).dist

        mean, log_std = self.sess.run(
            [dist.loc, tf.math.log(dist.stddev())],
            feed_dict={self.input_var: self.obs})

        expected_mean = np.full([1, 1, output_dim], 5 * np.prod(hidden_sizes))
        expected_log_std = np.full([1, 1, output_dim],
                                   5 * np.prod(hidden_sizes))
        assert np.array_equal(mean, expected_mean)
        assert np.array_equal(log_std, expected_log_std)
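
The expected value 5 * np.prod(hidden_sizes) follows from the test setup: linear activations, all-ones weights, and zero biases mean each dense layer simply sums its inputs, and (judging by the placeholder shapes in the neighboring tests) self.obs is a batch of all-ones observations with feature dimension 5. A self-contained NumPy check of that arithmetic:

import numpy as np

obs = np.ones((1, 5))    # assumed all-ones input, feature dim 5
hidden_sizes = (3, 6)    # arbitrary; the test parameterizes these
output_dim = 2

x = obs
for size in hidden_sizes:
    x = x @ np.ones((x.shape[-1], size))  # dense layer: W=1, b=0, no activation
mean = x @ np.ones((x.shape[-1], output_dim))

assert np.allclose(mean, 5 * np.prod(hidden_sizes))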
Example #9
    def test_without_std_share_network_output_values(self, output_dim,
                                                     hidden_sizes):
        model = GaussianMLPModel(output_dim=output_dim,
                                 hidden_sizes=hidden_sizes,
                                 init_std=2,
                                 std_share_network=False,
                                 adaptive_std=False,
                                 hidden_nonlinearity=None,
                                 hidden_w_init=tf.ones_initializer(),
                                 output_w_init=tf.ones_initializer())
        dist = model.build(self.input_var).dist

        mean, log_std = self.sess.run(
            [dist.loc, tf.math.log(dist.stddev())],
            feed_dict={self.input_var: self.obs})

        expected_mean = np.full([1, 1, output_dim], 5 * np.prod(hidden_sizes))
        expected_log_std = np.full([1, 1, output_dim], np.log(2.))
        assert np.array_equal(mean, expected_mean)
        assert np.allclose(log_std, expected_log_std)
Example #10
    def test_softplus_output_values(self, output_dim, hidden_sizes):
        model = GaussianMLPModel(output_dim=output_dim,
                                 hidden_sizes=hidden_sizes,
                                 hidden_nonlinearity=None,
                                 std_share_network=False,
                                 adaptive_std=False,
                                 init_std=2,
                                 std_parameterization='softplus',
                                 hidden_w_init=tf.ones_initializer(),
                                 output_w_init=tf.ones_initializer())
        outputs = model.build(self.input_var)

        mean, log_std, std_param = self.sess.run(
            outputs[:-1], feed_dict={self.input_var: self.obs})

        expected_mean = np.full([1, output_dim], 5 * np.prod(hidden_sizes))
        expected_std_param = np.full([1, output_dim], np.log(np.exp(2) - 1))
        expected_log_std = np.log(np.log(1. + np.exp(expected_std_param)))
        assert np.array_equal(mean, expected_mean)
        assert np.allclose(std_param, expected_std_param)
        assert np.allclose(log_std, expected_log_std)
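
The expected values above encode a round trip through the softplus: initialization stores softplus^-1(init_std) in the raw parameter, and applying the softplus recovers init_std exactly, so log_std comes back as log(2). A NumPy sketch of that round trip:

import numpy as np

init_std = 2.0
std_param = np.log(np.exp(init_std) - 1.0)   # inverse-softplus initialization
std = np.log(1.0 + np.exp(std_param))        # softplus recovers init_std
assert np.isclose(std, init_std)
assert np.isclose(np.log(std), np.log(2.0))  # the test's expected_log_std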
Example #11
    def test_adaptive_std_is_pickleable(self, output_dim, hidden_sizes,
                                        std_hidden_sizes):
        input_var = tf.compat.v1.placeholder(tf.float32, shape=(None, None, 5))
        model = GaussianMLPModel(output_dim=output_dim,
                                 hidden_sizes=hidden_sizes,
                                 std_hidden_sizes=std_hidden_sizes,
                                 std_share_network=False,
                                 adaptive_std=True,
                                 hidden_nonlinearity=None,
                                 hidden_w_init=tf.ones_initializer(),
                                 output_w_init=tf.ones_initializer(),
                                 std_hidden_nonlinearity=None,
                                 std_hidden_w_init=tf.ones_initializer(),
                                 std_output_w_init=tf.ones_initializer())
        dist = model.build(input_var).dist

        # get output bias
        with tf.compat.v1.variable_scope('GaussianMLPModel', reuse=True):
            bias = tf.compat.v1.get_variable(
                'dist_params/mean_network/output/bias')
        # assign it to all ones
        bias.load(tf.ones_like(bias).eval())

        h = pickle.dumps(model)
        output1 = self.sess.run(
            [dist.loc, tf.math.log(dist.stddev())],
            feed_dict={input_var: self.obs})
        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            input_var = tf.compat.v1.placeholder(tf.float32,
                                                 shape=(None, None, 5))
            model_pickled = pickle.loads(h)
            dist2 = model_pickled.build(input_var).dist
            output2 = sess.run(
                [dist2.loc, tf.math.log(dist2.stddev())],
                feed_dict={input_var: self.obs})
            assert np.array_equal(output1, output2)
Example #12
class GaussianMLPPolicyWithModel(StochasticPolicy2):
    """
    GaussianMLPPolicy with GaussianMLPModel.

    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden
    layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers
     for std
    :param min_std: whether to make sure that the std is at least some
     threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There
     are a few options:
        - exp: the logarithm of the std will be stored, and applied a
         exponential transformation
        - softplus: the std will be computed as log(1+exp(x))
    :return:

    """

    def __init__(self,
                 env_spec,
                 name='GaussianMLPPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 output_nonlinearity=None,
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        assert isinstance(env_spec.action_space, Box)
        super().__init__(name, env_spec)
        self.obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim

        self.model = GaussianMLPModel(
            name=name,
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization)

        self._initialize()

    def _initialize(self):
        state_input = tf.placeholder(tf.float32, shape=(None, self.obs_dim))

        with tf.variable_scope(self._variable_scope):
            self.model.build(state_input)

        self._f_dist = tf.get_default_session().make_callable(
            [
                self.model.networks['default'].sample,
                self.model.networks['default'].mean,
                self.model.networks['default'].log_std
            ],
            feed_list=[self.model.networks['default'].input])

    @property
    def vectorized(self):
        """Vectorized or not."""
        return True

    def dist_info_sym(self, obs_var, state_info_vars=None, name='default'):
        """Symbolic graph of the distribution."""
        with tf.variable_scope(self._variable_scope):
            _, mean_var, log_std_var, _, _ = self.model.build(
                obs_var, name=name)
        mean_var = tf.reshape(mean_var, self.action_space.shape)
        log_std_var = tf.reshape(log_std_var, self.action_space.shape)
        return dict(mean=mean_var, log_std=log_std_var)

    def get_action(self, observation):
        """Get action from the policy."""
        flat_obs = self.observation_space.flatten(observation)
        sample, mean, log_std = self._f_dist([flat_obs])
        sample = self.action_space.unflatten(sample[0])
        mean = self.action_space.unflatten(mean[0])
        log_std = self.action_space.unflatten(log_std[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_actions(self, observations):
        """Get actions from the policy."""
        flat_obs = self.observation_space.flatten_n(observations)
        samples, means, log_stds = self._f_dist(flat_obs)
        samples = self.action_space.unflatten_n(samples)
        means = self.action_space.unflatten_n(means)
        log_stds = self.action_space.unflatten_n(log_stds)
        return samples, dict(mean=means, log_std=log_stds)

    def get_params(self, trainable=True):
        """Get the trainable variables."""
        return self.get_trainable_vars()

    @property
    def distribution(self):
        """Policy distribution."""
        return self.model.networks['default'].dist

    def __getstate__(self):
        """Object.__getstate__."""
        new_dict = self.__dict__.copy()
        del new_dict['_f_dist']
        return new_dict

    def __setstate__(self, state):
        """Object.__setstate__."""
        self.__dict__.update(state)
        self._initialize()
Example #13
class GaussianMLPPolicyWithModel(StochasticPolicy2):
    """
    GaussianMLPPolicy with GaussianMLPModel.

    A policy that contains an MLP to make predictions based on
    a Gaussian distribution.

    Args:
        env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for mean. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        hidden_nonlinearity (callable): Activation function for intermediate
            dense layer(s). It should return a tf.Tensor. Set it to
            None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output dense
            layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight
            of output dense layer(s). The function should return a
            tf.Tensor.
        output_b_init (callable): Initializer function for the bias
            of output dense layer(s). The function should return a
            tf.Tensor.
        learn_std (bool): Is std trainable.
        adaptive_std (bool): Is std a neural network. If False, it will be a
            parameter.
        std_share_network (bool): Boolean for whether mean and std share
            the same network.
        init_std (float): Initial value for std.
        std_hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for std. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        min_std (float): If not None, the std is at least the value of min_std,
            to avoid numerical issues.
        max_std (float): If not None, the std is at most the value of max_std,
            to avoid numerical issues.
        std_hidden_nonlinearity (callable): Nonlinearity for each hidden
            layer in the std network.
        std_output_nonlinearity (callable): Nonlinearity for output layer in
            the std network.
        std_parameterization (str): How the std should be parametrized. There
            are a few options:
            - exp: the logarithm of the std will be stored, and an
                exponential transformation will be applied
            - softplus: the std will be computed as log(1+exp(x))
        layer_normalization (bool): Bool for using layer normalization or not.

    """
    def __init__(self,
                 env_spec,
                 name='GaussianMLPPolicyWithModel',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.glorot_uniform_initializer(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.glorot_uniform_initializer(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        assert isinstance(env_spec.action_space, Box)
        super().__init__(name, env_spec)
        self.obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim

        self.model = GaussianMLPModel(
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

        self._initialize()

    def _initialize(self):
        state_input = tf.placeholder(tf.float32, shape=(None, self.obs_dim))

        with tf.variable_scope(self._variable_scope):
            self.model.build(state_input)

        self._f_dist = tf.get_default_session().make_callable(
            [
                self.model.networks['default'].sample,
                self.model.networks['default'].mean,
                self.model.networks['default'].log_std
            ],
            feed_list=[self.model.networks['default'].input])

    @property
    def vectorized(self):
        """Vectorized or not."""
        return True

    def dist_info_sym(self, obs_var, state_info_vars=None, name='default'):
        """Symbolic graph of the distribution."""
        with tf.variable_scope(self._variable_scope):
            _, mean_var, log_std_var, _, _ = self.model.build(obs_var,
                                                              name=name)
        return dict(mean=mean_var, log_std=log_std_var)

    def get_action(self, observation):
        """Get action from the policy."""
        flat_obs = self.observation_space.flatten(observation)
        sample, mean, log_std = self._f_dist([flat_obs])
        sample = self.action_space.unflatten(sample[0])
        mean = self.action_space.unflatten(mean[0])
        log_std = self.action_space.unflatten(log_std[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_actions(self, observations):
        """Get actions from the policy."""
        flat_obs = self.observation_space.flatten_n(observations)
        samples, means, log_stds = self._f_dist(flat_obs)
        samples = self.action_space.unflatten_n(samples)
        means = self.action_space.unflatten_n(means)
        log_stds = self.action_space.unflatten_n(log_stds)
        return samples, dict(mean=means, log_std=log_stds)

    def get_params(self, trainable=True):
        """Get the trainable variables."""
        return self.get_trainable_vars()

    @property
    def distribution(self):
        """Policy distribution."""
        return self.model.networks['default'].dist

    def __getstate__(self):
        """Object.__getstate__."""
        new_dict = self.__dict__.copy()
        del new_dict['_f_dist']
        return new_dict

    def __setstate__(self, state):
        """Object.__setstate__."""
        self.__dict__.update(state)
        self._initialize()
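
The __getstate__/__setstate__ pair above is a common TF1 idiom: the session-bound callable _f_dist cannot be pickled, so it is dropped from the state and rebuilt by _initialize() after unpickling. A minimal self-contained sketch of the same pattern (the class and attribute here are illustrative):

import pickle

class Rebuildable:
    """Sketch of the pickling idiom used by GaussianMLPPolicyWithModel."""

    def __init__(self, obs_dim):
        self.obs_dim = obs_dim
        self._initialize()

    def _initialize(self):
        # Stands in for the session-bound callable built in _initialize().
        self._f_dist = lambda obs: [0.0] * self.obs_dim

    def __getstate__(self):
        new_dict = self.__dict__.copy()
        del new_dict['_f_dist']  # unpicklable; rebuilt on load
        return new_dict

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._initialize()       # rebuild the dropped callable

policy = pickle.loads(pickle.dumps(Rebuildable(obs_dim=3)))
assert policy._f_dist(None) == [0.0, 0.0, 0.0]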
Example #14
class GaussianMLPPolicy(StochasticPolicy):
    """GaussianMLPPolicy with GaussianMLPModel.

    A policy that contains an MLP to make predictions based on
    a Gaussian distribution.

    Args:
        env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for mean. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        hidden_nonlinearity (callable): Activation function for intermediate
            dense layer(s). It should return a tf.Tensor. Set it to
            None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output dense
            layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight
            of output dense layer(s). The function should return a
            tf.Tensor.
        output_b_init (callable): Initializer function for the bias
            of output dense layer(s). The function should return a
            tf.Tensor.
        learn_std (bool): Is std trainable.
        adaptive_std (bool): Is std a neural network. If False, it will be a
            parameter.
        std_share_network (bool): Boolean for whether mean and std share
            the same network.
        init_std (float): Initial value for std.
        std_hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for std. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        min_std (float): If not None, the std is at least the value of min_std,
            to avoid numerical issues.
        max_std (float): If not None, the std is at most the value of max_std,
            to avoid numerical issues.
        std_hidden_nonlinearity (callable): Nonlinearity for each hidden layer
            in the std network. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        std_output_nonlinearity (callable): Nonlinearity for output layer in
            the std network. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        std_parameterization (str): How the std should be parametrized. There
            are a few options:
            - exp: the logarithm of the std will be stored, and an
                exponential transformation will be applied
            - softplus: the std will be computed as log(1+exp(x))
        layer_normalization (bool): Bool for using layer normalization or not.

    """

    def __init__(self,
                 env_spec,
                 name='GaussianMLPPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.glorot_uniform_initializer(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.glorot_uniform_initializer(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        assert isinstance(env_spec.action_space, akro.Box)
        super().__init__(name, env_spec)
        self.obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim

        self.model = GaussianMLPModel(
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

        self._initialize()

    def _initialize(self):
        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, self.obs_dim))

        with tf.compat.v1.variable_scope(self.name) as vs:
            self._variable_scope = vs
            self.model.build(state_input)

        self._f_dist = tf.compat.v1.get_default_session().make_callable(
            [
                self.model.networks['default'].mean,
                self.model.networks['default'].log_std
            ],
            feed_list=[self.model.networks['default'].input])

    @property
    def vectorized(self):
        """Vectorized or not.

        Returns:
            Bool: True if primitive supports vectorized operations.

        """
        return True

    def dist_info_sym(self, obs_var, state_info_vars=None, name='default'):
        """Build a symbolic graph of the distribution parameters.

        Args:
            obs_var (tf.Tensor): Tensor input for symbolic graph.
            state_info_vars (dict): Extra state information, e.g.
                previous action.
            name (str): Name for symbolic graph.

        Returns:
            dict[tf.Tensor]: Outputs of the symbolic graph of distribution
                parameters.

        """
        with tf.compat.v1.variable_scope(self._variable_scope):
            mean_var, log_std_var, _, _ = self.model.build(obs_var, name=name)
        return dict(mean=mean_var, log_std=log_std_var)

    def get_action(self, observation):
        """Get single action from this policy for the input observation.

        Args:
            observation (numpy.ndarray): Observation from environment.

        Returns:
            numpy.ndarray: Actions
            dict: Predicted action and agent information.

        Note:
            It returns an action and a dict, with keys
            - mean (numpy.ndarray): Mean of the distribution.
            - log_std (numpy.ndarray): Log standard deviation of the
                distribution.

        """
        flat_obs = self.observation_space.flatten(observation)
        mean, log_std = self._f_dist([flat_obs])
        rnd = np.random.normal(size=mean.shape)
        sample = rnd * np.exp(log_std) + mean
        sample = self.action_space.unflatten(sample[0])
        mean = self.action_space.unflatten(mean[0])
        log_std = self.action_space.unflatten(log_std[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_actions(self, observations):
        """Get multiple actions from this policy for the input observations.

        Args:
            observations (numpy.ndarray): Observations from environment.

        Returns:
            numpy.ndarray: Actions
            dict: Predicted action and agent information.

        Note:
            It returns actions and a dict, with keys
            - mean (numpy.ndarray): Means of the distribution.
            - log_std (numpy.ndarray): Log standard deviations of the
                distribution.

        """
        flat_obs = self.observation_space.flatten_n(observations)
        means, log_stds = self._f_dist(flat_obs)
        rnd = np.random.normal(size=means.shape)
        samples = rnd * np.exp(log_stds) + means
        samples = self.action_space.unflatten_n(samples)
        means = self.action_space.unflatten_n(means)
        log_stds = self.action_space.unflatten_n(log_stds)
        return samples, dict(mean=means, log_std=log_stds)

    def get_params(self):
        """Get the params, which are the trainable variables.

        Returns:
            List[tf.Variable]: A list of trainable variables in the current
                variable scope.

        """
        return self.get_trainable_vars()

    @property
    def distribution(self):
        """Policy distribution.

        Returns:
            garage.tf.distributions.DiagonalGaussian: Policy distribution.

        """
        return self.model.networks['default'].dist

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: the state to be pickled for the instance.

        """
        new_dict = super().__getstate__()
        del new_dict['_f_dist']
        return new_dict

    def __setstate__(self, state):
        """Object.__setstate__.

        Args:
            state (dict): Unpickled state.

        """
        super().__setstate__(state)
        self._initialize()
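
get_action and get_actions above sample on the NumPy side rather than in the graph: draw standard normal noise, scale it by the std, and shift by the mean, giving a draw from N(mean, std^2) in each dimension. A small sketch with made-up distribution parameters:

import numpy as np

mean = np.array([0.5, -1.0])
log_std = np.log(np.array([0.1, 0.2]))

rnd = np.random.normal(size=mean.shape)
sample = rnd * np.exp(log_std) + mean  # one action, sampled elementwise

# Sanity check over many draws: the empirical mean approaches `mean`.
draws = np.random.normal(size=(100000, 2)) * np.exp(log_std) + mean
assert np.allclose(draws.mean(axis=0), mean, atol=0.01)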
Example #15
class GaussianMLPEncoder(StochasticEncoder, StochasticModule):
    """GaussianMLPEncoder with GaussianMLPModel.

    An embedding that contains an MLP to make predictions based on
    a Gaussian distribution.

    Args:
        embedding_spec (garage.InOutSpec):
            Encoder specification.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for mean. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        hidden_nonlinearity (callable): Activation function for intermediate
            dense layer(s). It should return a tf.Tensor. Set it to
            None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output dense
            layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight
            of output dense layer(s). The function should return a
            tf.Tensor.
        output_b_init (callable): Initializer function for the bias
            of output dense layer(s). The function should return a
            tf.Tensor.
        learn_std (bool): Is std trainable.
        adaptive_std (bool): Is std a neural network. If False, it will be a
            parameter.
        std_share_network (bool): Boolean for whether mean and std share
            the same network.
        init_std (float): Initial value for std.
        std_hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for std. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        min_std (float): If not None, the std is at least the value of min_std,
            to avoid numerical issues.
        max_std (float): If not None, the std is at most the value of max_std,
            to avoid numerical issues.
        std_hidden_nonlinearity (callable): Nonlinearity for each hidden layer
            in the std network. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        std_output_nonlinearity (callable): Nonlinearity for output layer in
            the std network. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        std_parameterization (str): How the std should be parametrized. There
            are a few options:
            - exp: the logarithm of the std will be stored, and an
                exponential transformation will be applied
            - softplus: the std will be computed as log(1+exp(x))
        layer_normalization (bool): Bool for using layer normalization or not.

    """

    def __init__(self,
                 embedding_spec,
                 name='GaussianMLPEncoder',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        super().__init__(name)
        self._embedding_spec = embedding_spec
        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._learn_std = learn_std
        self._adaptive_std = adaptive_std
        self._std_share_network = std_share_network
        self._init_std = init_std
        self._min_std = min_std
        self._max_std = max_std
        self._std_hidden_sizes = std_hidden_sizes
        self._std_hidden_nonlinearity = std_hidden_nonlinearity
        self._std_output_nonlinearity = std_output_nonlinearity
        self._std_parameterization = std_parameterization
        self._layer_normalization = layer_normalization

        self._latent_dim = embedding_spec.output_space.flat_dim
        self._input_dim = embedding_spec.input_space.flat_dim
        self._network = None
        self._f_dist = None

        self.model = GaussianMLPModel(
            output_dim=self._latent_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

        self._initialize()

    def _initialize(self):
        """Initialize encoder."""
        embedding_input = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, None,
                                                          self._input_dim),
                                                   name='default_encoder')
        with tf.compat.v1.variable_scope(self._name) as vs:
            self._variable_scope = vs
            self._network = self.model.build(embedding_input)
            self._f_dist = tf.compat.v1.get_default_session().make_callable(
                [
                    self._network.dist.sample(), self._network.mean,
                    self._network.log_std
                ],
                feed_list=[embedding_input])

    def build(self, embedding_input, name=None):
        """Build encoder.

        Args:
            embedding_input (tf.Tensor) : Embedding input.
            name (str): Name of the model, which is also the name scope.

        Returns:
            tfp.distributions.MultivariateNormalDiag: Distribution.
            tf.Tensor: Mean.
            tf.Tensor: Log of standard deviation.

        """
        with tf.compat.v1.variable_scope(self._variable_scope):
            return self.model.build(embedding_input, name=name)

    @property
    def spec(self):
        """garage.InOutSpec: Specification of input and output."""
        return self._embedding_spec

    @property
    def input_dim(self):
        """int: Dimension of the encoder input."""
        return self._embedding_spec.input_space.flat_dim

    @property
    def output_dim(self):
        """int: Dimension of the encoder output (embedding)."""
        return self._embedding_spec.output_space.flat_dim

    @property
    def vectorized(self):
        """bool: If this module supports vectorization input."""
        return True

    def get_latent(self, input_value):
        """Get a sample of embedding for the given input.

        Args:
            input_value (numpy.ndarray): Tensor to encode.

        Returns:
            numpy.ndarray: An embedding sampled from embedding distribution.
            dict: Embedding distribution information.

        Note:
            It returns an embedding and a dict, with keys
            - mean (numpy.ndarray): Mean of the distribution.
            - log_std (numpy.ndarray): Log standard deviation of the
                distribution.

        """
        flat_input = self._embedding_spec.input_space.flatten(input_value)
        sample, mean, log_std = self._f_dist(np.expand_dims([flat_input], 1))
        sample = self._embedding_spec.output_space.unflatten(
            np.squeeze(sample, 1)[0])
        mean = self._embedding_spec.output_space.unflatten(
            np.squeeze(mean, 1)[0])
        log_std = self._embedding_spec.output_space.unflatten(
            np.squeeze(log_std, 1)[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_latents(self, input_values):
        """Get samples of embedding for the given inputs.

        Args:
            input_values (numpy.ndarray): Tensors to encode.

        Returns:
            numpy.ndarray: Embeddings sampled from embedding distribution.
            dict: Embedding distribution information.

        Note:
            It returns an embedding and a dict, with keys
            - mean (list[numpy.ndarray]): Means of the distribution.
            - log_std (list[numpy.ndarray]): Log standard deviations of the
                distribution.

        """
        flat_input = self._embedding_spec.input_space.flatten_n(input_values)
        samples, means, log_stds = self._f_dist(np.expand_dims(flat_input, 1))
        samples = self._embedding_spec.output_space.unflatten_n(
            np.squeeze(samples, 1))
        means = self._embedding_spec.output_space.unflatten_n(
            np.squeeze(means, 1))
        log_stds = self._embedding_spec.output_space.unflatten_n(
            np.squeeze(log_stds, 1))
        return samples, dict(mean=means, log_std=log_stds)

    @property
    def distribution(self):
        """Encoder distribution.

        Returns:
            tfp.distributions.MultivariateNormalDiag: Encoder distribution.

        """
        return self._network.dist

    @property
    def input(self):
        """tf.Tensor: Input to encoder network."""
        return self._network.input

    @property
    def latent_mean(self):
        """tf.Tensor: Predicted mean of a Gaussian distribution."""
        return self._network.mean

    @property
    def latent_std_param(self):
        """tf.Tensor: Predicted std of a Gaussian distribution."""
        return self._network.log_std

    def clone(self, name):
        """Return a clone of the encoder.

        Args:
            name (str): Name of the newly created encoder. It has to be
                different from source encoder if cloned under the same
                computational graph.

        Returns:
            garage.tf.embeddings.encoder.Encoder: Newly cloned encoder.

        """
        new_encoder = self.__class__(
            embedding_spec=self._embedding_spec,
            name=name,
            hidden_sizes=self._hidden_sizes,
            hidden_nonlinearity=self._hidden_nonlinearity,
            hidden_w_init=self._hidden_w_init,
            hidden_b_init=self._hidden_b_init,
            output_nonlinearity=self._output_nonlinearity,
            output_w_init=self._output_w_init,
            output_b_init=self._output_b_init,
            learn_std=self._learn_std,
            adaptive_std=self._adaptive_std,
            std_share_network=self._std_share_network,
            init_std=self._init_std,
            min_std=self._min_std,
            max_std=self._max_std,
            std_hidden_sizes=self._std_hidden_sizes,
            std_hidden_nonlinearity=self._std_hidden_nonlinearity,
            std_output_nonlinearity=self._std_output_nonlinearity,
            std_parameterization=self._std_parameterization,
            layer_normalization=self._layer_normalization)

        return new_encoder

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: the state to be pickled for the instance.

        """
        new_dict = super().__getstate__()
        del new_dict['_f_dist']
        del new_dict['_network']
        return new_dict

    def __setstate__(self, state):
        """Parameters to restore from snapshot.

        Args:
            state (dict): Parameters to restore from.

        """
        super().__setstate__(state)
        self._initialize()
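
get_latent above bridges a single flat input and the (batch, time, dim) placeholder created in _initialize: it wraps the input to shape (1, 1, D) before calling _f_dist, then squeezes the time axis back out of every output. A NumPy sketch of just that shape plumbing (fake_output stands in for the network output):

import numpy as np

flat_input = np.arange(4.0)                # a single flattened input, D = 4
batched = np.expand_dims([flat_input], 1)  # shape (1, 1, 4): (batch, time, dim)
assert batched.shape == (1, 1, 4)

fake_output = np.ones((1, 1, 3))           # pretend latent output, latent dim 3
latent = np.squeeze(fake_output, 1)[0]     # drop time axis, take first batch item
assert latent.shape == (3,)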
Example #16
class GaussianMLPTaskEmbeddingPolicy(TaskEmbeddingPolicy):
    """GaussianMLPTaskEmbeddingPolicy.

    Args:
        env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
        encoder (garage.tf.embeddings.StochasticEncoder): Embedding network.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for mean. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        hidden_nonlinearity (callable): Activation function for intermediate
            dense layer(s). It should return a tf.Tensor. Set it to
            None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output dense
            layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight
            of output dense layer(s). The function should return a
            tf.Tensor.
        output_b_init (callable): Initializer function for the bias
            of output dense layer(s). The function should return a
            tf.Tensor.
        learn_std (bool): Is std trainable.
        adaptive_std (bool): Is std a neural network. If False, it will be a
            parameter.
        std_share_network (bool): Boolean for whether mean and std share
            the same network.
        init_std (float): Initial value for std.
        std_hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for std. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        min_std (float): If not None, the std is at least the value of min_std,
            to avoid numerical issues.
        max_std (float): If not None, the std is at most the value of max_std,
            to avoid numerical issues.
        std_hidden_nonlinearity (callable): Nonlinearity for each hidden layer
            in the std network. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        std_output_nonlinearity (callable): Nonlinearity for output layer in
            the std network. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        std_parameterization (str): How the std should be parametrized. There
            are a few options:
            - exp: the logarithm of the std will be stored, and an
                exponential transformation will be applied
            - softplus: the std will be computed as log(1+exp(x))
        layer_normalization (bool): Bool for using layer normalization or not.

    """

    def __init__(self,
                 env_spec,
                 encoder,
                 name='GaussianMLPTaskEmbeddingPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        assert isinstance(env_spec.action_space, akro.Box)
        super().__init__(name, env_spec, encoder)
        self._obs_dim = env_spec.observation_space.flat_dim
        self._action_dim = env_spec.action_space.flat_dim
        self._dist = None

        self.model = GaussianMLPModel(
            output_dim=self._action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

        self._initialize()

    def _initialize(self):
        """Initialize policy."""
        obs_input = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, None, self._obs_dim))
        latent_input = tf.compat.v1.placeholder(
            tf.float32, shape=(None, None, self._encoder.output_dim))

        # Encoder should be outside policy scope
        with tf.compat.v1.variable_scope('concat_obs_task'):
            latent_var = self._encoder.distribution.sample()

        with tf.compat.v1.variable_scope(self.name) as vs:
            self._variable_scope = vs

            with tf.compat.v1.variable_scope('concat_obs_latent'):
                obs_latent_input = tf.concat([obs_input, latent_input], -1)
            self._dist, mean_var, log_std_var = self.model.build(
                obs_latent_input, name='given_latent').outputs

            with tf.compat.v1.variable_scope('concat_obs_latent_var'):
                embed_state_input = tf.concat([obs_input, latent_var], -1)

            dist_given_task, mean_g_t, log_std_g_t = self.model.build(
                embed_state_input, name='given_task').outputs

        self._f_dist_obs_latent = tf.compat.v1.get_default_session(
        ).make_callable([self._dist.sample(), mean_var, log_std_var],
                        feed_list=[obs_input, latent_input])

        self._f_dist_obs_task = tf.compat.v1.get_default_session(
        ).make_callable([dist_given_task.sample(), mean_g_t, log_std_g_t],
                        feed_list=[obs_input, self._encoder.input])

    @property
    def distribution(self):
        """Policy action distribution.

        Returns:
            garage.tf.distributions.DiagonalGaussian: Policy distribution.

        """
        return self._dist

    def get_action(self, observation):
        """Get action sampled from the policy.

        Args:
            observation (np.ndarray): Augmented observation from the
                environment, with shape :math:`(O+N, )`. O is the dimension
                of observation, N is the number of tasks.

        Returns:
            np.ndarray: Action sampled from the policy,
                with shape :math:`(A, )`. A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(A, )`. A is the dimension of
                    action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(A, )`.
                    A is the dimension of action.

        """
        obs, task = self.split_augmented_observation(observation)
        return self.get_action_given_task(obs, task)

    def get_actions(self, observations):
        """Get actions sampled from the policy.

        Args:
            observations (np.ndarray): Augmented observation from the
                environment, with shape :math:`(T, O+N)`. T is the number of
                environment steps, O is the dimension of observation, N is the
                number of tasks.

        Returns:
            np.ndarray: Actions sampled from the policy,
                with shape :math:`(T, A)`. T is the number of environment
                steps, A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(T, A)`. T is the number of environment
                    steps, A is the dimension of action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(T, A)`. T is the number of
                    environment steps, A is the dimension of action.

        """
        obses, tasks = zip(*[
            self.split_augmented_observation(aug_obs)
            for aug_obs in observations
        ])
        return self.get_actions_given_tasks(np.array(obses), np.array(tasks))

    def get_action_given_latent(self, observation, latent):
        """Sample an action given observation and latent.

        Args:
            observation (np.ndarray): Observation from the environment,
                with shape :math:`(O, )`. O is the dimension of observation.
            latent (np.ndarray): Latent, with shape :math:`(Z, )`. Z is the
                dimension of the latent embedding.

        Returns:
            np.ndarray: Action sampled from the policy,
                with shape :math:`(A, )`. A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(A, )`. A is the dimension of action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(A, )`. A is the dimension
                    of action.

        """
        flat_obs = self.observation_space.flatten(observation)
        flat_obs = np.expand_dims([flat_obs], 1)
        flat_latent = self.latent_space.flatten(latent)
        flat_latent = np.expand_dims([flat_latent], 1)

        sample, mean, log_std = self._f_dist_obs_latent(flat_obs, flat_latent)
        sample = self.action_space.unflatten(np.squeeze(sample, 1)[0])
        mean = self.action_space.unflatten(np.squeeze(mean, 1)[0])
        log_std = self.action_space.unflatten(np.squeeze(log_std, 1)[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_actions_given_latents(self, observations, latents):
        """Sample a batch of actions given observations and latents.

        Args:
            observations (np.ndarray): Observations from the environment, with
                shape :math:`(T, O)`. T is the number of environment steps, O
                is the dimension of observation.
            latents (np.ndarray): Latents, with shape :math:`(T, Z)`. T is the
                number of environment steps, Z is the dimension of
                latent embedding.

        Returns:
            np.ndarray: Actions sampled from the policy,
                with shape :math:`(T, A)`. T is the number of environment
                steps, A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(T, A)`. T is the number of
                    environment steps. A is the dimension of action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(T, A)`. T is the number of
                    environment steps. A is the dimension of action.

        """
        flat_obses = self.observation_space.flatten_n(observations)
        flat_obses = np.expand_dims(flat_obses, 1)
        flat_latents = self.latent_space.flatten_n(latents)
        flat_latents = np.expand_dims(flat_latents, 1)

        samples, means, log_stds = self._f_dist_obs_latent(
            flat_obses, flat_latents)
        samples = self.action_space.unflatten_n(np.squeeze(samples, 1))
        means = self.action_space.unflatten_n(np.squeeze(means, 1))
        log_stds = self.action_space.unflatten_n(np.squeeze(log_stds, 1))
        return samples, dict(mean=means, log_std=log_stds)

    def get_action_given_task(self, observation, task_id):
        """Sample an action given observation and task id.

        Args:
            observation (np.ndarray): Observation from the environment, with
                shape :math:`(O, )`. O is the dimension of the observation.
            task_id (np.ndarray): One-hot task id, with shape :math:`(N, )`.
                N is the number of tasks.

        Returns:
            np.ndarray: Action sampled from the policy, with shape
                :math:`(A, )`. A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(A, )`. A is the dimension of action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(A, )`. A is the dimension
                    of action.

        """
        flat_obs = self.observation_space.flatten(observation)
        flat_obs = np.expand_dims([flat_obs], 1)
        task_id = np.expand_dims([task_id], 1)

        sample, mean, log_std = self._f_dist_obs_task(flat_obs, task_id)
        sample = self.action_space.unflatten(np.squeeze(sample, 1)[0])
        mean = self.action_space.unflatten(np.squeeze(mean, 1)[0])
        log_std = self.action_space.unflatten(np.squeeze(log_std, 1)[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_actions_given_tasks(self, observations, task_ids):
        """Sample a batch of actions given observations and task ids.

        Args:
            observations (np.ndarray): Observations from the environment, with
                shape :math:`(T, O)`. T is the number of environment steps,
                O is the dimension of observation.
            task_ids (np.ndarray): One-hot task ids, with shape :math:`(T, N)`.
                T is the number of environment steps, N is the number of tasks.

        Returns:
            np.ndarray: Actions sampled from the policy,
                with shape :math:`(T, A)`. T is the number of environment
                steps, A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(T, A)`. T is the number of
                    environment steps. A is the dimension of action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(T, A)`. T is the number of
                    environment steps. A is the dimension of action.

        """
        flat_obses = self.observation_space.flatten_n(observations)
        flat_obses = np.expand_dims(flat_obses, 1)
        task_ids = np.expand_dims(task_ids, 1)

        samples, means, log_stds = self._f_dist_obs_task(flat_obses, task_ids)
        samples = self.action_space.unflatten_n(np.squeeze(samples, 1))
        means = self.action_space.unflatten_n(np.squeeze(means, 1))
        log_stds = self.action_space.unflatten_n(np.squeeze(log_stds, 1))
        return samples, dict(mean=means, log_std=log_stds)

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: The state to be pickled for the instance.

        """
        new_dict = super().__getstate__()
        del new_dict['_f_dist_obs_latent']
        del new_dict['_f_dist_obs_task']
        del new_dict['_dist']
        return new_dict

    def __setstate__(self, state):
        """Object.__setstate__.

        Args:
            state (dict): Unpickled state.

        """
        super().__setstate__(state)
        self._initialize()
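A note on the batching convention shared by all the sampling helpers above: a single observation is wrapped to shape (1, 1, O) with np.expand_dims before the compiled callable runs, and the output is squeezed back down to (A, ). The following is a minimal numpy sketch of that round trip; the dimensions O, N, and A are illustrative values, not taken from the source.

import numpy as np

# Hypothetical dimensions: O = observation, N = tasks, A = action.
O, N, A = 4, 3, 2
observation = np.ones(O, dtype=np.float32)
task_id = np.eye(N, dtype=np.float32)[0]      # one-hot task id

# Wrap to (batch, time, dim), as get_action_given_task() does.
flat_obs = np.expand_dims([observation], 1)   # shape (1, 1, O)
onehot = np.expand_dims([task_id], 1)         # shape (1, 1, N)
assert flat_obs.shape == (1, 1, O) and onehot.shape == (1, 1, N)

# Pretend the network returned a batched mean; undo the wrapping.
batched_mean = np.zeros((1, 1, A), dtype=np.float32)
mean = np.squeeze(batched_mean, 1)[0]         # shape (A,)
assert mean.shape == (A,)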
Example #17
class GaussianMLPEncoder(StochasticEncoder, StochasticModule):
    """GaussianMLPEncoder with GaussianMLPModel.

    An encoder that uses an MLP to predict the mean and standard deviation
    of a Gaussian distribution, from which latent embeddings are sampled.

    Args:
        embedding_spec (garage.InOutSpec):
            Encoder specification.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for mean. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        hidden_nonlinearity (callable): Activation function for intermediate
            dense layer(s). It should return a tf.Tensor. Set it to
            None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output dense
            layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight
            of output dense layer(s). The function should return a
            tf.Tensor.
        output_b_init (callable): Initializer function for the bias
            of output dense layer(s). The function should return a
            tf.Tensor.
        learn_std (bool): Is std trainable.
        adaptive_std (bool): Is std a neural network. If False, it will be a
            parameter.
        std_share_network (bool): Boolean for whether mean and std share
            the same network.
        init_std (float): Initial value for std.
        std_hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for std. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        min_std (float): If not None, the std is at least the value of min_std,
            to avoid numerical issues.
        max_std (float): If not None, the std is at most the value of max_std,
            to avoid numerical issues.
        std_hidden_nonlinearity (callable): Nonlinearity for each hidden layer
            in the std network. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        std_output_nonlinearity (callable): Nonlinearity for output layer in
            the std network. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        std_parameterization (str): How the std should be parametrized. There
            are a few options:
            - exp: the logarithm of the std will be stored, and an
                exponential transformation will be applied to recover the std
            - softplus: the std will be computed as log(1 + exp(x))
        layer_normalization (bool): Bool for using layer normalization or not.

    """
    def __init__(self,
                 embedding_spec,
                 name='GaussianMLPEncoder',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        super().__init__(name)
        self._embedding_spec = embedding_spec
        self._latent_dim = embedding_spec.output_space.flat_dim
        self._input_dim = embedding_spec.input_space.flat_dim

        self.model = GaussianMLPModel(
            output_dim=self._latent_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

        self._initialize()

    def _initialize(self):
        embedding_input = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None,
                                                          self._input_dim))

        with tf.compat.v1.variable_scope(self._name) as vs:
            self._variable_scope = vs
            self.model.build(embedding_input)

        self._f_dist = tf.compat.v1.get_default_session().make_callable(
            [
                self.model.networks['default'].mean,
                self.model.networks['default'].log_std
            ],
            feed_list=[self.model.networks['default'].input])

    def dist_info(self, input_val, state_infos=None):
        """Distribution info.

        Get the information of embedding distribution given an input.

        Args:
            input_val (np.ndarray): Input values.
            state_infos (dict): a dictionary whose values contain
                information about the predicted embedding given an input.

        Returns:
            dict[numpy.ndarray]: Distribution parameters, with keys
                - mean (numpy.ndarray): Mean of the distribution.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution.

        """
        flat_input = self._embedding_spec.input_space.flatten(input_val)
        mean, log_std = self._f_dist([flat_input])
        mean = self._embedding_spec.output_space.unflatten(mean[0])
        log_std = self._embedding_spec.output_space.unflatten(log_std[0])
        return dict(mean=mean, log_std=log_std)

    def dist_info_sym(self, input_var, state_info_vars=None, name='default'):
        """Build a symbolic graph of the distribution parameters.

        Args:
            input_var (tf.Tensor): Tensor input for symbolic graph.
            state_info_vars (dict): Extra state information, e.g.
                previous embedding.
            name (str): Name for symbolic graph.

        Returns:
            dict[tf.Tensor]: Outputs of the symbolic graph of distribution
                parameters.

        """
        with tf.compat.v1.variable_scope(self._variable_scope):
            mean_var, log_std_var, _, _ = self.model.build(input_var,
                                                           name=name)
        return dict(mean=mean_var, log_std=log_std_var)

    @property
    def spec(self):
        """garage.InOutSpec: Specification of input and output."""
        return self._embedding_spec

    @property
    def input_dim(self):
        """int: Dimension of the encoder input."""
        return self._embedding_spec.input_space.flat_dim

    @property
    def output_dim(self):
        """int: Dimension of the encoder output (embedding)."""
        return self._embedding_spec.output_space.flat_dim

    @property
    def recurrent(self):
        """bool: If this module has a hidden state."""
        return False

    @property
    def vectorized(self):
        """bool: If this module supports vectorization input."""
        return True

    def forward(self, input_value):
        """Get an sample of embedding for the given input.

        Args:
            input_value (numpy.ndarray): Tensor to encode.

        Returns:
            numpy.ndarray: An embedding sampled from embedding distribution.
            dict: Embedding distribution information.

        Note:
            It returns an embedding and a dict, with keys
            - mean (numpy.ndarray): Mean of the distribution.
            - log_std (numpy.ndarray): Log standard deviation of the
                distribution.

        """
        flat_input = self._embedding_spec.input_space.flatten(input_value)
        mean, log_std = self._f_dist([flat_input])
        rnd = np.random.normal(size=mean.shape)
        sample = rnd * np.exp(log_std) + mean
        sample = self._embedding_spec.output_space.unflatten(sample[0])
        mean = self._embedding_spec.output_space.unflatten(mean[0])
        log_std = self._embedding_spec.output_space.unflatten(log_std[0])
        return sample, dict(mean=mean, log_std=log_std)

    @property
    def distribution(self):
        """Embedding distribution.

        Returns:
            garage.tf.distributions.DiagonalGaussian: Embedding distribution.

        """
        return self.model.networks['default'].dist

    @property
    def input(self):
        """tf.Tensor: Input to encoder network."""
        return self.model.networks['default'].input

    @property
    def latent_mean(self):
        """tf.Tensor: Predicted mean of a Gaussian distribution."""
        return self.model.networks['default'].mean

    @property
    def latent_std_param(self):
        """tf.Tensor: Predicted std of a Gaussian distribution."""
        return self.model.networks['default'].log_std

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: The state to be pickled for the instance.

        """
        new_dict = super().__getstate__()
        del new_dict['_f_dist']
        return new_dict

    def __setstate__(self, state):
        """Object.__setstate__.

        Args:
            state (dict): Unpickled state.

        """
        super().__setstate__(state)
        self._initialize()
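The forward() method above samples with the reparameterization trick: draw standard normal noise, then scale it by the predicted std and shift it by the predicted mean. A self-contained numpy sketch of that step, with made-up values for mean and log_std:

import numpy as np

# Reparameterized sampling as in forward(): a draw from N(mean, std^2)
# is computed as rnd * exp(log_std) + mean, with rnd ~ N(0, I).
mean = np.array([[0.5, -1.0]])      # hypothetical predicted mean, shape (1, Z)
log_std = np.array([[0.0, -2.0]])   # hypothetical predicted log std

rnd = np.random.normal(size=mean.shape)
sample = rnd * np.exp(log_std) + mean
assert sample.shape == mean.shape   # sample has the same shape as the mean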
Example #18
    def test_unknown_std_parameterization(self):
        with pytest.raises(ValueError):
            GaussianMLPModel(output_dim=1, std_parameterization='unknown')
Example #19
    def test_unknown_std_parameterization(self):
        with self.assertRaises(NotImplementedError):
            GaussianMLPModel(output_dim=1, std_parameterization='unknown')
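Both tests reject an unrecognized std_parameterization (the raised exception type differs between versions). The two supported options map a stored parameter x to a positive std in different ways; a short numpy comparison with illustrative inputs:

import numpy as np

x = np.array([-2.0, 0.0, 2.0])      # hypothetical stored std parameters

std_exp = np.exp(x)                 # 'exp': the parameter is log(std)
std_softplus = np.log1p(np.exp(x))  # 'softplus': std = log(1 + exp(x))

# Both maps keep the std strictly positive; softplus grows roughly
# linearly for large x, while exp grows much faster.
assert (std_exp > 0).all() and (std_softplus > 0).all()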
Example #20
class GaussianMLPPolicy(StochasticPolicy):
    """Gaussian MLP Policy.

    A policy represented by a Gaussian distribution
    which is parameterized by a multilayer perceptron (MLP).

    Args:
        env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for mean. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        hidden_nonlinearity (callable): Activation function for intermediate
            dense layer(s). It should return a tf.Tensor. Set it to
            None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output dense
            layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight
            of output dense layer(s). The function should return a
            tf.Tensor.
        output_b_init (callable): Initializer function for the bias
            of output dense layer(s). The function should return a
            tf.Tensor.
        learn_std (bool): Is std trainable.
        adaptive_std (bool): Is std a neural network. If False, it will be a
            parameter.
        std_share_network (bool): Boolean for whether mean and std share
            the same network.
        init_std (float): Initial value for std.
        std_hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for std. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        min_std (float): If not None, the std is at least the value of min_std,
            to avoid numerical issues.
        max_std (float): If not None, the std is at most the value of max_std,
            to avoid numerical issues.
        std_hidden_nonlinearity (callable): Nonlinearity for each hidden layer
            in the std network. The function should return a tf.Tensor.
        std_output_nonlinearity (callable): Nonlinearity for output layer in
            the std network. The function should return a
            tf.Tensor.
        std_parameterization (str): How the std should be parametrized. There
            are a few options:
            - exp: the logarithm of the std will be stored, and an
                exponential transformation will be applied to recover the std
            - softplus: the std will be computed as log(1 + exp(x))
        layer_normalization (bool): Bool for using layer normalization or not.

    """

    def __init__(self,
                 env_spec,
                 name='GaussianMLPPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        if not isinstance(env_spec.action_space, akro.Box):
            raise ValueError('GaussianMLPPolicy only works with '
                             'akro.Box action space, but not {}'.format(
                                 env_spec.action_space))
        super().__init__(name, env_spec)
        self.obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim

        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._learn_std = learn_std
        self._adaptive_std = adaptive_std
        self._std_share_network = std_share_network
        self._init_std = init_std
        self._min_std = min_std
        self._max_std = max_std
        self._std_hidden_sizes = std_hidden_sizes
        self._std_hidden_nonlinearity = std_hidden_nonlinearity
        self._std_output_nonlinearity = std_output_nonlinearity
        self._std_parameterization = std_parameterization
        self._layer_normalization = layer_normalization

        self._f_dist = None
        self._dist = None

        self.model = GaussianMLPModel(
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

    def build(self, state_input, name=None):
        """Build model.

        Args:
          state_input (tf.Tensor): State input.
          name (str): Name of the model, which is also the name scope.

        """
        with tf.compat.v1.variable_scope(self.name) as vs:
            self._variable_scope = vs
            self._dist = self.model.build(state_input, name=name)
            self._f_dist = tf.compat.v1.get_default_session().make_callable(
                [self._dist.sample(), self._dist.loc,
                 self._dist.stddev()],
                feed_list=[state_input])

    @property
    def vectorized(self):
        """Vectorized or not.

        Returns:
            Bool: True if primitive supports vectorized operations.

        """
        return True

    def get_action(self, observation):
        """Get single action from this policy for the input observation.

        Args:
            observation (numpy.ndarray): Observation from environment.

        Returns:
            numpy.ndarray: Actions
            dict: Predicted action and agent information.

        Note:
            It returns an action and a dict, with keys
            - mean (numpy.ndarray): Mean of the distribution.
            - log_std (numpy.ndarray): Log standard deviation of the
                distribution.

        """
        sample, mean, log_std = self._f_dist(np.expand_dims([observation], 1))
        sample = self.action_space.unflatten(np.squeeze(sample, 1)[0])
        mean = self.action_space.unflatten(np.squeeze(mean, 1)[0])
        log_std = self.action_space.unflatten(np.squeeze(log_std, 1)[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_actions(self, observations):
        """Get multiple actions from this policy for the input observations.

        Args:
            observations (numpy.ndarray): Observations from environment.

        Returns:
            numpy.ndarray: Actions
            dict: Predicted action and agent information.

        Note:
            It returns actions and a dict, with keys
            - mean (numpy.ndarray): Means of the distribution.
            - log_std (numpy.ndarray): Log standard deviations of the
                distribution.

        """
        samples, means, log_stds = self._f_dist(np.expand_dims(
            observations, 1))
        samples = self.action_space.unflatten_n(np.squeeze(samples, 1))
        means = self.action_space.unflatten_n(np.squeeze(means, 1))
        log_stds = self.action_space.unflatten_n(np.squeeze(log_stds, 1))
        return samples, dict(mean=means, log_std=log_stds)

    @property
    def distribution(self):
        """Policy distribution.

        Returns:
            tfp.distributions.MultivariateNormalDiag: Policy distribution.

        """
        return self._dist

    def clone(self, name):
        """Return a clone of the policy.

        It only copies the configuration of the primitive,
        not the parameters.

        Args:
            name (str): Name of the newly created policy. It has to be
                different from source policy if cloned under the same
                computational graph.

        Returns:
            garage.tf.policies.GaussianMLPPolicy: Newly cloned policy.

        """
        return self.__class__(
            name=name,
            env_spec=self._env_spec,
            hidden_sizes=self._hidden_sizes,
            hidden_nonlinearity=self._hidden_nonlinearity,
            hidden_w_init=self._hidden_w_init,
            hidden_b_init=self._hidden_b_init,
            output_nonlinearity=self._output_nonlinearity,
            output_w_init=self._output_w_init,
            output_b_init=self._output_b_init,
            learn_std=self._learn_std,
            adaptive_std=self._adaptive_std,
            std_share_network=self._std_share_network,
            init_std=self._init_std,
            min_std=self._min_std,
            max_std=self._max_std,
            std_hidden_sizes=self._std_hidden_sizes,
            std_hidden_nonlinearity=self._std_hidden_nonlinearity,
            std_output_nonlinearity=self._std_output_nonlinearity,
            std_parameterization=self._std_parameterization,
            layer_normalization=self._layer_normalization)

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: The state to be pickled for the instance.

        """
        new_dict = super().__getstate__()
        del new_dict['_f_dist']
        del new_dict['_dist']
        return new_dict
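A hedged usage sketch for the policy above. The environment wrapper (GymEnv), the environment name, and the exact build-then-initialize order are assumptions that vary across garage versions; the sketch follows the build() and get_action() signatures shown in this example.

import numpy as np
import tensorflow as tf

from garage.envs import GymEnv                 # assumed wrapper; varies by version
from garage.tf.policies import GaussianMLPPolicy

tf.compat.v1.disable_eager_execution()         # the code above assumes graph mode

with tf.compat.v1.Session() as sess:
    env = GymEnv('Pendulum-v1')                # illustrative environment name
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    # build() compiles the sampling callable against a (batch, time, obs)
    # placeholder, matching the expand_dims convention in get_action().
    state_input = tf.compat.v1.placeholder(
        tf.float32, shape=(None, None, policy.obs_dim))
    policy.build(state_input)
    sess.run(tf.compat.v1.global_variables_initializer())

    obs = env.spec.observation_space.sample()  # akro spaces support sample()
    action, info = policy.get_action(obs)      # info holds 'mean' and 'log_std'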
Example #21
    def test_dist(self):
        model = GaussianMLPModel(output_dim=1)
        dist = model.build(self.input_var).dist
        assert isinstance(dist, tfp.distributions.MultivariateNormalDiag)
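The test asserts that the built distribution is a tfp.distributions.MultivariateNormalDiag, which is exactly the interface the policy's build() consumes via sample(), loc, and stddev(). A standalone sketch of that interface with illustrative parameters:

import tensorflow_probability as tfp

# MultivariateNormalDiag exposes the three members used in build():
# sample(), loc, and stddev(). Parameter values here are illustrative.
dist = tfp.distributions.MultivariateNormalDiag(
    loc=[0.0, 0.0], scale_diag=[1.0, 0.5])

sample = dist.sample()   # a draw from the distribution
mean = dist.loc          # the distribution mean
std = dist.stddev()      # per-dimension standard deviation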