def test_adaptive_std_network_output_values(self, output_dim, hidden_sizes,
                                            std_hidden_sizes, mock_normal):
    """Check outputs of an adaptive-std model built with all-ones weights.

    Both the mean network and the separate std network use linear
    activations and ones-initialized weights, so their outputs are compared
    against closed-form values.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the mean network.
        std_hidden_sizes (tuple[int]): Hidden layer sizes of the std network.
        mock_normal (unittest.mock.Mock): Patched sampler; its return value
            is pinned to 0.5 so the sampled action is deterministic. The
            patch target is set by a decorator not visible here.

    """
    mock_normal.return_value = 0.5

    model_kwargs = dict(output_dim=output_dim,
                        std_share_network=False,
                        hidden_sizes=hidden_sizes,
                        std_hidden_sizes=std_hidden_sizes,
                        adaptive_std=True,
                        hidden_nonlinearity=None,
                        hidden_w_init=tf.ones_initializer(),
                        output_w_init=tf.ones_initializer(),
                        std_hidden_nonlinearity=None,
                        std_hidden_w_init=tf.ones_initializer(),
                        std_output_w_init=tf.ones_initializer())
    model = GaussianMLPModel(**model_kwargs)
    model.build(self.input_var)

    # The last network output is skipped; only the first four are fetched.
    fetches = model.networks['default'].outputs[:-1]
    action, mean, log_std, std_param = self.sess.run(
        fetches, feed_dict={self.input_var: self.obs})

    # NOTE(review): the factor 5 presumably comes from self.obs being five
    # ones; the fixture is not visible here.
    out_shape = [1, output_dim]
    expected_mean = np.full(out_shape, 5 * np.prod(hidden_sizes))
    expected_std_param = np.full(out_shape, 5 * np.prod(std_hidden_sizes))
    expected_log_std = np.full(out_shape, 5 * np.prod(std_hidden_sizes))

    assert np.array_equal(mean, expected_mean)
    assert np.array_equal(std_param, expected_std_param)
    assert np.array_equal(log_std, expected_log_std)

    # action = noise * std + mean, with noise pinned to 0.5 above.
    expected_action = 0.5 * np.exp(expected_log_std) + expected_mean
    assert np.allclose(action, expected_action)
def test_std_share_network_shapes(self, output_dim, hidden_sizes):
    """Verify output-layer variable shapes when mean and std share a network.

    The shared output layer's kernel and bias should both be
    ``2 * output_dim`` wide.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the shared network.

    """
    expected_width = output_dim * 2

    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             std_share_network=True)
    model.build(self.input_var)

    # Re-enter the model's scope with reuse=True to look up the variables
    # that build() created.
    with tf.variable_scope(model.name, reuse=True):
        shared_kernel = tf.get_variable(
            'dist_params/mean_std_network/output/kernel')
        shared_bias = tf.get_variable(
            'dist_params/mean_std_network/output/bias')

    assert shared_kernel.shape[1] == expected_width
    assert shared_bias.shape == expected_width
def test_adaptive_std_is_pickleable(self, mock_normal, output_dim,
                                    hidden_sizes, std_hidden_sizes):
    """Check an adaptive-std model yields the same outputs after pickling.

    The mean network's output bias is overwritten with ones before the
    model is pickled; the unpickled model, rebuilt in a fresh graph and
    session, must produce outputs equal to the original's.

    Args:
        mock_normal (unittest.mock.Mock): Patched sampler, pinned to 0.5.
            The patch target is set by a decorator not visible here.
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the mean network.
        std_hidden_sizes (tuple[int]): Hidden layer sizes of the std network.

    """
    mock_normal.return_value = 0.5

    source_input = tf.placeholder(tf.float32, shape=(None, 5))
    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             std_hidden_sizes=std_hidden_sizes,
                             std_share_network=False,
                             adaptive_std=True,
                             hidden_nonlinearity=None,
                             hidden_w_init=tf.ones_initializer(),
                             output_w_init=tf.ones_initializer(),
                             std_hidden_nonlinearity=None,
                             std_hidden_w_init=tf.ones_initializer(),
                             std_output_w_init=tf.ones_initializer())
    source_outputs = model.build(source_input)

    # Fetch the mean network's output bias and set it to all ones.
    with tf.variable_scope('GaussianMLPModel', reuse=True):
        bias = tf.get_variable('dist_params/mean_network/output/bias')
    ones_value = tf.ones_like(bias).eval()
    bias.load(ones_value)

    serialized = pickle.dumps(model)
    output1 = self.sess.run(source_outputs[:-1],
                            feed_dict={source_input: self.obs})

    # Rebuild the unpickled model in an isolated graph and session.
    with tf.Session(graph=tf.Graph()) as sess:
        restored_input = tf.placeholder(tf.float32, shape=(None, 5))
        model_pickled = pickle.loads(serialized)
        restored_outputs = model_pickled.build(restored_input)
        output2 = sess.run(restored_outputs[:-1],
                           feed_dict={restored_input: self.obs})
        assert np.array_equal(output1, output2)
def test_softplus_output_values(self, output_dim, hidden_sizes, mock_normal):
    """Check model outputs under the softplus std parameterization.

    The model uses linear activations, ones-initialized weights, a
    non-adaptive and non-shared std, and ``init_std=2``.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the mean network.
        mock_normal (unittest.mock.Mock): Patched sampler, pinned to 0.5 so
            the sampled action is deterministic. The patch target is set by
            a decorator not visible here.

    """
    mock_normal.return_value = 0.5
    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             hidden_nonlinearity=None,
                             std_share_network=False,
                             adaptive_std=False,
                             init_std=2,
                             std_parameterization='softplus',
                             hidden_w_init=tf.ones_initializer(),
                             output_w_init=tf.ones_initializer())
    outputs = model.build(self.input_var)

    action, mean, log_std, std_param = self.sess.run(
        outputs[:-1], feed_dict={self.input_var: self.obs})

    # NOTE(review): the factor 5 presumably comes from self.obs being five
    # ones; the fixture is not visible here.
    expected_mean = np.full([1, output_dim], 5 * np.prod(hidden_sizes))
    # Inverse softplus of init_std, i.e. softplus(std_param) == 2.
    expected_std_param = np.full([1, output_dim], np.log(np.exp(2) - 1))
    # Under softplus, std = log(1 + exp(std_param)); log_std is the
    # logarithm of that std (log(2) here), not the std itself.
    expected_log_std = np.log(np.log(1. + np.exp(expected_std_param)))
    assert np.array_equal(mean, expected_mean)
    assert np.allclose(std_param, expected_std_param)
    assert np.allclose(log_std, expected_log_std)

    # action = noise * std + mean, with noise pinned to 0.5 above.
    expected_action = 0.5 * np.exp(expected_log_std) + expected_mean
    assert np.allclose(action, expected_action)
def test_adaptive_std_is_pickleable(self, output_dim, hidden_sizes,
                                    std_hidden_sizes):
    """Check an adaptive-std model yields the same distribution after pickling.

    The mean network's output bias is overwritten with ones before the
    model is pickled; the unpickled model, rebuilt in a fresh graph and
    session, must produce the same mean and log-std as the original.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the mean network.
        std_hidden_sizes (tuple[int]): Hidden layer sizes of the std network.

    """
    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             std_hidden_sizes=std_hidden_sizes,
                             std_share_network=False,
                             adaptive_std=True,
                             hidden_nonlinearity=None,
                             hidden_w_init=tf.ones_initializer(),
                             output_w_init=tf.ones_initializer(),
                             std_hidden_nonlinearity=None,
                             std_hidden_w_init=tf.ones_initializer(),
                             std_output_w_init=tf.ones_initializer())
    source_dist = model.build(self.input_var).dist

    # Fetch the mean network's output bias and set it to all ones.
    with tf.compat.v1.variable_scope('GaussianMLPModel', reuse=True):
        bias = tf.compat.v1.get_variable(
            'dist_params/mean_network/output/bias')
    ones_value = tf.ones_like(bias).eval()
    bias.load(ones_value)

    serialized = pickle.dumps(model)
    source_fetches = [source_dist.loc, tf.math.log(source_dist.stddev())]
    output1 = self.sess.run(source_fetches,
                            feed_dict={self.input_var: self.obs})

    # Rebuild the unpickled model in an isolated graph and session.
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        restored_input = tf.compat.v1.placeholder(tf.float32,
                                                  shape=(None, None, 5))
        model_pickled = pickle.loads(serialized)
        restored_dist = model_pickled.build(restored_input).dist
        restored_fetches = [
            restored_dist.loc,
            tf.math.log(restored_dist.stddev())
        ]
        output2 = sess.run(restored_fetches,
                           feed_dict={restored_input: self.obs})
        assert np.array_equal(output1, output2)
def test_without_std_share_network_shapes(self, output_dim, hidden_sizes):
    """Verify variable shapes when std is a standalone parameter.

    With ``std_share_network=False`` and ``adaptive_std=False`` the mean
    network's output layer and the log-std parameter are all expected to be
    ``output_dim`` wide.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the mean network.

    """
    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             std_share_network=False,
                             adaptive_std=False)
    model.build(self.input_var)

    # Re-enter the model's scope with reuse=True to look up the variables
    # that build() created.
    with tf.variable_scope(model.name, reuse=True):
        mean_kernel = tf.get_variable(
            'dist_params/mean_network/output/kernel')
        mean_bias = tf.get_variable('dist_params/mean_network/output/bias')
        log_std_param = tf.get_variable(
            'dist_params/log_std_network/parameter')

    assert mean_kernel.shape[1] == output_dim
    assert mean_bias.shape == output_dim
    assert log_std_param.shape == output_dim
def test_exp_max_std(self, output_dim, hidden_sizes):
    """Check ``max_std`` caps the std under the exp parameterization.

    The model starts from ``init_std=10`` with ``max_std=1``, so the
    resulting log-std is expected to be ``log(1)``.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the mean network.

    """
    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             std_share_network=False,
                             init_std=10,
                             max_std=1,
                             std_parameterization='exp')
    distribution = model.build(self.input_var).dist

    log_std_op = tf.math.log(distribution.stddev())
    log_std = self.sess.run(log_std_op,
                            feed_dict={self.input_var: self.obs})

    expected_log_std = np.full([1, 1, output_dim], np.log(1))
    assert np.allclose(log_std, expected_log_std)
def test_exp_max_std(self, output_dim, hidden_sizes):
    """Check ``max_std`` caps log-std but not the raw std parameter.

    The model starts from ``init_std=10`` with ``max_std=1`` under the exp
    parameterization: the reported log-std should be ``log(1)`` while the
    underlying std parameter should stay at ``log(10)``.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the mean network.

    """
    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             std_share_network=False,
                             init_std=10,
                             max_std=1,
                             std_parameterization='exp')
    network_outputs = model.build(self.input_var)

    # The last output is skipped; action and mean are fetched but unused.
    action, mean, log_std, std_param = self.sess.run(
        network_outputs[:-1], feed_dict={self.input_var: self.obs})

    out_shape = [1, output_dim]
    expected_log_std = np.full(out_shape, np.log(1))
    expected_std_param = np.full(out_shape, np.log(10))

    assert np.allclose(log_std, expected_log_std)
    assert np.allclose(std_param, expected_std_param)
def test_softplus_max_std(self, output_dim, hidden_sizes):
    """Check ``max_std`` caps the std under the softplus parameterization.

    The model starts from ``init_std=10`` with ``max_std=1``, so the
    resulting log-std is expected to be ``log(1)``.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the mean network.

    """
    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             std_share_network=False,
                             init_std=10,
                             max_std=1,
                             std_parameterization='softplus')
    distribution = model.build(self.input_var).dist

    log_std_op = tf.math.log(distribution.stddev())
    log_std = self.sess.run(log_std_op,
                            feed_dict={self.input_var: self.obs})

    expected_log_std = np.full([1, 1, output_dim], np.log(1))
    # This test fails just outside of the default absolute tolerance,
    # hence the explicit atol.
    assert np.allclose(log_std, expected_log_std, atol=1e-7)
def test_std_share_network_is_pickleable(self, output_dim, hidden_sizes,
                                         mock_normal):
    """Check a shared mean/std model yields the same outputs after pickling.

    The model is round-tripped through pickle, rebuilt in a fresh graph and
    session, and its outputs are compared with the original model's.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the shared network.
        mock_normal (unittest.mock.Mock): Patched sampler, pinned to 0.5.
            The patch target is set by a decorator not visible here.

    """
    mock_normal.return_value = 0.5

    source_input = tf.placeholder(tf.float32, shape=(None, 5))
    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             std_share_network=True,
                             hidden_nonlinearity=None,
                             hidden_w_init=tf.ones_initializer(),
                             output_w_init=tf.ones_initializer())
    source_outputs = model.build(source_input)

    output1 = self.sess.run(source_outputs[:-1],
                            feed_dict={source_input: self.obs})

    # Rebuild the unpickled model in an isolated graph and session.
    with tf.Session(graph=tf.Graph()) as sess:
        restored_input = tf.placeholder(tf.float32, shape=(None, 5))
        serialized = pickle.dumps(model)
        model_pickled = pickle.loads(serialized)
        restored_outputs = model_pickled.build(restored_input)
        output2 = sess.run(restored_outputs[:-1],
                           feed_dict={restored_input: self.obs})
        assert np.array_equal(output1, output2)
def test_std_share_network_output_values(self, output_dim, hidden_sizes):
    """Check mean and log-std of a shared-network model with ones weights.

    With linear activations, ones-initialized weights and the exp
    parameterization, mean and log-std are expected to carry the same value.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the shared network.

    """
    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             std_share_network=True,
                             hidden_nonlinearity=None,
                             std_parameterization='exp',
                             hidden_w_init=tf.ones_initializer(),
                             output_w_init=tf.ones_initializer())
    distribution = model.build(self.input_var).dist

    fetches = [distribution.loc, tf.math.log(distribution.stddev())]
    mean, log_std = self.sess.run(fetches,
                                  feed_dict={self.input_var: self.obs})

    # NOTE(review): the factor 5 presumably comes from self.obs being five
    # ones; the fixture is not visible here.
    out_shape = [1, 1, output_dim]
    expected_mean = np.full(out_shape, 5 * np.prod(hidden_sizes))
    expected_log_std = np.full(out_shape, 5 * np.prod(hidden_sizes))

    assert np.array_equal(mean, expected_mean)
    assert np.array_equal(log_std, expected_log_std)
def test_without_std_share_network_output_values(self, output_dim,
                                                 hidden_sizes):
    """Check mean and log-std when std is a standalone parameter.

    The model uses linear activations, ones-initialized weights and
    ``init_std=2`` with a non-adaptive, non-shared std, so the log-std is
    expected to equal ``log(2)``.

    Args:
        output_dim (int): Output dimension of the model.
        hidden_sizes (tuple[int]): Hidden layer sizes of the mean network.

    """
    model = GaussianMLPModel(output_dim=output_dim,
                             hidden_sizes=hidden_sizes,
                             init_std=2,
                             std_share_network=False,
                             adaptive_std=False,
                             hidden_nonlinearity=None,
                             hidden_w_init=tf.ones_initializer(),
                             output_w_init=tf.ones_initializer())
    # build() returns the network object; the distribution lives on its
    # `dist` attribute, as in the sibling tests that read loc/stddev.
    dist = model.build(self.input_var).dist
    mean, log_std = self.sess.run(
        [dist.loc, tf.math.log(dist.stddev())],
        feed_dict={self.input_var: self.obs})

    # NOTE(review): the factor 5 presumably comes from self.obs being five
    # ones; the fixture is not visible here.
    expected_mean = np.full([1, 1, output_dim], 5 * np.prod(hidden_sizes))
    expected_log_std = np.full([1, 1, output_dim], np.log(2.))
    assert np.array_equal(mean, expected_mean)
    assert np.allclose(log_std, expected_log_std)
class GaussianMLPEncoder(StochasticEncoder, StochasticModule):
    """GaussianMLPEncoder with GaussianMLPModel.

    An embedding that contains an MLP to make predictions based on
    a Gaussian distribution.

    The constructor builds the network immediately (see `_initialize`),
    which requires a default TensorFlow session to be active.

    Args:
        embedding_spec (garage.InOutSpec): Encoder specification.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for mean. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        hidden_nonlinearity (callable): Activation function for intermediate
            dense layer(s). It should return a tf.Tensor. Set it to
            None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output dense
            layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight
            of output dense layer(s). The function should return a
            tf.Tensor.
        output_b_init (callable): Initializer function for the bias
            of output dense layer(s). The function should return a
            tf.Tensor.
        learn_std (bool): Is std trainable.
        adaptive_std (bool): Is std a neural network. If False, it will be a
            parameter.
        std_share_network (bool): Boolean for whether mean and std share
            the same network.
        init_std (float): Initial value for std.
        min_std (float): If not None, the std is at least the value of
            min_std, to avoid numerical issues.
        max_std (float): If not None, the std is at most the value of
            max_std, to avoid numerical issues.
        std_hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for std. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        std_hidden_nonlinearity (callable): Nonlinearity for each hidden layer
            in the std network. It should return a tf.Tensor. Set it to None
            to maintain a linear activation.
        std_output_nonlinearity (callable): Nonlinearity for output layer in
            the std network. It should return a tf.Tensor. Set it to None
            to maintain a linear activation.
        std_parameterization (str): How the std should be parametrized. There
            are a few options:
            - exp: the logarithm of the std is stored, and an exponential
                transformation is applied to it
            - softplus: the std is computed as log(1+exp(x))
        layer_normalization (bool): Bool for using layer normalization or not.

    """

    def __init__(self,
                 embedding_spec,
                 name='GaussianMLPEncoder',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        super().__init__(name)
        self._embedding_spec = embedding_spec
        # Hyperparameters are kept on the instance so that clone() can
        # construct an identically configured encoder.
        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._learn_std = learn_std
        self._adaptive_std = adaptive_std
        self._std_share_network = std_share_network
        self._init_std = init_std
        self._min_std = min_std
        self._max_std = max_std
        self._std_hidden_sizes = std_hidden_sizes
        self._std_hidden_nonlinearity = std_hidden_nonlinearity
        self._std_output_nonlinearity = std_output_nonlinearity
        self._std_parameterization = std_parameterization
        self._layer_normalization = layer_normalization

        self._latent_dim = embedding_spec.output_space.flat_dim
        self._input_dim = embedding_spec.input_space.flat_dim
        # Both are populated by _initialize() and dropped when pickling.
        self._network = None
        self._f_dist = None

        self.model = GaussianMLPModel(
            output_dim=self._latent_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

        self._initialize()

    def _initialize(self):
        """Initialize encoder.

        Creates the input placeholder, builds the model under this encoder's
        variable scope, and compiles a callable on the default session that
        returns a sample, the mean and the log-std for a fed input.
        """
        # Rank-3 placeholder: two unspecified leading axes, then input_dim.
        embedding_input = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, None,
                                                          self._input_dim),
                                                   name='default_encoder')

        with tf.compat.v1.variable_scope(self._name) as vs:
            self._variable_scope = vs
            self._network = self.model.build(embedding_input)

        # Relies on a default session being set; get_default_session()
        # would otherwise return None and this call would fail.
        self._f_dist = tf.compat.v1.get_default_session().make_callable(
            [
                self._network.dist.sample(), self._network.mean,
                self._network.log_std
            ],
            feed_list=[embedding_input])

    def build(self, embedding_input, name=None):
        """Build encoder.

        Args:
            embedding_input (tf.Tensor) : Embedding input.
            name (str): Name of the model, which is also the name scope.

        Returns:
            object: Whatever `GaussianMLPModel.build` returns for this
                input. Elsewhere in this class that result is read through
                its `dist`, `mean`, `log_std` and `input` attributes.

        """
        with tf.compat.v1.variable_scope(self._variable_scope):
            return self.model.build(embedding_input, name=name)

    @property
    def spec(self):
        """garage.InOutSpec: Specification of input and output."""
        return self._embedding_spec

    @property
    def input_dim(self):
        """int: Dimension of the encoder input."""
        return self._embedding_spec.input_space.flat_dim

    @property
    def output_dim(self):
        """int: Dimension of the encoder output (embedding)."""
        return self._embedding_spec.output_space.flat_dim

    @property
    def vectorized(self):
        """bool: If this module supports vectorization input."""
        return True

    def get_latent(self, input_value):
        """Get a sample of embedding for the given input.

        Args:
            input_value (numpy.ndarray): Tensor to encode.

        Returns:
            numpy.ndarray: An embedding sampled from embedding distribution.
            dict: Embedding distribution information.

        Note:
            It returns an embedding and a dict, with keys
            - mean (numpy.ndarray): Mean of the distribution.
            - log_std (numpy.ndarray): Log standard deviation of the
                distribution.

        """
        flat_input = self._embedding_spec.input_space.flatten(input_value)
        # Wrap the single input to rank 3 (1, 1, dim) to match the
        # placeholder created in _initialize().
        sample, mean, log_std = self._f_dist(np.expand_dims([flat_input], 1))
        # Drop axis 1 again and take the only row.
        sample = self._embedding_spec.output_space.unflatten(
            np.squeeze(sample, 1)[0])
        mean = self._embedding_spec.output_space.unflatten(
            np.squeeze(mean, 1)[0])
        log_std = self._embedding_spec.output_space.unflatten(
            np.squeeze(log_std, 1)[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_latents(self, input_values):
        """Get samples of embedding for the given inputs.

        Args:
            input_values (numpy.ndarray): Tensors to encode.

        Returns:
            numpy.ndarray: Embeddings sampled from embedding distribution.
            dict: Embedding distribution information.

        Note:
            It returns an embedding and a dict, with keys
            - mean (list[numpy.ndarray]): Means of the distribution.
            - log_std (list[numpy.ndarray]): Log standard deviations of the
                distribution.

        """
        flat_input = self._embedding_spec.input_space.flatten_n(input_values)
        # Insert axis 1 so the batch matches the rank-3 placeholder.
        samples, means, log_stds = self._f_dist(np.expand_dims(flat_input, 1))
        samples = self._embedding_spec.output_space.unflatten_n(
            np.squeeze(samples, 1))
        means = self._embedding_spec.output_space.unflatten_n(
            np.squeeze(means, 1))
        log_stds = self._embedding_spec.output_space.unflatten_n(
            np.squeeze(log_stds, 1))
        return samples, dict(mean=means, log_std=log_stds)

    @property
    def distribution(self):
        """Encoder distribution.

        Returns:
            tfp.Distribution.MultivariateNormalDiag: Encoder distribution.

        """
        return self._network.dist

    @property
    def input(self):
        """tf.Tensor: Input to encoder network."""
        return self._network.input

    @property
    def latent_mean(self):
        """tf.Tensor: Predicted mean of a Gaussian distribution."""
        return self._network.mean

    @property
    def latent_std_param(self):
        """tf.Tensor: Predicted log std of a Gaussian distribution."""
        return self._network.log_std

    def clone(self, name):
        """Return a clone of the encoder.

        The clone is constructed with the same hyperparameters; this method
        itself does not copy any parameter values.

        Args:
            name (str): Name of the newly created encoder. It has to be
                different from source encoder if cloned under the same
                computational graph.

        Returns:
            GaussianMLPEncoder: Newly cloned encoder (an instance of
                this object's own class).

        """
        new_encoder = self.__class__(
            embedding_spec=self._embedding_spec,
            name=name,
            hidden_sizes=self._hidden_sizes,
            hidden_nonlinearity=self._hidden_nonlinearity,
            hidden_w_init=self._hidden_w_init,
            hidden_b_init=self._hidden_b_init,
            output_nonlinearity=self._output_nonlinearity,
            output_w_init=self._output_w_init,
            output_b_init=self._output_b_init,
            learn_std=self._learn_std,
            adaptive_std=self._adaptive_std,
            std_share_network=self._std_share_network,
            init_std=self._init_std,
            min_std=self._min_std,
            max_std=self._max_std,
            std_hidden_sizes=self._std_hidden_sizes,
            std_hidden_nonlinearity=self._std_hidden_nonlinearity,
            std_output_nonlinearity=self._std_output_nonlinearity,
            std_parameterization=self._std_parameterization,
            layer_normalization=self._layer_normalization)
        return new_encoder

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: the state to be pickled for the instance.

        """
        new_dict = super().__getstate__()
        # The compiled callable and built network are recreated by
        # _initialize() in __setstate__, so they are left out of the state.
        del new_dict['_f_dist']
        del new_dict['_network']
        return new_dict

    def __setstate__(self, state):
        """Parameters to restore from snapshot.

        Args:
            state (dict): Parameters to restore from.

        """
        super().__setstate__(state)
        # Rebuild the network and the callable dropped by __getstate__.
        self._initialize()
class GaussianMLPTaskEmbeddingPolicy(TaskEmbeddingPolicy):
    """GaussianMLPTaskEmbeddingPolicy.

    The constructor builds the network immediately (see `_initialize`),
    which requires a default TensorFlow session to be active.

    Args:
        env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
        encoder (garage.tf.embeddings.StochasticEncoder): Embedding network.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for mean. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        hidden_nonlinearity (callable): Activation function for intermediate
            dense layer(s). It should return a tf.Tensor. Set it to
            None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output dense
            layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight
            of output dense layer(s). The function should return a
            tf.Tensor.
        output_b_init (callable): Initializer function for the bias
            of output dense layer(s). The function should return a
            tf.Tensor.
        learn_std (bool): Is std trainable.
        adaptive_std (bool): Is std a neural network. If False, it will be a
            parameter.
        std_share_network (bool): Boolean for whether mean and std share
            the same network.
        init_std (float): Initial value for std.
        min_std (float): If not None, the std is at least the value of
            min_std, to avoid numerical issues.
        max_std (float): If not None, the std is at most the value of
            max_std, to avoid numerical issues.
        std_hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for std. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        std_hidden_nonlinearity (callable): Nonlinearity for each hidden layer
            in the std network. It should return a tf.Tensor. Set it to None
            to maintain a linear activation.
        std_output_nonlinearity (callable): Nonlinearity for output layer in
            the std network. It should return a tf.Tensor. Set it to None
            to maintain a linear activation.
        std_parameterization (str): How the std should be parametrized. There
            are a few options:
            - exp: the logarithm of the std is stored, and an exponential
                transformation is applied to it
            - softplus: the std is computed as log(1+exp(x))
        layer_normalization (bool): Bool for using layer normalization or not.

    """

    def __init__(self,
                 env_spec,
                 encoder,
                 name='GaussianMLPTaskEmbeddingPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        # Only continuous (Box) action spaces are accepted.
        assert isinstance(env_spec.action_space, akro.Box)
        super().__init__(name, env_spec, encoder)
        self._obs_dim = env_spec.observation_space.flat_dim
        self._action_dim = env_spec.action_space.flat_dim
        # Populated by _initialize() and dropped when pickling.
        self._dist = None

        self.model = GaussianMLPModel(
            output_dim=self._action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

        self._initialize()

    def _initialize(self):
        """Initialize policy.

        Builds the model twice under the policy's variable scope: once on
        (observation, latent placeholder) and once on (observation, latent
        sampled from the encoder). A session callable is compiled for each.
        """
        # Rank-3 placeholders: two unspecified leading axes, then features.
        obs_input = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, None, self._obs_dim))
        latent_input = tf.compat.v1.placeholder(
            tf.float32, shape=(None, None, self._encoder.output_dim))

        # Encoder should be outside policy scope, so the latent is sampled
        # before entering the policy's variable scope.
        with tf.compat.v1.variable_scope('concat_obs_task'):
            latent_var = self._encoder.distribution.sample()

        with tf.compat.v1.variable_scope(self.name) as vs:
            self._variable_scope = vs

            # Branch 1: the latent is supplied directly by the caller.
            with tf.compat.v1.variable_scope('concat_obs_latent'):
                obs_latent_input = tf.concat([obs_input, latent_input], -1)
            self._dist, mean_var, log_std_var = self.model.build(
                obs_latent_input, name='given_latent').outputs

            # Branch 2: the latent comes from the encoder's distribution.
            with tf.compat.v1.variable_scope('concat_obs_latent_var'):
                embed_state_input = tf.concat([obs_input, latent_var], -1)
            dist_given_task, mean_g_t, log_std_g_t = self.model.build(
                embed_state_input, name='given_task').outputs

        # Both callables rely on a default session being set;
        # get_default_session() would otherwise return None.
        self._f_dist_obs_latent = tf.compat.v1.get_default_session(
        ).make_callable([self._dist.sample(), mean_var, log_std_var],
                        feed_list=[obs_input, latent_input])
        # Fed with observations and the encoder's own input.
        self._f_dist_obs_task = tf.compat.v1.get_default_session(
        ).make_callable([dist_given_task.sample(), mean_g_t, log_std_g_t],
                        feed_list=[obs_input, self._encoder.input])

    @property
    def distribution(self):
        """Policy action distribution.

        Returns:
            object: Distribution built for the (observation, latent) input.
                NOTE(review): presumably a
                tfp.distributions.MultivariateNormalDiag, as documented for
                the encoder -- confirm against GaussianMLPModel.

        """
        return self._dist

    def get_action(self, observation):
        """Get action sampled from the policy.

        Args:
            observation (np.ndarray): Augmented observation from the
                environment, with shape :math:`(O+N, )`. O is the dimension
                of observation, N is the number of tasks.

        Returns:
            np.ndarray: Action sampled from the policy,
                with shape :math:`(A, )`. A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(A, )`. A is the dimension of
                    action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(A, )`.
                    A is the dimension of action.

        """
        obs, task = self.split_augmented_observation(observation)
        return self.get_action_given_task(obs, task)

    def get_actions(self, observations):
        """Get actions sampled from the policy.

        Args:
            observations (np.ndarray): Augmented observation from the
                environment, with shape :math:`(T, O+N)`. T is the number of
                environment steps, O is the dimension of observation, N is the
                number of tasks.

        Returns:
            np.ndarray: Actions sampled from the policy,
                with shape :math:`(T, A)`. T is the number of environment
                steps, A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(T, A)`. T is the number of environment
                    steps, A is the dimension of action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(T, A)`. T is the number of
                    environment steps, A is the dimension of action.

        """
        # Split every augmented observation, then regroup into two tuples.
        obses, tasks = zip(*[
            self.split_augmented_observation(aug_obs)
            for aug_obs in observations
        ])
        return self.get_actions_given_tasks(np.array(obses), np.array(tasks))

    def get_action_given_latent(self, observation, latent):
        """Sample an action given observation and latent.

        Args:
            observation (np.ndarray): Observation from the environment,
                with shape :math:`(O, )`. O is the dimension of observation.
            latent (np.ndarray): Latent, with shape :math:`(Z, )`. Z is the
                dimension of the latent embedding.

        Returns:
            np.ndarray: Action sampled from the policy,
                with shape :math:`(A, )`. A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(A, )`. A is the dimension of action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(A, )`. A is the dimension
                    of action.

        """
        flat_obs = self.observation_space.flatten(observation)
        # Wrap each single input to rank 3 (1, 1, dim) to match the
        # placeholders created in _initialize().
        flat_obs = np.expand_dims([flat_obs], 1)
        flat_latent = self.latent_space.flatten(latent)
        flat_latent = np.expand_dims([flat_latent], 1)

        sample, mean, log_std = self._f_dist_obs_latent(flat_obs, flat_latent)
        # Drop axis 1 again and take the only row.
        sample = self.action_space.unflatten(np.squeeze(sample, 1)[0])
        mean = self.action_space.unflatten(np.squeeze(mean, 1)[0])
        log_std = self.action_space.unflatten(np.squeeze(log_std, 1)[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_actions_given_latents(self, observations, latents):
        """Sample a batch of actions given observations and latents.

        Args:
            observations (np.ndarray): Observations from the environment, with
                shape :math:`(T, O)`. T is the number of environment steps, O
                is the dimension of observation.
            latents (np.ndarray): Latents, with shape :math:`(T, Z)`. T is the
                number of environment steps, Z is the dimension of
                latent embedding.

        Returns:
            np.ndarray: Actions sampled from the policy,
                with shape :math:`(T, A)`. T is the number of environment
                steps, A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(T, A)`. T is the number of
                    environment steps. A is the dimension of action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(T, A)`. T is the number of
                    environment steps. A is the dimension of action.

        """
        flat_obses = self.observation_space.flatten_n(observations)
        # Insert axis 1 so each batch matches the rank-3 placeholders.
        flat_obses = np.expand_dims(flat_obses, 1)
        flat_latents = self.latent_space.flatten_n(latents)
        flat_latents = np.expand_dims(flat_latents, 1)

        samples, means, log_stds = self._f_dist_obs_latent(
            flat_obses, flat_latents)
        samples = self.action_space.unflatten_n(np.squeeze(samples, 1))
        means = self.action_space.unflatten_n(np.squeeze(means, 1))
        log_stds = self.action_space.unflatten_n(np.squeeze(log_stds, 1))
        return samples, dict(mean=means, log_std=log_stds)

    def get_action_given_task(self, observation, task_id):
        """Sample an action given observation and task id.

        Args:
            observation (np.ndarray): Observation from the environment, with
                shape :math:`(O, )`. O is the dimension of the observation.
            task_id (np.ndarray): One-hot task id, with shape :math:`(N, )`.
                N is the number of tasks.

        Returns:
            np.ndarray: Action sampled from the policy, with shape
                :math:`(A, )`. A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(A, )`. A is the dimension of action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(A, )`. A is the dimension
                    of action.

        """
        flat_obs = self.observation_space.flatten(observation)
        # Wrap each single input to rank 3 (1, 1, dim). The task id is
        # passed through as given; it is not flattened here.
        flat_obs = np.expand_dims([flat_obs], 1)
        task_id = np.expand_dims([task_id], 1)

        sample, mean, log_std = self._f_dist_obs_task(flat_obs, task_id)
        # Drop axis 1 again and take the only row.
        sample = self.action_space.unflatten(np.squeeze(sample, 1)[0])
        mean = self.action_space.unflatten(np.squeeze(mean, 1)[0])
        log_std = self.action_space.unflatten(np.squeeze(log_std, 1)[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_actions_given_tasks(self, observations, task_ids):
        """Sample a batch of actions given observations and task ids.

        Args:
            observations (np.ndarray): Observations from the environment, with
                shape :math:`(T, O)`. T is the number of environment steps,
                O is the dimension of observation.
            task_ids (np.ndarray): One-hot task ids, with shape
                :math:`(T, N)`. T is the number of environment steps, N is
                the number of tasks.

        Returns:
            np.ndarray: Actions sampled from the policy,
                with shape :math:`(T, A)`. T is the number of environment
                steps, A is the dimension of action.
            dict: Action distribution information, with keys:
                - mean (numpy.ndarray): Mean of the distribution,
                    with shape :math:`(T, A)`. T is the number of
                    environment steps. A is the dimension of action.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution, with shape :math:`(T, A)`. T is the number of
                    environment steps. A is the dimension of action.

        """
        flat_obses = self.observation_space.flatten_n(observations)
        # Insert axis 1 so each batch matches the rank-3 placeholders. The
        # task ids are passed through as given; they are not flattened here.
        flat_obses = np.expand_dims(flat_obses, 1)
        task_ids = np.expand_dims(task_ids, 1)

        samples, means, log_stds = self._f_dist_obs_task(flat_obses, task_ids)
        samples = self.action_space.unflatten_n(np.squeeze(samples, 1))
        means = self.action_space.unflatten_n(np.squeeze(means, 1))
        log_stds = self.action_space.unflatten_n(np.squeeze(log_stds, 1))
        return samples, dict(mean=means, log_std=log_stds)

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: The state to be pickled for the instance.

        """
        new_dict = super().__getstate__()
        # The compiled callables and the built distribution are recreated
        # by _initialize() in __setstate__, so they are left out of the state.
        del new_dict['_f_dist_obs_latent']
        del new_dict['_f_dist_obs_task']
        del new_dict['_dist']
        return new_dict

    def __setstate__(self, state):
        """Object.__setstate__.

        Args:
            state (dict): Unpickled state.

        """
        super().__setstate__(state)
        # Rebuild the graph pieces dropped by __getstate__.
        self._initialize()
class GaussianMLPEncoder(StochasticEncoder, StochasticModule):
    """Stochastic encoder whose embedding distribution is a GaussianMLPModel.

    The encoder maps a flattened input to the mean and log standard
    deviation produced by the wrapped model, and samples embeddings from the
    corresponding Gaussian.

    Args:
        embedding_spec (garage.InOutSpec): Encoder specification.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Widths of the dense layers of the mean
            MLP. For example, (32, 32) gives two hidden layers with 32
            units each.
        hidden_nonlinearity (callable): Activation for the intermediate
            dense layers. It should return a tf.Tensor. None keeps the
            layers linear.
        hidden_w_init (callable): Weight initializer for the intermediate
            dense layers. It should return a tf.Tensor.
        hidden_b_init (callable): Bias initializer for the intermediate
            dense layers. It should return a tf.Tensor.
        output_nonlinearity (callable): Activation for the output dense
            layer. It should return a tf.Tensor. None keeps it linear.
        output_w_init (callable): Weight initializer for the output dense
            layer. It should return a tf.Tensor.
        output_b_init (callable): Bias initializer for the output dense
            layer. It should return a tf.Tensor.
        learn_std (bool): Whether std is trainable.
        adaptive_std (bool): Whether std is produced by a neural network.
            If False, it is a parameter.
        std_share_network (bool): Whether mean and std share one network.
        init_std (float): Initial value for std.
        std_hidden_sizes (list[int]): Widths of the dense layers of the std
            MLP. For example, (32, 32) gives two hidden layers with 32
            units each.
        min_std (float): If not None, lower bound applied to std to avoid
            numerical issues.
        max_std (float): If not None, upper bound applied to std to avoid
            numerical issues.
        std_hidden_nonlinearity (callable): Activation for each hidden
            layer of the std network. It should return a tf.Tensor. None
            keeps the layers linear.
        std_output_nonlinearity (callable): Activation for the output layer
            of the std network. It should return a tf.Tensor. None keeps it
            linear.
        std_parameterization (str): How std is parameterized:
            - exp: the logarithm of std is stored and an exponential
                transformation is applied.
            - softplus: std is computed as log(1+exp(x)).
        layer_normalization (bool): Whether to use layer normalization.

    """

    def __init__(self,
                 embedding_spec,
                 name='GaussianMLPEncoder',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        super().__init__(name)
        self._embedding_spec = embedding_spec
        self._latent_dim = embedding_spec.output_space.flat_dim
        self._input_dim = embedding_spec.input_space.flat_dim

        # Every constructor option is forwarded to the model unchanged.
        model_config = dict(
            output_dim=self._latent_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel',
        )
        self.model = GaussianMLPModel(**model_config)
        self._initialize()

    def _initialize(self):
        """Build the default network and its (mean, log_std) callable."""
        placeholder = tf.compat.v1.placeholder(
            tf.float32, shape=(None, self._input_dim))

        with tf.compat.v1.variable_scope(self._name) as scope:
            self._variable_scope = scope
            self.model.build(placeholder)

        default_net = self.model.networks['default']
        session = tf.compat.v1.get_default_session()
        self._f_dist = session.make_callable(
            [default_net.mean, default_net.log_std],
            feed_list=[default_net.input])

    def dist_info(self, input_val, state_infos=None):
        """Compute the embedding distribution parameters for one input.

        Args:
            input_val (np.ndarray): Input value.
            state_infos (dict): Information about the predicted embedding
                given an input. Not used by this implementation.

        Returns:
            dict[numpy.ndarray]: Distribution parameters, with keys
                - mean (numpy.ndarray): Mean of the distribution.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution.

        """
        in_space = self._embedding_spec.input_space
        out_space = self._embedding_spec.output_space
        # The callable works on batches, so feed a batch of one.
        means, log_stds = self._f_dist([in_space.flatten(input_val)])
        return dict(mean=out_space.unflatten(means[0]),
                    log_std=out_space.unflatten(log_stds[0]))

    def dist_info_sym(self, input_var, state_info_vars=None, name='default'):
        """Build a symbolic graph of the distribution parameters.

        Args:
            input_var (tf.Tensor): Tensor input for the symbolic graph.
            state_info_vars (dict): Extra state information, e.g. previous
                embedding. Not used by this implementation.
            name (str): Name for the symbolic graph.

        Returns:
            dict[tf.Tensor]: Outputs of the symbolic graph of distribution
                parameters, under the keys `mean` and `log_std`.

        """
        with tf.compat.v1.variable_scope(self._variable_scope):
            outputs = self.model.build(input_var, name=name)
        mean_var, log_std_var, _, _ = outputs
        return dict(mean=mean_var, log_std=log_std_var)

    @property
    def spec(self):
        """garage.InOutSpec: Specification of input and output."""
        return self._embedding_spec

    @property
    def input_dim(self):
        """int: Flat dimension of the encoder input space."""
        return self._embedding_spec.input_space.flat_dim

    @property
    def output_dim(self):
        """int: Flat dimension of the encoder output (embedding) space."""
        return self._embedding_spec.output_space.flat_dim

    @property
    def recurrent(self):
        """bool: Whether this module has a hidden state (always False)."""
        return False

    @property
    def vectorized(self):
        """bool: Whether this module supports vectorized input (True)."""
        return True

    def forward(self, input_value):
        """Sample an embedding for the given input.

        Args:
            input_value (numpy.ndarray): Value to encode.

        Returns:
            numpy.ndarray: An embedding sampled from the embedding
                distribution.
            dict: Embedding distribution information, with keys
                - mean (numpy.ndarray): Mean of the distribution.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution.

        """
        in_space = self._embedding_spec.input_space
        out_space = self._embedding_spec.output_space
        means, log_stds = self._f_dist([in_space.flatten(input_value)])
        # Draw standard-normal noise, then scale by exp(log_std) and shift
        # by the mean.
        noise = np.random.normal(size=means.shape)
        draws = noise * np.exp(log_stds) + means
        embedding = out_space.unflatten(draws[0])
        info = dict(mean=out_space.unflatten(means[0]),
                    log_std=out_space.unflatten(log_stds[0]))
        return embedding, info

    @property
    def distribution(self):
        """Embedding distribution.

        Returns:
            garage.tf.distributions.DiagonalGaussian: Embedding
                distribution of the default network.

        """
        return self.model.networks['default'].dist

    @property
    def input(self):
        """tf.Tensor: Input of the default encoder network."""
        return self.model.networks['default'].input

    @property
    def latent_mean(self):
        """tf.Tensor: Predicted mean of the Gaussian distribution."""
        return self.model.networks['default'].mean

    @property
    def latent_std_param(self):
        """tf.Tensor: The `log_std` output of the default network."""
        return self.model.networks['default'].log_std

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: The state to be pickled for the instance, without the
                session-bound `_f_dist` callable.

        """
        state = super().__getstate__()
        del state['_f_dist']
        return state

    def __setstate__(self, state):
        """Object.__setstate__.

        Args:
            state (dict): Unpickled state.

        """
        super().__setstate__(state)
        # `_f_dist` was dropped when pickling; rebuild it.
        self._initialize()
class GaussianMLPPolicy(StochasticPolicy):
    """Gaussian MLP Policy.

    A policy represented by a Gaussian distribution which is parameterized by
    a multilayer perceptron (MLP).

    Args:
        env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for mean. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        hidden_nonlinearity (callable): Activation function for intermediate
            dense layer(s). It should return a tf.Tensor. Set it to
            None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output dense
            layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight
            of output dense layer(s). The function should return a
            tf.Tensor.
        output_b_init (callable): Initializer function for the bias
            of output dense layer(s). The function should return a
            tf.Tensor.
        learn_std (bool): Is std trainable.
        adaptive_std (bool): Is std a neural network. If False, it will be a
            parameter.
        std_share_network (bool): Boolean for whether mean and std share
            the same network.
        init_std (float): Initial value for std.
        std_hidden_sizes (list[int]): Output dimension of dense layer(s) for
            the MLP for std. For example, (32, 32) means the MLP consists
            of two hidden layers, each with 32 hidden units.
        min_std (float): If not None, the std is at least the value of
            min_std, to avoid numerical issues.
        max_std (float): If not None, the std is at most the value of
            max_std, to avoid numerical issues.
        std_hidden_nonlinearity (callable): Nonlinearity for each hidden
            layer in the std network. The function should return a
            tf.Tensor.
        std_output_nonlinearity (callable): Nonlinearity for output layer in
            the std network. The function should return a tf.Tensor.
        std_parameterization (str): How the std should be parametrized.
            There are a few options:
            - exp: the logarithm of the std will be stored, and applied a
                exponential transformation
            - softplus: the std will be computed as log(1+exp(x))
        layer_normalization (bool): Bool for using layer normalization or
            not.

    Raises:
        ValueError: If the action space of `env_spec` is not an akro.Box.

    """

    def __init__(self,
                 env_spec,
                 name='GaussianMLPPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        if not isinstance(env_spec.action_space, akro.Box):
            raise ValueError('GaussianMLPPolicy only works with '
                             'akro.Box action space, but not {}'.format(
                                 env_spec.action_space))
        super().__init__(name, env_spec)
        self.obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim

        # The configuration is kept so that clone() can re-create the policy.
        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._learn_std = learn_std
        self._adaptive_std = adaptive_std
        self._std_share_network = std_share_network
        self._init_std = init_std
        self._min_std = min_std
        self._max_std = max_std
        self._std_hidden_sizes = std_hidden_sizes
        self._std_hidden_nonlinearity = std_hidden_nonlinearity
        self._std_output_nonlinearity = std_output_nonlinearity
        self._std_parameterization = std_parameterization
        self._layer_normalization = layer_normalization

        # Both are populated by build().
        self._f_dist = None
        self._dist = None

        self.model = GaussianMLPModel(
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

    def build(self, state_input, name=None):
        """Build model.

        Stores the distribution returned by the model and compiles a session
        callable that yields `(sample, mean, log_std)` for a fed
        `state_input`.

        Args:
            state_input (tf.Tensor): State input.
            name (str): Name of the model, which is also the name scope.

        """
        with tf.compat.v1.variable_scope(self.name) as vs:
            self._variable_scope = vs
            self._dist = self.model.build(state_input, name=name)
            # get_action()/get_actions() publish the third output under the
            # key 'log_std', so convert the distribution's standard
            # deviation to log space instead of returning it raw.
            log_std = tf.math.log(self._dist.stddev())
            self._f_dist = tf.compat.v1.get_default_session().make_callable(
                [self._dist.sample(), self._dist.loc, log_std],
                feed_list=[state_input])

    @property
    def vectorized(self):
        """Vectorized or not.

        Returns:
            Bool: True if primitive supports vectorized operations.

        """
        return True

    def get_action(self, observation):
        """Get single action from this policy for the input observation.

        Args:
            observation (numpy.ndarray): Observation from environment.

        Returns:
            numpy.ndarray: Actions
            dict: Predicted action and agent information.

        Note:
            It returns an action and a dict, with keys
            - mean (numpy.ndarray): Mean of the distribution.
            - log_std (numpy.ndarray): Log standard deviation of the
                distribution.

        """
        # Feed a batch of one with a length-1 axis inserted at position 1;
        # that axis is squeezed out of each output again below.
        sample, mean, log_std = self._f_dist(np.expand_dims([observation], 1))
        sample = self.action_space.unflatten(np.squeeze(sample, 1)[0])
        mean = self.action_space.unflatten(np.squeeze(mean, 1)[0])
        log_std = self.action_space.unflatten(np.squeeze(log_std, 1)[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_actions(self, observations):
        """Get multiple actions from this policy for the input observations.

        Args:
            observations (numpy.ndarray): Observations from environment.

        Returns:
            numpy.ndarray: Actions
            dict: Predicted action and agent information.

        Note:
            It returns actions and a dict, with keys
            - mean (numpy.ndarray): Means of the distribution.
            - log_std (numpy.ndarray): Log standard deviations of the
                distribution.

        """
        samples, means, log_stds = self._f_dist(np.expand_dims(
            observations, 1))
        samples = self.action_space.unflatten_n(np.squeeze(samples, 1))
        means = self.action_space.unflatten_n(np.squeeze(means, 1))
        log_stds = self.action_space.unflatten_n(np.squeeze(log_stds, 1))
        return samples, dict(mean=means, log_std=log_stds)

    @property
    def distribution(self):
        """Policy distribution.

        Returns:
            tfp.Distribution.MultivariateNormalDiag: Policy distribution, or
                None if build() has not been called.

        """
        return self._dist

    def clone(self, name):
        """Return a clone of the policy.

        It only copies the configuration of the primitive,
        not the parameters.

        Args:
            name (str): Name of the newly created policy. It has to be
                different from source policy if cloned under the same
                computational graph.

        Returns:
            garage.tf.policies.GaussianMLPPolicy: Newly cloned policy.

        """
        return self.__class__(
            name=name,
            env_spec=self._env_spec,
            hidden_sizes=self._hidden_sizes,
            hidden_nonlinearity=self._hidden_nonlinearity,
            hidden_w_init=self._hidden_w_init,
            hidden_b_init=self._hidden_b_init,
            output_nonlinearity=self._output_nonlinearity,
            output_w_init=self._output_w_init,
            output_b_init=self._output_b_init,
            learn_std=self._learn_std,
            adaptive_std=self._adaptive_std,
            std_share_network=self._std_share_network,
            init_std=self._init_std,
            min_std=self._min_std,
            max_std=self._max_std,
            std_hidden_sizes=self._std_hidden_sizes,
            std_hidden_nonlinearity=self._std_hidden_nonlinearity,
            std_output_nonlinearity=self._std_output_nonlinearity,
            std_parameterization=self._std_parameterization,
            layer_normalization=self._layer_normalization)

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: the state to be pickled for the instance.

        """
        new_dict = super().__getstate__()
        # The graph-bound callable and distribution are not pickled. They
        # are stored as None (their pre-build value from __init__) rather
        # than deleted, so an unpickled policy has the same attributes as a
        # freshly constructed one until build() is called again.
        new_dict['_f_dist'] = None
        new_dict['_dist'] = None
        return new_dict
def test_dist(self):
    """Check that the built model exposes a MultivariateNormalDiag dist."""
    network = GaussianMLPModel(output_dim=1).build(self.input_var)
    expected_type = tfp.distributions.MultivariateNormalDiag
    assert isinstance(network.dist, expected_type)
class GaussianMLPPolicy(StochasticPolicy):
    """Gaussian policy whose parameters come from a GaussianMLPModel.

    The wrapped model produces the mean and log standard deviation of a
    Gaussian over actions; actions are sampled from it with NumPy.

    Args:
        env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Widths of the dense layers of the mean
            MLP. For example, (32, 32) gives two hidden layers with 32
            units each.
        hidden_nonlinearity (callable): Activation for the intermediate
            dense layers. It should return a tf.Tensor. None keeps the
            layers linear.
        hidden_w_init (callable): Weight initializer for the intermediate
            dense layers. It should return a tf.Tensor.
        hidden_b_init (callable): Bias initializer for the intermediate
            dense layers. It should return a tf.Tensor.
        output_nonlinearity (callable): Activation for the output dense
            layer. It should return a tf.Tensor. None keeps it linear.
        output_w_init (callable): Weight initializer for the output dense
            layer. It should return a tf.Tensor.
        output_b_init (callable): Bias initializer for the output dense
            layer. It should return a tf.Tensor.
        learn_std (bool): Whether std is trainable.
        adaptive_std (bool): Whether std is produced by a neural network.
            If False, it is a parameter.
        std_share_network (bool): Whether mean and std share one network.
        init_std (float): Initial value for std.
        std_hidden_sizes (list[int]): Widths of the dense layers of the std
            MLP. For example, (32, 32) gives two hidden layers with 32
            units each.
        min_std (float): If not None, lower bound applied to std to avoid
            numerical issues.
        max_std (float): If not None, upper bound applied to std to avoid
            numerical issues.
        std_hidden_nonlinearity (callable): Activation for each hidden
            layer of the std network. It should return a tf.Tensor. None
            keeps the layers linear.
        std_output_nonlinearity (callable): Activation for the output layer
            of the std network. It should return a tf.Tensor. None keeps it
            linear.
        std_parameterization (str): How std is parameterized:
            - exp: the logarithm of std is stored and an exponential
                transformation is applied.
            - softplus: std is computed as log(1+exp(x)).
        layer_normalization (bool): Whether to use layer normalization.

    """

    def __init__(self,
                 env_spec,
                 name='GaussianMLPPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.glorot_uniform_initializer(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.glorot_uniform_initializer(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        assert isinstance(env_spec.action_space, akro.Box)
        super().__init__(name, env_spec)
        self.obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim

        # Every constructor option is forwarded to the model unchanged.
        model_config = dict(
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel',
        )
        self.model = GaussianMLPModel(**model_config)
        self._initialize()

    def _initialize(self):
        """Build the default network and its (mean, log_std) callable."""
        obs_placeholder = tf.compat.v1.placeholder(
            tf.float32, shape=(None, self.obs_dim))

        with tf.compat.v1.variable_scope(self.name) as scope:
            self._variable_scope = scope
            self.model.build(obs_placeholder)

        default_net = self.model.networks['default']
        session = tf.compat.v1.get_default_session()
        self._f_dist = session.make_callable(
            [default_net.mean, default_net.log_std],
            feed_list=[default_net.input])

    @property
    def vectorized(self):
        """Vectorized or not.

        Returns:
            Bool: True if primitive supports vectorized operations.

        """
        return True

    def dist_info_sym(self, obs_var, state_info_vars=None, name='default'):
        """Build a symbolic graph of the distribution parameters.

        Args:
            obs_var (tf.Tensor): Tensor input for the symbolic graph.
            state_info_vars (dict): Extra state information, e.g. previous
                action. Not used by this implementation.
            name (str): Name for the symbolic graph.

        Returns:
            dict[tf.Tensor]: Outputs of the symbolic graph of distribution
                parameters, under the keys `mean` and `log_std`.

        """
        with tf.compat.v1.variable_scope(self._variable_scope):
            outputs = self.model.build(obs_var, name=name)
        mean_var, log_std_var, _, _ = outputs
        return dict(mean=mean_var, log_std=log_std_var)

    def get_action(self, observation):
        """Sample one action for a single observation.

        Args:
            observation (numpy.ndarray): Observation from environment.

        Returns:
            numpy.ndarray: Action.
            dict: Agent information, with keys
                - mean (numpy.ndarray): Mean of the distribution.
                - log_std (numpy.ndarray): Log standard deviation of the
                    distribution.

        """
        # The callable works on batches, so feed a batch of one.
        batch = [self.observation_space.flatten(observation)]
        means, log_stds = self._f_dist(batch)
        # Draw standard-normal noise, then scale by exp(log_std) and shift
        # by the mean.
        noise = np.random.normal(size=means.shape)
        draws = noise * np.exp(log_stds) + means
        action = self.action_space.unflatten(draws[0])
        info = dict(mean=self.action_space.unflatten(means[0]),
                    log_std=self.action_space.unflatten(log_stds[0]))
        return action, info

    def get_actions(self, observations):
        """Sample one action for each observation in a batch.

        Args:
            observations (numpy.ndarray): Observations from environment.

        Returns:
            numpy.ndarray: Actions.
            dict: Agent information, with keys
                - mean (numpy.ndarray): Means of the distribution.
                - log_std (numpy.ndarray): Log standard deviations of the
                    distribution.

        """
        batch = self.observation_space.flatten_n(observations)
        means, log_stds = self._f_dist(batch)
        noise = np.random.normal(size=means.shape)
        draws = noise * np.exp(log_stds) + means
        actions = self.action_space.unflatten_n(draws)
        info = dict(mean=self.action_space.unflatten_n(means),
                    log_std=self.action_space.unflatten_n(log_stds))
        return actions, info

    def get_params(self):
        """Get the params, which are the trainable variables.

        Returns:
            List[tf.Variable]: The result of `get_trainable_vars()`.

        """
        return self.get_trainable_vars()

    @property
    def distribution(self):
        """Policy distribution.

        Returns:
            garage.tf.distributions.DiagonalGaussian: Policy distribution
                of the default network.

        """
        return self.model.networks['default'].dist

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: The state to be pickled for the instance, without the
                session-bound `_f_dist` callable.

        """
        state = super().__getstate__()
        del state['_f_dist']
        return state

    def __setstate__(self, state):
        """Object.__setstate__.

        Args:
            state (dict): Unpickled state.

        """
        super().__setstate__(state)
        # `_f_dist` was dropped when pickling; rebuild it.
        self._initialize()
class GaussianMLPPolicyWithModel(StochasticPolicy2):
    """GaussianMLPPolicy with GaussianMLPModel.

    A policy whose action distribution parameters (mean and log standard
    deviation) are produced by a GaussianMLPModel.

    Args:
        env_spec: Environment specification. Its action space must be a Box.
        name (str): Policy name; it is also passed to the model as its name.
        hidden_sizes (list[int]): Sizes of the fully-connected hidden layers
            of the mean network.
        hidden_nonlinearity (callable): Nonlinearity used for each hidden
            layer.
        output_nonlinearity (callable): Nonlinearity for the output layer.
        learn_std (bool): Is std trainable.
        adaptive_std (bool): Forwarded to GaussianMLPModel.
        std_share_network (bool): Forwarded to GaussianMLPModel.
        init_std (float): Initial std.
        min_std (float): Lower bound on std, to avoid numerical issues.
        max_std (float): Forwarded to GaussianMLPModel.
        std_hidden_sizes (list[int]): Sizes of the fully-connected layers
            for std.
        std_hidden_nonlinearity (callable): Forwarded to GaussianMLPModel.
        std_output_nonlinearity (callable): Forwarded to GaussianMLPModel.
        std_parameterization (str): How the std should be parametrized.
            There are a few options:
            - exp: the logarithm of the std will be stored, and applied a
                exponential transformation
            - softplus: the std will be computed as log(1+exp(x))
        layer_normalization (bool): Forwarded to GaussianMLPModel.

    """

    def __init__(self,
                 env_spec,
                 name='GaussianMLPPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 output_nonlinearity=None,
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        assert isinstance(env_spec.action_space, Box)
        super().__init__(name, env_spec)
        self.obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim

        self.model = GaussianMLPModel(
            name=name,
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization)

        self._initialize()

    def _initialize(self):
        """Build the default network and its (sample, mean, log_std) callable.

        Also called from __setstate__ to rebuild the callable that is
        dropped when pickling.
        """
        state_input = tf.placeholder(tf.float32, shape=(None, self.obs_dim))

        with tf.variable_scope(self._variable_scope):
            self.model.build(state_input)

        self._f_dist = tf.get_default_session().make_callable(
            [
                self.model.networks['default'].sample,
                self.model.networks['default'].mean,
                self.model.networks['default'].log_std
            ],
            feed_list=[self.model.networks['default'].input])

    @property
    def vectorized(self):
        """Vectorized or not.

        Returns:
            bool: Always True.

        """
        return True

    def dist_info_sym(self, obs_var, state_info_vars=None, name='default'):
        """Build a symbolic graph of the distribution parameters.

        Args:
            obs_var (tf.Tensor): Tensor input for the symbolic graph.
            state_info_vars (dict): Extra state information. Not used by
                this implementation.
            name (str): Name for the symbolic graph.

        Returns:
            dict[tf.Tensor]: The model's mean and log_std outputs, under
                the keys `mean` and `log_std`.

        """
        with tf.variable_scope(self._variable_scope):
            _, mean_var, log_std_var, _, _ = self.model.build(
                obs_var, name=name)

        # The model outputs are returned as-is. Reshaping them to
        # `action_space.shape` would discard the leading batch axis and
        # fail whenever `obs_var` holds more than one observation.
        return dict(mean=mean_var, log_std=log_std_var)

    def get_action(self, observation):
        """Get a single action from the policy.

        Args:
            observation (numpy.ndarray): Observation from environment.

        Returns:
            numpy.ndarray: Action.
            dict: Agent information, with keys `mean` and `log_std`.

        """
        flat_obs = self.observation_space.flatten(observation)
        # The callable works on batches, so feed a batch of one.
        sample, mean, log_std = self._f_dist([flat_obs])
        sample = self.action_space.unflatten(sample[0])
        mean = self.action_space.unflatten(mean[0])
        log_std = self.action_space.unflatten(log_std[0])
        return sample, dict(mean=mean, log_std=log_std)

    def get_actions(self, observations):
        """Get a batch of actions from the policy.

        Args:
            observations (numpy.ndarray): Observations from environment.

        Returns:
            numpy.ndarray: Actions.
            dict: Agent information, with keys `mean` and `log_std`.

        """
        flat_obs = self.observation_space.flatten_n(observations)
        samples, means, log_stds = self._f_dist(flat_obs)
        samples = self.action_space.unflatten_n(samples)
        means = self.action_space.unflatten_n(means)
        log_stds = self.action_space.unflatten_n(log_stds)
        return samples, dict(mean=means, log_std=log_stds)

    def get_params(self, trainable=True):
        """Get the trainable variables.

        Args:
            trainable (bool): Accepted for interface compatibility; the
                value is not used.

        Returns:
            The result of `get_trainable_vars()`.

        """
        return self.get_trainable_vars()

    @property
    def distribution(self):
        """Policy distribution of the default network."""
        return self.model.networks['default'].dist

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: A copy of the instance dict without the session-bound
                `_f_dist` callable.

        """
        new_dict = self.__dict__.copy()
        del new_dict['_f_dist']
        return new_dict

    def __setstate__(self, state):
        """Object.__setstate__.

        Args:
            state (dict): Unpickled state.

        """
        self.__dict__.update(state)
        # `_f_dist` was dropped when pickling; rebuild it.
        self._initialize()
class GaussianMLPPolicyWithModel(StochasticPolicy2):
    """Gaussian policy whose parameters come from a GaussianMLPModel.

    The wrapped model produces a sample, the mean and the log standard
    deviation of a Gaussian over actions.

    Args:
        env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
        name (str): Model name, also the variable scope.
        hidden_sizes (list[int]): Widths of the dense layers of the mean
            MLP. For example, (32, 32) gives two hidden layers with 32
            units each.
        hidden_nonlinearity (callable): Activation for the intermediate
            dense layers. It should return a tf.Tensor. None keeps the
            layers linear.
        hidden_w_init (callable): Weight initializer for the intermediate
            dense layers. It should return a tf.Tensor.
        hidden_b_init (callable): Bias initializer for the intermediate
            dense layers. It should return a tf.Tensor.
        output_nonlinearity (callable): Activation for the output dense
            layer. It should return a tf.Tensor. None keeps it linear.
        output_w_init (callable): Weight initializer for the output dense
            layer. It should return a tf.Tensor.
        output_b_init (callable): Bias initializer for the output dense
            layer. It should return a tf.Tensor.
        learn_std (bool): Whether std is trainable.
        adaptive_std (bool): Whether std is produced by a neural network.
            If False, it is a parameter.
        std_share_network (bool): Whether mean and std share one network.
        init_std (float): Initial value for std.
        std_hidden_sizes (list[int]): Widths of the dense layers of the std
            MLP. For example, (32, 32) gives two hidden layers with 32
            units each.
        min_std (float): If not None, lower bound applied to std to avoid
            numerical issues.
        max_std (float): If not None, upper bound applied to std to avoid
            numerical issues.
        std_hidden_nonlinearity (callable): Activation for each hidden
            layer of the std network.
        std_output_nonlinearity (callable): Activation for the output layer
            of the std network.
        std_parameterization (str): How std is parameterized:
            - exp: the logarithm of std is stored and an exponential
                transformation is applied.
            - softplus: std is computed as log(1+exp(x)).
        layer_normalization (bool): Whether to use layer normalization.

    """

    def __init__(self,
                 env_spec,
                 name='GaussianMLPPolicyWithModel',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.glorot_uniform_initializer(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.glorot_uniform_initializer(),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        assert isinstance(env_spec.action_space, Box)
        super().__init__(name, env_spec)
        self.obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim

        # Every constructor option is forwarded to the model unchanged.
        model_config = dict(
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel',
        )
        self.model = GaussianMLPModel(**model_config)
        self._initialize()

    def _initialize(self):
        """Build the default network and its (sample, mean, log_std) callable."""
        obs_placeholder = tf.placeholder(tf.float32,
                                         shape=(None, self.obs_dim))

        with tf.variable_scope(self._variable_scope):
            self.model.build(obs_placeholder)

        default_net = self.model.networks['default']
        session = tf.get_default_session()
        self._f_dist = session.make_callable(
            [default_net.sample, default_net.mean, default_net.log_std],
            feed_list=[default_net.input])

    @property
    def vectorized(self):
        """bool: Whether the policy supports vectorized input (True)."""
        return True

    def dist_info_sym(self, obs_var, state_info_vars=None, name='default'):
        """Build a symbolic graph of the distribution parameters.

        Args:
            obs_var (tf.Tensor): Tensor input for the symbolic graph.
            state_info_vars (dict): Extra state information. Not used by
                this implementation.
            name (str): Name for the symbolic graph.

        Returns:
            dict[tf.Tensor]: The model's mean and log_std outputs, under
                the keys `mean` and `log_std`.

        """
        with tf.variable_scope(self._variable_scope):
            outputs = self.model.build(obs_var, name=name)
        _, mean_var, log_std_var, _, _ = outputs
        return dict(mean=mean_var, log_std=log_std_var)

    def get_action(self, observation):
        """Get a single action from the policy.

        Args:
            observation (numpy.ndarray): Observation from environment.

        Returns:
            numpy.ndarray: Action.
            dict: Agent information, with keys `mean` and `log_std`.

        """
        # The callable works on batches, so feed a batch of one.
        batch = [self.observation_space.flatten(observation)]
        samples, means, log_stds = self._f_dist(batch)
        action = self.action_space.unflatten(samples[0])
        info = dict(mean=self.action_space.unflatten(means[0]),
                    log_std=self.action_space.unflatten(log_stds[0]))
        return action, info

    def get_actions(self, observations):
        """Get a batch of actions from the policy.

        Args:
            observations (numpy.ndarray): Observations from environment.

        Returns:
            numpy.ndarray: Actions.
            dict: Agent information, with keys `mean` and `log_std`.

        """
        batch = self.observation_space.flatten_n(observations)
        samples, means, log_stds = self._f_dist(batch)
        actions = self.action_space.unflatten_n(samples)
        info = dict(mean=self.action_space.unflatten_n(means),
                    log_std=self.action_space.unflatten_n(log_stds))
        return actions, info

    def get_params(self, trainable=True):
        """Get the trainable variables.

        Args:
            trainable (bool): Accepted for interface compatibility; the
                value is not used.

        Returns:
            The result of `get_trainable_vars()`.

        """
        return self.get_trainable_vars()

    @property
    def distribution(self):
        """Policy distribution of the default network."""
        return self.model.networks['default'].dist

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: A copy of the instance dict without the session-bound
                `_f_dist` callable.

        """
        state = self.__dict__.copy()
        del state['_f_dist']
        return state

    def __setstate__(self, state):
        """Object.__setstate__.

        Args:
            state (dict): Unpickled state.

        """
        self.__dict__.update(state)
        # `_f_dist` was dropped when pickling; rebuild it.
        self._initialize()