def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space):
    """
    Builds symbols to sample actions and compute log-probs of actions.

    Special instructions: Make log_std a tf variable with the same shape as
    the action vector, independent of x, initialized to [-0.5, -0.5, ..., -0.5].

    Args:
        x: Input tensor of states. Shape [batch, obs_dim].

        a: Input tensor of actions. Shape [batch, act_dim].

        hidden_sizes: Sizes of hidden layers for action network MLP.

        activation: Activation function for all layers except last.

        output_activation: Activation function for last layer (action layer).

        action_space: A gym.spaces object describing the action space of the
            environment this agent will interact with.

    Returns:
        pi: A symbol for sampling stochastic actions from a Gaussian
            distribution.

        logp: A symbol for computing log-likelihoods of actions from a Gaussian
            distribution.

        logp_pi: A symbol for computing log-likelihoods of actions in pi from a
            Gaussian distribution.
    """
    # Read the action dimension as a plain Python int. `a.shape[-1]` is a
    # `Dimension` object in TF1; `as_list()` yields an int that is safe to hand
    # to NumPy and to use as a layer width.
    act_dim = a.shape.as_list()[-1]

    # The policy network maps states to the mean of the action distribution;
    # its final layer has one unit per action component.
    mu = mlp(x, list(hidden_sizes) + [act_dim], activation, output_activation)

    # State-independent log standard deviation, one entry per action
    # component, initialized to -0.5.
    log_std = tf.get_variable(
        name="log_std",
        initializer=-0.5 * np.ones(act_dim, dtype=np.float32),
    )
    std = tf.exp(log_std)

    # Draw a sample as mu + std * eps with eps ~ N(0, I).
    pi = mu + tf.random_normal(tf.shape(mu)) * std

    # Log-likelihoods of the supplied actions and of the sampled actions.
    logp = exercise1_1.gaussian_likelihood(a, mu, log_std)
    logp_pi = exercise1_1.gaussian_likelihood(pi, mu, log_std)
    return pi, logp, logp_pi
def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space):
    """
    Construct the graph symbols for a diagonal-Gaussian MLP policy.

    The mean of the action distribution is produced by an MLP over the
    states; the log standard deviation is a trainable variable shaped like a
    single action vector, independent of x, and initialized to
    [-0.5, -0.5, ..., -0.5].

    Args:
        x: Input tensor of states. Shape [batch, obs_dim].

        a: Input tensor of actions. Shape [batch, act_dim].

        hidden_sizes: Sizes of hidden layers for action network MLP.

        activation: Activation function for all layers except last.

        output_activation: Activation function for last layer (action layer).

        action_space: A gym.spaces object describing the action space of the
            environment this agent will interact with.

    Returns:
        pi: A symbol for sampling stochastic actions from a Gaussian
            distribution.

        logp: A symbol for computing log-likelihoods of actions from a Gaussian
            distribution.

        logp_pi: A symbol for computing log-likelihoods of actions in pi from a
            Gaussian distribution.
    """
    # Static size of the trailing (action) axis of `a`.
    act_dim = a.shape.as_list()[-1]

    # Hidden layers followed by an output layer with one unit per action
    # component; the network output is the Gaussian mean.
    layer_sizes = list(hidden_sizes)
    layer_sizes.append(act_dim)
    mu = mlp(
        x,
        hidden_sizes=layer_sizes,
        activation=activation,
        output_activation=output_activation,
    )

    # Trainable, state-independent log-std vector filled with -0.5.
    initial_log_std = np.full(act_dim, -0.5, dtype=np.float32)
    log_std = tf.get_variable(name='log_std', initializer=initial_log_std)
    std = tf.exp(log_std)

    # Sample from the current policy: mean plus scaled standard-normal noise.
    noise = tf.random_normal(tf.shape(mu))
    pi = mu + noise * std

    # Log-likelihood of the given actions and of the freshly sampled ones.
    likelihood = exercise1_1.gaussian_likelihood
    logp = likelihood(a, mu, log_std)
    logp_pi = likelihood(pi, mu, log_std)
    return pi, logp, logp_pi
def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space):
    """
    Builds symbols to sample actions and compute log-probs of actions.

    Special instructions: Make log_std a tf variable with the same shape as
    the action vector, independent of x, initialized to [-0.5, -0.5, ..., -0.5].

    Args:
        x: Input tensor of states. Shape [batch, obs_dim].

        a: Input tensor of actions. Shape [batch, act_dim].

        hidden_sizes: Sizes of hidden layers for action network MLP.

        activation: Activation function for all layers except last.

        output_activation: Activation function for last layer (action layer).

        action_space: A gym.spaces object describing the action space of the
            environment this agent will interact with.

    Returns:
        pi: A symbol for sampling stochastic actions from a Gaussian
            distribution.

        logp: A symbol for computing log-likelihoods of actions from a Gaussian
            distribution.

        logp_pi: A symbol for computing log-likelihoods of actions in pi from a
            Gaussian distribution.
    """
    # The output layer needs an integer width. `action_space` is a gym space
    # object, not a size, so the action dimension is read from the static
    # shape of the action tensor instead.
    act_dim = a.shape.as_list()[-1]

    # The mean of the Gaussian is the MLP output for the given states (not a
    # statistic of the observations themselves). `list(...)` lets callers pass
    # hidden_sizes as either a tuple or a list.
    mu = mlp(x, list(hidden_sizes) + [act_dim], activation, output_activation)

    # log_std is a trainable vector with one entry per action component,
    # independent of x and of the batch size, initialized to -0.5.
    log_std = tf.get_variable(
        name="log_std",
        initializer=-0.5 * np.ones(act_dim, dtype=np.float32),
    )
    std = tf.exp(log_std)

    # pi must be a *sample* from N(mu, std^2), not a likelihood value:
    # mu + std * eps with eps ~ N(0, I).
    pi = mu + tf.random_normal(tf.shape(mu)) * std

    # Log-likelihoods of the supplied actions and of the sampled actions.
    logp = exercise1_1.gaussian_likelihood(a, mu, log_std)
    logp_pi = exercise1_1.gaussian_likelihood(pi, mu, log_std)
    return pi, logp, logp_pi