Example #1
import theano
import theano.tensor as TT

# (This snippet assumes `env` and `policy` have already been constructed,
# e.g. as in Example #2 below.)

# Creating the observation variable through the environment's observation
# space, rather than directly as a Theano matrix, is slightly more
# abstract but allows us to delegate to the environment the handling of
# the correct data type for the variable. For instance, for an
# environment with discrete observations, we might want to use integer
# types if the observations are represented as one-hot vectors.
observations_var = env.observation_space.new_tensor_variable(
    'observations',
    # It should have 1 extra dimension since we want to represent a list of
    # observations
    extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
advantages_var = TT.vector('advantages')
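
# A minimal sketch (not part of the original snippet) of placeholder arrays
# with the shapes these variables expect; with extra_dims=1 the leading axis
# is the batch of N timesteps. The `flat_dim` attribute of the spaces is an
# assumption, and N is a hypothetical batch size.
import numpy as np
N = 100
observations = np.zeros((N, env.observation_space.flat_dim))
actions = np.zeros((N, env.action_space.flat_dim))
advantages = np.zeros(N)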

# policy.dist_info_sym returns a dictionary, whose values are symbolic
# expressions for quantities related to the distribution of the actions. For a
# Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)
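
# A minimal sketch (not part of the original snippet): the symbolic
# dist_info can be compiled into a callable to inspect the policy outputs
# numerically. The dictionary keys 'mean' and 'log_std' are assumed, as for
# a diagonal Gaussian policy.
f_dist_info = theano.function(
    inputs=[observations_var],
    outputs=[dist_info_vars['mean'], dist_info_vars['log_std']],
    allow_input_downcast=True)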

# policy.distribution returns a distribution object under
# garage.distributions. It contains many utilities for computing
# distribution-related quantities, given the computed dist_info_vars.
# Below we use dist.log_likelihood_sym to compute the symbolic
# log-likelihood. For this example, the corresponding distribution is
# an instance of the class garage.distributions.DiagonalGaussian
dist = policy.distribution

# Note that we negate the objective, since most optimizers assume a
# minimization problem
surr = -TT.mean(
    dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

# Get the list of trainable parameters.
params = policy.get_params(trainable=True)
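
# A minimal sketch (not part of the original snippet) of how the example
# typically continues: differentiate the surrogate loss with respect to the
# policy parameters and compile a training function. The learning rate and
# the use of plain SGD updates (rather than, e.g., Adam) are assumptions.
learning_rate = 1e-3
grads = theano.grad(surr, wrt=params)
f_train = theano.function(
    inputs=[observations_var, actions_var, advantages_var],
    outputs=surr,
    updates=[(p, p - learning_rate * g) for p, g in zip(params, grads)],
    allow_input_downcast=True)
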
Example #2
import theano
import theano.tensor as TT

# The garage-specific names used below (TheanoEnv, normalize, SwimmerEnv,
# GaussianMLPPolicy, tensor_utils) come from garage's Theano modules (their
# import paths differ between garage versions). `hidden_sizes` is assumed
# to be defined by the caller, e.g. (32, 32).
env = TheanoEnv(normalize(SwimmerEnv()))
policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)

observations_var = env.observation_space.new_tensor_variable('observations',
                                                             extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
rewards_var = tensor_utils.new_tensor('rewards',
                                      ndim=1,
                                      dtype=theano.config.floatX)

dist = policy.distribution
dist_info_vars = policy.dist_info_sym(observations_var)
old_dist_info_vars = backup_policy.dist_info_sym(observations_var)
kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
mean_kl = TT.mean(kl)
max_kl = TT.max(kl)
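
# A minimal sketch (not part of the original snippet): evaluating the
# symbolic KL terms numerically for a batch of observations.
f_kl = theano.function(
    inputs=[observations_var],
    outputs=[mean_kl, max_kl],
    allow_input_downcast=True)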

# For testing: per-step log-likelihoods and their cumulative sums over time.
surr_ll = dist.log_likelihood_sym(actions_var, dist_info_vars)
surr_ll_cumsum = dist.log_likelihood_sym_cumsum(actions_var, dist_info_vars)
surr = TT.sum(surr_ll_cumsum * rewards_var)

f_surr_ll = theano.function(inputs=[observations_var, actions_var],
                            outputs=surr_ll)
f_surr_ll_cumsum = theano.function(inputs=[observations_var, actions_var],
                                   outputs=surr_ll_cumsum)