def test_log_likehihood_sym(self):
    """Log-likelihoods of each equivalent policy pair must match."""
    pairs = (
        (self.policy1, self.dist1_sym, self.policy3, self.dist3_sym),
        (self.policy2, self.dist2_sym, self.policy4, self.dist4_sym),
    )
    for pol_a, dist_a, pol_b, dist_b in pairs:
        actions = np.ones((2, self.time_step, 1))

        ll_sym_a = pol_a.distribution.log_likelihood_sym(
            self.action_ph, dist_a)
        f_ll_a = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], ll_sym_a)
        ll_a = f_ll_a([self.obs, self.obs], actions)

        ll_sym_b = pol_b.distribution.log_likelihood_sym(
            self.action_ph, dist_b)
        f_ll_b = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], ll_sym_b)
        ll_b = f_ll_b([self.obs, self.obs], actions)

        assert np.array_equal(ll_a, ll_b)
def test_log_likehihood_sym(self):
    """Primitive and model-based policies must agree on log-likelihood."""
    for pol_a, dist_a, pol_b, dist_b in (
            (self.policy1, self.dist1_sym, self.policy3, self.dist3_sym),
            (self.policy2, self.dist2_sym, self.policy4, self.dist4_sym)):
        ll_sym_a = pol_a.distribution.log_likelihood_sym(
            self.action_ph, dist_a)
        f_ll_a = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], ll_sym_a)
        ll_a = f_ll_a(self.obs, [[1, 1]])

        ll_sym_b = pol_b.model.networks['default'].dist.log_likelihood_sym(
            self.action_ph, dist_b)
        f_ll_b = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], ll_sym_b)
        ll_b = f_ll_b(self.obs, [[1, 1]])

        assert ll_a == ll_b
def update_opt(self,
               loss,
               target,
               leq_constraint,
               inputs,
               constraint_name='constraint',
               name=None,
               *args,
               **kwargs):
    """Build the penalized-loss graph for the penalty L-BFGS optimizer.

    Args:
        loss: Symbolic expression for the loss function.
        target: A parameterized object to optimize over. It should
            implement methods of the
            :class:`garage.core.paramerized.Parameterized` class.
        leq_constraint: A constraint provided as a tuple (f, epsilon),
            of the form f(*inputs) <= epsilon.
        inputs: A list of symbolic variables as inputs.
        constraint_name (str): Name stored for constraint diagnostics.
        name (str): Optional tf.name_scope name.
        args: Unused positional arguments.
        kwargs: Unused keyword arguments.

    Returns:
        No return value.
    """
    params = target.get_params(trainable=True)
    with tf.name_scope(name, 'PenaltyLbfgsOptimizer',
                       [leq_constraint, loss, params]):
        constraint_term, constraint_value = leq_constraint
        # The penalty coefficient is a placeholder so it can be adjusted
        # between optimization rounds without rebuilding the graph.
        penalty_var = tf.compat.v1.placeholder(tf.float32,
                                               tuple(),
                                               name='penalty')
        penalized_loss = loss + penalty_var * constraint_term

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        def get_opt_output():
            # Build the (loss, flat gradient) pair, cast to float64 for
            # the downstream L-BFGS routine.
            with tf.name_scope('get_opt_output',
                               values=[params, penalized_loss]):
                grads = tf.gradients(penalized_loss, params)
                # Parameters that do not affect the loss get explicit
                # zero gradients so the flattened vector layout is fixed.
                for idx, (grad, param) in enumerate(zip(grads, params)):
                    if grad is None:
                        grads[idx] = tf.zeros_like(param)
                flat_grad = tensor_utils.flatten_tensor_variables(grads)
                return [
                    tf.cast(penalized_loss, tf.float64),
                    tf.cast(flat_grad, tf.float64),
                ]

        # Functions are compiled lazily; unused entries cost nothing.
        self._opt_fun = LazyDict(
            f_loss=lambda: tensor_utils.compile_function(
                inputs, loss, log_name='f_loss'),
            f_constraint=lambda: tensor_utils.compile_function(
                inputs, constraint_term, log_name='f_constraint'),
            f_penalized_loss=lambda: tensor_utils.compile_function(
                inputs=inputs + [penalty_var],
                outputs=[penalized_loss, loss, constraint_term],
                log_name='f_penalized_loss',
            ),
            f_opt=lambda: tensor_utils.compile_function(
                inputs=inputs + [penalty_var],
                outputs=get_opt_output(),
            ))
def test_policy_entropy_sym(self):
    """Entropies of the two equivalent policies must be element-wise equal."""
    entropies = []
    for pol, dist_sym in ((self.policy1, self.dist1_sym),
                          (self.policy3, self.dist3_sym)):
        ent_sym = pol.distribution.entropy_sym(dist_sym,
                                               name='entropy_sym1')
        f_entropy = tensor_utils.compile_function([self.obs_ph], ent_sym)
        entropies.append(f_entropy([self.obs, self.obs]))
    assert np.array_equal(entropies[0], entropies[1])
def update_opt(self,
               loss,
               target,
               inputs,
               extra_inputs=None,
               name='LbfgsOptimizer',
               **kwargs):
    """Construct operation graph for the optimizer.

    Args:
        loss (tf.Tensor): Loss objective to minimize.
        target (object): Target object to optimize. The object should
            implemenet `get_params()` and `get_param_values`.
        inputs (list[tf.Tensor]): List of input placeholders.
        extra_inputs (list[tf.Tensor]): List of extra input placeholders.
        name (str): Name scope.
        kwargs (dict): Extra unused keyword arguments. Some optimizers
            have extra input, e.g. KL constraint.

    """
    del kwargs
    self._target = target
    params = target.get_params()
    if extra_inputs is None:
        extra_inputs = list()
    all_inputs = inputs + extra_inputs
    with tf.name_scope(name):

        def _loss_and_flat_grad():
            """Build the (loss, flattened gradient) output pair.

            Returns:
                list[tf.Tensor]: Loss and gradient tensor, both cast to
                    float64 for the L-BFGS backend.

            """
            with tf.name_scope('get_opt_output'):
                grad_flat = tensor_utils.flatten_tensor_variables(
                    tf.gradients(loss, params))
                return [
                    tf.cast(loss, tf.float64),
                    tf.cast(grad_flat, tf.float64)
                ]

        # Both functions are compiled lazily on first use.
        self._opt_fun = LazyDict(
            f_loss=lambda: tensor_utils.compile_function(
                all_inputs, loss),
            f_opt=lambda: tensor_utils.compile_function(
                inputs=all_inputs,
                outputs=_loss_and_flat_grad(),
            ))
def _build_entropy_term(self, i):
    """Build the mean of the validity-masked policy entropy.

    Args:
        i (namedtuple): Collection of input variables for the policy
            optimization graph.

    Returns:
        tf.Tensor: Scalar mean of the masked policy entropy.
    """
    with tf.name_scope("policy_entropy"):
        if self.policy.recurrent:
            # NOTE(review): despite the `_flat` suffix, the recurrent
            # branch uses the unflattened observations -- confirm naming.
            policy_dist_info_flat = self.policy.dist_info_sym(
                i.obs_var,
                i.policy_state_info_vars,
                name="policy_dist_info")
        else:
            policy_dist_info_flat = self.policy.dist_info_sym(
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name="policy_dist_info_flat")

        policy_entropy_flat = self.policy.distribution.entropy_sym(
            policy_dist_info_flat)
        # Dense (N, max_path_length) form so it can be masked per step.
        policy_entropy = tf.reshape(policy_entropy_flat,
                                    [-1, self.max_path_length])

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            policy_entropy = tf.nn.softplus(policy_entropy)

        # Mean over all entries after zeroing invalid (padded) steps.
        policy_entropy = tf.reduce_mean(policy_entropy * i.valid_var)

        # Diagnostic function for logging the entropy value.
        self.f_policy_entropy = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            policy_entropy,
            log_name="f_policy_entropy")

    return policy_entropy
def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
    """Set up the training op and loss function for this optimizer.

    Args:
        loss: Symbolic expression for the loss function.
        target: A parameterized object to optimize over. It should
            implement methods of the
            :class:`garage.core.paramerized.Parameterized` class.
        inputs: A list of symbolic variables as inputs.
        extra_inputs: Optional list of extra symbolic input variables.
        kwargs: Unused extra keyword arguments.

    Returns:
        No return value.
    """
    with tf.name_scope(self._name,
                       values=[
                           loss,
                           target.get_params(trainable=True), inputs,
                           extra_inputs
                       ]):
        self._target = target

        # Minimize only over the target's trainable parameters.
        self._train_op = self._tf_optimizer.minimize(
            loss, var_list=target.get_params(trainable=True))

        # updates = OrderedDict(
        #     [(k, v.astype(k.dtype)) for k, v in updates.iteritems()])

        if extra_inputs is None:
            extra_inputs = list()
        self._input_vars = inputs + extra_inputs
        # f_loss is compiled lazily, only when first evaluated.
        self._opt_fun = LazyDict(
            f_loss=lambda: tensor_utils.compile_function(
                inputs + extra_inputs, loss), )
def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
    """Construct operation graph for the optimizer.

    Args:
        loss (tf.Tensor): Loss objective to minimize.
        target (object): Target object to optimize. The object should
            implemenet `get_params()` and `get_param_values`.
        inputs (list[tf.Tensor]): List of input placeholders.
        extra_inputs (list[tf.Tensor]): List of extra input placeholders.
        kwargs (dict): Extra unused keyword arguments. Some optimizers
            have extra input, e.g. KL constraint.

    """
    del kwargs
    if extra_inputs is None:
        extra_inputs = list()
    with tf.name_scope(self._name):
        self._target = target
        # Instantiate the underlying TF optimizer with the configured
        # learning-rate arguments.
        optimizer = make_optimizer(self._tf_optimizer,
                                   **self._learning_rate)
        self._train_op = optimizer.minimize(loss,
                                            var_list=target.get_params())

        self._input_vars = inputs + extra_inputs
        # Loss function is compiled lazily on first use.
        self._opt_fun = LazyDict(
            f_loss=lambda: tensor_utils.compile_function(
                inputs + extra_inputs, loss),
        )
def _initialize(self):
    """Build networks, loss, KL constraint and optimizer graphs.

    Builds the current network and an "old" snapshot network from a
    shared input placeholder, a negative-log-likelihood loss in
    normalized label space, and the mean KL between the snapshot and
    current normalized distributions for the trust-region constraint.
    """
    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + self._input_shape)
    ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                      name='ys',
                                      shape=(None, self._output_dim))

    self._old_network = self._old_model.build(input_var)
    (norm_dist, norm_mean, norm_log_std, _, mean, _, self._x_mean,
     self._x_std, self._y_mean, self._y_std) = self.build(input_var).outputs
    # Initialize the snapshot network from the current parameters.
    self._old_model.parameters = self.parameters

    # Labels are normalized with the model's output statistics so the
    # loss is computed in normalized space.
    normalized_ys_var = (ys_var - self._y_mean) / self._y_std

    old_normalized_dist = self._old_network.normalized_dist
    mean_kl = tf.reduce_mean(old_normalized_dist.kl_divergence(norm_dist))

    loss = -tf.reduce_mean(norm_dist.log_prob(normalized_ys_var))

    # Prediction returns the unnormalized mean.
    self._f_predict = tensor_utils.compile_function([input_var], mean)

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[norm_mean, norm_log_std],
    )

    if self._use_trust_region:
        # Constrain each update so the mean KL stays within max_kl_step.
        optimizer_args['leq_constraint'] = (mean_kl, self._max_kl_step)
    optimizer_args['inputs'] = [input_var, ys_var]

    with tf.name_scope('update_opt'):
        self._optimizer.update_opt(**optimizer_args)
def test_kl_sym(self):
    """KL divergences computed by both policy pairs must be identical."""
    kl_vals = []
    for pol, dist_old, dist_new in (
            (self.policy1, self.dist1_sym, self.dist2_sym),
            (self.policy3, self.dist3_sym, self.dist4_sym)):
        kl_sym = pol.distribution.kl_sym(dist_old, dist_new)
        objective = tf.reduce_mean(kl_sym)
        f_kl = tensor_utils.compile_function([self.obs_ph], objective)
        kl_vals.append(f_kl([self.obs, self.obs]))
    assert np.array_equal(kl_vals[0], kl_vals[1])
def update_opt(self, f, target, inputs, reg_coeff):
    """Build the Hessian-vector-product graph (Pearlmutter trick).

    Args:
        f: Symbolic expression whose Hessian is needed (e.g. a KL term).
        target: A parameterized object; its trainable params are the
            variables the Hessian is taken with respect to.
        inputs: List of symbolic input variables for f.
        reg_coeff (float): Regularization coefficient; stored here and
            presumably applied by the CG solver as A + reg*I -- verify
            against the caller.
    """
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)
    constraint_grads = tf.gradients(f, xs=params)
    # Parameters that do not influence f get explicit zero gradients so
    # the flattened vector layout stays fixed.
    for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
        if grad is None:
            constraint_grads[idx] = tf.zeros_like(param)

    # One placeholder per parameter tensor for the vector being
    # multiplied with the Hessian.
    xs = tuple([
        tensor_utils.new_tensor_like(p.name.split(":")[0], p)
        for p in params
    ])

    def Hx_plain():
        # Differentiating grad(f).x w.r.t. params yields H x.
        Hx_plain_splits = tf.gradients(
            tf.reduce_sum(
                tf.stack([
                    tf.reduce_sum(g * x)
                    for g, x in zip(constraint_grads, xs)
                ])), params)
        # Again zero-fill missing gradients to keep a fixed layout.
        for idx, (Hx, param) in enumerate(zip(Hx_plain_splits, params)):
            if Hx is None:
                Hx_plain_splits[idx] = tf.zeros_like(param)
        return tensor_utils.flatten_tensor_variables(Hx_plain_splits)

    self._opt_fun = LazyDict(
        f_Hx_plain=lambda: tensor_utils.compile_function(
            inputs=inputs + xs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ), )
def _build_entropy_term(self, i):
    """Build policy entropy tensor.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy entropy.

    """
    dist = self.policy.distribution
    with tf.name_scope('policy_entropy'):
        if self._use_neg_logli_entropy:
            # Negative log-likelihood used as an entropy estimate.
            entropy = -dist.log_prob(i.action_var,
                                     name='policy_log_likeli')
        else:
            entropy = dist.entropy()

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            entropy = tf.nn.softplus(entropy)

        if self._stop_entropy_gradient:
            entropy = tf.stop_gradient(entropy)

        # dense form, match the shape of advantage
        entropy = tf.reshape(entropy, [-1, self.max_path_length])

        self._f_policy_entropy = compile_function(
            flatten_inputs(self._policy_opt_inputs), entropy)

    return entropy
def _initialize(self):
    """Build the model graph, MSE loss, prediction fn and optimizer."""
    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + self._input_shape)
    with tf.compat.v1.variable_scope(self._name) as vs:
        self._variable_scope = vs
        self.model.build(input_var)
        ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                          name='ys',
                                          shape=(None, self._output_dim))
        y_hat = self.model.networks['default'].y_hat

        # Plain mean-squared-error regression loss.
        residual = y_hat - ys_var
        loss = tf.reduce_mean(tf.square(residual))

        self._f_predict = tensor_utils.compile_function([input_var], y_hat)

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[ys_var],
            inputs=[input_var, ys_var],
        )
        with tf.name_scope('update_opt'):
            self._optimizer.update_opt(**optimizer_args)
def _build_embedding_kl(self, i):
    """Build the mean KL divergence between old and new embeddings.

    Args:
        i (namedtuple): Collection of input variables for the policy
            optimization graph.

    Returns:
        tf.Tensor: Mean KL divergence of the embedding distribution.
    """
    dist = self.policy._embedding._dist
    with tf.name_scope("embedding_kl"):
        # new distribution
        embed_dist_info_flat = self.policy._embedding.dist_info_sym(
            i.flat.task_var,
            i.flat.embed_state_info_vars,
            name="embed_dist_info_flat")
        # Keep only entries for valid (non-padded) time steps.
        embed_dist_info_valid = filter_valids_dict(
            embed_dist_info_flat,
            i.flat.valid_var,
            name="embed_dist_info_valid")

        # calculate KL divergence
        kl = dist.kl_sym(i.valid.embed_old_dist_info_vars,
                         embed_dist_info_valid)
        mean_kl = tf.reduce_mean(kl)

        # Diagnostic function
        self.f_embedding_kl = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            mean_kl,
            log_name="f_embedding_kl")

    return mean_kl
def update_hvp(self, f, target, inputs, reg_coeff, name='PearlmutterHvp'):
    """Build the symbolic graph to compute the Hessian-vector product.

    Args:
        f (tf.Tensor): The function whose Hessian needs to be computed.
        target (garage.tf.policies.Policy): A parameterized object to
            optimize over.
        inputs (tuple[tf.Tensor]): The inputs for function f.
        reg_coeff (float): A small value so that A -> A + reg*I.
        name (str): Name to be used in tf.name_scope.

    """
    self._target = target
    self._reg_coeff = reg_coeff
    params = target.get_params()
    with tf.name_scope(name):
        constraint_grads = tf.gradients(f,
                                        xs=params,
                                        name='gradients_constraint')
        # Parameters that do not affect f get explicit zero gradients so
        # the flattened gradient keeps a fixed layout.
        for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
            if grad is None:
                constraint_grads[idx] = tf.zeros_like(param)

        # One placeholder per parameter tensor for the vector v.
        xs = tuple([
            tensor_utils.new_tensor_like(p.name.split(':')[0], p)
            for p in params
        ])

        def hx_plain():
            """Computes product of Hessian(f) and vector v.

            Returns:
                tf.Tensor: Symbolic result.

            """
            with tf.name_scope('hx_plain'):
                with tf.name_scope('hx_function'):
                    # Scalar g^T v; differentiating it w.r.t. params
                    # yields H v (Pearlmutter trick). A stray trailing
                    # comma previously made hx_f a 1-tuple; tf.gradients
                    # tolerated it, but the scalar is what is intended.
                    hx_f = tf.reduce_sum(
                        tf.stack([
                            tf.reduce_sum(g * x)
                            for g, x in zip(constraint_grads, xs)
                        ]))
                hx_plain_splits = tf.gradients(hx_f,
                                               params,
                                               name='gradients_hx_plain')
                # Zero-fill missing gradients, as above.
                for idx, (hx, param) in enumerate(
                        zip(hx_plain_splits, params)):
                    if hx is None:
                        hx_plain_splits[idx] = tf.zeros_like(param)
                return tensor_utils.flatten_tensor_variables(
                    hx_plain_splits)

        # Compiled lazily on first use.
        self._hvp_fun = LazyDict(
            f_hx_plain=lambda: tensor_utils.compile_function(
                inputs=inputs + xs,
                outputs=hx_plain(),
                log_name='f_hx_plain',
            ),
        )
def _initialize(self):
    """Build prediction, loss and optimizer graphs for the classifier.

    Sets up a categorical negative-log-likelihood loss, a mean-KL term
    for the trust-region optimizer, and compiled functions for one-hot
    predictions and raw class probabilities.
    """
    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + self._input_shape)

    with tf.compat.v1.variable_scope(self._variable_scope):
        self.model.build(input_var)

        ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                          name='ys',
                                          shape=(None, self._output_dim))
        old_prob_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                                name='old_prob',
                                                shape=(None,
                                                       self._output_dim))

        y_hat = self.model.networks['default'].y_hat

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=y_hat)

        self._dist = Categorical(self._output_dim)
        # Mean KL between old and new class probabilities; used as the
        # trust-region constraint below.
        mean_kl = tf.reduce_mean(
            self._dist.kl_sym(old_info_vars, info_vars))

        # Negative log-likelihood of the labels under predicted probs.
        loss = -tf.reduce_mean(
            self._dist.log_likelihood_sym(ys_var, info_vars))

        # Hard prediction: one-hot of the argmax class.
        predicted = tf.one_hot(tf.argmax(y_hat, axis=1),
                               depth=self._output_dim)

        self._f_predict = tensor_utils.compile_function([input_var],
                                                        predicted)
        self._f_prob = tensor_utils.compile_function([input_var], y_hat)

        # Unconstrained optimizer on the plain NLL loss.
        self._optimizer.update_opt(loss=loss,
                                   target=self,
                                   network_output=[y_hat],
                                   inputs=[input_var, ys_var])
        # Trust-region optimizer additionally bounded by the KL term.
        self._tr_optimizer.update_opt(
            loss=loss,
            target=self,
            network_output=[y_hat],
            inputs=[input_var, ys_var, old_prob_var],
            leq_constraint=(mean_kl, self._max_kl_step))
def _build_net(self, reuse=None, custom_getter=None, trainable=None):
    """
    Set up q network based on class attributes.

    This function uses layers defined in rllab.tf.

    Args:
        reuse: A bool indicates whether reuse variables in the same scope.
        custom_getter: A customized getter object used to get variables.
        trainable: A bool indicates whether variables are trainable.
    """
    with tf.variable_scope(self.name,
                           reuse=reuse,
                           custom_getter=custom_getter):
        l_obs = L.InputLayer(shape=(None, self._obs_dim), name="obs")
        l_action = L.InputLayer(shape=(None, self._action_dim),
                                name="actions")

        n_layers = len(self._hidden_sizes) + 1

        # Normalize the (possibly negative) merge index into
        # [0, n_layers); with a single layer, merge after it.
        if n_layers > 1:
            action_merge_layer = \
                (self._action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        l_hidden = l_obs

        for idx, size in enumerate(self._hidden_sizes):
            if self._batch_norm:
                l_hidden = batch_norm(l_hidden)

            # Concatenate the action input at the configured depth.
            if idx == action_merge_layer:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_hidden = L.DenseLayer(l_hidden,
                                    num_units=size,
                                    nonlinearity=self._hidden_nonlinearity,
                                    trainable=trainable,
                                    name="hidden_%d" % (idx + 1))

        # Merge the action just before the output layer if requested.
        if action_merge_layer == n_layers:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_output = L.DenseLayer(l_hidden,
                                num_units=1,
                                nonlinearity=self._output_nonlinearity,
                                trainable=trainable,
                                name="output")

        output_var = L.get_output(l_output)

        # Compiled Q(s, a) evaluation function.
        self._f_qval = tensor_utils.compile_function(
            [l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action

        LayersPowered.__init__(self, [l_output])
def __init__(
        self,
        env_spec,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_sizes=(),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        prob_network=None,
        name="CategoricalConvPolicy",
):
    """Categorical policy with a convolutional feature extractor.

    :param env_spec: A spec for the mdp. The action space must be
        Discrete.
    :param conv_filters: number of filters for each conv layer
    :param conv_filter_sizes: filter size for each conv layer
    :param conv_strides: stride for each conv layer
    :param conv_pads: padding mode for each conv layer
    :param hidden_sizes: list of sizes for the fully connected hidden
        layers. The default was changed from a mutable `[]` literal to
        an immutable `()` to avoid the shared-mutable-default pitfall;
        behavior is unchanged.
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output
        (probability) layer
    :param prob_network: manually specified network for this policy,
        other network params are ignored
    :param name: variable scope name for the policy
    :return:
    """
    assert isinstance(env_spec.action_space, Discrete)

    Serializable.quick_init(self, locals())
    self._name = name
    self._env_spec = env_spec
    self._prob_network_name = "prob_network"

    with tf.variable_scope(name, "CategoricalConvPolicy"):
        if prob_network is None:
            # Default conv network with a softmax head sized to the
            # action space.
            prob_network = ConvNetwork(
                input_shape=env_spec.observation_space.shape,
                output_dim=env_spec.action_space.n,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                name="conv_prob_network",
            )

        with tf.name_scope(self._prob_network_name):
            out_prob = L.get_output(prob_network.output_layer)

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        # Compiled function mapping observations to action probabilities.
        self._f_prob = tensor_utils.compile_function(
            [prob_network.input_layer.input_var], [out_prob])

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalConvPolicy, self).__init__(env_spec)
    LayersPowered.__init__(self, [prob_network.output_layer])
def build_net(self, trainable=True, name=None):
    """
    Set up q network based on class attributes.

    This function uses layers defined in garage.tf.

    Args:
        trainable: A bool indicates whether variables are trainable.
        name: Variable scope name for the network.

    Returns:
        tuple: Compiled Q-value function and the output, observation
            and action layers.
    """
    with tf.variable_scope(name):
        l_obs = L.InputLayer(shape=(None, self._obs_dim), name="obs")
        l_action = L.InputLayer(shape=(None, self._action_dim),
                                name="actions")

        n_layers = len(self._hidden_sizes) + 1

        # Normalize the (possibly negative) merge index into
        # [0, n_layers); with a single layer, merge after it.
        if n_layers > 1:
            action_merge_layer = \
                (self._action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        l_hidden = l_obs

        for idx, size in enumerate(self._hidden_sizes):
            if self._batch_norm:
                l_hidden = batch_norm(l_hidden)

            # Concatenate the action input at the configured depth.
            if idx == action_merge_layer:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_hidden = L.DenseLayer(l_hidden,
                                    num_units=size,
                                    nonlinearity=self._hidden_nonlinearity,
                                    trainable=trainable,
                                    name="hidden_%d" % (idx + 1))

        # Merge the action just before the output layer if requested.
        if action_merge_layer == n_layers:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_output = L.DenseLayer(l_hidden,
                                num_units=1,
                                nonlinearity=self._output_nonlinearity,
                                trainable=trainable,
                                name="output")

        output_var = L.get_output(l_output)

        # Compiled Q(s, a) evaluation function.
        f_qval = tensor_utils.compile_function(
            [l_obs.input_var, l_action.input_var], output_var)
        output_layer = l_output
        obs_layer = l_obs
        action_layer = l_action

    return f_qval, output_layer, obs_layer, action_layer
def update_opt(self,
               loss,
               target,
               inputs,
               extra_inputs=None,
               name=None,
               *args,
               **kwargs):
    """Build the loss and gradient functions for the L-BFGS optimizer.

    Args:
        loss: Symbolic expression for the loss function.
        target: A parameterized object to optimize over. It should
            implement methods of the
            :class:`garage.core.paramerized.Parameterized` class.
        inputs: A list of symbolic variables as inputs.
        extra_inputs: Optional list of extra symbolic inputs.
        name (str): Optional tf.name_scope name.
        args: Unused positional arguments.
        kwargs: Unused keyword arguments.

    Returns:
        No return value.
    """
    self._target = target
    params = target.get_params(trainable=True)
    with tf.name_scope(name, "LbfgsOptimizer",
                       [loss, inputs, params, extra_inputs]):

        def get_opt_output():
            # (loss, flat gradient) pair cast to float64 for the
            # downstream L-BFGS routine.
            with tf.name_scope("get_opt_output", [loss, params]):
                flat_grad = tensor_utils.flatten_tensor_variables(
                    tf.gradients(loss, params))
                return [
                    tf.cast(loss, tf.float64),
                    tf.cast(flat_grad, tf.float64)
                ]

        if extra_inputs is None:
            extra_inputs = list()
        # Both functions are compiled lazily on first use.
        self._opt_fun = ext.lazydict(
            f_loss=lambda: tensor_utils.compile_function(
                inputs + extra_inputs, loss),
            f_opt=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=get_opt_output(),
            ))
def _build_entropy_term(self, i):
    """Build the policy entropy tensor (or a log-likelihood surrogate).

    Args:
        i (namedtuple): Collection of input variables for the policy
            optimization graph.

    Returns:
        tf.Tensor: Policy entropy term.
    """
    with tf.name_scope("policy_entropy"):
        if self.policy.recurrent:
            policy_dist_info = self.policy.dist_info_sym(
                i.obs_var,
                i.policy_state_info_vars,
                name="policy_dist_info")

            policy_neg_log_likeli = self.policy.distribution.log_likelihood_sym(  # noqa: E501
                i.action_var,
                policy_dist_info,
                name="policy_log_likeli")

            if self._use_neg_logli_entropy:
                # NOTE(review): despite the variable name, this is the
                # raw log-likelihood, not its negation -- confirm the
                # intended sign (the newer implementation negates it).
                policy_entropy = policy_neg_log_likeli
            else:
                policy_entropy = self.policy.distribution.entropy_sym(
                    policy_dist_info)
        else:
            policy_dist_info_flat = self.policy.dist_info_sym(
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name="policy_dist_info_flat_entropy")

            # Keep only entries for valid (non-padded) time steps.
            policy_dist_info_valid = filter_valids_dict(
                policy_dist_info_flat,
                i.flat.valid_var,
                name="policy_dist_info_valid")

            policy_neg_log_likeli_valid = self.policy.distribution.log_likelihood_sym(  # noqa: E501
                i.valid.action_var,
                policy_dist_info_valid,
                name="policy_log_likeli")

            if self._use_neg_logli_entropy:
                # NOTE(review): same un-negated value as above -- confirm.
                policy_entropy = policy_neg_log_likeli_valid
            else:
                policy_entropy = self.policy.distribution.entropy_sym(
                    policy_dist_info_valid)

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            policy_entropy = tf.nn.softplus(policy_entropy)

        if self._stop_entropy_gradient:
            policy_entropy = tf.stop_gradient(policy_entropy)

        # Diagnostic function for logging the entropy value.
        self.f_policy_entropy = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            policy_entropy,
            log_name="f_policy_entropy")

    return policy_entropy
def __init__(self, dim, name=None):
    """Categorical distribution over `dim` outcomes.

    Args:
        dim (int): Number of categories.
        name (str): Variable-scope name; defaults to "Categorical".
    """
    with tf.variable_scope(name, "Categorical"):
        self._dim = dim
        self._name = name
        # Placeholder for a batch of (possibly unnormalized) weights.
        probs_ph = tf.placeholder(
            dtype=tf.float32, shape=(None, dim), name="weights")
        # Small epsilon keeps log() finite for zero weights.
        logits = tf.log(probs_ph + 1e-8)
        samples = tf.multinomial(logits, num_samples=1)[:, 0]
        self._f_sample = compile_function(
            inputs=[probs_ph],
            outputs=samples,
        )
def test_likelihood_ratio_sym(self):
    """Likelihood ratios from both policies must agree."""
    ratios = []
    cases = ((self.policy1, self.dist1_sym, self.dist2_sym,
              'li_ratio_sym1'),
             (self.policy3, self.dist3_sym, self.dist4_sym,
              'li_ratio_sym2'))
    for pol, dist_old, dist_new, sym_name in cases:
        ratio_sym = pol.distribution.likelihood_ratio_sym(
            self.action_ph, dist_old, dist_new, name=sym_name)
        f_ratio = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], ratio_sym)
        ratios.append(f_ratio([[1, 1]], self.obs))
    assert ratios[0] == ratios[1]
def __init__(self, dim, name=None):
    """Categorical distribution over `dim` outcomes.

    Args:
        dim (int): Number of categories.
        name (str): Variable-scope name; defaults to 'Categorical'.
    """
    with tf.compat.v1.variable_scope(name, 'Categorical'):
        self._dim = dim
        self._name = name
        # Placeholder for a batch of (possibly unnormalized) weights.
        probs_ph = tf.compat.v1.placeholder(dtype=tf.float32,
                                            shape=(None, dim),
                                            name='weights')
        # Small epsilon keeps log() finite for zero weights.
        logits = tf.math.log(probs_ph + 1e-8)
        samples = tf.random.categorical(logits, num_samples=1)[:, 0]
        self._f_sample = compile_function(
            inputs=[probs_ph],
            outputs=samples,
        )
def __init__(
        self,
        env_spec,
        name='CategoricalMLPPolicy',
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
):
    """CategoricalMLPPolicy.

    A policy that uses a MLP to estimate a categorical distribution.

    Args:
        env_spec (garage.envs.env_spec.EnvSpec): Environment
            specification.
        hidden_sizes (list[int]): Output dimension of dense layer(s).
            For example, (32, 32) means the MLP of this policy consists
            of two hidden layers, each with 32 hidden units.
        hidden_nonlinearity: Activation function for intermediate dense
            layer(s).
        prob_network (tf.Tensor): manually specified network for this
            policy. If None, a MLP with the network parameters will be
            created. If not None, other network params are ignored.

    """
    assert isinstance(env_spec.action_space, akro.Discrete)
    Serializable.quick_init(self, locals())
    self.name = name
    self._prob_network_name = 'prob_network'

    with tf.variable_scope(name, 'CategoricalMLPPolicy'):
        if prob_network is None:
            # Default MLP with a softmax head sized to the action space.
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim, ),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=tf.nn.softmax,
                name=self._prob_network_name,
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer

        with tf.name_scope(self._prob_network_name):
            network_out = L.get_output(prob_network.output_layer)
            # Compiled function mapping observations to probabilities.
            self._f_prob = tensor_utils.compile_function(
                [prob_network.input_layer.input_var], network_out)

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalMLPPolicy, self).__init__(env_spec)
    LayersPowered.__init__(self, [prob_network.output_layer])
def test_likelihood_ratio_sym(self):
    """Likelihood ratios of equivalent policies must be element-wise equal."""
    ratios = []
    for pol, dist_old, dist_new, sym_name in (
            (self.policy1, self.dist1_sym, self.dist2_sym,
             'li_ratio_sym1'),
            (self.policy3, self.dist3_sym, self.dist4_sym,
             'li_ratio_sym2')):
        ratio_sym = pol.distribution.likelihood_ratio_sym(
            self.action_ph, dist_old, dist_new, name=sym_name)
        f_ratio = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], ratio_sym)
        ratios.append(f_ratio(np.ones((2, 1, 1)), [self.obs, self.obs]))
    assert np.array_equal(ratios[0], ratios[1])
def update_opt(self,
               target,
               leq_constraint,
               inputs,
               extra_inputs=None,
               constraint_name="constraint",
               *args,
               **kwargs):
    """Update the internal tensowflow operations.

    Parameters
    ----------
    target : A parameterized object to optimize over. It should
        implement methods of the
        :py:class:`garage.core.paramerized.Parameterized` class.
    leq_constraint : :py:class:'tensorflow.Tensor'
        The variable to be constrained.
    inputs : A list of symbolic variables as inputs, which could be
        subsampled if needed. It is assumed that the first dimension
        of these inputs should correspond to the number of data points.
    extra_inputs : A list of symbolic variables as extra inputs which
        should not be subsampled.
    constraint_name : str
        Name used when logging the constraint value.
    """
    inputs = tuple(inputs)
    if extra_inputs is None:
        extra_inputs = tuple()
    else:
        extra_inputs = tuple(extra_inputs)

    # In this variant leq_constraint is a plain tensor rather than the
    # historical (tensor, epsilon) pair, so nothing is unpacked here.
    # constraint_term, constraint_value = leq_constraint
    constraint_term = leq_constraint

    # params = target.get_params(trainable=True)

    # Build the Hessian-vector product machinery on the constraint term.
    self._hvp_approach.update_hvp(f=constraint_term,
                                  target=target,
                                  inputs=inputs + extra_inputs,
                                  reg_coeff=self._reg_coeff)

    self._target = target
    # No epsilon is provided, so the constraint bound is effectively
    # unbounded for this optimizer.
    # self._max_constraint_val = constraint_value
    self._max_constraint_val = np.inf
    self._constraint_name = constraint_name

    # Compiled lazily on first use.
    self._opt_fun = LazyDict(
        f_constraint=lambda: tensor_utils.compile_function(
            inputs=inputs + extra_inputs,
            outputs=constraint_term,
            log_name="constraint",
        ), )
def _initialize(self):
    """Build old/new networks, NLL loss and trust-region KL graphs.

    Builds a snapshot ("old") network initialized from the current
    parameters, a negative-log-likelihood loss over labels normalized
    with the model's output statistics, and the mean KL between the two
    normalized distributions used as the trust-region constraint.
    """
    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + self._input_shape)

    self._old_model.build(input_var)
    # Snapshot network starts from the current model parameters.
    self._old_model.parameters = self.model.parameters

    with tf.compat.v1.variable_scope(self._variable_scope):
        self.model.build(input_var)

        ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                          name='ys',
                                          shape=(None, self._output_dim))

        y_mean_var = self.model.networks['default'].y_mean
        y_std_var = self.model.networks['default'].y_std
        means_var = self.model.networks['default'].mean
        normalized_means_var = self.model.networks[
            'default'].normalized_mean
        normalized_log_stds_var = self.model.networks[
            'default'].normalized_log_std

        # Normalize the labels so the loss is computed in the same
        # normalized space as the network's distribution.
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        old_normalized_dist = self._old_model.networks[
            'default'].normalized_dist
        normalized_dist = self.model.networks['default'].normalized_dist

        mean_kl = tf.reduce_mean(
            old_normalized_dist.kl_divergence(normalized_dist))

        loss = -tf.reduce_mean(normalized_dist.log_prob(normalized_ys_var))

        # Prediction returns the unnormalized mean.
        self._f_predict = tensor_utils.compile_function([input_var],
                                                        means_var)

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[
                normalized_means_var, normalized_log_stds_var
            ],
        )

        if self._use_trust_region:
            # Bound each update's mean KL by max_kl_step.
            optimizer_args['leq_constraint'] = (mean_kl, self._max_kl_step)
        optimizer_args['inputs'] = [input_var, ys_var]

        with tf.name_scope('update_opt'):
            self._optimizer.update_opt(**optimizer_args)
def update_opt(self, f, target, inputs, reg_coeff, name=None):
    """Build the finite-difference Hessian-vector-product functions.

    Args:
        f: Symbolic expression whose Hessian-vector product is needed.
        target: A parameterized object; its flat parameter values are
            perturbed by the finite-difference steps.
        inputs: List of symbolic input variables for f.
        reg_coeff (float): Regularization coefficient; stored here and
            presumably applied by the CG solver -- verify against the
            caller.
        name (str): Optional tf.name_scope name.
    """
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    with tf.name_scope(name, "FiniteDifferenceHvp",
                       [f, inputs, params, target]):
        constraint_grads = tf.gradients(f,
                                        xs=params,
                                        name="gradients_constraint")
        # Zero-fill gradients for parameters that do not affect f so the
        # flattened gradient keeps a fixed layout.
        for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
            if grad is None:
                constraint_grads[idx] = tf.zeros_like(param)

        flat_grad = tensor_utils.flatten_tensor_variables(constraint_grads)

        def f_hx_plain(*args):
            # Numerically approximates H.v by differencing the gradient
            # at perturbed parameter points param +/- eps * v.
            with tf.name_scope("f_hx_plain", values=[inputs, self.target]):
                inputs_ = args[:len(inputs)]
                xs = args[len(inputs):]
                flat_xs = np.concatenate(
                    [np.reshape(x, (-1, )) for x in xs])
                param_val = self.target.get_param_values(trainable=True)
                # Step size scaled by the parameter norm for stability.
                eps = np.cast['float32'](
                    self.base_eps / (np.linalg.norm(param_val) + 1e-8))
                self.target.set_param_values(param_val + eps * flat_xs,
                                             trainable=True)
                flat_grad_dvplus = self.opt_fun["f_grad"](*inputs_)
                # Restore the original parameters before the next step.
                self.target.set_param_values(param_val, trainable=True)
                if self.symmetric:
                    # Central difference:
                    # (g(p + eps*v) - g(p - eps*v)) / (2*eps)
                    self.target.set_param_values(param_val - eps * flat_xs,
                                                 trainable=True)
                    flat_grad_dvminus = self.opt_fun["f_grad"](*inputs_)
                    hx = (flat_grad_dvplus - flat_grad_dvminus) / (2 * eps)
                    self.target.set_param_values(param_val, trainable=True)
                else:
                    # Forward difference: (g(p + eps*v) - g(p)) / eps.
                    # NOTE(review): this local `flat_grad` shadows the
                    # graph tensor of the same name defined above.
                    flat_grad = self.opt_fun["f_grad"](*inputs_)
                    hx = (flat_grad_dvplus - flat_grad) / eps
                return hx

        self.opt_fun = ext.LazyDict(
            f_grad=lambda: tensor_utils.compile_function(
                inputs=inputs,
                outputs=flat_grad,
                log_name="f_grad",
            ),
            f_hx_plain=lambda: f_hx_plain,
        )
def _build_net(self, reuse=None, custom_getter=None, trainable=None):
    """
    Set up q network based on class attributes.

    This function uses layers defined in garage.tf.

    Args:
        reuse: A bool indicates whether reuse variables in the same scope.
        custom_getter: A customized getter object used to get variables.
        trainable: A bool indicates whether variables are trainable.
    """
    with tf.variable_scope(self.name,
                           reuse=reuse,
                           custom_getter=custom_getter):
        obs_in = layers.InputLayer(shape=(None, self._obs_dim), name="obs")

        # Stack of dense layers, optionally batch-normalized.
        net = obs_in
        for idx, hidden_size in enumerate(self._hidden_sizes):
            if self._batch_norm:
                net = batch_norm(net)
            net = layers.DenseLayer(
                net,
                hidden_size,
                nonlinearity=self._hidden_nonlinearity,
                trainable=trainable,
                name="hidden_%d" % idx)

        out = layers.DenseLayer(
            net,
            self._action_dim,
            nonlinearity=self._output_nonlinearity,
            trainable=trainable,
            name="output")

        with tf.name_scope(self._policy_network_name):
            action = layers.get_output(out)
            # Scale the raw output to the environment's action bound.
            scaled_action = tf.multiply(action,
                                        self._action_bound,
                                        name="scaled_action")

        self._f_prob_online = tensor_utils.compile_function(
            inputs=[obs_in.input_var], outputs=scaled_action)
        self._output_layer = out
        self._obs_layer = obs_in

        LayersPowered.__init__(self, [out])