def update_opt(self, loss, target, inputs, extra_inputs=None, name=None,
               **kwargs):
    """Construct operation graph for the optimizer.

    Args:
        loss (tf.Tensor): Loss objective to minimize.
        target (object): Target object to optimize. The object should
            implement `get_params()` and `get_param_values()`.
        inputs (list[tf.Tensor]): List of input placeholders.
        extra_inputs (list[tf.Tensor]): List of extra input placeholders.
        name (str): Name scope.
        kwargs (dict): Extra unused keyword arguments. Some optimizers
            have extra input, e.g. KL constraint.

    """
    self._target = target
    params = target.get_params()
    with tf.name_scope(name, 'LbfgsOptimizer',
                       [loss, inputs, params, extra_inputs]):

        def get_opt_output():
            """Helper function to construct graph.

            Returns:
                list[tf.Tensor]: Loss and gradient tensor.

            """
            with tf.name_scope('get_opt_output', values=[loss, params]):
                flat_grad = tensor_utils.flatten_tensor_variables(
                    tf.gradients(loss, params))
                return [
                    tf.cast(loss, tf.float64),
                    tf.cast(flat_grad, tf.float64)
                ]

        if extra_inputs is None:
            extra_inputs = list()

        self._opt_fun = LazyDict(
            f_loss=lambda: tensor_utils.compile_function(
                inputs + extra_inputs, loss),
            f_opt=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=get_opt_output(),
            ))
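# A minimal, self-contained sketch (not this class's driver) of why f_opt
# returns [loss, flat_grad] cast to float64: SciPy's L-BFGS accepts a single
# callable returning (loss, grad) when jac=True, and expects double precision.
# The quadratic below is a stand-in for the compiled graph output.
import numpy as np
from scipy.optimize import minimize

A = np.diag([1.0, 10.0])
b = np.array([1.0, -2.0])

def f_opt(x):
    # Stand-in for the compiled function: returns (loss, flat gradient).
    loss = 0.5 * x @ A @ x - b @ x
    grad = A @ x - b
    return loss, grad

result = minimize(f_opt, x0=np.zeros(2), jac=True, method='L-BFGS-B',
                  options={'maxiter': 10})
print(result.x)  # ~ A^{-1} b = [1.0, -0.2]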
def _initialize(self):
    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + self._input_shape)

    with tf.compat.v1.variable_scope(self._name) as vs:
        self._variable_scope = vs
        self.model.build(input_var)

        ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                          name='ys',
                                          shape=(None, self._output_dim))

        y_hat = self.model.networks['default'].y_hat
        loss = tf.reduce_mean(tf.square(y_hat - ys_var))

        self._f_predict = tensor_utils.compile_function([input_var], y_hat)

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[ys_var],
        )
        optimizer_args['inputs'] = [input_var, ys_var]

        with tf.name_scope('update_opt'):
            self._optimizer.update_opt(**optimizer_args)
def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
    """Construct operation graph for the optimizer.

    Args:
        loss (tf.Tensor): Loss objective to minimize.
        target (object): Target object to optimize. The object should
            implement `get_params()` and `get_param_values()`.
        inputs (list[tf.Tensor]): List of input placeholders.
        extra_inputs (list[tf.Tensor]): List of extra input placeholders.
        kwargs (dict): Extra unused keyword arguments. Some optimizers
            have extra input, e.g. KL constraint.

    """
    with tf.name_scope(
            self._name,
            values=[loss, target.get_params(), inputs, extra_inputs]):
        self._target = target
        self._train_op = self._tf_optimizer.minimize(
            loss, var_list=target.get_params())

        if extra_inputs is None:
            extra_inputs = list()
        self._input_vars = inputs + extra_inputs
        self._opt_fun = LazyDict(
            f_loss=lambda: tensor_utils.compile_function(
                inputs + extra_inputs, loss),
        )
def _build_entropy_term(self, i):
    """Build policy entropy tensor.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy entropy.

    """
    pol_dist = self.policy.distribution

    with tf.name_scope('policy_entropy'):
        if self._use_neg_logli_entropy:
            policy_entropy = -pol_dist.log_prob(i.action_var,
                                                name='policy_log_likeli')
        else:
            policy_entropy = pol_dist.entropy()

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            policy_entropy = tf.nn.softplus(policy_entropy)

        if self._stop_entropy_gradient:
            policy_entropy = tf.stop_gradient(policy_entropy)

    # dense form, match the shape of advantage
    policy_entropy = tf.reshape(policy_entropy, [-1, self.max_path_length])

    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), policy_entropy)

    return policy_entropy
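# Why the softplus guard matters, as a standalone check: a Gaussian policy's
# differential entropy 0.5 * log(2*pi*e*sigma^2) goes negative once sigma is
# small, which would flip the sign of an entropy bonus. softplus(x) =
# log(1 + exp(x)) maps it back to a small positive value.
import numpy as np

def gaussian_entropy(sigma):
    return 0.5 * np.log(2 * np.pi * np.e * sigma**2)

def softplus(x):
    return np.log1p(np.exp(x))

for sigma in (1.0, 0.1, 0.01):
    h = gaussian_entropy(sigma)
    print(f'sigma={sigma}: entropy={h:.3f}, softplus={softplus(h):.3f}')
# sigma=0.01 gives entropy ~ -3.19, while softplus yields ~ 0.040 (> 0).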
def update_hvp(self, f, target, inputs, reg_coeff, name='PearlmutterHvp'):
    """Build the symbolic graph to compute the Hessian-vector product.

    Args:
        f (tf.Tensor): The function whose Hessian needs to be computed.
        target (metarl.tf.policies.Policy): A parameterized object to
            optimize over.
        inputs (tuple[tf.Tensor]): The inputs for function f.
        reg_coeff (float): A small value so that A -> A + reg*I.
        name (str): Name to be used in tf.name_scope.

    """
    self._target = target
    self._reg_coeff = reg_coeff
    params = target.get_params()
    with tf.name_scope(name):
        constraint_grads = tf.gradients(f,
                                        xs=params,
                                        name='gradients_constraint')
        for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
            if grad is None:
                constraint_grads[idx] = tf.zeros_like(param)

        xs = tuple([
            tensor_utils.new_tensor_like(p.name.split(':')[0], p)
            for p in params
        ])

        def hx_plain():
            """Computes product of Hessian(f) and vector v.

            Returns:
                tf.Tensor: Symbolic result.

            """
            with tf.name_scope('hx_plain'):
                with tf.name_scope('hx_function'):
                    hx_f = tf.reduce_sum(
                        tf.stack([
                            tf.reduce_sum(g * x)
                            for g, x in zip(constraint_grads, xs)
                        ]))
                hx_plain_splits = tf.gradients(hx_f,
                                               params,
                                               name='gradients_hx_plain')
                for idx, (hx, param) in enumerate(
                        zip(hx_plain_splits, params)):
                    if hx is None:
                        hx_plain_splits[idx] = tf.zeros_like(param)
                return tensor_utils.flatten_tensor_variables(
                    hx_plain_splits)

        self._hvp_fun = LazyDict(
            f_hx_plain=lambda: tensor_utils.compile_function(
                inputs=inputs + xs,
                outputs=hx_plain(),
                log_name='f_hx_plain',
            ), )
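# The Pearlmutter trick in isolation: Hv = grad_theta(grad_theta(f) . v), so
# two first-order gradient passes replace ever forming the Hessian. A
# standalone check on f(theta) = 0.5 * theta^T A theta, whose exact Hessian
# is A (assumes TF1-style graph mode, matching the surrounding code):
import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

A = np.array([[2.0, 1.0], [1.0, 3.0]], dtype=np.float32)
theta = tf.compat.v1.get_variable('theta',
                                  initializer=np.ones(2, np.float32))
v = tf.compat.v1.placeholder(tf.float32, shape=(2, ), name='v')

f = 0.5 * tf.reduce_sum(theta * tf.linalg.matvec(A, theta))
(grad, ) = tf.gradients(f, [theta])
# Differentiating (grad . v) again yields H v without building H.
(hvp, ) = tf.gradients(tf.reduce_sum(grad * v), [theta])

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    print(sess.run(hvp, feed_dict={v: np.array([1.0, -1.0], np.float32)}))
    # == A @ [1, -1] = [1., -2.]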
def _initialize(self):
    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + self._input_shape)

    with tf.compat.v1.variable_scope(self._variable_scope):
        self.model.build(input_var)

        ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                          name='ys',
                                          shape=(None, self._output_dim))

        old_prob_var = tf.compat.v1.placeholder(
            dtype=tf.float32,
            name='old_prob',
            shape=(None, self._output_dim))

        y_hat = self.model.networks['default'].y_hat

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=y_hat)

        self._dist = Categorical(self._output_dim)
        mean_kl = tf.reduce_mean(
            self._dist.kl_sym(old_info_vars, info_vars))

        loss = -tf.reduce_mean(
            self._dist.log_likelihood_sym(ys_var, info_vars))

        # pylint: disable=no-value-for-parameter
        predicted = tf.one_hot(tf.argmax(y_hat, axis=1),
                               depth=self._output_dim)

        self._f_predict = tensor_utils.compile_function([input_var],
                                                        predicted)
        self._f_prob = tensor_utils.compile_function([input_var], y_hat)

        self._optimizer.update_opt(loss=loss,
                                   target=self,
                                   inputs=[input_var, ys_var])
        self._tr_optimizer.update_opt(
            loss=loss,
            target=self,
            inputs=[input_var, ys_var, old_prob_var],
            leq_constraint=(mean_kl, self._max_kl_step))
def __init__(self, dim, name=None):
    with tf.compat.v1.variable_scope(name, 'Categorical'):
        self._dim = dim
        self._name = name
        weights_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                               shape=(None, dim),
                                               name='weights')
        self._f_sample = compile_function(
            inputs=[weights_var],
            outputs=tf.random.categorical(
                tf.math.log(weights_var + 1e-8), num_samples=1)[:, 0],
        )
def _initialize(self):
    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + self._input_shape)

    self._old_model.build(input_var)
    self._old_model.parameters = self.model.parameters

    with tf.compat.v1.variable_scope(self._variable_scope):
        self.model.build(input_var)

        ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                          name='ys',
                                          shape=(None, self._output_dim))

        y_mean_var = self.model.networks['default'].y_mean
        y_std_var = self.model.networks['default'].y_std
        means_var = self.model.networks['default'].mean
        normalized_means_var = self.model.networks[
            'default'].normalized_mean
        normalized_log_stds_var = self.model.networks[
            'default'].normalized_log_std

        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        old_normalized_dist = self._old_model.networks[
            'default'].normalized_dist
        normalized_dist = self.model.networks['default'].normalized_dist

        mean_kl = tf.reduce_mean(
            old_normalized_dist.kl_divergence(normalized_dist))

        loss = -tf.reduce_mean(normalized_dist.log_prob(normalized_ys_var))

        self._f_predict = tensor_utils.compile_function([input_var],
                                                        means_var)

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[
                normalized_means_var, normalized_log_stds_var
            ],
        )

        if self._use_trust_region:
            optimizer_args['leq_constraint'] = (mean_kl, self._max_kl_step)
        optimizer_args['inputs'] = [input_var, ys_var]

        with tf.name_scope('update_opt'):
            self._optimizer.update_opt(**optimizer_args)
def _build_policy_loss(self, i):
    """Build policy loss and other output tensors.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss.

    Raises:
        NotImplementedError: If is_recurrent is True.

    """
    pol_dist = self.policy.distribution

    # Initialize dual params
    self._param_eta = 15.
    self._param_v = np.random.rand(
        self._env_spec.observation_space.flat_dim * 2 + 4)

    with tf.name_scope('bellman_error'):
        delta_v = tf.boolean_mask(i.reward_var,
                                  i.valid_var) + tf.tensordot(
                                      i.feat_diff, i.param_v, 1)

    with tf.name_scope('policy_loss'):
        ll = pol_dist.log_prob(i.action_var)
        ll = tf.boolean_mask(ll, i.valid_var)
        loss = -tf.reduce_mean(
            ll * tf.exp(delta_v / i.param_eta -
                        tf.reduce_max(delta_v / i.param_eta)))

        reg_params = self.policy.get_regularizable_vars()
        loss += self._l2_reg_loss * tf.reduce_sum(
            [tf.reduce_mean(tf.square(param))
             for param in reg_params]) / len(reg_params)

    with tf.name_scope('kl'):
        kl = self._old_policy.distribution.kl_divergence(
            self.policy.distribution)
        pol_mean_kl = tf.reduce_mean(kl)

    with tf.name_scope('dual'):
        dual_loss = i.param_eta * self._epsilon + (
            i.param_eta * tf.math.log(
                tf.reduce_mean(
                    tf.exp(delta_v / i.param_eta -
                           tf.reduce_max(delta_v / i.param_eta)))) +
            i.param_eta * tf.reduce_max(delta_v / i.param_eta))

        dual_loss += self._l2_reg_dual * (tf.square(i.param_eta) +
                                          tf.square(1 / i.param_eta))

        dual_grad = tf.gradients(dual_loss, [i.param_eta, i.param_v])

    # yapf: disable
    self._f_dual = tensor_utils.compile_function(
        flatten_inputs(self._dual_opt_inputs),
        dual_loss,
        log_name='f_dual')
    # yapf: enable

    self._f_dual_grad = tensor_utils.compile_function(
        flatten_inputs(self._dual_opt_inputs),
        dual_grad,
        log_name='f_dual_grad')

    self._f_policy_kl = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        pol_mean_kl,
        log_name='f_policy_kl')

    return loss
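# How f_dual is typically consumed downstream (a sketch, assuming a
# SciPy-driven outer loop as in standard REPS; here v is held fixed, while
# the real optimizer updates eta and v jointly via f_dual_grad): minimize
# g(eta) = eta*epsilon + eta*log E[exp(delta/eta)] over eta > 0, using the
# same max-subtraction stabilization as the graph above.
import numpy as np
from scipy.optimize import minimize

epsilon = 0.5                                         # KL bound
delta_v = np.random.default_rng(0).normal(size=128)   # toy Bellman errors

def dual(x):
    eta = max(x[0], 1e-6)
    max_d = np.max(delta_v / eta)
    log_mean = np.log(np.mean(np.exp(delta_v / eta - max_d)))
    return eta * epsilon + eta * log_mean + eta * max_d

res = minimize(dual, x0=np.array([15.0]), method='L-BFGS-B',
               bounds=[(1e-6, None)])
print(res.x)  # temperature eta* enforcing the epsilon KL bound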
def init_opt(self):
    """Build the loss function and init the optimizer."""
    with tf.name_scope(self.name, 'TD3'):
        # Create target policy (actor) and qf (critic) networks
        self.target_policy_f_prob_online = tensor_utils.compile_function(
            inputs=[self.target_policy.model.networks['default'].input],
            outputs=self.target_policy.model.networks['default'].outputs)

        self.target_qf_f_prob_online = tensor_utils.compile_function(
            inputs=self.target_qf.model.networks['default'].inputs,
            outputs=self.target_qf.model.networks['default'].outputs)

        self.target_qf2_f_prob_online = tensor_utils.compile_function(
            inputs=self.target_qf2.model.networks['default'].inputs,
            outputs=self.target_qf2.model.networks['default'].outputs)

        # Set up target init and update functions
        with tf.name_scope('setup_target'):
            policy_init_op, policy_update_op = tensor_utils.get_target_ops(
                self.policy.get_global_vars(),
                self.target_policy.get_global_vars(), self.tau)
            qf_init_ops, qf_update_ops = tensor_utils.get_target_ops(
                self.qf.get_global_vars(),
                self.target_qf.get_global_vars(), self.tau)
            qf2_init_ops, qf2_update_ops = tensor_utils.get_target_ops(
                self.qf2.get_global_vars(),
                self.target_qf2.get_global_vars(), self.tau)
            target_init_op = policy_init_op + qf_init_ops + qf2_init_ops
            target_update_op = (policy_update_op + qf_update_ops +
                                qf2_update_ops)

        f_init_target = tensor_utils.compile_function(
            inputs=[], outputs=target_init_op)
        f_update_target = tensor_utils.compile_function(
            inputs=[], outputs=target_update_op)

        with tf.name_scope('inputs'):
            if self.input_include_goal:
                obs_dim = self.env_spec.observation_space.\
                    flat_dim_with_keys(['observation', 'desired_goal'])
            else:
                obs_dim = self.env_spec.observation_space.flat_dim
            y = tf.placeholder(tf.float32, shape=(None, 1), name='input_y')
            obs = tf.placeholder(tf.float32,
                                 shape=(None, obs_dim),
                                 name='input_observation')
            actions = tf.placeholder(
                tf.float32,
                shape=(None, self.env_spec.action_space.flat_dim),
                name='input_action')

        # Set up policy training function
        next_action = self.policy.get_action_sym(obs, name='policy_action')
        next_qval = self.qf.get_qval_sym(obs,
                                         next_action,
                                         name='policy_action_qval')
        with tf.name_scope('action_loss'):
            action_loss = -tf.reduce_mean(next_qval)

        with tf.name_scope('minimize_action_loss'):
            policy_train_op = self.policy_optimizer(
                self.policy_lr, name='PolicyOptimizer').minimize(
                    action_loss, var_list=self.policy.get_trainable_vars())

        f_train_policy = tensor_utils.compile_function(
            inputs=[obs], outputs=[policy_train_op, action_loss])

        # Set up qf training function
        qval = self.qf.get_qval_sym(obs, actions, name='q_value')
        q2val = self.qf2.get_qval_sym(obs, actions, name='q2_value')
        with tf.name_scope('qval1_loss'):
            qval1_loss = tf.reduce_mean(
                tf.math.squared_difference(y, qval))
        with tf.name_scope('qval2_loss'):
            qval2_loss = tf.reduce_mean(
                tf.math.squared_difference(y, q2val))

        with tf.name_scope('minimize_qf_loss'):
            qf_train_op = self.qf_optimizer(
                self.qf_lr, name='QFunctionOptimizer').minimize(
                    qval1_loss, var_list=self.qf.get_trainable_vars())
            qf2_train_op = self.qf_optimizer(
                self.qf_lr, name='QFunctionOptimizer').minimize(
                    qval2_loss, var_list=self.qf2.get_trainable_vars())

        f_train_qf = tensor_utils.compile_function(
            inputs=[y, obs, actions],
            outputs=[qf_train_op, qval1_loss, qval])
        f_train_qf2 = tensor_utils.compile_function(
            inputs=[y, obs, actions],
            outputs=[qf2_train_op, qval2_loss, q2val])

        self.f_train_policy = f_train_policy
        self.f_train_qf = f_train_qf
        self.f_init_target = f_init_target
        self.f_update_target = f_update_target
        self.f_train_qf2 = f_train_qf2
def init_opt(self):
    """Build the loss function and init the optimizer."""
    with tf.name_scope(self.name, 'DDPG'):
        # Create target policy and qf network
        self.target_policy_f_prob_online = tensor_utils.compile_function(
            inputs=[self.target_policy.model.networks['default'].input],
            outputs=self.target_policy.model.networks['default'].outputs)

        self.target_qf_f_prob_online = tensor_utils.compile_function(
            inputs=self.target_qf.model.networks['default'].inputs,
            outputs=self.target_qf.model.networks['default'].outputs)

        # Set up target init and update function
        with tf.name_scope('setup_target'):
            ops = tensor_utils.get_target_ops(
                self.policy.get_global_vars(),
                self.target_policy.get_global_vars(), self.tau)
            policy_init_ops, policy_update_ops = ops
            qf_init_ops, qf_update_ops = tensor_utils.get_target_ops(
                self.qf.get_global_vars(),
                self.target_qf.get_global_vars(), self.tau)
            target_init_op = policy_init_ops + qf_init_ops
            target_update_op = policy_update_ops + qf_update_ops

        f_init_target = tensor_utils.compile_function(
            inputs=[], outputs=target_init_op)
        f_update_target = tensor_utils.compile_function(
            inputs=[], outputs=target_update_op)

        with tf.name_scope('inputs'):
            if self.input_include_goal:
                obs_dim = self.env_spec.observation_space.\
                    flat_dim_with_keys(['observation', 'desired_goal'])
            else:
                obs_dim = self.env_spec.observation_space.flat_dim
            input_y = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, 1),
                                               name='input_y')
            obs = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim),
                                           name='input_observation')
            actions = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self.env_spec.action_space.flat_dim),
                name='input_action')

        # Set up policy training function
        next_action = self.policy.get_action_sym(obs, name='policy_action')
        next_qval = self.qf.get_qval_sym(obs,
                                         next_action,
                                         name='policy_action_qval')
        with tf.name_scope('action_loss'):
            action_loss = -tf.reduce_mean(next_qval)
            if self.policy_weight_decay > 0.:
                policy_reg = tc.layers.apply_regularization(
                    tc.layers.l2_regularizer(self.policy_weight_decay),
                    weights_list=self.policy.get_regularizable_vars())
                action_loss += policy_reg

        with tf.name_scope('minimize_action_loss'):
            policy_train_op = self.policy_optimizer(
                self.policy_lr, name='PolicyOptimizer').minimize(
                    action_loss, var_list=self.policy.get_trainable_vars())

        f_train_policy = tensor_utils.compile_function(
            inputs=[obs], outputs=[policy_train_op, action_loss])

        # Set up qf training function
        qval = self.qf.get_qval_sym(obs, actions, name='q_value')
        with tf.name_scope('qval_loss'):
            qval_loss = tf.reduce_mean(
                tf.compat.v1.squared_difference(input_y, qval))
            if self.qf_weight_decay > 0.:
                qf_reg = tc.layers.apply_regularization(
                    tc.layers.l2_regularizer(self.qf_weight_decay),
                    weights_list=self.qf.get_regularizable_vars())
                qval_loss += qf_reg

        with tf.name_scope('minimize_qf_loss'):
            qf_train_op = self.qf_optimizer(
                self.qf_lr, name='QFunctionOptimizer').minimize(
                    qval_loss, var_list=self.qf.get_trainable_vars())

        f_train_qf = tensor_utils.compile_function(
            inputs=[input_y, obs, actions],
            outputs=[qf_train_op, qval_loss, qval])

        self.f_train_policy = f_train_policy
        self.f_train_qf = f_train_qf
        self.f_init_target = f_init_target
        self.f_update_target = f_update_target
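# What get_target_ops builds, in miniature (a hypothetical sketch named
# make_target_ops; the real helper lives in tensor_utils and may differ in
# signature): hard-copy ops for initialization plus Polyak-averaged soft
# updates target <- tau * online + (1 - tau) * target, run as graph ops in
# the TF1-style sessions used above.
def make_target_ops(online_vars, target_vars, tau):
    # One-time hard copy: target starts identical to the online network.
    init_ops = [t.assign(o) for o, t in zip(online_vars, target_vars)]
    # Soft update: slowly track the online network for stable TD targets.
    update_ops = [
        t.assign(tau * o + (1.0 - tau) * t)
        for o, t in zip(online_vars, target_vars)
    ]
    return init_ops, update_ops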
def update_opt(
        self,
        loss,
        target,
        leq_constraint,
        inputs,
        extra_inputs=None,
        name=None,
        constraint_name='constraint',
):
    """Update the optimizer.

    Build the functions for computing loss, gradient, and the constraint
    value.

    Args:
        loss (tf.Tensor): Symbolic expression for the loss function.
        target (metarl.tf.policies.Policy): A parameterized object to
            optimize over.
        leq_constraint (tuple[tf.Tensor, float]): A constraint provided
            as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
        inputs (list[tf.Tensor]): A list of symbolic variables as inputs,
            which could be subsampled if needed. It is assumed that the
            first dimension of these inputs should correspond to the
            number of data points.
        extra_inputs (list[tf.Tensor]): A list of symbolic variables as
            extra inputs which should not be subsampled.
        name (str): Name to be passed to tf.name_scope.
        constraint_name (str): A constraint name for the purpose of
            logging and variable names.

    """
    params = target.get_params()
    ns_vals = [loss, target, leq_constraint, inputs, extra_inputs, params]
    with tf.name_scope(name, 'ConjugateGradientOptimizer', ns_vals):
        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        constraint_term, constraint_value = leq_constraint

        with tf.name_scope('loss_gradients', values=[loss, params]):
            grads = tf.gradients(loss, xs=params)
            for idx, (grad, param) in enumerate(zip(grads, params)):
                if grad is None:
                    grads[idx] = tf.zeros_like(param)
            flat_grad = tensor_utils.flatten_tensor_variables(grads)

        self._hvp_approach.update_hvp(f=constraint_term,
                                      target=target,
                                      inputs=inputs + extra_inputs,
                                      reg_coeff=self._reg_coeff,
                                      name='update_opt_' + constraint_name)

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        self._opt_fun = LazyDict(
            f_loss=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=loss,
                log_name='f_loss',
            ),
            f_grad=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=flat_grad,
                log_name='f_grad',
            ),
            f_constraint=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=constraint_term,
                log_name='constraint',
            ),
            f_loss_constraint=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=[loss, constraint_term],
                log_name='f_loss_constraint',
            ),
        )
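# Downstream, the compiled f_grad and the HVP approach are combined by
# conjugate gradient to solve H x = g without ever materializing H. A
# standalone CG sketch (assumes `hvp` is any callable returning H @ v, e.g.
# the compiled f_hx_plain above):
import numpy as np

def conjugate_gradient(hvp, g, iters=10, tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()          # residual g - H @ x, with x = 0 initially
    p = r.copy()
    r_dot = r @ r
    for _ in range(iters):
        hp = hvp(p)
        alpha = r_dot / (p @ hp)
        x += alpha * p
        r -= alpha * hp
        new_r_dot = r @ r
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x

H = np.array([[2.0, 0.5], [0.5, 1.0]])
g = np.array([1.0, 1.0])
print(conjugate_gradient(lambda v: H @ v, g))  # ~ np.linalg.solve(H, g)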
def _initialize(self):
    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + self._input_shape)

    with tf.compat.v1.variable_scope(self._variable_scope):
        self.model.build(input_var)

        ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                          name='ys',
                                          shape=(None, self._output_dim))

        old_means_var = tf.compat.v1.placeholder(
            dtype=tf.float32,
            name='old_means',
            shape=(None, self._output_dim))
        old_log_stds_var = tf.compat.v1.placeholder(
            dtype=tf.float32,
            name='old_log_stds',
            shape=(None, self._output_dim))

        y_mean_var = self.model.networks['default'].y_mean
        y_std_var = self.model.networks['default'].y_std
        means_var = self.model.networks['default'].means
        log_stds_var = self.model.networks['default'].log_stds
        normalized_means_var = self.model.networks[
            'default'].normalized_means
        normalized_log_stds_var = self.model.networks[
            'default'].normalized_log_stds

        normalized_ys_var = (ys_var - y_mean_var) / y_std_var
        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = (old_log_stds_var -
                                       tf.math.log(y_std_var))

        normalized_dist_info_vars = dict(
            mean=normalized_means_var, log_std=normalized_log_stds_var)

        mean_kl = tf.reduce_mean(
            self.model.networks['default'].dist.kl_sym(
                dict(mean=normalized_old_means_var,
                     log_std=normalized_old_log_stds_var),
                normalized_dist_info_vars,
            ))

        loss = -tf.reduce_mean(
            self.model.networks['default'].dist.log_likelihood_sym(
                normalized_ys_var, normalized_dist_info_vars))

        self._f_predict = tensor_utils.compile_function([input_var],
                                                        means_var)
        self._f_pdists = tensor_utils.compile_function(
            [input_var], [means_var, log_stds_var])

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[
                normalized_means_var, normalized_log_stds_var
            ],
        )

        if self._use_trust_region:
            optimizer_args['leq_constraint'] = (mean_kl, self._max_kl_step)
            optimizer_args['inputs'] = [
                input_var, ys_var, old_means_var, old_log_stds_var
            ]
        else:
            optimizer_args['inputs'] = [input_var, ys_var]

        with tf.name_scope('update_opt'):
            self._optimizer.update_opt(**optimizer_args)
def update_hvp(self, f, target, inputs, reg_coeff, name=None):
    """Build the symbolic graph to compute the Hessian-vector product.

    Args:
        f (tf.Tensor): The function whose Hessian needs to be computed.
        target (metarl.tf.policies.Policy): A parameterized object to
            optimize over.
        inputs (tuple[tf.Tensor]): The inputs for function f.
        reg_coeff (float): A small value so that A -> A + reg*I.
        name (str): Name to be used in tf.name_scope.

    """
    self._target = target
    self._reg_coeff = reg_coeff
    params = target.get_params()

    with tf.name_scope(name, 'FiniteDifferenceHvp',
                       [f, inputs, params, target]):
        constraint_grads = tf.gradients(f,
                                        xs=params,
                                        name='gradients_constraint')
        for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
            if grad is None:
                constraint_grads[idx] = tf.zeros_like(param)

        flat_grad = tensor_utils.flatten_tensor_variables(constraint_grads)

        def f_hx_plain(*args):
            """Computes product of Hessian(f) and vector v numerically.

            Args:
                args (tuple[numpy.ndarray]): Contains inputs of function f
                    and vector v.

            Returns:
                numpy.ndarray: Finite-difference estimate of the product.

            """
            with tf.name_scope('f_hx_plain',
                               values=[inputs, self._target]):
                inputs_ = args[:len(inputs)]
                xs = args[len(inputs):]
                flat_xs = np.concatenate(
                    [np.reshape(x, (-1, )) for x in xs])
                param_val = self._target.get_param_values()
                eps = np.cast['float32'](
                    self.base_eps / (np.linalg.norm(param_val) + 1e-8))
                self._target.set_param_values(param_val + eps * flat_xs)
                flat_grad_dvplus = self._hvp_fun['f_grad'](*inputs_)
                self._target.set_param_values(param_val)
                if self.symmetric:
                    self._target.set_param_values(param_val -
                                                  eps * flat_xs)
                    flat_grad_dvminus = self._hvp_fun['f_grad'](*inputs_)
                    hx = (flat_grad_dvplus -
                          flat_grad_dvminus) / (2 * eps)
                    self._target.set_param_values(param_val)
                else:
                    flat_grad = self._hvp_fun['f_grad'](*inputs_)
                    hx = (flat_grad_dvplus - flat_grad) / eps
            return hx

        self._hvp_fun = LazyDict(
            f_grad=lambda: tensor_utils.compile_function(
                inputs=inputs,
                outputs=flat_grad,
                log_name='f_grad',
            ),
            f_hx_plain=lambda: f_hx_plain,
        )
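# The same estimator in NumPy, checked against an exact Hessian: with a
# gradient function g(.), (g(theta + eps*v) - g(theta - eps*v)) / (2*eps)
# ~= H v. This is the symmetric variant above; the one-sided branch drops
# the minus-perturbation term.
import numpy as np

A = np.array([[2.0, 1.0], [1.0, 3.0]])

def grad(theta):
    # Gradient of f(theta) = 0.5 * theta^T A theta, whose Hessian is A.
    return A @ theta

theta = np.array([1.0, -1.0])
v = np.array([0.5, 2.0])
eps = 1e-5

hv = (grad(theta + eps * v) - grad(theta - eps * v)) / (2 * eps)
print(hv, A @ v)  # both ~ [3., 6.5] (exact here, since grad is linear)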
def _build_entropy_term(self, i):
    """Build policy entropy tensor.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy entropy.

    """
    with tf.name_scope('policy_entropy'):
        if self.policy.recurrent:
            policy_dist_info = self.policy.dist_info_sym(
                i.obs_var,
                i.policy_state_info_vars,
                name='policy_dist_info_2')
            policy_neg_log_likeli = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                i.action_var, policy_dist_info, name='policy_log_likeli')

            if self._use_neg_logli_entropy:
                policy_entropy = policy_neg_log_likeli
            else:
                policy_entropy = self.policy.distribution.entropy_sym(
                    policy_dist_info)
        else:
            policy_dist_info_flat = self.policy.dist_info_sym(
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name='policy_dist_info_flat_2')
            policy_neg_log_likeli_flat = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                i.flat.action_var,
                policy_dist_info_flat,
                name='policy_log_likeli_flat')

            policy_dist_info_valid = filter_valids_dict(
                policy_dist_info_flat,
                i.flat.valid_var,
                name='policy_dist_info_valid_2')
            policy_neg_log_likeli_valid = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                i.valid.action_var,
                policy_dist_info_valid,
                name='policy_log_likeli_valid')

            if self._use_neg_logli_entropy:
                if self._maximum_entropy:
                    policy_entropy = tf.reshape(
                        policy_neg_log_likeli_flat,
                        [-1, self.max_path_length])
                else:
                    policy_entropy = policy_neg_log_likeli_valid
            else:
                if self._maximum_entropy:
                    policy_entropy_flat = self.policy.distribution.entropy_sym(  # noqa: E501
                        policy_dist_info_flat)
                    policy_entropy = tf.reshape(
                        policy_entropy_flat, [-1, self.max_path_length])
                else:
                    policy_entropy_valid = self.policy.distribution.entropy_sym(  # noqa: E501
                        policy_dist_info_valid)
                    policy_entropy = policy_entropy_valid

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            policy_entropy = tf.nn.softplus(policy_entropy)

        if self._stop_entropy_gradient:
            policy_entropy = tf.stop_gradient(policy_entropy)

    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        policy_entropy,
        log_name='f_policy_entropy')

    return policy_entropy
def init_opt(self):
    """Initialize the networks and Ops.

    Assume discrete space for dqn, so action dimension will always
    be action_space.n
    """
    action_dim = self.env_spec.action_space.n

    self.episode_rewards = []
    self.episode_qf_losses = []

    # build q networks
    with tf.name_scope(self._name):
        action_t_ph = tf.compat.v1.placeholder(tf.int32,
                                               None,
                                               name='action')
        reward_t_ph = tf.compat.v1.placeholder(tf.float32,
                                               None,
                                               name='reward')
        done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done')

        with tf.name_scope('update_ops'):
            target_update_op = tensor_utils.get_target_ops(
                self.qf.get_global_vars(),
                self._target_qf.get_global_vars())

        self._qf_update_ops = tensor_utils.compile_function(
            inputs=[], outputs=target_update_op)

        with tf.name_scope('td_error'):
            # Q-value of the selected action
            action = tf.one_hot(action_t_ph,
                                action_dim,
                                on_value=1.,
                                off_value=0.)
            q_selected = tf.reduce_sum(
                self.qf.q_vals * action,  # yapf: disable
                axis=1)

            # r + Q'(s', argmax_a(Q(s', _)) - Q(s, a)
            if self._double_q:
                target_qval_with_online_q = self.qf.get_qval_sym(
                    self._target_qf.input, self.qf.name)
                future_best_q_val_action = tf.argmax(
                    target_qval_with_online_q, 1)
                future_best_q_val = tf.reduce_sum(
                    self._target_qf.q_vals *
                    tf.one_hot(future_best_q_val_action,
                               action_dim,
                               on_value=1.,
                               off_value=0.),
                    axis=1)
            else:
                # r + max_a(Q'(s', _)) - Q(s, a)
                future_best_q_val = tf.reduce_max(self._target_qf.q_vals,
                                                  axis=1)

            q_best_masked = (1.0 - done_t_ph) * future_best_q_val
            # if done, it's just reward
            # else reward + discount * future_best_q_val
            target_q_values = (reward_t_ph + self.discount * q_best_masked)

            # td_error = q_selected - tf.stop_gradient(target_q_values)
            loss = tf.compat.v1.losses.huber_loss(
                q_selected, tf.stop_gradient(target_q_values))
            loss = tf.reduce_mean(loss)

        with tf.name_scope('optimize_ops'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr)
            if self._grad_norm_clipping is not None:
                gradients = qf_optimizer.compute_gradients(
                    loss, var_list=self.qf.get_trainable_vars())
                for i, (grad, var) in enumerate(gradients):
                    if grad is not None:
                        gradients[i] = (tf.clip_by_norm(
                            grad, self._grad_norm_clipping), var)
                optimize_loss = qf_optimizer.apply_gradients(gradients)
            else:
                optimize_loss = qf_optimizer.minimize(
                    loss, var_list=self.qf.get_trainable_vars())

        self._train_qf = tensor_utils.compile_function(
            inputs=[
                self.qf.input, action_t_ph, reward_t_ph, done_t_ph,
                self._target_qf.input
            ],
            outputs=[loss, optimize_loss])
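# The double-Q target in miniature: the online network selects argmax_a, the
# target network evaluates it, damping the max-operator overestimation of
# vanilla DQN. A NumPy sketch of the tensor logic above, with toy values:
import numpy as np

q_online_next = np.array([[1.0, 3.0], [2.0, 0.5]])   # Q(s', .), online net
q_target_next = np.array([[0.8, 2.5], [1.9, 0.7]])   # Q'(s', .), target net
reward = np.array([1.0, 0.0])
done = np.array([0.0, 1.0])
discount = 0.99

best_action = np.argmax(q_online_next, axis=1)        # select with online
future_q = q_target_next[np.arange(2), best_action]   # evaluate with target
target_q = reward + discount * (1.0 - done) * future_q
print(target_q)  # [1 + 0.99 * 2.5, 0] = [3.475, 0.]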
def _build_policy_loss(self, i):
    """Build policy loss and other output tensors.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss.
        tf.Tensor: Mean policy KL divergence.

    """
    pol_dist = self.policy.distribution
    policy_entropy = self._build_entropy_term(i)
    rewards = i.reward_var

    if self._maximum_entropy:
        with tf.name_scope('augmented_rewards'):
            rewards = i.reward_var + (self._policy_ent_coeff *
                                      policy_entropy)

    with tf.name_scope('policy_loss'):
        adv = compute_advantages(self.discount,
                                 self.gae_lambda,
                                 self.max_path_length,
                                 i.baseline_var,
                                 rewards,
                                 name='adv')

        adv_flat = flatten_batch(adv, name='adv_flat')
        adv_valid = filter_valids(adv_flat,
                                  i.flat.valid_var,
                                  name='adv_valid')

        if self.policy.recurrent:
            adv = tf.reshape(adv, [-1, self.max_path_length])

        # Optionally normalize advantages
        eps = tf.constant(1e-8, dtype=tf.float32)
        if self.center_adv:
            if self.policy.recurrent:
                adv = center_advs(adv, axes=[0], eps=eps)
            else:
                adv_valid = center_advs(adv_valid, axes=[0], eps=eps)

        if self.positive_adv:
            if self.policy.recurrent:
                adv = positive_advs(adv, eps)
            else:
                adv_valid = positive_advs(adv_valid, eps)

        if self.policy.recurrent:
            policy_dist_info = self.policy.dist_info_sym(
                i.obs_var,
                i.policy_state_info_vars,
                name='policy_dist_info')
        else:
            policy_dist_info_flat = self.policy.dist_info_sym(
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name='policy_dist_info_flat')

            policy_dist_info_valid = filter_valids_dict(
                policy_dist_info_flat,
                i.flat.valid_var,
                name='policy_dist_info_valid')

            policy_dist_info = policy_dist_info_valid

        # Calculate loss function and KL divergence
        with tf.name_scope('kl'):
            if self.policy.recurrent:
                kl = pol_dist.kl_sym(
                    i.policy_old_dist_info_vars,
                    policy_dist_info,
                )
                pol_mean_kl = tf.reduce_sum(
                    kl * i.valid_var) / tf.reduce_sum(i.valid_var)
            else:
                kl = pol_dist.kl_sym(
                    i.valid.policy_old_dist_info_vars,
                    policy_dist_info_valid,
                )
                pol_mean_kl = tf.reduce_mean(kl)

        # Calculate vanilla loss
        with tf.name_scope('vanilla_loss'):
            if self.policy.recurrent:
                ll = pol_dist.log_likelihood_sym(i.action_var,
                                                 policy_dist_info,
                                                 name='log_likelihood')
                vanilla = ll * adv * i.valid_var
            else:
                ll = pol_dist.log_likelihood_sym(i.valid.action_var,
                                                 policy_dist_info_valid,
                                                 name='log_likelihood')
                vanilla = ll * adv_valid

        # Calculate surrogate loss
        with tf.name_scope('surrogate_loss'):
            if self.policy.recurrent:
                lr = pol_dist.likelihood_ratio_sym(
                    i.action_var,
                    i.policy_old_dist_info_vars,
                    policy_dist_info,
                    name='lr')
                surrogate = lr * adv * i.valid_var
            else:
                lr = pol_dist.likelihood_ratio_sym(
                    i.valid.action_var,
                    i.valid.policy_old_dist_info_vars,
                    policy_dist_info_valid,
                    name='lr')
                surrogate = lr * adv_valid

        # Finalize objective function
        with tf.name_scope('loss'):
            if self._pg_loss == 'vanilla':
                # VPG uses the vanilla objective
                obj = tf.identity(vanilla, name='vanilla_obj')
            elif self._pg_loss == 'surrogate':
                # TRPO uses the standard surrogate objective
                obj = tf.identity(surrogate, name='surr_obj')
            elif self._pg_loss == 'surrogate_clip':
                lr_clip = tf.clip_by_value(lr,
                                           1 - self._lr_clip_range,
                                           1 + self._lr_clip_range,
                                           name='lr_clip')
                if self.policy.recurrent:
                    surr_clip = lr_clip * adv * i.valid_var
                else:
                    surr_clip = lr_clip * adv_valid
                obj = tf.minimum(surrogate, surr_clip, name='surr_obj')

            if self._entropy_regularzied:
                obj += self._policy_ent_coeff * policy_entropy

            # Maximize E[surrogate objective] by minimizing
            # -E_t[surrogate objective]
            if self.policy.recurrent:
                loss = -tf.reduce_sum(obj) / tf.reduce_sum(i.valid_var)
            else:
                loss = -tf.reduce_mean(obj)

    # Diagnostic functions
    self._f_policy_kl = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        pol_mean_kl,
        log_name='f_policy_kl')

    self._f_rewards = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        rewards,
        log_name='f_rewards')

    returns = discounted_returns(self.discount, self.max_path_length,
                                 rewards)
    self._f_returns = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        returns,
        log_name='f_returns')

    return loss, pol_mean_kl
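# The 'surrogate_clip' branch in isolation (PPO's pessimistic bound): take
# the minimum of the unclipped and clipped ratio-weighted advantage, so a
# step that pushes the likelihood ratio outside [1 - eps, 1 + eps] earns no
# extra objective. A NumPy sketch with toy values:
import numpy as np

lr = np.array([0.5, 1.0, 1.5])   # likelihood ratios pi_new / pi_old
adv = np.array([1.0, 1.0, 1.0])  # advantages
eps = 0.2

surrogate = lr * adv
surr_clip = np.clip(lr, 1 - eps, 1 + eps) * adv
obj = np.minimum(surrogate, surr_clip)
print(obj)  # [0.5, 1.0, 1.2]: the ratio 1.5 is capped at 1 + eps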
def update_opt(self, loss, target, leq_constraint, inputs, constraint_name='constraint', name=None, **kwargs): """Construct operation graph for the optimizer. Args: loss (tf.Tensor): Loss objective to minimize. target (object): Target object to optimize. The object should implemenet `get_params()` and `get_param_values`. leq_constraint (tuple): It contains a tf.Tensor and a float value. The tf.Tensor represents the constraint term, and the float value is the constraint value. inputs (list[tf.Tensor]): List of input placeholders. constraint_name (str): Constraint name for logging. name (str): Name scope. kwargs (dict): Extra unused keyword arguments. Some optimizers have extra input, e.g. KL constraint. """ params = target.get_params() with tf.name_scope(name, 'PenaltyLbfgsOptimizer', [leq_constraint, loss, params]): constraint_term, constraint_value = leq_constraint penalty_var = tf.compat.v1.placeholder(tf.float32, tuple(), name='penalty') penalized_loss = loss + penalty_var * constraint_term self._target = target self._max_constraint_val = constraint_value self._constraint_name = constraint_name def get_opt_output(): """Helper function to construct graph. Returns: list[tf.Tensor]: Penalized loss and gradient tensor. """ with tf.name_scope('get_opt_output', values=[params, penalized_loss]): grads = tf.gradients(penalized_loss, params) for idx, (grad, param) in enumerate(zip(grads, params)): if grad is None: grads[idx] = tf.zeros_like(param) flat_grad = tensor_utils.flatten_tensor_variables(grads) return [ tf.cast(penalized_loss, tf.float64), tf.cast(flat_grad, tf.float64), ] self._opt_fun = LazyDict( f_loss=lambda: tensor_utils.compile_function( inputs, loss, log_name='f_loss'), f_constraint=lambda: tensor_utils.compile_function( inputs, constraint_term, log_name='f_constraint'), f_penalized_loss=lambda: tensor_utils.compile_function( inputs=inputs + [penalty_var], outputs=[penalized_loss, loss, constraint_term], log_name='f_penalized_loss', ), f_opt=lambda: tensor_utils.compile_function( inputs=inputs + [penalty_var], outputs=get_opt_output(), ))
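# How a penalized loss like this is typically driven: since penalty_var is
# fed at call time, an outer loop can search for the penalty that makes the
# constraint bind. The sketch below (hypothetical helper `adapt_penalty`;
# the optimizer's actual search and scaling factors may differ) scales the
# penalty up while the constraint is violated and relaxes it when slack.
def adapt_penalty(optimize_at, f_constraint, max_constraint_val,
                  penalty=1.0, scale=2.0, max_itr=10):
    for _ in range(max_itr):
        optimize_at(penalty)               # run L-BFGS on loss + p * f
        if f_constraint() <= max_constraint_val:
            penalty /= scale               # feasible: relax the penalty
        else:
            penalty *= scale               # violated: penalize harder
    return penalty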