def update_opt(self,
               loss,
               target,
               inputs,
               extra_inputs=None,
               name='LBFGSOptimizer',
               **kwargs):
    """Construct operation graph for the optimizer.

    Args:
        loss (tf.Tensor): Loss objective to minimize.
        target (object): Target object to optimize. The object should
            implement `get_params()` and `get_param_values()`.
        inputs (list[tf.Tensor]): List of input placeholders.
        extra_inputs (list[tf.Tensor]): List of extra input placeholders.
        name (str): Name scope.
        kwargs (dict): Extra unused keyword arguments. Some optimizers
            have extra input, e.g. KL constraint.

    """
    del kwargs
    self._target = target
    params = target.get_params()
    with tf.name_scope(name):

        def get_opt_output():
            """Helper function to construct graph.

            Returns:
                list[tf.Tensor]: Loss and gradient tensor.

            """
            with tf.name_scope('get_opt_output'):
                flat_grad = flatten_tensor_variables(
                    tf.gradients(loss, params))
                return [
                    tf.cast(loss, tf.float64),
                    tf.cast(flat_grad, tf.float64)
                ]

        if extra_inputs is None:
            extra_inputs = list()
        self._opt_fun = LazyDict(
            f_loss=lambda: compile_function(inputs + extra_inputs, loss),
            f_opt=lambda: compile_function(
                inputs=inputs + extra_inputs,
                outputs=get_opt_output(),
            ))
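# A minimal, hypothetical sketch of the lazy-compilation pattern used by
# `_opt_fun` above (LazyDictSketch is a stand-in, not garage's LazyDict):
# each entry is a zero-argument factory that is called once, on first
# access, so the TF functions are only compiled when the optimizer needs
# them.
class LazyDictSketch:
    """Map keys to factories; build and cache each value on first access."""

    def __init__(self, **factories):
        self._factories = factories
        self._cache = {}

    def __getitem__(self, key):
        if key not in self._cache:
            self._cache[key] = self._factories[key]()
        return self._cache[key]


opt_fun = LazyDictSketch(
    f_loss=lambda: (lambda *args: 0.0),         # placeholder "compiled" fns
    f_opt=lambda: (lambda *args: (0.0, None)),
)
f_loss = opt_fun['f_loss']  # the f_loss factory runs only here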
def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
    """Construct operation graph for the optimizer.

    Args:
        loss (tf.Tensor): Loss objective to minimize.
        target (object): Target object to optimize. The object should
            implement `get_params()` and `get_param_values()`.
        inputs (list[tf.Tensor]): List of input placeholders.
        extra_inputs (list[tf.Tensor]): List of extra input placeholders.
        kwargs (dict): Extra unused keyword arguments. Some optimizers
            have extra input, e.g. KL constraint.

    """
    del kwargs
    with tf.name_scope(self._name):
        self._target = target
        tf_optimizer = make_optimizer(self._tf_optimizer,
                                      **self._learning_rate)
        self._train_op = tf_optimizer.minimize(
            loss, var_list=target.get_params())

        if extra_inputs is None:
            extra_inputs = list()
        self._input_vars = inputs + extra_inputs
        self._opt_fun = LazyDict(
            f_loss=lambda: compile_function(inputs + extra_inputs, loss), )
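# A self-contained sketch (plain TF1-style code, not garage's
# compile_function/make_optimizer helpers) of how a train op like the one
# built above is typically driven: feed minibatches and run the op together
# with the loss.
import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

x = tf.compat.v1.placeholder(tf.float32, shape=(None, 3), name='x')
y = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='y')
w = tf.compat.v1.get_variable('w', shape=(3, 1))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2)
train_op = optimizer.minimize(loss, var_list=[w])

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    xs = np.random.randn(64, 3).astype(np.float32)
    ys = xs @ np.array([[1.], [2.], [3.]], dtype=np.float32)
    for _ in range(100):
        _, cur_loss = sess.run([train_op, loss], feed_dict={x: xs, y: ys})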
def _initialize(self):
    """Build the input placeholders, model outputs, loss, and optimizer."""
    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + self._input_shape)
    ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                      name='ys',
                                      shape=(None, self._output_dim))

    self._old_network = self._old_model.build(input_var)
    (norm_dist, norm_mean, norm_log_std, _, mean, _, self._x_mean,
     self._x_std, self._y_mean, self._y_std) = self.build(input_var).outputs

    normalized_ys_var = (ys_var - self._y_mean) / self._y_std
    old_normalized_dist = self._old_network.normalized_dist

    mean_kl = tf.reduce_mean(old_normalized_dist.kl_divergence(norm_dist))
    loss = -tf.reduce_mean(norm_dist.log_prob(normalized_ys_var))

    self._f_predict = compile_function([input_var], mean)

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[norm_mean, norm_log_std],
    )

    if self._use_trust_region:
        optimizer_args['leq_constraint'] = (mean_kl, self._max_kl_step)
    optimizer_args['inputs'] = [input_var, ys_var]

    with tf.name_scope('update_opt'):
        self._optimizer.update_opt(**optimizer_args)
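# A hypothetical numeric check of the trust-region term above: for
# univariate Gaussians, KL(old || new) has the closed form below (the same
# per-dimension quantity `old_normalized_dist.kl_divergence(norm_dist)`
# computes), and `mean_kl` is its batch mean, constrained to stay under
# `self._max_kl_step`.
import numpy as np


def gaussian_kl(mu0, sigma0, mu1, sigma1):
    """KL(N(mu0, sigma0^2) || N(mu1, sigma1^2))."""
    return (np.log(sigma1 / sigma0)
            + (sigma0**2 + (mu0 - mu1)**2) / (2.0 * sigma1**2) - 0.5)


print(gaussian_kl(0.0, 1.0, 0.0, 1.0))  # identical distributions -> 0.0
print(gaussian_kl(0.0, 1.0, 0.5, 1.0))  # shifted mean -> 0.125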
def _build_entropy_term(self, i):
    """Build policy entropy tensor.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy entropy.

    """
    pol_dist = self._policy_network.dist

    with tf.name_scope('policy_entropy'):
        if self._use_neg_logli_entropy:
            policy_entropy = -pol_dist.log_prob(i.action_var,
                                                name='policy_log_likeli')
        else:
            policy_entropy = pol_dist.entropy()

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            policy_entropy = tf.nn.softplus(policy_entropy)

        if self._stop_entropy_gradient:
            policy_entropy = tf.stop_gradient(policy_entropy)

        # dense form, match the shape of advantage
        policy_entropy = tf.reshape(policy_entropy,
                                    [-1, self.max_episode_length])

    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), policy_entropy)

    return policy_entropy
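# Numeric sketch of why softplus is applied above: the differential entropy
# of a Gaussian, 0.5 * log(2 * pi * e * sigma^2), turns negative once sigma
# is small, and softplus(x) = log(1 + exp(x)) maps it back to a positive
# bonus so the entropy regularizer never penalizes the objective.
import numpy as np

sigma = 0.05
entropy = 0.5 * np.log(2.0 * np.pi * np.e * sigma**2)  # ~ -1.58 (negative)
softplus_entropy = np.log1p(np.exp(entropy))           # ~ 0.19 (positive)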
def _build_encoder_kl(self):
    """Build graph for encoder KL divergence.

    Returns:
        tf.Tensor: Encoder KL divergence.

    """
    dist = self._encoder_network.dist
    old_dist = self._old_encoder_network.dist

    with tf.name_scope('encoder_kl'):
        kl = old_dist.kl_divergence(dist)
        mean_kl = tf.reduce_mean(kl)

        # Diagnostic function
        self._f_encoder_kl = compile_function(
            flatten_inputs(self._policy_opt_inputs), mean_kl)

        return mean_kl
def _initialize(self):
    """Build the input placeholders, model outputs, loss, and optimizer."""
    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + self._input_shape)
    ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                      name='ys',
                                      shape=(None, self._output_dim))

    (self._y_hat, self._x_mean,
     self._x_std) = self.build(input_var).outputs

    loss = tf.reduce_mean(tf.square(self._y_hat - ys_var))

    self._f_predict = compile_function([input_var], self._y_hat)

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[ys_var],
    )

    optimizer_args['inputs'] = [input_var, ys_var]

    with tf.name_scope('update_opt'):
        self._optimizer.update_opt(**optimizer_args)
def _init_opt(self):
    """Build the loss function and init the optimizer."""
    with tf.name_scope(self._name):
        # Create target policy (actor) and qf (critic) networks
        with tf.name_scope('inputs'):
            obs_dim = self._env_spec.observation_space.flat_dim
            y = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='input_y')
            obs = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim),
                                           name='input_observation')
            actions = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self._env_spec.action_space.flat_dim),
                name='input_action')

        policy_network_outputs = self._target_policy.build(obs,
                                                           name='policy')
        target_qf_outputs = self._target_qf.build(obs, actions, name='qf')
        target_qf2_outputs = self._target_qf2.build(obs, actions, name='qf')

        self._target_policy_f_prob_online = compile_function(
            inputs=[obs], outputs=policy_network_outputs)
        self._target_qf_f_prob_online = compile_function(
            inputs=[obs, actions], outputs=target_qf_outputs)
        self._target_qf2_f_prob_online = compile_function(
            inputs=[obs, actions], outputs=target_qf2_outputs)

        # Set up target init and update functions
        with tf.name_scope('setup_target'):
            policy_init_op, policy_update_op = get_target_ops(
                self.policy.get_global_vars(),
                self._target_policy.get_global_vars(), self._tau)
            qf_init_ops, qf_update_ops = get_target_ops(
                self.qf.get_global_vars(),
                self._target_qf.get_global_vars(), self._tau)
            qf2_init_ops, qf2_update_ops = get_target_ops(
                self.qf2.get_global_vars(),
                self._target_qf2.get_global_vars(), self._tau)
            target_init_op = policy_init_op + qf_init_ops + qf2_init_ops
            target_update_op = (policy_update_op + qf_update_ops +
                                qf2_update_ops)

        f_init_target = compile_function(inputs=[],
                                         outputs=target_init_op)
        f_update_target = compile_function(inputs=[],
                                           outputs=target_update_op)

        # Set up policy training function
        next_action = self.policy.build(obs, name='policy_action')
        next_qval = self.qf.build(obs,
                                  next_action,
                                  name='policy_action_qval')
        with tf.name_scope('action_loss'):
            action_loss = -tf.reduce_mean(next_qval)

        with tf.name_scope('minimize_action_loss'):
            policy_optimizer = make_optimizer(
                self._policy_optimizer,
                learning_rate=self._policy_lr,
                name='PolicyOptimizer')
            policy_train_op = policy_optimizer.minimize(
                action_loss, var_list=self.policy.get_trainable_vars())

        f_train_policy = compile_function(
            inputs=[obs], outputs=[policy_train_op, action_loss])

        # Set up qf training function
        qval = self.qf.build(obs, actions, name='q_value')
        q2val = self.qf2.build(obs, actions, name='q2_value')
        with tf.name_scope('qval1_loss'):
            qval1_loss = tf.reduce_mean(
                tf.math.squared_difference(y, qval))
        with tf.name_scope('qval2_loss'):
            qval2_loss = tf.reduce_mean(
                tf.math.squared_difference(y, q2val))

        with tf.name_scope('minimize_qf_loss'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr,
                                          name='QFunctionOptimizer')
            qf_train_op = qf_optimizer.minimize(
                qval1_loss, var_list=self.qf.get_trainable_vars())
            qf2_train_op = qf_optimizer.minimize(
                qval2_loss, var_list=self.qf2.get_trainable_vars())

        f_train_qf = compile_function(
            inputs=[y, obs, actions],
            outputs=[qf_train_op, qval1_loss, qval])
        f_train_qf2 = compile_function(
            inputs=[y, obs, actions],
            outputs=[qf2_train_op, qval2_loss, q2val])

        self._f_train_policy = f_train_policy
        self._f_train_qf = f_train_qf
        self._f_init_target = f_init_target
        self._f_update_target = f_update_target
        self._f_train_qf2 = f_train_qf2
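# A numpy sketch of the soft ("Polyak") target update that `get_target_ops`
# is assumed to build from each online/target variable pair: the init op
# copies the online weights into the target, and each update op blends them
# with coefficient tau.
import numpy as np

tau = 0.005
online_w = np.array([1.0, 2.0, 3.0])
target_w = online_w.copy()                            # target_init_op

online_w += 0.1                                       # one training step
target_w = tau * online_w + (1.0 - tau) * target_w    # target_update_op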
def _build_entropy_terms(self, i):
    """Build policy entropy tensor and related entropy terms.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Encoder entropy.
        tf.Tensor: Inference cross entropy.
        tf.Tensor: Policy entropy.

    """
    pol_dist = self._policy_network.dist
    infer_dist = self._infer_network.dist
    enc_dist = self._encoder_network.dist
    with tf.name_scope('entropy_terms'):
        # 1. Encoder distribution total entropy
        with tf.name_scope('encoder_entropy'):
            encoder_dist, _, _ = self.policy.encoder.build(
                i.task_var, name='encoder_entropy').outputs
            encoder_all_task_entropies = -encoder_dist.log_prob(
                i.latent_var)

            encoder_entropy = encoder_all_task_entropies
            if self._use_softplus_entropy:
                encoder_entropy = tf.nn.softplus(
                    encoder_all_task_entropies)

            encoder_entropy = tf.reduce_mean(encoder_entropy,
                                             name='encoder_entropy')
            encoder_entropy = tf.stop_gradient(encoder_entropy)

        # 2. Inference distribution cross-entropy (log-likelihood)
        with tf.name_scope('inference_ce'):
            # Build inference with trajectory windows
            traj_ll = infer_dist.log_prob(
                enc_dist.sample(seed=deterministic.get_tf_seed_stream()),
                name='traj_ll')

            inference_ce_raw = -traj_ll
            inference_ce = tf.clip_by_value(inference_ce_raw, -3, 3)

            if self._use_softplus_entropy:
                inference_ce = tf.nn.softplus(inference_ce)

            if self._stop_ce_gradient:
                inference_ce = tf.stop_gradient(inference_ce)

        # 3. Policy path entropies
        with tf.name_scope('policy_entropy'):
            policy_entropy = -pol_dist.log_prob(i.action_var,
                                                name='policy_log_likeli')

            # This prevents entropy from becoming negative
            # for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            policy_entropy = tf.stop_gradient(policy_entropy)

    # Diagnostic functions
    self._f_task_entropies = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        encoder_all_task_entropies)
    self._f_encoder_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), encoder_entropy)
    self._f_inference_ce = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        tf.reduce_mean(inference_ce * i.valid_var))
    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), policy_entropy)

    return encoder_entropy, inference_ce, policy_entropy
def _init_opt(self):
    """Build the loss function and init the optimizer."""
    with tf.name_scope(self._name):
        # Create target policy and qf network
        with tf.name_scope('inputs'):
            obs_dim = self._env_spec.observation_space.flat_dim
            input_y = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, 1),
                                               name='input_y')
            obs = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim),
                                           name='input_observation')
            actions = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self._env_spec.action_space.flat_dim),
                name='input_action')

        policy_network_outputs = self._target_policy.build(obs,
                                                           name='policy')
        target_qf_outputs = self._target_qf.build(obs, actions, name='qf')

        self._target_policy_f_prob_online = compile_function(
            inputs=[obs], outputs=policy_network_outputs)
        self._target_qf_f_prob_online = compile_function(
            inputs=[obs, actions], outputs=target_qf_outputs)

        # Set up target init and update function
        with tf.name_scope('setup_target'):
            ops = get_target_ops(self.policy.get_global_vars(),
                                 self._target_policy.get_global_vars(),
                                 self._tau)
            policy_init_ops, policy_update_ops = ops
            qf_init_ops, qf_update_ops = get_target_ops(
                self._qf.get_global_vars(),
                self._target_qf.get_global_vars(), self._tau)
            target_init_op = policy_init_ops + qf_init_ops
            target_update_op = policy_update_ops + qf_update_ops

        f_init_target = compile_function(inputs=[],
                                         outputs=target_init_op)
        f_update_target = compile_function(inputs=[],
                                           outputs=target_update_op)

        # Set up policy training function
        next_action = self.policy.build(obs, name='policy_action')
        next_qval = self._qf.build(obs,
                                   next_action,
                                   name='policy_action_qval')
        with tf.name_scope('action_loss'):
            action_loss = -tf.reduce_mean(next_qval)
            if self._policy_weight_decay > 0.:
                regularizer = tf.keras.regularizers.l2(
                    self._policy_weight_decay)
                for var in self.policy.get_regularizable_vars():
                    policy_reg = regularizer(var)
                    action_loss += policy_reg

        with tf.name_scope('minimize_action_loss'):
            policy_optimizer = make_optimizer(
                self._policy_optimizer,
                learning_rate=self._policy_lr,
                name='PolicyOptimizer')
            policy_train_op = policy_optimizer.minimize(
                action_loss, var_list=self.policy.get_trainable_vars())

        f_train_policy = compile_function(
            inputs=[obs], outputs=[policy_train_op, action_loss])

        # Set up qf training function
        qval = self._qf.build(obs, actions, name='q_value')
        with tf.name_scope('qval_loss'):
            qval_loss = tf.reduce_mean(
                tf.compat.v1.squared_difference(input_y, qval))
            if self._qf_weight_decay > 0.:
                regularizer = tf.keras.regularizers.l2(
                    self._qf_weight_decay)
                for var in self._qf.get_regularizable_vars():
                    qf_reg = regularizer(var)
                    qval_loss += qf_reg

        with tf.name_scope('minimize_qf_loss'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr,
                                          name='QFunctionOptimizer')
            qf_train_op = qf_optimizer.minimize(
                qval_loss, var_list=self._qf.get_trainable_vars())

        f_train_qf = compile_function(
            inputs=[input_y, obs, actions],
            outputs=[qf_train_op, qval_loss, qval])

        self._f_train_policy = f_train_policy
        self._f_train_qf = f_train_qf
        self._f_init_target = f_init_target
        self._f_update_target = f_update_target
def _build_policy_loss(self, i):
    """Build policy loss and other output tensors.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss.

    Raises:
        NotImplementedError: If is_recurrent is True.

    """
    pol_dist = self._policy_network.dist
    old_pol_dist = self._old_policy_network.dist

    # Initialize dual params
    self._param_eta = 15.
    self._param_v = np.random.rand(
        self._env_spec.observation_space.flat_dim * 2 + 4)

    with tf.name_scope('bellman_error'):
        delta_v = tf.boolean_mask(i.reward_var,
                                  i.valid_var) + tf.tensordot(
                                      i.feat_diff, i.param_v, 1)

    with tf.name_scope('policy_loss'):
        ll = pol_dist.log_prob(i.action_var)
        ll = tf.boolean_mask(ll, i.valid_var)
        loss = -tf.reduce_mean(
            ll * tf.exp(delta_v / i.param_eta -
                        tf.reduce_max(delta_v / i.param_eta)))

        reg_params = self.policy.get_regularizable_vars()
        loss += self._l2_reg_loss * tf.reduce_sum(
            [tf.reduce_mean(tf.square(param))
             for param in reg_params]) / len(reg_params)

    with tf.name_scope('kl'):
        kl = old_pol_dist.kl_divergence(pol_dist)
        pol_mean_kl = tf.reduce_mean(kl)

    with tf.name_scope('dual'):
        dual_loss = i.param_eta * self._epsilon + (
            i.param_eta * tf.math.log(
                tf.reduce_mean(
                    tf.exp(delta_v / i.param_eta -
                           tf.reduce_max(delta_v / i.param_eta)))) +
            i.param_eta * tf.reduce_max(delta_v / i.param_eta))

        dual_loss += self._l2_reg_dual * (tf.square(i.param_eta) +
                                          tf.square(1 / i.param_eta))

        dual_grad = tf.gradients(dual_loss, [i.param_eta, i.param_v])

    # Dual and diagnostic functions
    self._f_dual = compile_function(
        flatten_inputs(self._dual_opt_inputs), dual_loss)
    self._f_dual_grad = compile_function(
        flatten_inputs(self._dual_opt_inputs), dual_grad)
    self._f_policy_kl = compile_function(
        flatten_inputs(self._policy_opt_inputs), pol_mean_kl)

    return loss
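# A numpy sketch of the reweighting the policy loss above implements: REPS
# weights each sample by exp(delta_v / eta), and subtracting the maximum
# exponent (exactly as in `loss` and `dual_loss`) keeps the exponentials
# from overflowing without changing the normalized weights.
import numpy as np

eta = 15.0
delta_v = np.array([1.0, -2.0, 0.5, 3.0])   # per-sample Bellman errors
z = delta_v / eta
weights = np.exp(z - z.max())               # numerically stable
weights /= weights.sum()                    # normalized sample weights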
def update_opt(self,
               loss,
               target,
               leq_constraint,
               inputs,
               constraint_name='constraint',
               name='PenaltyLBFGSOptimizer',
               **kwargs):
    """Construct operation graph for the optimizer.

    Args:
        loss (tf.Tensor): Loss objective to minimize.
        target (object): Target object to optimize. The object should
            implement `get_params()` and `get_param_values()`.
        leq_constraint (tuple): It contains a tf.Tensor and a float value.
            The tf.Tensor represents the constraint term, and the float
            value is the constraint value.
        inputs (list[tf.Tensor]): List of input placeholders.
        constraint_name (str): Constraint name for logging.
        name (str): Name scope.
        kwargs (dict): Extra unused keyword arguments. Some optimizers
            have extra input, e.g. KL constraint.

    """
    del kwargs
    params = target.get_params()
    with tf.name_scope(name):
        constraint_term, constraint_value = leq_constraint
        penalty_var = tf.compat.v1.placeholder(tf.float32,
                                               tuple(),
                                               name='penalty')
        penalized_loss = loss + penalty_var * constraint_term

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        def get_opt_output():
            """Helper function to construct graph.

            Returns:
                list[tf.Tensor]: Penalized loss and gradient tensor.

            """
            with tf.name_scope('get_opt_output'):
                grads = tf.gradients(penalized_loss, params)
                for idx, (grad, param) in enumerate(zip(grads, params)):
                    if grad is None:
                        grads[idx] = tf.zeros_like(param)
                flat_grad = flatten_tensor_variables(grads)
                return [
                    tf.cast(penalized_loss, tf.float64),
                    tf.cast(flat_grad, tf.float64),
                ]

        self._opt_fun = LazyDict(
            f_loss=lambda: compile_function(inputs, loss),
            f_constraint=lambda: compile_function(inputs, constraint_term),
            f_penalized_loss=lambda: compile_function(
                inputs=inputs + [penalty_var],
                outputs=[penalized_loss, loss, constraint_term],
            ),
            f_opt=lambda: compile_function(
                inputs=inputs + [penalty_var],
                outputs=get_opt_output(),
            ))
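# A hypothetical sketch of how a penalized objective like the one above is
# typically driven (the two callables are stand-ins, not this optimizer's
# API): minimize loss + penalty * constraint for a fixed penalty, then grow
# or shrink the penalty until the constraint value falls under its limit.
def penalty_search(optimize_fn, constraint_fn, max_constraint_val,
                   penalty=1.0, scale=2.0, max_itr=10):
    """optimize_fn(penalty) updates the params for a fixed penalty;
    constraint_fn() returns the resulting constraint value."""
    for _ in range(max_itr):
        optimize_fn(penalty)
        if constraint_fn() <= max_constraint_val:
            penalty /= scale   # constraint satisfied; try a weaker penalty
        else:
            penalty *= scale   # constraint violated; penalize harder
    return penalty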
def _init_opt(self):
    """Initialize the networks and Ops.

    Assume discrete space for dqn, so action dimension
    will always be action_space.n
    """
    action_dim = self._env_spec.action_space.n

    # build q networks
    with tf.name_scope(self._name):
        action_t_ph = tf.compat.v1.placeholder(tf.int32,
                                               None,
                                               name='action')
        reward_t_ph = tf.compat.v1.placeholder(tf.float32,
                                               None,
                                               name='reward')
        done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done')

        with tf.name_scope('update_ops'):
            target_update_op = get_target_ops(
                self._qf.get_global_vars(),
                self._target_qf.get_global_vars())

        self._qf_update_ops = compile_function(inputs=[],
                                               outputs=target_update_op)

        with tf.name_scope('td_error'):
            # Q-value of the selected action
            action = tf.one_hot(action_t_ph,
                                action_dim,
                                on_value=1.,
                                off_value=0.)
            q_selected = tf.reduce_sum(
                self._qf.q_vals * action,  # yapf: disable
                axis=1)

            # r + Q'(s', argmax_a(Q(s', _)) - Q(s, a)
            if self._double_q:
                target_qval_with_online_q = self._qf.build(
                    self._target_qf.input, self._qf.name)
                future_best_q_val_action = tf.argmax(
                    target_qval_with_online_q, 1)
                future_best_q_val = tf.reduce_sum(
                    self._target_qf.q_vals *
                    tf.one_hot(future_best_q_val_action,
                               action_dim,
                               on_value=1.,
                               off_value=0.),
                    axis=1)
            else:
                # r + max_a(Q'(s', _)) - Q(s, a)
                future_best_q_val = tf.reduce_max(self._target_qf.q_vals,
                                                  axis=1)

            q_best_masked = (1.0 - done_t_ph) * future_best_q_val
            # if done, it's just reward
            # else reward + discount * future_best_q_val
            target_q_values = (reward_t_ph +
                               self._discount * q_best_masked)

            # td_error = q_selected - tf.stop_gradient(target_q_values)
            loss = tf.compat.v1.losses.huber_loss(
                q_selected, tf.stop_gradient(target_q_values))
            loss = tf.reduce_mean(loss)

        with tf.name_scope('optimize_ops'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr)
            if self._grad_norm_clipping is not None:
                gradients = qf_optimizer.compute_gradients(
                    loss, var_list=self._qf.get_trainable_vars())
                for i, (grad, var) in enumerate(gradients):
                    if grad is not None:
                        gradients[i] = (tf.clip_by_norm(
                            grad, self._grad_norm_clipping), var)
                optimize_loss = qf_optimizer.apply_gradients(gradients)
            else:
                optimize_loss = qf_optimizer.minimize(
                    loss, var_list=self._qf.get_trainable_vars())

        self._train_qf = compile_function(
            inputs=[
                self._qf.input, action_t_ph, reward_t_ph, done_t_ph,
                self._target_qf.input
            ],
            outputs=[loss, optimize_loss])
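# A numpy sketch of the double-Q target computed above: the online network
# picks the argmax action at s', the target network evaluates that action,
# and terminal transitions keep only the reward.
import numpy as np

discount = 0.99
reward = np.array([1.0, 0.0])
done = np.array([0.0, 1.0])
online_q_next = np.array([[0.2, 0.8], [0.5, 0.1]])   # Q(s', .), online net
target_q_next = np.array([[0.3, 0.6], [0.4, 0.2]])   # Q'(s', .), target net

best_action = online_q_next.argmax(axis=1)                  # [1, 0]
future_best_q = target_q_next[np.arange(2), best_action]    # [0.6, 0.4]
target_q_values = reward + discount * (1.0 - done) * future_best_q
# -> [1.594, 0.0]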