def init_opt(self):
    """Set up the joint policy/embedding optimizer and inference optimizer.

    The policy and its embedding network are optimized jointly under a
    mean-KL constraint, while the inference distribution is fit
    separately by supervised learning.

    Returns:
        dict: Empty diagnostic dictionary (kept for API compatibility).

    Raises:
        NotImplementedError: If the policy is recurrent.
    """
    if self.policy.recurrent:
        raise NotImplementedError

    # Build the symbolic inputs once; keep the optimizer-input
    # namedtuples so sample data can be mapped onto them later.
    (loss_inputs, opt_inputs,
     inference_loss_inputs, inference_opt_inputs) = self._build_inputs()
    self._policy_opt_inputs = opt_inputs
    self._inference_opt_inputs = inference_opt_inputs

    # Jointly optimize policy and embedding network.
    loss, kl, _ = self._build_policy_loss(loss_inputs)
    self.optimizer.update_opt(
        loss=loss,
        target=self.policy,
        leq_constraint=(kl, self.max_kl_step),
        inputs=flatten_inputs(self._policy_opt_inputs),
        constraint_name="mean_kl")

    # Optimize inference distribution separately (supervised learning).
    inference_loss, _ = self._build_inference_loss(inference_loss_inputs)
    self.inference_optimizer.update_opt(
        loss=inference_loss,
        target=self.inference,
        inputs=flatten_inputs(self._inference_opt_inputs))

    return dict()
def init_opt(self):
    """Construct the optimization problems for policy and inference.

    The policy and encoder are optimized jointly under a mean-KL
    constraint; the inference network is trained separately via
    supervised learning.

    Raises:
        NotImplementedError: Raise if the policy is recurrent.
    """
    # Symbolic inputs; the *_opt_inputs namedtuples are retained so
    # that sample data can later be mapped onto the placeholders.
    (loss_inputs, opt_inputs, inference_loss_inputs,
     inference_opt_inputs) = self._build_inputs()
    self._policy_opt_inputs = opt_inputs
    self._inference_opt_inputs = inference_opt_inputs

    # Policy + encoder: constrained optimization of the joint loss.
    loss, kl, _ = self._build_policy_loss(loss_inputs)
    self._optimizer.update_opt(
        loss=loss,
        target=self.policy,
        leq_constraint=(kl, self._max_kl_step),
        inputs=flatten_inputs(self._policy_opt_inputs),
        constraint_name='mean_kl')

    # Inference network: plain supervised objective.
    inference_loss, _ = self._build_inference_loss(inference_loss_inputs)
    self.inference_optimizer.update_opt(
        loss=inference_loss,
        target=self._inference,
        inputs=flatten_inputs(self._inference_opt_inputs))
def _policy_opt_input_values(self, samples_data):
    """Update policy optimize input values based on samples data.

    Args:
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    Returns:
        list(np.ndarray): Flatten policy optimization input values.
    """
    agent_infos = samples_data['agent_infos']
    state_info_values = [
        agent_infos[key] for key in self.policy.state_info_keys
    ]

    # yapf: disable
    # pylint: disable=unexpected-keyword-arg
    filled = self._policy_opt_inputs._replace(
        obs_var=samples_data['observations'],
        action_var=samples_data['actions'],
        reward_var=samples_data['rewards'],
        valid_var=samples_data['valids'],
        feat_diff=self._feat_diff,
        param_eta=self._param_eta,
        param_v=self._param_v,
        policy_state_info_vars_list=state_info_values,
    )
    return flatten_inputs(filled)
def _build_entropy_term(self, i):
    """Build policy entropy tensor.

    Depending on configuration this is either the analytic entropy of
    the action distribution or its single-sample estimate -log pi(a|s).

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy entropy, reshaped to (N, max_path_length).
    """
    pol_dist = self.policy.distribution

    with tf.name_scope('policy_entropy'):
        if self._use_neg_logli_entropy:
            # Sampled estimate of entropy: -log pi(a|s).
            policy_entropy = -pol_dist.log_prob(i.action_var,
                                                name='policy_log_likeli')
        else:
            # Exact (analytic) entropy of the action distribution.
            policy_entropy = pol_dist.entropy()

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            policy_entropy = tf.nn.softplus(policy_entropy)

        # Optionally treat the entropy as a constant w.r.t. the policy
        # parameters so no gradient flows through the bonus term.
        if self._stop_entropy_gradient:
            policy_entropy = tf.stop_gradient(policy_entropy)

    # dense form, match the shape of advantage
    policy_entropy = tf.reshape(policy_entropy, [-1, self.max_path_length])

    # Diagnostic function for logging/debugging.
    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), policy_entropy)

    return policy_entropy
def _policy_opt_input_values(self, samples_data):
    """Map rollout samples to the policy optimizer inputs.

    Args:
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    Returns:
        list(np.ndarray): Flatten policy optimization input values.
    """
    agent_infos = samples_data['agent_infos']
    latent_infos = samples_data['latent_infos']
    policy_state_values = [
        agent_infos[key] for key in self.policy.state_info_keys
    ]
    encoder_state_values = [
        latent_infos[key] for key in self.policy.encoder.state_info_keys
    ]

    # pylint: disable=unexpected-keyword-arg
    filled = self._policy_opt_inputs._replace(
        obs_var=samples_data['observations'],
        action_var=samples_data['actions'],
        reward_var=samples_data['rewards'],
        baseline_var=samples_data['baselines'],
        trajectory_var=samples_data['trajectories'],
        task_var=samples_data['tasks'],
        latent_var=samples_data['latents'],
        valid_var=samples_data['valids'],
        policy_state_info_vars_list=policy_state_values,
        embed_state_info_vars_list=encoder_state_values,
    )
    return flatten_inputs(filled)
def _policy_opt_input_values(self, samples_data):
    """Map rollout samples to the policy optimizer inputs.

    Args:
        samples_data (dict): Processed sample data.
            See garage.tf.paths_to_tensors() for details.

    Returns:
        list(np.ndarray): Flatten policy optimization input values.
    """
    agent_infos = samples_data['agent_infos']
    state_info_values = [
        agent_infos[key] for key in self.policy.state_info_keys
    ]

    filled = self._policy_opt_inputs._replace(
        obs_var=samples_data['observations'],
        action_var=samples_data['actions'],
        reward_var=samples_data['rewards'],
        baseline_var=samples_data['baselines'],
        valid_var=samples_data['valids'],
        policy_state_info_vars_list=state_info_values,
    )
    return flatten_inputs(filled)
def _build_entropy_term(self, i):
    """Build the mean policy entropy tensor and its diagnostic function.

    Args:
        i (namedtuple): Collection of symbolic variables used to compute
            the policy loss.

    Returns:
        tf.Tensor: Mean policy entropy over valid time steps.
    """
    with tf.name_scope("policy_entropy"):
        if self.policy.recurrent:
            # Recurrent policies consume the dense (unflattened) inputs.
            policy_dist_info_flat = self.policy.dist_info_sym(
                i.obs_var,
                i.policy_state_info_vars,
                name="policy_dist_info")
        else:
            policy_dist_info_flat = self.policy.dist_info_sym(
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name="policy_dist_info_flat")

        policy_entropy_flat = self.policy.distribution.entropy_sym(
            policy_dist_info_flat)
        # Back to dense (N, max_path_length) form so the per-step valid
        # mask below lines up.
        policy_entropy = tf.reshape(policy_entropy_flat,
                                    [-1, self.max_path_length])

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            policy_entropy = tf.nn.softplus(policy_entropy)

        # NOTE(review): the mean is over all entries, including
        # zero-masked invalid steps -- confirm this weighting is intended.
        policy_entropy = tf.reduce_mean(policy_entropy * i.valid_var)

    # Diagnostic function
    self.f_policy_entropy = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        policy_entropy,
        log_name="f_policy_entropy")

    return policy_entropy
def _dual_opt_input_values(self, samples_data):
    """Update dual func optimize input values based on samples data.

    Args:
        samples_data (dict): Processed sample data.
            See garage.tf.paths_to_tensors() for details.

    Returns:
        list(np.ndarray): Flatten dual function optimization input
            values.
    """
    agent_infos = samples_data['agent_infos']
    state_info_values = [
        agent_infos[key] for key in self.policy.state_info_keys
    ]

    # yapf: disable
    # pylint: disable=unexpected-keyword-arg
    filled = self._dual_opt_inputs._replace(
        reward_var=samples_data['rewards'],
        valid_var=samples_data['valids'],
        feat_diff=self._feat_diff,
        param_eta=self._param_eta,
        param_v=self._param_v,
        policy_state_info_vars_list=state_info_values,
    )
    return flatten_inputs(filled)
def _build_embedding_kl(self, i):
    """Build the mean KL divergence between old and new embeddings.

    Args:
        i (namedtuple): Collection of symbolic variables used to compute
            the policy loss.

    Returns:
        tf.Tensor: Mean KL divergence of the embedding distribution.
    """
    dist = self.policy._embedding._dist
    with tf.name_scope("embedding_kl"):
        # new distribution
        embed_dist_info_flat = self.policy._embedding.dist_info_sym(
            i.flat.task_var,
            i.flat.embed_state_info_vars,
            name="embed_dist_info_flat")
        # Keep only time steps marked valid before averaging.
        embed_dist_info_valid = filter_valids_dict(
            embed_dist_info_flat,
            i.flat.valid_var,
            name="embed_dist_info_valid")

        # calculate KL divergence (old || new)
        kl = dist.kl_sym(i.valid.embed_old_dist_info_vars,
                         embed_dist_info_valid)
        mean_kl = tf.reduce_mean(kl)

        # Diagnostic function
        self.f_embedding_kl = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            mean_kl,
            log_name="f_embedding_kl")

        return mean_kl
def _policy_opt_input_values(self, samples_data):
    """Map rollout samples to the policy optimizer inputs."""
    agent_infos = samples_data["agent_infos"]
    latent_infos = samples_data["latent_infos"]

    policy_state_values = [
        agent_infos[key] for key in self.policy.state_info_keys
    ]
    policy_old_dist_values = [
        agent_infos[key] for key in self.policy._dist.dist_info_keys
    ]
    embed_state_values = [
        latent_infos[key]
        for key in self.policy.embedding.state_info_keys
    ]
    embed_old_dist_values = [
        latent_infos[key]
        for key in self.policy.embedding._dist.dist_info_keys
    ]

    filled = self._policy_opt_inputs._replace(
        obs_var=samples_data["observations"],
        action_var=samples_data["actions"],
        reward_var=samples_data["rewards"],
        baseline_var=samples_data["baselines"],
        trajectory_var=samples_data["trajectories"],
        task_var=samples_data["tasks"],
        latent_var=samples_data["latents"],
        valid_var=samples_data["valids"],
        policy_state_info_vars_list=policy_state_values,
        policy_old_dist_info_vars_list=policy_old_dist_values,
        embed_state_info_vars_list=embed_state_values,
        embed_old_dist_info_vars_list=embed_old_dist_values,
    )
    return flatten_inputs(filled)
def _policy_opt_input_values(self, samples_data):
    """Update policy optimize input values based on samples data."""
    agent_infos = samples_data['agent_infos']
    state_info_values = [
        agent_infos[key] for key in self.policy.state_info_keys
    ]
    old_dist_info_values = [
        agent_infos[key]
        for key in self.policy.distribution.dist_info_keys
    ]

    # pylint: disable=locally-disabled, unexpected-keyword-arg
    filled = self._policy_opt_inputs._replace(
        obs_var=samples_data['observations'],
        action_var=samples_data['actions'],
        reward_var=samples_data['rewards'],
        valid_var=samples_data['valids'],
        feat_diff=self.feat_diff,
        param_eta=self.param_eta,
        param_v=self.param_v,
        policy_state_info_vars_list=state_info_values,
        policy_old_dist_info_vars_list=old_dist_info_values,
    )
    return flatten_inputs(filled)
def _build_entropy_term(self, i):
    """Build the policy entropy tensor used for entropy regularization.

    When ``self._use_neg_logli_entropy`` is set, the entropy is estimated
    by the sampled negative log-likelihood -log pi(a|s); otherwise the
    analytic entropy of the action distribution is used.

    Args:
        i (namedtuple): Collection of symbolic variables used to compute
            the policy loss.

    Returns:
        tf.Tensor: Policy entropy.
    """
    with tf.name_scope("policy_entropy"):
        if self.policy.recurrent:
            policy_dist_info = self.policy.dist_info_sym(
                i.obs_var,
                i.policy_state_info_vars,
                name="policy_dist_info")

            # Fix: negate the log-likelihood so this really is
            # -log pi(a|s); previously the positive log-likelihood was
            # used, flipping the sign of the entropy estimate.
            policy_neg_log_likeli = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                i.action_var, policy_dist_info, name="policy_log_likeli")

            if self._use_neg_logli_entropy:
                policy_entropy = policy_neg_log_likeli
            else:
                policy_entropy = self.policy.distribution.entropy_sym(
                    policy_dist_info)
        else:
            policy_dist_info_flat = self.policy.dist_info_sym(
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name="policy_dist_info_flat_entropy")

            policy_dist_info_valid = filter_valids_dict(
                policy_dist_info_flat,
                i.flat.valid_var,
                name="policy_dist_info_valid")

            # Fix: same sign correction as the recurrent branch above.
            policy_neg_log_likeli_valid = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                i.valid.action_var,
                policy_dist_info_valid,
                name="policy_log_likeli")

            if self._use_neg_logli_entropy:
                policy_entropy = policy_neg_log_likeli_valid
            else:
                policy_entropy = self.policy.distribution.entropy_sym(
                    policy_dist_info_valid)

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            policy_entropy = tf.nn.softplus(policy_entropy)

        # Optionally keep the bonus constant w.r.t. policy parameters.
        if self._stop_entropy_gradient:
            policy_entropy = tf.stop_gradient(policy_entropy)

    # Diagnostic function
    self.f_policy_entropy = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        policy_entropy,
        log_name="f_policy_entropy")

    return policy_entropy
def init_opt(self):
    """Initialize the optimization procedure."""
    loss_inputs, opt_inputs, dual_opt_inputs = self._build_inputs()

    # Keep the optimizer-input namedtuples so sample data can be mapped
    # onto them when building feed values.
    self._policy_opt_inputs = opt_inputs
    self._dual_opt_inputs = dual_opt_inputs

    loss = self._build_policy_loss(loss_inputs)
    self.optimizer.update_opt(
        loss=loss,
        target=self.policy,
        inputs=flatten_inputs(self._policy_opt_inputs))
def init_opt(self):
    """Initialize the optimizer with a KL-constrained policy loss."""
    loss_inputs, opt_inputs = self._build_inputs()
    self._policy_opt_inputs = opt_inputs

    loss, kl = self._build_policy_loss(loss_inputs)
    self._optimizer.update_opt(
        loss=loss,
        target=self.policy,
        leq_constraint=(kl, self._max_kl_step),
        inputs=flatten_inputs(self._policy_opt_inputs),
        constraint_name='mean_kl')
def init_opt(self):
    """Register the KL-constrained policy loss with the optimizer.

    Returns:
        dict: Empty diagnostic dictionary (kept for API compatibility).
    """
    loss_inputs, opt_inputs = self._build_inputs()
    self._policy_opt_inputs = opt_inputs

    loss, kl = self._build_policy_loss(loss_inputs)
    self.optimizer.update_opt(
        loss=loss,
        target=self.policy,
        leq_constraint=(kl, self.max_kl_step),
        inputs=flatten_inputs(self._policy_opt_inputs),
        constraint_name="mean_kl")

    return dict()
def _inference_opt_input_values(self, samples_data):
    """Map rollout samples to the inference optimizer inputs."""
    trajectory_infos = samples_data["trajectory_infos"]
    state_info_values = [
        trajectory_infos[key] for key in self.inference.state_info_keys
    ]
    old_dist_info_values = [
        trajectory_infos[key]
        for key in self.inference._dist.dist_info_keys
    ]

    filled = self._inference_opt_inputs._replace(
        latent_var=samples_data["latents"],
        trajectory_var=samples_data["trajectories"],
        valid_var=samples_data["valids"],
        infer_state_info_vars_list=state_info_values,
        infer_old_dist_info_vars_list=old_dist_info_values,
    )
    return flatten_inputs(filled)
def _policy_opt_input_values(self, samples_data):
    """Map rollout samples to the policy optimizer inputs."""
    agent_infos = samples_data["agent_infos"]
    state_info_values = [
        agent_infos[key] for key in self.policy.state_info_keys
    ]
    old_dist_info_values = [
        agent_infos[key]
        for key in self.policy.distribution.dist_info_keys
    ]

    filled = self._policy_opt_inputs._replace(
        obs_var=samples_data["observations"],
        action_var=samples_data["actions"],
        reward_var=samples_data["rewards"],
        baseline_var=samples_data["baselines"],
        valid_var=samples_data["valids"],
        policy_state_info_vars_list=state_info_values,
        policy_old_dist_info_vars_list=old_dist_info_values,
    )
    return flatten_inputs(filled)
def _build_encoder_kl(self):
    """Build graph for encoder KL divergence.

    Returns:
        tf.Tensor: Mean KL divergence between the previous and current
            encoder distributions.
    """
    new_dist = self._encoder_network.dist
    prev_dist = self._old_encoder_network.dist

    with tf.name_scope('encoder_kl'):
        divergence = prev_dist.kl_divergence(new_dist)
        mean_divergence = tf.reduce_mean(divergence)

        # Diagnostic function
        self._f_encoder_kl = compile_function(
            flatten_inputs(self._policy_opt_inputs),
            mean_divergence,
            log_name='f_encoder_kl')

        return mean_divergence
def _dual_opt_input_values(self, samples_data):
    """Update dual func optimize input values based on samples data."""
    agent_infos = samples_data['agent_infos']
    state_info_values = [
        agent_infos[key] for key in self.policy.state_info_keys
    ]
    old_dist_info_values = [
        agent_infos[key]
        for key in self.policy.distribution.dist_info_keys
    ]

    filled = self._dual_opt_inputs._replace(
        reward_var=samples_data['rewards'],
        valid_var=samples_data['valids'],
        feat_diff=self.feat_diff,
        param_eta=self.param_eta,
        param_v=self.param_v,
        policy_state_info_vars_list=state_info_values,
        policy_old_dist_info_vars_list=old_dist_info_values,
    )
    return flatten_inputs(filled)
def _policy_opt_input_values(self, samples_data):
    """Update policy optimize input values based on samples data."""
    agent_infos = samples_data["agent_infos"]
    state_info_values = [
        agent_infos[key] for key in self.policy.state_info_keys
    ]
    old_dist_info_values = [
        agent_infos[key]
        for key in self.policy.distribution.dist_info_keys
    ]

    # yapf: disable
    filled = self._policy_opt_inputs._replace(
        obs_var=samples_data["observations"],
        action_var=samples_data["actions"],
        reward_var=samples_data["rewards"],
        valid_var=samples_data["valids"],
        feat_diff=self.feat_diff,
        param_eta=self.param_eta,
        param_v=self.param_v,
        policy_state_info_vars_list=state_info_values,
        policy_old_dist_info_vars_list=old_dist_info_values,
    )
    return flatten_inputs(filled)
def _inference_opt_input_values(self, samples_data):
    """Map rollout samples to the inference optimizer inputs.

    Args:
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    Returns:
        list(np.ndarray): Flatten inference optimization input values.
    """
    trajectory_infos = samples_data['trajectory_infos']
    state_info_values = [
        trajectory_infos[key]
        for key in self._inference.state_info_keys
    ]

    # pylint: disable=unexpected-keyword-arg
    filled = self._inference_opt_inputs._replace(
        latent_var=samples_data['latents'],
        trajectory_var=samples_data['trajectories'],
        valid_var=samples_data['valids'],
        infer_state_info_vars_list=state_info_values,
    )
    return flatten_inputs(filled)
def _build_entropy_term(self, i):
    """Build the policy entropy tensor.

    The entropy is either the analytic entropy of the action
    distribution or its sampled estimate -log pi(a|s), over either all
    flat time steps (maximum-entropy mode, dense (N, T) shape) or only
    the valid steps.

    Args:
        i (namedtuple): Collection of symbolic variables used to compute
            the policy loss.

    Returns:
        tf.Tensor: Policy entropy.
    """
    with tf.name_scope('policy_entropy'):
        if self.policy.recurrent:
            policy_dist_info = self.policy.dist_info_sym(
                i.obs_var,
                i.policy_state_info_vars,
                name='policy_dist_info_2')

            # Sampled entropy estimate: -log pi(a|s).
            policy_neg_log_likeli = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                i.action_var, policy_dist_info, name='policy_log_likeli')

            if self._use_neg_logli_entropy:
                policy_entropy = policy_neg_log_likeli
            else:
                policy_entropy = self.policy.distribution.entropy_sym(
                    policy_dist_info)
        else:
            policy_dist_info_flat = self.policy.dist_info_sym(
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name='policy_dist_info_flat_2')

            policy_neg_log_likeli_flat = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                i.flat.action_var,
                policy_dist_info_flat,
                name='policy_log_likeli_flat')

            policy_dist_info_valid = filter_valids_dict(
                policy_dist_info_flat,
                i.flat.valid_var,
                name='policy_dist_info_valid_2')

            policy_neg_log_likeli_valid = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                i.valid.action_var,
                policy_dist_info_valid,
                name='policy_log_likeli_valid')

            if self._use_neg_logli_entropy:
                if self._maximum_entropy:
                    # Maximum-entropy mode keeps every step, reshaped to
                    # dense (N, max_path_length) to match the rewards.
                    policy_entropy = tf.reshape(policy_neg_log_likeli_flat,
                                                [-1, self.max_path_length])
                else:
                    policy_entropy = policy_neg_log_likeli_valid
            else:
                if self._maximum_entropy:
                    policy_entropy_flat = self.policy.distribution.entropy_sym(  # noqa: E501
                        policy_dist_info_flat)
                    policy_entropy = tf.reshape(policy_entropy_flat,
                                                [-1, self.max_path_length])
                else:
                    policy_entropy_valid = self.policy.distribution.entropy_sym(  # noqa: E501
                        policy_dist_info_valid)
                    policy_entropy = policy_entropy_valid

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            policy_entropy = tf.nn.softplus(policy_entropy)

        # Optionally keep the bonus constant w.r.t. policy parameters.
        if self._stop_entropy_gradient:
            policy_entropy = tf.stop_gradient(policy_entropy)

    # Diagnostic function
    self.f_policy_entropy = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        policy_entropy,
        log_name='f_policy_entropy')

    return policy_entropy
def _build_entropy_terms(self, i):
    """Build entropy terms for the embedding, inference, and policy.

    Args:
        i (namedtuple): Collection of symbolic variables used to compute
            the policy loss.

    Returns:
        tf.Tensor: Mean embedding entropy.
        tf.Tensor: Inference cross-entropy, shape (N, max_path_length).
        tf.Tensor: Policy entropy, shape (N, max_path_length).
    """
    with tf.name_scope("entropy_terms"):
        # 1. Embedding distribution total entropy
        with tf.name_scope('embedding_entropy'):
            all_task_entropies = self.policy.embedding.entropy_sym(
                i.flat.task_var)

            if self._use_softplus_entropy:
                all_task_entropies = tf.nn.softplus(all_task_entropies)

            embedding_entropy = tf.reduce_mean(all_task_entropies,
                                               name="embedding_entropy")

        # 2. Inference distribution cross-entropy (negative log-likelihood)
        with tf.name_scope('inference_ce'):
            traj_ll_flat = self.inference.log_likelihood_sym(
                i.flat.trajectory_var,
                self.policy._embedding.latent_sym(i.flat.task_var),
                name="traj_ll_flat")
            traj_ll = tf.reshape(traj_ll_flat, [-1, self.max_path_length],
                                 name="traj_ll")

            inference_ce_raw = -traj_ll
            # Clip to stabilize training against outlier likelihoods.
            inference_ce = tf.clip_by_value(inference_ce_raw, -3, 3)

            if self._use_softplus_entropy:
                inference_ce = tf.nn.softplus(inference_ce)

            # Fix: the stop_gradient result was previously assigned to a
            # dead local (`inference`), so the flag had no effect. (The
            # attribute name `_stop_ce_graident` is a pre-existing typo
            # kept to match its definition site.)
            if self._stop_ce_graident:
                inference_ce = tf.stop_gradient(inference_ce)

        # 3. Policy path entropies
        with tf.name_scope('policy_entropy'):
            policy_entropy_flat = self.policy.entropy_sym(
                i.flat.task_var,
                i.flat.obs_var,
                name="policy_entropy_flat")
            policy_entropy = tf.reshape(policy_entropy_flat,
                                        [-1, self.max_path_length],
                                        name="policy_entropy")

            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

    # Diagnostic functions
    self.f_task_entropies = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        all_task_entropies,
        log_name="f_task_entropies")
    self.f_embedding_entropy = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        embedding_entropy,
        log_name="f_embedding_entropy")
    self.f_inference_ce = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        tf.reduce_mean(inference_ce * i.valid_var),
        log_name="f_inference_ce")
    self.f_policy_entropy = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        tf.reduce_mean(policy_entropy * i.valid_var),
        log_name="f_policy_entropy")

    return embedding_entropy, inference_ce, policy_entropy
def _build_policy_loss(self, i):
    """Build the joint policy/embedding loss and diagnostic functions.

    Path rewards are augmented with an inference cross-entropy penalty
    and a policy entropy bonus before advantages are computed.

    Args:
        i (namedtuple): Collection of symbolic variables used to compute
            the policy loss.

    Returns:
        tf.Tensor: Surrogate loss to minimize.
        tf.Tensor: Mean policy KL divergence.
        tf.Tensor: Mean embedding KL divergence.
    """
    pol_dist = self.policy._dist

    # Entropy terms
    embedding_entropy, inference_ce, policy_entropy = \
        self._build_entropy_terms(i)

    # Augment the path rewards with entropy terms
    with tf.name_scope("augmented_rewards"):
        rewards = i.reward_var \
            - (self.inference_ce_coeff * inference_ce) \
            + (self.policy_ent_coeff * policy_entropy)

    with tf.name_scope("policy_loss"):
        with tf.name_scope("advantages"):
            advantages = compute_advantages(self.discount,
                                            self.gae_lambda,
                                            self.max_path_length,
                                            i.baseline_var,
                                            rewards,
                                            name="advantages")

            # Flatten and filter valids
            adv_flat = flatten_batch(advantages, name="adv_flat")
            adv_valid = filter_valids(adv_flat,
                                      i.flat.valid_var,
                                      name="adv_valid")

        policy_dist_info_flat = self.policy.dist_info_sym(
            i.flat.task_var,
            i.flat.obs_var,
            i.flat.policy_state_info_vars,
            name="policy_dist_info_flat")
        policy_dist_info_valid = filter_valids_dict(
            policy_dist_info_flat,
            i.flat.valid_var,
            name="policy_dist_info_valid")

        # Optionally normalize advantages
        eps = tf.constant(1e-8, dtype=tf.float32)
        if self.center_adv:
            with tf.name_scope("center_adv"):
                mean, var = tf.nn.moments(adv_valid, axes=[0])
                adv_valid = tf.nn.batch_normalization(
                    adv_valid, mean, var, 0, 1, eps)
        if self.positive_adv:
            with tf.name_scope("positive_adv"):
                m = tf.reduce_min(adv_valid)
                adv_valid = (adv_valid - m) + eps

        # Calculate loss function and KL divergence
        with tf.name_scope("kl"):
            kl = pol_dist.kl_sym(
                i.valid.policy_old_dist_info_vars,
                policy_dist_info_valid,
            )
            pol_mean_kl = tf.reduce_mean(kl)

        # Calculate surrogate loss
        with tf.name_scope("surr_loss"):
            lr = pol_dist.likelihood_ratio_sym(
                i.valid.action_var,
                i.valid.policy_old_dist_info_vars,
                policy_dist_info_valid,
                name="lr")

            # Policy gradient surrogate objective
            surr_vanilla = lr * adv_valid

            if self._pg_loss == PGLoss.VANILLA:
                # VPG, TRPO use the standard surrogate objective
                surr_obj = tf.identity(surr_vanilla, name="surr_obj")
            elif self._pg_loss == PGLoss.CLIP:
                # PPO uses a surrogate objective with clipped LR
                lr_clip = tf.clip_by_value(lr,
                                           1 - self.lr_clip_range,
                                           1 + self.lr_clip_range,
                                           name="lr_clip")
                surr_clip = lr_clip * adv_valid
                surr_obj = tf.minimum(surr_vanilla, surr_clip,
                                      name="surr_obj")
            else:
                raise NotImplementedError("Unknown PGLoss")

            # Maximize E[surrogate objective] by minimizing
            # -E_t[surrogate objective]
            surr_loss = -tf.reduce_mean(surr_obj)

            # Embedding entropy bonus
            surr_loss -= self.embedding_ent_coeff * embedding_entropy

        embed_mean_kl = self._build_embedding_kl(i)

    # Diagnostic functions
    self.f_policy_kl = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        pol_mean_kl,
        log_name="f_policy_kl")

    self.f_rewards = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        rewards,
        log_name="f_rewards")

    returns = discounted_returns(self.discount,
                                 self.max_path_length,
                                 rewards,
                                 name="returns")
    self.f_returns = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        returns,
        log_name="f_returns")

    return surr_loss, pol_mean_kl, embed_mean_kl
def _build_policy_loss(self, i): """Build policy loss and other output tensors. Args: i (namedtuple): Collection of variables to compute policy loss. Returns: tf.Tensor: Policy loss. tf.Tensor: Mean policy KL divergence. """ # pylint: disable=too-many-statements self._policy_network, self._encoder_network = (self.policy.build( i.augmented_obs_var, i.task_var, name='loss_policy')) self._old_policy_network, self._old_encoder_network = ( self._old_policy.build(i.augmented_obs_var, i.task_var, name='loss_old_policy')) self._infer_network = self._inference.build(i.augmented_traj_var, name='loss_infer') self._old_infer_network = self._old_inference.build( i.augmented_traj_var, name='loss_old_infer') pol_dist = self._policy_network.dist old_pol_dist = self._old_policy_network.dist # Entropy terms encoder_entropy, inference_ce, policy_entropy = ( self._build_entropy_terms(i)) # Augment the path rewards with entropy terms with tf.name_scope('augmented_rewards'): rewards = (i.reward_var - (self.inference_ce_coeff * inference_ce) + (self._policy_ent_coeff * policy_entropy)) with tf.name_scope('policy_loss'): with tf.name_scope('advantages'): adv = compute_advantages(self._discount, self._gae_lambda, self.max_path_length, i.baseline_var, rewards, name='advantages') adv = tf.reshape(adv, [-1, self.max_path_length]) # Optionally normalize advantages eps = tf.constant(1e-8, dtype=tf.float32) if self._center_adv: adv = center_advs(adv, axes=[0], eps=eps) if self._positive_adv: adv = positive_advs(adv, eps) # Calculate loss function and KL divergence with tf.name_scope('kl'): kl = old_pol_dist.kl_divergence(pol_dist) pol_mean_kl = tf.reduce_mean(kl) ll = pol_dist.log_prob(i.action_var, name='log_likelihood') # Calculate surrogate loss with tf.name_scope('surr_loss'): old_ll = old_pol_dist.log_prob(i.action_var) old_ll = tf.stop_gradient(old_ll) # Clip early to avoid overflow lr = tf.exp( tf.minimum(ll - old_ll, np.log(1 + self._lr_clip_range))) surrogate = lr * adv surrogate = 
tf.debugging.check_numerics(surrogate, message='surrogate') # Finalize objective function with tf.name_scope('loss'): lr_clip = tf.clip_by_value(lr, 1 - self._lr_clip_range, 1 + self._lr_clip_range, name='lr_clip') surr_clip = lr_clip * adv obj = tf.minimum(surrogate, surr_clip, name='surr_obj') obj = tf.boolean_mask(obj, i.valid_var) # Maximize E[surrogate objective] by minimizing # -E_t[surrogate objective] loss = -tf.reduce_mean(obj) # Encoder entropy bonus loss -= self.encoder_ent_coeff * encoder_entropy encoder_mean_kl = self._build_encoder_kl() # Diagnostic functions self._f_policy_kl = tf.compat.v1.get_default_session( ).make_callable(pol_mean_kl, feed_list=flatten_inputs(self._policy_opt_inputs)) self._f_rewards = tf.compat.v1.get_default_session().make_callable( rewards, feed_list=flatten_inputs(self._policy_opt_inputs)) returns = discounted_returns(self._discount, self.max_path_length, rewards, name='returns') self._f_returns = tf.compat.v1.get_default_session().make_callable( returns, feed_list=flatten_inputs(self._policy_opt_inputs)) return loss, pol_mean_kl, encoder_mean_kl
def _build_entropy_terms(self, i):
    """Build entropy terms for the encoder, inference, and policy.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Mean encoder entropy (gradient-stopped).
        tf.Tensor: Inference cross-entropy.
        tf.Tensor: Policy entropy (gradient-stopped).
    """
    pol_dist = self._policy_network.dist
    infer_dist = self._infer_network.dist
    enc_dist = self._encoder_network.dist
    with tf.name_scope('entropy_terms'):
        # 1. Encoder distribution total entropy
        with tf.name_scope('encoder_entropy'):
            encoder_dist, _, _ = self.policy.encoder.build(
                i.task_var, name='encoder_entropy').outputs
            encoder_all_task_entropies = -encoder_dist.log_prob(
                i.latent_var)

            # Fix: previously `encoder_entropy` was only assigned inside
            # the softplus branch, so disabling
            # `self._use_softplus_entropy` raised NameError below.
            if self._use_softplus_entropy:
                encoder_entropy = tf.nn.softplus(
                    encoder_all_task_entropies)
            else:
                encoder_entropy = encoder_all_task_entropies

            encoder_entropy = tf.reduce_mean(encoder_entropy,
                                             name='encoder_entropy')
            encoder_entropy = tf.stop_gradient(encoder_entropy)

        # 2. Inference distribution cross-entropy (log-likelihood)
        with tf.name_scope('inference_ce'):
            # Build inference with trajectory windows
            traj_ll = infer_dist.log_prob(enc_dist.sample(),
                                          name='traj_ll')

            inference_ce_raw = -traj_ll
            # Clip to stabilize training against outlier likelihoods.
            inference_ce = tf.clip_by_value(inference_ce_raw, -3, 3)

            if self._use_softplus_entropy:
                inference_ce = tf.nn.softplus(inference_ce)

            if self._stop_ce_gradient:
                inference_ce = tf.stop_gradient(inference_ce)

        # 3. Policy path entropies
        with tf.name_scope('policy_entropy'):
            policy_entropy = -pol_dist.log_prob(i.action_var,
                                                name='policy_log_likeli')

            # This prevents entropy from becoming negative
            # for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            policy_entropy = tf.stop_gradient(policy_entropy)

    # Diagnostic functions
    self._f_task_entropies = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        encoder_all_task_entropies,
        log_name='f_task_entropies')
    self._f_encoder_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        encoder_entropy,
        log_name='f_encoder_entropy')
    self._f_inference_ce = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        tf.reduce_mean(inference_ce * i.valid_var),
        log_name='f_inference_ce')
    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        policy_entropy,
        log_name='f_policy_entropy')

    return encoder_entropy, inference_ce, policy_entropy
def _build_policy_loss(self, i):
    """Build the policy loss, its mean KL, and diagnostic functions.

    Handles both recurrent policies (dense (N, T) tensors masked by
    `valid_var`) and feed-forward policies (flattened/valid-filtered
    tensors).

    Args:
        i (namedtuple): Collection of symbolic variables used to compute
            the policy loss.

    Returns:
        tf.Tensor: Policy loss to minimize.
        tf.Tensor: Mean KL divergence between old and new policies.
    """
    pol_dist = self.policy.distribution
    policy_entropy = self._build_entropy_term(i)

    with tf.name_scope("augmented_rewards"):
        # Entropy-regularized rewards.
        rewards = i.reward_var + (self.policy_ent_coeff * policy_entropy)

    with tf.name_scope("policy_loss"):
        advantages = compute_advantages(self.discount,
                                        self.gae_lambda,
                                        self.max_path_length,
                                        i.baseline_var,
                                        rewards,
                                        name="advantages")

        adv_flat = flatten_batch(advantages, name="adv_flat")
        adv_valid = filter_valids(adv_flat,
                                  i.flat.valid_var,
                                  name="adv_valid")

        if self.policy.recurrent:
            # Recurrent path keeps the dense (N, T) advantage shape.
            advantages = tf.reshape(advantages,
                                    [-1, self.max_path_length])

        # Optionally normalize advantages
        eps = tf.constant(1e-8, dtype=tf.float32)
        if self.center_adv:
            with tf.name_scope("center_adv"):
                mean, var = tf.nn.moments(adv_valid, axes=[0])
                adv_valid = tf.nn.batch_normalization(
                    adv_valid, mean, var, 0, 1, eps)
        if self.positive_adv:
            with tf.name_scope("positive_adv"):
                m = tf.reduce_min(adv_valid)
                adv_valid = (adv_valid - m) + eps

        if self.policy.recurrent:
            policy_dist_info = self.policy.dist_info_sym(
                i.obs_var,
                i.policy_state_info_vars,
                name="policy_dist_info")
        else:
            policy_dist_info_flat = self.policy.dist_info_sym(
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name="policy_dist_info_flat")

            policy_dist_info_valid = filter_valids_dict(
                policy_dist_info_flat,
                i.flat.valid_var,
                name="policy_dist_info_valid")

        # Calculate loss function and KL divergence
        with tf.name_scope("kl"):
            if self.policy.recurrent:
                kl = pol_dist.kl_sym(
                    i.policy_old_dist_info_vars,
                    policy_dist_info,
                )
                # Valid-step-weighted mean KL.
                pol_mean_kl = tf.reduce_sum(
                    kl * i.valid_var) / tf.reduce_sum(i.valid_var)
            else:
                kl = pol_dist.kl_sym(
                    i.valid.policy_old_dist_info_vars,
                    policy_dist_info_valid,
                )
                pol_mean_kl = tf.reduce_mean(kl)

        # Calculate vanilla loss
        with tf.name_scope("vanilla_loss"):
            if self.policy.recurrent:
                ll = pol_dist.log_likelihood_sym(i.action_var,
                                                 policy_dist_info,
                                                 name="log_likelihood")
                vanilla = ll * advantages * i.valid_var
            else:
                ll = pol_dist.log_likelihood_sym(i.valid.action_var,
                                                 policy_dist_info_valid,
                                                 name="log_likelihood")
                vanilla = ll * adv_valid

        # Calculate surrogate loss
        with tf.name_scope("surrogate_loss"):
            if self.policy.recurrent:
                lr = pol_dist.likelihood_ratio_sym(
                    i.action_var,
                    i.policy_old_dist_info_vars,
                    policy_dist_info,
                    name="lr")
                surrogate = lr * advantages * i.valid_var
            else:
                lr = pol_dist.likelihood_ratio_sym(
                    i.valid.action_var,
                    i.valid.policy_old_dist_info_vars,
                    policy_dist_info_valid,
                    name="lr")
                surrogate = lr * adv_valid

        # Finalize objective function
        with tf.name_scope("loss"):
            if self._pg_loss == PGLoss.VANILLA:
                # VPG uses the vanilla objective
                obj = tf.identity(vanilla, name="vanilla_obj")
            elif self._pg_loss == PGLoss.SURROGATE:
                # TRPO uses the standard surrogate objective
                obj = tf.identity(surrogate, name="surr_obj")
            elif self._pg_loss == PGLoss.SURROGATE_CLIP:
                # PPO-style clipped surrogate objective.
                lr_clip = tf.clip_by_value(lr,
                                           1 - self.lr_clip_range,
                                           1 + self.lr_clip_range,
                                           name="lr_clip")
                if self.policy.recurrent:
                    surr_clip = lr_clip * advantages * i.valid_var
                else:
                    surr_clip = lr_clip * adv_valid
                obj = tf.minimum(surrogate, surr_clip, name="surr_obj")
            else:
                raise NotImplementedError("Unknown PGLoss")

            # Maximize E[surrogate objective] by minimizing
            # -E_t[surrogate objective]
            if self.policy.recurrent:
                loss = -tf.reduce_sum(obj) / tf.reduce_sum(i.valid_var)
            else:
                loss = -tf.reduce_mean(obj)

        # Diagnostic functions
        self.f_policy_kl = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            pol_mean_kl,
            log_name="f_policy_kl")

        self.f_rewards = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            rewards,
            log_name="f_rewards")

        returns = discounted_returns(self.discount, self.max_path_length,
                                     rewards)
        self.f_returns = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            returns,
            log_name="f_returns")

    return loss, pol_mean_kl
def _build_policy_loss(self, i):
    """Build policy loss and other output tensors.

    Builds the advantage-weighted objective selected by
    ``self._pg_loss`` ('vanilla', 'surrogate', or 'surrogate_clip'),
    masks out invalid (padding) timesteps, and compiles diagnostic
    callables (``_f_policy_kl``, ``_f_rewards``, ``_f_returns``) on the
    default session as a side effect.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss.
        tf.Tensor: Mean policy KL divergence.

    """
    policy_entropy = self._build_entropy_term(i)
    rewards = i.reward_var

    # Max-entropy mode: fold the entropy bonus directly into rewards.
    if self._maximum_entropy:
        with tf.name_scope('augmented_rewards'):
            rewards = i.reward_var + (self._policy_ent_coeff *
                                      policy_entropy)

    with tf.name_scope('policy_loss'):
        adv = compute_advantages(self._discount,
                                 self._gae_lambda,
                                 self.max_path_length,
                                 i.baseline_var,
                                 rewards,
                                 name='adv')

        # Restore the [episode, timestep] layout for per-step masking.
        adv = tf.reshape(adv, [-1, self.max_path_length])
        # Optionally normalize advantages
        eps = tf.constant(1e-8, dtype=tf.float32)
        if self._center_adv:
            adv = center_advs(adv, axes=[0], eps=eps)

        if self._positive_adv:
            adv = positive_advs(adv, eps)

        with tf.name_scope('kl'):
            kl = self._old_policy.distribution.kl_divergence(
                self.policy.distribution)
            pol_mean_kl = tf.reduce_mean(kl)

        # Calculate vanilla loss
        with tf.name_scope('vanilla_loss'):
            ll = self.policy.distribution.log_prob(i.action_var,
                                                   name='log_likelihood')
            vanilla = ll * adv

        # Calculate surrogate loss
        with tf.name_scope('surrogate_loss'):
            # Likelihood ratio new/old computed in log space for
            # numerical stability.
            lr = tf.exp(
                ll - self._old_policy.distribution.log_prob(i.action_var))
            surrogate = lr * adv

        # Finalize objective function
        with tf.name_scope('loss'):
            if self._pg_loss == 'vanilla':
                # VPG uses the vanilla objective
                obj = tf.identity(vanilla, name='vanilla_obj')
            elif self._pg_loss == 'surrogate':
                # TRPO uses the standard surrogate objective
                obj = tf.identity(surrogate, name='surr_obj')
            elif self._pg_loss == 'surrogate_clip':
                # PPO-style clipping: pessimistic min of clipped and
                # unclipped surrogate.
                lr_clip = tf.clip_by_value(lr,
                                           1 - self._lr_clip_range,
                                           1 + self._lr_clip_range,
                                           name='lr_clip')
                surr_clip = lr_clip * adv
                obj = tf.minimum(surrogate, surr_clip, name='surr_obj')

            # NOTE(review): attribute name carries a spelling error
            # ("regularzied"); it must match where the flag is set in
            # __init__ — confirm before renaming.
            if self._entropy_regularzied:
                obj += self._policy_ent_coeff * policy_entropy

            # filter only the valid values
            obj = tf.boolean_mask(obj, i.valid_var)
            # Maximize E[surrogate objective] by minimizing
            # -E_t[surrogate objective]
            loss = -tf.reduce_mean(obj)

        # Diagnostic functions, bound to the current default session.
        self._f_policy_kl = tf.compat.v1.get_default_session(
        ).make_callable(pol_mean_kl,
                        feed_list=flatten_inputs(self._policy_opt_inputs))

        self._f_rewards = tf.compat.v1.get_default_session().make_callable(
            rewards, feed_list=flatten_inputs(self._policy_opt_inputs))

        returns = discounted_returns(self._discount, self.max_path_length,
                                     rewards)
        self._f_returns = tf.compat.v1.get_default_session().make_callable(
            returns, feed_list=flatten_inputs(self._policy_opt_inputs))

        return loss, pol_mean_kl
def _build_policy_loss(self, i):
    """Build policy loss and other output tensors.

    Builds the REPS-style weighted-likelihood policy loss and the dual
    objective for the temperature eta and value parameters v. Compiles
    ``_f_dual``, ``_f_dual_grad`` and ``_f_policy_kl`` as side effects,
    and (re)initializes the dual parameters ``_param_eta`` and
    ``_param_v``.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss. (The mean policy KL divergence is not
            returned; it is exposed through the compiled
            ``self._f_policy_kl`` diagnostic instead.)

    """
    pol_dist = self.policy.distribution

    # Initialize dual params
    self._param_eta = 15.
    # NOTE(review): the feature vector length appears to be
    # [obs, obs^2, extras] per the "flat_dim * 2 + 4" sizing — confirm
    # against the feature-difference construction elsewhere.
    self._param_v = np.random.rand(
        self.env_spec.observation_space.flat_dim * 2 + 4)

    with tf.name_scope('bellman_error'):
        # delta_v: per-step Bellman error = reward + v . feature_diff,
        # restricted to valid (non-padding) timesteps.
        delta_v = tf.boolean_mask(i.reward_var,
                                  i.valid_var) + tf.tensordot(
                                      i.feat_diff, i.param_v, 1)

    with tf.name_scope('policy_loss'):
        ll = pol_dist.log_prob(i.action_var)
        ll = tf.boolean_mask(ll, i.valid_var)
        # Subtracting reduce_max inside exp is a log-sum-exp style shift
        # for numerical stability.
        loss = -tf.reduce_mean(
            ll * tf.exp(delta_v / i.param_eta -
                        tf.reduce_max(delta_v / i.param_eta)))

        # L2 regularization averaged over the regularizable parameters.
        reg_params = self.policy.get_regularizable_vars()
        loss += self._l2_reg_loss * tf.reduce_sum(
            [tf.reduce_mean(tf.square(param))
             for param in reg_params]) / len(reg_params)

    with tf.name_scope('kl'):
        kl = self._old_policy.distribution.kl_divergence(
            self.policy.distribution)
        pol_mean_kl = tf.reduce_mean(kl)

    with tf.name_scope('dual'):
        # Dual objective in eta and v; the reduce_max terms cancel the
        # stability shift applied inside the log-mean-exp.
        dual_loss = i.param_eta * self._epsilon + (
            i.param_eta * tf.math.log(
                tf.reduce_mean(
                    tf.exp(delta_v / i.param_eta -
                           tf.reduce_max(delta_v / i.param_eta)))) +
            i.param_eta * tf.reduce_max(delta_v / i.param_eta))

        # Penalize both very large and very small eta.
        dual_loss += self._l2_reg_dual * (tf.square(i.param_eta) +
                                          tf.square(1 / i.param_eta))

        dual_grad = tf.gradients(dual_loss, [i.param_eta, i.param_v])

    # yapf: disable
    self._f_dual = tensor_utils.compile_function(
        flatten_inputs(self._dual_opt_inputs),
        dual_loss,
        log_name='f_dual')
    # yapf: enable

    self._f_dual_grad = tensor_utils.compile_function(
        flatten_inputs(self._dual_opt_inputs),
        dual_grad,
        log_name='f_dual_grad')

    self._f_policy_kl = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        pol_mean_kl,
        log_name='f_policy_kl')

    return loss
def _build_policy_loss(self, i):
    """Initialize policy loss compile function based on inputs i.

    Builds the REPS-style weighted-likelihood policy loss and the dual
    objective for eta and v, then compiles ``f_dual``, ``f_dual_grad``
    and ``f_policy_kl`` as side effects. Also (re)initializes the dual
    parameters ``param_eta`` and ``param_v``.

    Args:
        i (namedtuple): Collection of input tensors used to compute the
            policy loss (flat/valid views of observations, actions,
            rewards, plus feat_diff, param_eta, param_v placeholders).

    Returns:
        tf.Tensor: Policy loss.

    Raises:
        NotImplementedError: If the policy is recurrent.

    """
    pol_dist = self.policy.distribution
    is_recurrent = self.policy.recurrent

    # Initialize dual params
    self.param_eta = 15.
    # NOTE(review): feature vector sized as flat_dim * 2 + 4 — confirm
    # against the feature-difference construction elsewhere.
    self.param_v = np.random.rand(
        self.env_spec.observation_space.flat_dim * 2 + 4)

    if is_recurrent:
        raise NotImplementedError

    policy_dist_info_flat = self.policy.dist_info_sym(
        i.flat.obs_var,
        i.flat.policy_state_info_vars,
        name='policy_dist_info_flat')

    # Keep only distribution info for valid (non-padding) timesteps.
    policy_dist_info_valid = filter_valids_dict(
        policy_dist_info_flat,
        i.flat.valid_var,
        name='policy_dist_info_valid')

    with tf.name_scope('bellman_error'):
        # delta_v: per-step Bellman error = reward + v . feature_diff.
        delta_v = i.valid.reward_var + tf.tensordot(
            i.feat_diff, i.param_v, 1)

    with tf.name_scope('policy_loss'):
        ll = pol_dist.log_likelihood_sym(i.valid.action_var,
                                         policy_dist_info_valid)
        # Subtracting reduce_max inside exp is a log-sum-exp style
        # shift for numerical stability.
        loss = -tf.reduce_mean(
            ll * tf.exp(delta_v / i.param_eta -
                        tf.reduce_max(delta_v / i.param_eta)))

        # L2 regularization averaged over the regularizable parameters.
        reg_params = self.policy.get_params(regularizable=True)
        loss += self.l2_reg_loss * tf.reduce_sum(
            [tf.reduce_mean(tf.square(param))
             for param in reg_params]) / len(reg_params)

    with tf.name_scope('kl'):
        kl = pol_dist.kl_sym(
            i.valid.policy_old_dist_info_vars,
            policy_dist_info_valid,
        )
        pol_mean_kl = tf.reduce_mean(kl)

    with tf.name_scope('dual'):
        # Dual objective in eta and v; the reduce_max terms cancel the
        # stability shift applied inside the log-mean-exp.
        dual_loss = i.param_eta * self.epsilon + i.param_eta * tf.log(
            tf.reduce_mean(
                tf.exp(delta_v / i.param_eta -
                       tf.reduce_max(delta_v / i.param_eta)))
        ) + i.param_eta * tf.reduce_max(delta_v / i.param_eta)

        # Penalize both very large and very small eta.
        dual_loss += self.l2_reg_dual * (tf.square(i.param_eta) +
                                         tf.square(1 / i.param_eta))

        dual_grad = tf.gradients(dual_loss, [i.param_eta, i.param_v])

    self.f_dual = tensor_utils.compile_function(flatten_inputs(
        self._dual_opt_inputs),
                                                dual_loss,
                                                log_name='f_dual')

    self.f_dual_grad = tensor_utils.compile_function(
        flatten_inputs(self._dual_opt_inputs),
        dual_grad,
        log_name='f_dual_grad')

    self.f_policy_kl = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        pol_mean_kl,
        log_name='f_policy_kl')

    return loss