def _init_opt(self): """Initialize optimizater. Raises: NotImplementedError: Raise if the policy is recurrent. """ # Input variables (pol_loss_inputs, pol_opt_inputs, infer_loss_inputs, infer_opt_inputs) = self._build_inputs() self._policy_opt_inputs = pol_opt_inputs self._inference_opt_inputs = infer_opt_inputs # Jointly optimize policy and encoder network pol_loss, pol_kl, _ = self._build_policy_loss(pol_loss_inputs) self._optimizer.update_opt(loss=pol_loss, target=self.policy, leq_constraint=(pol_kl, self._max_kl_step), inputs=flatten_inputs( self._policy_opt_inputs), constraint_name='mean_kl') # Optimize inference distribution separately (supervised learning) infer_loss, _ = self._build_inference_loss(infer_loss_inputs) self.inference_optimizer.update_opt(loss=infer_loss, target=self._inference, inputs=flatten_inputs( self._inference_opt_inputs))
def _dual_opt_input_values(self, episodes):
    """Update dual function optimization input values from sample data.

    Args:
        episodes (EpisodeBatch): Batch of episodes.

    Returns:
        list(np.ndarray): Flattened dual function optimization input
            values.

    """
    agent_infos = episodes.padded_agent_infos
    policy_state_info_list = [
        agent_infos[k] for k in self.policy.state_info_keys
    ]

    # pylint: disable=unexpected-keyword-arg
    dual_opt_input_values = self._dual_opt_inputs._replace(
        reward_var=episodes.padded_rewards,
        valid_var=episodes.valids,
        feat_diff=self._feat_diff,
        param_eta=self._param_eta,
        param_v=self._param_v,
        policy_state_info_vars_list=policy_state_info_list,
    )

    return flatten_inputs(dual_opt_input_values)
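# --- Illustrative sketch (not part of the library) ---------------------------
# A self-contained example of the namedtuple._replace + flatten pattern shared
# by the *_opt_input_values methods in this section: a template namedtuple of
# placeholders is filled with per-batch arrays, then flattened into a plain
# list whose order matches the feed list handed to TensorFlow. `_flatten` is a
# stand-in for flatten_inputs; names and shapes below are made up.
import collections

import numpy as np

_DualOptInputs = collections.namedtuple(
    '_DualOptInputs',
    ['reward_var', 'valid_var', 'feat_diff', 'param_eta', 'param_v',
     'policy_state_info_vars_list'])


def _flatten(inputs):
    """Recursively flatten nested lists/tuples of arrays into a flat list."""
    flat = []
    for item in inputs:
        if isinstance(item, (list, tuple)):
            flat.extend(_flatten(item))
        else:
            flat.append(item)
    return flat


_template = _DualOptInputs(None, None, None, None, None, [])
_values = _template._replace(
    reward_var=np.zeros((4, 10)),   # (n_episodes, max_episode_length)
    valid_var=np.ones((4, 10)),
    feat_diff=np.zeros((40, 8)),
    param_eta=15.0,
    param_v=np.zeros(8),
    policy_state_info_vars_list=[],
)
_flat_values = _flatten(_values)    # ordered to match the placeholder list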
def _policy_opt_input_values(self, episodes):
    """Update policy optimization input values from sample data.

    Args:
        episodes (EpisodeBatch): Batch of episodes.

    Returns:
        list(np.ndarray): Flattened policy optimization input values.

    """
    agent_infos = episodes.padded_agent_infos
    policy_state_info_list = [
        agent_infos[k] for k in self.policy.state_info_keys
    ]
    actions = [
        self._env_spec.action_space.flatten_n(act)
        for act in episodes.actions_list
    ]
    padded_actions = episodes.pad_to_last(np.concatenate(actions))

    # pylint: disable=unexpected-keyword-arg
    policy_opt_input_values = self._policy_opt_inputs._replace(
        obs_var=episodes.padded_observations,
        action_var=padded_actions,
        reward_var=episodes.padded_rewards,
        valid_var=episodes.valids,
        feat_diff=self._feat_diff,
        param_eta=self._param_eta,
        param_v=self._param_v,
        policy_state_info_vars_list=policy_state_info_list,
    )

    return flatten_inputs(policy_opt_input_values)
def _policy_opt_input_values(self, samples_data):
    """Update policy optimization input values from sample data.

    Args:
        samples_data (dict): Processed sample data.
            See garage.tf.paths_to_tensors() for details.

    Returns:
        list(np.ndarray): Flattened policy optimization input values.

    """
    policy_state_info_list = [
        samples_data['agent_infos'][k] for k in self.policy.state_info_keys
    ]
    # yapf: disable
    # pylint: disable=unexpected-keyword-arg
    policy_opt_input_values = self._policy_opt_inputs._replace(
        obs_var=samples_data['observations'],
        action_var=samples_data['actions'],
        reward_var=samples_data['rewards'],
        valid_var=samples_data['valids'],
        feat_diff=self._feat_diff,
        param_eta=self._param_eta,
        param_v=self._param_v,
        policy_state_info_vars_list=policy_state_info_list,
    )

    return flatten_inputs(policy_opt_input_values)
def _policy_opt_input_values(self, samples_data):
    """Map episode samples to the policy optimizer inputs.

    Args:
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    Returns:
        list(np.ndarray): Flattened policy optimization input values.

    """
    policy_state_info_list = [
        samples_data['agent_infos'][k] for k in self.policy.state_info_keys
    ]
    embed_state_info_list = [
        samples_data['latent_infos'][k]
        for k in self.policy.encoder.state_info_keys
    ]
    # pylint: disable=unexpected-keyword-arg
    policy_opt_input_values = self._policy_opt_inputs._replace(
        obs_var=samples_data['observations'],
        action_var=samples_data['actions'],
        reward_var=samples_data['rewards'],
        baseline_var=samples_data['baselines'],
        trajectory_var=samples_data['trajectories'],
        task_var=samples_data['tasks'],
        latent_var=samples_data['latents'],
        valid_var=samples_data['valids'],
        policy_state_info_vars_list=policy_state_info_list,
        embed_state_info_vars_list=embed_state_info_list,
    )

    return flatten_inputs(policy_opt_input_values)
def _inference_opt_input_values(self, episodes, embed_eps, embed_ep_infos):
    """Map episode samples to the inference optimizer inputs.

    Args:
        episodes (EpisodeBatch): Batch of episodes.
        embed_eps (np.ndarray): Embedding episodes.
        embed_ep_infos (dict): Embedding distribution information.

    Returns:
        list(np.ndarray): Flattened inference optimization input values.

    """
    latents = pad_batch_array(episodes.agent_infos['latent'],
                              episodes.lengths, self.max_episode_length)

    infer_state_info_list = [
        embed_ep_infos[k] for k in self._inference.state_info_keys
    ]
    # pylint: disable=unexpected-keyword-arg
    inference_opt_input_values = self._inference_opt_inputs._replace(
        latent_var=latents,
        trajectory_var=embed_eps,
        valid_var=episodes.valids,
        infer_state_info_vars_list=infer_state_info_list,
    )

    return flatten_inputs(inference_opt_input_values)
def _policy_opt_input_values(self, episodes, baselines):
    """Map episode samples to the policy optimizer inputs.

    Args:
        episodes (EpisodeBatch): Batch of episodes.
        baselines (np.ndarray): Baseline predictions.

    Returns:
        list(np.ndarray): Flattened policy optimization input values.

    """
    agent_infos = episodes.padded_agent_infos
    policy_state_info_list = [
        agent_infos[k] for k in self.policy.state_info_keys
    ]
    actions = [
        self._env_spec.action_space.flatten_n(act)
        for act in episodes.actions_list
    ]
    padded_actions = pad_batch_array(np.concatenate(actions),
                                     episodes.lengths,
                                     self.max_episode_length)

    # pylint: disable=unexpected-keyword-arg
    policy_opt_input_values = self._policy_opt_inputs._replace(
        obs_var=episodes.padded_observations,
        action_var=padded_actions,
        reward_var=episodes.padded_rewards,
        baseline_var=baselines,
        valid_var=episodes.valids,
        policy_state_info_vars_list=policy_state_info_list,
    )

    return flatten_inputs(policy_opt_input_values)
def _build_entropy_term(self, i):
    """Build policy entropy tensor.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy entropy.

    """
    pol_dist = self._policy_network.dist

    with tf.name_scope('policy_entropy'):
        if self._use_neg_logli_entropy:
            policy_entropy = -pol_dist.log_prob(i.action_var,
                                                name='policy_log_likeli')
        else:
            policy_entropy = pol_dist.entropy()

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            policy_entropy = tf.nn.softplus(policy_entropy)

        if self._stop_entropy_gradient:
            policy_entropy = tf.stop_gradient(policy_entropy)

    # Dense form, matching the shape of the advantage tensor
    policy_entropy = tf.reshape(policy_entropy,
                                [-1, self.max_episode_length])

    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), policy_entropy)

    return policy_entropy
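# --- Illustrative sketch (not part of the library) ---------------------------
# Why the softplus transform above keeps the entropy bonus non-negative: for a
# Gaussian policy with a small standard deviation, the analytic differential
# entropy 0.5 * log(2*pi*e*sigma^2) is negative, while softplus(x) =
# log(1 + exp(x)) maps it back into (0, inf) and stays monotone. Pure NumPy,
# independent of the graph code above.
import numpy as np


def _gaussian_entropy(sigma):
    return 0.5 * np.log(2 * np.pi * np.e * sigma**2)


def _softplus(x):
    return np.log1p(np.exp(x))


for _sigma in (1.0, 0.1, 0.01):
    _h = _gaussian_entropy(_sigma)
    print('sigma=%5.2f  entropy=%7.3f  softplus(entropy)=%6.3f' %
          (_sigma, _h, _softplus(_h)))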
def _init_opt(self): """Initialize the optimization procedure.""" pol_loss_inputs, pol_opt_inputs, dual_opt_inputs = self._build_inputs() self._policy_opt_inputs = pol_opt_inputs self._dual_opt_inputs = dual_opt_inputs pol_loss = self._build_policy_loss(pol_loss_inputs) self._optimizer.update_opt(loss=pol_loss, target=self.policy, inputs=flatten_inputs( self._policy_opt_inputs))
def _init_opt(self): """Initialize optimizater.""" pol_loss_inputs, pol_opt_inputs = self._build_inputs() self._policy_opt_inputs = pol_opt_inputs pol_loss, pol_kl = self._build_policy_loss(pol_loss_inputs) self._optimizer.update_opt(loss=pol_loss, target=self.policy, leq_constraint=(pol_kl, self._max_kl_step), inputs=flatten_inputs( self._policy_opt_inputs), constraint_name='mean_kl')
def _policy_opt_input_values(self, episodes, baselines, embed_eps):
    """Map episode samples to the policy optimizer inputs.

    Args:
        episodes (EpisodeBatch): Batch of episodes.
        baselines (np.ndarray): Baseline predictions.
        embed_eps (np.ndarray): Embedding episodes.

    Returns:
        list(np.ndarray): Flattened policy optimization input values.

    """
    actions = [
        self._env_spec.action_space.flatten_n(act)
        for act in episodes.actions_list
    ]
    actions = pad_batch_array(np.concatenate(actions), episodes.lengths,
                              self.max_episode_length)
    tasks = pad_batch_array(episodes.env_infos['task_onehot'],
                            episodes.lengths, self.max_episode_length)
    latents = pad_batch_array(episodes.agent_infos['latent'],
                              episodes.lengths, self.max_episode_length)

    agent_infos = episodes.padded_agent_infos
    policy_state_info_list = [
        agent_infos[k] for k in self.policy.state_info_keys
    ]
    embed_state_info_list = [
        agent_infos['latent_' + k]
        for k in self.policy.encoder.state_info_keys
    ]
    # pylint: disable=unexpected-keyword-arg
    policy_opt_input_values = self._policy_opt_inputs._replace(
        obs_var=episodes.padded_observations,
        action_var=actions,
        reward_var=episodes.padded_rewards,
        baseline_var=baselines,
        trajectory_var=embed_eps,
        task_var=tasks,
        latent_var=latents,
        valid_var=episodes.valids,
        policy_state_info_vars_list=policy_state_info_list,
        embed_state_info_vars_list=embed_state_info_list,
    )

    return flatten_inputs(policy_opt_input_values)
def _build_encoder_kl(self):
    """Build graph for encoder KL divergence.

    Returns:
        tf.Tensor: Encoder KL divergence.

    """
    dist = self._encoder_network.dist
    old_dist = self._old_encoder_network.dist

    with tf.name_scope('encoder_kl'):
        kl = old_dist.kl_divergence(dist)
        mean_kl = tf.reduce_mean(kl)

        # Diagnostic function
        self._f_encoder_kl = compile_function(
            flatten_inputs(self._policy_opt_inputs), mean_kl)

        return mean_kl
def _inference_opt_input_values(self, samples_data):
    """Map episode samples to the inference optimizer inputs.

    Args:
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    Returns:
        list(np.ndarray): Flattened inference optimization input values.

    """
    infer_state_info_list = [
        samples_data['trajectory_infos'][k]
        for k in self._inference.state_info_keys
    ]
    # pylint: disable=unexpected-keyword-arg
    inference_opt_input_values = self._inference_opt_inputs._replace(
        latent_var=samples_data['latents'],
        trajectory_var=samples_data['trajectories'],
        valid_var=samples_data['valids'],
        infer_state_info_vars_list=infer_state_info_list,
    )

    return flatten_inputs(inference_opt_input_values)
def _build_policy_loss(self, i):
    """Build policy loss and other output tensors.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss.

    Raises:
        NotImplementedError: If is_recurrent is True.

    """
    pol_dist = self._policy_network.dist
    old_pol_dist = self._old_policy_network.dist

    # Initialize dual params
    self._param_eta = 15.
    self._param_v = np.random.rand(
        self._env_spec.observation_space.flat_dim * 2 + 4)

    with tf.name_scope('bellman_error'):
        delta_v = tf.boolean_mask(i.reward_var, i.valid_var) + tf.tensordot(
            i.feat_diff, i.param_v, 1)

    with tf.name_scope('policy_loss'):
        ll = pol_dist.log_prob(i.action_var)
        ll = tf.boolean_mask(ll, i.valid_var)
        loss = -tf.reduce_mean(
            ll * tf.exp(delta_v / i.param_eta -
                        tf.reduce_max(delta_v / i.param_eta)))

        reg_params = self.policy.get_regularizable_vars()
        loss += self._l2_reg_loss * tf.reduce_sum(
            [tf.reduce_mean(tf.square(param))
             for param in reg_params]) / len(reg_params)

    with tf.name_scope('kl'):
        kl = old_pol_dist.kl_divergence(pol_dist)
        pol_mean_kl = tf.reduce_mean(kl)

    with tf.name_scope('dual'):
        dual_loss = i.param_eta * self._epsilon + (
            i.param_eta * tf.math.log(
                tf.reduce_mean(
                    tf.exp(delta_v / i.param_eta -
                           tf.reduce_max(delta_v / i.param_eta)))) +
            i.param_eta * tf.reduce_max(delta_v / i.param_eta))

        dual_loss += self._l2_reg_dual * (tf.square(i.param_eta) +
                                          tf.square(1 / i.param_eta))

        dual_grad = tf.gradients(dual_loss, [i.param_eta, i.param_v])

    # Functions for evaluating the dual, its gradient, and the policy KL
    self._f_dual = compile_function(flatten_inputs(self._dual_opt_inputs),
                                    dual_loss)
    self._f_dual_grad = compile_function(
        flatten_inputs(self._dual_opt_inputs), dual_grad)
    self._f_policy_kl = compile_function(
        flatten_inputs(self._policy_opt_inputs), pol_mean_kl)

    return loss
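# --- Illustrative sketch (not part of the library) ---------------------------
# A NumPy/SciPy sketch of the dual objective built in _build_policy_loss()
# above: g(eta, v) = eta * epsilon + eta * log E[exp(delta_v / eta)], with
# delta_v = r + <feat_diff, v>, written with the same max-subtraction trick
# for numerical stability. The random data, epsilon value, and the use of
# scipy.optimize.minimize are assumptions for the sketch only; they are not
# the optimizer this algorithm actually uses.
import numpy as np
from scipy.optimize import minimize

_rng = np.random.default_rng(0)
_rewards = _rng.normal(size=100)          # valid per-step rewards, flattened
_feat_diff = _rng.normal(size=(100, 8))   # per-step feature differences
_epsilon = 0.5                            # relative-entropy bound


def _dual(x):
    eta, v = x[0], x[1:]
    delta_v = _rewards + _feat_diff @ v
    z = delta_v / eta
    z_max = z.max()
    # eta * epsilon + eta * (log mean exp(z - z_max) + z_max)
    return eta * _epsilon + eta * (np.log(np.mean(np.exp(z - z_max))) + z_max)


_result = minimize(_dual,
                   np.concatenate([[15.0], np.zeros(8)]),
                   method='L-BFGS-B',
                   bounds=[(1e-6, None)] + [(None, None)] * 8)
_eta_star, _v_star = _result.x[0], _result.x[1:]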
def _build_policy_loss(self, i):
    """Build policy loss and other output tensors.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss.
        tf.Tensor: Mean policy KL divergence.
        tf.Tensor: Mean encoder KL divergence.

    """
    # pylint: disable=too-many-statements
    self._policy_network, self._encoder_network = (self.policy.build(
        i.augmented_obs_var, i.task_var, name='loss_policy'))
    self._old_policy_network, self._old_encoder_network = (
        self._old_policy.build(i.augmented_obs_var,
                               i.task_var,
                               name='loss_old_policy'))
    self._infer_network = self._inference.build(i.augmented_traj_var,
                                                name='loss_infer')
    self._old_infer_network = self._old_inference.build(
        i.augmented_traj_var, name='loss_old_infer')

    pol_dist = self._policy_network.dist
    old_pol_dist = self._old_policy_network.dist

    # Entropy terms
    encoder_entropy, inference_ce, policy_entropy = (
        self._build_entropy_terms(i))

    # Augment the path rewards with entropy terms
    with tf.name_scope('augmented_rewards'):
        rewards = (i.reward_var -
                   (self.inference_ce_coeff * inference_ce) +
                   (self._policy_ent_coeff * policy_entropy))

    with tf.name_scope('policy_loss'):
        with tf.name_scope('advantages'):
            adv = compute_advantages(self._discount,
                                     self._gae_lambda,
                                     self.max_episode_length,
                                     i.baseline_var,
                                     rewards,
                                     name='advantages')
            adv = tf.reshape(adv, [-1, self.max_episode_length])

        # Optionally normalize advantages
        eps = tf.constant(1e-8, dtype=tf.float32)
        if self._center_adv:
            adv = center_advs(adv, axes=[0], eps=eps)
        if self._positive_adv:
            adv = positive_advs(adv, eps)

        # Calculate loss function and KL divergence
        with tf.name_scope('kl'):
            kl = old_pol_dist.kl_divergence(pol_dist)
            pol_mean_kl = tf.reduce_mean(kl)

        ll = pol_dist.log_prob(i.action_var, name='log_likelihood')

        # Calculate surrogate loss
        with tf.name_scope('surr_loss'):
            old_ll = old_pol_dist.log_prob(i.action_var)
            old_ll = tf.stop_gradient(old_ll)
            # Clip early to avoid overflow
            lr = tf.exp(
                tf.minimum(ll - old_ll, np.log(1 + self._lr_clip_range)))

            surrogate = lr * adv
            surrogate = tf.debugging.check_numerics(surrogate,
                                                    message='surrogate')

        # Finalize objective function
        with tf.name_scope('loss'):
            lr_clip = tf.clip_by_value(lr,
                                       1 - self._lr_clip_range,
                                       1 + self._lr_clip_range,
                                       name='lr_clip')
            surr_clip = lr_clip * adv
            obj = tf.minimum(surrogate, surr_clip, name='surr_obj')
            obj = tf.boolean_mask(obj, i.valid_var)
            # Maximize E[surrogate objective] by minimizing
            # -E_t[surrogate objective]
            loss = -tf.reduce_mean(obj)

            # Encoder entropy bonus
            loss -= self.encoder_ent_coeff * encoder_entropy

        encoder_mean_kl = self._build_encoder_kl()

        # Diagnostic functions
        self._f_policy_kl = tf.compat.v1.get_default_session(
        ).make_callable(pol_mean_kl,
                        feed_list=flatten_inputs(self._policy_opt_inputs))

        self._f_rewards = tf.compat.v1.get_default_session().make_callable(
            rewards, feed_list=flatten_inputs(self._policy_opt_inputs))

        returns = discounted_returns(self._discount,
                                     self.max_episode_length,
                                     rewards,
                                     name='returns')
        self._f_returns = tf.compat.v1.get_default_session().make_callable(
            returns, feed_list=flatten_inputs(self._policy_opt_inputs))

    return loss, pol_mean_kl, encoder_mean_kl
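# --- Illustrative sketch (not part of the library) ---------------------------
# The reward augmentation used in _build_policy_loss() above, in plain NumPy:
# rewards are lowered by the inference cross-entropy (pushing the policy
# toward trajectories the inference network can decode back to the latent)
# and raised by the policy entropy bonus. All values and coefficients below
# are made up for the example.
import numpy as np

_rewards = np.array([1.0, 0.5, 0.0, 2.0])
_inference_ce = np.array([0.3, 0.1, 0.4, 0.2])     # -log q(z | trajectory window)
_policy_entropy = np.array([0.8, 0.7, 0.9, 0.6])   # -log pi(a | s, z)
_inference_ce_coeff, _policy_ent_coeff = 0.01, 0.001

_augmented_rewards = (_rewards
                      - _inference_ce_coeff * _inference_ce
                      + _policy_ent_coeff * _policy_entropy)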
def _build_policy_loss(self, i):
    """Build policy loss and other output tensors.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss.
        tf.Tensor: Mean policy KL divergence.

    """
    policy_entropy = self._build_entropy_term(i)
    rewards = i.reward_var

    if self._maximum_entropy:
        with tf.name_scope('augmented_rewards'):
            rewards = i.reward_var + (self._policy_ent_coeff *
                                      policy_entropy)

    with tf.name_scope('policy_loss'):
        adv = compute_advantages(self._discount,
                                 self._gae_lambda,
                                 self.max_episode_length,
                                 i.baseline_var,
                                 rewards,
                                 name='adv')
        adv = tf.reshape(adv, [-1, self.max_episode_length])

        # Optionally normalize advantages
        eps = tf.constant(1e-8, dtype=tf.float32)
        if self._center_adv:
            adv = center_advs(adv, axes=[0], eps=eps)
        if self._positive_adv:
            adv = positive_advs(adv, eps)

        old_policy_dist = self._old_policy_network.dist
        policy_dist = self._policy_network.dist

        with tf.name_scope('kl'):
            kl = old_policy_dist.kl_divergence(policy_dist)
            pol_mean_kl = tf.reduce_mean(kl)

        # Calculate vanilla loss
        with tf.name_scope('vanilla_loss'):
            ll = policy_dist.log_prob(i.action_var, name='log_likelihood')
            vanilla = ll * adv

        # Calculate surrogate loss
        with tf.name_scope('surrogate_loss'):
            lr = tf.exp(ll - old_policy_dist.log_prob(i.action_var))
            surrogate = lr * adv

        # Finalize objective function
        with tf.name_scope('loss'):
            if self._pg_loss == 'vanilla':
                # VPG uses the vanilla objective
                obj = tf.identity(vanilla, name='vanilla_obj')
            elif self._pg_loss == 'surrogate':
                # TRPO uses the standard surrogate objective
                obj = tf.identity(surrogate, name='surr_obj')
            elif self._pg_loss == 'surrogate_clip':
                # PPO uses the clipped surrogate objective
                lr_clip = tf.clip_by_value(lr,
                                           1 - self._lr_clip_range,
                                           1 + self._lr_clip_range,
                                           name='lr_clip')
                surr_clip = lr_clip * adv
                obj = tf.minimum(surrogate, surr_clip, name='surr_obj')

            if self._entropy_regularzied:
                obj += self._policy_ent_coeff * policy_entropy

            # Keep only the valid (non-padded) values
            obj = tf.boolean_mask(obj, i.valid_var)
            # Maximize E[surrogate objective] by minimizing
            # -E_t[surrogate objective]
            loss = -tf.reduce_mean(obj)

        # Diagnostic functions
        self._f_policy_kl = tf.compat.v1.get_default_session(
        ).make_callable(pol_mean_kl,
                        feed_list=flatten_inputs(self._policy_opt_inputs))

        self._f_rewards = tf.compat.v1.get_default_session().make_callable(
            rewards, feed_list=flatten_inputs(self._policy_opt_inputs))

        returns = discounted_returns(self._discount,
                                     self.max_episode_length, rewards)
        self._f_returns = tf.compat.v1.get_default_session().make_callable(
            returns, feed_list=flatten_inputs(self._policy_opt_inputs))

        return loss, pol_mean_kl
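# --- Illustrative sketch (not part of the library) ---------------------------
# The three selectable objectives from _build_policy_loss() above
# ('vanilla', 'surrogate', 'surrogate_clip'), rendered in NumPy for one batch
# of per-step quantities. The arrays below stand in for the log-likelihood,
# old log-likelihood, and advantage tensors in the graph.
import numpy as np

_rng = np.random.default_rng(1)
_log_prob = _rng.normal(size=64)        # log pi_new(a | s)
_old_log_prob = _rng.normal(size=64)    # log pi_old(a | s), held constant
_adv = _rng.normal(size=64)             # estimated advantages
_clip_range = 0.2

_vanilla = _log_prob * _adv                             # VPG objective
_ratio = np.exp(_log_prob - _old_log_prob)              # importance ratio
_surrogate = _ratio * _adv                              # TRPO surrogate
_ratio_clip = np.clip(_ratio, 1 - _clip_range, 1 + _clip_range)
_surrogate_clip = np.minimum(_surrogate, _ratio_clip * _adv)  # PPO objective

# Maximize the chosen objective by minimizing its negative mean.
_loss = -_surrogate_clip.mean()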
def _build_entropy_terms(self, i):
    """Build encoder entropy, inference cross-entropy, and policy entropy.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Encoder entropy.
        tf.Tensor: Inference cross-entropy.
        tf.Tensor: Policy entropy.

    """
    pol_dist = self._policy_network.dist
    infer_dist = self._infer_network.dist
    enc_dist = self._encoder_network.dist
    with tf.name_scope('entropy_terms'):
        # 1. Encoder distribution total entropy
        with tf.name_scope('encoder_entropy'):
            encoder_dist, _, _ = self.policy.encoder.build(
                i.task_var, name='encoder_entropy').outputs
            encoder_all_task_entropies = -encoder_dist.log_prob(
                i.latent_var)

            if self._use_softplus_entropy:
                encoder_entropy = tf.nn.softplus(
                    encoder_all_task_entropies)
            else:
                # Fall back to the raw entropies when softplus is disabled
                encoder_entropy = encoder_all_task_entropies

            encoder_entropy = tf.reduce_mean(encoder_entropy,
                                             name='encoder_entropy')
            encoder_entropy = tf.stop_gradient(encoder_entropy)

        # 2. Inference distribution cross-entropy (log-likelihood)
        with tf.name_scope('inference_ce'):
            # Build inference with trajectory windows
            traj_ll = infer_dist.log_prob(
                enc_dist.sample(seed=deterministic.get_tf_seed_stream()),
                name='traj_ll')

            inference_ce_raw = -traj_ll
            inference_ce = tf.clip_by_value(inference_ce_raw, -3, 3)

            if self._use_softplus_entropy:
                inference_ce = tf.nn.softplus(inference_ce)

            if self._stop_ce_gradient:
                inference_ce = tf.stop_gradient(inference_ce)

        # 3. Policy path entropies
        with tf.name_scope('policy_entropy'):
            policy_entropy = -pol_dist.log_prob(i.action_var,
                                                name='policy_log_likeli')

            # This prevents entropy from becoming negative
            # for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            policy_entropy = tf.stop_gradient(policy_entropy)

    # Diagnostic functions
    self._f_task_entropies = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        encoder_all_task_entropies)
    self._f_encoder_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), encoder_entropy)
    self._f_inference_ce = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        tf.reduce_mean(inference_ce * i.valid_var))
    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), policy_entropy)

    return encoder_entropy, inference_ce, policy_entropy