Example No. 1
    def init_opt(self):
        if self.policy.recurrent:
            raise NotImplementedError

        # Input variables
        (pol_loss_inputs, pol_opt_inputs, infer_loss_inputs,
         infer_opt_inputs) = self._build_inputs()

        self._policy_opt_inputs = pol_opt_inputs
        self._inference_opt_inputs = infer_opt_inputs

        # Jointly optimize policy and embedding network
        pol_loss, pol_kl, embed_kl = self._build_policy_loss(pol_loss_inputs)
        self.optimizer.update_opt(loss=pol_loss,
                                  target=self.policy,
                                  leq_constraint=(pol_kl, self.max_kl_step),
                                  inputs=flatten_inputs(
                                      self._policy_opt_inputs),
                                  constraint_name="mean_kl")

        # Optimize inference distribution separately (supervised learning)
        infer_loss, infer_kl = self._build_inference_loss(infer_loss_inputs)
        self.inference_optimizer.update_opt(loss=infer_loss,
                                            target=self.inference,
                                            inputs=flatten_inputs(
                                                self._inference_opt_inputs))

        return dict()
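
Note: every snippet in this section funnels its optimization inputs through flatten_inputs() before handing them to update_opt() or compile_function(). Below is a minimal sketch of what such a helper does, assuming it simply flattens a (possibly nested) namedtuple of inputs into a flat list; this is an illustration of the pattern, not garage's actual implementation.

import collections

def flatten_inputs_sketch(inputs):
    """Recursively flatten tuples/lists/namedtuples into a flat list."""
    flat = []
    for item in inputs:
        if isinstance(item, (tuple, list)):
            flat.extend(flatten_inputs_sketch(item))
        else:
            flat.append(item)
    return flat

PolicyOptInputs = collections.namedtuple(
    'PolicyOptInputs',
    ['obs_var', 'action_var', 'policy_state_info_vars_list'])
example = PolicyOptInputs(obs_var='obs', action_var='act',
                          policy_state_info_vars_list=['lstm_h', 'lstm_c'])
print(flatten_inputs_sketch(example))  # ['obs', 'act', 'lstm_h', 'lstm_c']
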
Example No. 2
    def init_opt(self):
        """Initialize optimizater.

        Raises:
            NotImplementedError: Raised if the policy is recurrent.

        """
        # Input variables
        (pol_loss_inputs, pol_opt_inputs, infer_loss_inputs,
         infer_opt_inputs) = self._build_inputs()

        self._policy_opt_inputs = pol_opt_inputs
        self._inference_opt_inputs = infer_opt_inputs

        # Jointly optimize policy and encoder network
        pol_loss, pol_kl, _ = self._build_policy_loss(pol_loss_inputs)
        self._optimizer.update_opt(loss=pol_loss,
                                   target=self.policy,
                                   leq_constraint=(pol_kl, self._max_kl_step),
                                   inputs=flatten_inputs(
                                       self._policy_opt_inputs),
                                   constraint_name='mean_kl')

        # Optimize inference distribution separately (supervised learning)
        infer_loss, _ = self._build_inference_loss(infer_loss_inputs)
        self.inference_optimizer.update_opt(loss=infer_loss,
                                            target=self._inference,
                                            inputs=flatten_inputs(
                                                self._inference_opt_inputs))
Example No. 3
    def _policy_opt_input_values(self, samples_data):
        """Update policy optimize input values based on samples data.

        Args:
            samples_data (dict): Processed sample data.
                See process_samples() for details.

        Returns:
            list(np.ndarray): Flattened policy optimization input values.

        """
        policy_state_info_list = [
            samples_data['agent_infos'][k]
            for k in self.policy.state_info_keys
        ]   # yapf: disable

        # pylint: disable=unexpected-keyword-arg
        policy_opt_input_values = self._policy_opt_inputs._replace(
            obs_var=samples_data['observations'],
            action_var=samples_data['actions'],
            reward_var=samples_data['rewards'],
            valid_var=samples_data['valids'],
            feat_diff=self._feat_diff,
            param_eta=self._param_eta,
            param_v=self._param_v,
            policy_state_info_vars_list=policy_state_info_list,
        )

        return flatten_inputs(policy_opt_input_values)
Example No. 4
    def _build_entropy_term(self, i):
        """Build policy entropy tensor.

        Args:
            i (namedtuple): Collection of variables to compute policy loss.

        Returns:
            tf.Tensor: Policy entropy.

        """
        pol_dist = self.policy.distribution

        with tf.name_scope('policy_entropy'):
            if self._use_neg_logli_entropy:
                policy_entropy = -pol_dist.log_prob(i.action_var,
                                                    name='policy_log_likeli')
            else:
                policy_entropy = pol_dist.entropy()

            # This prevents entropy from becoming negative for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            if self._stop_entropy_gradient:
                policy_entropy = tf.stop_gradient(policy_entropy)

        # dense form, match the shape of advantage
        policy_entropy = tf.reshape(policy_entropy, [-1, self.max_path_length])

        self._f_policy_entropy = compile_function(
            flatten_inputs(self._policy_opt_inputs), policy_entropy)

        return policy_entropy
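
When _use_neg_logli_entropy is set, the snippet above estimates the per-step entropy from the sampled action instead of computing it in closed form. In LaTeX (notation mine), the two options are:

\[
\mathcal{H}\big[\pi_\theta(\cdot \mid s_t)\big]
  = -\mathbb{E}_{a \sim \pi_\theta(\cdot \mid s_t)}\big[\log \pi_\theta(a \mid s_t)\big]
  \;\approx\; -\log \pi_\theta(a_t \mid s_t),
\qquad a_t \sim \pi_\theta(\cdot \mid s_t).
\]

The softplus guard then maps this estimate through log(1 + e^x) so that small-variance policies cannot drive the term negative, and stop_gradient keeps the entropy bonus out of the policy gradient when it is only used for reward shaping.
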
Example No. 5
    def _policy_opt_input_values(self, samples_data):
        """Map rollout samples to the policy optimizer inputs.

        Args:
            samples_data (dict): Processed sample data.
                See process_samples() for details.

        Returns:
            list(np.ndarray): Flattened policy optimization input values.

        """
        policy_state_info_list = [
            samples_data['agent_infos'][k] for k in self.policy.state_info_keys
        ]
        embed_state_info_list = [
            samples_data['latent_infos'][k]
            for k in self.policy.encoder.state_info_keys
        ]
        # pylint: disable=unexpected-keyword-arg
        policy_opt_input_values = self._policy_opt_inputs._replace(
            obs_var=samples_data['observations'],
            action_var=samples_data['actions'],
            reward_var=samples_data['rewards'],
            baseline_var=samples_data['baselines'],
            trajectory_var=samples_data['trajectories'],
            task_var=samples_data['tasks'],
            latent_var=samples_data['latents'],
            valid_var=samples_data['valids'],
            policy_state_info_vars_list=policy_state_info_list,
            embed_state_info_vars_list=embed_state_info_list,
        )

        return flatten_inputs(policy_opt_input_values)
Example No. 6
    def _policy_opt_input_values(self, samples_data):
        """Map rollout samples to the policy optimizer inputs.

        Args:
            samples_data (dict): Processed sample data.
                See garage.tf.paths_to_tensors() for details.

        Returns:
            list(np.ndarray): Flattened policy optimization input values.

        """
        policy_state_info_list = [
            samples_data['agent_infos'][k] for k in self.policy.state_info_keys
        ]

        policy_opt_input_values = self._policy_opt_inputs._replace(
            obs_var=samples_data['observations'],
            action_var=samples_data['actions'],
            reward_var=samples_data['rewards'],
            baseline_var=samples_data['baselines'],
            valid_var=samples_data['valids'],
            policy_state_info_vars_list=policy_state_info_list,
        )

        return flatten_inputs(policy_opt_input_values)
Example No. 7
    def _build_entropy_term(self, i):
        with tf.name_scope("policy_entropy"):
            if self.policy.recurrent:
                policy_dist_info_flat = self.policy.dist_info_sym(
                    i.obs_var,
                    i.policy_state_info_vars,
                    name="policy_dist_info")
            else:
                policy_dist_info_flat = self.policy.dist_info_sym(
                    i.flat.obs_var,
                    i.flat.policy_state_info_vars,
                    name="policy_dist_info_flat")

            policy_entropy_flat = self.policy.distribution.entropy_sym(
                policy_dist_info_flat)
            policy_entropy = tf.reshape(policy_entropy_flat,
                                        [-1, self.max_path_length])

            # This prevents entropy from becoming negative for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            policy_entropy = tf.reduce_mean(policy_entropy * i.valid_var)

        self.f_policy_entropy = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            policy_entropy,
            log_name="f_policy_entropy")

        return policy_entropy
Example No. 8
    def _dual_opt_input_values(self, samples_data):
        """Update dual func optimize input values based on samples data.

        Args:
            samples_data (dict): Processed sample data.
                See garage.tf.paths_to_tensors() for details.

        Returns:
            list(np.ndarray): Flattened dual function optimization input values.

        """
        policy_state_info_list = [
            samples_data['agent_infos'][k]
            for k in self.policy.state_info_keys
        ]  # yapf: disable

        # pylint: disable=unexpected-keyword-arg
        dual_opt_input_values = self._dual_opt_inputs._replace(
            reward_var=samples_data['rewards'],
            valid_var=samples_data['valids'],
            feat_diff=self._feat_diff,
            param_eta=self._param_eta,
            param_v=self._param_v,
            policy_state_info_vars_list=policy_state_info_list,
        )

        return flatten_inputs(dual_opt_input_values)
Example No. 9
    def _build_embedding_kl(self, i):
        dist = self.policy._embedding._dist
        with tf.name_scope("embedding_kl"):
            # new distribution
            embed_dist_info_flat = self.policy._embedding.dist_info_sym(
                i.flat.task_var,
                i.flat.embed_state_info_vars,
                name="embed_dist_info_flat")
            embed_dist_info_valid = filter_valids_dict(
                embed_dist_info_flat,
                i.flat.valid_var,
                name="embed_dist_info_valid")

            # calculate KL divergence
            kl = dist.kl_sym(i.valid.embed_old_dist_info_vars,
                             embed_dist_info_valid)
            mean_kl = tf.reduce_mean(kl)

            # Diagnostic function
            self.f_embedding_kl = tensor_utils.compile_function(
                flatten_inputs(self._policy_opt_inputs),
                mean_kl,
                log_name="f_embedding_kl")

            return mean_kl
Example No. 10
    def _policy_opt_input_values(self, samples_data):
        """ Map rollout samples to the policy optimizer inputs """

        policy_state_info_list = [
            samples_data["agent_infos"][k] for k in self.policy.state_info_keys
        ]
        policy_old_dist_info_list = [
            samples_data["agent_infos"][k]
            for k in self.policy._dist.dist_info_keys
        ]
        embed_state_info_list = [
            samples_data["latent_infos"][k]
            for k in self.policy.embedding.state_info_keys
        ]
        embed_old_dist_info_list = [
            samples_data["latent_infos"][k]
            for k in self.policy.embedding._dist.dist_info_keys
        ]
        policy_opt_input_values = self._policy_opt_inputs._replace(
            obs_var=samples_data["observations"],
            action_var=samples_data["actions"],
            reward_var=samples_data["rewards"],
            baseline_var=samples_data["baselines"],
            trajectory_var=samples_data["trajectories"],
            task_var=samples_data["tasks"],
            latent_var=samples_data["latents"],
            valid_var=samples_data["valids"],
            policy_state_info_vars_list=policy_state_info_list,
            policy_old_dist_info_vars_list=policy_old_dist_info_list,
            embed_state_info_vars_list=embed_state_info_list,
            embed_old_dist_info_vars_list=embed_old_dist_info_list,
        )
        return flatten_inputs(policy_opt_input_values)
Example No. 11
    def _policy_opt_input_values(self, samples_data):
        """Update policy optimize input values based on samples data."""
        policy_state_info_list = [
            samples_data['agent_infos'][k]
            for k in self.policy.state_info_keys
        ]   # yapf: disable
        policy_old_dist_info_list = [
            samples_data['agent_infos'][k]
            for k in self.policy.distribution.dist_info_keys
        ]

        # pylint: disable=locally-disabled, unexpected-keyword-arg
        policy_opt_input_values = self._policy_opt_inputs._replace(
            obs_var=samples_data['observations'],
            action_var=samples_data['actions'],
            reward_var=samples_data['rewards'],
            valid_var=samples_data['valids'],
            feat_diff=self.feat_diff,
            param_eta=self.param_eta,
            param_v=self.param_v,
            policy_state_info_vars_list=policy_state_info_list,
            policy_old_dist_info_vars_list=policy_old_dist_info_list,
        )

        return flatten_inputs(policy_opt_input_values)
Example No. 12
    def _build_entropy_term(self, i):
        with tf.name_scope("policy_entropy"):
            if self.policy.recurrent:
                policy_dist_info = self.policy.dist_info_sym(
                    i.obs_var,
                    i.policy_state_info_vars,
                    name="policy_dist_info")

                policy_neg_log_likeli = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                    i.action_var,
                    policy_dist_info,
                    name="policy_log_likeli")

                if self._use_neg_logli_entropy:
                    policy_entropy = policy_neg_log_likeli
                else:
                    policy_entropy = self.policy.distribution.entropy_sym(
                        policy_dist_info)

            else:
                policy_dist_info_flat = self.policy.dist_info_sym(
                    i.flat.obs_var,
                    i.flat.policy_state_info_vars,
                    name="policy_dist_info_flat_entropy")

                policy_dist_info_valid = filter_valids_dict(
                    policy_dist_info_flat,
                    i.flat.valid_var,
                    name="policy_dist_info_valid")

                policy_neg_log_likeli_valid = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                    i.valid.action_var,
                    policy_dist_info_valid,
                    name="policy_log_likeli")

                if self._use_neg_logli_entropy:
                    policy_entropy = policy_neg_log_likeli_valid
                else:
                    policy_entropy = self.policy.distribution.entropy_sym(
                        policy_dist_info_valid)

            # This prevents entropy from becoming negative for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            if self._stop_entropy_gradient:
                policy_entropy = tf.stop_gradient(policy_entropy)

        self.f_policy_entropy = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            policy_entropy,
            log_name="f_policy_entropy")

        return policy_entropy
Example No. 13
    def init_opt(self):
        """Initialize the optimization procedure."""
        pol_loss_inputs, pol_opt_inputs, dual_opt_inputs = self._build_inputs()
        self._policy_opt_inputs = pol_opt_inputs
        self._dual_opt_inputs = dual_opt_inputs

        pol_loss = self._build_policy_loss(pol_loss_inputs)
        self.optimizer.update_opt(loss=pol_loss,
                                  target=self.policy,
                                  inputs=flatten_inputs(
                                      self._policy_opt_inputs))
Example No. 14
    def init_opt(self):
        """Initialize optimizater."""
        pol_loss_inputs, pol_opt_inputs = self._build_inputs()
        self._policy_opt_inputs = pol_opt_inputs

        pol_loss, pol_kl = self._build_policy_loss(pol_loss_inputs)
        self._optimizer.update_opt(loss=pol_loss,
                                   target=self.policy,
                                   leq_constraint=(pol_kl, self._max_kl_step),
                                   inputs=flatten_inputs(
                                       self._policy_opt_inputs),
                                   constraint_name='mean_kl')
Example No. 15
    def init_opt(self):
        pol_loss_inputs, pol_opt_inputs = self._build_inputs()
        self._policy_opt_inputs = pol_opt_inputs

        pol_loss, pol_kl = self._build_policy_loss(pol_loss_inputs)
        self.optimizer.update_opt(
            loss=pol_loss,
            target=self.policy,
            leq_constraint=(pol_kl, self.max_kl_step),
            inputs=flatten_inputs(self._policy_opt_inputs),
            constraint_name="mean_kl")

        return dict()
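
For context, init_opt() in these garage-style algorithms only wires up the graph; the flattened inputs are consumed once per iteration when the policy is updated. A rough sketch of that call pattern follows, assuming optimizer methods named loss(), constraint_val() and optimize() (assumptions inferred from the snippets here, not a verbatim excerpt):

def optimize_policy_sketch(self, samples_data):
    # Flatten the sampled data into the same order as the placeholders
    # registered with the optimizer in init_opt().
    inputs = self._policy_opt_input_values(samples_data)

    loss_before = self.optimizer.loss(inputs)         # assumed API
    self.optimizer.optimize(inputs)                   # constrained update step
    loss_after = self.optimizer.loss(inputs)
    kl_after = self.optimizer.constraint_val(inputs)  # mean_kl after the step
    return loss_before, loss_after, kl_after
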
Example No. 16
    def _inference_opt_input_values(self, samples_data):
        """ Map rollout samples to the inference optimizer inputs """

        infer_state_info_list = [
            samples_data["trajectory_infos"][k]
            for k in self.inference.state_info_keys
        ]
        infer_old_dist_info_list = [
            samples_data["trajectory_infos"][k]
            for k in self.inference._dist.dist_info_keys
        ]
        inference_opt_input_values = self._inference_opt_inputs._replace(
            latent_var=samples_data["latents"],
            trajectory_var=samples_data["trajectories"],
            valid_var=samples_data["valids"],
            infer_state_info_vars_list=infer_state_info_list,
            infer_old_dist_info_vars_list=infer_old_dist_info_list,
        )

        return flatten_inputs(inference_opt_input_values)
Example No. 17
    def _policy_opt_input_values(self, samples_data):
        """ Map rollout samples to the policy optimizer inputs """
        policy_state_info_list = [
            samples_data["agent_infos"][k] for k in self.policy.state_info_keys
        ]
        policy_old_dist_info_list = [
            samples_data["agent_infos"][k]
            for k in self.policy.distribution.dist_info_keys
        ]
        policy_opt_input_values = self._policy_opt_inputs._replace(
            obs_var=samples_data["observations"],
            action_var=samples_data["actions"],
            reward_var=samples_data["rewards"],
            baseline_var=samples_data["baselines"],
            valid_var=samples_data["valids"],
            policy_state_info_vars_list=policy_state_info_list,
            policy_old_dist_info_vars_list=policy_old_dist_info_list,
        )

        return flatten_inputs(policy_opt_input_values)
Example No. 18
    def _build_encoder_kl(self):
        """Build graph for encoder KL divergence.

        Returns:
            tf.Tensor: Encoder KL divergence.

        """
        dist = self._encoder_network.dist
        old_dist = self._old_encoder_network.dist

        with tf.name_scope('encoder_kl'):
            kl = old_dist.kl_divergence(dist)
            mean_kl = tf.reduce_mean(kl)

            # Diagnostic function
            self._f_encoder_kl = compile_function(flatten_inputs(
                self._policy_opt_inputs),
                                                  mean_kl,
                                                  log_name='f_encoder_kl')

            return mean_kl
Example No. 19
    def _dual_opt_input_values(self, samples_data):
        """Update dual func optimize input values based on samples data."""
        policy_state_info_list = [
            samples_data['agent_infos'][k]
            for k in self.policy.state_info_keys
        ]   # yapf: disable
        policy_old_dist_info_list = [
            samples_data['agent_infos'][k]
            for k in self.policy.distribution.dist_info_keys
        ]

        dual_opt_input_values = self._dual_opt_inputs._replace(
            reward_var=samples_data['rewards'],
            valid_var=samples_data['valids'],
            feat_diff=self.feat_diff,
            param_eta=self.param_eta,
            param_v=self.param_v,
            policy_state_info_vars_list=policy_state_info_list,
            policy_old_dist_info_vars_list=policy_old_dist_info_list,
        )

        return flatten_inputs(dual_opt_input_values)
Example No. 20
    def _policy_opt_input_values(self, samples_data):
        """Update policy optimize input values based on samples data."""
        policy_state_info_list = [
            samples_data["agent_infos"][k]
            for k in self.policy.state_info_keys
        ]   # yapf: disable
        policy_old_dist_info_list = [
            samples_data["agent_infos"][k]
            for k in self.policy.distribution.dist_info_keys
        ]

        policy_opt_input_values = self._policy_opt_inputs._replace(
            obs_var=samples_data["observations"],
            action_var=samples_data["actions"],
            reward_var=samples_data["rewards"],
            valid_var=samples_data["valids"],
            feat_diff=self.feat_diff,
            param_eta=self.param_eta,
            param_v=self.param_v,
            policy_state_info_vars_list=policy_state_info_list,
            policy_old_dist_info_vars_list=policy_old_dist_info_list,
        )

        return flatten_inputs(policy_opt_input_values)
Example No. 21
    def _inference_opt_input_values(self, samples_data):
        """Map rollout samples to the inference optimizer inputs.

        Args:
            samples_data (dict): Processed sample data.
                See process_samples() for details.

        Returns:
            list(np.ndarray): Flattened inference optimization input values.

        """
        infer_state_info_list = [
            samples_data['trajectory_infos'][k]
            for k in self._inference.state_info_keys
        ]
        # pylint: disable=unexpected-keyword-arg
        inference_opt_input_values = self._inference_opt_inputs._replace(
            latent_var=samples_data['latents'],
            trajectory_var=samples_data['trajectories'],
            valid_var=samples_data['valids'],
            infer_state_info_vars_list=infer_state_info_list,
        )

        return flatten_inputs(inference_opt_input_values)
Example No. 22
    def _build_entropy_term(self, i):
        with tf.name_scope('policy_entropy'):
            if self.policy.recurrent:
                policy_dist_info = self.policy.dist_info_sym(
                    i.obs_var,
                    i.policy_state_info_vars,
                    name='policy_dist_info_2')

                policy_neg_log_likeli = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                    i.action_var,
                    policy_dist_info,
                    name='policy_log_likeli')

                if self._use_neg_logli_entropy:
                    policy_entropy = policy_neg_log_likeli
                else:
                    policy_entropy = self.policy.distribution.entropy_sym(
                        policy_dist_info)
            else:
                policy_dist_info_flat = self.policy.dist_info_sym(
                    i.flat.obs_var,
                    i.flat.policy_state_info_vars,
                    name='policy_dist_info_flat_2')

                policy_neg_log_likeli_flat = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                    i.flat.action_var,
                    policy_dist_info_flat,
                    name='policy_log_likeli_flat')

                policy_dist_info_valid = filter_valids_dict(
                    policy_dist_info_flat,
                    i.flat.valid_var,
                    name='policy_dist_info_valid_2')

                policy_neg_log_likeli_valid = -self.policy.distribution.log_likelihood_sym(  # noqa: E501
                    i.valid.action_var,
                    policy_dist_info_valid,
                    name='policy_log_likeli_valid')

                if self._use_neg_logli_entropy:
                    if self._maximum_entropy:
                        policy_entropy = tf.reshape(policy_neg_log_likeli_flat,
                                                    [-1, self.max_path_length])
                    else:
                        policy_entropy = policy_neg_log_likeli_valid
                else:
                    if self._maximum_entropy:
                        policy_entropy_flat = self.policy.distribution.entropy_sym(  # noqa: E501
                            policy_dist_info_flat)
                        policy_entropy = tf.reshape(policy_entropy_flat,
                                                    [-1, self.max_path_length])
                    else:
                        policy_entropy_valid = self.policy.distribution.entropy_sym(  # noqa: E501
                            policy_dist_info_valid)
                        policy_entropy = policy_entropy_valid

            # This prevents entropy from becoming negative for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            if self._stop_entropy_gradient:
                policy_entropy = tf.stop_gradient(policy_entropy)

        self.f_policy_entropy = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            policy_entropy,
            log_name='f_policy_entropy')

        return policy_entropy
Example No. 23
    def _build_entropy_terms(self, i):
        """ Calculate entropy terms """

        with tf.name_scope("entropy_terms"):
            # 1. Embedding distribution total entropy
            with tf.name_scope('embedding_entropy'):
                all_task_entropies = self.policy.embedding.entropy_sym(
                    i.flat.task_var)

                if self._use_softplus_entropy:
                    all_task_entropies = tf.nn.softplus(all_task_entropies)

                embedding_entropy = tf.reduce_mean(all_task_entropies,
                                                   name="embedding_entropy")

            # 2. Inference distribution cross-entropy (log-likelihood)
            with tf.name_scope('inference_ce'):
                traj_ll_flat = self.inference.log_likelihood_sym(
                    i.flat.trajectory_var,
                    self.policy._embedding.latent_sym(i.flat.task_var),
                    name="traj_ll_flat")
                traj_ll = tf.reshape(traj_ll_flat, [-1, self.max_path_length],
                                     name="traj_ll")
                inference_ce_raw = -traj_ll
                inference_ce = tf.clip_by_value(inference_ce_raw, -3, 3)

                if self._use_softplus_entropy:
                    inference_ce = tf.nn.softplus(inference_ce)

                if self._stop_ce_graident:
                    inference_ce = tf.stop_gradient(inference_ce)

            # 3. Policy path entropies
            with tf.name_scope('policy_entropy'):
                policy_entropy_flat = self.policy.entropy_sym(
                    i.flat.task_var,
                    i.flat.obs_var,
                    name="policy_entropy_flat")
                policy_entropy = tf.reshape(policy_entropy_flat,
                                            [-1, self.max_path_length],
                                            name="policy_entropy")

                if self._use_softplus_entropy:
                    policy_entropy = tf.nn.softplus(policy_entropy)

        # Diagnostic functions
        self.f_task_entropies = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            all_task_entropies,
            log_name="f_task_entropies")
        self.f_embedding_entropy = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            embedding_entropy,
            log_name="f_embedding_entropy")
        self.f_inference_ce = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            tf.reduce_mean(inference_ce * i.valid_var),
            log_name="f_inference_ce")
        self.f_policy_entropy = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            tf.reduce_mean(policy_entropy * i.valid_var),
            log_name="f_policy_entropy")

        return embedding_entropy, inference_ce, policy_entropy
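
In math, the three terms built above are (notation mine): a task-averaged embedding entropy, a clipped negative log-likelihood of the latent given the trajectory window (the inference cross-entropy), and a per-step policy entropy:

\[
\mathcal{H}_{\mathrm{embed}} = \mathbb{E}_{t}\big[\mathcal{H}[q_\phi(z \mid t)]\big],
\qquad
\mathrm{CE}_k = \operatorname{clip}\big(-\log q_\psi(z_k \mid \tau_k),\,-3,\,3\big),
\qquad
\mathcal{H}^{\pi}_k = \mathcal{H}\big[\pi_\theta(\cdot \mid s_k, z_k)\big].
\]
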
Example No. 24
    def _build_policy_loss(self, i):
        """ Build policy network loss """
        pol_dist = self.policy._dist

        # Entropy terms
        embedding_entropy, inference_ce, policy_entropy = \
            self._build_entropy_terms(i)

        # Augment the path rewards with entropy terms
        with tf.name_scope("augmented_rewards"):
            rewards = i.reward_var \
                      - (self.inference_ce_coeff * inference_ce) \
                      + (self.policy_ent_coeff * policy_entropy)

        with tf.name_scope("policy_loss"):
            with tf.name_scope("advantages"):
                advantages = compute_advantages(self.discount,
                                                self.gae_lambda,
                                                self.max_path_length,
                                                i.baseline_var,
                                                rewards,
                                                name="advantages")

                # Flatten and filter valids
                adv_flat = flatten_batch(advantages, name="adv_flat")
                adv_valid = filter_valids(adv_flat,
                                          i.flat.valid_var,
                                          name="adv_valid")

            policy_dist_info_flat = self.policy.dist_info_sym(
                i.flat.task_var,
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name="policy_dist_info_flat")
            policy_dist_info_valid = filter_valids_dict(
                policy_dist_info_flat,
                i.flat.valid_var,
                name="policy_dist_info_valid")

            # Optionally normalize advantages
            eps = tf.constant(1e-8, dtype=tf.float32)
            if self.center_adv:
                with tf.name_scope("center_adv"):
                    mean, var = tf.nn.moments(adv_valid, axes=[0])
                    adv_valid = tf.nn.batch_normalization(
                        adv_valid, mean, var, 0, 1, eps)
            if self.positive_adv:
                with tf.name_scope("positive_adv"):
                    m = tf.reduce_min(adv_valid)
                    adv_valid = (adv_valid - m) + eps

            # Calculate loss function and KL divergence
            with tf.name_scope("kl"):
                kl = pol_dist.kl_sym(
                    i.valid.policy_old_dist_info_vars,
                    policy_dist_info_valid,
                )
                pol_mean_kl = tf.reduce_mean(kl)

            # Calculate surrogate loss
            with tf.name_scope("surr_loss"):
                lr = pol_dist.likelihood_ratio_sym(
                    i.valid.action_var,
                    i.valid.policy_old_dist_info_vars,
                    policy_dist_info_valid,
                    name="lr")

                # Policy gradient surrogate objective
                surr_vanilla = lr * adv_valid

                if self._pg_loss == PGLoss.VANILLA:
                    # VPG, TRPO use the standard surrogate objective
                    surr_obj = tf.identity(surr_vanilla, name="surr_obj")
                elif self._pg_loss == PGLoss.CLIP:
                    # PPO uses a surrogate objective with clipped LR
                    lr_clip = tf.clip_by_value(lr,
                                               1 - self.lr_clip_range,
                                               1 + self.lr_clip_range,
                                               name="lr_clip")
                    surr_clip = lr_clip * adv_valid
                    surr_obj = tf.minimum(surr_vanilla,
                                          surr_clip,
                                          name="surr_obj")
                else:
                    raise NotImplementedError("Unknown PGLoss")

                # Maximize E[surrogate objective] by minimizing
                # -E_t[surrogate objective]
                surr_loss = -tf.reduce_mean(surr_obj)

                # Embedding entropy bonus
                surr_loss -= self.embedding_ent_coeff * embedding_entropy

            embed_mean_kl = self._build_embedding_kl(i)

        # Diagnostic functions
        self.f_policy_kl = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            pol_mean_kl,
            log_name="f_policy_kl")

        self.f_rewards = tensor_utils.compile_function(flatten_inputs(
            self._policy_opt_inputs),
                                                       rewards,
                                                       log_name="f_rewards")

        # returns = self._build_returns(rewards)
        returns = discounted_returns(self.discount,
                                     self.max_path_length,
                                     rewards,
                                     name="returns")
        self.f_returns = tensor_utils.compile_function(flatten_inputs(
            self._policy_opt_inputs),
                                                       returns,
                                                       log_name="f_returns")

        return surr_loss, pol_mean_kl, embed_mean_kl
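
Putting the pieces of the loss above together (CLIP branch; notation mine, with c_inf, c_pi and c_emb standing for inference_ce_coeff, policy_ent_coeff and embedding_ent_coeff): the rewards are first augmented with the entropy terms, GAE advantages A_k are computed from them, and the negative clipped surrogate plus an embedding-entropy bonus is minimized:

\[
\tilde{r}_k = r_k - c_{\mathrm{inf}}\,\mathrm{CE}_k + c_{\pi}\,\mathcal{H}^{\pi}_k,
\qquad
\rho_k = \frac{\pi_\theta(a_k \mid s_k, z_k)}{\pi_{\theta_{\mathrm{old}}}(a_k \mid s_k, z_k)},
\]
\[
L = -\mathbb{E}_k\Big[\min\big(\rho_k A_k,\;
      \operatorname{clip}(\rho_k,\,1-\epsilon,\,1+\epsilon)\,A_k\big)\Big]
    \;-\; c_{\mathrm{emb}}\,\mathcal{H}_{\mathrm{embed}}.
\]
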
Example No. 25
    def _build_policy_loss(self, i):
        """Build policy loss and other output tensors.

        Args:
            i (namedtuple): Collection of variables to compute policy loss.

        Returns:
            tf.Tensor: Policy loss.
            tf.Tensor: Mean policy KL divergence.

        """
        # pylint: disable=too-many-statements
        self._policy_network, self._encoder_network = (self.policy.build(
            i.augmented_obs_var, i.task_var, name='loss_policy'))
        self._old_policy_network, self._old_encoder_network = (
            self._old_policy.build(i.augmented_obs_var,
                                   i.task_var,
                                   name='loss_old_policy'))
        self._infer_network = self._inference.build(i.augmented_traj_var,
                                                    name='loss_infer')
        self._old_infer_network = self._old_inference.build(
            i.augmented_traj_var, name='loss_old_infer')

        pol_dist = self._policy_network.dist
        old_pol_dist = self._old_policy_network.dist

        # Entropy terms
        encoder_entropy, inference_ce, policy_entropy = (
            self._build_entropy_terms(i))

        # Augment the path rewards with entropy terms
        with tf.name_scope('augmented_rewards'):
            rewards = (i.reward_var -
                       (self.inference_ce_coeff * inference_ce) +
                       (self._policy_ent_coeff * policy_entropy))

        with tf.name_scope('policy_loss'):
            with tf.name_scope('advantages'):
                adv = compute_advantages(self._discount,
                                         self._gae_lambda,
                                         self.max_path_length,
                                         i.baseline_var,
                                         rewards,
                                         name='advantages')
                adv = tf.reshape(adv, [-1, self.max_path_length])

            # Optionally normalize advantages
            eps = tf.constant(1e-8, dtype=tf.float32)
            if self._center_adv:
                adv = center_advs(adv, axes=[0], eps=eps)

            if self._positive_adv:
                adv = positive_advs(adv, eps)

            # Calculate loss function and KL divergence
            with tf.name_scope('kl'):
                kl = old_pol_dist.kl_divergence(pol_dist)
                pol_mean_kl = tf.reduce_mean(kl)

            ll = pol_dist.log_prob(i.action_var, name='log_likelihood')

            # Calculate surrogate loss
            with tf.name_scope('surr_loss'):
                old_ll = old_pol_dist.log_prob(i.action_var)
                old_ll = tf.stop_gradient(old_ll)
                # Clip early to avoid overflow
                lr = tf.exp(
                    tf.minimum(ll - old_ll, np.log(1 + self._lr_clip_range)))

                surrogate = lr * adv

                surrogate = tf.debugging.check_numerics(surrogate,
                                                        message='surrogate')

            # Finalize objective function
            with tf.name_scope('loss'):
                lr_clip = tf.clip_by_value(lr,
                                           1 - self._lr_clip_range,
                                           1 + self._lr_clip_range,
                                           name='lr_clip')
                surr_clip = lr_clip * adv
                obj = tf.minimum(surrogate, surr_clip, name='surr_obj')
                obj = tf.boolean_mask(obj, i.valid_var)
                # Maximize E[surrogate objective] by minimizing
                # -E_t[surrogate objective]
                loss = -tf.reduce_mean(obj)

                # Encoder entropy bonus
                loss -= self.encoder_ent_coeff * encoder_entropy

            encoder_mean_kl = self._build_encoder_kl()

            # Diagnostic functions
            self._f_policy_kl = tf.compat.v1.get_default_session(
            ).make_callable(pol_mean_kl,
                            feed_list=flatten_inputs(self._policy_opt_inputs))

            self._f_rewards = tf.compat.v1.get_default_session().make_callable(
                rewards, feed_list=flatten_inputs(self._policy_opt_inputs))

            returns = discounted_returns(self._discount,
                                         self.max_path_length,
                                         rewards,
                                         name='returns')
            self._f_returns = tf.compat.v1.get_default_session().make_callable(
                returns, feed_list=flatten_inputs(self._policy_opt_inputs))

        return loss, pol_mean_kl, encoder_mean_kl
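
compute_advantages() above produces GAE(lambda) estimates over padded [batch, max_path_length] tensors. A small NumPy sketch of the same recursion for one zero-padded path, assuming a zero value beyond the last valid step (illustrative only, not garage's implementation):

import numpy as np

def gae_advantages_sketch(rewards, baselines, discount, gae_lambda):
    """GAE(lambda) for a single zero-padded path."""
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), with V = 0 past the end.
    next_baselines = np.append(baselines[1:], 0.0)
    deltas = rewards + discount * next_baselines - baselines
    advantages = np.zeros_like(rewards)
    running = 0.0
    # Backward recursion: A_t = delta_t + gamma * lambda * A_{t+1}.
    for t in reversed(range(len(rewards))):
        running = deltas[t] + discount * gae_lambda * running
        advantages[t] = running
    return advantages

# Three valid steps followed by one padding step.
print(gae_advantages_sketch(np.array([1.0, 1.0, 1.0, 0.0]),
                            np.array([0.5, 0.5, 0.5, 0.0]),
                            discount=0.99, gae_lambda=0.95))
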
Example No. 26
    def _build_entropy_terms(self, i):
        """Build policy entropy tensor.

        Args:
            i (namedtuple): Collection of variables to compute policy loss.

        Returns:
            tf.Tensor: Policy entropy.

        """
        pol_dist = self._policy_network.dist
        infer_dist = self._infer_network.dist
        enc_dist = self._encoder_network.dist
        with tf.name_scope('entropy_terms'):
            # 1. Encoder distribution total entropy
            with tf.name_scope('encoder_entropy'):
                encoder_dist, _, _ = self.policy.encoder.build(
                    i.task_var, name='encoder_entropy').outputs
                encoder_all_task_entropies = -encoder_dist.log_prob(
                    i.latent_var)

                if self._use_softplus_entropy:
                    encoder_entropy = tf.nn.softplus(
                        encoder_all_task_entropies)
                else:
                    encoder_entropy = encoder_all_task_entropies

                encoder_entropy = tf.reduce_mean(encoder_entropy,
                                                 name='encoder_entropy')
                encoder_entropy = tf.stop_gradient(encoder_entropy)

            # 2. Inference distribution cross-entropy (log-likelihood)
            with tf.name_scope('inference_ce'):
                # Build inference with trajectory windows

                traj_ll = infer_dist.log_prob(enc_dist.sample(),
                                              name='traj_ll')

                inference_ce_raw = -traj_ll
                inference_ce = tf.clip_by_value(inference_ce_raw, -3, 3)

                if self._use_softplus_entropy:
                    inference_ce = tf.nn.softplus(inference_ce)

                if self._stop_ce_gradient:
                    inference_ce = tf.stop_gradient(inference_ce)

            # 3. Policy path entropies
            with tf.name_scope('policy_entropy'):
                policy_entropy = -pol_dist.log_prob(i.action_var,
                                                    name='policy_log_likeli')

                # This prevents entropy from becoming negative
                # for small policy std
                if self._use_softplus_entropy:
                    policy_entropy = tf.nn.softplus(policy_entropy)

                policy_entropy = tf.stop_gradient(policy_entropy)

        # Diagnostic functions
        self._f_task_entropies = compile_function(flatten_inputs(
            self._policy_opt_inputs),
                                                  encoder_all_task_entropies,
                                                  log_name='f_task_entropies')
        self._f_encoder_entropy = compile_function(
            flatten_inputs(self._policy_opt_inputs),
            encoder_entropy,
            log_name='f_encoder_entropy')
        self._f_inference_ce = compile_function(
            flatten_inputs(self._policy_opt_inputs),
            tf.reduce_mean(inference_ce * i.valid_var),
            log_name='f_inference_ce')
        self._f_policy_entropy = compile_function(flatten_inputs(
            self._policy_opt_inputs),
                                                  policy_entropy,
                                                  log_name='f_policy_entropy')

        return encoder_entropy, inference_ce, policy_entropy
Example No. 27
    def _build_policy_loss(self, i):
        pol_dist = self.policy.distribution

        policy_entropy = self._build_entropy_term(i)

        with tf.name_scope("augmented_rewards"):
            rewards = i.reward_var + (self.policy_ent_coeff * policy_entropy)

        with tf.name_scope("policy_loss"):
            advantages = compute_advantages(
                self.discount,
                self.gae_lambda,
                self.max_path_length,
                i.baseline_var,
                rewards,
                name="advantages")

            adv_flat = flatten_batch(advantages, name="adv_flat")
            adv_valid = filter_valids(
                adv_flat, i.flat.valid_var, name="adv_valid")

            if self.policy.recurrent:
                advantages = tf.reshape(advantages, [-1, self.max_path_length])

            # Optionally normalize advantages
            eps = tf.constant(1e-8, dtype=tf.float32)
            if self.center_adv:
                with tf.name_scope("center_adv"):
                    mean, var = tf.nn.moments(adv_valid, axes=[0])
                    adv_valid = tf.nn.batch_normalization(
                        adv_valid, mean, var, 0, 1, eps)
            if self.positive_adv:
                with tf.name_scope("positive_adv"):
                    m = tf.reduce_min(adv_valid)
                    adv_valid = (adv_valid - m) + eps

            if self.policy.recurrent:
                policy_dist_info = self.policy.dist_info_sym(
                    i.obs_var,
                    i.policy_state_info_vars,
                    name="policy_dist_info")
            else:
                policy_dist_info_flat = self.policy.dist_info_sym(
                    i.flat.obs_var,
                    i.flat.policy_state_info_vars,
                    name="policy_dist_info_flat")

                policy_dist_info_valid = filter_valids_dict(
                    policy_dist_info_flat,
                    i.flat.valid_var,
                    name="policy_dist_info_valid")

            # Calculate loss function and KL divergence
            with tf.name_scope("kl"):
                if self.policy.recurrent:
                    kl = pol_dist.kl_sym(
                        i.policy_old_dist_info_vars,
                        policy_dist_info,
                    )
                    pol_mean_kl = tf.reduce_sum(
                        kl * i.valid_var) / tf.reduce_sum(i.valid_var)
                else:
                    kl = pol_dist.kl_sym(
                        i.valid.policy_old_dist_info_vars,
                        policy_dist_info_valid,
                    )
                    pol_mean_kl = tf.reduce_mean(kl)

            # Calculate vanilla loss
            with tf.name_scope("vanilla_loss"):
                if self.policy.recurrent:
                    ll = pol_dist.log_likelihood_sym(
                        i.action_var, policy_dist_info, name="log_likelihood")

                    vanilla = ll * advantages * i.valid_var
                else:
                    ll = pol_dist.log_likelihood_sym(
                        i.valid.action_var,
                        policy_dist_info_valid,
                        name="log_likelihood")

                    vanilla = ll * adv_valid

            # Calculate surrogate loss
            with tf.name_scope("surrogate_loss"):
                if self.policy.recurrent:
                    lr = pol_dist.likelihood_ratio_sym(
                        i.action_var,
                        i.policy_old_dist_info_vars,
                        policy_dist_info,
                        name="lr")

                    surrogate = lr * advantages * i.valid_var
                else:
                    lr = pol_dist.likelihood_ratio_sym(
                        i.valid.action_var,
                        i.valid.policy_old_dist_info_vars,
                        policy_dist_info_valid,
                        name="lr")

                    surrogate = lr * adv_valid

            # Finalize objective function
            with tf.name_scope("loss"):
                if self._pg_loss == PGLoss.VANILLA:
                    # VPG uses the vanilla objective
                    obj = tf.identity(vanilla, name="vanilla_obj")
                elif self._pg_loss == PGLoss.SURROGATE:
                    # TRPO uses the standard surrogate objective
                    obj = tf.identity(surrogate, name="surr_obj")
                elif self._pg_loss == PGLoss.SURROGATE_CLIP:
                    lr_clip = tf.clip_by_value(
                        lr,
                        1 - self.lr_clip_range,
                        1 + self.lr_clip_range,
                        name="lr_clip")
                    if self.policy.recurrent:
                        surr_clip = lr_clip * advantages * i.valid_var
                    else:
                        surr_clip = lr_clip * adv_valid
                    obj = tf.minimum(surrogate, surr_clip, name="surr_obj")
                else:
                    raise NotImplementedError("Unknown PGLoss")

                # Maximize E[surrogate objective] by minimizing
                # -E_t[surrogate objective]
                if self.policy.recurrent:
                    loss = -tf.reduce_sum(obj) / tf.reduce_sum(i.valid_var)
                else:
                    loss = -tf.reduce_mean(obj)

            # Diagnostic functions
            self.f_policy_kl = tensor_utils.compile_function(
                flatten_inputs(self._policy_opt_inputs),
                pol_mean_kl,
                log_name="f_policy_kl")

            self.f_rewards = tensor_utils.compile_function(
                flatten_inputs(self._policy_opt_inputs),
                rewards,
                log_name="f_rewards")

            returns = discounted_returns(self.discount, self.max_path_length,
                                         rewards)
            self.f_returns = tensor_utils.compile_function(
                flatten_inputs(self._policy_opt_inputs),
                returns,
                log_name="f_returns")

            return loss, pol_mean_kl
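
The three PGLoss branches above correspond to the standard policy-gradient objectives (notation mine), each maximized by minimizing its negative mean over valid time steps:

\[
L_{\mathrm{VANILLA}} = \mathbb{E}_t\big[\log \pi_\theta(a_t \mid s_t)\,A_t\big],
\qquad
L_{\mathrm{SURROGATE}} = \mathbb{E}_t\big[\rho_t A_t\big],
\]
\[
L_{\mathrm{SURROGATE\_CLIP}}
  = \mathbb{E}_t\big[\min\big(\rho_t A_t,\;
      \operatorname{clip}(\rho_t,\,1-\epsilon,\,1+\epsilon)\,A_t\big)\big],
\qquad
\rho_t = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}.
\]
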
Example No. 28
    def _build_policy_loss(self, i):
        """Build policy loss and other output tensors.

        Args:
            i (namedtuple): Collection of variables to compute policy loss.

        Returns:
            tf.Tensor: Policy loss.
            tf.Tensor: Mean policy KL divergence.

        """
        policy_entropy = self._build_entropy_term(i)
        rewards = i.reward_var

        if self._maximum_entropy:
            with tf.name_scope('augmented_rewards'):
                rewards = i.reward_var + (self._policy_ent_coeff *
                                          policy_entropy)

        with tf.name_scope('policy_loss'):
            adv = compute_advantages(self._discount,
                                     self._gae_lambda,
                                     self.max_path_length,
                                     i.baseline_var,
                                     rewards,
                                     name='adv')

            adv = tf.reshape(adv, [-1, self.max_path_length])
            # Optionally normalize advantages
            eps = tf.constant(1e-8, dtype=tf.float32)
            if self._center_adv:
                adv = center_advs(adv, axes=[0], eps=eps)

            if self._positive_adv:
                adv = positive_advs(adv, eps)

            with tf.name_scope('kl'):
                kl = self._old_policy.distribution.kl_divergence(
                    self.policy.distribution)
                pol_mean_kl = tf.reduce_mean(kl)

            # Calculate vanilla loss
            with tf.name_scope('vanilla_loss'):
                ll = self.policy.distribution.log_prob(i.action_var,
                                                       name='log_likelihood')
                vanilla = ll * adv

            # Calculate surrogate loss
            with tf.name_scope('surrogate_loss'):
                lr = tf.exp(
                    ll - self._old_policy.distribution.log_prob(i.action_var))
                surrogate = lr * adv

            # Finalize objective function
            with tf.name_scope('loss'):
                if self._pg_loss == 'vanilla':
                    # VPG uses the vanilla objective
                    obj = tf.identity(vanilla, name='vanilla_obj')
                elif self._pg_loss == 'surrogate':
                    # TRPO uses the standard surrogate objective
                    obj = tf.identity(surrogate, name='surr_obj')
                elif self._pg_loss == 'surrogate_clip':
                    lr_clip = tf.clip_by_value(lr,
                                               1 - self._lr_clip_range,
                                               1 + self._lr_clip_range,
                                               name='lr_clip')
                    surr_clip = lr_clip * adv
                    obj = tf.minimum(surrogate, surr_clip, name='surr_obj')

                if self._entropy_regularzied:
                    obj += self._policy_ent_coeff * policy_entropy

                # filter only the valid values
                obj = tf.boolean_mask(obj, i.valid_var)
                # Maximize E[surrogate objective] by minimizing
                # -E_t[surrogate objective]
                loss = -tf.reduce_mean(obj)

            # Diagnostic functions
            self._f_policy_kl = tf.compat.v1.get_default_session(
            ).make_callable(pol_mean_kl,
                            feed_list=flatten_inputs(self._policy_opt_inputs))

            self._f_rewards = tf.compat.v1.get_default_session().make_callable(
                rewards, feed_list=flatten_inputs(self._policy_opt_inputs))

            returns = discounted_returns(self._discount, self.max_path_length,
                                         rewards)
            self._f_returns = tf.compat.v1.get_default_session().make_callable(
                returns, feed_list=flatten_inputs(self._policy_opt_inputs))

            return loss, pol_mean_kl
Example No. 29
    def _build_policy_loss(self, i):
        """Build policy loss and other output tensors.

        Args:
            i (namedtuple): Collection of variables to compute policy loss.

        Returns:
            tf.Tensor: Policy loss.
            tf.Tensor: Mean policy KL divergence.

        Raises:
            NotImplementedError: If is_recurrent is True.

        """
        pol_dist = self.policy.distribution

        # Initialize dual params
        self._param_eta = 15.
        self._param_v = np.random.rand(
            self.env_spec.observation_space.flat_dim * 2 + 4)

        with tf.name_scope('bellman_error'):
            delta_v = tf.boolean_mask(i.reward_var,
                                      i.valid_var) + tf.tensordot(
                                          i.feat_diff, i.param_v, 1)

        with tf.name_scope('policy_loss'):
            ll = pol_dist.log_prob(i.action_var)
            ll = tf.boolean_mask(ll, i.valid_var)
            loss = -tf.reduce_mean(
                ll * tf.exp(delta_v / i.param_eta -
                            tf.reduce_max(delta_v / i.param_eta)))

            reg_params = self.policy.get_regularizable_vars()
            loss += self._l2_reg_loss * tf.reduce_sum(
                [tf.reduce_mean(tf.square(param))
                 for param in reg_params]) / len(reg_params)

        with tf.name_scope('kl'):
            kl = self._old_policy.distribution.kl_divergence(
                self.policy.distribution)
            pol_mean_kl = tf.reduce_mean(kl)

        with tf.name_scope('dual'):
            dual_loss = i.param_eta * self._epsilon + (
                i.param_eta * tf.math.log(
                    tf.reduce_mean(
                        tf.exp(delta_v / i.param_eta -
                               tf.reduce_max(delta_v / i.param_eta)))) +
                i.param_eta * tf.reduce_max(delta_v / i.param_eta))

            dual_loss += self._l2_reg_dual * (tf.square(i.param_eta) +
                                              tf.square(1 / i.param_eta))

            dual_grad = tf.gradients(dual_loss, [i.param_eta, i.param_v])

        # yapf: disable
        self._f_dual = tensor_utils.compile_function(
            flatten_inputs(self._dual_opt_inputs),
            dual_loss,
            log_name='f_dual')
        # yapf: enable

        self._f_dual_grad = tensor_utils.compile_function(
            flatten_inputs(self._dual_opt_inputs),
            dual_grad,
            log_name='f_dual_grad')

        self._f_policy_kl = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            pol_mean_kl,
            log_name='f_policy_kl')

        return loss
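
The REPS-style quantities assembled above can be summarized as follows (notation mine; the max-subtraction inside the exponentials is a numerical-stability device, with the compensating eta * max term added back in the dual):

\[
\delta_v(s_t, a_t) = r_t + \Delta\phi_t^{\top} v,
\qquad
L_{\pi} = -\mathbb{E}_t\Big[\log \pi_\theta(a_t \mid s_t)\,
           \exp\big(\delta_v/\eta - \max_t \delta_v/\eta\big)\Big],
\]
\[
g(\eta, v) = \eta\,\epsilon
  + \eta \log \mathbb{E}_t\Big[\exp\big(\delta_v/\eta - \max_t \delta_v/\eta\big)\Big]
  + \eta \max_t\big(\delta_v/\eta\big)
  + c_{\mathrm{dual}}\big(\eta^2 + \eta^{-2}\big),
\]
where epsilon is the KL bound self._epsilon and c_dual is self._l2_reg_dual.
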
Example No. 30
    def _build_policy_loss(self, i):
        """Initialize policy loss complie function based on inputs i."""
        pol_dist = self.policy.distribution
        is_recurrent = self.policy.recurrent

        # Initialize dual params
        self.param_eta = 15.
        self.param_v = np.random.rand(
            self.env_spec.observation_space.flat_dim * 2 + 4)

        if is_recurrent:
            raise NotImplementedError

        policy_dist_info_flat = self.policy.dist_info_sym(
            i.flat.obs_var,
            i.flat.policy_state_info_vars,
            name='policy_dist_info_flat')

        policy_dist_info_valid = filter_valids_dict(
            policy_dist_info_flat,
            i.flat.valid_var,
            name='policy_dist_info_valid')

        with tf.name_scope('bellman_error'):
            delta_v = i.valid.reward_var + tf.tensordot(
                i.feat_diff, i.param_v, 1)

        with tf.name_scope('policy_loss'):
            ll = pol_dist.log_likelihood_sym(i.valid.action_var,
                                             policy_dist_info_valid)
            loss = -tf.reduce_mean(
                ll * tf.exp(delta_v / i.param_eta -
                            tf.reduce_max(delta_v / i.param_eta)))

            reg_params = self.policy.get_params(regularizable=True)
            loss += self.l2_reg_loss * tf.reduce_sum(
                [tf.reduce_mean(tf.square(param))
                 for param in reg_params]) / len(reg_params)

        with tf.name_scope('kl'):
            kl = pol_dist.kl_sym(
                i.valid.policy_old_dist_info_vars,
                policy_dist_info_valid,
            )
            pol_mean_kl = tf.reduce_mean(kl)

        with tf.name_scope('dual'):
            dual_loss = i.param_eta * self.epsilon + i.param_eta * tf.log(
                tf.reduce_mean(
                    tf.exp(delta_v / i.param_eta -
                           tf.reduce_max(delta_v / i.param_eta)))
            ) + i.param_eta * tf.reduce_max(delta_v / i.param_eta)

            dual_loss += self.l2_reg_dual * (tf.square(i.param_eta) +
                                             tf.square(1 / i.param_eta))

            dual_grad = tf.gradients(dual_loss, [i.param_eta, i.param_v])

        self.f_dual = tensor_utils.compile_function(flatten_inputs(
            self._dual_opt_inputs),
                                                    dual_loss,
                                                    log_name='f_dual')

        self.f_dual_grad = tensor_utils.compile_function(
            flatten_inputs(self._dual_opt_inputs),
            dual_grad,
            log_name='f_dual_grad')

        self.f_policy_kl = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            pol_mean_kl,
            log_name='f_policy_kl')

        return loss
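
As a quick numerical cross-check of the dual above, here is a NumPy sketch that evaluates g(eta, v) directly for given rewards and feature differences (the epsilon and l2_reg_dual values are arbitrary illustrations; this mirrors the TensorFlow graph rather than reusing it):

import numpy as np

def reps_dual_sketch(eta, v, rewards, feat_diff, epsilon=0.5, l2_reg_dual=0.0):
    """Evaluate the REPS dual g(eta, v) with the max-subtraction trick."""
    delta_v = rewards + feat_diff.dot(v)            # Bellman errors
    z = delta_v / eta
    z_max = z.max()
    dual = (eta * epsilon
            + eta * np.log(np.mean(np.exp(z - z_max)))
            + eta * z_max)
    dual += l2_reg_dual * (eta ** 2 + 1.0 / eta ** 2)
    return dual

rng = np.random.default_rng(0)
rewards = rng.normal(size=10)
feat_diff = rng.normal(size=(10, 4))
print(reps_dual_sketch(eta=15.0, v=np.zeros(4), rewards=rewards,
                       feat_diff=feat_diff))
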