Example #1
    def __init__(self,
                 behaviour_actions_log_probs,
                 target_actions_log_probs,
                 policy_entropy,
                 dones,
                 discount,
                 rewards,
                 values,
                 bootstrap_value,
                 entropy_coeff=-0.01,
                 vf_loss_coeff=0.5,
                 clip_rho_threshold=1.0,
                 clip_pg_rho_threshold=1.0):
        """Policy gradient loss with vtrace importance weighting.

        VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
        batch_size. The reason we need to know `B` is for V-trace to properly
        handle episode cut boundaries.

        Args:
            behaviour_actions_log_probs: A float32 tensor of shape [T, B].
            target_actions_log_probs: A float32 tensor of shape [T, B].
            policy_entropy: A float32 tensor of shape [T, B].
            dones: A float32 tensor of shape [T, B].
            discount: A float32 scalar.
            rewards: A float32 tensor of shape [T, B].
            values: A float32 tensor of shape [T, B].
            bootstrap_value: A float32 tensor of shape [B].
            entropy_coeff: A float scalar weighting the entropy bonus
                (negative, so that minimizing the total loss maximizes entropy).
            vf_loss_coeff: A float scalar weighting the value function loss.
            clip_rho_threshold: A float scalar; the importance weights (rho)
                are clipped to this value when computing the V-trace targets.
            clip_pg_rho_threshold: A float scalar; the importance weights are
                clipped to this value when computing the policy gradient
                advantages.
        """

        self.vtrace_returns = from_importance_weights(
            behaviour_actions_log_probs=behaviour_actions_log_probs,
            target_actions_log_probs=target_actions_log_probs,
            # inverse(dones) presumably flips the done mask (i.e. 1 - dones), so
            # the discount is zeroed at episode boundaries.
            discounts=inverse(dones) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold)

        # The policy gradient loss
        self.pi_loss = -1.0 * layers.reduce_sum(
            target_actions_log_probs * self.vtrace_returns.pg_advantages)

        # The baseline loss
        delta = values - self.vtrace_returns.vs
        self.vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))

        # The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
        self.entropy = layers.reduce_sum(policy_entropy)

        # The summed weighted loss
        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
                           self.entropy * entropy_coeff)
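
For reference, the vs targets and pg_advantages returned by from_importance_weights
follow the V-trace definition in the IMPALA paper (Espeholt et al., 2018), with the
importance ratio recovered from the log probabilities as
\pi(a_t \mid x_t)/\mu(a_t \mid x_t) = \exp(\text{target\_log\_prob}_t - \text{behaviour\_log\_prob}_t):

    v_s = V(x_s) + \sum_{t=s}^{s+n-1} \gamma^{t-s} \Big( \prod_{i=s}^{t-1} c_i \Big) \delta_t V,
    \qquad \delta_t V = \rho_t \big( r_t + \gamma V(x_{t+1}) - V(x_t) \big),

where \rho_t = \min\big(\bar{\rho},\, \pi(a_t \mid x_t)/\mu(a_t \mid x_t)\big) and
c_i = \min\big(\bar{c},\, \pi(a_i \mid x_i)/\mu(a_i \mid x_i)\big). Here
clip_rho_threshold plays the role of \bar{\rho}, while the policy gradient
advantages are \min(\bar{\rho}_{pg},\, \pi/\mu)\,(r_t + \gamma v_{t+1} - V(x_t))
with \bar{\rho}_{pg} set by clip_pg_rho_threshold.
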
Example #2
    def _calc_kl(self, means, logvars, old_means, old_logvars):
        """ Calculate KL divergence between old and new distributions
            See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback.E2.80.93Leibler_divergence

        Args:
            means: shape (batch_size, act_dim)
            logvars: shape (act_dim)
            old_means: shape (batch_size, act_dim)
            old_logvars: shape (act_dim)

        Returns:
            kl: shape (batch_size)
        """
        log_det_cov_old = layers.reduce_sum(old_logvars)
        log_det_cov_new = layers.reduce_sum(logvars)
        tr_old_new = layers.reduce_sum(layers.exp(old_logvars - logvars))
        kl = 0.5 * (layers.reduce_sum(
            layers.square(means - old_means) / layers.exp(logvars), dim=1) +
                    (log_det_cov_new - log_det_cov_old) + tr_old_new -
                    self.act_dim)
        return kl
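
The expression above is the closed-form KL divergence between two diagonal
Gaussians, D_KL(pi_old || pi_new), written in terms of the log-variances
\sigma_j^2 = \exp(\text{logvars}_j) and \sigma_j^{\text{old}\,2} = \exp(\text{old\_logvars}_j):

    D_{\mathrm{KL}} = \frac{1}{2} \left[ \sum_{j=1}^{k} \frac{(\mu_j - \mu_j^{\text{old}})^2}{\sigma_j^2}
        + \sum_{j=1}^{k} \log \sigma_j^2 - \sum_{j=1}^{k} \log \sigma_j^{\text{old}\,2}
        + \sum_{j=1}^{k} \frac{\sigma_j^{\text{old}\,2}}{\sigma_j^2} - k \right],

where k = act_dim; the four sums correspond to the quadratic term,
log_det_cov_new, log_det_cov_old, and tr_old_new in the code.
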
Example #3
    def _calc_logprob(self, actions, means, logvars):
        """ Calculate log probabilities of actions, when given means and logvars
            of normal distribution.
            The constant sqrt(2 * pi) is omitted, which will be eliminated in later.

        Args:
            actions: shape (batch_size, act_dim)
            means:   shape (batch_size, act_dim)
            logvars: shape (act_dim)

        Returns:
            logprob: shape (batch_size)
        """
        exp_item = layers.elementwise_div(layers.square(actions - means),
                                          layers.exp(logvars),
                                          axis=1)
        exp_item = -0.5 * layers.reduce_sum(exp_item, dim=1)

        vars_item = -0.5 * layers.reduce_sum(logvars)
        logprob = exp_item + vars_item
        return logprob
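
For a diagonal Gaussian the full log density is

    \log p(a \mid \mu, \sigma^2) = -\frac{1}{2} \sum_{j=1}^{k} \frac{(a_j - \mu_j)^2}{\sigma_j^2}
        - \frac{1}{2} \sum_{j=1}^{k} \log \sigma_j^2 - \frac{k}{2} \log(2\pi).

The code returns only the first two terms (exp_item and vars_item); the constant
-(k/2) log(2 pi) is dropped because it cancels whenever two log probabilities are
subtracted, e.g. when forming probability ratios.
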
Example #4
    def learn(self, obs, actions, advantages, target_values, learning_rate,
              entropy_coeff):
        """
        Args:
            obs: A float32 tensor of shape ([B] + observation_space),
                 e.g. [B, C, H, W] for Atari.
            actions: An int64 tensor of shape [B].
            advantages: A float32 tensor of shape [B].
            target_values: A float32 tensor of shape [B].
            learning_rate: A float scalar, the learning rate.
            entropy_coeff: A float scalar, the entropy coefficient
                (negative to encourage exploration).
        """
        logits = self.model.policy(obs)
        policy_distribution = CategoricalDistribution(logits)
        actions_log_probs = policy_distribution.logp(actions)

        # The policy gradient loss
        pi_loss = -1.0 * layers.reduce_sum(actions_log_probs * advantages)

        # The value function loss
        values = self.model.value(obs)
        delta = values - target_values
        vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))

        # The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
        policy_entropy = policy_distribution.entropy()
        entropy = layers.reduce_sum(policy_entropy)

        total_loss = (pi_loss + vf_loss * self.vf_loss_coeff +
                      entropy * entropy_coeff)

        fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
            clip_norm=40.0))

        optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
        optimizer.minimize(total_loss)

        return total_loss, pi_loss, vf_loss, entropy
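
The loss assembled in learn is the standard A2C objective: a policy gradient term
weighted by the advantages, a squared-error value baseline term, and an entropy
bonus. As a rough illustration only, the NumPy sketch below (a hypothetical helper,
not part of PARL) reproduces the same arithmetic outside the fluid graph, assuming
logits of shape [B, num_actions]:

    import numpy as np

    def a2c_loss(logits, actions, advantages, values, target_values,
                 vf_loss_coeff=0.5, entropy_coeff=-0.01):
        # Log-softmax over the action dimension (numerically stable).
        shifted = logits - logits.max(axis=1, keepdims=True)
        log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
        probs = np.exp(log_probs)

        # Policy gradient term: -sum_b log pi(a_b | s_b) * advantage_b.
        actions_log_probs = log_probs[np.arange(len(actions)), actions]
        pi_loss = -np.sum(actions_log_probs * advantages)

        # Value function term: 0.5 * sum of squared errors against the targets.
        vf_loss = 0.5 * np.sum(np.square(values - target_values))

        # Entropy of the categorical policy, summed over the batch.
        entropy = np.sum(-(probs * log_probs).sum(axis=1))

        # entropy_coeff < 0, so minimizing the total loss maximizes entropy.
        return pi_loss + vf_loss * vf_loss_coeff + entropy * entropy_coeff
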
Example #5
    def _actor_learn(self, obs_n, act_n):
        i = self.agent_index
        this_policy = self.model.policy(obs_n[i])
        sample_this_action = SoftPDistribution(
            logits=this_policy,
            act_space=self.act_space[self.agent_index]).sample()

        # Copy the joint-action list and replace this agent's entry with the
        # freshly sampled action before evaluating the centralized critic.
        action_input_n = act_n + []
        action_input_n[i] = sample_this_action
        eval_q = self.Q(obs_n, action_input_n)
        act_cost = layers.reduce_mean(-1.0 * eval_q)

        act_reg = layers.reduce_mean(layers.square(this_policy))

        cost = act_cost + act_reg * 1e-3

        fluid.clip.set_gradient_clip(
            clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
            param_list=self.model.get_actor_params())

        optimizer = fluid.optimizer.AdamOptimizer(self.lr)
        optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
        return cost
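
The actor cost minimized here is the negative centralized critic value with this
agent's action replaced by a freshly sampled one, plus a small L2 regularizer on
the raw policy logits (coefficient 1e-3), in line with the MADDPG actor update
(Lowe et al., 2017):

    \mathcal{L}_{\text{actor}} = -\,\mathbb{E}\big[ Q(o_{1:n},\, a_1, \dots, \pi_i(o_i), \dots, a_n) \big]
        + 10^{-3}\,\mathbb{E}\big[ \lVert \text{logits}_i \rVert^2 \big],

with gradients clipped by norm (0.5) and applied only to the actor parameters.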