Example #1
    def kl(self, other):
        """
        Args:
            other: object of CategoricalDistribution

        Returns:
            kl: A float32 tensor with shape [BATCH_SIZE]
        """
        assert isinstance(other, CategoricalDistribution)

        logits = self.logits - layers.reduce_max(self.logits, dim=1)
        other_logits = other.logits - layers.reduce_max(other.logits, dim=1)

        e_logits = layers.exp(logits)
        other_e_logits = layers.exp(other_logits)

        z = layers.reduce_sum(e_logits, dim=1)
        other_z = layers.reduce_sum(other_e_logits, dim=1)

        prob = e_logits / z
        kl = layers.reduce_sum(
            prob *
            (logits - layers.log(z) - other_logits + layers.log(other_z)),
            dim=1)
        return kl
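
The method above computes KL(p‖q) directly from logits, subtracting the row-wise maximum before exponentiating for numerical stability. As a sanity check of the formula only (plain NumPy, not the PARL `layers` API; the function name is illustrative):

    import numpy as np

    def categorical_kl(logits, other_logits):
        # Subtract the row-wise max before exponentiating, as in the method above.
        logits = logits - logits.max(axis=1, keepdims=True)
        other_logits = other_logits - other_logits.max(axis=1, keepdims=True)
        e, other_e = np.exp(logits), np.exp(other_logits)
        z, other_z = e.sum(axis=1, keepdims=True), other_e.sum(axis=1, keepdims=True)
        prob = e / z
        # KL(p || q) = sum_a p(a) * (log p(a) - log q(a)), with log p = logits - log z
        return (prob * (logits - np.log(z) - other_logits + np.log(other_z))).sum(axis=1)

    p_logits = np.array([[1.0, 2.0, 3.0]])
    q_logits = np.array([[1.0, 1.0, 1.0]])
    print(categorical_kl(p_logits, q_logits))  # ~ [0.266]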
Example #2
    def sample(self):
        """
        Returns:
            sample_action: A float32 tensor with shape [BATCH_SIZE, NUM_ACTIONS] of soft sampled actions,
                           with noise to keep the target close to the original action.
        """
        eps = 1e-4
        logits_shape = layers.cast(layers.shape(self.logits), dtype='int64')
        # Subtracting log(-log(u)) below is equivalent to adding Gumbel noise -log(-log(u)).
        uniform = layers.uniform_random(logits_shape, min=eps, max=1.0 - eps)
        soft_uniform = layers.log(-1.0 * layers.log(uniform))
        return layers.softmax(self.logits - soft_uniform, axis=-1)
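
This is a Gumbel-softmax style sampler: for u ~ Uniform(0, 1), -log(-log(u)) is standard Gumbel noise, so perturbing the logits this way and applying softmax yields a differentiable, near-one-hot sample. A minimal NumPy sketch of the same trick (illustrative only, not PARL code):

    import numpy as np

    def soft_sample(logits, eps=1e-4, seed=0):
        rng = np.random.default_rng(seed)
        # Gumbel noise g = -log(-log(u)), with u clipped away from 0 and 1
        u = rng.uniform(eps, 1.0 - eps, size=logits.shape)
        gumbel = -np.log(-np.log(u))
        shifted = logits + gumbel
        shifted -= shifted.max(axis=-1, keepdims=True)    # numerical stability
        e = np.exp(shifted)
        return e / e.sum(axis=-1, keepdims=True)          # soft, near-one-hot sample

    print(soft_sample(np.array([[0.1, 0.5, 2.0]])))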
Example #3
    def logp(self, actions, eps=1e-6):
        """
        Args:
            actions: An int64 tensor with shape [BATCH_SIZE]
            eps: A small float constant that avoids underflows when computing the log probability

        Returns:
            actions_log_prob: A float32 tensor with shape [BATCH_SIZE]
        """
        assert len(actions.shape) == 1

        logits = self.logits - layers.reduce_max(self.logits, dim=1)
        e_logits = layers.exp(logits)
        z = layers.reduce_sum(e_logits, dim=1)
        prob = e_logits / z

        actions = layers.unsqueeze(actions, axes=[1])
        actions_onehot = layers.one_hot(actions, prob.shape[1])
        actions_onehot = layers.cast(actions_onehot, dtype='float32')
        actions_prob = layers.reduce_sum(prob * actions_onehot, dim=1)

        actions_prob = actions_prob + eps
        actions_log_prob = layers.log(actions_prob)

        return actions_log_prob
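
`logp` recovers log π(a|s) by building the softmax explicitly, masking it with a one-hot of the taken action, and adding `eps` before the log so that near-zero probabilities do not produce -inf. A NumPy check of the same computation (shapes follow the docstring; nothing here is PARL API):

    import numpy as np

    def logp(logits, actions, eps=1e-6):
        logits = logits - logits.max(axis=1, keepdims=True)
        prob = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
        onehot = np.eye(logits.shape[1])[actions]           # [B, NUM_ACTIONS]
        return np.log((prob * onehot).sum(axis=1) + eps)    # [B]

    logits = np.array([[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]])
    actions = np.array([2, 1])
    print(logp(logits, actions))  # ~ [-0.4076, -1.0986]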
Example #4
    def entropy(self):
        """
        Returns:
            entropy: A float32 tensor with shape [BATCH_SIZE] of entropy of self policy distribution.
        """
        logits = self.logits - layers.reduce_max(self.logits, dim=1)
        e_logits = layers.exp(logits)
        z = layers.reduce_sum(e_logits, dim=1)
        prob = e_logits / z
        entropy = -1.0 * layers.reduce_sum(prob * (logits - layers.log(z)),
                                           dim=1)

        return entropy
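
Since log p = logits - log z after the max subtraction, the expression above is just H(p) = -Σ_a p(a) log p(a). A worked NumPy equivalent (illustrative only):

    import numpy as np

    def entropy(logits):
        logits = logits - logits.max(axis=1, keepdims=True)
        e = np.exp(logits)
        z = e.sum(axis=1, keepdims=True)
        prob = e / z
        # H(p) = -sum_a p(a) * log p(a), with log p = logits - log z
        return -(prob * (logits - np.log(z))).sum(axis=1)

    print(entropy(np.array([[1.0, 2.0, 3.0]])))  # ~ [0.832]
    print(entropy(np.array([[0.0, 0.0, 0.0]])))  # uniform over 3 actions: log(3) ~ 1.0986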
Example #5
File: sac.py  Project: YuechengLiu/PARL
    def sample(self, obs):
        # Reparameterized sample: draw from N(mean, std), squash with tanh, scale to the action range.
        mean, log_std = self.actor.policy(obs)
        std = layers.exp(log_std)
        normal = Normal(mean, std)
        x_t = normal.sample([1])[0]
        y_t = layers.tanh(x_t)
        action = y_t * self.max_action
        # Log-prob of the squashed action: Gaussian log density plus the tanh change-of-variables term.
        log_prob = normal.log_prob(x_t)
        log_prob -= layers.log(self.max_action * (1 - layers.pow(y_t, 2)) +
                               epsilon)
        log_prob = layers.reduce_sum(log_prob, dim=1, keep_dim=True)
        log_prob = layers.squeeze(log_prob, axes=[1])
        return action, log_prob
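
This is the usual SAC squashed-Gaussian sampler: draw x ~ N(mean, std), squash with tanh, scale by `max_action`, and correct the log density for the tanh transform (here `epsilon` is assumed to be a small module-level constant in sac.py, e.g. 1e-6). A NumPy sketch of the log-prob correction alone, under those assumptions:

    import numpy as np

    def squashed_gaussian_logprob(mean, log_std, x, max_action=1.0, epsilon=1e-6):
        std = np.exp(log_std)
        # Per-dimension Gaussian log density of the pre-squash sample x
        log_prob = -0.5 * ((x - mean) / std) ** 2 - np.log(std) - 0.5 * np.log(2 * np.pi)
        y = np.tanh(x)
        # Change-of-variables correction for action = max_action * tanh(x)
        log_prob -= np.log(max_action * (1 - y ** 2) + epsilon)
        return log_prob.sum(axis=1)                        # [B]

    mean = np.zeros((1, 2)); log_std = np.zeros((1, 2)); x = np.array([[0.3, -0.7]])
    print(squashed_gaussian_logprob(mean, log_std, x, max_action=2.0))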
Example #6
File: alg.py  Project: Termset/IMPASS
    def learn(self, obs, actions, means, log_std, rewards, dones,
              learning_rate, entropy_coeff):
        """
        Args:
            obs: An float32 tensor of shape ([B] + observation_space).
                 E.g. [B, C, H, W] in atari.
            actions: An int64 tensor of shape [B].
            behaviour_logits: A float32 tensor of shape [B, NUM_ACTIONS].
            rewards: A float32 tensor of shape [B].
            dones: A float32 tensor of shape [B].
            learning_rate: float scalar of learning rate.
            entropy_coeff: float scalar of entropy coefficient.
        """
        values = self.model.value(obs)
        # pi: behaviour policy distribution, rebuilt from the sampled means and log_std
        std = layers.exp(log_std)
        normal_pi = Normal(means, std)
        # x_t1 = normal_pi.sample([1])[0]
        # x_t1.stop_gradient = True
        y_t1 = actions / self.max_action
        # action1 = y_t1 * self.max_action
        log_prob1 = normal_pi.log_prob(actions)
        log_prob1 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                                epsilon)
        log_prob1 = layers.reduce_sum(log_prob1, dim=1, keep_dim=True)
        log_prob_pi = layers.squeeze(log_prob1, axes=[1])

        # mu: current (target) policy distribution predicted by the model
        actions_mu, log_std_mu = self.model.policy(obs)
        std_mu = layers.exp(log_std_mu)
        normal_mu = Normal(actions_mu, std_mu)
        # x_t2 = normal_mu.sample([1])[0]
        # x_t2.stop_gradient = True
        # y_t2 = actions
        # action2 = y_t2 * self.max_action
        log_prob2 = normal_mu.log_prob(actions)
        log_prob2 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                                epsilon)
        log_prob2 = layers.reduce_sum(log_prob2, dim=1, keep_dim=True)
        log_prob_mu = layers.squeeze(log_prob2, axes=[1])

        # target_policy_distribution = CategoricalDistribution(target_logits)
        # behaviour_policy_distribution = CategoricalDistribution(
        #     behaviour_logits)

        policy_entropy = normal_mu.entropy()
        # policy_entropy = layers.reduce_mean(policy_entropy, dim=1)
        target_actions_log_probs = log_prob_mu
        behaviour_actions_log_probs = log_prob_pi

        # Calculating kl for debug
        # kl = target_policy_distribution.kl(behaviour_policy_distribution)
        kl = normal_mu.kl_divergence(normal_pi)
        kl = layers.reduce_mean(kl, dim=1)
        # kl = layers.unsqueeze(kl, axes=[1])
        """
        Split the tensor into batches at known episode cut boundaries. 
        [B * T] -> [T, B]
        """
        T = self.sample_batch_steps

        def split_batches(tensor):
            B = tensor.shape[0] // T
            splited_tensor = layers.reshape(tensor,
                                            [B, T] + list(tensor.shape[1:]))
            # transpose B and T
            return layers.transpose(splited_tensor, [1, 0] +
                                    list(range(2, 1 + len(tensor.shape))))

        behaviour_actions_log_probs = split_batches(
            behaviour_actions_log_probs)
        target_actions_log_probs = split_batches(target_actions_log_probs)
        policy_entropy = split_batches(policy_entropy)
        dones = split_batches(dones)
        rewards = split_batches(rewards)
        values = split_batches(values)

        # [T, B] -> [T - 1, B] for V-trace calc.
        behaviour_actions_log_probs = layers.slice(behaviour_actions_log_probs,
                                                   axes=[0],
                                                   starts=[0],
                                                   ends=[-1])
        target_actions_log_probs = layers.slice(target_actions_log_probs,
                                                axes=[0],
                                                starts=[0],
                                                ends=[-1])
        policy_entropy = layers.slice(policy_entropy,
                                      axes=[0],
                                      starts=[0],
                                      ends=[-1])
        dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
        rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
        bootstrap_value = layers.slice(values,
                                       axes=[0],
                                       starts=[T - 1],
                                       ends=[T])
        values = layers.slice(values, axes=[0], starts=[0], ends=[-1])

        bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])

        vtrace_loss = VTraceLoss(
            behaviour_actions_log_probs=behaviour_actions_log_probs,
            target_actions_log_probs=target_actions_log_probs,
            policy_entropy=policy_entropy,
            dones=dones,
            discount=self.gamma,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            entropy_coeff=entropy_coeff,
            vf_loss_coeff=self.vf_loss_coeff,
            clip_rho_threshold=self.clip_rho_threshold,
            clip_pg_rho_threshold=self.clip_pg_rho_threshold)

        fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
            clip_norm=40.0))

        optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
        optimizer.minimize(vtrace_loss.total_loss)
        return vtrace_loss, kl
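
The bookkeeping before the loss converts flat rollout tensors of shape [B * T] into time-major [T, B] slices, drops the last step from the per-step tensors, and keeps the value at step T - 1 as the bootstrap value for V-trace. A NumPy sketch of just that reshaping (VTraceLoss and the fluid/PARL calls are assumed from the source project):

    import numpy as np

    def split_batches(tensor, T):
        # [B * T, ...] -> [T, B, ...], mirroring the reshape + transpose above
        B = tensor.shape[0] // T
        return tensor.reshape((B, T) + tensor.shape[1:]).swapaxes(0, 1)

    T = 4
    values = np.arange(8.0)                # two rollouts of T = 4 steps, flattened to [B * T]
    values_tb = split_batches(values, T)   # shape [T, B] = [4, 2]
    bootstrap_value = values_tb[T - 1]     # value at the last step, shape [B]
    values_tb = values_tb[:-1]             # [T - 1, B], used inside the V-trace targets
    print(values_tb.shape, bootstrap_value)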