def softmax_with_cross_entropy(self, shard_logit, shard_label):
    # Shift logits by the global max across all shards for numerical stability.
    shard_max = nn.reduce_max(shard_logit, dim=1, keep_dim=True)
    global_max = collective._c_allreduce(
        shard_max, reduce_type='max', use_calc_stream=True)
    shard_logit_new = nn.elementwise_sub(shard_logit, global_max)

    # Accumulate the softmax denominator over every shard.
    shard_exp = ops.exp(shard_logit_new)
    shard_demon = nn.reduce_sum(shard_exp, dim=1, keep_dim=True)
    global_demon = collective._c_allreduce(
        shard_demon, reduce_type='sum', use_calc_stream=True)

    global_log_demon = nn.log(global_demon)
    shard_log_prob = shard_logit_new - global_log_demon
    shard_prob = ops.exp(shard_log_prob)

    # Labels outside this shard produce an all-zero one-hot row, so reduce_min
    # returns the (negative) target log-prob on the owning shard and 0 elsewhere.
    shard_one_hot = nn.one_hot(
        shard_label, depth=self.shard_dim, allow_out_of_range=True)
    target_log_prob = nn.reduce_min(
        shard_log_prob * shard_one_hot, dim=1, keep_dim=True)
    shard_loss = nn.scale(target_log_prob, scale=-1.0)
    global_loss = collective._c_reducescatter(
        shard_loss, nranks=self.nranks, use_calc_stream=True)
    return global_loss, shard_prob
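# A minimal single-process NumPy sketch of the math distributed above, assuming
# the hypothetical helper below sees the full (unsharded) logits and integer
# labels; it mirrors the max-shift, log-sum-exp, and negative log-likelihood
# steps that the collective ops spread across shards.
import numpy as np

def _reference_softmax_cross_entropy(logits, labels):
    # logits: [batch, num_classes]; labels: [batch] integer class indices.
    shifted = logits - logits.max(axis=1, keepdims=True)        # global max
    log_demon = np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    log_prob = shifted - log_demon                               # log softmax
    loss = -log_prob[np.arange(len(labels)), labels]             # per-sample NLL
    return loss, np.exp(log_prob)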
def kl_divergence(self, other):
    """The KL-divergence between two Categorical distributions.

    Args:
        other (Categorical): instance of Categorical. The data type is float32.

    Returns:
        Tensor: kl-divergence between two Categorical distributions.

    Examples:
        .. code-block:: python

            import paddle
            from paddle.distribution import Categorical

            paddle.seed(100) # on CPU device
            x = paddle.rand([6])
            print(x)
            # [0.5535528  0.20714243 0.01162981
            #  0.51577556 0.36369765 0.2609165 ]

            paddle.seed(200) # on CPU device
            y = paddle.rand([6])
            print(y)
            # [0.77663314 0.90824795 0.15685187
            #  0.04279523 0.34468332 0.7955718 ]

            cat = Categorical(x)
            cat2 = Categorical(y)
            cat.kl_divergence(cat2)
            # [0.071952]

    """
    name = self.name + '_kl_divergence'
    if not _non_static_mode():
        check_type(other, 'other', Categorical, 'kl_divergence')

    logits = self.logits - paddle.max(self.logits, axis=-1, keepdim=True)
    other_logits = other.logits - paddle.max(
        other.logits, axis=-1, keepdim=True)
    e_logits = ops.exp(logits)
    other_e_logits = ops.exp(other_logits)
    z = paddle.sum(e_logits, axis=-1, keepdim=True)
    other_z = paddle.sum(other_e_logits, axis=-1, keepdim=True)
    prob = e_logits / z
    kl = paddle.sum(
        prob * (logits - paddle.log(z) - other_logits + paddle.log(other_z)),
        axis=-1,
        keepdim=True,
        name=name)
    return kl
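# A minimal NumPy sketch of the KL formula used above, assuming the hypothetical
# helper below takes two raw logit arrays; after normalizing both distributions
# it reduces to sum_i p_i * (log p_i - log q_i).
import numpy as np

def _reference_categorical_kl(logits, other_logits):
    def _log_softmax(x):
        x = x - x.max(axis=-1, keepdims=True)
        return x - np.log(np.exp(x).sum(axis=-1, keepdims=True))
    log_p = _log_softmax(np.asarray(logits, dtype=np.float64))
    log_q = _log_softmax(np.asarray(other_logits, dtype=np.float64))
    return (np.exp(log_p) * (log_p - log_q)).sum(axis=-1, keepdims=True)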
def entropy(self):
    """Shannon entropy in nats.

    Returns:
        Tensor: Shannon entropy of Categorical distribution. The data type is float32.

    Examples:
        .. code-block:: python

            import paddle
            from paddle.distribution import Categorical

            paddle.seed(100) # on CPU device
            x = paddle.rand([6])
            print(x)
            # [0.5535528  0.20714243 0.01162981
            #  0.51577556 0.36369765 0.2609165 ]

            cat = Categorical(x)
            cat.entropy()
            # [1.77528]

    """
    name = self.name + '_entropy'
    logits = self.logits - paddle.max(self.logits, axis=-1, keepdim=True)
    e_logits = ops.exp(logits)
    z = paddle.sum(e_logits, axis=-1, keepdim=True)
    prob = e_logits / z
    neg_entropy = paddle.sum(prob * (logits - paddle.log(z)), axis=-1)
    entropy = paddle.scale(neg_entropy, scale=-1.0, name=name)
    return entropy
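# A minimal NumPy sketch of the entropy computed above, assuming a hypothetical
# helper operating on raw logits: H = -sum_i p_i * log p_i with p = softmax(logits).
import numpy as np

def _reference_categorical_entropy(logits):
    x = np.asarray(logits, dtype=np.float64)
    x = x - x.max(axis=-1, keepdims=True)
    log_p = x - np.log(np.exp(x).sum(axis=-1, keepdims=True))
    return -(np.exp(log_p) * log_p).sum(axis=-1)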
def probs(self, value):
    """Probability density/mass function.

    Args:
        value (Tensor): The input tensor.

    Returns:
        Tensor: probability. The data type is the same as that of ``value``.
    """
    name = self.name + '_probs'
    value = self._check_values_dtype_in_probs(self.loc, value)
    var = self.scale * self.scale
    return elementwise_div(
        ops.exp(-1. * ((value - self.loc) * (value - self.loc)) / (2. * var)),
        (math.sqrt(2 * math.pi) * self.scale),
        name=name)
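# A minimal NumPy sketch of the Gaussian density returned above, assuming a
# hypothetical helper with explicit loc/scale arguments:
# pdf(x) = exp(-(x - loc)^2 / (2 * scale^2)) / (sqrt(2 * pi) * scale).
import math
import numpy as np

def _reference_normal_pdf(value, loc, scale):
    var = scale * scale
    return (np.exp(-((value - loc) ** 2) / (2.0 * var))
            / (math.sqrt(2.0 * math.pi) * scale))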