def entropy(self):
    """Compute the Shannon entropy of this distribution, in nats.

    The logits are shifted by their maximum along the last axis, turned
    into normalised probabilities, and ``-sum(p * log(p))`` is evaluated
    over that same axis (which is kept with size 1).

    Returns:
        Tensor: Shannon entropy of the Categorical distribution. The data
        type is float32.

    Examples:
        .. code-block:: python

            import paddle
            from paddle.distribution import Categorical

            paddle.seed(100) # on CPU device
            x = paddle.rand([6])
            print(x)
            # [0.5535528  0.20714243 0.01162981
            #  0.51577556 0.36369765 0.2609165 ]

            cat = Categorical(x)

            cat.entropy()
            # [1.77528]

    """
    name = self.name + '_entropy'
    last_axis = dict(dim=-1, keep_dim=True)

    # Shift by the per-distribution maximum before exponentiating, for
    # numerical stability.
    shifted = self.logits - nn.reduce_max(self.logits, **last_axis)
    exp_shifted = ops.exp(shifted)
    normalizer = nn.reduce_sum(exp_shifted, **last_axis)
    prob = exp_shifted / normalizer

    # sum(p * log p) is the negative entropy; the sign is flipped below.
    log_prob = shifted - nn.log(normalizer)
    neg_entropy = nn.reduce_sum(prob * log_prob, **last_axis)
    return nn.scale(neg_entropy, scale=-1.0, name=name)
def softmax_with_cross_entropy(self, shard_logit, shard_label):
    """Softmax cross-entropy over logits that are sharded across ranks.

    The maximum and the softmax denominator are combined across ranks
    with all-reduce calls, so every rank normalises its local logits by
    the global values.

    Args:
        shard_logit: Logits held by this rank. Reductions run over dim 1
            (assumes layout is (batch, shard_dim) -- TODO confirm).
        shard_label: Labels used to build a one-hot mask of depth
            ``self.shard_dim``; out-of-range values are allowed.

    Returns:
        tuple: ``(global_loss, shard_prob)`` where ``global_loss`` is the
        reduce-scattered negative target log-probability and
        ``shard_prob`` is this rank's slice of the softmax output.
    """
    # Global maximum across all shards; subtracted before exponentiation.
    local_max = nn.reduce_max(shard_logit, dim=1, keep_dim=True)
    global_max = collective._c_allreduce(
        local_max, reduce_type='max', use_calc_stream=True)
    shifted_logit = nn.elementwise_sub(shard_logit, global_max)

    # Global softmax denominator: sum of exponentials over every shard.
    local_exp = ops.exp(shifted_logit)
    local_denom = nn.reduce_sum(local_exp, dim=1, keep_dim=True)
    global_denom = collective._c_allreduce(
        local_denom, reduce_type='sum', use_calc_stream=True)

    log_denom = nn.log(global_denom)
    shard_log_prob = shifted_logit - log_denom
    shard_prob = ops.exp(shard_log_prob)

    label_mask = nn.one_hot(
        shard_label, depth=self.shard_dim, allow_out_of_range=True)
    # Log-probabilities are <= 0 and masked-out entries become 0, so the
    # row minimum is the target's log-probability when the mask selects
    # an entry, and 0 otherwise.
    masked_log_prob = shard_log_prob * label_mask
    target_log_prob = nn.reduce_min(masked_log_prob, dim=1, keep_dim=True)
    local_loss = nn.scale(target_log_prob, scale=-1.0)

    global_loss = collective._c_reducescatter(
        local_loss, nranks=self.nranks, use_calc_stream=True)
    return global_loss, shard_prob
def net(self, input, is_infer=False):
    """Build the text/tag matching network and register its outputs.

    The text is embedded, passed through a sequence convolution with max
    pooling and a fully connected layer, then compared by cosine
    similarity with one positive tag and a group of negative tags. The
    cost is the mean hinge loss
    ``max(0, margin - cos_pos + max(cos_neg))``.

    Args:
        input: Indexable collection holding the text, positive-tag and
            negative-tag inputs at positions 0, 1 and 2.
        is_infer (bool): When true, results are stored in
            ``self._infer_results``; otherwise in ``self._metrics``.

    Side effects:
        Sets ``self._cost`` and stores ``"correct"`` and ``"cos_pos"`` in
        the selected result dictionary.
    """

    def _embed(ids, vocab_size, param_name):
        # Embedding lookup followed by removal of axis 1.
        emb = fluid.embedding(input=ids,
                              size=[vocab_size, self.emb_dim],
                              param_attr=param_name)
        return fluid.layers.squeeze(input=emb, axes=[1])

    text, pos_tag, neg_tag = input[0], input[1], input[2]

    text_emb = _embed(text, self.vocab_text_size, "text_emb")
    # Positive and negative tags share the "tag_emb" parameter.
    pos_tag_emb = _embed(pos_tag, self.vocab_tag_size, "tag_emb")
    neg_tag_emb = _embed(neg_tag, self.vocab_tag_size, "tag_emb")

    pooled = fluid.nets.sequence_conv_pool(input=text_emb,
                                           num_filters=self.hid_dim,
                                           filter_size=self.win_size,
                                           act="tanh",
                                           pool_type="max",
                                           param_attr="cnn")
    text_hid = fluid.layers.fc(input=pooled,
                               size=self.emb_dim,
                               param_attr="text_hid")

    cos_pos = nn.cos_sim(pos_tag_emb, text_hid)

    # Repeat the text representation to line up with the negative tags,
    # then regroup the similarities into rows of ``neg_size`` columns.
    expanded_text_hid = fluid.layers.sequence_expand_as(x=text_hid,
                                                        y=neg_tag_emb)
    cos_neg_flat = nn.cos_sim(neg_tag_emb, expanded_text_hid)
    cos_neg_all = fluid.layers.sequence_reshape(input=cos_neg_flat,
                                                new_dim=self.neg_size)
    # Keep only the highest-scoring negative per row.
    cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)

    # Hinge loss: max(0, margin - cos_pos + cos_neg).
    margin_const = tensor.fill_constant_batch_size_like(input=cos_pos,
                                                        shape=[-1, 1],
                                                        value=self.margin,
                                                        dtype='float32')
    margin_minus_pos = nn.elementwise_sub(margin_const, cos_pos)
    raw_hinge = nn.elementwise_add(margin_minus_pos, cos_neg)
    zero_const = tensor.fill_constant_batch_size_like(input=raw_hinge,
                                                      shape=[-1, 1],
                                                      value=0.0,
                                                      dtype='float32')
    hinge = nn.elementwise_max(zero_const, raw_hinge)
    avg_cost = nn.mean(hinge)

    # Count rows where the best negative scores below the positive tag.
    neg_below_pos = cf.less_than(cos_neg, cos_pos)
    correct = nn.reduce_sum(tensor.cast(neg_below_pos, dtype='float32'))

    self._cost = avg_cost

    results = self._infer_results if is_infer else self._metrics
    results["correct"] = correct
    results["cos_pos"] = cos_pos
def kl_divergence(self, other):
    """Compute the KL-divergence from this distribution to ``other``.

    Both sets of logits are shifted by their own maximum along the last
    axis and normalised; the result is ``sum(p * (log p - log q))`` over
    that axis, which is kept with size 1.

    Args:
        other (Categorical): instance of Categorical. The data type is
            float32.

    Returns:
        Tensor: kl-divergence between two Categorical distributions.

    Examples:
        .. code-block:: python

            import paddle
            from paddle.distribution import Categorical

            paddle.seed(100) # on CPU device
            x = paddle.rand([6])
            print(x)
            # [0.5535528  0.20714243 0.01162981
            #  0.51577556 0.36369765 0.2609165 ]

            paddle.seed(200) # on CPU device
            y = paddle.rand([6])
            print(y)
            # [0.77663314 0.90824795 0.15685187
            #  0.04279523 0.34468332 0.7955718 ]

            cat = Categorical(x)
            cat2 = Categorical(y)

            cat.kl_divergence(cat2)
            # [0.071952]

    """
    name = self.name + '_kl_divergence'
    # The type check is only performed outside dygraph mode.
    if not in_dygraph_mode():
        check_type(other, 'other', Categorical, 'kl_divergence')

    last_axis = dict(dim=-1, keep_dim=True)

    # Shift each set of logits by its own maximum before exponentiating.
    p_shifted = self.logits - nn.reduce_max(self.logits, **last_axis)
    q_shifted = other.logits - nn.reduce_max(other.logits, **last_axis)
    p_exp = ops.exp(p_shifted)
    q_exp = ops.exp(q_shifted)
    p_norm = nn.reduce_sum(p_exp, **last_axis)
    q_norm = nn.reduce_sum(q_exp, **last_axis)
    prob = p_exp / p_norm

    # log p - log q, written out in terms of the shifted logits and the
    # two normalisers.
    log_ratio = p_shifted - nn.log(p_norm) - q_shifted + nn.log(q_norm)
    return nn.reduce_sum(prob * log_ratio, name=name, **last_axis)
def network(vocab_text_size,
            vocab_tag_size,
            emb_dim=10,
            hid_dim=1000,
            win_size=5,
            margin=0.1,
            neg_size=5):
    """Define the text/tag matching network.

    Declares three int64 sequence inputs (``text``, ``pos_tag``,
    ``neg_tag``), encodes the text with a sequence convolution plus max
    pooling and a fully connected layer, and scores it against the tag
    embeddings by cosine similarity. The cost is the mean hinge loss
    ``max(0, margin - cos_pos + max(cos_neg))``.

    Args:
        vocab_text_size: Number of rows of the text embedding table.
        vocab_tag_size: Number of rows of the tag embedding table.
        emb_dim: Embedding width, also the size of the text hidden layer.
        hid_dim: Number of filters of the sequence convolution.
        win_size: Filter size of the sequence convolution.
        margin: Margin of the hinge loss.
        neg_size: Number of negative similarities grouped per row.

    Returns:
        tuple: ``(avg_cost, correct, cos_pos)`` -- the mean hinge loss,
        the number of rows whose best negative scores below the positive
        tag, and the positive cosine similarity.
    """

    def _sequence_input(input_name):
        # One int64 input of shape [1] with a single LoD level.
        return io.data(name=input_name, shape=[1], lod_level=1, dtype='int64')

    text = _sequence_input("text")
    pos_tag = _sequence_input("pos_tag")
    neg_tag = _sequence_input("neg_tag")

    text_emb = nn.embedding(input=text,
                            size=[vocab_text_size, emb_dim],
                            param_attr="text_emb")
    # Positive and negative tags share the "tag_emb" parameter.
    tag_table_size = [vocab_tag_size, emb_dim]
    pos_tag_emb = nn.embedding(input=pos_tag,
                               size=tag_table_size,
                               param_attr="tag_emb")
    neg_tag_emb = nn.embedding(input=neg_tag,
                               size=tag_table_size,
                               param_attr="tag_emb")

    pooled = fluid.nets.sequence_conv_pool(input=text_emb,
                                           num_filters=hid_dim,
                                           filter_size=win_size,
                                           act="tanh",
                                           pool_type="max",
                                           param_attr="cnn")
    text_hid = fluid.layers.fc(input=pooled,
                               size=emb_dim,
                               param_attr="text_hid")

    cos_pos = nn.cos_sim(pos_tag_emb, text_hid)

    # Repeat the text representation to line up with the negative tags,
    # then regroup the similarities into rows of ``neg_size`` columns.
    expanded_text_hid = fluid.layers.sequence_expand_as(x=text_hid,
                                                        y=neg_tag_emb)
    cos_neg_flat = nn.cos_sim(neg_tag_emb, expanded_text_hid)
    cos_neg_all = fluid.layers.sequence_reshape(input=cos_neg_flat,
                                                new_dim=neg_size)
    # Keep only the highest-scoring negative per row.
    cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)

    # Hinge loss: max(0, margin - cos_pos + cos_neg).
    margin_const = tensor.fill_constant_batch_size_like(input=cos_pos,
                                                        shape=[-1, 1],
                                                        value=margin,
                                                        dtype='float32')
    margin_minus_pos = nn.elementwise_sub(margin_const, cos_pos)
    raw_hinge = nn.elementwise_add(margin_minus_pos, cos_neg)
    zero_const = tensor.fill_constant_batch_size_like(input=raw_hinge,
                                                      shape=[-1, 1],
                                                      value=0.0,
                                                      dtype='float32')
    hinge = nn.elementwise_max(zero_const, raw_hinge)
    avg_cost = nn.mean(hinge)

    # Count rows where the best negative scores below the positive tag.
    neg_below_pos = cf.less_than(cos_neg, cos_pos)
    correct = nn.reduce_sum(tensor.cast(neg_below_pos, dtype='float32'))

    return avg_cost, correct, cos_pos
def get_correct(self, x, y):
    """Count the elements for which ``x`` is strictly less than ``y``.

    Args:
        x: Left-hand operand of the element-wise comparison.
        y: Right-hand operand of the element-wise comparison.

    Returns:
        The sum of the comparison result after casting it to float32.
    """
    is_less = cf.less_than(x, y)
    return nn.reduce_sum(tensor.cast(is_less, dtype='float32'))
def probs(self, value):
    """Probabilities of the given category (``value``).

    If ``logits`` is 2-D or higher dimension, the last dimension will be
    regarded as category, and the others represents the different
    distributions. At the same time, if ``value`` is 1-D Tensor, ``value``
    will be broadcast to the same number of distributions as ``logits``.
    If ``value`` is not 1-D Tensor, ``value`` should have the same number
    distributions with ``logits``. That is,
    ``value.shape[:-1] == logits.shape[:-1]``.

    Args:
        value (Tensor): The input tensor represents the selected category
            index.

    Returns:
        Tensor: probability according to the category index.

    Raises:
        ValueError: If ``logits`` has more than one dimension, ``value``
            is not 1-D, and the leading dimensions of ``value`` differ
            from the leading dimensions of ``logits``.

    Examples:
        .. code-block:: python

            import paddle
            from paddle.distribution import Categorical

            paddle.seed(100) # on CPU device
            x = paddle.rand([6])
            print(x)
            # [0.5535528  0.20714243 0.01162981
            #  0.51577556 0.36369765 0.2609165 ]

            cat = Categorical(x)

            value = paddle.to_tensor([2,1,3])
            cat.probs(value)
            # [0.00608027 0.108298 0.269656]

    """
    name = self.name + '_probs'

    # Normalise along the category axis so every distribution sums to 1.
    dist_sum = nn.reduce_sum(self.logits, dim=-1, keep_dim=True)
    prob = self.logits / dist_sum

    shape = list(prob.shape)
    value_shape = list(value.shape)
    if len(shape) == 1:
        # Single distribution: every entry of ``value`` indexes into it.
        num_value_in_one_dist = np.prod(value_shape)
        index = nn.reshape(value, [num_value_in_one_dist, 1])
    else:
        num_dist = np.prod(shape[:-1])
        num_value_in_one_dist = value_shape[-1]

        # Validate before any reshape is built, so that incompatible
        # shapes are reported with this message rather than with a
        # failure raised from inside the reshape.
        if len(value_shape) != 1 and shape[:-1] != value_shape[:-1]:
            raise ValueError(
                "shape of value {} must match shape of logits {}".format(
                    str(value_shape[:-1]), str(shape[:-1])))

        # Flatten all leading axes into one "distribution" axis.
        prob = nn.reshape(prob, [num_dist, shape[-1]])
        if len(value_shape) == 1:
            # Share the same category indices across every distribution.
            value = nn.expand(value, [num_dist])
            value_shape = shape[:-1] + value_shape
        index_value = nn.reshape(value, [num_dist, -1, 1])

        # Pair every category index with the row of its distribution so
        # that gather_nd can address ``prob`` as (distribution, category).
        index_prefix = nn.unsqueeze(
            arange(num_dist, dtype=index_value.dtype), axes=-1)
        index_prefix = nn.expand(index_prefix, [1, num_value_in_one_dist])
        index_prefix = nn.unsqueeze(index_prefix, axes=-1)

        if index_value.dtype != index_prefix.dtype:
            # The cast result must be kept: previously it was discarded,
            # which left the two dtypes mismatched for the concat below.
            index_prefix = tensor.cast(index_prefix,
                                       dtype=index_value.dtype)
        index = concat([index_prefix, index_value], axis=-1)

    # value is the category index to search for the corresponding
    # probability.
    select_prob = gather_nd(prob, index)
    return nn.reshape(select_prob, value_shape, name=name)
def arcface_classify(self,
                     x,
                     label,
                     margin=0.5,
                     logit_scale=64,
                     param_attr=None):
    '''
    Build the ArcFace classification loss over class-sharded weights.

    reference: ArcFace. https://arxiv.org/abs/1801.07698

    The input features and the weight are L2-normalised, features and
    labels are gathered from all ranks, and the additive angular margin
    ``cos(theta + margin)`` replaces the cosine at the positions selected
    by the one-hot label mask. The scaled result is passed to
    ``self.softmax_with_cross_entropy``.

    Args:
        x: Input features; all dimensions after the first are multiplied
            together to obtain the parameter's input dimension.
        label: Labels of the local batch.
        margin: Angular margin added to ``theta``.
        logit_scale: Factor applied to the cosine logits.
        param_attr: Forwarded to ``self.create_parameter``.

    Returns:
        The mean loss, annotated via ``_set_info`` with ``shard_logit``,
        ``shard_prob``, ``shard_label`` and ``shard_dim``.
    '''
    # Product of every dimension of ``x`` after the first one.
    flatten_dim = 1
    for dim_size in x.shape[1:]:
        flatten_dim = flatten_dim * dim_size

    weight, _unused_bias = self.create_parameter(dtype=x.dtype,
                                                 in_dim=flatten_dim,
                                                 param_attr=param_attr,
                                                 use_bias=False)

    # normalize x
    x_norm = ops.sqrt(nn.reduce_sum(ops.square(x), dim=1))
    unit_x = nn.elementwise_div(x, x_norm, axis=0)

    # Collect features and labels from every rank.
    gather_kwargs = dict(nranks=self.nranks, use_calc_stream=True)
    unit_x_all = collective._c_allgather(unit_x, **gather_kwargs)
    label_all = collective._c_allgather(label, **gather_kwargs)
    label_all.stop_gradient = True

    shard_label = nn.shard_index(label_all,
                                 index_num=self.nclasses,
                                 nshards=self.nranks,
                                 shard_id=self.rank_id,
                                 ignore_value=-1)
    # TODO check necessary
    shard_label.stop_gradient = True

    # normalize weight
    weight_norm = ops.sqrt(nn.reduce_sum(ops.square(weight), dim=0))
    unit_weight = nn.elementwise_div(weight, weight_norm, axis=1)

    # Cosine between every gathered sample and this shard's weights.
    shard_cos = nn.mul(unit_x_all, unit_weight, x_num_col_dims=1)

    theta = ops.acos(shard_cos)
    margin_cos = ops.cos(theta + margin)

    shard_one_hot = nn.one_hot(shard_label,
                               depth=self.shard_dim,
                               allow_out_of_range=True)
    # TODO check necessary
    shard_one_hot.stop_gradient = True

    # Swap in the margin cosine only where the one-hot mask is set.
    shard_target_cos = shard_cos + (margin_cos - shard_cos) * shard_one_hot
    shard_logit = nn.scale(shard_target_cos, scale=logit_scale)

    global_loss, shard_prob = self.softmax_with_cross_entropy(
        shard_logit, shard_label)
    avg_loss = nn.mean(global_loss)

    attached_info = (
        ('shard_logit', shard_logit),
        ('shard_prob', shard_prob),
        ('shard_label', shard_label),
        ('shard_dim', self.shard_dim),
    )
    for info_key, info_value in attached_info:
        avg_loss._set_info(info_key, info_value)

    return avg_loss