Example #1
    def arch_weights(self):
        """Return the softmax-normalized architecture weights (alphas)."""
        self.alphas_normal = self.get_weights('alphas_normal')
        self.alphas_reduce = self.get_weights('alphas_reduce')
        # Stack the per-edge alpha vectors and normalize each row over the candidate ops.
        alphas_normal = ops.softmax(torch.stack(self.alphas_normal, dim=0), -1)
        alphas_reduce = ops.softmax(torch.stack(self.alphas_reduce, dim=0), -1)
        return [ops.to_numpy(alphas_normal), ops.to_numpy(alphas_reduce)]
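In DARTS-style search, each row of the stacked alpha matrix holds one edge's logits over the candidate operations, so the softmax along the last dimension turns every row into a probability distribution over ops. Below is a minimal sketch of the same normalization in plain PyTorch, under the assumption that ops.softmax and ops.to_numpy are thin wrappers over their torch counterparts (the snippet's mixed torch.stack/ops.softmax usage suggests as much):

    import torch

    # Toy alphas: two edges, three candidate operations each (values are made up).
    alphas_normal = [torch.tensor([0.1, 0.5, -0.2]), torch.tensor([1.0, 0.0, 0.0])]

    stacked = torch.stack(alphas_normal, dim=0)   # shape: (num_edges, num_ops)
    weights = torch.softmax(stacked, dim=-1)      # each row now sums to 1
    print(weights.numpy())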
Example #2
    def call(self, hidden_states, attention_mask):
        """Call attention func."""
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = ops.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(
            self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in the BertModel forward() function).
        attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = ops.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = ops.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.all_head_size, )
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer
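The method above is standard scaled dot-product attention over tensors of shape (batch, heads, seq_len, head_size), with an additive mask (large negative values at padded positions) applied before the softmax. Here is a self-contained sketch of the same math in plain PyTorch, with toy shapes; ops.matmul and ops.softmax are assumed to mirror torch.matmul and torch.softmax:

    import math
    import torch

    batch, heads, seq_len, head_size = 2, 4, 8, 16
    q = torch.randn(batch, heads, seq_len, head_size)
    k = torch.randn(batch, heads, seq_len, head_size)
    v = torch.randn(batch, heads, seq_len, head_size)
    mask = torch.zeros(batch, 1, 1, seq_len)  # 0.0 keeps a position; -10000.0 would mask it

    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_size)
    probs = torch.softmax(scores + mask, dim=-1)   # attention rows sum to 1
    context = torch.matmul(probs, v)               # (batch, heads, seq_len, head_size)
    context = context.permute(0, 2, 1, 3).contiguous().view(batch, seq_len, heads * head_size)
    print(context.shape)  # torch.Size([2, 8, 64])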
Example #3
    def call(self, inputs, targets):
        """Compute loss.

        :param inputs: predict data.
        :param targets: true data.
        :return:
        """
        y_true = ops.to(ops.one_hot(targets, 2), 'float32')
        y_pred = ops.softmax(inputs, dim=1)

        tp = ops.reduce_sum(y_true * y_pred, dtype='float32')
        # tn = ops.reduce_sum(((1 - y_true) * (1 - y_pred)), dtype='float32')
        fp = ops.reduce_sum(((1 - y_true) * y_pred), dtype='float32')
        fn = ops.reduce_sum((y_true * (1 - y_pred)), dtype='float32')

        precision = tp / (tp + fp + self.epsilon)
        recall = tp / (tp + fn + self.epsilon)

        f1 = 2 * (precision * recall) / (precision + recall + self.epsilon)
        f1 = ops.clamp(f1, min=self.epsilon, max=1 - self.epsilon)
        return 1 - f1.mean()
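Because tp, fp, and fn are accumulated from soft probabilities rather than hard predictions, the F1 score stays differentiable and 1 - F1 can be minimized directly by gradient descent. A quick check on toy data in plain PyTorch (assuming the ops calls wrap their torch equivalents; the epsilon value is illustrative):

    import torch
    import torch.nn.functional as F

    epsilon = 1e-7
    targets = torch.tensor([0, 1, 1, 0])
    logits = torch.tensor([[2.0, -2.0], [-2.0, 2.0], [-2.0, 2.0], [2.0, -2.0]])

    y_true = F.one_hot(targets, 2).float()
    y_pred = torch.softmax(logits, dim=1)

    tp = (y_true * y_pred).sum()
    fp = ((1 - y_true) * y_pred).sum()
    fn = (y_true * (1 - y_pred)).sum()

    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    f1 = 2 * precision * recall / (precision + recall + epsilon)
    loss = 1 - f1.clamp(epsilon, 1 - epsilon)
    print(loss.item())  # close to 0, since the logits agree with the targets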
Example #4
    def call(self, inputs, targets):
        """Compute loss.

        :param inputs: predict data.
        :param targets: true data.
        :return:
        """
        N = inputs.size(0)
        C = inputs.size(1)
        P = ops.softmax(inputs, dim=1)  # per-class probabilities; dim made explicit
        # Build a one-hot mask selecting each sample's target class.
        class_mask = inputs.data.new(N, C).fill_(0)
        ids = targets.view(-1, 1)
        class_mask.scatter_(1, ids.data, 1.)
        if inputs.is_cuda and not self.alpha.is_cuda:
            self.alpha = self.alpha.cuda()
        alpha = self.alpha[ids.data.view(-1)]
        probs = (P * class_mask).sum(1).view(-1, 1)  # p_t: probability of the true class
        log_p = probs.log()
        # The (1 - p_t)^gamma factor down-weights easy, well-classified examples.
        batch_loss = -alpha * (ops.pow((1 - probs), self.gamma)) * log_p
        if self.size_average:
            loss = batch_loss.mean()
        else:
            loss = batch_loss.sum()
        return loss
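This is the focal-loss formulation FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t): the (1 - p_t)^gamma factor shrinks the loss of well-classified examples so training concentrates on hard ones. A toy sketch of the core computation in plain PyTorch (the alpha and gamma values here are hypothetical):

    import torch

    gamma = 2.0
    alpha = torch.tensor([0.25, 0.75])                # hypothetical per-class weights
    logits = torch.tensor([[3.0, -1.0], [0.2, 0.1]])  # one easy sample, one hard sample
    targets = torch.tensor([0, 1])

    # p_t: predicted probability of each sample's true class.
    probs = torch.softmax(logits, dim=1).gather(1, targets.view(-1, 1)).view(-1)
    loss = -alpha[targets] * (1 - probs) ** gamma * probs.log()
    print(loss)  # the confident (easy) sample contributes far less than the hard one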
Example #5
    def calc_alphas(self, alphas, dim=-1, **kwargs):
        """Softmax-normalize each alpha tensor in a list."""
        new_alphas = []
        for alpha in alphas:
            new_alphas.append(ops.softmax(alpha, dim))
        return new_alphas
Example #6
    def calc_alphas(self, alphas, dim=-1, **kwargs):
        """Softmax-normalize a single alpha tensor in one call."""
        return ops.softmax(alphas, dim)
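The two variants differ only in what they accept: Example #5 takes a list of alpha tensors and normalizes each one, while Example #6 applies a single softmax to one stacked tensor. Since a softmax over the last dimension acts row-wise, the results agree once the list is stacked, as this plain-PyTorch comparison shows (again assuming ops.softmax mirrors torch.softmax):

    import torch

    alphas_list = [torch.tensor([0.1, 0.5]), torch.tensor([2.0, -1.0])]

    # Example #5 style: normalize each alpha tensor independently.
    per_alpha = [torch.softmax(a, dim=-1) for a in alphas_list]

    # Example #6 style: stack first, then one softmax over the last dimension.
    stacked = torch.softmax(torch.stack(alphas_list, dim=0), dim=-1)

    print(torch.allclose(torch.stack(per_alpha), stacked))  # True: rows are independent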