Example #1
    def get_logits(self, hidden):
        """Get all the logits.

        Parameters
        ----------
        hidden
            The hidden representation
            Shape (..., in_units)

        Returns
        -------
        logits
            Shape (..., |V|)

        """
        if self._cutoffs is None:
            if self._in_units != self._embed_size:
                hidden = self.inter_proj_l[0](hidden)
            logits = self.out_proj_l[0](hidden)
            return logits
        else:
            all_logits = []
            if self._div_val == 1.0:
                if self._in_units == self._embed_size:
                    all_scores = self.out_proj_l[0](hidden)
                    tail_cluster_scores = self.tail_cluster_score_proj(hidden)
                else:
                    inter_hidden = self.inter_proj_l[0](hidden)
                    all_scores = self.out_proj_l[0](inter_hidden)
                    tail_cluster_scores = self.tail_cluster_score_proj(
                        inter_hidden)
                all_scores_l = np.split(all_scores, self._cutoffs, axis=-1)
                head_scores = all_scores_l[0]
            else:
                inter_hidden = self.inter_proj_l[0](hidden)
                head_scores = self.out_proj_l[0](inter_hidden)
                tail_cluster_scores = self.tail_cluster_score_proj(
                    inter_hidden)
            head_tail_cluster_logits = \
                npx.log_softmax(np.concatenate([head_scores, tail_cluster_scores],
                                               axis=-1), axis=-1)
            head_logits, tail_cluster_logits = \
                np.split(head_tail_cluster_logits, [self._cutoffs[0]], axis=-1)
            tail_cluster_logits = np.split(tail_cluster_logits,
                                           self._num_tail_clusters,
                                           axis=-1)
            all_logits.append(head_logits)
            for i in range(1, len(self._cutoffs) + 1):
                if self._div_val == 1.0:
                    ele_scores = all_scores_l[i]
                else:
                    ele_scores = self.out_proj_l[i](
                        self.inter_proj_l[i](hidden))
                ele_logits = npx.log_softmax(ele_scores, axis=-1)
                ele_logits = tail_cluster_logits[-i] + ele_logits
                all_logits.append(ele_logits)
            return np.concatenate(all_logits, axis=-1)
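The method above composes each tail word's log-probability as log P(cluster) + log P(word | cluster). A minimal plain-NumPy sketch of that two-level factorization; the toy sizes and the local log_softmax helper (standing in for npx.log_softmax) are illustrative, not part of the class:

import numpy as np

def log_softmax(x, axis=-1):
    # Numerically stable log-softmax: shift by the max, subtract log-sum-exp.
    x = x - x.max(axis=axis, keepdims=True)
    return x - np.log(np.exp(x).sum(axis=axis, keepdims=True))

rng = np.random.default_rng(0)
head_scores = rng.normal(size=(2, 5))     # scores for 5 head words
cluster_scores = rng.normal(size=(2, 1))  # score for 1 tail cluster
tail_scores = rng.normal(size=(2, 3))     # scores for 3 tail words

# Joint softmax over the head words plus the cluster token.
head_and_cluster = log_softmax(np.concatenate([head_scores, cluster_scores], axis=-1))
head_logits, cluster_logit = head_and_cluster[:, :5], head_and_cluster[:, 5:]

# Tail word log-prob = log P(cluster) + log P(word | cluster).
tail_logits = cluster_logit + log_softmax(tail_scores)

full_logits = np.concatenate([head_logits, tail_logits], axis=-1)
assert np.allclose(np.exp(full_logits).sum(axis=-1), 1.0)  # a valid distribution over |V| = 8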
Example #2
def masked_logsoftmax(att_score, mask, axis: int = -1):
    """Ignore the masked elements when calculating the softmax. The mask can be broadcastable.

    Parameters
    ----------
    att_score : Symbol or NDArray
        Shape (..., length, ...)
    mask : Symbol or NDArray or None
        Shape (..., length, ...)
        mask = 1 --> not masked
        mask = 0 --> masked
    axis
        The axis along which to compute the softmax. att_score.shape[axis] must match
        mask.shape[axis].

    Returns
    -------
    logits : Symbol or NDArray
        Shape (..., length, ...)
        The masked positions will be -inf
    """
    if mask is None:
        return npx.log_softmax(att_score, axis=axis)
    else:
        mask = mask.astype(np.bool)
        return np.where(mask, npx.masked_log_softmax(att_score, mask, axis=axis), -np.inf)
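A hypothetical call of the function above, assuming MXNet 2's numpy interface (from mxnet import np, npx) and the masked_logsoftmax just defined; the toy values are illustrative:

from mxnet import np, npx

att_score = np.array([[1.0, 2.0, 3.0]])
mask = np.array([[1, 1, 0]])  # the last position is padding
logits = masked_logsoftmax(att_score, mask)
# The masked position becomes -inf; the rest normalize over the unmasked
# entries only, so np.exp(logits)[:, :2] sums to 1.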
Example #3
    def get_answerable_logits(self, contextual_embedding, p_mask):
        """Get the answerable logits.

        Parameters
        ----------
        contextual_embedding
            Shape (batch_size, sequence_length, C)
        p_mask
            Shape (batch_size, sequence_length)
            Mask of the sequence:
            0 --> the element is masked,
            1 --> the element is not masked

        Returns
        -------
        answerable_logits
            Shape (batch_size, 2)
        """
        # Shape (batch_size, sequence_length)
        start_scores = np.squeeze(self.start_scores(contextual_embedding), -1)
        start_score_weights = masked_softmax(start_scores, p_mask, axis=-1)
        start_agg_feature = npx.batch_dot(np.expand_dims(start_score_weights, axis=1),
                                          contextual_embedding)
        start_agg_feature = np.squeeze(start_agg_feature, 1)
        cls_feature = contextual_embedding[:, 0, :]
        answerable_scores = self.answerable_scores(np.concatenate([start_agg_feature,
                                                                  cls_feature], axis=-1))
        answerable_logits = npx.log_softmax(answerable_scores, axis=-1)
        return answerable_logits
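Shape bookkeeping for the attention-pooling step above, sketched in plain NumPy with toy sizes (the batched matmul plays the role of npx.batch_dot; all names are illustrative):

import numpy as np

batch_size, seq_len, C = 2, 4, 8
weights = np.random.rand(batch_size, seq_len)
weights /= weights.sum(axis=-1, keepdims=True)  # stands in for the masked_softmax output
emb = np.random.rand(batch_size, seq_len, C)

# (batch, 1, seq_len) @ (batch, seq_len, C) -> (batch, 1, C) -> (batch, C)
pooled = np.squeeze(np.expand_dims(weights, 1) @ emb, axis=1)
assert pooled.shape == (batch_size, C)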
Example #4
    def forward(self, logits, labels, length_ratio, source_length,
                target_length):
        """
        :param logits: Model logits. Shape: (batch, length, vocab_size).
        :param labels: Gold targets. Shape: (batch, length).
        :param length_ratio: Length ratios. Shape: (batch,).
        :param source_length: Source lengths. Shape: (batch,).
        :param target_length: Target lengths. Shape: (batch,).
        :return: Sequence scores. Shape: (batch,).
        """
        logprobs = npx.log_softmax(logits,
                                   axis=-1,
                                   temperature=self.softmax_temperature)

        # Select the log-probability of each gold label.
        # token_scores: (batch_size, target_seq_len)
        token_scores = npx.pick(logprobs, labels, axis=-1)
        if self.score_type == C.SCORING_TYPE_NEGLOGPROB:
            token_scores = token_scores * -1

        # Sum, then apply length penalty. The call to `np.where` masks out invalid values from scores.
        # zeros and sums: (batch_size,)
        scores = np.sum(np.where(labels != 0, token_scores,
                                 np.zeros_like(token_scores)),
                        axis=1)

        if self.constant_length_ratio is not None and self.constant_length_ratio > 0.0:
            predicted_output_length = source_length * self.constant_length_ratio
        else:
            predicted_output_length = source_length * length_ratio

        scores = self.scorer(scores, target_length, predicted_output_length)

        return scores
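The pick-and-mask pattern above, sketched in plain NumPy (np.take_along_axis stands in for npx.pick; the toy vocabulary and labels are illustrative):

import numpy as np

logprobs = np.log(np.full((1, 3, 4), 0.25))  # uniform log-probs over a 4-word vocab
labels = np.array([[2, 1, 0]])               # 0 marks padding
token_scores = np.take_along_axis(logprobs, labels[..., None], axis=-1).squeeze(-1)
scores = np.where(labels != 0, token_scores, 0.0).sum(axis=1)
assert np.isclose(scores[0], 2 * np.log(0.25))  # two real tokens, log(1/4) each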
Example #5
File: loss.py  Project: bricksdont/sockeye
    def forward(self, logits: np.ndarray, labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        pred = npx.log_softmax(logits, axis=-1)

        # (batch, len)
        neg_log_likelihood = - npx.pick(pred,  # pylint: disable=invalid-unary-operand-type
                                        labels, axis=-1, keepdims=False)

        # label smoothing as in
        # https://github.com/dmlc/gluon-nlp/blob/b714eaccc67619d7bdcbd1574d30be87d9c73f0c/src/gluonnlp/loss.py#L4
        if self._alpha > 0:
            all_scores = np.sum(pred, axis=-1)
            neg_log_likelihood = (1 - self._alpha) * neg_log_likelihood - self._alpha / self._num_labels * all_scores

        # (batch, len)
        valid_mask = labels != self.ignore_label

        # (batch, len)
        loss = neg_log_likelihood * valid_mask

        # (1,)
        num_valid = np.sum(valid_mask)

        # (1,)
        ce = np.sum(loss) * self.weight

        # we need to divide by num_valid here to backpropagate a 'valid' normalized loss value like in SoftmaxOutput.
        return ce / num_valid, np.ones((1,))
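The smoothed negative log-likelihood above equals cross-entropy against a target that spreads alpha / V over every class and keeps the remaining 1 - alpha on the gold label. A one-token plain-NumPy check (the values are illustrative):

import numpy as np

alpha, V = 0.1, 4
pred = np.log(np.array([0.7, 0.1, 0.1, 0.1]))  # log-probs for one token
label = 0
nll = -pred[label]
smoothed = (1 - alpha) * nll - alpha / V * pred.sum()  # the update in the snippet above

target = np.full(V, alpha / V)
target[label] += 1 - alpha
assert np.isclose(smoothed, -(target * pred).sum())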
Example #6
def masked_logsoftmax(att_score, mask, dtype=np.float32, axis: int = -1):
    """Ignore the masked elements when calculating the softmax. The mask can be broadcastable.

    Parameters
    ----------
    att_score : Symbol or NDArray
        Shape (..., length, ...)
    mask : Symbol or NDArray or None
        Shape (..., length, ...)
        mask = 1 --> not masked
        mask = 0 --> masked
    dtype
        The data type, used to choose a fill value that is safe in that precision
    axis
        The axis along which to compute the softmax. att_score.shape[axis] must match
        mask.shape[axis].

    Returns
    -------
    logits : Symbol or NDArray
        Shape (..., length, ...)
        The masked positions will be -inf
    """
    if mask is not None:
        # Fill in the masked scores with a very small value
        neg = -1e18
        if _np.dtype(dtype) == np.float16:
            neg = -1e4
        else:
            try:
                # if AMP (automatic mixed precision) is enabled, -1e18 will cause NaN.
                from mxnet.contrib import amp
                if amp.amp._amp_initialized:
                    neg = -1e4
            except ImportError:
                pass
        att_score = np.where(mask, att_score, neg)
        logits = np.where(mask, npx.log_softmax(att_score, axis=axis), -np.inf)
    else:
        logits = npx.log_softmax(att_score, axis=axis)
    return logits
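Why -1e4 rather than -1e18 in half precision: float16 bottoms out around -65504, so -1e18 overflows to -inf, which is why the snippet's AMP comment warns about NaN. A quick check with stock NumPy:

import numpy as np

print(np.finfo(np.float16).min)  # -65504.0, so -1e18 is not representable
with np.errstate(over='ignore'):
    x = np.array([-1e18, 0.0], dtype=np.float16)
print(x)  # [-inf   0.]; -1e4 stays finite while still vanishing under exp()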
Example #7
    def decode_step(self,
                    step_input: np.ndarray,
                    states: List,
                    vocab_slice_ids: Optional[np.ndarray] = None):
        logits, states, target_factor_outputs = self._model.decode_step(step_input, states, vocab_slice_ids)
        if not self._skip_softmax:
            logits = npx.log_softmax(logits, axis=-1, temperature=self._softmax_temperature)
        scores = -logits

        target_factors = None  # type: Optional[np.ndarray]
        if target_factor_outputs:
            # target factors are greedily 'decoded'.
            factor_predictions = [npx.cast(np.expand_dims(np.argmax(tfo, axis=1), axis=1), dtype='int32') for tfo in target_factor_outputs]
            target_factors = factor_predictions[0] if len(factor_predictions) == 1 \
                else np.concatenate(factor_predictions, axis=1)
        return scores, states, target_factors
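For reference, a plain-NumPy sketch of the temperature-scaled log-softmax the step applies (assuming, as in MXNet's softmax, that the temperature divides the logits before normalizing; the names are illustrative):

import numpy as np

def log_softmax(x, axis=-1, temperature=1.0):
    x = x / temperature  # a higher temperature flattens the distribution
    x = x - x.max(axis=axis, keepdims=True)
    return x - np.log(np.exp(x).sum(axis=axis, keepdims=True))

logits = np.array([[2.0, 1.0, 0.0]])
scores = -log_softmax(logits, temperature=2.0)  # beam search then minimizes these scores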
Example #8
    def forward(self, pred, label):
        """

        Parameters
        ----------
        pred :
            The predictions of the network. Shape (..., V)
        label :
            The labels. Shape (..., )

        Returns
        -------
        loss :
            Shape (..., )
        """
        if not self._from_logits:
            pred = npx.log_softmax(pred, axis=-1)
        log_likelihood = npx.pick(pred, label, axis=-1)
        all_scores = pred.sum(axis=-1)
        loss = - (1 - self._alpha) * log_likelihood\
               - self._alpha / float(self._num_labels) * all_scores
        return loss
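Sanity check in plain NumPy: with alpha = 0 the expression above reduces to the ordinary negative log-likelihood (toy values; a manual log-softmax and np.take_along_axis stand in for npx.log_softmax and npx.pick):

import numpy as np

V, alpha = 4, 0.0
pred = np.array([[1.0, 2.0, 0.5, -1.0]])
logp = pred - np.log(np.exp(pred).sum(axis=-1, keepdims=True))  # log_softmax
label = np.array([1])
ll = np.take_along_axis(logp, label[:, None], axis=-1).squeeze(-1)
loss = -(1 - alpha) * ll - alpha / V * logp.sum(axis=-1)
assert np.isclose(loss[0], -logp[0, 1])  # alpha = 0 -> plain NLL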
Example #9
def log_linear_interpolation(predictions):
    log_probs = utils.average_arrays([np.log(p) for p in predictions])
    return -npx.log_softmax(log_probs)  # pylint: disable=invalid-unary-operand-type
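This averages per-model log-probabilities and renormalizes, i.e. a geometric-mean ensemble. A minimal plain-NumPy sketch, with the plain averaging standing in for utils.average_arrays and a local helper for npx.log_softmax:

import numpy as np

def log_softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    return x - np.log(np.exp(x).sum(axis=axis, keepdims=True))

p1 = np.array([0.6, 0.3, 0.1])
p2 = np.array([0.2, 0.5, 0.3])
avg_log = (np.log(p1) + np.log(p2)) / 2  # average the log-probabilities
neg_scores = -log_softmax(avg_log)       # renormalize and negate, as above
assert np.isclose(np.exp(-neg_scores).sum(), 1.0)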