def forward(self, logits, labels, length_ratio, source_length, target_length):
    """
    :param logits: Model logits. Shape: (batch, length, vocab_size).
    :param labels: Gold targets. Shape: (batch, length).
    :param length_ratio: Length ratios. Shape: (batch,).
    :param source_length: Source lengths. Shape: (batch,).
    :param target_length: Target lengths. Shape: (batch,).
    :return: Sequence scores. Shape: (batch,).
    """
    logprobs = npx.log_softmax(logits, axis=-1, temperature=self.softmax_temperature)

    # Select the log probability of the gold label at each position.
    # token_scores: (batch_size, target_seq_len)
    token_scores = npx.pick(logprobs, labels, axis=-1)
    if self.score_type == C.SCORING_TYPE_NEGLOGPROB:
        token_scores = token_scores * -1

    # Sum, then apply length penalty. The call to `np.where` masks out scores
    # at padding positions (label 0).
    # scores: (batch_size,)
    scores = np.sum(np.where(labels != 0, token_scores, np.zeros_like(token_scores)), axis=1)
    if self.constant_length_ratio is not None and self.constant_length_ratio > 0.0:
        predicted_output_length = source_length * self.constant_length_ratio
    else:
        predicted_output_length = source_length * length_ratio
    scores = self.scorer(scores, target_length, predicted_output_length)

    return scores

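# A minimal standalone sketch (toy tensors, not part of the class above) of the
# pick-then-masked-sum pattern, assuming MXNet's numpy interface with
# `npx.set_np()` enabled and label 0 reserved for padding:
import mxnet as mx
from mxnet import np, npx
npx.set_np()

logits = np.array([[[0.0, 2.0, 0.0], [0.0, 0.0, 2.0], [2.0, 0.0, 0.0]]])  # (1, 3, 3)
labels = np.array([[1, 2, 0]])  # (1, 3); the last position is padding
logprobs = npx.log_softmax(logits, axis=-1)
token_scores = npx.pick(logprobs, labels, axis=-1)  # (1, 3): log p(label_t) per step
# Padding positions contribute nothing to the sequence score.
seq_score = np.sum(np.where(labels != 0, token_scores, np.zeros_like(token_scores)), axis=1)
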
def forward(self, scores, target_dists, finished, best_hyp_indices):
    """
    Choose an extension of each hypothesis from its softmax distribution.

    :param scores: Vocabulary scores for the next beam step.
                   Shape: (batch_size * beam_size, target_vocabulary_size).
    :param target_dists: The non-cumulative target distributions (ignored).
    :param finished: The list of finished hypotheses.
    :param best_hyp_indices: Best hypothesis indices constant.
    :return: The row indices, column indices, and values of the sampled words.
    """
    # Map the negative logprobs back to probabilities so as to have a distribution.
    target_dists = np.exp(-target_dists)

    # n == 0 means sample from the full vocabulary. Otherwise, we sample from the top n.
    if self.n != 0:
        # Select the top n in each row, via a mask.
        masked_items = npx.topk(target_dists, k=self.n, ret_typ='mask', axis=1, is_ascend=False)
        # Set unmasked items to 0.
        masked_items = np.where(masked_items, target_dists, masked_items)
        # Renormalize.
        target_dists = masked_items / np.sum(masked_items, axis=1, keepdims=True)

    # Sample from the target distributions over words, then get the corresponding
    # values from the cumulative scores.
    best_word_indices = npx.random.categorical(target_dists, get_prob=False)
    # Zeros for finished hypotheses.
    best_word_indices = np.where(finished, np.zeros_like(best_word_indices), best_word_indices)
    values = npx.pick(scores, best_word_indices, axis=1, keepdims=True)

    best_hyp_indices = npx.slice_like(best_hyp_indices, best_word_indices, axes=(0,))

    return best_hyp_indices, best_word_indices, values

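# A small sketch of the top-n mask + renormalize step on a single toy row,
# again assuming MXNet numpy mode (`npx.set_np()`):
import mxnet as mx
from mxnet import np, npx
npx.set_np()

dists = np.array([[0.5, 0.3, 0.15, 0.05]])  # (1, vocab=4), already a distribution
mask = npx.topk(dists, k=2, ret_typ='mask', axis=1, is_ascend=False)  # 1.0 for the top-2 entries
kept = np.where(mask, dists, mask)  # zero out everything outside the top 2
renorm = kept / np.sum(kept, axis=1, keepdims=True)  # [[0.625, 0.375, 0., 0.]]
# Sampling (the same `npx.random.categorical` call as above) then only ever
# draws one of the two highest-probability words:
sampled = npx.random.categorical(renorm, get_prob=False)
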
def forward(self, logits: np.ndarray, labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    pred = npx.log_softmax(logits, axis=-1)

    # (batch, len)
    neg_log_likelihood = - npx.pick(pred,  # pylint: disable=invalid-unary-operand-type
                                    labels, axis=-1, keepdims=False)

    # Label smoothing as in
    # https://github.com/dmlc/gluon-nlp/blob/b714eaccc67619d7bdcbd1574d30be87d9c73f0c/src/gluonnlp/loss.py#L4
    if self._alpha > 0:
        all_scores = np.sum(pred, axis=-1)
        # (batch, len)
        neg_log_likelihood = (1 - self._alpha) * neg_log_likelihood \
                             - self._alpha / self._num_labels * all_scores

    # (batch, len)
    valid_mask = labels != self.ignore_label

    # (batch, len)
    loss = neg_log_likelihood * valid_mask

    # (1,)
    num_valid = np.sum(valid_mask)

    # (1,)
    ce = np.sum(loss) * self.weight

    # Divide by num_valid to backpropagate a 'valid'-normalized loss value,
    # as in SoftmaxOutput.
    return ce / num_valid, np.ones((1,))

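# Why subtracting `all_scores` implements label smoothing: the smoothed loss is
# exactly cross-entropy against the target (1 - alpha) * onehot + alpha * uniform.
# A quick numeric check with plain NumPy (toy values chosen arbitrarily):
import numpy as onp

alpha, num_labels, y = 0.1, 4, 2
logp = onp.log(onp.array([0.1, 0.2, 0.6, 0.1]))  # valid log-probabilities
smoothed_nll = -(1 - alpha) * logp[y] - alpha / num_labels * logp.sum()
q = onp.full(num_labels, alpha / num_labels)
q[y] += 1 - alpha  # smoothed one-hot target
assert onp.isclose(smoothed_nll, -(q * logp).sum())
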
def test_pick():
    A = np.zeros((INT_OVERFLOW, 2))
    B = np.zeros((INT_OVERFLOW,))
    A.attach_grad()
    B.attach_grad()
    with mx.autograd.record():
        C = npx.pick(A, B)
    assert C.shape == (INT_OVERFLOW,)
    assert C[0] == 0
    C.backward()
    assert A.grad.shape == (INT_OVERFLOW, 2)
    assert B.grad.shape == (INT_OVERFLOW,)
    assert A.grad[0][0] == 1

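# The same pattern at a readable size, to make npx.pick's semantics concrete:
# C[i] = A[i, B[i]] along the last axis, and backward() routes a gradient of 1
# to each picked entry. Assumes MXNet numpy mode (`npx.set_np()`).
import mxnet as mx
from mxnet import np, npx
npx.set_np()

A = np.array([[1.0, 2.0], [3.0, 4.0]])
B = np.array([1, 0])
A.attach_grad()
with mx.autograd.record():
    C = npx.pick(A, B)  # [A[0, 1], A[1, 0]] == [2., 3.]
C.backward()
print(A.grad)  # [[0., 1.], [1., 0.]]
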
def forward(self, hidden, target):
    """
    Parameters
    ----------
    hidden
        The hidden representation.
        Shape (..., in_units)
    target
        The target representation.
        Shape (...,)

    Returns
    -------
    sel_logits
        The logit (unnormalized log probability) of each position's target label.
    """
    # TODO(sxjscience) The computation here can be greatly accelerated! Due to the
    #  missing feature of index_update, we are not able to do this here.
    logits = self.get_logits(hidden)
    sel_logits = npx.pick(logits, target, axis=-1)
    return sel_logits

def forward(self, pred, label):
    """
    Parameters
    ----------
    pred :
        The predictions of the network. Shape (..., V)
    label :
        The labels. Shape (...,)

    Returns
    -------
    loss :
        Shape (...,)
    """
    if not self._from_logits:
        pred = npx.log_softmax(pred, axis=-1)
    log_likelihood = npx.pick(pred, label, axis=-1)
    all_scores = pred.sum(axis=-1)
    loss = - (1 - self._alpha) * log_likelihood \
           - self._alpha / float(self._num_labels) * all_scores
    return loss

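# A standalone toy run of the body above with from_logits=False; alpha and
# num_labels are chosen arbitrarily, and with alpha == 0 the result reduces to
# the plain negative log-likelihood. Assumes MXNet numpy mode (`npx.set_np()`).
import mxnet as mx
from mxnet import np, npx
npx.set_np()

alpha, num_labels = 0.1, 3
pred = np.array([[1.0, 2.0, 0.5]])  # raw network outputs, shape (1, V)
label = np.array([1])
logp = npx.log_softmax(pred, axis=-1)
loss = - (1 - alpha) * npx.pick(logp, label, axis=-1) \
       - alpha / float(num_labels) * logp.sum(axis=-1)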