Example #1
    def __init__(self, args, model_path=None, model=None, forward_task=None):
        """ Initialize a rescorer model

        Args:
          args: model arguments
          model_path: checkpoint path for rescoring model
        """
        # TODO (T40938917): Allow loading of multiple rescoring models
        # Allow creating an empty scorer without a model
        self.args = args
        self.forward_task = forward_task
        self.task = None
        self.model = None
        # Instantiate the model
        if model is not None:
            self.model = model["model"]
            self.task = model["task"]
        elif model_path:
            rescoring_model, _, task = utils.load_diverse_ensemble_for_inference(
                [model_path])
            self.model = rescoring_model[0]
            self.task = task

        if self.model is not None:
            self.model.eval()
            # Turn off gradient computation in eval mode
            for param in self.model.parameters():
                param.requires_grad = False
            utils.maybe_cuda(self.model)
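A hedged usage sketch of the constructor above (the class name RescorerModel and the argument values below are hypothetical; only the keyword arguments come from the snippet):

# RescorerModel is a hypothetical stand-in for the class that owns this __init__.
scorer_from_checkpoint = RescorerModel(args, model_path="rescorer_checkpoint.pt")
# Reuse an already-loaded model/task pair instead of loading from disk:
scorer_from_model = RescorerModel(args, model={"model": loaded_model, "task": loaded_task})
# Create an empty scorer without a rescoring model, keeping only the forward task:
empty_scorer = RescorerModel(args, forward_task=forward_task)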
Example #2
 def forward(self, src_tokens, src_lengths):
     # Fetch language IDs and remove them from src_tokens
     # Language IDs are on the right
     lang_ids = (
         src_tokens[:, -1] - pytorch_translate_data.MULTILING_DIALECT_ID_OFFSET
     )
     src_tokens = src_tokens[:, :-1]
     src_lengths -= 1
     # Create tensors for collecting encoder outputs
     bsz, seq_len = src_tokens.size()[:2]
     all_encoder_outs = utils.maybe_cuda(torch.zeros(seq_len, bsz, self.hidden_dim))
     all_final_hidden = utils.maybe_cuda(
         torch.zeros(self.num_layers, bsz, self.hidden_dim)
     )
     all_final_cell = utils.maybe_cuda(
         torch.zeros(self.num_layers, bsz, self.hidden_dim)
     )
     # We cannot use zeros_like() for src_lengths because dtype changes
     # from LongInt to Int
     all_src_lengths = utils.maybe_cuda(torch.zeros(bsz, dtype=torch.int))
     all_src_tokens = torch.zeros_like(src_tokens)
     all_embedded_words = utils.maybe_cuda(torch.zeros(seq_len, bsz, self.word_dim))
     self.last_bsz = bsz
     self.last_lang_bszs = []
     for lang_id, encoder in enumerate(self.encoders):
         if encoder is None:
             continue
         indices = torch.nonzero(lang_ids == lang_id)
         lang_bsz = indices.size(0)
         self.last_lang_bszs.append(lang_bsz)
         if lang_bsz == 0:  # Language not in this batch
             for p in encoder.parameters():
                 p.grad = torch.zeros_like(p.data)
             continue
         indices = indices.squeeze(1)
         (
             lang_encoder_outs,
             lang_final_hidden,
             lang_final_cell,
             lang_src_lengths,
             lang_src_tokens,
             lang_embedded_words,
         ) = encoder(src_tokens[indices], src_lengths[indices])
         lang_seq_len = lang_encoder_outs.size(0)
         all_encoder_outs[:lang_seq_len, indices, :] = lang_encoder_outs
         all_final_hidden[:, indices, :] = lang_final_hidden
         all_final_cell[:, indices, :] = lang_final_cell
         all_src_lengths[indices] = lang_src_lengths
         all_src_tokens[indices] = lang_src_tokens
         all_embedded_words[:, indices, :] = lang_embedded_words
     return (
         all_encoder_outs,
         all_final_hidden,
         all_final_cell,
         all_src_lengths,
         all_src_tokens,
         all_embedded_words,
     )
Example #3
    def test_forward_training_precompute(self):
        """
        We test if precomputation gives the same result.
        """
        test_args = test_utils.ModelParamsDict(arch="char_aware_hybrid")

        decoder_embed_tokens = transformer.build_embedding(
            dictionary=self.word_dict, embed_dim=10
        )
        decoder = maybe_cuda(
            char_aware_hybrid.CharAwareHybridRNNDecoder(
                args=test_args,
                src_dict=self.word_dict,
                dst_dict=self.word_dict,
                embed_tokens=decoder_embed_tokens,
                num_chars=len(self.char_dict),
            )
        )
        decoder.eval()
        prev_output_word_strs = self.word_dict.symbols[-3:]
        prev_out_word_indices = [
            self.word_dict.indices[w] for w in prev_output_word_strs
        ]
        prev_output_tokens = maybe_cuda(
            torch.LongTensor(prev_out_word_indices).unsqueeze(1)
        )

        prev_output_chars = maybe_cuda(
            torch.LongTensor(
                [
                    decoder._char_list_for_word(
                        word_index=word_index, word=word, char_dict=self.char_dict
                    )
                    for word_index, word in zip(
                        prev_out_word_indices, prev_output_word_strs
                    )
                ]
            )
        )

        embed_output = maybe_cuda(
            decoder._embed_prev_outputs(
                prev_output_tokens=prev_output_tokens,
                prev_output_chars=prev_output_chars,
            )[0]
        )

        decoder.precompute_char_representations(
            char_dict=self.char_dict, embed_bytes=False, batch_size=30
        )
        embed_output_after_precompute = decoder._embed_prev_outputs(
            prev_output_tokens=prev_output_tokens, prev_output_chars=prev_output_chars
        )[0]
        # Due to a known issue in padding, the results are not exactly equal,
        # so we compare with allclose instead of equal.
        assert torch.allclose(
            embed_output, embed_output_after_precompute, rtol=1e-04, atol=1e-04
        )
Example #4
    def test_precompute(self):
        """
        We test that if we shuffle the input sample, we will get the same
        forward values, both in training mode (without dropout) and in
        eval mode.
        For the meanwhile, we use an auxiliary hybrid_transformer_rnn
        in order to get the encoder output.
        """
        test_args = test_utils.ModelParamsDict(arch="char_aware_hybrid")

        decoder_embed_tokens = maybe_cuda(
            transformer.build_embedding(dictionary=self.word_dict,
                                        embed_dim=10))

        decoder = maybe_cuda(
            char_aware_hybrid.CharAwareHybridRNNDecoder(
                args=test_args,
                src_dict=self.word_dict,
                dst_dict=self.word_dict,
                embed_tokens=decoder_embed_tokens,
                num_chars=len(self.char_dict),
            ))
        # Making sure we do not apply dropout.
        decoder.eval()
        decoder.precompute_char_representations(char_dict=self.char_dict,
                                                embed_bytes=False,
                                                batch_size=5)

        first_embedding = decoder.combined_word_char_embed.weight.detach().clone()
        decoder.precompute_char_representations(char_dict=self.char_dict,
                                                embed_bytes=False,
                                                batch_size=11)
        second_embedding = decoder.combined_word_char_embed.weight.detach().clone()

        # Due to a known issue in the char_encoder model, this does not hold for
        # torch.equal (T53048656)
        assert torch.allclose(first_embedding,
                              second_embedding,
                              rtol=1e-04,
                              atol=1e-04)

        decoder.precompute_char_representations(char_dict=self.char_dict,
                                                embed_bytes=False,
                                                batch_size=23)
        third_embedding = decoder.combined_word_char_embed.weight.detach().clone()
        # Due to a known issue in the char_encoder model, this does not hold for
        # torch.equal (T53048656)
        assert torch.allclose(second_embedding,
                              third_embedding,
                              rtol=1e-04,
                              atol=1e-04)
Example #5
def torch_find(index, query, vocab_size):
    """
    Finds elements of query from index, outputting the last (max) index for each
    query.
    preconditions:  (1) index and query are flat arrays (can be different sizes)
                    (2) all tokens in index and query have values < vocab_size
    """
    full_to_index = maybe_cuda(torch.zeros(vocab_size).long())
    index_shape_range = maybe_cuda(torch.arange(index.shape[0]).long())
    full_to_index[index] = index_shape_range
    result = full_to_index[query]
    return result
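A minimal usage sketch of torch_find (the tensors below are illustrative; on a CUDA machine the result lives on the GPU because of maybe_cuda):

index = torch.tensor([3, 5, 3, 7])
query = torch.tensor([3, 7, 4])
positions = torch_find(index, query, vocab_size=10)
# positions -> tensor([2, 3, 0]): token 3 last occurs at position 2, token 7 at
# position 3, and token 4 never occurs in index, so it falls back to the default 0.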
Example #6
    def __init__(self, args, model_path):
        self.args = args
        # TODO (T40938917): Allow loading of multiple rescoring models
        (
            rescoring_model,
            rescoring_model_arg,
            rescoring_task,
        ) = utils.load_diverse_ensemble_for_inference([model_path])
        self.task = rescoring_task
        self.model = rescoring_model[0]
        self.model.eval()

        if not self.args.cpu:
            utils.maybe_cuda(self.model)
Example #7
    def __init__(self, args, src_dict, tgt_dict, char_source_dict=None):
        super().__init__(
            args,
            src_dict=src_dict,
            tgt_dict=tgt_dict,
            char_source_dict=char_source_dict,
        )
        self.top_k_probs_binary_file = args.top_k_probs_binary_file
        self.top_k_teacher_tokens = args.top_k_teacher_tokens

        if self.top_k_probs_binary_file is None:
            # Load model ensemble from checkpoints
            (
                self.teacher_models,
                _,
                _,
            ) = pytorch_translate_utils.load_diverse_ensemble_for_inference(
                args.teacher_path.split(":"))
            if torch.cuda.is_available():
                for teacher_model in self.teacher_models:
                    teacher_model = pytorch_translate_utils.maybe_cuda(
                        teacher_model)
        else:
            self.teacher_models = None

        # Memoized scores for teacher models. By gradually memoizing the values,
        # we prevent the teacher models from repeatedly recalculating the
        # teacher scores.
        self.top_k_teacher_scores: Dict[int, np.ndarray] = {}
        self.top_k_teacher_indices: Dict[int, np.ndarray] = {}
Example #8
    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss, as a Variable
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        predictor_output, decoder_output = model(**sample['net_input'])
        # translation loss
        translation_lprobs = model.get_normalized_probs(decoder_output,
                                                        log_probs=True)
        translation_target = model.get_targets(sample, decoder_output).view(-1)
        translation_loss = F.nll_loss(translation_lprobs,
                                      translation_target,
                                      size_average=False,
                                      ignore_index=self.padding_idx,
                                      reduce=reduce)
        # predictor loss
        prediction_lprobs = model.get_predictor_normalized_probs(
            predictor_output, log_probs=True)
        # prevent domination of padding idx
        non_padding_mask = maybe_cuda(torch.ones(prediction_lprobs.size(1)))
        non_padding_mask[model.encoder.padding_idx] = 0
        prediction_lprobs = prediction_lprobs * non_padding_mask.unsqueeze(0)

        prediction_target = model.get_target_words(sample)
        assert prediction_lprobs.size(0) == prediction_target.size(0)
        assert prediction_lprobs.dim() == 2
        word_prediction_loss = -torch.gather(prediction_lprobs, 1,
                                             prediction_target)

        if reduce:
            word_prediction_loss = word_prediction_loss.sum()
        else:
            word_prediction_loss = word_prediction_loss.sum(
                1)  # loss per batch element

        assert translation_loss.size() == word_prediction_loss.size()
        loss = translation_loss + word_prediction_loss

        if self.args.sentence_avg:
            sample_size = sample['target'].size(0)
        else:
            sample_size = sample['ntokens']

        logging_output = {
            'loss': translation_loss,
            'word_prediction_loss': word_prediction_loss,
            'ntokens': sample['ntokens'],
            'sample_size': sample_size,
        }

        if reduce:
            logging_output['loss'] = utils.item(logging_output['loss'])
            logging_output['word_prediction_loss'] = utils.item(
                logging_output['word_prediction_loss'])

        return loss, sample_size, logging_output
Example #9
    def __init__(self, args, model_path):
        """ Initialize a rescorer model

        Args:
          args: model arguments
          model_path: checkpoint path for rescoring model
        """
        self.args = args
        # TODO (T40938917): Allow loading of multiple rescoring models
        (
            rescoring_model,
            rescoring_model_arg,
            rescoring_task,
        ) = utils.load_diverse_ensemble_for_inference([model_path])
        self.task = rescoring_task  # e.g. p(y), p(x|y), etc.
        self.model = rescoring_model[0]
        self.model.eval()

        utils.maybe_cuda(self.model)
Example #10
    def precompute_char_representations(
        self, char_dict, embed_bytes=False, batch_size=5000
    ):
        """
        Precomputes the embeddings from character CNNs. Then adds that to the
        word embeddings.
        Args:
            batch_size: maximum number of words in one batch
        """
        character_list = self._char_list_from_dict(
            char_dict=char_dict, embed_bytes=embed_bytes
        )
        all_idx = maybe_cuda(
            torch.LongTensor([i for i in range(self.embed_tokens.num_embeddings)])
        )
        word_embeds = self.embed_tokens(all_idx)
        num_minibatches = math.ceil(len(character_list) / batch_size)
        for i in range(num_minibatches):
            character_sublist = character_list[
                i * batch_size : min((i + 1) * batch_size, len(character_list))
            ]
            max_word_len = max(len(chars) for chars in character_sublist)
            char_inds = (
                torch.Tensor(len(character_sublist), max_word_len)
                .long()
                .fill_(char_dict.pad_index)
            )

            for j, chars in enumerate(character_sublist):
                char_inds[j, : len(chars)] = torch.LongTensor(chars)

            char_cnn_output = self._get_char_cnn_output(maybe_cuda(char_inds))

            # Filling in the precomputed embedding values.
            index_offset = i * batch_size
            for j in range(char_cnn_output.size()[1]):
                cur_idx = j + index_offset
                self.combined_word_char_embed.weight[cur_idx] = (
                    char_cnn_output[0, j, :] + word_embeds[cur_idx]
                )

        self._is_precomputed = True
        self.combined_word_char_embed.weight.detach()
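A hedged usage sketch of the intended call order (decoder, char_dict, and the batch size are placeholders; the pattern mirrors the precompute tests above):

# Precompute once in eval mode; afterwards _is_precomputed is set and later
# embedding lookups can use the filled combined_word_char_embed table.
decoder.eval()
decoder.precompute_char_representations(
    char_dict=char_dict, embed_bytes=False, batch_size=1000
)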
Example #11
    def set_rank_weights(self, n_labels, rank_weights_type="uniform"):
        """Sets ranking for weights based on the number of labels.

        Args:
            n_labels: Number of labels
            rank_weights_type: Type of the ranking.

        Raises:
            AssertionError: Number of labels <= 1
            NotImplementedError: rank_weights_type is not 'uniform'
        """
        assert n_labels > 1
        if rank_weights_type == "uniform":
            self.rank_weights = 1.0 / (n_labels - 1) * maybe_cuda(torch.ones(n_labels))
        else:
            raise NotImplementedError(
                "Rank weights type {} not implemented".format(rank_weights_type)
            )
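A worked illustration of the uniform case (the numbers are hypothetical):

# For n_labels = 5, every entry of rank_weights is 1 / (5 - 1) = 0.25:
#     rank_weights == tensor([0.25, 0.25, 0.25, 0.25, 0.25])
# so torch.sum(rank_weights[:k]) in the WARP loss below equals 0.25 * k,
# e.g. 0.5 for k = 2 and 1.0 for k = 4.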
Example #12
    def compute_weights(self, unprojected_outs, select_single=None):
        """Derive interpolation weights from unprojected decoder outputs.

        Args:
            unprojected_outs: List of [batch_size, seq_len, out_embed_dim]
                tensors with unprojected decoder outputs.
            select_single: If not None, put all weight on the n-th model.

        Returns:
            A [batch_size, seq_len, num_decoders] float32 tensor with
            normalized decoder interpolation weights.
        """
        if select_single is not None:
            sz = unprojected_outs[0].size()
            ret = maybe_cuda(torch.zeros(
                (sz[0], sz[1], len(unprojected_outs))))
            ret[:, :, select_single] = 1.0
            return ret
        if self.fixed_weights is not None:
            return self.fixed_weights
        logits = self.norm_fn(
            self.gating_network(torch.cat(unprojected_outs, 2)))
        return logits / torch.sum(logits, dim=2, keepdim=True)
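A brief sketch of the normalization step (g below is a stand-in for the gating-network output): with norm_fn = torch.exp, the default in the constructor shown next, scaling by the sum along dim=2 is simply a softmax over the decoder dimension.

import torch
import torch.nn.functional as F

g = torch.randn(2, 7, 3)  # [batch_size, seq_len, num_decoders]
weights_manual = torch.exp(g) / torch.sum(torch.exp(g), dim=2, keepdim=True)
weights_softmax = F.softmax(g, dim=2)
assert torch.allclose(weights_manual, weights_softmax)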
Example #13
    def __init__(
        self,
        out_embed_dims,
        vocab_size,
        vocab_reduction_module=None,
        fixed_weights=None,
        hidden_layer_size=32,
        activation_fn=torch.nn.ReLU,
        norm_fn=torch.exp,
    ):
        """Initializes a combination strategy with explicit weights.

        Args:
            out_embed_dims (list): List of output dimensionalities of the
                decoders.
            vocab_size (int): Size of the output projection.
            vocab_reduction_module: For vocabulary reduction
            fixed_weights (list): If not None, use these fixed weights rather
                than a gating network.
            hidden_layer_size (int): Size of the hidden layer of the gating
                network.
            activation_fn: Non-linearity at the hidden layer.
            norm_fn: Function to use for normalization (exp or sigmoid).
        """
        super().__init__(out_embed_dims, vocab_size, vocab_reduction_module)
        if fixed_weights is None:
            self.fixed_weights = None
            self.gating_network = nn.Sequential(
                Linear(sum(out_embed_dims), hidden_layer_size, bias=True),
                activation_fn(),
                Linear(hidden_layer_size, len(out_embed_dims), bias=True),
            )
            self.norm_fn = norm_fn
        else:
            assert len(fixed_weights) == len(out_embed_dims)
            self.fixed_weights = maybe_cuda(
                torch.Tensor(fixed_weights).view(1, 1, -1))
Example #14
 def forward(self, src_tokens, src_lengths):
     bsz = src_lengths.size(0)
     ones = maybe_cuda(torch.ones((self.num_layers, bsz, 1)))
     dummy_out = torch.ones((1, bsz, 1))
     return dummy_out, ones, ones, src_lengths, src_tokens
Example #15
    def test_collate(self):
        """
        Makes sure that we can memoize in collate if we give a particular data index
        in different orders.
        """
        test_args = test_utils.ModelParamsDict()
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        self.task = tasks.DictionaryHolderTask(src_dict, tgt_dict)

        teacher_model = pytorch_translate_utils.maybe_cuda(
            self.task.build_model(test_args)
        )

        d0, d1, d2, d3 = self._dummy_datasets(src_dict.eos(), tgt_dict.eos())
        dataset1 = [d0, d1]
        dataset2 = [d2, d3]
        dataset3 = [d3, d0]
        dataset4 = [d1, d2]

        top_k_teacher_scores = {}
        top_k_teacher_indices = {}
        b1 = TeacherDataset.collate(
            dataset1,
            [teacher_model],
            3,
            src_dict.pad(),
            src_dict.eos(),
            top_k_teacher_scores,
            top_k_teacher_indices,
        )
        TeacherDataset.collate(
            dataset2,
            [teacher_model],
            3,
            src_dict.pad(),
            src_dict.eos(),
            top_k_teacher_scores,
            top_k_teacher_indices,
        )
        before_scores = [top_k_teacher_scores[i].cpu().numpy() for i in range(4)]
        before_indices = [top_k_teacher_indices[i].cpu().numpy() for i in range(4)]

        TeacherDataset.collate(
            dataset3,
            [teacher_model],
            3,
            src_dict.pad(),
            src_dict.eos(),
            top_k_teacher_scores,
            top_k_teacher_indices,
        )
        TeacherDataset.collate(
            dataset4,
            [teacher_model],
            3,
            src_dict.pad(),
            src_dict.eos(),
            top_k_teacher_scores,
            top_k_teacher_indices,
        )
        after_scores = [top_k_teacher_scores[i].cpu().numpy() for i in range(4)]
        after_indices = [top_k_teacher_indices[i].cpu().numpy() for i in range(4)]

        for i in range(4):
            assert np.array_equal(after_scores[i], before_scores[i])
            assert np.array_equal(after_indices[i], before_indices[i])

        b5 = TeacherDataset.collate(
            dataset1,
            [teacher_model],
            3,
            src_dict.pad(),
            src_dict.eos(),
            top_k_teacher_scores,
            top_k_teacher_indices,
        )
        probs_before = b1["top_k_scores"].numpy()
        indices_before = b1["top_k_indices"].numpy()
        probs_after = b5["top_k_scores"].numpy()
        indices_after = b5["top_k_indices"].numpy()

        # The first sample has a different length, so the last four positions of
        # its "before" values are irrelevant padding, while the memoized "after"
        # values there must be zero.
        assert np.array_equal(probs_before[0][:-4], probs_after[0][:-4])
        assert np.array_equal(indices_before[0][:-4], indices_after[0][:-4])
        assert np.array_equal(probs_after[0][-4:], np.zeros((4, 3)))
        assert np.array_equal(indices_after[0][-4:], np.zeros((4, 3)))

        assert np.array_equal(probs_before[1], probs_after[1])
        assert np.array_equal(indices_before[1], indices_after[1])
Example #16
    def collate(
        dataset,
        teacher_models,
        top_k_teacher_tokens,
        pad_idx,
        eos_idx,
        top_k_teacher_scores: Dict[int, np.ndarray],
        top_k_teacher_indices: Dict[int, np.ndarray],
        left_pad_source=False,
    ):
        if len(dataset) == 0:
            return {}

        batched_samples = data.language_pair_dataset.collate(
            dataset, pad_idx, eos_idx, left_pad_source)

        sen_ids = batched_samples["id"].numpy()

        if teacher_models is not None:
            all_sen_ids_memoized = all(id in top_k_teacher_scores
                                       for id in sen_ids)

            if not all_sen_ids_memoized:
                # Because a big batch has a high chance of not fitting into memory,
                # we split it into smaller batches and memoize their values
                # separately.
                smaller_datasets = []

                chunk_size = max(1, math.ceil(len(dataset) / MEM_SPLIT_SIZE))
                for i in range(MEM_SPLIT_SIZE):
                    small_data = dataset[chunk_size *
                                         i:min(len(dataset), (i + 1) *
                                               chunk_size)]
                    if len(small_data) > 0:
                        smaller_datasets.append(small_data)

                for smaller_data in smaller_datasets:
                    smaller_batch = data.language_pair_dataset.collate(
                        smaller_data, pad_idx, eos_idx, left_pad_source)

                    sen_ids_this_batch = smaller_batch["id"].numpy()

                    # smaller_batch is natively on CPU. We want to make sure that
                    # the teacher models run on GPU.
                    net_input = {
                        key: pytorch_translate_utils.maybe_cuda(
                            smaller_batch["net_input"][key])
                        for key in smaller_batch["net_input"].keys()
                    }

                    teacher_output = teacher_models[0](**net_input)
                    avg_teacher_probs = teacher_models[0].get_normalized_probs(
                        teacher_output, log_probs=False)

                    for i in range(1, len(teacher_models)):
                        teacher_output = teacher_models[i](**net_input)
                        probs = teacher_models[i].get_normalized_probs(
                            teacher_output, log_probs=False)
                        avg_teacher_probs.add_(probs)

                    avg_teacher_probs.div_(len(teacher_models))
                    avg_teacher_probs = avg_teacher_probs.detach()

                    # Getting the topk probabilities, masking others,
                    # normalizing the topk probabilities.
                    top_k_teacher_tokens_avg_probs, indices = torch.topk(
                        avg_teacher_probs, k=top_k_teacher_tokens)
                    top_k_teacher_probs_normalized = F.normalize(
                        top_k_teacher_tokens_avg_probs, p=1, dim=2).cpu()
                    indices = indices.cpu()

                    # Memoization
                    for id_index, id in enumerate(sen_ids_this_batch):
                        # id_index indexes into the smaller batch, so the target
                        # comes from smaller_batch rather than batched_samples.
                        target_length = sum(
                            (smaller_batch["target"][id_index] !=
                             pad_idx).numpy())
                        if id not in top_k_teacher_scores:
                            top_k_teacher_scores[
                                id] = top_k_teacher_probs_normalized[
                                    id_index][:target_length, :]
                            top_k_teacher_indices[id] = indices[
                                id_index][:target_length, :]
            else:
                # We assume that once an entire batch is memoized, the teacher
                # models are no longer needed, so it is better to remove them
                # from memory.
                if len(teacher_models) > 0:
                    del teacher_models[:]

        # Now we assume that all values are already memoized.
        # Preparing all zero scores and gradually filling them in.
        max_ntokens = batched_samples["target"].shape[1]
        memoized_probs = torch.zeros(len(sen_ids), max_ntokens,
                                     top_k_teacher_tokens)
        memoized_prob_idx = torch.zeros(len(sen_ids), max_ntokens,
                                        top_k_teacher_tokens).long()

        for idx, id in enumerate(sen_ids):
            memoized_probs[idx][:top_k_teacher_scores[id].
                                shape[0]] = top_k_teacher_scores[id]
            memoized_prob_idx[idx][:top_k_teacher_indices[id].
                                   shape[0]] = top_k_teacher_indices[id]
        batched_samples["top_k_scores"] = memoized_probs
        batched_samples["top_k_indices"] = memoized_prob_idx
        return batched_samples
Example #17
 def forward(self, decoder_state, source_hids, src_lengths):
     return None, maybe_cuda(torch.zeros(1, src_lengths.shape[0]))
Example #18
    def forward(
        self,
        input_tokens,
        encoder_out,
        incremental_state=None,
        possible_translation_tokens=None,
    ):
        if input_tokens.size(1) <= 1:
            # This happens in the first time step of beam search. We return
            # flat scores, and wait for the real work in the next time step.
            bsz = input_tokens.size(0)
            return (
                utils.maybe_cuda(torch.zeros(bsz, 1, self.max_vocab_size)),
                utils.maybe_cuda(torch.zeros(bsz, 1, encoder_out[0].size(0))),
                None,
            )
        # Vocab reduction not implemented
        assert possible_translation_tokens is None
        # Fetch language IDs and remove them from input_tokens
        # Token sequences start with <GO-token> <lang-id> ...
        lang_ids = (input_tokens[:, 1] -
                    pytorch_translate_data.MULTILING_DIALECT_ID_OFFSET)
        if input_tokens.size(1) > 2:
            input_tokens = torch.cat(
                [input_tokens[:, :1], input_tokens[:, 2:]], dim=1)
        else:
            input_tokens = input_tokens[:, :1]

        bsz, seq_len = input_tokens.size()[:2]
        if incremental_state is None:
            incremental_state = {
                lang_id: None
                for lang_id in range(len(self.decoders))
            }
        else:
            seq_len = 1
        # Create tensors for collecting decoder outputs
        # +1 for language ID
        all_logits = utils.maybe_cuda(
            torch.zeros(bsz, seq_len + 1, self.max_vocab_size))
        all_attn_scores = utils.maybe_cuda(
            torch.zeros(bsz, seq_len, encoder_out[0].size(0)))
        self.last_bsz = bsz
        self.last_lang_bszs = []
        for lang_id, decoder in enumerate(self.decoders):
            if decoder is None:
                continue
            if lang_id not in incremental_state:
                incremental_state[lang_id] = {}
            indices = torch.nonzero(lang_ids == lang_id)
            lang_bsz = indices.size(0)
            self.last_lang_bszs.append(lang_bsz)
            if lang_bsz == 0:  # Language not in this batch
                for p in decoder.parameters():
                    p.grad = torch.zeros_like(p.data)
                continue
            indices = indices.squeeze(1)
            max_source_length = torch.max(encoder_out[3][indices])
            lang_encoder_out = (
                encoder_out[0][:max_source_length, indices, :],
                encoder_out[1][:, indices, :],
                encoder_out[2][:, indices, :],
                encoder_out[3][indices],
                encoder_out[4][indices, :max_source_length],
            )

            lang_logits, lang_attn_scores, _ = decoder(
                input_tokens[indices], lang_encoder_out,
                incremental_state[lang_id])
            all_attn_scores[indices, :, :max_source_length] = lang_attn_scores
            all_logits[indices, 1:, :lang_logits.size(2)] = lang_logits
        incremental_state["lang_ids"] = lang_ids
        return all_logits, all_attn_scores, None
Example #19
    def test_forward_training(self):
        """
        We test that if we shuffle the input sample, we will get the same
        forward values, both in training mode (without dropout) and in
        eval mode.
        For the meanwhile, we use an auxiliary hybrid_transformer_rnn
        in order to get the encoder output.
        """
        test_word_decoder_args = test_utils.ModelParamsDict(
            arch="hybrid_transformer_rnn")
        self.task = tasks.DictionaryHolderTask(self.word_dict, self.word_dict)
        word_model = maybe_cuda(self.task.build_model(test_word_decoder_args))
        word_model.eval()  # Make sure we do not apply dropout.

        test_args = test_utils.ModelParamsDict(arch="char_aware_hybrid")

        decoder_embed_tokens = maybe_cuda(
            transformer.build_embedding(dictionary=self.word_dict,
                                        embed_dim=10))
        decoder = maybe_cuda(
            char_aware_hybrid.CharAwareHybridRNNDecoder(
                args=test_args,
                src_dict=self.word_dict,
                dst_dict=self.word_dict,
                embed_tokens=decoder_embed_tokens,
                num_chars=len(self.char_dict),
            ))

        src_tokens = maybe_cuda(self.sample["net_input"]["src_tokens"])
        src_lengths = maybe_cuda(self.sample["net_input"]["src_lengths"])
        prev_output_chars = maybe_cuda(
            self.sample["net_input"]["prev_output_chars"][:,
                                                          -1:, :].squeeze(1))
        prev_output_tokens = maybe_cuda(
            self.sample["net_input"]["prev_output_tokens"][:, 0:1])

        encoder_out = word_model.encoder(src_tokens, src_lengths)

        embed_output = decoder._embed_prev_outputs(
            prev_output_tokens=prev_output_tokens,
            prev_output_chars=prev_output_chars)[0]
        forward_output = decoder(
            prev_output_tokens=prev_output_tokens,
            encoder_out=encoder_out,
            prev_output_chars=prev_output_chars,
        )
        output_logits = forward_output[0]

        prev_output_tokens_shuffled = torch.cat(
            [prev_output_tokens[1:], prev_output_tokens[0].unsqueeze(0)],
            dim=0)
        prev_output_chars_shuffled = torch.cat(
            [prev_output_chars[1:], prev_output_chars[0].unsqueeze(0)], dim=0)
        src_tokens_shuffled = torch.cat(
            [src_tokens[1:], src_tokens[0].unsqueeze(0)], dim=0)

        # Making sure shuffling is done correctly.
        assert torch.equal(src_tokens[0], src_tokens_shuffled[2])
        assert torch.equal(src_tokens[1], src_tokens_shuffled[0])
        assert torch.equal(src_tokens[2], src_tokens_shuffled[1])
        assert torch.equal(prev_output_chars[0], prev_output_chars_shuffled[2])
        assert torch.equal(prev_output_chars[1], prev_output_chars_shuffled[0])
        assert torch.equal(prev_output_chars[2], prev_output_chars_shuffled[1])
        assert torch.equal(prev_output_tokens[0],
                           prev_output_tokens_shuffled[2])
        assert torch.equal(prev_output_tokens[1],
                           prev_output_tokens_shuffled[0])
        assert torch.equal(prev_output_tokens[2],
                           prev_output_tokens_shuffled[1])

        # Making sure that we embed the inputs correctly.
        encoder_out_shuffled = word_model.encoder(src_tokens_shuffled,
                                                  src_lengths)
        embed_output_shuffled = decoder._embed_prev_outputs(
            prev_output_tokens=prev_output_tokens_shuffled,
            prev_output_chars=prev_output_chars_shuffled,
        )[0]
        assert embed_output[0, 0].equal(embed_output_shuffled[0, 2])
        assert embed_output[0, 1].equal(embed_output_shuffled[0, 0])
        assert embed_output[0, 2].equal(embed_output_shuffled[0, 1])

        # Making sure the output of the forward function is correct.
        forward_output_shuffled = decoder(
            prev_output_tokens=prev_output_tokens_shuffled,
            encoder_out=encoder_out_shuffled,
            prev_output_chars=prev_output_chars_shuffled,
        )
        output_logits_shuffled = forward_output_shuffled[0]

        assert encoder_out[0][:, 0, :].equal(encoder_out_shuffled[0][:, 2, :])
        assert encoder_out[0][:, 1, :].equal(encoder_out_shuffled[0][:, 0, :])
        assert encoder_out[0][:, 2, :].equal(encoder_out_shuffled[0][:, 1, :])

        assert output_logits[0].equal(output_logits_shuffled[2])
        assert output_logits[1].equal(output_logits_shuffled[0])
        assert output_logits[2].equal(output_logits_shuffled[1])
        """
        Now trying in the eval mode.
        """
        decoder.eval()
        forward_output = decoder(
            prev_output_tokens=prev_output_tokens,
            encoder_out=encoder_out,
            prev_output_chars=prev_output_chars,
        )
        output_logits = forward_output[0]
        forward_output_shuffled = decoder(
            prev_output_tokens=prev_output_tokens_shuffled,
            encoder_out=encoder_out_shuffled,
            prev_output_chars=prev_output_chars_shuffled,
        )
        output_logits_shuffled = forward_output_shuffled[0]
        assert output_logits[0].equal(output_logits_shuffled[2])
        assert output_logits[1].equal(output_logits_shuffled[0])
        assert output_logits[2].equal(output_logits_shuffled[1])
Example #20
    def predictor_loss_function(self, prediction, target, rank_weights_type="uniform"):
        """Implements the WARP loss given in [1].

        At its core, the function computes the following:
            loss = (X-1)/N*(xn_i - xp),
        where `xn_i` is confidence of the ith false positive, and `xp` is the
        true positive confidence. `X` is the total number of labels and `N` is
        the number of steps that it takes to find a false positive.
        Note: We might want to use ln((X-1)/N) in case N << X, which would
              otherwise explode the loss.

        Args:
            prediction: Prediction that was made by the model of shape
                        [BATCH_SIZE, N_LABELS]
            target: Expected result of shape [BATCH_SIZE, N_OUTPUT_TOKENS]
            rank_weights_type: Argument to set the ranks of the weights.
                              See `set_rank_weights` for more details.

        Returns:
            loss: Loss as a torch.Variable
        """
        batch_size = prediction.size()[0]
        n_labels = prediction.size()[1]
        n_output_tokens = target.size()[1]
        max_num_trials = n_labels - 1

        self.set_rank_weights(n_labels, rank_weights_type)

        loss = maybe_cuda(torch.zeros(batch_size, n_output_tokens))

        for i in range(batch_size):
            for j in range(n_output_tokens):
                target_idx = target[i, j]
                neg_labels_idx = maybe_cuda(
                    torch.tensor(
                        list(set(range(n_labels)) - set(target[i, :].cpu().numpy()))
                    )
                )
                neg_idx = torch.multinomial(neg_labels_idx.double(), 1)
                # This is the hinge loss:
                # sample_score_margin = \
                #   1 - prediction[i, target_idx] + prediction[i, neg_idx]
                # TODO:
                #   Since |- prediction[i, target_idx] + prediction[i, neg_idx]|
                #   is normally around 0.01, directly using log probability in
                #   hinge loss causes most N to be 1, thus is not a good choice.
                # Observation: translation_loss is normally ~10, similar to
                #              log_probs.
                # Alternatives: scale up score difference by 100 times to match
                #               the magnitude of 1, but we also need to consider
                #               magnitude of weights and loss;
                sample_score_margin = (
                    -prediction[i, target_idx] + prediction[i, neg_idx]
                )
                N = 1
                while sample_score_margin < 0 and N < max_num_trials:
                    neg_idx = torch.multinomial(neg_labels_idx.double(), 1)
                    N += 1
                    sample_score_margin = (
                        -prediction[i, target_idx] + prediction[i, neg_idx]
                    )

                k = torch.floor(torch.tensor(max_num_trials / N)).int()
                weights = torch.sum(self.rank_weights[:k])
                score_margins = -prediction[i, target_idx] + prediction[i, neg_idx]
                loss[i, j] = (weights * score_margins.clamp(min=0.0)).mean()
        return loss
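A worked illustration of the WARP weighting above (the numbers are hypothetical):

# With n_labels X = 11 there are max_num_trials = 10 candidate negatives and the
# uniform rank weights are all 1 / 10 = 0.1.
# If a violating negative is found on the first draw (N = 1), k = floor(10 / 1) = 10
# and the weight is sum(rank_weights[:10]) = 1.0, i.e. the full penalty applies.
# If it takes N = 5 draws, k = floor(10 / 5) = 2 and the weight is only 0.2, so
# targets that are hard to violate (already well ranked) are penalized less.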
Example #21
    def collate(
        dataset,
        teacher_models,
        top_k_teacher_tokens,
        pad_idx,
        eos_idx,
        top_k_teacher_scores: Dict[int, np.ndarray],
        top_k_teacher_indices: Dict[int, np.ndarray],
        left_pad_source=False,
    ):
        if len(dataset) == 0:
            return {}

        batched_samples = data.language_pair_dataset.collate(
            dataset, pad_idx, eos_idx, left_pad_source
        )

        # batched_samples is natively on CPU. We want to make sure that the teacher
        # models run on GPU.
        net_input = {
            key: pytorch_translate_utils.maybe_cuda(batched_samples["net_input"][key])
            for key in batched_samples["net_input"].keys()
        }

        sen_ids = batched_samples["id"].numpy()

        all_sen_ids_memoized = all(id in top_k_teacher_scores for id in sen_ids)

        if not all_sen_ids_memoized:
            teacher_output = teacher_models[0](**net_input)
            avg_teacher_probs = teacher_models[0].get_normalized_probs(
                teacher_output, log_probs=False
            )

            for i in range(1, len(teacher_models)):
                teacher_output = teacher_models[i](**net_input)
                probs = teacher_models[i].get_normalized_probs(
                    teacher_output, log_probs=False
                )
                avg_teacher_probs.add_(probs)
            avg_teacher_probs.div_(len(teacher_models))
            avg_teacher_probs = avg_teacher_probs.detach()

            # Getting the topk probabilities, masking others,
            # normalizing the topk probabilities.
            top_k_teacher_tokens_avg_probs, indices = torch.topk(
                avg_teacher_probs, k=top_k_teacher_tokens
            )
            top_k_teacher_probs_normalized = F.normalize(
                top_k_teacher_tokens_avg_probs, p=1, dim=2
            ).cpu()
            indices = indices.cpu()
            batched_samples["top_k_scores"] = top_k_teacher_probs_normalized
            batched_samples["top_k_indices"] = indices

            # Memoization
            for id_index, id in enumerate(sen_ids):
                target_length = sum(
                    (batched_samples["target"][id_index] != pad_idx).numpy()
                )
                if id not in top_k_teacher_scores:
                    top_k_teacher_scores[id] = top_k_teacher_probs_normalized[id_index][
                        :target_length, :
                    ]
                    top_k_teacher_indices[id] = indices[id_index][:target_length, :]
        else:
            # Preparing all zero scores and gradually filling them in.
            max_ntokens = batched_samples["target"].shape[1]
            memoized_probs = torch.zeros(
                len(sen_ids), max_ntokens, top_k_teacher_tokens
            )
            memoized_prob_idx = torch.zeros(
                len(sen_ids), max_ntokens, top_k_teacher_tokens
            ).long()

            for idx, id in enumerate(sen_ids):
                memoized_probs[idx][
                    : top_k_teacher_scores[id].shape[0]
                ] = top_k_teacher_scores[id]
                memoized_prob_idx[idx][
                    : top_k_teacher_indices[id].shape[0]
                ] = top_k_teacher_indices[id]
            batched_samples["top_k_scores"] = memoized_probs
            batched_samples["top_k_indices"] = memoized_prob_idx
        return batched_samples