Example #1
    def preprocess_line(self, line: list) -> list:
        # Tokenize the sentence pair stored in columns 3 and 4 of the input line
        results = self.tokenizer(
            line[3],
            line[4],
            padding=self.hyperparameters.padding,
            max_length=self.hyperparameters.max_length,
            truncation=True,
        )
        # Mark continuation sub-tokens so whole-word masking can be applied later
        results['words_tails'] = whole_word_tails_mask(
            results['input_ids'], tokenizer=self.tokenizer)
        results['ids'] = line[0]
        results['labels'] = line[5]
        return results
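
All of the examples on this page revolve around whole_word_tails_mask, whose implementation is not shown here. A minimal sketch of the assumed behaviour for a WordPiece tokenizer (the function name and the "##" check below are illustrative assumptions, not the actual library code):

import torch

def whole_word_tails_mask_sketch(input_ids: torch.Tensor, tokenizer) -> torch.Tensor:
    # True for continuation pieces ("##..." in WordPiece), False for word heads and
    # special tokens, so masking decisions can be taken once per whole word.
    tokens = [tokenizer.convert_ids_to_tokens(row) for row in input_ids.tolist()]
    return torch.tensor(
        [[piece.startswith("##") for piece in row] for row in tokens],
        dtype=torch.bool,
    )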
Example #2
def test_language_model(seed, sentence, masking):

    seed_everything(seed)

    input_ids = torch.tensor([tok.encode(sentence)])
    words_tails_mask = whole_word_tails_mask(input_ids, tok)

    original = input_ids.clone()

    masked, labels = mlm(input_ids, words_tails=words_tails_mask)

    assert torch.all(torch.where(labels != IGNORE_IDX, labels, masked).eq(original))

    labels = labels.tolist()[0]
    assert labels == masking, f"{labels} different from {masking}"
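
A toy illustration (separate from the test) of why the first assert above holds: at masked positions labels stores the original id, everywhere else the input is untouched, so torch.where merges the two tensors back into the original sequence. IGNORE_IDX = -100 and the token ids are made-up values for the sketch.

import torch

IGNORE_IDX = -100                             # assumed ignore value
original = torch.tensor([[5, 6, 7, 8]])
masked = torch.tensor([[5, 103, 7, 8]])       # token 6 replaced by a [MASK] id
labels = torch.tensor([[IGNORE_IDX, 6, IGNORE_IDX, IGNORE_IDX]])

restored = torch.where(labels != IGNORE_IDX, labels, masked)
assert torch.all(restored.eq(original))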
Example #3

    def __call__(self,
                 inputs: torch.Tensor,
                 words_tails: torch.Tensor = None) -> Tuple[torch.LongTensor, torch.LongTensor]:

        device = inputs.device
        inputs = inputs.clone()
        # 0 = token kept; positions that get substituted are set to 1 below
        labels = torch.full(inputs.shape, fill_value=0, dtype=torch.long, device=device)

        # We sample a few tokens in each sequence to substitute (with probability self.probability; BERT/RoBERTa default to 0.15)
        probability_matrix = torch.full(inputs.shape, fill_value=self.probability, dtype=torch.float32, device=device)

        # create the whole-word masking mask -> True if the token starts with ## (i.e. it continues a composed word)
        if words_tails is None and self.whole_word_swapping:
            words_tails = whole_word_tails_mask(inputs, self.tokenizer, device=device)

        if self.whole_word_swapping:
            # with whole-word masking, only the first piece of each word keeps a non-zero probability; its tail pieces are forced to follow it below
            probability_matrix.masked_fill_(words_tails, value=0.0)

        # do not substitute the special tokens of the LM (BERT, RoBERTa, ...)
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in inputs.tolist()
        ]
        special_tokens_mask_tensor = torch.tensor(special_tokens_mask, dtype=torch.bool, device=device)
        probability_matrix.masked_fill_(special_tokens_mask_tensor, value=0.0)
        labels.masked_fill_(special_tokens_mask_tensor, value=IGNORE_IDX)

        # no need to substitute padding tokens, so assign them probability 0.0
        if self.tokenizer._pad_token is not None:
            padding_mask = inputs.eq(self.tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
            labels.masked_fill_(padding_mask, value=IGNORE_IDX)

        substituted_indices = torch.bernoulli(probability_matrix).bool()

        # with whole-word masking, ensure all tokens of a word are either all substituted or all kept
        if self.whole_word_swapping:
            for i in range(1, substituted_indices.shape[-1]):
                substituted_indices[:, i] = substituted_indices[:, i] | (
                    substituted_indices[:, i - 1] & words_tails[:, i]
                )

        # draw random replacement ids and mark the substituted positions with label 1
        random_words = torch.randint(len(self.tokenizer), inputs.shape, dtype=torch.long, device=device)
        inputs[substituted_indices] = random_words[substituted_indices]
        labels.masked_fill_(substituted_indices, value=1)

        return inputs, labels
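
A toy check (illustrative only) of the propagation loop used in both __call__ implementations on this page: once the head piece of a word is selected, every tail piece following it is selected as well, so whole words are always substituted or masked together.

import torch

substituted = torch.tensor([[True, False, False, True, False]])
words_tails = torch.tensor([[False, True, True, False, False]])  # pieces 1-2 continue piece 0

for i in range(1, substituted.shape[-1]):
    substituted[:, i] = substituted[:, i] | (substituted[:, i - 1] & words_tails[:, i])

assert substituted.tolist() == [[True, True, True, True, False]]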
Example #4
def test_language_model(seed, sentence, masking, new_ids):

    seed_everything(seed)

    input_ids = torch.tensor([tok.encode(sentence)])
    words_tails_mask = whole_word_tails_mask(input_ids, tok)

    original = input_ids.clone()

    swapped, labels = rts(input_ids, words_tails=words_tails_mask)

    # positions labelled 1 must have changed, all other positions must be intact
    assert torch.all(torch.eq(swapped[labels != 1], original[labels != 1]))
    assert torch.all(torch.ne(swapped[labels == 1], original[labels == 1]))

    labels = labels.tolist()[0]
    swapped = swapped.tolist()[0]

    assert swapped == new_ids, f"{swapped} different from {new_ids}"
    assert labels == masking, f"{labels} different from {masking}"
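
For the random token substitution objective the labels are binary: 1 where a token was swapped, 0 where it was kept (and IGNORE_IDX on special and padding tokens). A small made-up example of the partition the two asserts above verify:

import torch

original = torch.tensor([[5, 6, 7, 8]])
swapped = torch.tensor([[5, 42, 7, 8]])       # token 6 swapped for a random id
labels = torch.tensor([[0, 1, 0, 0]])

assert torch.all(torch.eq(swapped[labels != 1], original[labels != 1]))
assert torch.all(torch.ne(swapped[labels == 1], original[labels == 1]))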
Example #5

    def __call__(
        self,
        inputs: torch.Tensor,
        words_tails: torch.Tensor = None
    ) -> Tuple[torch.LongTensor, torch.LongTensor]:

        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token, which is necessary for masked language modeling."
            )

        device = inputs.device
        labels = inputs.clone()
        inputs = inputs.clone()

        # We sample a few tokens in each sequence for masked-LM training (with probability self.probability; BERT/RoBERTa default to 0.15)
        probability_matrix = torch.full(labels.shape,
                                        fill_value=self.probability,
                                        dtype=torch.float32,
                                        device=device)

        # create the whole-word masking mask -> True if the token starts with ## (i.e. it continues a composed word)
        if words_tails is None and self.whole_word_masking:
            words_tails = whole_word_tails_mask(inputs,
                                                self.tokenizer,
                                                device=device)

        if self.whole_word_masking:
            # with whole-word masking, only the first piece of each word keeps a non-zero probability; its tail pieces are forced to follow it below
            probability_matrix.masked_fill_(words_tails, value=0.0)

        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(
                val, already_has_special_tokens=True)
            for val in labels.tolist()
        ]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask,
                                                     dtype=torch.bool,
                                                     device=device),
                                        value=0.0)

        if self.tokenizer._pad_token is not None:
            padding_mask = labels.eq(self.tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()

        # with whole-word masking, ensure all tokens of a word are either all masked or all kept
        if self.whole_word_masking:
            for i in range(1, masked_indices.shape[-1]):
                masked_indices[:, i] = masked_indices[:, i] | (
                    masked_indices[:, i - 1] & words_tails[:, i])

        # We only compute loss on masked tokens
        labels[~masked_indices] = IGNORE_IDX

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(
            torch.full(labels.shape, self.probability_masked,
                       device=device)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.mask_token_id

        probability_replaced_relative = self.probability_replaced / (
            1 - self.probability_masked)
        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(
            torch.full(
                labels.shape, probability_replaced_relative,
                device=device)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer),
                                     labels.shape,
                                     dtype=torch.long,
                                     device=device)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged

        return inputs, labels
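
The 80/10/10 split is produced by two sequential Bernoulli draws, so the second draw has to use a probability relative to the selected tokens that were not already replaced by [MASK]. A quick sanity check with the usual BERT defaults (0.8 and 0.1 are assumed values here; the class reads them from its own attributes):

probability_masked = 0.8      # assumed: 80% of selected tokens -> [MASK]
probability_replaced = 0.1    # assumed: 10% of selected tokens -> random token

# The second draw only sees the 20% of selected tokens not yet masked, so it needs
# probability 0.1 / (1 - 0.8) = 0.5 to end up replacing 10% of the total selection.
probability_replaced_relative = probability_replaced / (1 - probability_masked)
assert abs(probability_replaced_relative - 0.5) < 1e-12
# The remaining selected tokens (the final 10%) are left unchanged.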