Example #1
    def __call__(self, examples: List[Dict[str, List[int]]]) -> BatchEncoding:
        # convert list to dict and tensorize input
        batch = BatchEncoding({
            k: np.array([examples[i][k] for i in range(len(examples))])
            for k, v in examples[0].items()
        })
        batch["labels"] = batch["input_ids"].copy()
        batch["decoder_input_ids"] = shift_tokens_right(
            batch["labels"], self.tokenizer.pad_token_id,
            self.decoder_start_token_id)
        # permuting sentences
        do_permute = False
        if self.permute_sentence_ratio > 0.0:
            batch["input_ids"] = self.permute_sentences(batch["input_ids"])
            do_permute = True

        # masking span of tokens (text infilling in the paper)
        if self.mask_ratio:
            batch["input_ids"], batch["labels"] = self.span_mask_tokens(
                batch["input_ids"], batch["labels"], do_permute)

        # ignore pad tokens
        batch["attention_mask"] = (batch["input_ids"] !=
                                   self.tokenizer.pad_token_id).astype(int)
        batch["decoder_attention_mask"] = (
            batch["decoder_input_ids"] !=
            self.tokenizer.pad_token_id).astype(int)
        return batch
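The shift_tokens_right helper used above comes from the surrounding training script and is not part of this snippet; the following is a minimal NumPy sketch of its usual seq2seq behavior (an assumption mirroring the HuggingFace Flax pretraining examples), shown only to make the decoder_input_ids construction concrete.

import numpy as np

def shift_tokens_right(input_ids: np.ndarray, pad_token_id: int, decoder_start_token_id: int) -> np.ndarray:
    # Prepend the decoder start token, drop the last token, and replace any
    # -100 label placeholders with the pad id.
    shifted = np.zeros_like(input_ids)
    shifted[:, 1:] = input_ids[:, :-1]
    shifted[:, 0] = decoder_start_token_id
    return np.where(shifted == -100, pad_token_id, shifted)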
Example #2
File: inputs.py  Project: xiamenwcy/ttt
def tokenize_with_progress_bar(tokenizer, args, text_list, batch_size=1000):
    assert batch_size > 0
    encoded_return = {"input_ids": [], "attention_mask": []}
    batch = []
    # actual_max_seq = args.max_src_length
    # Tokenize in mini-batches of batch_size. Unlike tokenizing everything at once,
    # each batch here is always padded to max_src_length even when its longest sequence is shorter.
    for idx, each_text in tqdm(enumerate(text_list), desc="tokenizing by batch...", total=len(text_list)):
        if (idx + 1) % batch_size == 0:
            batch.append(each_text)
            encoded = tokenizer(batch, padding="max_length", truncation=True, max_length=args.max_src_length,
                                add_special_tokens=not args.is_pretrain)
            encoded_return["input_ids"].extend(encoded["input_ids"])
            encoded_return["attention_mask"].extend(encoded["attention_mask"])
            batch = []
        else:
            batch.append(each_text)

    if batch != []:
        encoded = tokenizer(batch, padding="max_length", truncation=True, max_length=args.max_src_length,
                            add_special_tokens=not args.is_pretrain)
        encoded_return["input_ids"].extend(encoded["input_ids"])
        encoded_return["attention_mask"].extend(encoded["attention_mask"])

    assert len(encoded_return["input_ids"]) == len(text_list)

    encoded_return["input_ids"] = np.array(encoded_return["input_ids"])

    assert len(encoded_return["attention_mask"]) == len(text_list)

    encoded_return["attention_mask"] = np.array(encoded_return["attention_mask"])

    from transformers import BatchEncoding
    return BatchEncoding(data=encoded_return)
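A hypothetical invocation of the function above; the checkpoint name and the Namespace fields are placeholders that simply mirror the attributes the function reads, not values from the xiamenwcy/ttt project.

from argparse import Namespace
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # placeholder checkpoint
args = Namespace(max_src_length=64, is_pretrain=False)
encoded = tokenize_with_progress_bar(tokenizer, args,
                                     ["first sentence", "second sentence"],
                                     batch_size=2)
print(encoded["input_ids"].shape)  # (2, 64): always padded to max_src_length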
Example #3
    def __call__(self, examples: List[Dict[str, np.ndarray]]) -> Dict[str, np.ndarray]:

        # convert list to dict and tensorize input
        batch = BatchEncoding(
            {k: np.array([examples[i][k] for i in range(len(examples))]) for k, v in examples[0].items()}
        )

        input_ids = batch["input_ids"]
        batch_size, expanded_input_length = input_ids.shape

        mask_indices = np.asarray([self.random_spans_noise_mask(expanded_input_length) for i in range(batch_size)])
        labels_mask = ~mask_indices

        input_ids_sentinel = self.create_sentinel_ids(mask_indices.astype(np.int8))
        labels_sentinel = self.create_sentinel_ids(labels_mask.astype(np.int8))

        batch["input_ids"] = self.filter_input_ids(input_ids, input_ids_sentinel)
        batch["labels"] = self.filter_input_ids(input_ids, labels_sentinel)

        if batch["input_ids"].shape[-1] != self.input_length:
            raise ValueError(
                f"`input_ids` are incorrectly preprocessed. `input_ids` length is {batch['input_ids'].shape[-1]}, but should be {self.target_length}."
            )

        if batch["labels"].shape[-1] != self.target_length:
            raise ValueError(
                f"`labels` are incorrectly preprocessed. `labels` length is {batch['labels'].shape[-1]}, but should be {self.target_length}."
            )

        # to check that tokens are correctly preprocessed, one can run `self.tokenizer.batch_decode(input_ids)` and `self.tokenizer.batch_decode(labels)` here...
        batch["decoder_input_ids"] = shift_tokens_right(
            batch["labels"], self.pad_token_id, self.decoder_start_token_id
        )

        return batch
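As a rough illustration of the layout the span masking above produces (the strings below are made up for illustration, not generated by this collator): every masked span in the inputs is replaced by a sentinel token, and the labels list the sentinels followed by the removed spans, closed by a final sentinel.

# Illustration only; not produced by the collator above.
original     = "the quick brown fox jumps over the lazy dog"
masked_input = "the quick <extra_id_0> jumps over <extra_id_1> dog"
labels       = "<extra_id_0> brown fox <extra_id_1> the lazy <extra_id_2>"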
Example #4
    def _generative_step(self, batch: dict) -> dict:
        start_time = time.time()
        batch = BatchEncoding(batch).to(device=self.model.device)
        generated_ids = self.model.generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            do_deduplication=False,  # rag specific parameter
            use_cache=True,
            min_length=1,
            max_length=self.target_lens["val"],
        )
        gen_time = (time.time() - start_time) / batch["input_ids"].shape[0]
        preds: List[str] = self.ids_to_clean_text(generated_ids)
        target: List[str] = self.ids_to_clean_text(batch["decoder_input_ids"])
        # print(preds,target)
        loss_tensors = self._step(batch)
        base_metrics = {
            name: loss
            for name, loss in zip(self.loss_names, loss_tensors)
        }
        gen_metrics: Dict = self.calc_generative_metrics(preds, target)

        summ_len = np.mean(lmap(len, generated_ids))
        base_metrics.update(gen_time=gen_time,
                            gen_len=summ_len,
                            preds=preds,
                            target=target,
                            **gen_metrics)
        return base_metrics
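The ids_to_clean_text helper is not shown in this example; a plausible implementation (an assumption, following the pattern of the HuggingFace seq2seq fine-tuning scripts) would decode and strip the generated ids:

    def ids_to_clean_text(self, generated_ids):
        # Decode ids back to text; skip_special_tokens drops pad/eos markers so
        # the generative metrics compare clean strings.
        gen_text = self.tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        return lmap(str.strip, gen_text)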
Example #5
    def call(self,
             inputs: BatchEncoding,
             training=None,
             mask: tf.Tensor = None):
        # 1. encoding by bert module
        labels = inputs.pop('labels')
        bert_outputs: TFBaseModelOutputWithPooling = self.bert(inputs)

        # 1.1 get the sequence output (last hidden state) => (batch_size, sequence_length, hidden_size)
        # eg: (64, 128, 768)
        output: tf.Tensor = bert_outputs.last_hidden_state

        # 1.2 the token mask is supplied by the caller via the `mask` argument

        # 2. log_likelihood by crf
        sequence_lengths = tf.cast(tf.reduce_sum(mask, axis=-1), tf.int32)
        log_likelihood, trans = tfa.text.crf_log_likelihood(
            inputs=output,
            tag_indices=labels,
            transition_params=self.crf_transition,
            sequence_lengths=sequence_lengths)
        loss = tf.reduce_mean(-log_likelihood)

        predicted_ids, _ = tfa.text.crf_decode(
            potentials=output,
            transition_params=trans,
            sequence_length=sequence_lengths)
        if training:
            return loss
        return predicted_ids
Example #6
    def weight_inputs(self, inputs: BatchEncoding) -> List[float]:
        device = self.transformer.model.device
        all_attentions = self.transformer.model(**inputs.to(device),
                                                output_attentions=True)[-1]
        weights = self._aggregate_attentions(all_attentions, self.heads,
                                             self.agg_strategy).detach().cpu()
        if self.normalize_weights:
            norm = torch.linalg.norm(weights, ord=2)
            weights = torch.tensor(weights) / torch.max(
                norm, 1e-10 * torch.ones_like(norm))

        return weights.detach().cpu().numpy().tolist()
Example #7
def test_with_mock_objects(k):
    sequence = "Hello [MASK]"
    vocab_size = 1000

    data = {"input_ids": torch.tensor([[101, 555, 103, 102]])}
    be = BatchEncoding(data=data)

    logits = torch.rand(1, 4, vocab_size)

    tokenizer_m = Mock(spec=BertTokenizerFast,
                       return_value=be,
                       mask_token_id=103)
    model_m = Mock(spec=BertForMaskedLM)
    model_m.return_value.logits = logits

    res = get_top_k(sequence, tokenizer_m, model_m, k=k)

    assert isinstance(res, torch.Tensor)
    assert res.shape == (k, )
Example #8
    def test_batch_encoding_with_labels_tf(self):
        batch = BatchEncoding({
            "inputs": [[1, 2, 3], [4, 5, 6]],
            "labels": [0, 1]
        })
        tensor_batch = batch.convert_to_tensors(tensor_type="tf")
        self.assertEqual(tensor_batch["inputs"].shape, (2, 3))
        self.assertEqual(tensor_batch["labels"].shape, (2, ))

        batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0})
        tensor_batch = batch.convert_to_tensors(tensor_type="tf",
                                                prepend_batch_axis=True)
        self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
        self.assertEqual(tensor_batch["labels"].shape, (1, ))
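The same conversion also works for plain NumPy; a small companion sketch (not part of the test above):

from transformers import BatchEncoding

batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
np_batch = batch.convert_to_tensors(tensor_type="np")
print(np_batch["inputs"].shape)  # (2, 3)
print(np_batch["labels"].shape)  # (2,)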
Example #9
    def _generative_step(self, batch: dict) -> dict:

        # batch['decoder_input_ids'] = batch['labels']
        # del batch['labels']

        start_time = time.time()
        batch = BatchEncoding(batch).to(device=self.model.device)

        generated_ids = self.model.generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            use_cache=True,
            min_length=1,
            max_length=self.target_lens["val"],
        )

        gen_time = (time.time() - start_time) / batch["input_ids"].shape[0]

        preds: List[str] = self.ids_to_clean_text(generated_ids)

        label = batch["decoder_input_ids"].cpu().detach().numpy()

        label = np.where(label != -100, label, self.tokenizer.pad_token_id)
        target: List[str] = self.ids_to_clean_text(label)

        loss_tensors = self._step(batch)

        base_metrics = {
            name: loss
            for name, loss in zip(self.loss_names, loss_tensors)
        }
        gen_metrics: Dict = self.calc_generative_metrics(preds, target)

        summ_len = np.mean(lmap(len, generated_ids))

        base_metrics.update(gen_time=gen_time,
                            gen_len=summ_len,
                            preds=preds,
                            target=target,
                            **gen_metrics)

        return base_metrics
Example #10
    def test_batch_encoding_with_labels_jax(self):
        batch = BatchEncoding({
            "inputs": [[1, 2, 3], [4, 5, 6]],
            "labels": [0, 1]
        })
        tensor_batch = batch.convert_to_tensors(tensor_type="jax")
        self.assertEqual(tensor_batch["inputs"].shape, (2, 3))
        self.assertEqual(tensor_batch["labels"].shape, (2, ))
        # test converting the converted
        with CaptureStderr() as cs:
            tensor_batch = batch.convert_to_tensors(tensor_type="jax")
        self.assertFalse(len(cs.err),
                         msg=f"should have no warning, but got {cs.err}")

        batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0})
        tensor_batch = batch.convert_to_tensors(tensor_type="jax",
                                                prepend_batch_axis=True)
        self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
        self.assertEqual(tensor_batch["labels"].shape, (1, ))
Example #11
    def assert_dump_and_restore(self,
                                be_original: BatchEncoding,
                                equal_op: Optional[Callable] = None):
        batch_encoding_str = pickle.dumps(be_original)
        self.assertIsNotNone(batch_encoding_str)

        be_restored = pickle.loads(batch_encoding_str)

        # Ensure is_fast is correctly restored
        self.assertEqual(be_restored.is_fast, be_original.is_fast)

        # Ensure encodings are potentially correctly restored
        if be_original.is_fast:
            self.assertIsNotNone(be_restored.encodings)
        else:
            self.assertIsNone(be_restored.encodings)

        # Ensure the values are the same
        for original_v, restored_v in zip(be_original.values(),
                                          be_restored.values()):
            if equal_op:
                self.assertTrue(equal_op(restored_v, original_v))
            else:
                self.assertEqual(restored_v, original_v)
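A short usage sketch of the property this test covers (the checkpoint name is a placeholder): because a BatchEncoding survives a pickle round trip, tokenized inputs can be cached to disk between runs.

import pickle
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
be = tokenizer(["hello world"], return_tensors="np")
restored = pickle.loads(pickle.dumps(be))
assert restored["input_ids"].tolist() == be["input_ids"].tolist()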
Example #12
    def build_scatter_offsets(
        self,
        model_inputs: BatchEncoding,
        return_tensors: bool = True,
        there_is_text_pair: bool = False,
    ) -> Tuple:
        """
        Build the offset tensor for the batch of inputs.

        Args:
            model_inputs (:obj:`BatchEncoding`):
                The inputs to the transformer model.
            return_tensors (:obj:`bool`, optional, defaults to :obj:`True`):
                If :obj:`True`, the output is converted to :obj:`torch.Tensor`.
            there_is_text_pair (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True`, `text_pair` is not None.

        Returns:
            :obj:`Tuple`: The offsets of the sub-tokens (:obj:`List[List[int]]` or :obj:`torch.Tensor`) and the per-sample sentence lengths.
        """
        # output data structure
        offsets = []
        sentence_lengths = []
        # model_inputs should be the output of the HuggingFace tokenizer
        # it contains the word offsets to reconstruct the original tokens from the
        # sub-tokens
        for batch_index in range(len(model_inputs.input_ids)):
            word_ids = model_inputs.word_ids(batch_index)
            # it is slightly different from what we need, so here we make it compatible
            # with our subword pooling strategy
            # if the first token is a special token, we need to take it into account
            if self.has_starting_token:
                word_offsets = [0] + [
                    w + 1 if w is not None else w for w in word_ids[1:]
                ]
            # otherwise, we can just use word_ids as is
            else:
                word_offsets = word_ids
            # here we retrieve the max offset for the sample, which will be used as SEP offset
            # and also as padding value for the offsets
            sep_offset_value = max([w for w in word_offsets if w is not None
                                    ]) + 1
            # replace first None occurrence with sep_offset
            sep_index = word_offsets.index(None)
            word_offsets[sep_index] = sep_offset_value
            # if there is a text pair, we need to adjust the offsets for the second text
            if there_is_text_pair:
                # some models have two SEP tokens in between the two texts
                if self.has_double_sep:
                    sep_index += 1
                    sep_offset_value += 1
                    word_offsets[sep_index] = sep_offset_value
                # keep the first offsets as is, adjust the second ones
                word_offsets = word_offsets[:sep_index + 1] + [
                    w + sep_offset_value if w is not None else w
                    for w in word_offsets[sep_index + 1:]
                ]
                # update again the sep_offset
                sep_offset_value = max(
                    [w for w in word_offsets if w is not None]) + 1
                # replace first None occurrence with sep_offset, it should be the last SEP
                sep_index = word_offsets.index(None)
                word_offsets[sep_index] = sep_offset_value
            # keep track of the maximum offset for padding
            offsets.append(word_offsets)
            sentence_lengths.append(sep_offset_value + 1)
        # replace remaining None occurrences with -1
        # the remaining None occurrences are the padding values
        offsets = [[o if o is not None else -1 for o in offset]
                   for offset in offsets]
        # if return_tensor is True, we need to convert the offsets to tensors
        if return_tensors:
            offsets = torch.as_tensor(offsets)
        return offsets, sentence_lengths
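The offsets above are derived from BatchEncoding.word_ids(), which maps each sub-token to the index of the word it came from, with None for special tokens. A minimal sketch with a placeholder checkpoint:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
enc = tokenizer("a counterintuitive example", return_tensors="pt")
# Something like [None, 0, 1, 1, 1, 2, None]: [CLS]/[SEP] map to None and all
# sub-tokens of "counterintuitive" share word index 1 (exact split depends on the vocab).
print(enc.word_ids(0))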
Example #13
    def __call__(self,
        input_text: Union[List[str], List[List[str]]],
        *,
        use_delim: bool = False,
        delim_set: Optional[str] = ',,。::;;!!??',
        batch_size: int = 256,
        max_length: Optional[int] = None,
        show_progress: bool = True,
    ):
        """Call the driver.

        Parameters
        ----------
            input_text : ``List[str]`` or ``List[List[str]]``
                The input sentences. Each sentence is a string or a list of strings.
            use_delim : ``bool``, *optional*, defaults to False
                Segment sentence (internally) using ``delim_set``.
            delim_set : `str`, *optional*, defaults to ``',,。::;;!!??'``
                Used for sentence segmentation if ``use_delim=True``.
            batch_size : ``int``, *optional*, defaults to 256
                The size of mini-batch.
            max_length : ``int``, *optional*
                The maximum length of the sentence,
                must not be longer than the maximum sequence length for this model (i.e. ``tokenizer.model_max_length``).
            show_progress : ``bool``, *optional*, defaults to True
                Show progress bar.
        """

        model_max_length = self.tokenizer.model_max_length - 2  # Reserve room for [CLS] and [SEP]
        if max_length:
            assert max_length < model_max_length, \
                'Sequence length is longer than the maximum sequence length for this model ' \
               f'({max_length} > {model_max_length}).'
        else:
            max_length = model_max_length

        # Apply delimiter cut
        delim_index = self._find_delim(
            input_text=input_text,
            use_delim=use_delim,
            delim_set=delim_set,
        )

        # Get worded input IDs
        if show_progress:
            input_text = tqdm(input_text, desc='Tokenization')

        input_ids_worded = [
            [
                self.tokenizer.convert_tokens_to_ids(list(input_word)) for input_word in input_sent
            ] for input_sent in input_text
        ]

        # Flatten input IDs
        (
            input_ids,
            index_map,
        ) = self._flatten_input_ids(
            input_ids_worded=input_ids_worded,
            max_length=max_length,
            delim_index=delim_index,
        )

        # Pad and segment input IDs
        (
            input_ids,
            attention_mask,
        ) = self._pad_input_ids(
            input_ids=input_ids,
        )

        # Convert input format
        encoded_input = BatchEncoding(
            data=dict(
                input_ids=input_ids,
                attention_mask=attention_mask,
            ),
            tensor_type='pt',
        )

        # Create dataset
        dataset = TensorDataset(*encoded_input.values())
        dataloader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=False,
            drop_last=False,
            pin_memory=True,
        )
        if show_progress:
            dataloader = tqdm(dataloader, desc='Inference')

        # Call Model
        logits = []
        with torch.no_grad():
            for batch in dataloader:
                batch = tuple(tensor.to(self.device) for tensor in batch)
                (
                    batch_logits,
                ) = self.model(**dict(zip(encoded_input.keys(), batch)), return_dict=False)
                batch_logits = batch_logits.cpu().numpy()[:, 1:, :]  # Remove [CLS]
                logits.append(batch_logits)

        # Concatenate logits from all batches
        logits = np.concatenate(logits, axis=0)

        return logits, index_map
Example #14
    def embed_inputs(self, inputs: BatchEncoding) -> List[List[List[float]]]:
        device = self.transformer.model.device
        outputs = self.transformer.model(**inputs.to(device),
                                         output_hidden_states=True)[-1]
        embeddings_t = self._embedings_from_outputs(outputs)
        return embeddings_t.detach().cpu().tolist()