def __call__(self, examples: List[Dict[str, List[int]]]) -> BatchEncoding:
    # convert list to dict and tensorize input
    batch = BatchEncoding({
        k: np.array([examples[i][k] for i in range(len(examples))])
        for k in examples[0]
    })
    batch["labels"] = batch["input_ids"].copy()
    batch["decoder_input_ids"] = shift_tokens_right(
        batch["labels"], self.tokenizer.pad_token_id, self.decoder_start_token_id)

    # permute sentences
    do_permute = False
    if self.permute_sentence_ratio > 0.0:
        batch["input_ids"] = self.permute_sentences(batch["input_ids"])
        do_permute = True

    # mask spans of tokens (text infilling in the paper)
    if self.mask_ratio:
        batch["input_ids"], batch["labels"] = self.span_mask_tokens(
            batch["input_ids"], batch["labels"], do_permute)

    # ignore pad tokens
    batch["attention_mask"] = (batch["input_ids"] != self.tokenizer.pad_token_id).astype(int)
    batch["decoder_attention_mask"] = (
        batch["decoder_input_ids"] != self.tokenizer.pad_token_id).astype(int)
    return batch
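# For reference, a minimal numpy sketch of the `shift_tokens_right` helper the
# collator above relies on (assumed to follow the usual seq2seq convention:
# prepend the decoder start token and replace any -100 label padding with the
# real pad token):
def shift_tokens_right(input_ids: np.ndarray, pad_token_id: int, decoder_start_token_id: int) -> np.ndarray:
    shifted_input_ids = np.zeros_like(input_ids)
    shifted_input_ids[:, 1:] = input_ids[:, :-1]
    shifted_input_ids[:, 0] = decoder_start_token_id
    # labels may use -100 as padding; the decoder must see real pad tokens instead
    return np.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)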
def tokenize_with_progress_bar(tokenizer, args, text_list, batch_size=1000):
    assert batch_size > 0
    encoded_return = {"input_ids": [], "attention_mask": []}
    batch = []
    # actual_max_seq = args.max_src_length
    # Tokenize in mini-batches of `batch_size`. Padding differs slightly from
    # tokenizing everything at once: every mini-batch is padded to
    # `max_src_length`, even when its longest sequence is shorter.
    for idx, each_text in tqdm(enumerate(text_list), desc="tokenizing by batch...", total=len(text_list)):
        batch.append(each_text)
        if (idx + 1) % batch_size == 0:
            encoded = tokenizer(batch,
                                padding="max_length",
                                truncation=True,
                                max_length=args.max_src_length,
                                add_special_tokens=not args.is_pretrain)
            encoded_return["input_ids"].extend(encoded["input_ids"])
            encoded_return["attention_mask"].extend(encoded["attention_mask"])
            batch = []
    # flush the final partial batch
    if batch:
        encoded = tokenizer(batch,
                            padding="max_length",
                            truncation=True,
                            max_length=args.max_src_length,
                            add_special_tokens=not args.is_pretrain)
        encoded_return["input_ids"].extend(encoded["input_ids"])
        encoded_return["attention_mask"].extend(encoded["attention_mask"])

    assert len(encoded_return["input_ids"]) == len(text_list)
    encoded_return["input_ids"] = np.array(encoded_return["input_ids"])
    assert len(encoded_return["attention_mask"]) == len(text_list)
    encoded_return["attention_mask"] = np.array(encoded_return["attention_mask"])

    from transformers import BatchEncoding
    return BatchEncoding(data=encoded_return)
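# Usage sketch for `tokenize_with_progress_bar`; the `args` namespace and the
# model name are illustrative assumptions, not part of the original code:
from argparse import Namespace
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
args = Namespace(max_src_length=128, is_pretrain=False)
encoded = tokenize_with_progress_bar(tokenizer, args, ["first text", "second text"])
assert encoded["input_ids"].shape == (2, 128)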
def __call__(self, examples: List[Dict[str, np.ndarray]]) -> Dict[str, np.ndarray]:
    # convert list to dict and tensorize input
    batch = BatchEncoding(
        {k: np.array([examples[i][k] for i in range(len(examples))]) for k in examples[0]}
    )

    input_ids = batch["input_ids"]
    batch_size, expanded_input_length = input_ids.shape

    mask_indices = np.asarray([self.random_spans_noise_mask(expanded_input_length) for _ in range(batch_size)])
    labels_mask = ~mask_indices

    input_ids_sentinel = self.create_sentinel_ids(mask_indices.astype(np.int8))
    labels_sentinel = self.create_sentinel_ids(labels_mask.astype(np.int8))

    batch["input_ids"] = self.filter_input_ids(input_ids, input_ids_sentinel)
    batch["labels"] = self.filter_input_ids(input_ids, labels_sentinel)

    if batch["input_ids"].shape[-1] != self.input_length:
        raise ValueError(
            f"`input_ids` are incorrectly preprocessed. `input_ids` length is {batch['input_ids'].shape[-1]}, but should be {self.input_length}."
        )

    if batch["labels"].shape[-1] != self.target_length:
        raise ValueError(
            f"`labels` are incorrectly preprocessed. `labels` length is {batch['labels'].shape[-1]}, but should be {self.target_length}."
        )

    # to check that tokens are correctly preprocessed, one can run
    # `self.tokenizer.batch_decode(input_ids)` and `self.tokenizer.batch_decode(labels)` here
    batch["decoder_input_ids"] = shift_tokens_right(
        batch["labels"], self.pad_token_id, self.decoder_start_token_id
    )

    return batch
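# A toy illustration of the span-corruption layout the collator above produces
# (sentinel placement per the T5 scheme; token names are illustrative):
#   noise mask          : [ 0   1   1   0   0   1 ]
#   original tokens     : [ t1  t2  t3  t4  t5  t6 ]
#   batch["input_ids"]  : [ t1  <extra_id_0>  t4  t5  <extra_id_1>  </s> ]
#   batch["labels"]     : [ <extra_id_0>  t2  t3  <extra_id_1>  t6  </s> ]
# Each masked span collapses to a single sentinel in the inputs, and the labels
# list the sentinels followed by the tokens they replaced.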
def _generative_step(self, batch: dict) -> dict:
    start_time = time.time()
    batch = BatchEncoding(batch).to(device=self.model.device)
    generated_ids = self.model.generate(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],
        do_deduplication=False,  # rag-specific parameter
        use_cache=True,
        min_length=1,
        max_length=self.target_lens["val"],
    )
    gen_time = (time.time() - start_time) / batch["input_ids"].shape[0]
    preds: List[str] = self.ids_to_clean_text(generated_ids)
    target: List[str] = self.ids_to_clean_text(batch["decoder_input_ids"])
    loss_tensors = self._step(batch)
    base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
    gen_metrics: Dict = self.calc_generative_metrics(preds, target)
    summ_len = np.mean(lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **gen_metrics)
    return base_metrics
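# For reference, the tiny `lmap` helper used above; this matches its standard
# definition in the HuggingFace seq2seq example utilities:
def lmap(f, x):
    """list(map(f, x))"""
    return list(map(f, x))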
def call(self, inputs: BatchEncoding, training=None, mask: tf.Tensor = None):
    # 1. encode with the BERT module
    labels = inputs.pop('labels')
    bert_outputs: TFBaseModelOutputWithPooling = self.bert(inputs)

    # 1.1 last hidden state => (batch_size, sequence_length, hidden_size),
    # e.g. (64, 128, 768)
    output: tf.Tensor = bert_outputs.last_hidden_state

    # 1.2 compute per-sequence lengths from the mask
    sequence_lengths = tf.cast(tf.reduce_sum(mask, axis=-1), tf.int32)

    # 2. log-likelihood from the CRF; the hidden states are used directly as
    # CRF potentials, which presumes their last dimension matches the tag set
    log_likelihood, trans = tfa.text.crf_log_likelihood(
        inputs=output,
        tag_indices=labels,
        transition_params=self.crf_transition,
        sequence_lengths=sequence_lengths)
    loss = tf.reduce_mean(-log_likelihood)

    predicted_ids, _ = tfa.text.crf_decode(
        potentials=output,
        transition_params=trans,
        sequence_length=sequence_lengths)

    if training:
        return loss
    return predicted_ids
def weight_inputs(self, inputs: BatchEncoding) -> List[float]:
    device = self.transformer.model.device
    all_attentions = self.transformer.model(**inputs.to(device), output_attentions=True)[-1]
    weights = self._aggregate_attentions(all_attentions, self.heads, self.agg_strategy).detach().cpu()
    if self.normalize_weights:
        # L2-normalize, guarding against division by (near-)zero norms;
        # `weights` is already a detached CPU tensor, so no re-wrapping is needed
        norm = torch.linalg.norm(weights, ord=2)
        weights = weights / torch.max(norm, 1e-10 * torch.ones_like(norm))
    return weights.detach().cpu().numpy().tolist()
def test_with_mock_objects(k):
    sequence = "Hello [MASK]"
    vocab_size = 1000
    data = {"input_ids": torch.tensor([[101, 555, 103, 102]])}
    be = BatchEncoding(data=data)
    logits = torch.rand(1, 4, vocab_size)
    tokenizer_m = Mock(spec=BertTokenizerFast, return_value=be, mask_token_id=103)
    model_m = Mock(spec=BertForMaskedLM)
    model_m.return_value.logits = logits
    res = get_top_k(sequence, tokenizer_m, model_m, k=k)
    assert isinstance(res, torch.Tensor)
    assert res.shape == (k,)
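# A minimal sketch of the `get_top_k` function this test exercises; its
# behavior is inferred from the mocks and assertions (return the top-k token
# ids at the [MASK] position), so treat it as an assumption rather than the
# actual implementation:
def get_top_k(sequence, tokenizer, model, k=5):
    inputs = tokenizer(sequence, return_tensors="pt")
    logits = model(**inputs).logits
    # locate the [MASK] position and rank the vocabulary logits there
    mask_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    return torch.topk(logits[0, mask_index[0]], k).indices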
def test_batch_encoding_with_labels_tf(self):
    batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
    tensor_batch = batch.convert_to_tensors(tensor_type="tf")
    self.assertEqual(tensor_batch["inputs"].shape, (2, 3))
    self.assertEqual(tensor_batch["labels"].shape, (2,))

    batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0})
    tensor_batch = batch.convert_to_tensors(tensor_type="tf", prepend_batch_axis=True)
    self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
    self.assertEqual(tensor_batch["labels"].shape, (1,))
def _generative_step(self, batch: dict) -> dict:
    start_time = time.time()
    batch = BatchEncoding(batch).to(device=self.model.device)
    generated_ids = self.model.generate(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],
        use_cache=True,
        min_length=1,
        max_length=self.target_lens["val"],
    )
    gen_time = (time.time() - start_time) / batch["input_ids"].shape[0]
    preds: List[str] = self.ids_to_clean_text(generated_ids)
    # replace label padding (-100) with the real pad token id before decoding
    label = batch["decoder_input_ids"].cpu().detach().numpy()
    label = np.where(label != -100, label, self.tokenizer.pad_token_id)
    target: List[str] = self.ids_to_clean_text(label)
    loss_tensors = self._step(batch)
    base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
    gen_metrics: Dict = self.calc_generative_metrics(preds, target)
    summ_len = np.mean(lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **gen_metrics)
    return base_metrics
def test_batch_encoding_with_labels_jax(self):
    batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
    tensor_batch = batch.convert_to_tensors(tensor_type="jax")
    self.assertEqual(tensor_batch["inputs"].shape, (2, 3))
    self.assertEqual(tensor_batch["labels"].shape, (2,))

    # test converting the already-converted batch
    with CaptureStderr() as cs:
        tensor_batch = batch.convert_to_tensors(tensor_type="jax")
    self.assertFalse(len(cs.err), msg=f"should have no warning, but got {cs.err}")

    batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0})
    tensor_batch = batch.convert_to_tensors(tensor_type="jax", prepend_batch_axis=True)
    self.assertEqual(tensor_batch["inputs"].shape, (1, 3))
    self.assertEqual(tensor_batch["labels"].shape, (1,))
def assert_dump_and_restore(self, be_original: BatchEncoding, equal_op: Optional[Callable] = None):
    batch_encoding_str = pickle.dumps(be_original)
    self.assertIsNotNone(batch_encoding_str)

    be_restored = pickle.loads(batch_encoding_str)

    # Ensure is_fast is correctly restored
    self.assertEqual(be_restored.is_fast, be_original.is_fast)

    # Ensure encodings are potentially correctly restored
    if be_original.is_fast:
        self.assertIsNotNone(be_restored.encodings)
    else:
        self.assertIsNone(be_restored.encodings)

    # Ensure the stored values are the same
    for original_v, restored_v in zip(be_original.values(), be_restored.values()):
        if equal_op:
            self.assertTrue(equal_op(restored_v, original_v))
        else:
            self.assertEqual(restored_v, original_v)
def build_scatter_offsets(
    self,
    model_inputs: BatchEncoding,
    return_tensors: bool = True,
    there_is_text_pair: bool = False,
) -> Tuple:
    """
    Build the offset tensor for the batch of inputs.

    Args:
        model_inputs (:obj:`BatchEncoding`):
            The inputs to the transformer model.
        return_tensors (:obj:`bool`, optional, defaults to :obj:`True`):
            If :obj:`True`, the output is converted to :obj:`torch.Tensor`.
        there_is_text_pair (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, `text_pair` is not None.

    Returns:
        :obj:`List[List[int]]` or :obj:`torch.Tensor`: The offsets of the sub-tokens,
        along with the sentence lengths.
    """
    # output data structures
    offsets = []
    sentence_lengths = []
    # model_inputs should be the output of the HuggingFace tokenizer;
    # it contains the word offsets needed to reconstruct the original tokens
    # from the sub-tokens
    for batch_index in range(len(model_inputs.input_ids)):
        word_ids = model_inputs.word_ids(batch_index)
        # word_ids is slightly different from what we need, so here we make it
        # compatible with our subword pooling strategy.
        # if the first token is a special token, we need to take it into account
        if self.has_starting_token:
            word_offsets = [0] + [w + 1 if w is not None else w for w in word_ids[1:]]
        # otherwise, we can just use word_ids as is
        else:
            word_offsets = word_ids
        # retrieve the max offset for the sample; it is used as the SEP offset
        # and also as the padding value for the offsets
        sep_offset_value = max(w for w in word_offsets if w is not None) + 1
        # replace the first None occurrence with sep_offset
        sep_index = word_offsets.index(None)
        word_offsets[sep_index] = sep_offset_value
        # if there is a text pair, we need to adjust the offsets for the second text
        if there_is_text_pair:
            # some models have two SEP tokens in between the two texts
            if self.has_double_sep:
                sep_index += 1
                sep_offset_value += 1
                word_offsets[sep_index] = sep_offset_value
            # keep the first offsets as is, adjust the second ones
            word_offsets = word_offsets[: sep_index + 1] + [
                w + sep_offset_value if w is not None else w
                for w in word_offsets[sep_index + 1 :]
            ]
            # update the sep_offset again
            sep_offset_value = max(w for w in word_offsets if w is not None) + 1
            # replace the first None occurrence with sep_offset; it should be the last SEP
            sep_index = word_offsets.index(None)
            word_offsets[sep_index] = sep_offset_value
        # keep track of the maximum offset for padding
        offsets.append(word_offsets)
        sentence_lengths.append(sep_offset_value + 1)
    # replace the remaining None occurrences with -1;
    # they correspond to padding positions
    offsets = [[o if o is not None else -1 for o in offset] for offset in offsets]
    # if return_tensors is True, convert the offsets to tensors
    if return_tensors:
        offsets = torch.as_tensor(offsets)
    return offsets, sentence_lengths
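# A toy illustration of the offsets built above, for a fast tokenizer with a
# starting special token (token strings are illustrative):
#   tokens     :  [CLS]  play  ##ing   now   [SEP]  [PAD]
#   word_ids() :  None    0      0      1    None   None
#   offsets    :   0      1      1      2     3      -1
# Sub-tokens of the same word share an offset, SEP gets max offset + 1, and
# remaining padding positions become -1; sentence_lengths stores SEP offset + 1.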
def __call__(
    self,
    input_text: Union[List[str], List[List[str]]],
    *,
    use_delim: bool = False,
    delim_set: Optional[str] = ',,。::;;!!??',
    batch_size: int = 256,
    max_length: Optional[int] = None,
    show_progress: bool = True,
):
    """Call the driver.

    Parameters
    ----------
    input_text : ``List[str]`` or ``List[List[str]]``
        The input sentences. Each sentence is a string or a list of strings.
    use_delim : ``bool``, *optional*, defaults to False
        Segment sentences (internally) using ``delim_set``.
    delim_set : ``str``, *optional*, defaults to ``',,。::;;!!??'``
        Used for sentence segmentation if ``use_delim=True``.
    batch_size : ``int``, *optional*, defaults to 256
        The size of mini-batch.
    max_length : ``int``, *optional*
        The maximum length of a sentence; must not be longer than the maximum
        sequence length for this model (i.e. ``tokenizer.model_max_length``).
    show_progress : ``bool``, *optional*, defaults to True
        Show progress bar.
    """
    model_max_length = self.tokenizer.model_max_length - 2  # leave room for [CLS] and [SEP]
    if max_length:
        assert max_length < model_max_length, \
            'Sequence length is longer than the maximum sequence length for this model ' \
            f'({max_length} >= {model_max_length}).'
    else:
        max_length = model_max_length

    # Apply delimiter cut
    delim_index = self._find_delim(
        input_text=input_text,
        use_delim=use_delim,
        delim_set=delim_set,
    )

    # Get worded input IDs
    if show_progress:
        input_text = tqdm(input_text, desc='Tokenization')
    input_ids_worded = [
        [
            self.tokenizer.convert_tokens_to_ids(list(input_word))
            for input_word in input_sent
        ]
        for input_sent in input_text
    ]

    # Flatten input IDs
    (
        input_ids,
        index_map,
    ) = self._flatten_input_ids(
        input_ids_worded=input_ids_worded,
        max_length=max_length,
        delim_index=delim_index,
    )

    # Pad and segment input IDs
    (
        input_ids,
        attention_mask,
    ) = self._pad_input_ids(
        input_ids=input_ids,
    )

    # Convert input format
    encoded_input = BatchEncoding(
        data=dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ),
        tensor_type='pt',
    )

    # Create dataset
    dataset = TensorDataset(*encoded_input.values())
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        pin_memory=True,
    )
    if show_progress:
        dataloader = tqdm(dataloader, desc='Inference')

    # Call model
    logits = []
    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(tensor.to(self.device) for tensor in batch)
            (batch_logits,) = self.model(**dict(zip(encoded_input.keys(), batch)), return_dict=False)
            batch_logits = batch_logits.cpu().numpy()[:, 1:, :]  # Remove [CLS]
            logits.append(batch_logits)

    # Concatenate logits across batches
    logits = np.concatenate(logits, axis=0)
    return logits, index_map
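# Usage sketch (illustrative): `driver` stands for an instance of the class
# above, wrapping a token-classification model and its tokenizer:
#   logits, index_map = driver(['你好世界'], batch_size=32, show_progress=False)
# `logits` holds per-token label scores; `index_map` records how the flattened,
# length-limited segments align with the original input sentences.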
def embed_inputs(self, inputs: BatchEncoding) -> List[List[List[float]]]:
    device = self.transformer.model.device
    # with output_hidden_states=True, the last element of the model output
    # holds the per-layer hidden states
    outputs = self.transformer.model(**inputs.to(device), output_hidden_states=True)[-1]
    embeddings_t = self._embedings_from_outputs(outputs)
    return embeddings_t.detach().cpu().tolist()
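# Usage sketch (the tokenizer call is an assumption; the nested list shape
# follows from the return type annotation above):
#   inputs = self.transformer.tokenizer(["a sentence"], return_tensors="pt")
#   embeddings = embed_inputs(inputs)
#   len(embeddings)        # batch size
#   len(embeddings[0])     # sequence length
#   len(embeddings[0][0])  # hidden size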