def __init__(
    self,
    examples: List[SequenceClassificationExample],
    tokenizer: PreTrainedTokenizerFast,
    label_to_id: Dict[str, int],
    tokens_per_batch: int = 32,
):
    self.features: List[InputFeatures] = []
    self.examples: List[SequenceClassificationExample] = examples
    texts: StrList = [ex.text for ex in self.examples]
    labels: StrList = [ex.label for ex in self.examples]

    # tokenize text into subwords with padding and truncation
    self.encodings: List[BatchEncoding] = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=tokens_per_batch,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="np",
            truncation=True,
        )
        for text in texts
    ]

    # register features
    self.features = [
        InputFeatures(
            input_ids=encoding.input_ids.flatten().tolist(),
            attention_mask=encoding.attention_mask.flatten().tolist(),
            label_ids=[label_to_id.get(label, 0)],
        )
        for encoding, label in zip(self.encodings, labels)
    ]
    self._n_features = len(self.features)
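
# The __init__ above relies on project-specific types (SequenceClassificationExample,
# InputFeatures, StrList) that are not shown in this snippet. Below is a minimal,
# hypothetical sketch of what they might look like, only so the code above reads
# self-contained; the real project definitions may differ.
from dataclasses import dataclass
from typing import List

StrList = List[str]

@dataclass
class SequenceClassificationExample:
    text: str   # raw input text
    label: str  # string class label, mapped to an id via label_to_id

@dataclass
class InputFeatures:
    input_ids: List[int]       # padded/truncated token ids
    attention_mask: List[int]  # 1 for real tokens, 0 for padding
    label_ids: List[int]       # single-element list holding the class id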
def get_adjusted_lengths(
    sentences: Sentences,
    tokenizer: PreTrainedTokenizerFast,
    max_sequence_length,
) -> Tuple[int, ...]:
    """Return adjusted lengths based on a tokenizer and model max length."""
    encodings = [
        tokenizer.encode_plus(" ".join(sentence), return_offsets_mapping=True)
        for sentence in sentences
    ]
    # Create end-token masks: [CLS] Hauk ur er [SEP] -> [dropped, 0, 1, 1, dropped]
    # By getting initial token masks and shifting them:
    # [CLS] Hauk ur er [SEP] -> [0, 1, 0, 1, 0]
    # -> drop [mid shifted to left] + [1] drop
    # -> [_, 0, 1, 1, _]
    end_token_masks = [
        get_initial_token_mask(encoded["offset_mapping"])[2:-1] + [1]
        for encoded in encodings
    ]
    # We need to account for two special tokens (SEP and CLS) or (<s> and </s>) when finding the cuts
    max_sequence_length -= 2
    # And some extra margin, to be safe against counting errors
    max_sequence_length -= 6
    lengths = []
    for end_token_mask in end_token_masks:
        while len(end_token_mask) != 0:
            prefix, end_token_mask = (
                end_token_mask[:max_sequence_length],
                end_token_mask[max_sequence_length:],
            )
            length = sum(prefix)
            lengths.append(length)
    return tuple(int(length) for length in lengths)
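
# get_initial_token_mask() is called above but not defined in this snippet. The sketch
# below is an assumption about what it could look like, given an offset_mapping from a
# fast tokenizer run on the space-joined sentence: 1 marks the first subword of each
# word, 0 marks continuation subwords and special tokens (matching the
# [CLS] Hauk ur er [SEP] -> [0, 1, 0, 1, 0] example in the comment above). The real
# helper may differ, e.g. for tokenizers whose offsets include the leading space.
from typing import List, Tuple

def get_initial_token_mask(offset_mapping: List[Tuple[int, int]]) -> List[int]:
    mask = []
    last_end = None
    for start, end in offset_mapping:
        if start == 0 and end == 0:
            mask.append(0)  # special token such as [CLS]/[SEP] or <s>/</s>
            last_end = None
        elif last_end is not None and start == last_end:
            mask.append(0)  # continuation subword glued to the previous one
            last_end = end
        else:
            mask.append(1)  # first subword of a word
            last_end = end
    return mask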
def convert_instances_to_feature_tensors(
    instances: List[Instance],
    tokenizer: PreTrainedTokenizerFast,
    label2idx: Dict[str, int],
) -> List[Feature]:
    features = []
    ## tokenize each word into word pieces / BPE subwords
    ## NOTE: adding a leading space is important for BART/GPT/RoBERTa tokenization.
    ## Related GitHub issues:
    ## https://github.com/huggingface/transformers/issues/1196
    ## https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py#L38-L56
    ## https://github.com/ThilinaRajapakse/simpletransformers/issues/458
    assert tokenizer.add_prefix_space  ## has to be True in order to tokenize pre-tokenized input
    print(
        "[Data Info] We are not limiting the max length in the tokenizer. You should be aware of that."
    )
    for idx, inst in enumerate(instances):
        words = inst.ori_words
        orig_to_tok_index = []
        res = tokenizer.encode_plus(words, is_split_into_words=True)
        subword_idx2word_idx = res.word_ids(batch_index=0)
        prev_word_idx = -1
        for i, mapped_word_idx in enumerate(subword_idx2word_idx):
            ## Note: by default, we use the first wordpiece/subword token to represent the word.
            ## If you want to do something else (e.g., use the last wordpiece instead), modify it here.
            if mapped_word_idx is None:  ## cls and sep token
                continue
            if mapped_word_idx != prev_word_idx:
                ## we take the first subword to represent the whole word
                orig_to_tok_index.append(i)
                prev_word_idx = mapped_word_idx
        assert len(orig_to_tok_index) == len(words)
        labels = inst.labels
        label_ids = [label2idx[label] for label in labels] if labels else [-100] * len(words)
        segment_ids = [0] * len(res["input_ids"])
        features.append(
            Feature(
                input_ids=res["input_ids"],
                attention_mask=res["attention_mask"],
                orig_to_tok_index=orig_to_tok_index,
                token_type_ids=segment_ids,
                word_seq_len=len(orig_to_tok_index),
                label_ids=label_ids,
            )
        )
    return features
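
# Hypothetical usage sketch for convert_instances_to_feature_tensors(). Instance and
# Feature are project-specific types not shown above; the minimal dataclasses below are
# assumptions made only so the example runs, and "roberta-base" is just an illustrative
# checkpoint (add_prefix_space=True is required by the assert above).
from dataclasses import dataclass
from typing import List, Optional

from transformers import AutoTokenizer

@dataclass
class Instance:
    ori_words: List[str]                 # pre-tokenized words of one sentence
    labels: Optional[List[str]] = None   # per-word labels, or None at inference time

@dataclass
class Feature:
    input_ids: List[int]
    attention_mask: List[int]
    orig_to_tok_index: List[int]         # index of the first subword of each word
    token_type_ids: List[int]
    word_seq_len: int
    label_ids: List[int]

tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True, use_fast=True)
label2idx = {"O": 0, "B-PER": 1, "I-PER": 2}
instances = [Instance(ori_words=["John", "lives", "here"], labels=["B-PER", "O", "O"])]
features = convert_instances_to_feature_tensors(instances, tokenizer, label2idx)
print(features[0].orig_to_tok_index, features[0].word_seq_len)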
import numpy as np
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast


def preprocess(texts, tokenizer_path, max_len=32):
    input_ids, input_masks = [], []
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
    tokenizer.mask_token = "[MASK]"
    tokenizer.pad_token = "[PAD]"
    tokenizer.sep_token = "[SEP]"
    tokenizer.cls_token = "[CLS]"
    tokenizer.unk_token = "[UNK]"
    for text in tqdm(texts):
        # pad and truncate every example to exactly max_len tokens
        encoded = tokenizer.encode_plus(
            text,
            max_length=max_len,
            padding="max_length",
            truncation=True,
        )
        input_ids.append(encoded["input_ids"])
        input_masks.append(encoded["attention_mask"])
    return [np.array(input_ids), np.array(input_masks)]
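
# Hypothetical usage; "tokenizer.json" stands in for the path of a saved `tokenizers`
# file (one whose vocabulary actually contains the special tokens set above) and is not
# a real artifact of this snippet.
texts = ["first example sentence", "second one"]
input_ids, attention_masks = preprocess(texts, "tokenizer.json", max_len=32)
print(input_ids.shape, attention_masks.shape)  # (2, 32) for both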