def get_special_ids(tokenizer: AutoTokenizer) -> tuple[int, ...]:
    """Look up the vocabulary ids of the tokenizer's special tokens.

    Returns:
        A 5-tuple ordered as (sep id, cls id, pad id, mask id, unk id),
        each obtained through ``tokenizer.convert_tokens_to_ids``.
    """
    special_tokens = [
        tokenizer.sep_token,
        tokenizer.cls_token,
        tokenizer.pad_token,
        tokenizer.mask_token,
        tokenizer.unk_token,
    ]
    return tuple(map(tokenizer.convert_tokens_to_ids, special_tokens))
def mask_tokens(inputs: torch.Tensor, tokenizer: AutoTokenizer,
                args) -> Tuple[torch.Tensor, torch.Tensor]:
    """Build masked-language-modelling inputs and labels.

    Positions are selected with probability ``args.mlm_probability``; special
    tokens (per ``tokenizer.get_special_tokens_mask``) and padding positions
    are never selected. Of the selected positions roughly 80% are replaced by
    the mask token, 10% by a random vocabulary id and 10% are left untouched.

    Every helper tensor is moved to ``inputs.device`` before use.

    Args:
        inputs: token-id tensor; it is modified in place.
        tokenizer: tokenizer providing the special-token mask, the pad/mask
            ids and ``len()`` for the vocabulary size.
        args: object exposing ``mlm_probability``.

    Returns:
        ``(inputs, labels)`` where ``inputs`` is the same (mutated) tensor and
        ``labels`` holds the original id at selected positions and -100
        everywhere else.
    """
    device = inputs.device
    labels = inputs.clone()

    # Per-position selection probability, zeroed for special and pad tokens.
    selection_prob = torch.full(labels.shape, args.mlm_probability).to(device)
    special_rows = [
        tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)
        for row in labels.tolist()
    ]
    is_special = torch.tensor(special_rows, dtype=torch.bool).to(device)
    selection_prob.masked_fill_(is_special, value=0.0)
    if tokenizer._pad_token is not None:
        selection_prob.masked_fill_(labels.eq(tokenizer.pad_token_id), value=0.0)

    selected = torch.bernoulli(selection_prob).bool().to(device)
    # Loss is only computed on the selected positions.
    labels[~selected] = -100

    # 80% of the selected positions become the mask token.
    mask_draw = torch.bernoulli(torch.full(labels.shape, 0.8).to(device)).bool()
    use_mask_token = (mask_draw & selected).to(device)
    inputs[use_mask_token] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Half of the remainder (10% overall) become a random vocabulary id.
    random_draw = torch.bernoulli(torch.full(labels.shape, 0.5).to(device)).bool()
    use_random_id = (random_draw & selected & ~use_mask_token).to(device)
    replacement_ids = torch.randint(
        len(tokenizer), labels.shape, dtype=torch.long).to(device)
    inputs[use_random_id] = replacement_ids[use_random_id]

    # The last 10% of the selected positions keep their original id.
    return inputs, labels
def mask_tokens(inputs: torch.Tensor, tokenizer: AutoTokenizer,
                args) -> Tuple[torch.Tensor, torch.Tensor]:
    """Prepare masked-LM inputs/labels: 80% MASK, 10% random, 10% original.

    Tokens are picked for prediction with probability
    ``args.mlm_probability``, skipping special tokens and padding.
    ``inputs`` is modified in place and returned together with ``labels``,
    which is -100 at every position that was not picked.
    """
    target_device = inputs.device
    labels = inputs.clone()

    # Start from a uniform probability and rule out special / pad positions.
    probability_matrix = torch.full(
        labels.shape, args.mlm_probability, device=target_device)
    special_tokens_mask = torch.tensor(
        [
            tokenizer.get_special_tokens_mask(
                ids, already_has_special_tokens=True)
            for ids in labels.tolist()
        ],
        dtype=torch.bool,
        device=target_device,
    )
    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    if tokenizer._pad_token is not None:
        is_padding = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(is_padding, value=0.0)

    picked = torch.bernoulli(probability_matrix).bool()
    # Only picked tokens contribute to the loss.
    labels.masked_fill_(~picked, -100)

    # 80% of the picked tokens are replaced by tokenizer.mask_token.
    eighty_percent = torch.full(labels.shape, 0.8, device=target_device)
    to_mask_token = torch.bernoulli(eighty_percent).bool() & picked
    mask_token_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    inputs.masked_fill_(to_mask_token, mask_token_id)

    # Of what is left, half (10% of the picked tokens) get a random word.
    fifty_percent = torch.full(labels.shape, 0.5, device=target_device)
    to_random_word = (
        torch.bernoulli(fifty_percent).bool() & picked & ~to_mask_token)
    # Sampled without a device argument and then moved, as the original does.
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    random_words = random_words.to(target_device)
    inputs[to_random_word] = random_words[to_random_word]

    # The remaining picked tokens (10%) are deliberately left unchanged.
    return inputs, labels
def preprocess_text(x: str, tokenizer: AutoTokenizer, max_sequence_len: int):
    """Turn raw text into a fixed-length list of token ids.

    The text is prefixed with "[CLS] " when the tokenizer is a
    ``BertTokenizer``, newlines are removed, " cannot " is rewritten to
    " can not ", and the result is tokenized and converted to ids. The id
    list is cut to ``max_sequence_len`` and right-padded with 0.

    NOTE(review): the padding value is the literal 0, not the tokenizer's
    pad id — confirm this matches every tokenizer passed in.
    """
    text = "[CLS] " + x if isinstance(tokenizer, BertTokenizer) else x
    text = text.replace("\n", "").replace(" cannot ", " can not ")

    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    clipped = token_ids[:max_sequence_len]
    missing = max_sequence_len - len(clipped)
    return clipped + [0] * missing
def convert_example_to_feature(
        example,
        tokenizer: AutoTokenizer,
        chineseandpunctuationextractor: ChineseAndPunctuationExtractor,
        label_map,
        max_length: Optional[int] = 512,
        pad_to_max_length: Optional[bool] = None):
    """Convert one example into a padded ``InputFeature``.

    The text is first split so that every character accepted by
    ``chineseandpunctuationextractor.is_chinese_or_punct`` is its own piece
    while consecutive other characters are merged into one piece. Each piece
    is tokenized and, for every resulting sub-token, the start/end character
    offsets of the piece it came from are recorded. Tokenization stops once
    ``max_length - 2`` sub-tokens were collected.

    Labels come from ``parse_label`` when the example has a ``'spo_list'``
    key (with a non-None value); otherwise every token gets an all-zero
    label vector. "[CLS]"/"[SEP]" are added, and tokens, labels and offset
    maps are padded up to ``max_length``.

    ``pad_to_max_length`` is accepted but not used by this function.
    """
    if "spo_list" in example.keys():
        spo_list = example['spo_list']
    else:
        spo_list = None

    # Split the raw text: matching characters stand alone, everything else
    # is accumulated into runs.
    pieces = []
    pending = ""  # run of characters not matched by the extractor
    for ch in example['text']:
        if not chineseandpunctuationextractor.is_chinese_or_punct(ch):
            pending += ch
            continue
        if pending:
            pieces.append(pending)
            pending = ""
        pieces.append(ch)
    if pending:
        pieces.append(pending)

    tokens = []
    tok_to_orig_start_index = []
    tok_to_orig_end_index = []
    consumed_chars = 0  # number of characters of the text covered so far
    budget_reached = False
    for piece in pieces:
        sub_tokens = tokenizer.tokenize(piece)
        consumed_chars += len(piece)
        first_char = consumed_chars - len(piece)
        last_char = consumed_chars - 1
        for sub_token in sub_tokens:
            tok_to_orig_start_index.append(first_char)
            tok_to_orig_end_index.append(last_char)
            tokens.append(sub_token)
            if len(tokens) >= max_length - 2:
                budget_reached = True
                break
        if budget_reached:
            break

    seq_len = len(tokens)
    # 2 tags for each predicate + I tag + O tag
    num_labels = 2 * (len(label_map.keys()) - 2) + 2
    if spo_list is None:
        # Every token still needs a label vector; it is used at prediction time.
        labels = [[0] * num_labels for _ in range(seq_len)]
    else:
        labels = parse_label(spo_list, label_map, tokens, tokenizer)

    # Leave room for [CLS] and [SEP]; both are tagged "O" (outside).
    if seq_len > max_length - 2:
        keep = slice(0, max_length - 2)
        tokens = tokens[keep]
        labels = labels[keep]
        tok_to_orig_start_index = tok_to_orig_start_index[keep]
        tok_to_orig_end_index = tok_to_orig_end_index[keep]

    # "O" tag for [PAD], [CLS], [SEP] tokens
    outside_label = [[1] + [0] * (num_labels - 1)]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    labels = outside_label + labels + outside_label
    tok_to_orig_start_index = [-1] + tok_to_orig_start_index + [-1]
    tok_to_orig_end_index = [-1] + tok_to_orig_end_index + [-1]

    if seq_len < max_length:
        tokens = tokens + ["[PAD]"] * (max_length - seq_len - 2)
        labels = labels + outside_label * (max_length - len(labels))
        start_padding = [-1] * (max_length - len(tok_to_orig_start_index))
        end_padding = [-1] * (max_length - len(tok_to_orig_end_index))
        tok_to_orig_start_index = tok_to_orig_start_index + start_padding
        tok_to_orig_end_index = tok_to_orig_end_index + end_padding

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return InputFeature(
        input_ids=np.array(token_ids),
        seq_len=np.array(seq_len),
        tok_to_orig_start_index=np.array(tok_to_orig_start_index),
        tok_to_orig_end_index=np.array(tok_to_orig_end_index),
        labels=np.array(labels),
    )
def convert_examples_to_features(
        examples: List[InputExample],
        label_list: List[str],
        max_seq_length: int,
        tokenizer: AutoTokenizer,
        cls_token="[CLS]",
        cls_token_segment_id=0,
        sep_token="[SEP]",
        pad_token=0,
        pad_token_segment_id=0,
        pad_token_label_id=-100,
        sequence_a_segment_id=0,
        sequence_b_segment_id=1,
        mask_padding_with_zero=True,
        verbose=False
) -> List[InputFeatures]:
    """Convert token-classification examples into padded ``InputFeatures``.

    Each word is split into sub-tokens. The first sub-token of a word gets
    the word's label id; the remaining sub-tokens, the [CLS]/[SEP] positions
    and the padding get ``pad_token_label_id``. Words that tokenize to
    nothing are skipped.

    Args:
        examples: objects exposing ``words``, ``labels`` and ``guid``.
        label_list: label names; a label's id is its index in this list.
        max_seq_length: length every output sequence is padded to; longer
            sequences are cut to ``max_seq_length - 2`` sub-tokens before the
            special tokens are added.
        tokenizer: provides ``tokenize`` and ``convert_tokens_to_ids``.
        cls_token, sep_token: special tokens added around the sequence.
        cls_token_segment_id, sequence_a_segment_id, pad_token_segment_id:
            segment ids for the [CLS] position, the real tokens and padding.
        pad_token: input id used for padding.
        pad_token_label_id: label id marking positions excluded from decoding.
        sequence_b_segment_id: accepted for signature compatibility; unused.
        mask_padding_with_zero: if true the attention mask is 1 for real
            tokens and 0 for padding, otherwise the values are swapped.
        verbose: log the first example in detail.

    Returns:
        One ``InputFeatures`` per example, in input order.

    Raises:
        KeyError: if an example uses a label missing from ``label_list``.
    """
    label_map = {label: i for i, label in enumerate(label_list)}
    max_body_length = max_seq_length - 2  # room left after [CLS] and [SEP]

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                continue
            tokens.extend(word_tokens)
            # Real label on the first sub-token, ignore-label on the rest.
            label_ids.append(label_map[label])
            label_ids.extend([pad_token_label_id] * (len(word_tokens) - 1))

        if len(tokens) > max_body_length:
            logger.warning("Sequence length exceed {} (cut).".format(max_seq_length))
            tokens = tokens[:max_body_length]
            label_ids = label_ids[:max_body_length]

        tokens = [cls_token] + tokens + [sep_token]
        label_ids = [pad_token_label_id] + label_ids + [pad_token_label_id]
        segment_ids = ([cls_token_segment_id]
                       + [sequence_a_segment_id] * (len(tokens) - 1))

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens (or the
        # reverse when mask_padding_with_zero is false).
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Pad everything up to the sequence length.
        seq_length = len(input_ids)
        padding_length = max_seq_length - seq_length
        input_ids = input_ids + [pad_token] * padding_length
        input_mask += [0 if mask_padding_with_zero else 1] * padding_length
        segment_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_token_label_id] * padding_length
        decoder_mask = [(x != pad_token_label_id) for x in label_ids]

        if verbose and ex_index < 1:
            logger.info("*** Example ***")
            logger.info("guid: {} (length: {})".format(example.guid, seq_length))
            logger.info("tokens: %s", " ".join([str(x) for x in tokens[:seq_length]]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids[:seq_length]]))
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids[:seq_length]]))
            logger.info("decode_mask: %s", " ".join([str(x) for x in decoder_mask[:seq_length]]))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=input_mask,
                token_type_ids=segment_ids,
                label_ids=label_ids,
                decoder_mask=decoder_mask
            )
        )

    # The original accumulated `features` but never returned it, so callers
    # received None despite the List[InputFeatures] annotation.
    return features
def convert_examples_to_features(examples: List[InputExample],
                                 label_list: List[str],
                                 max_seq_length: int,
                                 tokenizer: AutoTokenizer,
                                 cls_token="[CLS]",
                                 cls_token_segment_id=0,
                                 sep_token="[SEP]",
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 pad_token_label_id=-100,
                                 sequence_a_segment_id=0,
                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True,
                                 verbose=False) -> List[InputFeatures]:
    """Convert examples into ``InputFeatures`` with product-span markers.

    ``PROD_START_MARKER`` is inserted before the word labelled
    "B-arm_description" and ``PROD_END_MARKER`` after the last following
    "I-arm_description" word (or at the end of the sequence). Three extra
    masks mark the start marker, the end marker and the whole marked span.

    Each word contributes at most its first 5 sub-tokens. The first
    sub-token gets the word's label id (``pad_token_label_id`` for labels
    missing from ``label_list``); other sub-tokens, markers, special tokens
    and padding get ``pad_token_label_id``.

    Args:
        examples: objects exposing ``words``, ``labels`` and ``guid``.
        label_list: label names; a label's id is its index in this list.
        max_seq_length: length every output sequence is padded to.
        tokenizer: provides ``tokenize`` and ``convert_tokens_to_ids``.
        cls_token, sep_token: special tokens added around the sequence.
        cls_token_segment_id, sequence_a_segment_id, pad_token_segment_id:
            segment ids for the [CLS] position, the real tokens and padding.
        pad_token: input id used for padding.
        pad_token_label_id: label id marking positions excluded from decoding.
        sequence_b_segment_id: accepted for signature compatibility; unused.
        mask_padding_with_zero: if true the attention mask is 1 for real
            tokens and 0 for padding, otherwise the values are swapped. The
            same padding value is appended to the three product masks.
        verbose: log the first example in detail.

    Returns:
        One ``InputFeatures`` per example, in input order.

    Raises:
        AssertionError: if an example has no "B-arm_description" word, or if
            a padded sequence does not have length ``max_seq_length`` (for
            instance because truncation cut into the marked span).
        IndexError: if truncation removed a product marker position.
    """
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        prod_start_index = prod_end_index = -1
        for word, label in zip(example.words, example.labels):
            if label == "B-arm_description":
                prod_start_index = len(tokens)
                tokens.append(PROD_START_MARKER)
                label_ids.append(pad_token_label_id)
            elif prod_start_index >= 0 and prod_end_index < 0 and label != "I-arm_description":
                # First word after the product span: close the span here.
                prod_end_index = len(tokens)
                tokens.append(PROD_END_MARKER)
                label_ids.append(pad_token_label_id)

            word_tokens = tokenizer.tokenize(word)[:5]  # avoid long chemical names
            if word_tokens:
                tokens.extend(word_tokens)
                # Real label id on the first sub-token, padding ids on the
                # rest. Unknown labels (semi-supervised training with partial
                # annotations) are mapped to the padding id as well.
                label_ids.append(label_map.get(label, pad_token_label_id))
                label_ids.extend([pad_token_label_id] * (len(word_tokens) - 1))

        # Product span runs to the end of the sequence.
        if prod_start_index >= 0 and prod_end_index < 0:
            prod_end_index = len(tokens)
            tokens.append(PROD_END_MARKER)
            label_ids.append(pad_token_label_id)
        assert prod_start_index >= 0, "no B-arm_description word in example"
        assert prod_end_index >= 0, "product span was never closed"

        # Account for [CLS] and [SEP] with "- 2".
        if len(tokens) > max_seq_length - 2:
            logger.info(
                "Sentence length exceeds max_seq_length: {} ({})".format(
                    " ".join(tokens), len(tokens)))
            # This will fail further down if the product span is cut.
            tokens = tokens[:(max_seq_length - 2)]
            label_ids = label_ids[:(max_seq_length - 2)]

        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids
        # cls_token was added at the beginning, so both indices shift by one.
        prod_start_index += 1
        prod_end_index += 1

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens (or the
        # reverse when mask_padding_with_zero is false).
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        prod_start_mask = [0] * len(input_ids)
        prod_start_mask[prod_start_index] = 1
        prod_end_mask = [0] * len(input_ids)
        prod_end_mask[prod_end_index] = 1
        prod_mask = [0] * len(input_ids)
        prod_mask[prod_start_index:prod_end_index + 1] = (
            [1] * (prod_end_index + 1 - prod_start_index))

        # Pad everything up to the sequence length.
        seq_length = len(input_ids)
        padding_length = max_seq_length - seq_length
        mask_padding = [0 if mask_padding_with_zero else 1] * padding_length
        input_ids += [pad_token] * padding_length
        input_mask += mask_padding
        prod_start_mask += mask_padding
        prod_end_mask += mask_padding
        prod_mask += mask_padding
        segment_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_token_label_id] * padding_length
        decoder_mask = [(x != pad_token_label_id) for x in label_ids]

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(prod_start_mask) == max_seq_length
        assert len(prod_end_mask) == max_seq_length
        assert len(prod_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        if verbose and ex_index < 1:
            logger.info("*** Example ***")
            logger.info("guid: {} (length: {})".format(example.guid, seq_length))
            logger.info("tokens: " + " ".join([str(x) for x in tokens[:seq_length]]))
            logger.info("input_ids: " + " ".join([str(x) for x in input_ids[:seq_length]]))
            logger.info("label_ids: " + " ".join([str(x) for x in label_ids[:seq_length]]))
            logger.info("decoder_mask: " + " ".join([str(x) for x in decoder_mask[:seq_length]]))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=input_mask,
                          prod_start_mask=prod_start_mask,
                          prod_end_mask=prod_end_mask,
                          prod_mask=prod_mask,
                          token_type_ids=segment_ids,
                          label_ids=label_ids,
                          decoder_mask=decoder_mask))

    # The original accumulated `features` but never returned it, so callers
    # received None despite the List[InputFeatures] annotation.
    return features
class TorchTransformersNerPreprocessor(Component):
    """Split word tokens into subtokens and encode them for a NER model.

    For every input sentence the words are tokenized into subtokens wrapped
    in '[CLS]' / '[SEP]', converted to vocabulary indices, and accompanied by
    a marker list holding one selected subtoken per word (the first or the
    last one). When tags are given they are expanded to subtoken level, with
    'X' on every non-leading subtoken.

    Args:
        vocab_file: path to a vocabulary file, or a name handed to
            ``AutoTokenizer.from_pretrained`` when no such file exists
        do_lower_case: forwarded to the tokenizer
        max_seq_length: maximum number of subtokens per sentence, '[CLS]' and
            '[SEP]' included; a longer sentence raises ``RuntimeError``
        max_subword_length: a word that splits into more subtokens than this
            is replaced by a single '[UNK]' (None disables the check)
        token_masking_prob: probability of replacing a word's subtokens with
            '[MASK]'; only applied when ``mode`` is 'train'
        provide_subword_tags: return subtoken-level tags instead of word tags
        subword_mask_mode: which subtoken of a word is marked, "first" or
            "last" (any value other than "last" behaves as "first")
        **kwargs: only ``mode`` is read

    Attributes:
        max_seq_length: maximum sequence length in subtokens
        max_subword_length: maximum number of subtokens allowed per word
        tokenizer: the tokenizer instance used for splitting and encoding
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = False,
                 max_seq_length: int = 512,
                 max_subword_length: int = None,
                 token_masking_prob: float = 0.0,
                 provide_subword_tags: bool = False,
                 subword_mask_mode: str = "first",
                 **kwargs):
        self.mode = kwargs.get('mode')
        self.provide_subword_tags = provide_subword_tags
        self.subword_mask_mode = subword_mask_mode
        self.max_seq_length = max_seq_length
        self.max_subword_length = max_subword_length
        self.token_masking_prob = token_masking_prob
        # Used to split raw strings into words when untokenized text is passed.
        self._re_tokenizer = re.compile(r"[\w']+|[^\w ]")

        if not Path(vocab_file).is_file():
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)
        else:
            vocab_file = str(expand_path(vocab_file))
            # NOTE(review): AutoTokenizer is normally created through
            # from_pretrained; confirm that constructing it directly works
            # with the installed transformers version.
            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

    def __call__(self,
                 tokens: Union[List[List[str]], List[str]],
                 tags: List[List[str]] = None,
                 **kwargs):
        """Tokenize and encode a batch of sentences.

        Args:
            tokens: batch of word lists, or batch of raw strings (detected by
                looking at the first element) which are split into words first
            tags: optional word-level tags, one list per sentence

        Returns:
            Without tags: ``(tokens, subword_tokens, subword_tok_ids,
            startofword_markers, attention_mask)``.
            With tags: ``(tokens, subword_tokens, subword_tok_ids,
            attention_mask, startofword_markers, tags_out)`` where
            ``tags_out`` is the subtoken-level tags if
            ``provide_subword_tags`` is set, else the input tags with 'X'
            entries removed. Note that the position of ``attention_mask``
            differs between the two cases.

        Raises:
            RuntimeError: if a sentence exceeds ``max_seq_length`` subtokens.
        """
        if isinstance(tokens[0], str):
            tokens = [self._re_tokenizer.findall(sentence) for sentence in tokens]

        tags_given = tags is not None
        subword_tokens, subword_tok_ids, startofword_markers, subword_tags = [], [], [], []
        for idx, words in enumerate(tokens):
            word_tags = tags[idx] if tags_given else ['O'] * len(words)
            assert len(words) == len(word_tags), \
                f"toks({len(words)}) should have the same length as ys({len(word_tags)})"

            pieces, markers, piece_tags = self._ner_bert_tokenize(
                words,
                word_tags,
                self.tokenizer,
                self.max_subword_length,
                mode=self.mode,
                subword_mask_mode=self.subword_mask_mode,
                token_masking_prob=self.token_masking_prob)

            if self.max_seq_length is not None and len(pieces) > self.max_seq_length:
                raise RuntimeError(
                    f"input sequence after bert tokenization"
                    f" shouldn't exceed {self.max_seq_length} tokens.")

            subword_tokens.append(pieces)
            piece_ids = self.tokenizer.convert_tokens_to_ids(pieces)
            subword_tok_ids.append(piece_ids)
            startofword_markers.append(markers)
            subword_tags.append(piece_tags)
            assert len(markers) == len(pieces) == len(piece_ids) == len(piece_tags), \
                f"length of sow_marker({len(markers)}), tokens({len(pieces)})," \
                f" token ids({len(piece_ids)}) and ys({len(word_tags)})" \
                f" for tokens = `{words}` should match"

        subword_tok_ids = zero_pad(subword_tok_ids, dtype=int, padding=0)
        startofword_markers = zero_pad(startofword_markers, dtype=int, padding=0)
        attention_mask = Mask()(subword_tokens)

        if not tags_given:
            return tokens, subword_tokens, subword_tok_ids, startofword_markers, attention_mask

        if self.provide_subword_tags:
            return tokens, subword_tokens, subword_tok_ids, \
                attention_mask, startofword_markers, subword_tags

        nonmasked_tags = [[tag for tag in sent_tags if tag != 'X'] for sent_tags in tags]
        # Sanity check only: mismatches are logged, never raised.
        batch = zip(subword_tokens, subword_tok_ids, startofword_markers, nonmasked_tags)
        for swts, swids, swms, ts in batch:
            lengths_agree = len(swids) == len(swms) and len(ts) == sum(swms)
            if lengths_agree:
                continue
            log.warning('Not matching lengths of the tokenization!')
            log.warning(f'Tokens len: {len(swts)}\n Tokens: {swts}')
            log.warning(f'Markers len: {len(swms)}, sum: {sum(swms)}')
            log.warning(f'Masks: {swms}')
            log.warning(f'Tags len: {len(ts)}\n Tags: {ts}')
        return tokens, subword_tokens, subword_tok_ids, \
            attention_mask, startofword_markers, nonmasked_tags

    @staticmethod
    def _ner_bert_tokenize(
            tokens: List[str],
            tags: List[str],
            tokenizer: AutoTokenizer,
            max_subword_len: int = None,
            mode: str = None,
            subword_mask_mode: str = "first",
            token_masking_prob: float = None
    ) -> Tuple[List[str], List[int], List[str]]:
        """Tokenize one sentence into subtokens with markers and tags.

        Returns:
            ``(subtokens, markers, subtoken_tags)``, all of equal length and
            wrapped in '[CLS]' / '[SEP]' entries (marker 0, tag 'X'). A word
            with tag 'X' never gets a non-zero marker.
        """
        do_masking = (mode == 'train') and (token_masking_prob is not None)
        do_cutting = (max_subword_len is not None)

        tokens_subword = ['[CLS]']
        startofword_markers = [0]
        tags_subword = ['X']

        for token, tag in zip(tokens, tags):
            token_marker = int(tag != 'X')
            subwords = tokenizer.tokenize(token)

            # Untokenizable or over-long words collapse into a single [UNK].
            if not subwords or (do_cutting and (len(subwords) > max_subword_len)):
                tokens_subword.append('[UNK]')
                startofword_markers.append(token_marker)
                tags_subword.append(tag)
                continue

            n_subwords = len(subwords)
            if do_masking and (random.random() < token_masking_prob):
                tokens_subword.extend(['[MASK]'] * n_subwords)
            else:
                tokens_subword.extend(subwords)

            unmarked = [0] * (n_subwords - 1)
            if subword_mask_mode == "last":
                startofword_markers.extend(unmarked + [token_marker])
            else:
                startofword_markers.extend([token_marker] + unmarked)
            tags_subword.extend([tag] + ['X'] * (n_subwords - 1))

        tokens_subword.append('[SEP]')
        startofword_markers.append(0)
        tags_subword.append('X')
        return tokens_subword, startofword_markers, tags_subword
def convert_examples_to_features(
    examples: List[InputExample],
    max_seq_len: int,
    tokenizer: AutoTokenizer,
    pad_token_label_id: int = -100,
    cls_token_segment_id: int = 0,
    pad_token_segment_id: int = 0,
    sequence_a_segment_id: int = 0,
    mask_padding_with_zero: bool = True,
) -> List[InputFeatures]:
    """Build padded ``InputFeatures`` with six aligned label sequences.

    Every word is tokenized (falling back to the tokenizer's unk token when
    tokenization yields nothing). For each of the slot, POS, NP, VP, entity
    and acronym label sequences, the first sub-token of a word carries
    ``int(label)`` and all other positions (further sub-tokens, the cls/sep
    positions, padding) carry ``pad_token_label_id``. Sequences longer than
    ``max_seq_len - 2`` sub-tokens are truncated before the special tokens
    are added, and everything is padded to ``max_seq_len``.

    Returns:
        One ``InputFeatures`` per example, in input order.
    """
    # Special tokens come from the tokenizer, so they match the model type.
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    # Room reserved for the cls and sep tokens.
    special_tokens_count = 2

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 5000 == 0:
            logging.debug("Processing example %d of %d", ex_index, len(examples))

        # Tokenize word by word (for NER).
        tokens: List[str] = []
        # Track order: slot, pos, np, vp, entity, acronym.
        label_tracks = [[] for _ in range(6)]
        aligned_words = zip(
            example.words,
            example.slot_labels,
            example.pos_labels,
            example.np_labels,
            example.vp_labels,
            example.entity_labels,
            example.acronym_labels,
        )
        for word, *word_labels in aligned_words:
            # A badly encoded word may tokenize to nothing; use unk instead.
            word_tokens = tokenizer.tokenize(word) or [unk_token]
            tokens.extend(word_tokens)
            # Real label on the first sub-token, ignore-label on the others.
            continuation = [pad_token_label_id] * (len(word_tokens) - 1)
            for track, raw_label in zip(label_tracks, word_labels):
                track.extend([int(raw_label)] + continuation)

        max_body_len = max_seq_len - special_tokens_count
        if len(tokens) > max_body_len:
            tokens = tokens[:max_body_len]
            label_tracks = [track[:max_body_len] for track in label_tracks]

        # Surround with the cls / sep tokens.
        tokens = [cls_token] + tokens + [sep_token]
        token_type_ids = ([cls_token_segment_id]
                          + [sequence_a_segment_id] * (len(tokens) - 1))

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens (or the
        # reverse when mask_padding_with_zero is false).
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        label_padding = [pad_token_label_id] * padding_length
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + (
            [0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        (
            slot_labels_ids,
            pos_labels_ids,
            np_labels_ids,
            vp_labels_ids,
            entity_labels_ids,
            acronym_labels_ids,
        ) = [
            [pad_token_label_id] + track + [pad_token_label_id] + label_padding
            for track in label_tracks
        ]

        length_checks = (
            (input_ids, "Error with input length {} vs {}"),
            (attention_mask, "Error with attention mask length {} vs {}"),
            (token_type_ids, "Error with token type length {} vs {}"),
            (slot_labels_ids, "Error with slot labels length {} vs {}"),
            (pos_labels_ids, "Error with pos labels length {} vs {}"),
            (np_labels_ids, "Error with np labels length {} vs {}"),
            (vp_labels_ids, "Error with vp labels length {} vs {}"),
            (entity_labels_ids, "Error with entity labels length {} vs {}"),
            (acronym_labels_ids, "Error with acronym labels length {} vs {}"),
        )
        for sequence, message in length_checks:
            assert len(sequence) == max_seq_len, message.format(
                len(sequence), max_seq_len)

        intent_label_id = int(example.intent_label)

        if ex_index < 3:
            logging.debug(
                "Example created. guid: %s, tokens: %s, input_ids: %s, "
                "attention_mask: %s, token_type_ids: %s, intent_label: %s (id = %d), "
                "slot_labels: %s, POS_labels: %s, NP_labels: %s"
                "VP_labels: %s, entity_labels, %s acronym_labels: %s",
                example.guid,
                " ".join(str(x) for x in tokens),
                " ".join(str(x) for x in input_ids),
                " ".join(str(x) for x in attention_mask),
                " ".join(str(x) for x in token_type_ids),
                example.intent_label,
                intent_label_id,
                " ".join(str(x) for x in slot_labels_ids),
                " ".join(str(x) for x in pos_labels_ids),
                " ".join(str(x) for x in np_labels_ids),
                " ".join(str(x) for x in vp_labels_ids),
                " ".join(str(x) for x in entity_labels_ids),
                " ".join(str(x) for x in acronym_labels_ids),
            )

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                intent_label_id=intent_label_id,
                slot_labels_ids=slot_labels_ids,
                pos_labels_ids=pos_labels_ids,
                np_labels_ids=np_labels_ids,
                vp_labels_ids=vp_labels_ids,
                entity_labels_ids=entity_labels_ids,
                acronym_labels_ids=acronym_labels_ids,
            ))

    return features