def add_special_input_features(self, input_example: InputExample,
                               input_features: InputFeatures) -> None:
    mask_start = input_features.input_ids.index(
        self.wrapper.tokenizer.mask_token_id)

    for choice in ['choice1', 'choice2']:
        choice_text = input_example.meta[choice]
        choice_token_ids = get_verbalization_ids(
            choice_text, self.wrapper.tokenizer, force_single_token=False)
        mask_end = mask_start + len(choice_token_ids)

        input_features.meta[f'{choice}_token_ids'] = \
            [-100] * len(input_features.input_ids)
        input_features.meta[f'{choice}_token_ids'][
            mask_start:mask_end] = choice_token_ids
def get_input_features(self, example: InputExample,
                       **kwargs) -> InputFeatures:
    inputs = self.wrapper.tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        add_special_tokens=True,
        max_length=self.wrapper.config.max_seq_length,
    )
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_mask = [1] * len(input_ids)

    padding_length = self.wrapper.config.max_seq_length - len(input_ids)
    input_ids = input_ids + ([self.wrapper.tokenizer.pad_token_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)
    mlm_labels = [-1] * len(input_ids)

    assert len(input_ids) == self.wrapper.config.max_seq_length
    assert len(attention_mask) == self.wrapper.config.max_seq_length
    assert len(token_type_ids) == self.wrapper.config.max_seq_length

    label = self.label_map[example.label]
    logits = example.logits if example.logits else [-1]

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label,
                         mlm_labels=mlm_labels,
                         logits=logits)
def get_input_features(self, example: InputExample, labelled: bool,
                       **kwargs) -> InputFeatures:
    input_ids, token_type_ids = self.pvp.encode(example)
    attention_mask = [1] * len(input_ids)

    padding_length = self.wrapper.config.max_seq_length - len(input_ids)
    input_ids = input_ids + ([self.wrapper.tokenizer.pad_token_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    assert len(input_ids) == self.wrapper.config.max_seq_length
    assert len(attention_mask) == self.wrapper.config.max_seq_length
    assert len(token_type_ids) == self.wrapper.config.max_seq_length

    label = self.label_map[example.label]
    logits = example.logits if example.logits else [-1]

    if labelled:
        mlm_labels = self.pvp.get_mask_positions(input_ids)
    else:
        mlm_labels = [-1] * self.wrapper.config.max_seq_length

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label,
                         mlm_labels=mlm_labels,
                         logits=logits)
def _convert_examples_to_features(
        self,
        examples: List[InputExample],
        labelled: bool = True) -> List[InputFeatures]:
    """Convert examples to features for a model pretrained with a masked
    language modeling objective (e.g., BERT)."""
    features = []
    for example in examples:
        input_ids, token_type_ids, block_flag = self.pvp.encode(example)
        attention_mask = [1] * len(input_ids)

        padding_length = self.config.max_seq_length - len(input_ids)
        if padding_length < 0:
            raise ValueError(
                f"Maximum sequence length is too small, got {len(input_ids)} input ids"
            )
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        block_flag = block_flag + ([0] * padding_length)

        assert len(input_ids) == self.config.max_seq_length
        assert len(attention_mask) == self.config.max_seq_length
        assert len(token_type_ids) == self.config.max_seq_length
        assert len(block_flag) == self.config.max_seq_length

        label = self.label_map[
            example.label] if example.label is not None else -100
        logits = example.logits if example.logits else [-1]

        if labelled:
            mlm_labels = self.pvp.get_mask_positions(input_ids)
        else:
            mlm_labels = [-1] * self.config.max_seq_length

        input_features = InputFeatures(input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids,
                                       label=label,
                                       mlm_labels=mlm_labels,
                                       logits=logits,
                                       idx=example.idx,
                                       block_flag=block_flag)

        # Add meta input features
        if self.task_helper:
            self.task_helper.add_special_input_features(example, input_features)
        features.append(input_features)

    return features
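# The preprocessors above all follow the same right-padding recipe: extend
# input_ids with the pad token id, extend attention_mask / token_type_ids /
# block_flag with zeros, then assert the final lengths. A minimal,
# self-contained sketch of that recipe follows; the helper name and the
# example values in the usage comment (pad_token_id=0, max_seq_length=8,
# the token ids) are purely illustrative, not taken from any tokenizer.
def _pad_to_max_length(input_ids, attention_mask, token_type_ids,
                       max_seq_length, pad_token_id):
    padding_length = max_seq_length - len(input_ids)
    if padding_length < 0:
        raise ValueError(f"max_seq_length too small for {len(input_ids)} input ids")
    input_ids = input_ids + [pad_token_id] * padding_length
    attention_mask = attention_mask + [0] * padding_length
    token_type_ids = token_type_ids + [0] * padding_length
    assert len(input_ids) == len(attention_mask) == len(token_type_ids) == max_seq_length
    return input_ids, attention_mask, token_type_ids


# Example: a 5-token sequence padded to length 8.
# _pad_to_max_length([101, 7592, 2088, 102, 103], [1] * 5, [0] * 5,
#                    max_seq_length=8, pad_token_id=0)
# -> ([101, 7592, 2088, 102, 103, 0, 0, 0],
#     [1, 1, 1, 1, 1, 0, 0, 0],
#     [0, 0, 0, 0, 0, 0, 0, 0])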
def add_special_input_features(self, input_example: InputExample,
                               input_features: InputFeatures) -> None:
    mask_start = input_features.input_ids.index(
        self.wrapper.tokenizer.mask_token_id)
    num_masks = input_features.input_ids.count(
        self.wrapper.tokenizer.mask_token_id)
    mask_end = mask_start + num_masks

    target = input_example.meta['span1_text']
    input_features.meta['target'] = target
    target_token_ids = get_verbalization_ids(target,
                                             self.wrapper.tokenizer,
                                             force_single_token=False)
    input_features.meta['target_token_ids'] = \
        [-100] * len(input_features.input_ids)

    # we also predict <pad> tokens at the missing positions
    target_token_ids += [self.wrapper.tokenizer.pad_token_id] * \
        (num_masks - len(target_token_ids))
    input_features.meta['target_token_ids'][
        mask_start:mask_end] = target_token_ids
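# To make the masked-span labelling above concrete: meta['target_token_ids']
# is -100 (the usual ignore index) everywhere except the mask span, where the
# verbalized target ids go, padded with the pad token id up to the number of
# <mask> tokens. The function name and every id below are made up for
# illustration; only the layout mirrors the method above.
def _illustrate_target_token_ids():
    input_ids = [101, 11, 12, 103, 103, 103, 102, 0]  # three mask tokens (103)
    mask_start, num_masks = 3, 3
    target_token_ids = [21, 22]                        # verbalized target (2 ids)
    target_token_ids += [0] * (num_masks - len(target_token_ids))  # pad id = 0
    labels = [-100] * len(input_ids)
    labels[mask_start:mask_start + num_masks] = target_token_ids
    assert labels == [-100, -100, -100, 21, 22, 0, -100, -100]
    return labels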
def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: AutoTokenizer,
    max_length: Optional[int] = None,
    label_list: List = None,
    output_mode="classification",
):
    if max_length is None:
        max_length = tokenizer.max_len

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer(
        # [(example.text_a, example.text_b) for example in examples],
        [example.text_a for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
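# A hedged usage sketch for the batch-encoding converter above. It assumes
# the InputExample class defined elsewhere in this code accepts
# guid / text_a / label keyword arguments and that a HuggingFace tokenizer is
# available; both are assumptions, and the checkpoint name and labels are
# illustrative only.
# from transformers import AutoTokenizer
#
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# examples = [
#     InputExample(guid="train-0", text_a="a great movie", label="1"),
#     InputExample(guid="train-1", text_a="a dull movie", label="0"),
# ]
# features = convert_examples_to_features(
#     examples, tokenizer, max_length=32, label_list=["0", "1"],
#     output_mode="classification")
# print(features[0].input_ids[:8], features[0].label)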
def convert_examples_to_features(self,
                                 examples,
                                 max_seq_len1,
                                 max_seq_len2,
                                 tokenizer,
                                 cls_token='[CLS]',
                                 sep_token='[SEP]',
                                 pad_token=0,
                                 pad_token_label_id=-100,
                                 cls_token_segment_id=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0,
                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True):
    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    # unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            # logger.info("Writing example %d of %d" % (ex_index, len(examples)))
            # print("Writing example %d of %d" % (ex_index, len(examples)))
            pass

        # Tokenize the first sentence piece by piece, splitting it at the
        # character offsets of the mention so the mention mask lines up with
        # the mention's wordpieces.
        # tokens1_ = tokenizer.tokenize(example.text1)
        text1_a = example.text1[:example.mask1[0]]
        text1_b = example.text1[example.mask1[0]:example.mask1[1]]
        text1_c = example.text1[example.mask1[1]:]
        tokens1_a = tokenizer.tokenize(text1_a)
        tokens1_b = tokenizer.tokenize(text1_b)
        tokens1_c = tokenizer.tokenize(text1_c)
        tokens1_ = tokens1_a + tokens1_b + tokens1_c
        mention_mask1_ = [0] * len(tokens1_a) + [1] * len(tokens1_b) + [0] * len(tokens1_c)

        # Add [CLS] and [SEP] tokens
        tokens1_ = [cls_token] + tokens1_ + [sep_token]
        mention_mask1_ = [0] + mention_mask1_ + [0]
        token_type_ids1_ = [cls_token_segment_id] + \
            [sequence_a_segment_id] * (len(tokens1_) - 1)
        input_ids1_ = tokenizer.convert_tokens_to_ids(tokens1_)
        attention_mask1_ = [1 if mask_padding_with_zero else 0] * len(input_ids1_)

        # Alternative mention-mask construction (token matching), kept for reference:
        # mention_mask1_ = [0] * len(tokens1_)
        # word1_ = example.text1[example.mask1[0]:example.mask1[1]]
        # word_token1_ = tokenizer.tokenize(word1_)
        # for i in range(len(tokens1_)):
        #     if word_token1_ == tokens1_[i:i + len(word_token1_)]:
        #         for j in range(i, i + len(word_token1_)):
        #             mention_mask1_[j] = 1
        #         break
        # mention_mask1_ = [0] + example.mask1 + [0]

        tokens2 = []
        for word in example.text2:
            tokens2.append(tokenizer.tokenize(word))

        tokens2_ = []
        token_type_ids2_ = []
        for i, t in enumerate(tokens2):
            tokens2_ += t
            # Add [SEP] token after every segment
            tokens2_ += [sep_token]
            if i == 1 or i == 0:
                token_type_ids2_ += [sequence_a_segment_id] * (len(t) + 1)
            else:
                token_type_ids2_ += [sequence_b_segment_id] * (len(t) + 1)

        # Add [CLS] token
        tokens2_ = [cls_token] + tokens2_
        token_type_ids2_ = [cls_token_segment_id] + token_type_ids2_
        input_ids2_ = tokenizer.convert_tokens_to_ids(tokens2_)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask2_ = [1 if mask_padding_with_zero else 0] * len(input_ids2_)

        # sep_mask: mark the position of every [SEP] token with a one-hot vector
        sep_token_id = tokenizer.convert_tokens_to_ids(sep_token)
        sep_mask_ids = []
        for i, x in enumerate(input_ids2_):
            if x == sep_token_id:
                sep_mask_ids.append(i)
        sep_masks = []
        for i in sep_mask_ids:
            sep_mask0 = [0] * len(input_ids2_)
            sep_mask0[i] = 1
            sep_masks.append(sep_mask0)

        # Zero-pad up to the sequence length.
        padding_length2 = max_seq_len2 - len(input_ids2_)
        input_ids2_ = input_ids2_ + ([pad_token_id] * padding_length2)
        attention_mask2_ = attention_mask2_ + \
            ([0 if mask_padding_with_zero else 1] * padding_length2)
        token_type_ids2_ = token_type_ids2_ + \
            ([pad_token_segment_id] * padding_length2)

        padded_sep_masks = []
        for x in sep_masks:
            padded_sep_masks.append(x + ([0] * padding_length2))
        sep_masks2_ = np.array(padded_sep_masks)

        padding_length1 = max_seq_len1 - len(input_ids1_)
        input_ids1_ = input_ids1_ + ([pad_token_id] * padding_length1)
        attention_mask1_ = attention_mask1_ + \
            ([0 if mask_padding_with_zero else 1] * padding_length1)
        token_type_ids1_ = token_type_ids1_ + \
            ([pad_token_segment_id] * padding_length1)
        mention_mask1_ = mention_mask1_ + ([0] * padding_length1)

        assert len(input_ids2_) == max_seq_len2, \
            "Error with input2 length {} vs {}".format(len(input_ids2_), max_seq_len2)
        assert len(attention_mask2_) == max_seq_len2, \
            "Error with attention2 mask length {} vs {}".format(len(attention_mask2_), max_seq_len2)
        assert len(token_type_ids2_) == max_seq_len2, \
            "Error with token2 type length {} vs {}".format(len(token_type_ids2_), max_seq_len2)
        assert len(input_ids1_) == max_seq_len1, \
            "Error with input1 length {} vs {}".format(len(input_ids1_), max_seq_len1)
        assert len(attention_mask1_) == max_seq_len1, \
            "Error with attention1 mask length {} vs {}".format(len(attention_mask1_), max_seq_len1)
        assert len(token_type_ids1_) == max_seq_len1, \
            "Error with token1 type length {} vs {}".format(len(token_type_ids1_), max_seq_len1)
        assert len(mention_mask1_) == max_seq_len1, \
            "Error with mention1 mask length {} vs {}".format(len(mention_mask1_), max_seq_len1)

        label = int(example.label)

        # if ex_index < 5:
        #     logger.info("*** Example ***")
        #     logger.info("guid: %s" % example.guid)
        #     logger.info("tokens1: %s" % " ".join([str(x) for x in tokens1_]))
        #     logger.info("input_ids1: %s" % " ".join([str(x) for x in input_ids1_]))
        #     logger.info("attention_mask1: %s" % " ".join([str(x) for x in attention_mask1_]))
        #     logger.info("token_type_ids1: %s" % " ".join([str(x) for x in token_type_ids1_]))
        #     logger.info("mention_mask1: %s" % " ".join([str(x) for x in mention_mask1_]))
        #     logger.info("tokens2: %s" % " ".join([str(x) for x in tokens2_]))
        #     logger.info("input_ids2: %s" % " ".join([str(x) for x in input_ids2_]))
        #     logger.info("attention_mask2: %s" % " ".join([str(x) for x in attention_mask2_]))
        #     logger.info("token_type_ids2: %s" % " ".join([str(x) for x in token_type_ids2_]))
        #     logger.info("sep_mask_ids: %s" % " ".join([str(x) for x in sep_mask_ids]))
        #     logger.info("intent_label: %d" % example.label)

        features.append(
            InputFeatures(input_ids1=input_ids1_,
                          attention_mask1=attention_mask1_,
                          token_type_ids1=token_type_ids1_,
                          mention_masks=mention_mask1_,
                          input_ids2=input_ids2_,
                          attention_mask2=attention_mask2_,
                          token_type_ids2=token_type_ids2_,
                          sep_masks2=sep_masks2_,
                          labels=label))

    return features
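# A hedged sketch of the mention-mask construction used above: the sentence is
# split at the character offsets around the mention, each piece is tokenized
# separately, and the mask marks the mention's wordpieces. The checkpoint name
# and the example span are illustrative; any tokenizer exposing .tokenize()
# behaves the same way.
# from transformers import AutoTokenizer
#
# tok = AutoTokenizer.from_pretrained("bert-base-uncased")
# text, span = "The quick brown fox jumps", (10, 19)     # span covers "brown fox"
# left, mention, right = text[:span[0]], text[span[0]:span[1]], text[span[1]:]
# tokens = tok.tokenize(left) + tok.tokenize(mention) + tok.tokenize(right)
# mention_mask = ([0] * len(tok.tokenize(left))
#                 + [1] * len(tok.tokenize(mention))
#                 + [0] * len(tok.tokenize(right)))
# assert len(tokens) == len(mention_mask)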
def glue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of ``InputFeatures``.

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples.
        max_length: Maximum example length.
        task: GLUE task.
        label_list: List of labels. Can be obtained from the processor using the
            ``processor.get_labels()`` method.
        output_mode: String indicating the output mode. Either ``regression`` or
            ``classification``.
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than
            on the right (default).
        pad_token: Padding token.
        pad_token_segment_id: The segment ID for the padding token (it is usually 0, but can
            vary, e.g. for XLNet it is 4).
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1``
            for actual values and by ``0`` for padded values. If set to ``False``, inverts it
            (``1`` for padded values, ``0`` for actual values).

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``,
        will return a list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        len_examples = 0
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)
            len_examples = tf.data.experimental.cardinality(examples)
        else:
            len_examples = len(examples)
        if ex_index % 10000 == 0:
            logger.info("Writing example %d/%d" % (ex_index, len_examples))

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, \
            "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, \
            "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, \
            "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if is_tf_available() and is_tf_dataset:

        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({
                "input_ids": tf.int32,
                "attention_mask": tf.int32,
                "token_type_ids": tf.int32
            }, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )

    return features
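# A hedged usage sketch showing how the returned InputFeatures are typically
# turned into a PyTorch dataset. The "mrpc" task name, the checkpoint name,
# and the assumption that `examples` comes from a GLUE processor defined
# elsewhere are all illustrative, not requirements of the function above.
# import torch
# from torch.utils.data import TensorDataset
# from transformers import BertTokenizer
#
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# features = glue_convert_examples_to_features(
#     examples, tokenizer, max_length=128, task="mrpc")
# dataset = TensorDataset(
#     torch.tensor([f.input_ids for f in features], dtype=torch.long),
#     torch.tensor([f.attention_mask for f in features], dtype=torch.long),
#     torch.tensor([f.token_type_ids for f in features], dtype=torch.long),
#     torch.tensor([f.label for f in features], dtype=torch.long),
# )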
def add_special_input_features(self, input_example: InputExample,
                               input_features: InputFeatures) -> None:
    input_features.meta['question_idx'] = input_example.meta['question_idx']
def get_input_features(self, example: InputExample, labelled: bool,
                       **kwargs) -> InputFeatures:
    ### NEW ###
    if self.few_shot_data is not None:
        cls_id = self.wrapper.tokenizer.cls_token_id
        sep_id = self.wrapper.tokenizer.sep_token_id
        mask_id = self.wrapper.tokenizer.mask_token_id

        def preprocessed_ex_ids(ex, labelize):
            ex_input_ids = self.pvp.encode(ex)[0]
            # Remove the cls token
            while cls_id in ex_input_ids:
                ex_input_ids.pop(ex_input_ids.index(cls_id))
            # Remove any sep token(s) before the mask token
            while (ex_input_ids.index(mask_id) - 1 >= 0
                   and ex_input_ids[ex_input_ids.index(mask_id) - 1] == sep_id):
                ex_input_ids.pop(ex_input_ids.index(mask_id) - 1)
            if not labelize:
                return ex_input_ids
            # Replace <mask> with the label
            label = _prepare(self.pvp.verbalize(ex.label)[0],
                             self.wrapper.tokenizer)
            label_id = self.wrapper.tokenizer.convert_tokens_to_ids(label)
            return [label_id if tok_id == mask_id else tok_id
                    for tok_id in ex_input_ids]

        input_ids = preprocessed_ex_ids(example, labelize=False)
        cond = []
        for ex in self.few_shot_data:
            new_ex = preprocessed_ex_ids(ex, labelize=True)
            if (1 + len(sum(cond, [])) + len(new_ex) + len(input_ids)
                    > self.wrapper.config.max_seq_length):
                break
            cond.append(new_ex)
        # random.shuffle(cond)  # shuffle few-shot examples
        # cond.insert(0, input_ids)  # prompt at the beginning
        # cond.insert(len(cond) // 2, input_ids)  # prompt in the middle
        cond.insert(len(cond), input_ids)  # prompt at the end
        input_ids = sum(cond, [])
        token_type_ids = [0] * len(input_ids)
        # print(f'Conditioning on {len(cond)}/'
        #       f'{len(self.few_shot_data)} examples; '
        #       f'labels: {[e.label for e in self.few_shot_data[:len(cond)]]}')
    else:
        input_ids, token_type_ids = self.pvp.encode(example)
    ### NEW ###

    attention_mask = [1] * len(input_ids)
    padding_length = self.wrapper.config.max_seq_length - len(input_ids)
    input_ids = input_ids + ([self.wrapper.tokenizer.pad_token_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    ### NEW ###
    # print example input
    # print('*****************************')
    # print(self.wrapper.tokenizer.decode(input_ids))
    # assert False
    ### NEW ###

    assert len(input_ids) == self.wrapper.config.max_seq_length
    assert len(attention_mask) == self.wrapper.config.max_seq_length
    assert len(token_type_ids) == self.wrapper.config.max_seq_length

    label = self.label_map[example.label]
    logits = example.logits if example.logits else [-1]

    if labelled:
        mlm_labels = self.pvp.get_mask_positions(input_ids)
    else:
        mlm_labels = [-1] * self.wrapper.config.max_seq_length

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label,
                         mlm_labels=mlm_labels,
                         logits=logits)
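# A self-contained illustration of the length-budget loop in the few-shot
# branch above: demonstrations are appended only while the flattened sequence
# (plus one slot for the leading special token and the query itself) still
# fits into max_seq_length, and the query goes at the end. The function name
# and every token id below are made up for illustration.
def _illustrate_few_shot_budget():
    demonstrations = [[11, 12, 13], [21, 22], [31, 32, 33, 34]]
    query, max_seq_length = [91, 92, 93], 12
    cond = []
    for demo in demonstrations:
        if 1 + len(sum(cond, [])) + len(demo) + len(query) > max_seq_length:
            break
        cond.append(demo)
    cond.append(query)  # prompt at the end, as above
    flattened = sum(cond, [])
    # The third demonstration no longer fits, so it is dropped.
    assert flattened == [11, 12, 13, 21, 22, 91, 92, 93]
    return flattened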
def convert_examples_to_features(examples,
                                 tokenizer,
                                 max_length=512,
                                 task=None,
                                 label_list=None,
                                 output_mode=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``.

    Args:
        examples: List of ``InputExamples`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples.
        max_length: Maximum example length.
        task: GLUE task.
        label_list: List of labels. Can be obtained from the processor using the
            ``processor.get_labels()`` method.
        output_mode: String indicating the output mode. Either ``regression`` or
            ``classification``.
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than
            on the right (default).
        pad_token: Padding token.
        pad_token_segment_id: The segment ID for the padding token (it is usually 0, but can
            vary, e.g. for XLNet it is 4).
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1``
            for actual values and by ``0`` for padded values. If set to ``False``, inverts it
            (``1`` for padded values, ``0`` for actual values).

    Returns:
        A list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    # Note: the label list is hard-coded to five classes here, overriding the
    # ``label_list`` argument.
    label_list = ["0", "1", "2", "3", "4"]
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, \
            "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, \
            "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, \
            "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    return features
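# A tiny, self-contained illustration of the label handling shared by the two
# GLUE-style converters above: classification labels are mapped through
# label_map, regression labels are cast to float. The function name and the
# values are illustrative only.
def _illustrate_label_mapping():
    label_list = ["0", "1", "2", "3", "4"]
    label_map = {label: i for i, label in enumerate(label_list)}
    assert label_map["3"] == 3       # classification: string label -> class index
    assert float("2.5") == 2.5       # regression: string label -> float target
    return label_map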