def add_special_input_features(self, input_example: InputExample,
                               input_features: InputFeatures) -> None:
    mask_start = input_features.input_ids.index(
        self.wrapper.tokenizer.mask_token_id)

    for choice in ['choice1', 'choice2']:
        choice_text = input_example.meta[choice]
        choice_token_ids = get_verbalization_ids(
            choice_text, self.wrapper.tokenizer, force_single_token=False)
        mask_end = mask_start + len(choice_token_ids)

        input_features.meta[f'{choice}_token_ids'] = \
            [-100] * len(input_features.input_ids)
        input_features.meta[f'{choice}_token_ids'][
            mask_start:mask_end] = choice_token_ids
def get_input_features(self, example: InputExample,
                       **kwargs) -> InputFeatures:
    inputs = self.wrapper.tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        add_special_tokens=True,
        max_length=self.wrapper.config.max_seq_length,
    )
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_mask = [1] * len(input_ids)

    padding_length = self.wrapper.config.max_seq_length - len(input_ids)
    input_ids = input_ids + ([self.wrapper.tokenizer.pad_token_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)
    mlm_labels = [-1] * len(input_ids)

    assert len(input_ids) == self.wrapper.config.max_seq_length
    assert len(attention_mask) == self.wrapper.config.max_seq_length
    assert len(token_type_ids) == self.wrapper.config.max_seq_length

    label = self.label_map[example.label]
    logits = example.logits if example.logits else [-1]

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label,
                         mlm_labels=mlm_labels,
                         logits=logits)
def get_input_features(self, example: InputExample, labelled: bool,
                       **kwargs) -> InputFeatures:
    input_ids, token_type_ids = self.pvp.encode(example)
    attention_mask = [1] * len(input_ids)

    padding_length = self.wrapper.config.max_seq_length - len(input_ids)
    input_ids = input_ids + ([self.wrapper.tokenizer.pad_token_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    assert len(input_ids) == self.wrapper.config.max_seq_length
    assert len(attention_mask) == self.wrapper.config.max_seq_length
    assert len(token_type_ids) == self.wrapper.config.max_seq_length

    label = self.label_map[example.label]
    logits = example.logits if example.logits else [-1]

    if labelled:
        mlm_labels = self.pvp.get_mask_positions(input_ids)
    else:
        mlm_labels = [-1] * self.wrapper.config.max_seq_length

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label,
                         mlm_labels=mlm_labels,
                         logits=logits)
def _convert_examples_to_features(
        self,
        examples: List[InputExample],
        labelled: bool = True) -> List[InputFeatures]:
    """Convert examples to features for a model pretrained with a masked
    language modeling objective (e.g., BERT)."""
    features = []
    for example in examples:
        input_ids, token_type_ids, block_flag = self.pvp.encode(example)
        attention_mask = [1] * len(input_ids)

        padding_length = self.config.max_seq_length - len(input_ids)
        if padding_length < 0:
            raise ValueError(
                f"Maximum sequence length is too small, got {len(input_ids)} input ids"
            )
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        block_flag = block_flag + ([0] * padding_length)

        assert len(input_ids) == self.config.max_seq_length
        assert len(attention_mask) == self.config.max_seq_length
        assert len(token_type_ids) == self.config.max_seq_length
        assert len(block_flag) == self.config.max_seq_length

        label = self.label_map[
            example.label] if example.label is not None else -100
        logits = example.logits if example.logits else [-1]

        if labelled:
            mlm_labels = self.pvp.get_mask_positions(input_ids)
        else:
            mlm_labels = [-1] * self.config.max_seq_length

        input_features = InputFeatures(input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids,
                                       label=label,
                                       mlm_labels=mlm_labels,
                                       logits=logits,
                                       idx=example.idx,
                                       block_flag=block_flag)

        # Add meta input features
        if self.task_helper:
            self.task_helper.add_special_input_features(example, input_features)
        features.append(input_features)

    return features
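# The preprocessors above all follow the same right-padding recipe: extend
# input_ids with the pad token id, extend attention_mask / token_type_ids /
# block_flag with zeros, then assert the final lengths. A minimal,
# self-contained sketch of that recipe follows; the helper name and the
# example values in the usage comment (pad_token_id=0, max_seq_length=8,
# the token ids) are purely illustrative, not taken from any tokenizer.
def _pad_to_max_length(input_ids, attention_mask, token_type_ids,
                       max_seq_length, pad_token_id):
    padding_length = max_seq_length - len(input_ids)
    if padding_length < 0:
        raise ValueError(f"max_seq_length too small for {len(input_ids)} input ids")
    input_ids = input_ids + [pad_token_id] * padding_length
    attention_mask = attention_mask + [0] * padding_length
    token_type_ids = token_type_ids + [0] * padding_length
    assert len(input_ids) == len(attention_mask) == len(token_type_ids) == max_seq_length
    return input_ids, attention_mask, token_type_ids


# Example: a 5-token sequence padded to length 8.
# _pad_to_max_length([101, 7592, 2088, 102, 103], [1] * 5, [0] * 5,
#                    max_seq_length=8, pad_token_id=0)
# -> ([101, 7592, 2088, 102, 103, 0, 0, 0],
#     [1, 1, 1, 1, 1, 0, 0, 0],
#     [0, 0, 0, 0, 0, 0, 0, 0])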
def add_special_input_features(self, input_example: InputExample,
                               input_features: InputFeatures) -> None:
    mask_start = input_features.input_ids.index(
        self.wrapper.tokenizer.mask_token_id)
    num_masks = input_features.input_ids.count(
        self.wrapper.tokenizer.mask_token_id)
    mask_end = mask_start + num_masks

    target = input_example.meta['span1_text']
    input_features.meta['target'] = target
    target_token_ids = get_verbalization_ids(target,
                                             self.wrapper.tokenizer,
                                             force_single_token=False)
    input_features.meta['target_token_ids'] = \
        [-100] * len(input_features.input_ids)

    # we also predict <pad> tokens at the missing positions
    target_token_ids += [self.wrapper.tokenizer.pad_token_id] * \
        (num_masks - len(target_token_ids))
    input_features.meta['target_token_ids'][
        mask_start:mask_end] = target_token_ids
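# To make the masked-span labelling above concrete: meta['target_token_ids']
# is -100 (the usual ignore index) everywhere except the mask span, where the
# verbalized target ids go, padded with the pad token id up to the number of
# <mask> tokens. The function name and every id below are made up for
# illustration; only the layout mirrors the method above.
def _illustrate_target_token_ids():
    input_ids = [101, 11, 12, 103, 103, 103, 102, 0]  # three mask tokens (103)
    mask_start, num_masks = 3, 3
    target_token_ids = [21, 22]                        # verbalized target (2 ids)
    target_token_ids += [0] * (num_masks - len(target_token_ids))  # pad id = 0
    labels = [-100] * len(input_ids)
    labels[mask_start:mask_start + num_masks] = target_token_ids
    assert labels == [-100, -100, -100, 21, 22, 0, -100, -100]
    return labels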
def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: AutoTokenizer,
    max_length: Optional[int] = None,
    label_list: List = None,
    output_mode="classification",
):
    if max_length is None:
        max_length = tokenizer.max_len

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer(
        # [(example.text_a, example.text_b) for example in examples],
        [example.text_a for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
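# A hedged usage sketch for the batch-encoding converter above. It assumes
# the InputExample class defined elsewhere in this code accepts
# guid / text_a / label keyword arguments and that a HuggingFace tokenizer is
# available; both are assumptions, and the checkpoint name and labels are
# illustrative only.
# from transformers import AutoTokenizer
#
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# examples = [
#     InputExample(guid="train-0", text_a="a great movie", label="1"),
#     InputExample(guid="train-1", text_a="a dull movie", label="0"),
# ]
# features = convert_examples_to_features(
#     examples, tokenizer, max_length=32, label_list=["0", "1"],
#     output_mode="classification")
# print(features[0].input_ids[:8], features[0].label)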
def convert_examples_to_features(self,
                                 examples,
                                 max_seq_len1,
                                 max_seq_len2,
                                 tokenizer,
                                 cls_token='[CLS]',
                                 sep_token='[SEP]',
                                 pad_token=0,
                                 pad_token_label_id=-100,
                                 cls_token_segment_id=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0,
                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True):
    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    # unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            # logger.info("Writing example %d of %d" % (ex_index, len(examples)))
            # print("Writing example %d of %d" % (ex_index, len(examples)))
            pass

        # Tokenize the first sentence piece by piece, splitting it at the
        # character offsets of the mention so the mention mask lines up with
        # the mention's wordpieces.
        # tokens1_ = tokenizer.tokenize(example.text1)
        text1_a = example.text1[:example.mask1[0]]
        text1_b = example.text1[example.mask1[0]:example.mask1[1]]
        text1_c = example.text1[example.mask1[1]:]
        tokens1_a = tokenizer.tokenize(text1_a)
        tokens1_b = tokenizer.tokenize(text1_b)
        tokens1_c = tokenizer.tokenize(text1_c)
        tokens1_ = tokens1_a + tokens1_b + tokens1_c
        mention_mask1_ = [0] * len(tokens1_a) + [1] * len(tokens1_b) + [0] * len(tokens1_c)

        # Add [CLS] and [SEP] tokens
        tokens1_ = [cls_token] + tokens1_ + [sep_token]
        mention_mask1_ = [0] + mention_mask1_ + [0]
        token_type_ids1_ = [cls_token_segment_id] + \
            [sequence_a_segment_id] * (len(tokens1_) - 1)
        input_ids1_ = tokenizer.convert_tokens_to_ids(tokens1_)
        attention_mask1_ = [1 if mask_padding_with_zero else 0] * len(input_ids1_)

        # Alternative mention-mask construction (token matching), kept for reference:
        # mention_mask1_ = [0] * len(tokens1_)
        # word1_ = example.text1[example.mask1[0]:example.mask1[1]]
        # word_token1_ = tokenizer.tokenize(word1_)
        # for i in range(len(tokens1_)):
        #     if word_token1_ == tokens1_[i:i + len(word_token1_)]:
        #         for j in range(i, i + len(word_token1_)):
        #             mention_mask1_[j] = 1
        #         break
        # mention_mask1_ = [0] + example.mask1 + [0]

        tokens2 = []
        for word in example.text2:
            tokens2.append(tokenizer.tokenize(word))

        tokens2_ = []
        token_type_ids2_ = []
        for i, t in enumerate(tokens2):
            tokens2_ += t
            # Add [SEP] token after every segment
            tokens2_ += [sep_token]
            if i == 1 or i == 0:
                token_type_ids2_ += [sequence_a_segment_id] * (len(t) + 1)
            else:
                token_type_ids2_ += [sequence_b_segment_id] * (len(t) + 1)

        # Add [CLS] token
        tokens2_ = [cls_token] + tokens2_
        token_type_ids2_ = [cls_token_segment_id] + token_type_ids2_
        input_ids2_ = tokenizer.convert_tokens_to_ids(tokens2_)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask2_ = [1 if mask_padding_with_zero else 0] * len(input_ids2_)

        # sep_mask: mark the position of every [SEP] token with a one-hot vector
        sep_token_id = tokenizer.convert_tokens_to_ids(sep_token)
        sep_mask_ids = []
        for i, x in enumerate(input_ids2_):
            if x == sep_token_id:
                sep_mask_ids.append(i)
        sep_masks = []
        for i in sep_mask_ids:
            sep_mask0 = [0] * len(input_ids2_)
            sep_mask0[i] = 1
            sep_masks.append(sep_mask0)

        # Zero-pad up to the sequence length.
        padding_length2 = max_seq_len2 - len(input_ids2_)
        input_ids2_ = input_ids2_ + ([pad_token_id] * padding_length2)
        attention_mask2_ = attention_mask2_ + \
            ([0 if mask_padding_with_zero else 1] * padding_length2)
        token_type_ids2_ = token_type_ids2_ + \
            ([pad_token_segment_id] * padding_length2)

        padded_sep_masks = []
        for x in sep_masks:
            padded_sep_masks.append(x + ([0] * padding_length2))
        sep_masks2_ = np.array(padded_sep_masks)

        padding_length1 = max_seq_len1 - len(input_ids1_)
        input_ids1_ = input_ids1_ + ([pad_token_id] * padding_length1)
        attention_mask1_ = attention_mask1_ + \
            ([0 if mask_padding_with_zero else 1] * padding_length1)
        token_type_ids1_ = token_type_ids1_ + \
            ([pad_token_segment_id] * padding_length1)
        mention_mask1_ = mention_mask1_ + ([0] * padding_length1)

        assert len(input_ids2_) == max_seq_len2, \
            "Error with input2 length {} vs {}".format(len(input_ids2_), max_seq_len2)
        assert len(attention_mask2_) == max_seq_len2, \
            "Error with attention2 mask length {} vs {}".format(len(attention_mask2_), max_seq_len2)
        assert len(token_type_ids2_) == max_seq_len2, \
            "Error with token2 type length {} vs {}".format(len(token_type_ids2_), max_seq_len2)
        assert len(input_ids1_) == max_seq_len1, \
            "Error with input1 length {} vs {}".format(len(input_ids1_), max_seq_len1)
        assert len(attention_mask1_) == max_seq_len1, \
            "Error with attention1 mask length {} vs {}".format(len(attention_mask1_), max_seq_len1)
        assert len(token_type_ids1_) == max_seq_len1, \
            "Error with token1 type length {} vs {}".format(len(token_type_ids1_), max_seq_len1)
        assert len(mention_mask1_) == max_seq_len1, \
            "Error with mention1 mask length {} vs {}".format(len(mention_mask1_), max_seq_len1)

        label = int(example.label)

        # if ex_index < 5:
        #     logger.info("*** Example ***")
        #     logger.info("guid: %s" % example.guid)
        #     logger.info("tokens1: %s" % " ".join([str(x) for x in tokens1_]))
        #     logger.info("input_ids1: %s" % " ".join([str(x) for x in input_ids1_]))
        #     logger.info("attention_mask1: %s" % " ".join([str(x) for x in attention_mask1_]))
        #     logger.info("token_type_ids1: %s" % " ".join([str(x) for x in token_type_ids1_]))
        #     logger.info("mention_mask1: %s" % " ".join([str(x) for x in mention_mask1_]))
        #     logger.info("tokens2: %s" % " ".join([str(x) for x in tokens2_]))
        #     logger.info("input_ids2: %s" % " ".join([str(x) for x in input_ids2_]))
        #     logger.info("attention_mask2: %s" % " ".join([str(x) for x in attention_mask2_]))
        #     logger.info("token_type_ids2: %s" % " ".join([str(x) for x in token_type_ids2_]))
        #     logger.info("sep_mask_ids: %s" % " ".join([str(x) for x in sep_mask_ids]))
        #     logger.info("intent_label: %d" % example.label)

        features.append(
            InputFeatures(input_ids1=input_ids1_,
                          attention_mask1=attention_mask1_,
                          token_type_ids1=token_type_ids1_,
                          mention_masks=mention_mask1_,
                          input_ids2=input_ids2_,
                          attention_mask2=attention_mask2_,
                          token_type_ids2=token_type_ids2_,
                          sep_masks2=sep_masks2_,
                          labels=label))

    return features
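# A hedged sketch of the mention-mask construction used above: the sentence is
# split at the character offsets around the mention, each piece is tokenized
# separately, and the mask marks the mention's wordpieces. The checkpoint name
# and the example span are illustrative; any tokenizer exposing .tokenize()
# behaves the same way.
# from transformers import AutoTokenizer
#
# tok = AutoTokenizer.from_pretrained("bert-base-uncased")
# text, span = "The quick brown fox jumps", (10, 19)     # span covers "brown fox"
# left, mention, right = text[:span[0]], text[span[0]:span[1]], text[span[1]:]
# tokens = tok.tokenize(left) + tok.tokenize(mention) + tok.tokenize(right)
# mention_mask = ([0] * len(tok.tokenize(left))
#                 + [1] * len(tok.tokenize(mention))
#                 + [0] * len(tok.tokenize(right)))
# assert len(tokens) == len(mention_mask)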
def glue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of ``InputFeatures``.

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples.
        max_length: Maximum example length.
        task: GLUE task.
        label_list: List of labels. Can be obtained from the processor using the
            ``processor.get_labels()`` method.
        output_mode: String indicating the output mode. Either ``regression`` or
            ``classification``.
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than
            on the right (default).
        pad_token: Padding token.
        pad_token_segment_id: The segment ID for the padding token (it is usually 0, but can
            vary, e.g. for XLNet it is 4).
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1``
            for actual values and by ``0`` for padded values. If set to ``False``, inverts it
            (``1`` for padded values, ``0`` for actual values).

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``,
        will return a list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        len_examples = 0
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)
            len_examples = tf.data.experimental.cardinality(examples)
        else:
            len_examples = len(examples)
        if ex_index % 10000 == 0:
            logger.info("Writing example %d/%d" % (ex_index, len_examples))

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, \
            "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, \
            "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, \
            "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if is_tf_available() and is_tf_dataset:

        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({
                "input_ids": tf.int32,
                "attention_mask": tf.int32,
                "token_type_ids": tf.int32
            }, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )

    return features
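# A hedged usage sketch showing how the returned InputFeatures are typically
# turned into a PyTorch dataset. The "mrpc" task name, the checkpoint name,
# and the assumption that `examples` comes from a GLUE processor defined
# elsewhere are all illustrative, not requirements of the function above.
# import torch
# from torch.utils.data import TensorDataset
# from transformers import BertTokenizer
#
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# features = glue_convert_examples_to_features(
#     examples, tokenizer, max_length=128, task="mrpc")
# dataset = TensorDataset(
#     torch.tensor([f.input_ids for f in features], dtype=torch.long),
#     torch.tensor([f.attention_mask for f in features], dtype=torch.long),
#     torch.tensor([f.token_type_ids for f in features], dtype=torch.long),
#     torch.tensor([f.label for f in features], dtype=torch.long),
# )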
def add_special_input_features(self, input_example: InputExample,
                               input_features: InputFeatures) -> None:
    input_features.meta['question_idx'] = input_example.meta['question_idx']
def get_input_features(self, example: InputExample, labelled: bool,
                       **kwargs) -> InputFeatures:
    ### NEW ###
    if self.few_shot_data is not None:
        cls_id = self.wrapper.tokenizer.cls_token_id
        sep_id = self.wrapper.tokenizer.sep_token_id
        mask_id = self.wrapper.tokenizer.mask_token_id

        def preprocessed_ex_ids(ex, labelize):
            ex_input_ids = self.pvp.encode(ex)[0]
            # Remove the cls token
            while cls_id in ex_input_ids:
                ex_input_ids.pop(ex_input_ids.index(cls_id))
            # Remove any sep token(s) before the mask token
            while (ex_input_ids.index(mask_id) - 1 >= 0
                   and ex_input_ids[ex_input_ids.index(mask_id) - 1] == sep_id):
                ex_input_ids.pop(ex_input_ids.index(mask_id) - 1)
            if not labelize:
                return ex_input_ids
            # Replace <mask> with the label
            label = _prepare(self.pvp.verbalize(ex.label)[0],
                             self.wrapper.tokenizer)
            label_id = self.wrapper.tokenizer.convert_tokens_to_ids(label)
            return [label_id if tok_id == mask_id else tok_id
                    for tok_id in ex_input_ids]

        input_ids = preprocessed_ex_ids(example, labelize=False)
        cond = []
        for ex in self.few_shot_data:
            new_ex = preprocessed_ex_ids(ex, labelize=True)
            if (1 + len(sum(cond, [])) + len(new_ex) + len(input_ids)
                    > self.wrapper.config.max_seq_length):
                break
            cond.append(new_ex)
        # random.shuffle(cond)  # shuffle few-shot examples
        # cond.insert(0, input_ids)  # prompt at the beginning
        # cond.insert(len(cond) // 2, input_ids)  # prompt in the middle
        cond.insert(len(cond), input_ids)  # prompt at the end
        input_ids = sum(cond, [])
        token_type_ids = [0] * len(input_ids)
        # print(f'Conditioning on {len(cond)}/'
        #       f'{len(self.few_shot_data)} examples; '
        #       f'labels: {[e.label for e in self.few_shot_data[:len(cond)]]}')
    else:
        input_ids, token_type_ids = self.pvp.encode(example)
    ### NEW ###

    attention_mask = [1] * len(input_ids)
    padding_length = self.wrapper.config.max_seq_length - len(input_ids)
    input_ids = input_ids + ([self.wrapper.tokenizer.pad_token_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    ### NEW ###
    # print example input
    # print('*****************************')
    # print(self.wrapper.tokenizer.decode(input_ids))
    # assert False
    ### NEW ###

    assert len(input_ids) == self.wrapper.config.max_seq_length
    assert len(attention_mask) == self.wrapper.config.max_seq_length
    assert len(token_type_ids) == self.wrapper.config.max_seq_length

    label = self.label_map[example.label]
    logits = example.logits if example.logits else [-1]

    if labelled:
        mlm_labels = self.pvp.get_mask_positions(input_ids)
    else:
        mlm_labels = [-1] * self.wrapper.config.max_seq_length

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label,
                         mlm_labels=mlm_labels,
                         logits=logits)
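# A self-contained illustration of the length-budget loop in the few-shot
# branch above: demonstrations are appended only while the flattened sequence
# (plus one slot for the leading special token and the query itself) still
# fits into max_seq_length, and the query goes at the end. The function name
# and every token id below are made up for illustration.
def _illustrate_few_shot_budget():
    demonstrations = [[11, 12, 13], [21, 22], [31, 32, 33, 34]]
    query, max_seq_length = [91, 92, 93], 12
    cond = []
    for demo in demonstrations:
        if 1 + len(sum(cond, [])) + len(demo) + len(query) > max_seq_length:
            break
        cond.append(demo)
    cond.append(query)  # prompt at the end, as above
    flattened = sum(cond, [])
    # The third demonstration no longer fits, so it is dropped.
    assert flattened == [11, 12, 13, 21, 22, 91, 92, 93]
    return flattened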
def convert_examples_to_features(examples,
                                 tokenizer,
                                 max_length=512,
                                 task=None,
                                 label_list=None,
                                 output_mode=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``.

    Args:
        examples: List of ``InputExamples`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples.
        max_length: Maximum example length.
        task: GLUE task.
        label_list: List of labels. Can be obtained from the processor using the
            ``processor.get_labels()`` method.
        output_mode: String indicating the output mode. Either ``regression`` or
            ``classification``.
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than
            on the right (default).
        pad_token: Padding token.
        pad_token_segment_id: The segment ID for the padding token (it is usually 0, but can
            vary, e.g. for XLNet it is 4).
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1``
            for actual values and by ``0`` for padded values. If set to ``False``, inverts it
            (``1`` for padded values, ``0`` for actual values).

    Returns:
        A list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    # Note: the label list is hard-coded to five classes here, overriding the
    # ``label_list`` argument.
    label_list = ["0", "1", "2", "3", "4"]
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, \
            "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, \
            "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, \
            "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    return features
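# A tiny, self-contained illustration of the label handling shared by the two
# GLUE-style converters above: classification labels are mapped through
# label_map, regression labels are cast to float. The function name and the
# values are illustrative only.
def _illustrate_label_mapping():
    label_list = ["0", "1", "2", "3", "4"]
    label_map = {label: i for i, label in enumerate(label_list)}
    assert label_map["3"] == 3       # classification: string label -> class index
    assert float("2.5") == 2.5       # regression: string label -> float target
    return label_map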