Example #1
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
Example #2
    def _create_examples(self, lst, set_type):
        """Creates examples for the training and dev sets."""
        examples = []

        for _id, data in tqdm(enumerate(lst), desc='Processing NLI'):
            guid = "%s-%s" % (set_type, _id)
            text_a = data['data'][0]
            text_b = data['data'][1]
            label = data['label']
            assert label in self.get_labels(), f"Label: {label}, data:{data}"
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
Example #3
    def _create_examples(self, data, set_type):
        """Creates examples for the training and dev sets."""
        # dict_keys(['orig_title', 'duplicate_title', 'duplicate_id', 'orig_id', 'label'])
        examples = []
        for datum in tqdm(data, desc="parsing"):
            guid = "%s-%s-%s" % (set_type, datum["orig_id"], datum["duplicate_id"])
            text_a = datum['orig_title']
            text_b = datum['duplicate_title']
            label = int(datum['label'])
            assert label in self.get_labels()
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            )
        return examples
Example #4
    def _create_examples(self, df, set_type):
        """Creates examples for the training and dev sets."""
        examples = []

        for _id, row in tqdm(df.iterrows(), desc="Preparing Data", total=len(df)):
            guid = "%s-%s" % (set_type, row["id"])
            text_a = str(row["question1"])
            text_b = str(row["question2"])
            label = int(row["is_duplicate"])
            assert label in self.get_labels()
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            )
        return examples
Example #5
    def _create_examples(self, lst, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        lst = lst[0]
        for _id, key in tqdm(enumerate(lst), desc="Preparing Data", total=len(lst)):
            data = lst[key]
            guid = "%s-%s" % (set_type, _id)
            text_a = data["data"][0]
            text_b = data["data"][1]
            label = int(data["label"])
            assert label in self.get_labels()
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            )
        return examples
Example #6
    def _create_examples(self, questions, set_type):
        """Creates examples for the training and dev sets."""
        examples = []

        def process_text(s, sent_a=True):
            # Currently a pass-through; `sent_a` flags whether `s` is the question (True)
            # or the answer (False). Left as a hook for text normalization.
            return s

        for qid in questions:
            question_text = questions[qid].text
            for answer in questions[qid].answers:
                guid = "%s-%s" % (set_type, str(qid)+"_"+str(answer.aid))
                answer_text = answer.text

                text_a = process_text(question_text)
                text_b = process_text(answer_text, sent_a=False)

                label = 1 if answer.reference_score > 2 else 0

                examples.append(
                    InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

        return examples
Example #7
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence1'].numpy().decode('utf-8'),
                            tensor_dict['sentence2'].numpy().decode('utf-8'),
                            str(tensor_dict['label'].numpy()))
Example #8
def glue_convert_examples_to_features(examples, tokenizer,
                                      max_length=512,
                                      task=None,
                                      label_list=None,
                                      output_mode=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True,
                                      tokenize=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)
        tokenize: If set to ``True`` (default), ``text_a`` and ``text_b`` are tokenized with the tokenizer;
            if ``False``, they are assumed to be pre-tokenized and passed through as-is

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    is_tf_dataset = False
    #if is_tf_available() and isinstance(examples, tf.data.Dataset):
    #    is_tf_dataset = True

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}
    logger.info("Label map: %s" % label_map)

    features = []
    for (ex_index, example) in tqdm(enumerate(examples), desc="Iterating"):
        try:
            if ex_index % 10000 == 0:
                logger.info("Writing example %d" % (ex_index))
            if is_tf_dataset:
                example = InputExample(example['idx'].numpy(),
                                example['sentence1'].numpy().decode('utf-8'),
                                example['sentence2'].numpy().decode('utf-8'),
                                example['label'].numpy().decode('utf-8'))

            tokens_a = tokenizer.tokenize(example.text_a) if tokenize else example.text_a
            tokens_b = tokenizer.tokenize(example.text_b) if tokenize else example.text_b
            _truncate_seq_pair(tokens_a, tokens_b, max_length-3)

            inputs = tokenizer.encode_plus(
                tokens_a,
                tokens_b,
                add_special_tokens=True,
                max_length=max_length,
                truncate_first_sequence=True  # We're truncating the first sequence in priority
            )
            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    
            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
    
            # Zero-pad up to the sequence length.
            padding_length = max_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
    
            assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
            assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
            assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
    
            if output_mode == "classification":
                label = label_map[example.label]
            elif output_mode == "regression":
                label = float(example.label)
            else:
                raise KeyError(output_mode)
    
            if ex_index < 5:
                logger.info("*** Example ***")
                logger.info("guid: %s" % (example.guid))
                logger.info("input_tokens: %s" % (tokenizer.tokenize(example.text_a+
                    example.text_b)))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
                logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
                logger.info("label: %s (id = %d)" % (example.label, label))

            features.append(
                    InputFeatures(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids,
                                  label=label,
                                  tokens_a=tokens_a,
                                  tokens_b=tokens_b))
        except ValueError:
            print("Bad input", example.text_a, example.text_b)
            continue

    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
                yield  ({'input_ids': ex.input_ids,
                         'attention_mask': ex.attention_mask,
                         'token_type_ids': ex.token_type_ids},
                        ex.label)

        return tf.data.Dataset.from_generator(gen,
            ({'input_ids': tf.int32,
              'attention_mask': tf.int32,
              'token_type_ids': tf.int32},
             tf.int64),
            ({'input_ids': tf.TensorShape([None]),
              'attention_mask': tf.TensorShape([None]),
              'token_type_ids': tf.TensorShape([None])},
             tf.TensorShape([])))

    return features
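
A minimal usage sketch for the converter above, assuming a PyTorch workflow and this file's own glue_processors registry. The task name "mrpc", the data directory, the BERT tokenizer, and the wrapping of the returned InputFeatures into a TensorDataset are illustrative assumptions, not part of the source code.

from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset

# Hypothetical task and paths; any registered GLUE task with matching data would work.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
processor = glue_processors["mrpc"]()
examples = processor.get_train_examples("data/MRPC")
features = glue_convert_examples_to_features(examples, tokenizer,
                                             max_length=128, task="mrpc")

# Stack the padded features into tensors for a standard PyTorch DataLoader.
dataset = TensorDataset(
    torch.tensor([f.input_ids for f in features], dtype=torch.long),
    torch.tensor([f.attention_mask for f in features], dtype=torch.long),
    torch.tensor([f.token_type_ids for f in features], dtype=torch.long),
    torch.tensor([f.label for f in features], dtype=torch.long))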
Example #9
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['premise'].numpy().decode('utf-8'),
                            tensor_dict['hypothesis'].numpy().decode('utf-8'),
                            str(tensor_dict['label'].numpy()))