def __str__(self):
  s = ""
  s += "tokens: %s\n" % (" ".join(
      [tokenization.printable_text(x) for x in self.tokens]))
  s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
  s += "token_boundary: %s\n" % (" ".join(
      [str(x) for x in self.token_boundary]))
  s += "is_random_next: %s\n" % self.is_random_next
  s += "masked_lm_positions: %s\n" % (" ".join(
      [str(x) for x in self.masked_lm_positions]))
  s += "masked_lm_labels: %s\n" % (" ".join(
      [tokenization.printable_text(x) for x in self.masked_lm_labels]))
  s += "\n"
  return s
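A minimal usage sketch, assuming a TrainingInstance-like object whose constructor accepts these field names (the exact signature is an assumption, not part of the example above):

# Hypothetical construction; the real TrainingInstance constructor may differ.
instance = TrainingInstance(
    tokens=["[CLS]", "the", "dog", "[MASK]", "[SEP]"],
    segment_ids=[0, 0, 0, 0, 0],
    token_boundary=[0, 1, 1, 1, 0],
    is_random_next=False,
    masked_lm_positions=[3],
    masked_lm_labels=["barks"])
print(instance)  # __str__ renders every field as a space-separated line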
Example #2
def example_to_token_ids_segment_ids_label_ids(
    ex_index,
    example,
    max_seq_length,
    tokenizer):
  """Converts an ``InputExample`` to token ids and segment ids."""
  if ex_index < 5:
    tf.logging.info("*** Example {ex_index} ***")
    tf.logging.info("qid: %s" % (example.qid))

  question_tokens = tokenizer.tokenize(example.question)
  answers_tokens = map(tokenizer.tokenize, example.answers)

  token_ids = []
  segment_ids = []
  for choice_idx, answer_tokens in enumerate(answers_tokens):
    # Budget max_seq_length - 3 tokens for [CLS] and the two [SEP]s: each side
    # is guaranteed at least half of that budget, and may take more when the
    # other side is short.
    truncated_question_tokens = question_tokens[
      :max((max_seq_length - 3)//2, max_seq_length - (len(answer_tokens) + 3))]
    truncated_answer_tokens = answer_tokens[
      :max((max_seq_length - 3)//2, max_seq_length - (len(question_tokens) + 3))]

    choice_tokens = []
    choice_segment_ids = []
    choice_tokens.append("[CLS]")
    choice_segment_ids.append(0)
    for question_token in truncated_question_tokens:
      choice_tokens.append(question_token)
      choice_segment_ids.append(0)
    choice_tokens.append("[SEP]")
    choice_segment_ids.append(0)
    for answer_token in truncated_answer_tokens:
      choice_tokens.append(answer_token)
      choice_segment_ids.append(1)
    choice_tokens.append("[SEP]")
    choice_segment_ids.append(1)

    choice_token_ids = tokenizer.convert_tokens_to_ids(choice_tokens)

    token_ids.append(choice_token_ids)
    segment_ids.append(choice_segment_ids)

    if ex_index < 5:
      tf.logging.info("choice %s" % choice_idx)
      tf.logging.info("tokens: %s" % " ".join(
        [tokenization.printable_text(t) for t in choice_tokens]))
      tf.logging.info("token ids: %s" % " ".join(
        [str(x) for x in choice_token_ids]))
      tf.logging.info("segment ids: %s" % " ".join(
        [str(x) for x in choice_segment_ids]))

  label_ids = [example.label]

  if ex_index < 5:
    tf.logging.info("label: %s (id = %d)" % (example.label, label_ids[0]))

  return token_ids, segment_ids, label_ids
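A hedged sketch of how the per-choice lists returned above might be padded before batching; `examples`, `tokenizer`, and `max_seq_length` are assumed to come from the surrounding script and the padding step is not part of the original function:

# Hypothetical downstream padding step.
for ex_index, example in enumerate(examples):
  token_ids, segment_ids, label_ids = example_to_token_ids_segment_ids_label_ids(
      ex_index, example, max_seq_length, tokenizer)
  for choice_token_ids, choice_segment_ids in zip(token_ids, segment_ids):
    input_mask = [1] * len(choice_token_ids)
    while len(choice_token_ids) < max_seq_length:  # zero-pad each choice
      choice_token_ids.append(0)
      choice_segment_ids.append(0)
      input_mask.append(0)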
Example #3
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
    """Create TF example files from `TrainingInstance`s."""
    writers = []
    for output_file in output_files:
        writers.append(tf.python_io.TFRecordWriter(output_file))

    writer_index = 0

    total_written = 0
    for (inst_index, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        token_boundary = list(instance.token_boundary)
        assert len(input_ids) <= max_seq_length

        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            token_boundary.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(
            instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)

        # When permutation masking is enabled, twice as many prediction slots
        # are reserved per sequence.
        multiplier = 1 + int(FLAGS.do_permutation)
        while len(masked_lm_positions) < max_predictions_per_seq * multiplier:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)

        sentence_order_label = 1 if instance.is_random_next else 0

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(input_ids)
        features["input_mask"] = create_int_feature(input_mask)
        features["segment_ids"] = create_int_feature(segment_ids)
        features["token_boundary"] = create_int_feature(token_boundary)
        features["masked_lm_positions"] = create_int_feature(
            masked_lm_positions)
        features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
        features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
        # Note: We keep this feature name `next_sentence_labels` to be compatible
        # with the original data created by lanzhzh@. However, in the ALBERT case
        # it does contain sentence_order_label.
        features["next_sentence_labels"] = create_int_feature(
            [sentence_order_label])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))

        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)

        total_written += 1

        if inst_index < 20:
            tf.logging.info("*** Example ***")
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in instance.tokens]))

            for feature_name in features.keys():
                feature = features[feature_name]
                values = []
                if feature.int64_list.value:
                    values = feature.int64_list.value
                elif feature.float_list.value:
                    values = feature.float_list.value
                tf.logging.info(
                    "%s: %s" %
                    (feature_name, " ".join([str(x) for x in values])))

    for writer in writers:
        writer.close()

    tf.logging.info("Wrote %d total instances", total_written)
Example #4
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer, task_name):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_id=0,
                             is_real_example=False)

    if task_name != "sts-b":
        label_map = {}
        for (i, label) in enumerate(label_list):
            label_map[label] = i

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in ALBERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if task_name != "sts-b":
        label_id = label_map[example.label]
    else:
        label_id = example.label

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info(
            "tokens: %s" %
            " ".join([tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
        tf.logging.info("input_mask: %s" %
                        " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" %
                        " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    return feature
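The _truncate_seq_pair helper called above is not shown in this example; a sketch that matches its documented contract of truncating the pair in place, dropping tokens from the longer sequence first:

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""
  # Trim one token at a time from whichever list is currently longer, so the
  # combined length never exceeds max_length.
  while len(tokens_a) + len(tokens_b) > max_length:
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()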
Example #5
    def convert_examples_to_features(self, seq_length, tokenizer):
        """Loads a data file into a list of `InputBatch`s."""

        # features = []
        input_masks = []
        examples = self._to_example(self.input_queue.get())
        for (ex_index, example) in enumerate(examples):
            tokens_a = tokenizer.tokenize(example.text_a)

            # If the sentence is longer than seq_length - 2 ([CLS] and [SEP]),
            # keep only its leading tokens.
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids: 0     0   0   0  0     0 0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            tokens = []
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in tokens_a:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            # Where "input_ids" are tokens's index in vocabulary
            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)
            input_masks.append(input_mask)
            # Zero-pad up to the sequence length.
            while len(input_ids) < seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == seq_length
            assert len(input_mask) == seq_length
            assert len(segment_ids) == seq_length

            if ex_index < 5:
                tf.logging.info("*** Example ***")
                tf.logging.info("guid: %s" % (example.guid))
                tf.logging.info(
                    "tokens: %s" %
                    " ".join([tokenization.printable_text(x) for x in tokens]))
                tf.logging.info("input_ids: %s" %
                                " ".join([str(x) for x in input_ids]))
                tf.logging.info("input_mask: %s" %
                                " ".join([str(x) for x in input_mask]))
                tf.logging.info("segment_ids: %s" %
                                " ".join([str(x) for x in segment_ids]))

            yield InputFeatures(guid=example.guid,
                                input_ids=input_ids,
                                input_mask=input_mask,
                                segment_ids=segment_ids)
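Since the method above is a generator, features can be consumed lazily; a hedged usage sketch, with `processor` and `tokenizer` assumed from the surrounding serving code and the feed-dict layout purely illustrative:

# Hypothetical caller; attribute names mirror the InputFeatures fields above.
for feature in processor.convert_examples_to_features(seq_length=128,
                                                      tokenizer=tokenizer):
  feed = {
      "input_ids": [feature.input_ids],
      "input_mask": [feature.input_mask],
      "segment_ids": [feature.segment_ids],
  }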