示例#1
0
  def featurize(self, example, is_training):
    """Turn an InputExample into a dict of features."""

    if is_training and self.config.distill and self._distill_inputs is None:
      self._distill_inputs = utils.load_pickle(
          self.config.distill_inputs(self.name))

    input_ids = example.text_a
    input_mask = example.mask
    segment_ids = []
    
    while len(segment_ids) < self.config.max_seq_length:
      segment_ids.append(0)

    assert len(input_ids) == self.config.max_seq_length
    assert len(input_mask) == self.config.max_seq_length
    assert len(segment_ids) == self.config.max_seq_length

    eid = example.eid
    features = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "task_id": self.config.task_names.index(self.name),
        self.name + "_eid": eid,
    }
    self._add_features(features, example,
                       None if self._distill_inputs is None else
                       self._distill_inputs[eid])
    return features
    def featurize(self, example, is_training):
        """Turn an InputExample into a dict of features."""

        if is_training and self.config.distill and self._distill_inputs is None:
            self._distill_inputs = utils.load_pickle(
                self.config.distill_inputs(self.name))

        tokens_a = self._tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = self._tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b,
                               self.config.max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self.config.max_seq_length - 2:
                tokens_a = tokens_a[0:(self.config.max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it
        # makes it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.config.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.config.max_seq_length
        assert len(input_mask) == self.config.max_seq_length
        assert len(segment_ids) == self.config.max_seq_length

        eid = example.eid
        features = {
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids,
            "task_id": self.config.task_names.index(self.name),
            self.name + "_eid": eid,
        }
        self._add_features(
            features, example, None
            if self._distill_inputs is None else self._distill_inputs[eid])
        return features