def featurize(self, example, is_training):
  """Turn an InputExample into a dict of features."""
  if is_training and self.config.distill and self._distill_inputs is None:
    self._distill_inputs = utils.load_pickle(
        self.config.distill_inputs(self.name))

  # This variant expects pre-featurized examples: `text_a` already holds the
  # padded list of input ids and `mask` the corresponding input mask.
  input_ids = example.text_a
  input_mask = example.mask
  segment_ids = []
  while len(segment_ids) < self.config.max_seq_length:
    segment_ids.append(0)

  assert len(input_ids) == self.config.max_seq_length
  assert len(input_mask) == self.config.max_seq_length
  assert len(segment_ids) == self.config.max_seq_length

  eid = example.eid
  features = {
      "input_ids": input_ids,
      "input_mask": input_mask,
      "segment_ids": segment_ids,
      "task_id": self.config.task_names.index(self.name),
      self.name + "_eid": eid,
  }
  self._add_features(features, example,
                     None if self._distill_inputs is None
                     else self._distill_inputs[eid])
  return features
def featurize(self, example, is_training):
  """Turn an InputExample into a dict of features."""
  if is_training and self.config.distill and self._distill_inputs is None:
    self._distill_inputs = utils.load_pickle(
        self.config.distill_inputs(self.name))
  tokens_a = self._tokenizer.tokenize(example.text_a)
  tokens_b = None
  if example.text_b:
    tokens_b = self._tokenizer.tokenize(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"
    _truncate_seq_pair(tokens_a, tokens_b, self.config.max_seq_length - 3)
  else:
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > self.config.max_seq_length - 2:
      tokens_a = tokens_a[0:(self.config.max_seq_length - 2)]

  # The convention in BERT is:
  # (a) For sequence pairs:
  #   tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #   type_ids: 0     0  0    0    0     0        0 0     1  1  1  1   1 1
  # (b) For single sequences:
  #   tokens:   [CLS] the dog is hairy . [SEP]
  #   type_ids: 0     0   0   0  0     0 0
  #
  # Where "type_ids" are used to indicate whether this is the first
  # sequence or the second sequence. The embedding vectors for `type=0` and
  # `type=1` were learned during pre-training and are added to the wordpiece
  # embedding vector (and position vector). This is not *strictly* necessary
  # since the [SEP] token unambiguously separates the sequences, but it
  # makes it easier for the model to learn the concept of sequences.
  #
  # For classification tasks, the first vector (corresponding to [CLS]) is
  # used as the "sentence vector". Note that this only makes sense because
  # the entire model is fine-tuned.
  tokens = []
  segment_ids = []
  tokens.append("[CLS]")
  segment_ids.append(0)
  for token in tokens_a:
    tokens.append(token)
    segment_ids.append(0)
  tokens.append("[SEP]")
  segment_ids.append(0)

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

  input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [1] * len(input_ids)

  # Zero-pad up to the sequence length.
  while len(input_ids) < self.config.max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

  assert len(input_ids) == self.config.max_seq_length
  assert len(input_mask) == self.config.max_seq_length
  assert len(segment_ids) == self.config.max_seq_length

  eid = example.eid
  features = {
      "input_ids": input_ids,
      "input_mask": input_mask,
      "segment_ids": segment_ids,
      "task_id": self.config.task_names.index(self.name),
      self.name + "_eid": eid,
  }
  self._add_features(
      features, example,
      None if self._distill_inputs is None else self._distill_inputs[eid])
  return features
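# A minimal usage sketch (an assumption for illustration, not part of this
# codebase): the dict returned by `featurize` holds only ints and lists of
# ints, so it could be serialized into a `tf.train.Example` for TFRecord
# writing roughly as below. `tf` is assumed to be TensorFlow and the helper
# name is hypothetical.
#
#   def _dict_to_tf_example(features):
#     tf_features = {}
#     for name, value in features.items():
#       values = value if isinstance(value, list) else [value]
#       tf_features[name] = tf.train.Feature(
#           int64_list=tf.train.Int64List(value=values))
#     return tf.train.Example(features=tf.train.Features(feature=tf_features))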