def _convert_examples_to_features(
    self,
    examples: List[TokenClsInputExample],
    max_seq_length,
    tokenizer,
    include_labels=True,
    cls_token_at_end=False,
    pad_on_left=False,
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token=0,
    sequence_segment_id=0,
    sep_token_extra=0,
    cls_token_segment_id=1,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of `InputBatch`s.

    `cls_token_at_end` defines the location of the CLS token:
        - False (default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
    `cls_token_segment_id` defines the segment id associated with the CLS token
        (0 for BERT, 2 for XLNet).
    """
    if include_labels:
        label_map = {v: k for k, v in self.labels_id_map.items()}
        label_pad = 0

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Processing example %d of %d", ex_index, len(examples))

        tokens = []
        labels = []
        valid_tokens = []
        for i, token in enumerate(example.tokens):
            new_tokens = tokenizer.tokenize(token)
            tokens.extend(new_tokens)
            # Only the first sub-token of each word is marked valid and keeps
            # the word's label; the remaining sub-tokens get the pad label.
            v_tok = [0] * len(new_tokens)
            v_tok[0] = 1
            valid_tokens.extend(v_tok)
            if include_labels:
                v_lbl = [label_pad] * len(new_tokens)
                v_lbl[0] = label_map.get(example.label[i])
                labels.extend(v_lbl)

        # Truncate to max_seq_length, leaving room for the special tokens.
        special_tokens_count = 3 if sep_token_extra else 2
        tokens = tokens[:(max_seq_length - special_tokens_count)]
        valid_tokens = valid_tokens[:(max_seq_length - special_tokens_count)]
        if include_labels:
            labels = labels[:(max_seq_length - special_tokens_count)]

        tokens += [sep_token]
        if include_labels:
            labels += [label_pad]
        valid_tokens += [0]
        if sep_token_extra:
            # RoBERTa special case: an extra separator token.
            tokens += [sep_token]
            valid_tokens += [0]
            if include_labels:
                labels += [label_pad]
        segment_ids = [sequence_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens = tokens + [cls_token]
            segment_ids = segment_ids + [cls_token_segment_id]
            if include_labels:
                labels = labels + [label_pad]
            valid_tokens = valid_tokens + [0]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids
            if include_labels:
                labels = [label_pad] + labels
            valid_tokens = [0] + valid_tokens

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            if include_labels:
                labels = ([label_pad] * padding_length) + labels
            valid_tokens = ([0] * padding_length) + valid_tokens
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
            if include_labels:
                labels = labels + ([label_pad] * padding_length)
            valid_tokens = valid_tokens + ([0] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(valid_tokens) == max_seq_length
        if include_labels:
            assert len(labels) == max_seq_length

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_id=labels,
                valid_ids=valid_tokens,
            )
        )
    return features
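# A minimal, standalone sketch (not part of the class above) of the
# word-to-sub-token alignment the converter relies on: only the first
# sub-token of each word keeps the label and a valid flag of 1. The stub
# tokenizer below is hypothetical and only imitates WordPiece splitting; in
# the real method the pieces come from `tokenizer.tokenize()`.
def _stub_wordpiece(word):
    # Hypothetical stand-in for WordPiece: split long words into two pieces.
    return [word[:6], "##" + word[6:]] if len(word) > 6 else [word]


def align_labels_to_subtokens(words, label_ids, label_pad=0):
    tokens, labels, valid = [], [], []
    for word, label_id in zip(words, label_ids):
        pieces = _stub_wordpiece(word)
        tokens.extend(pieces)
        # First piece is "valid" and carries the word's label; the rest are padded.
        valid.extend([1] + [0] * (len(pieces) - 1))
        labels.extend([label_id] + [label_pad] * (len(pieces) - 1))
    return tokens, labels, valid


# align_labels_to_subtokens(["Transformers", "are", "useful"], [3, 1, 1])
# -> (['Transf', '##ormers', 'are', 'useful'], [3, 0, 1, 1], [1, 0, 1, 1])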
def _convert_examples_to_features(
    self,
    examples,
    max_seq_length,
    tokenizer,
    task_type,
    include_labels=True,
    cls_token_at_end=False,
    pad_on_left=False,
    cls_token='[CLS]',
    sep_token='[SEP]',
    pad_token=0,
    sequence_a_segment_id=0,
    sequence_b_segment_id=1,
    cls_token_segment_id=1,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of `InputBatch`s.

    `cls_token_at_end` defines the location of the CLS token:
        - False (default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
    `cls_token_segment_id` defines the segment id associated with the CLS token
        (0 for BERT, 2 for XLNet).
    """
    if include_labels:
        label_map = {label: i for i, label in enumerate(self.labels)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens_a = tokenizer.tokenize(example.text)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3".
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2".
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = tokens_a + [sep_token]
        segment_ids = [sequence_a_segment_id] * len(tokens)
        if tokens_b:
            tokens += tokens_b + [sep_token]
            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

        if cls_token_at_end:
            tokens = tokens + [cls_token]
            segment_ids = segment_ids + [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if include_labels:
            if task_type == "classification":
                label_id = label_map[example.label]
            elif task_type == "regression":
                label_id = float(example.label)
            else:
                raise KeyError(task_type)
        else:
            label_id = None

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_id=label_id,
            )
        )
    return features
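# `_truncate_seq_pair`, called above, is not defined in this section. A sketch
# of the conventional implementation (as in the original BERT reference code)
# is given below: one token at a time is popped from the longer of the two
# sequences until the pair fits the budget, which keeps the truncation balanced.
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to a maximum total length."""
    while len(tokens_a) + len(tokens_b) > max_length:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()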
def _convert_examples_to_features(
    self,
    examples,
    max_seq_length,
    tokenizer,
    task_type,
    include_labels=True,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of `InputBatch`s. Special tokens and segment
    ids are added by `tokenizer.encode_plus`; only padding is handled here.
    """
    if include_labels:
        label_map = {label: i for i, label in enumerate(self.labels)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        inputs = tokenizer.encode_plus(
            example.text,
            example.text_b,
            add_special_tokens=True,
            max_length=max_seq_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(attention_mask) == max_seq_length
        assert len(token_type_ids) == max_seq_length

        if include_labels:
            if task_type == "classification":
                label_id = label_map[example.label]
            elif task_type == "regression":
                label_id = float(example.label)
            else:
                raise KeyError(task_type)
        else:
            label_id = None

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=attention_mask,
                segment_ids=token_type_ids,
                label_id=label_id,
            )
        )
    return features
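# A standalone illustration (runnable without a pretrained model) of the
# padding step in the method above. The `inputs` dict is a hand-written
# stand-in for what `tokenizer.encode_plus` returns; the token ids are made
# up for illustration only.
def pad_encoded_pair(inputs, max_seq_length=10, pad_token=0, pad_token_segment_id=0):
    input_ids = list(inputs["input_ids"])
    token_type_ids = list(inputs["token_type_ids"])
    attention_mask = [1] * len(input_ids)            # 1 = real token
    padding_length = max_seq_length - len(input_ids)
    # Right-side padding, i.e. pad_on_left=False in the method above.
    input_ids += [pad_token] * padding_length
    attention_mask += [0] * padding_length           # 0 = padding, not attended to
    token_type_ids += [pad_token_segment_id] * padding_length
    return input_ids, attention_mask, token_type_ids


# Pretend encode_plus produced a 6-token encoding of a sentence pair,
# laid out as [CLS] a b [SEP] c [SEP]:
# pad_encoded_pair({"input_ids": [101, 2023, 2003, 102, 2009, 102],
#                   "token_type_ids": [0, 0, 0, 0, 1, 1]})
# -> ([101, 2023, 2003, 102, 2009, 102, 0, 0, 0, 0],
#     [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
#     [0, 0, 0, 0, 1, 1, 0, 0, 0, 0])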
def _convert_examples_to_features(
    self,
    examples: List[TokenClsInputExample],
    max_seq_length,
    tokenizer,
    include_labels=True,
    cls_token_at_end=False,
    pad_on_left=False,
    cls_token='[CLS]',
    sep_token='[SEP]',
    pad_token=0,
    sequence_segment_id=0,
    cls_token_segment_id=1,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of `InputBatch`s.

    `cls_token_at_end` defines the location of the CLS token:
        - False (default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
    `cls_token_segment_id` defines the segment id associated with the CLS token
        (0 for BERT, 2 for XLNet).
    """
    if include_labels:
        label_map = {v: k for k, v in self.labels_id_map.items()}
        label_pad = 0

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Processing example %d of %d", ex_index, len(examples))

        tokens = []
        labels = []
        valid_tokens = []
        for i, token in enumerate(example.tokens):
            new_tokens = tokenizer.tokenize(token)
            tokens.extend(new_tokens)
            # Only the first sub-token of each word is marked valid and keeps
            # the word's label; the remaining sub-tokens get the pad label.
            v_tok = [0] * len(new_tokens)
            v_tok[0] = 1
            valid_tokens.extend(v_tok)
            if include_labels:
                v_lbl = [label_pad] * len(new_tokens)
                v_lbl[0] = label_map.get(example.label[i])
                labels.extend(v_lbl)

        # Truncate to max_seq_length, leaving room for [CLS] and [SEP].
        tokens = tokens[:(max_seq_length - 2)]
        if include_labels:
            labels = labels[:(max_seq_length - 2)]
        valid_tokens = valid_tokens[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = tokens + [sep_token]
        if include_labels:
            labels = labels + [label_pad]
        valid_tokens = valid_tokens + [0]
        segment_ids = [sequence_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens = tokens + [cls_token]
            segment_ids = segment_ids + [cls_token_segment_id]
            if include_labels:
                labels = labels + [label_pad]
            valid_tokens = valid_tokens + [0]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids
            if include_labels:
                labels = [label_pad] + labels
            valid_tokens = [0] + valid_tokens

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            if include_labels:
                labels = ([label_pad] * padding_length) + labels
            valid_tokens = ([0] * padding_length) + valid_tokens
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
            if include_labels:
                labels = labels + ([label_pad] * padding_length)
            valid_tokens = valid_tokens + ([0] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(valid_tokens) == max_seq_length
        if include_labels:
            assert len(labels) == max_seq_length

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_id=labels,
                valid_ids=valid_tokens,
            )
        )
    return features
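# The converters above only build Python lists. Turning the returned features
# into model-ready tensors happens elsewhere; a common downstream pattern,
# assuming PyTorch is installed and the features were built with
# include_labels=True, looks like the sketch below (the function name is
# illustrative, not part of this codebase).
import torch
from torch.utils.data import TensorDataset


def features_to_dataset(features):
    # Stack each field across examples into one (num_examples, max_seq_length) tensor.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    all_valid_ids = torch.tensor([f.valid_ids for f in features], dtype=torch.long)
    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids, all_valid_ids)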