from bisect import bisect_right
from typing import List, Tuple

import numpy as np

# Project helpers such as InputExample, num_special_tokens_to_add, build_input_from_ids and
# build_sample are assumed to be imported from the surrounding codebase's data utilities.


def __getitem__(self, idx):
    # Map the flat sample index to a (document, window) pair via the cumulative weights.
    document_idx = bisect_right(self.weights, idx)
    idx = idx - self.left_weights[document_idx]
    start_idx = idx * self.overalapping_eval
    end_idx = start_idx + self.max_seq_len
    tokens = self.documents[document_idx][start_idx:end_idx]
    if self.block_lm:
        if idx == 0 or self.unidirectional:
            prompt, text = tokens[:1], tokens[1:]
        else:
            prompt_length = self.max_seq_len - self.overalapping_eval
            prompt, text = tokens[:prompt_length], tokens[prompt_length:]
        prompt = prompt + [self.mask_id]
        num_special_tokens = num_special_tokens_to_add(prompt, None, text, add_cls=True, add_sep=False,
                                                       add_piece=True, add_eos=False)
        data = build_input_from_ids(prompt, None, text, self.max_seq_len + num_special_tokens + 1,
                                    self.tokenizer, args=self.args, add_cls=True, add_sep=False,
                                    add_piece=True, add_eos=False, mask_id=self.mask_id)
        ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
        if idx != 0 and self.unidirectional:
            # Only score the non-overlapping suffix; the rest was evaluated by the previous window.
            loss_masks = np.array(loss_masks, dtype=np.int64)
            loss_masks[:-self.overalapping_eval] = 0
        return {'text': np.array(ids, dtype=np.int64), 'target': np.array(target_ids, dtype=np.int64),
                'attention_mask': np.array(sep, dtype=np.int64), 'loss_mask': np.array(loss_masks, dtype=np.int64),
                'position_id': np.array(position_ids, dtype=np.int64)}
    else:
        loss_masks = [1] * len(tokens)
        if len(tokens) < self.max_seq_len:
            tokens = tokens + [0] * (self.max_seq_len - len(tokens))
            loss_masks = loss_masks + [0] * (self.max_seq_len - len(loss_masks))
        if idx != 0:
            loss_masks = np.array(loss_masks, dtype=np.int64)
            loss_masks[:-self.overalapping_eval] = 0
        return {'text': np.array(tokens, dtype=np.int64), 'loss_mask': np.array(loss_masks, dtype=np.int64)}
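# A minimal sketch, assuming the enclosing dataset indexes overlapping evaluation windows in the
# usual way: `weights` holds cumulative window counts per document and `left_weights` their
# exclusive prefix sums, which is what lets `bisect_right(self.weights, idx)` above recover the
# document and window offset. `build_eval_index` and its locals are illustrative names only, not
# part of this codebase.
import math
from itertools import accumulate


def build_eval_index(documents, max_seq_len, overlapping_eval):
    # At least one window per document; otherwise one additional window per `overlapping_eval` stride.
    num_sequences = [
        max(math.ceil(max(len(doc) - (max_seq_len - 1), 0) / overlapping_eval) + 1, 1)
        for doc in documents
    ]
    weights = list(accumulate(num_sequences))
    left_weights = [0] + weights[:-1]
    return weights, left_weights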
def encode(self, example: InputExample, tokenizer, seq_length, args):
    if args.pretrained_bert:
        ids_list, types_list, paddings_list = [], [], []
    else:
        ids_list, positions_list, sep_list = [], [], []
    tokens_a = tokenizer.EncodeAsIds(example.text_a).tokenization
    tokens_b = tokenizer.EncodeAsIds(example.text_b).tokenization if example.text_b else None
    for answer in example.meta["candidates"]:
        answer_ids = tokenizer.EncodeAsIds(answer).tokenization
        total_length = len(tokens_a) + len(tokens_b) + len(answer_ids)
        total_length += num_special_tokens_to_add(tokens_a, tokens_b + answer_ids, None, add_cls=True,
                                                  add_sep=True, add_piece=False)
        if total_length > seq_length:
            self.num_truncated += 1
        data = build_input_from_ids(tokens_a, tokens_b + answer_ids, None, seq_length, tokenizer, args,
                                    add_cls=True, add_sep=True, add_piece=False)
        ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
        if args.pretrained_bert:
            ids_list.append(ids)
            types_list.append(types)
            paddings_list.append(paddings)
        else:
            ids_list.append(ids)
            positions_list.append(position_ids)
            sep_list.append(sep)
    label = example.label
    label = self.get_labels().index(label)
    if args.pretrained_bert:
        sample = build_sample(ids_list, label=label, types=types_list, paddings=paddings_list,
                              unique_id=example.guid)
    else:
        sample = build_sample(ids_list, positions=positions_list, masks=sep_list, label=label,
                              unique_id=example.guid)
    return sample
def truncate(self, parts_a: List[Tuple[List[int], bool]], parts_b: List[Tuple[List[int], bool]],
             answer: List[int], max_length: int):
    """Truncate two sequences of text to a predefined total maximum length."""
    total_len = self._seq_length(parts_a) + self._seq_length(parts_b)
    if answer:
        total_len += len(answer)
    total_len += num_special_tokens_to_add(parts_a, parts_b, answer, add_cls=True, add_sep=False, add_piece=True)
    num_tokens_to_remove = total_len - max_length
    if num_tokens_to_remove <= 0:
        return False
    # Remove one token at a time from whichever side currently has the longer shortenable span.
    for _ in range(num_tokens_to_remove):
        if self._seq_length(parts_a, only_shortenable=True) > self._seq_length(parts_b, only_shortenable=True):
            self._remove_last(parts_a)
        else:
            self._remove_last(parts_b)
    return True
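# A self-contained toy run of the balancing rule in `truncate` above. The `seq_length` and
# `remove_last` helpers below are illustrative stand-ins for what the class's `_seq_length` and
# `_remove_last` are assumed to do over (token_ids, shortenable) pairs; the token values are
# made up for the example.
def seq_length(parts, only_shortenable=False):
    return sum(len(ids) for ids, shortenable in parts if shortenable or not only_shortenable)


def remove_last(parts):
    # Trim one token from the last non-empty shortenable part.
    last = max(i for i, (ids, shortenable) in enumerate(parts) if shortenable and ids)
    ids, shortenable = parts[last]
    parts[last] = (ids[:-1], shortenable)


parts_a = [([1, 2, 3, 4, 5, 6], True)]            # a long shortenable passage
parts_b = [([7, 8], False), ([9, 10, 11], True)]  # a fixed question plus a shortenable tail
for _ in range(3):
    if seq_length(parts_a, only_shortenable=True) > seq_length(parts_b, only_shortenable=True):
        remove_last(parts_a)
    else:
        remove_last(parts_b)
# All three removals hit parts_a, whose shortenable span (6 tokens) stays longer than parts_b's (3).
assert seq_length(parts_a) == 3 and seq_length(parts_b) == 5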
def __getitem__(self, idx):
    tokens, answer = self.tokens[idx], self.labels[idx]
    if self.block_lm:
        if self.unidirectional:
            tokens, answer_tokens = tokens[:1], tokens[1:] + answer
        else:
            answer_tokens = answer
        tokens = tokens + [self.mask_id]
        num_special_tokens = num_special_tokens_to_add(tokens, None, answer_tokens, add_cls=True, add_sep=False,
                                                       add_piece=True)
        # Drop context tokens from the left if prompt + answer would exceed the maximum length.
        left_shift = len(tokens) + len(answer_tokens) + num_special_tokens - self.max_seq_length
        if left_shift > 0:
            tokens = tokens[left_shift:]
        data = build_input_from_ids(tokens, None, answer_tokens, self.max_seq_length, self.tokenizer,
                                    args=self.args, add_cls=True, add_sep=False, add_piece=True)
        ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
        if self.unidirectional:
            # Compute loss only on the answer tokens: zero out every target position before them.
            loss_masks = np.array(loss_masks, dtype=np.int64)
            last_index = len(loss_masks)
            while loss_masks[last_index - 1] == 0:
                last_index -= 1
            loss_masks[:last_index - len(answer)] = 0
        return {'text': np.array(ids, dtype=np.int64),
                'target': np.array(target_ids, dtype=np.int64),
                'attention_mask': np.array(sep, dtype=np.int64),
                'loss_mask': np.array(loss_masks, dtype=np.int64),
                'position_id': np.array(position_ids, dtype=np.int64)}
    else:
        left_shift = len(tokens) - self.max_seq_length
        if left_shift > 0:
            tokens = tokens[left_shift:]
        ids = tokens + answer
        if len(ids) < self.max_seq_length:
            ids = ids + [0] * (self.max_seq_length - len(ids))
        loss_masks = [0] * len(tokens) + [1] * len(answer)
        if len(loss_masks) < self.max_seq_length:
            loss_masks = loss_masks + [0] * (self.max_seq_length - len(loss_masks))
        return {'text': np.array(ids, dtype=np.int64),
                'loss_mask': np.array(loss_masks, dtype=np.int64)}
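# A toy walk-through of the unidirectional loss-mask trimming above, assuming `loss_masks`
# marks every real target position with 1 (padding with 0) and the answer occupies the final
# `len(answer)` real positions. The values are made up for the example.
import numpy as np

answer = [42, 43]                                              # two answer tokens
loss_masks = np.array([1, 1, 1, 1, 1, 0, 0], dtype=np.int64)   # five real targets, two padding
last_index = len(loss_masks)
while loss_masks[last_index - 1] == 0:
    last_index -= 1                                            # stops at 5, the end of the real targets
loss_masks[:last_index - len(answer)] = 0                      # keep loss only on the answer positions
assert loss_masks.tolist() == [0, 0, 0, 1, 1, 0, 0]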
def encode(self, example: InputExample, tokenizer, args):
    if args.pretrained_bert:
        ids_list, types_list, paddings_list = [], [], []
    else:
        ids_list, positions_list, sep_list = [], [], []
    question = example.meta['question']
    # COPA-style connective: "cause" questions are joined with "because", "effect" questions with "so".
    joiner = 'because' if question == 'cause' else 'so'
    text_a = example.text_a + " " + joiner
    tokens_a = tokenizer.EncodeAsIds(text_a).tokenization
    for choice in [example.meta["choice1"], example.meta["choice2"]]:
        tokens_b = tokenizer.EncodeAsIds(choice).tokenization
        num_special_tokens = num_special_tokens_to_add(tokens_a, tokens_b, None, add_cls=True, add_sep=True,
                                                       add_piece=False)
        if len(tokens_a) + len(tokens_b) + num_special_tokens > args.seq_length:
            self.num_truncated += 1
        data = build_input_from_ids(tokens_a, tokens_b, None, args.seq_length, tokenizer, args,
                                    add_cls=True, add_sep=True, add_piece=False)
        ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
        if args.pretrained_bert:
            ids_list.append(ids)
            types_list.append(types)
            paddings_list.append(paddings)
        else:
            ids_list.append(ids)
            positions_list.append(position_ids)
            sep_list.append(sep)
    label = 0
    if example.label is not None:
        label = example.label
        label = self.get_labels().index(label)
    if args.pretrained_bert:
        sample = build_sample(ids_list, label=label, types=types_list, paddings=paddings_list,
                              unique_id=example.guid)
    else:
        sample = build_sample(ids_list, positions=positions_list, masks=sep_list, label=label,
                              unique_id=example.guid)
    return sample
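# A string-level illustration of the COPA input built by `encode` above, using the familiar COPA
# example: "cause" questions append "because" to the premise, "effect" questions append "so",
# and each choice becomes the second segment of its own candidate sequence. The texts are sample
# data, not read from this codebase.
meta = {'question': 'cause', 'choice1': 'The sun was rising.', 'choice2': 'The grass was cut.'}
premise = 'My body cast a shadow over the grass.'
joiner = 'because' if meta['question'] == 'cause' else 'so'
text_a = premise + " " + joiner
candidate_pairs = [(text_a, meta['choice1']), (text_a, meta['choice2'])]
# -> ('My body cast a shadow over the grass. because', 'The sun was rising.'), then the same
#    premise paired with 'The grass was cut.'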
def encode(self, example: InputExample, tokenizer, args):
    text_a, text_b = self.get_classifier_input(example, tokenizer)
    tokens_a = tokenizer.EncodeAsIds(text_a).tokenization
    tokens_b = tokenizer.EncodeAsIds(text_b).tokenization
    num_special_tokens = num_special_tokens_to_add(tokens_a, tokens_b, None, add_cls=True, add_sep=True,
                                                   add_piece=False)
    if len(tokens_a) + len(tokens_b) + num_special_tokens > args.seq_length:
        self.num_truncated += 1
    data = build_input_from_ids(tokens_a, tokens_b, None, args.seq_length, tokenizer, args=args,
                                add_cls=True, add_sep=True, add_piece=False)
    ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
    label = 0
    if example.label is not None:
        label = example.label
        label = self.get_labels().index(label)
    if args.pretrained_bert:
        sample = build_sample(ids, label=label, types=types, paddings=paddings, unique_id=example.guid)
    else:
        sample = build_sample(ids, positions=position_ids, masks=sep, label=label, unique_id=example.guid)
    return sample