Example #1
 def __getitem__(self, idx):
     # Locate the document containing flat sample index `idx` via the
     # cumulative window counts, then convert to a window index within it.
     document_idx = bisect_right(self.weights, idx)
     idx = idx - self.left_weights[document_idx]
     # Consecutive evaluation windows advance by `overalapping_eval` tokens.
     start_idx = idx * self.overalapping_eval
     end_idx = start_idx + self.max_seq_len
     tokens = self.documents[document_idx][start_idx:end_idx]
     if self.block_lm:
         if idx == 0 or self.unidirectional:
             prompt, text = tokens[:1], tokens[1:]
         else:
             prompt_length = self.max_seq_len - self.overalapping_eval
             prompt, text = tokens[:prompt_length], tokens[prompt_length:]
         prompt = prompt + [self.mask_id]
         num_special_tokens = num_special_tokens_to_add(prompt, None, text, add_cls=True, add_sep=False,
                                                        add_piece=True,
                                                        add_eos=False)
         data = build_input_from_ids(prompt, None, text, self.max_seq_len + num_special_tokens + 1, self.tokenizer,
                                     args=self.args, add_cls=True, add_sep=False, add_piece=True, add_eos=False, mask_id=self.mask_id)
         ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
         if idx != 0 and self.unidirectional:
             # Windows after the first re-read overlapping context; score
             # only the new tokens so each token is counted exactly once.
             loss_masks = np.array(loss_masks, dtype=np.int64)
             loss_masks[:-self.overalapping_eval] = 0
         return {'text': np.array(ids, dtype=np.int64), 'target': np.array(target_ids, dtype=np.int64),
                 'attention_mask': np.array(sep, dtype=np.int64), 'loss_mask': np.array(loss_masks, dtype=np.int64),
                 "position_id": np.array(position_ids, dtype=np.int64)}
     else:
         loss_masks = [1] * len(tokens)
         if len(tokens) < self.max_seq_len:
             tokens = tokens + [0] * (self.max_seq_len - len(tokens))
             loss_masks = loss_masks + [0] * (self.max_seq_len - len(loss_masks))
         if idx != 0:
             # Same overlap rule: only the last `overalapping_eval` tokens are new.
             loss_masks = np.array(loss_masks, dtype=np.int64)
             loss_masks[:-self.overalapping_eval] = 0
         return {'text': np.array(tokens, dtype=np.int64), 'loss_mask': np.array(loss_masks, dtype=np.int64)}
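This __getitem__ implements sliding-window language-model evaluation: self.weights holds cumulative window counts per document, so bisect_right maps a flat sample index to a document, and the remainder selects the window inside it. A minimal self-contained sketch of that indexing, with toy documents and hypothetical names (the window-count formula is one plausible convention, not necessarily the one used upstream):

    from bisect import bisect_right
    from itertools import accumulate

    documents = [list(range(100, 110)), list(range(200, 207))]  # toy token ids
    max_seq_len, overlapping_eval = 4, 2

    # Overlapping windows per document, advancing by `overlapping_eval` tokens.
    num_windows = [max(1, (len(d) - max_seq_len) // overlapping_eval + 1)
                   for d in documents]
    weights = list(accumulate(num_windows))   # cumulative counts, as in self.weights
    left_weights = [0] + weights[:-1]         # windows before each document

    def get_window(idx):
        document_idx = bisect_right(weights, idx)
        local = idx - left_weights[document_idx]
        start = local * overlapping_eval
        return documents[document_idx][start:start + max_seq_len]

    for i in range(weights[-1]):
        print(i, get_window(i))

Zeroing loss_masks[:-overalapping_eval] for every window after a document's first is what keeps each token's loss counted exactly once despite the overlap.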
Example #2
 def encode(self, example: InputExample, tokenizer, seq_length, args):
     if args.pretrained_bert:
         ids_list, types_list, paddings_list = [], [], []
     else:
         ids_list, positions_list, sep_list = [], [], []
     tokens_a = tokenizer.EncodeAsIds(example.text_a).tokenization
     # text_b is required here: each candidate's ids are appended to it below,
     # so a None would fail at the concatenation.
     tokens_b = tokenizer.EncodeAsIds(example.text_b).tokenization if example.text_b else None
     for answer in example.meta["candidates"]:
         answer_ids = tokenizer.EncodeAsIds(answer).tokenization
         total_length = len(tokens_a) + len(tokens_b) + len(answer_ids)
         total_length += num_special_tokens_to_add(tokens_a, tokens_b + answer_ids, None,
                                                   add_cls=True, add_sep=True, add_piece=False)
         if total_length > seq_length:
             # build_input_from_ids performs the actual truncation; just count it.
             self.num_truncated += 1
         data = build_input_from_ids(tokens_a, tokens_b + answer_ids, None, seq_length, tokenizer, args,
                                     add_cls=True, add_sep=True, add_piece=False)
         ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
         if args.pretrained_bert:
             ids_list.append(ids)
             types_list.append(types)
             paddings_list.append(paddings)
         else:
             ids_list.append(ids)
             positions_list.append(position_ids)
             sep_list.append(sep)
     label = example.label
     label = self.get_labels().index(label)
     if args.pretrained_bert:
         sample = build_sample(ids_list, label=label, types=types_list, paddings=paddings_list,
                               unique_id=example.guid)
     else:
         sample = build_sample(ids_list, positions=positions_list, masks=sep_list, label=label,
                               unique_id=example.guid)
     return sample
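The pattern here is multiple-choice encoding: the shared context (tokens_a/tokens_b) is re-encoded once per candidate answer, and the per-candidate rows are stacked into one sample so the model can score every candidate in a single forward pass. A self-contained sketch of that stacking, with a hypothetical helper standing in for build_input_from_ids (plain concatenate/truncate/pad):

    import numpy as np

    def encode_candidates(context_ids, candidates, seq_length, pad_id=0):
        # One fixed-length row (and attention mask) per candidate answer.
        rows, masks = [], []
        for cand in candidates:
            ids = (context_ids + cand)[:seq_length]
            mask = [1] * len(ids) + [0] * (seq_length - len(ids))
            ids = ids + [pad_id] * (seq_length - len(ids))
            rows.append(ids)
            masks.append(mask)
        return np.array(rows, dtype=np.int64), np.array(masks, dtype=np.int64)

    ids, mask = encode_candidates([5, 6, 7], [[8], [9, 10, 11]], seq_length=6)
    print(ids.shape)  # (2, 6): the candidates share the leading context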
Example #3
    def truncate(self, parts_a: List[Tuple[List[int], bool]],
                 parts_b: List[Tuple[List[int], bool]], answer: List[int],
                 max_length: int):
        """Truncate two sequences of text to a predefined total maximum length"""
        total_len = self._seq_length(parts_a) + self._seq_length(parts_b)
        if answer:
            total_len += len(answer)
        total_len += num_special_tokens_to_add(parts_a,
                                               parts_b,
                                               answer,
                                               add_cls=True,
                                               add_sep=False,
                                               add_piece=True)
        num_tokens_to_remove = total_len - max_length

        if num_tokens_to_remove <= 0:
            return False

        for _ in range(num_tokens_to_remove):
            # Greedily drop one token at a time from whichever side
            # currently has more shortenable tokens.
            len_a = self._seq_length(parts_a, only_shortenable=True)
            len_b = self._seq_length(parts_b, only_shortenable=True)
            if len_a > len_b:
                self._remove_last(parts_a)
            else:
                self._remove_last(parts_b)
        return True
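truncate removes tokens one at a time, always from whichever of the two part lists currently has more shortenable tokens, so fixed parts (pattern text) survive while the longer flexible side shrinks first. A standalone sketch of the same greedy rule on plain (token_ids, shortenable) pairs, omitting the special-token accounting of the method above:

    def seq_length(parts, only_shortenable=False):
        return sum(len(ids) for ids, short in parts if short or not only_shortenable)

    def remove_last(parts):
        # Drop one token from the last non-empty shortenable part.
        for i in range(len(parts) - 1, -1, -1):
            ids, short = parts[i]
            if short and ids:
                parts[i] = (ids[:-1], short)
                return

    def truncate(parts_a, parts_b, max_length):
        excess = seq_length(parts_a) + seq_length(parts_b) - max_length
        for _ in range(max(0, excess)):
            if seq_length(parts_a, True) > seq_length(parts_b, True):
                remove_last(parts_a)
            else:
                remove_last(parts_b)

    a = [([1, 2, 3, 4], True)]
    b = [([5, 6], False), ([7, 8, 9], True)]
    truncate(a, b, max_length=6)
    print(a, b)  # the fixed (5, 6) part is untouched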
Example #4
 def __getitem__(self, idx):
     tokens, answer = self.tokens[idx], self.labels[idx]
     if self.block_lm:
         if self.unidirectional:
             tokens, answer_tokens = tokens[:1], tokens[1:] + answer
         else:
             answer_tokens = answer
         tokens = tokens + [self.mask_id]
         num_special_tokens = num_special_tokens_to_add(tokens,
                                                        None,
                                                        answer_tokens,
                                                        add_cls=True,
                                                        add_sep=False,
                                                        add_piece=True)
         left_shift = (len(tokens) + len(answer_tokens)
                       + num_special_tokens - self.max_seq_length)
         if left_shift > 0:
             # Keep the rightmost context so the answer always fits.
             tokens = tokens[left_shift:]
         data = build_input_from_ids(tokens,
                                     None,
                                     answer_tokens,
                                     self.max_seq_length,
                                     self.tokenizer,
                                     args=self.args,
                                     add_cls=True,
                                     add_sep=False,
                                     add_piece=True)
         ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
         if self.unidirectional:
             # Walk back past the padding, then score only the final
             # len(answer) positions, i.e. the answer tokens themselves.
             loss_masks = np.array(loss_masks, dtype=np.int64)
             last_index = len(loss_masks)
             while loss_masks[last_index - 1] == 0:
                 last_index -= 1
             loss_masks[:last_index - len(answer)] = 0
         return {
             'text': np.array(ids, dtype=np.int64),
             'target': np.array(target_ids, dtype=np.int64),
             'attention_mask': np.array(sep, dtype=np.int64),
             'loss_mask': np.array(loss_masks, dtype=np.int64),
             "position_id": np.array(position_ids, dtype=np.int64)
         }
     else:
         # Reserve room for the answer so the packed ids never exceed max_seq_length.
         left_shift = len(tokens) + len(answer) - self.max_seq_length
         if left_shift > 0:
             tokens = tokens[left_shift:]
         ids = tokens + answer
         if len(ids) < self.max_seq_length:
             ids = ids + [0] * (self.max_seq_length - len(ids))
         loss_masks = [0] * len(tokens) + [1] * len(answer)
         if len(loss_masks) < self.max_seq_length:
             loss_masks = loss_masks + [0] * (self.max_seq_length - len(loss_masks))
         return {
             'text': np.array(ids, dtype=np.int64),
             'loss_mask': np.array(loss_masks, dtype=np.int64)
         }
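The else branch is plain causal-LM packing: keep the rightmost context tokens, append the answer, pad to a fixed length, and mask the loss so only answer positions are scored. A minimal sketch of that packing as a hypothetical standalone helper:

    import numpy as np

    def pack_example(prompt_ids, answer_ids, max_len, pad_id=0):
        overflow = len(prompt_ids) + len(answer_ids) - max_len
        if overflow > 0:
            prompt_ids = prompt_ids[overflow:]  # drop the oldest context first
        ids = prompt_ids + answer_ids
        loss_mask = [0] * len(prompt_ids) + [1] * len(answer_ids)
        pad = max_len - len(ids)
        return (np.array(ids + [pad_id] * pad, dtype=np.int64),
                np.array(loss_mask + [0] * pad, dtype=np.int64))

    ids, mask = pack_example([1, 2, 3, 4, 5], [9, 9], max_len=6)
    print(ids)   # [2 3 4 5 9 9]
    print(mask)  # [0 0 0 0 1 1]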
Example #5
 def encode(self, example: InputExample, tokenizer, args):
     if args.pretrained_bert:
         ids_list, types_list, paddings_list = [], [], []
     else:
         ids_list, positions_list, sep_list = [], [], []
     question = example.meta['question']
     # COPA framing: a 'cause' question reads "<premise> because <choice>",
     # an 'effect' question reads "<premise> so <choice>".
     joiner = 'because' if question == 'cause' else 'so'
     text_a = example.text_a + " " + joiner
     tokens_a = tokenizer.EncodeAsIds(text_a).tokenization
     for choice in [example.meta["choice1"], example.meta["choice2"]]:
         tokens_b = tokenizer.EncodeAsIds(choice).tokenization
         num_special_tokens = num_special_tokens_to_add(tokens_a,
                                                        tokens_b,
                                                        None,
                                                        add_cls=True,
                                                        add_sep=True,
                                                        add_piece=False)
         if len(tokens_a) + len(tokens_b) + num_special_tokens > args.seq_length:
             self.num_truncated += 1
         data = build_input_from_ids(tokens_a,
                                     tokens_b,
                                     None,
                                     args.seq_length,
                                     tokenizer,
                                     args,
                                     add_cls=True,
                                     add_sep=True,
                                     add_piece=False)
         ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
         if args.pretrained_bert:
             ids_list.append(ids)
             types_list.append(types)
             paddings_list.append(paddings)
         else:
             ids_list.append(ids)
             positions_list.append(position_ids)
             sep_list.append(sep)
     label = 0
     if example.label is not None:
         label = example.label
         label = self.get_labels().index(label)
     if args.pretrained_bert:
         sample = build_sample(ids_list,
                               label=label,
                               types=types_list,
                               paddings=paddings_list,
                               unique_id=example.guid)
     else:
         sample = build_sample(ids_list,
                               positions=positions_list,
                               masks=sep_list,
                               label=label,
                               unique_id=example.guid)
     return sample
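This encoder hard-codes the COPA task framing: a 'cause' question asks which choice explains the premise, an 'effect' question asks which choice follows from it. A tiny sketch of just that framing step, with hypothetical names:

    def copa_pairs(premise, choice1, choice2, question):
        joiner = 'because' if question == 'cause' else 'so'
        text_a = premise + ' ' + joiner
        return [(text_a, choice1), (text_a, choice2)]

    for a, b in copa_pairs('The man broke his toe.',
                           'He got a hole in his sock.',
                           'He dropped a hammer on his foot.',
                           'cause'):
        print(a, '|', b)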
Example #6
 def encode(self, example: InputExample, tokenizer, args):
     text_a, text_b = self.get_classifier_input(example, tokenizer)
     tokens_a = tokenizer.EncodeAsIds(text_a).tokenization
     tokens_b = tokenizer.EncodeAsIds(text_b).tokenization
     num_special_tokens = num_special_tokens_to_add(tokens_a,
                                                    tokens_b,
                                                    None,
                                                    add_cls=True,
                                                    add_sep=True,
                                                    add_piece=False)
     if len(tokens_a) + len(tokens_b) + num_special_tokens > args.seq_length:
         self.num_truncated += 1
     data = build_input_from_ids(tokens_a,
                                 tokens_b,
                                 None,
                                 args.seq_length,
                                 tokenizer,
                                 args=args,
                                 add_cls=True,
                                 add_sep=True,
                                 add_piece=False)
     ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
     label = 0
     if example.label is not None:
         label = example.label
         label = self.get_labels().index(label)
     if args.pretrained_bert:
         sample = build_sample(ids,
                               label=label,
                               types=types,
                               paddings=paddings,
                               unique_id=example.guid)
     else:
         sample = build_sample(ids,
                               positions=position_ids,
                               masks=sep,
                               label=label,
                               unique_id=example.guid)
     return sample
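The truncation check before build_input_from_ids is the standard budget arithmetic for a single sentence pair: the pair fits only if both token lists plus the task-specific special tokens fit in seq_length. A hedged sketch of that check (the constant 3 assumes a BERT-style [CLS] a [SEP] b [SEP] layout; the real count comes from num_special_tokens_to_add):

    def fits(tokens_a, tokens_b, seq_length, num_special=3):
        # [CLS] tokens_a [SEP] tokens_b [SEP] -> 3 extra positions
        return len(tokens_a) + len(tokens_b) + num_special <= seq_length

    print(fits(list(range(60)), list(range(66)), seq_length=128))  # False: 129 > 128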