Example #1
 def encode(self, example: InputExample, tokenizer, seq_length, args):
     if args.pretrained_bert:
         ids_list, types_list, paddings_list = [], [], []
     else:
         ids_list, positions_list, sep_list = [], [], []
     tokens_a = tokenizer.EncodeAsIds(example.text_a).tokenization
     # Use an empty list rather than None so the concatenations with answer_ids below stay valid.
     tokens_b = tokenizer.EncodeAsIds(example.text_b).tokenization if example.text_b else []
     # Encode each candidate answer and build one input sequence per candidate.
     for answer in example.meta["candidates"]:
         answer_ids = tokenizer.EncodeAsIds(answer).tokenization
         total_length = len(tokens_a) + len(tokens_b) + len(answer_ids)
         total_length += num_special_tokens_to_add(tokens_a, tokens_b + answer_ids, None, add_cls=True, add_sep=True,
                                                   add_piece=False)
         # Count sequences that will be truncated inside build_input_from_ids.
         if total_length > seq_length:
             self.num_truncated += 1
         data = build_input_from_ids(tokens_a, tokens_b + answer_ids, None, seq_length, tokenizer, args,
                                     add_cls=True, add_sep=True, add_piece=False)
         ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
         if args.pretrained_bert:
             ids_list.append(ids)
             types_list.append(types)
             paddings_list.append(paddings)
         else:
             ids_list.append(ids)
             positions_list.append(position_ids)
             sep_list.append(sep)
     # Map the gold label onto its index in the processor's label list.
     label = example.label
     label = self.get_labels().index(label)
     if args.pretrained_bert:
         sample = build_sample(ids_list, label=label, types=types_list, paddings=paddings_list,
                               unique_id=example.guid)
     else:
         sample = build_sample(ids_list, positions=positions_list, masks=sep_list, label=label,
                               unique_id=example.guid)
     return sample
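The pattern above produces one encoded sequence per candidate answer and a single label index shared by the whole sample. Below is a minimal, self-contained sketch of that multiple-choice layout; toy_encode and its whitespace "tokenizer" are hypothetical stand-ins, not the repo's build_input_from_ids/build_sample.

def toy_encode(text_a, text_b, candidates, label, labels, vocab, seq_length=16):
    # Whitespace "tokenizer": map each token to a growing integer vocabulary.
    def ids(text):
        return [vocab.setdefault(tok, len(vocab)) for tok in text.split()]
    tokens_a, tokens_b = ids(text_a), ids(text_b)
    ids_list = []
    for answer in candidates:
        seq = tokens_a + tokens_b + ids(answer)
        seq = seq[:seq_length] + [0] * max(0, seq_length - len(seq))  # truncate / pad
        ids_list.append(seq)
    return {"ids": ids_list, "label": labels.index(label)}

vocab = {}
sample = toy_encode("the cat sat", "who sat ?", ["cat", "dog"], "cat", ["cat", "dog"], vocab)
assert len(sample["ids"]) == 2 and sample["label"] == 0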
Example #2
 def encode(self, example: InputExample, tokenizer, args):
     if args.pretrained_bert:
         ids_list, types_list, paddings_list = [], [], []
     else:
         ids_list, positions_list, sep_list = [], [], []
     # COPA: append a connective matching the question type to the premise
     # ('because' for cause questions, 'so' for effect questions).
     question = example.meta['question']
     joiner = 'because' if question == 'cause' else 'so'
     text_a = example.text_a + " " + joiner
     tokens_a = tokenizer.EncodeAsIds(text_a).tokenization
     for choice in [example.meta["choice1"], example.meta["choice2"]]:
         tokens_b = tokenizer.EncodeAsIds(choice).tokenization
         num_special_tokens = num_special_tokens_to_add(tokens_a,
                                                        tokens_b,
                                                        None,
                                                        add_cls=True,
                                                        add_sep=True,
                                                        add_piece=False)
         if len(tokens_a) + len(
                 tokens_b) + num_special_tokens > args.seq_length:
             self.num_truncated += 1
         data = build_input_from_ids(tokens_a,
                                     tokens_b,
                                     None,
                                     args.seq_length,
                                     tokenizer,
                                     args,
                                     add_cls=True,
                                     add_sep=True,
                                     add_piece=False)
         ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
         if args.pretrained_bert:
             ids_list.append(ids)
             types_list.append(types)
             paddings_list.append(paddings)
         else:
             ids_list.append(ids)
             positions_list.append(position_ids)
             sep_list.append(sep)
     # Default to 0 for unlabeled examples (e.g. the test split).
     label = 0
     if example.label is not None:
         label = example.label
         label = self.get_labels().index(label)
     if args.pretrained_bert:
         sample = build_sample(ids_list,
                               label=label,
                               types=types_list,
                               paddings=paddings_list,
                               unique_id=example.guid)
     else:
         sample = build_sample(ids_list,
                               positions=positions_list,
                               masks=sep_list,
                               label=label,
                               unique_id=example.guid)
     return sample
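Example #2 targets COPA, where the premise is joined with a connective that matches the question type before being paired with each choice. A small self-contained illustration of that text construction (copa_text_pairs is a hypothetical helper, not repo code):

def copa_text_pairs(premise, question, choice1, choice2):
    # 'cause' questions ask for a reason ("... because <choice>"),
    # 'effect' questions for a consequence ("... so <choice>").
    joiner = 'because' if question == 'cause' else 'so'
    text_a = premise + " " + joiner
    return [(text_a, choice1), (text_a, choice2)]

pairs = copa_text_pairs("The man broke his toe.", "cause",
                        "He got a hole in his sock.", "He dropped a hammer on his foot.")
assert pairs[0][0] == "The man broke his toe. because"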
Example #3
 def __getitem__(self, idx):
     raw_sample = self.samples[idx]
     ids, types, paddings = build_tokens_types_paddings_from_text(
         raw_sample['text_a'], raw_sample['text_b'], self.tokenizer,
         self.max_seq_length)
     sample = build_sample(ids, types, paddings, raw_sample['label'],
                           raw_sample['uid'])
     return sample
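A __getitem__ like the one above is normally consumed through a PyTorch DataLoader, which batches the per-sample fields. A hypothetical sketch, assuming torch is available and that samples are dicts of equal-length id lists (this is not the repo's actual training loop):

import torch
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    def __init__(self, samples):
        self.samples = samples
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        # Convert each field of the raw sample into a tensor.
        return {k: torch.tensor(v) for k, v in self.samples[idx].items()}

data = [{'text': [1, 2, 3, 0], 'label': 0},
        {'text': [4, 5, 0, 0], 'label': 1}]
loader = DataLoader(ToyDataset(data), batch_size=2)
for batch in loader:
    print(batch['text'].shape, batch['label'].shape)  # torch.Size([2, 4]) torch.Size([2])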
Example #4
 def encode(self, example: InputExample, tokenizer, args):
     text_a, text_b = self.get_classifier_input(example, tokenizer)
     tokens_a = tokenizer.EncodeAsIds(text_a).tokenization
     tokens_b = tokenizer.EncodeAsIds(text_b).tokenization
     num_special_tokens = num_special_tokens_to_add(tokens_a,
                                                    tokens_b,
                                                    None,
                                                    add_cls=True,
                                                    add_sep=True,
                                                    add_piece=False)
     if len(tokens_a) + len(
             tokens_b) + num_special_tokens > args.seq_length:
         self.num_truncated += 1
     data = build_input_from_ids(tokens_a,
                                 tokens_b,
                                 None,
                                 args.seq_length,
                                 tokenizer,
                                 args=args,
                                 add_cls=True,
                                 add_sep=True,
                                 add_piece=False)
     ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
     label = 0
     if example.label is not None:
         label = example.label
         label = self.get_labels().index(label)
     if args.pretrained_bert:
         sample = build_sample(ids,
                               label=label,
                               types=types,
                               paddings=paddings,
                               unique_id=example.guid)
     else:
         sample = build_sample(ids,
                               positions=position_ids,
                               masks=sep,
                               label=label,
                               unique_id=example.guid)
     return sample
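Before encoding, Example #4 checks whether the sentence pair plus its special tokens still fits in args.seq_length and, if not, bumps the truncation counter. A rough standalone sketch of that length check, assuming a BERT-style "[CLS] a [SEP] b [SEP]" layout with three special tokens (the real count comes from num_special_tokens_to_add and depends on the add_cls/add_sep/add_piece flags):

def fits(tokens_a, tokens_b, seq_length, num_special_tokens=3):
    # True when the pair plus its special tokens fits without truncation.
    return len(tokens_a) + len(tokens_b) + num_special_tokens <= seq_length

assert fits(list(range(5)), list(range(4)), 12)
assert not fits(list(range(10)), list(range(10)), 16)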
Example #5
def process_single_datapath(datapath, tokenizer, max_qa_length, max_seq_length):
    """Read in RACE files, combine, clean-up, tokenize, and convert to
    samples."""

    print_rank_0('   > working on {}'.format(datapath))
    start_time = time.time()

    # Get list of files.
    filenames = glob.glob(os.path.join(datapath, '*.txt'))

    samples = []
    num_docs = 0
    num_questions = 0
    num_samples = 0
    # Load all the files
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                data = json.loads(line)
                num_docs += 1

                context = data["article"]
                questions = data["questions"]
                choices = data["options"]
                answers = data["answers"]
                # Check the length.
                assert len(questions) == len(answers)
                assert len(questions) == len(choices)

                # Context: clean up and convert to ids.
                context = clean_text(context)
                context_ids = tokenizer.tokenize(context)

                # Loop over questions.
                for qi, question in enumerate(questions):
                    num_questions += 1
                    # Label.
                    label = ord(answers[qi]) - ord("A")
                    assert label >= 0
                    assert label < NUM_CHOICES
                    assert len(choices[qi]) == NUM_CHOICES

                    # For each question, build num-choices samples.
                    ids_list = []
                    types_list = []
                    paddings_list = []
                    for ci in range(NUM_CHOICES):
                        choice = choices[qi][ci]
                        # Merge with choice.
                        if "_" in question:
                            qa = question.replace("_", choice)
                        else:
                            qa = " ".join([question, choice])
                        # Clean QA.
                        qa = clean_text(qa)
                        # Tokenize.
                        qa_ids = tokenizer.tokenize(qa)
                        # Trim if needed.
                        if len(qa_ids) > max_qa_length:
                            qa_ids = qa_ids[0:max_qa_length]

                        # Build the sample.
                        ids, types, paddings \
                            = build_tokens_types_paddings_from_ids(
                                qa_ids, context_ids, max_seq_length,
                                tokenizer.cls, tokenizer.sep, tokenizer.pad)

                        ids_list.append(ids)
                        types_list.append(types)
                        paddings_list.append(paddings)

                    # Convert to numpy and add to samples
                    samples.append(build_sample(ids_list, types_list,
                                                paddings_list, label,
                                                num_samples))
                    num_samples += 1

    elapsed_time = time.time() - start_time
    print_rank_0('    > processed {} documents, {} questions, and {} samples'
                 ' in {:.2f} seconds'.format(num_docs, num_questions,
                                             num_samples, elapsed_time))

    return samples
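Two small details of the RACE preprocessing above are easy to miss: answers arrive as letters and are mapped to 0-based labels, and each choice is either substituted into the "_" blank of a cloze-style question or appended to an ordinary question. A self-contained sketch (these helper names are illustrative, not part of the repo):

NUM_CHOICES = 4

def race_label(answer_letter):
    # "A" -> 0, "B" -> 1, ... with a sanity check against NUM_CHOICES.
    label = ord(answer_letter) - ord("A")
    assert 0 <= label < NUM_CHOICES
    return label

def merge_question_choice(question, choice):
    # Cloze questions contain a "_" blank; otherwise append the choice.
    return question.replace("_", choice) if "_" in question else " ".join([question, choice])

assert race_label("C") == 2
assert merge_question_choice("The author thinks _ .", "exercise helps") == "The author thinks exercise helps ."
assert merge_question_choice("What is the main idea?", "Health matters") == "What is the main idea? Health matters"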
Example #6
    def encode(self,
               example: InputExample,
               priming: bool = False,
               labeled: bool = False):
        """
        Encode an input example using this pattern-verbalizer pair.

        :param example: the input example to encode
        :param priming: whether to use this example for priming
        :param labeled: if ``priming=True``, whether the label should be appended to this example
        :return: an encoded sample (built via ``build_sample``), or the raw list of input ids
            when ``priming=True``
        """

        if not priming:
            assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true"

        tokenizer = self.tokenizer
        raw_parts_a, raw_parts_b = self.get_parts(example)

        raw_parts_a = [
            x if isinstance(x, tuple) else (x, False) for x in raw_parts_a
        ]
        prompt_id = tokenizer.num_tokens

        def encode_input(raw_parts):
            # Tokenize string parts and expand integer parts into prompt-token
            # placeholders; flags mark which positions are prompt slots.
            parts, flags = [], []
            for x, s in raw_parts:
                if isinstance(x, str):
                    x = tokenizer.EncodeAsIds(x)
                    flag = [0] * len(x)
                elif isinstance(x, int):
                    flag = [1] * x
                    x = [prompt_id] * x
                else:
                    flag = [0] * len(x)
                parts.append((x, s))
                flags.append((flag, x))
            return parts, flags

        parts_a, flags_a = encode_input(raw_parts_a)
        parts_b, flags_b = None, None
        if raw_parts_b:
            raw_parts_b = [
                x if isinstance(x, tuple) else (x, False) for x in raw_parts_b
            ]
            parts_b, flags_b = encode_input(raw_parts_b)

        # Multi-token verbalizers: build one full input sequence per candidate answer.
        if self.is_multi_token:
            answers = self.get_answers(example)

            if not self.fast_decode:
                ids_list, positions_list, sep_list, mask_list, target_list, prompt_list = [], [], [], [], [], []
                for idx, answer in enumerate(answers):
                    this_parts_a, this_parts_b = copy.deepcopy(
                        parts_a), copy.deepcopy(parts_b)
                    answer_ids = get_verbalization_ids(
                        answer, tokenizer, force_single_token=False)
                    answer_ids = answer_ids + [tokenizer.get_command('eop').Id]
                    self.num_truncated += self.truncate(
                        this_parts_a,
                        this_parts_b,
                        answer_ids,
                        max_length=self.max_seq_length)
                    tokens_a = [
                        token_id for part, _ in this_parts_a
                        for token_id in part
                    ]
                    tokens_b = [
                        token_id for part, _ in this_parts_b
                        for token_id in part
                    ] if parts_b else None
                    data = build_input_from_ids(tokens_a,
                                                tokens_b,
                                                answer_ids,
                                                self.max_seq_length,
                                                self.tokenizer,
                                                args=self.args,
                                                add_cls=True,
                                                add_sep=False,
                                                add_piece=True,
                                                mask_id=self.mask_id)
                    ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
                    prompt_pos = [
                        idx for idx, token in enumerate(ids)
                        if token == prompt_id
                    ]
                    ids = [token if token != prompt_id else 0 for token in ids]
                    prompt_list.append(prompt_pos)
                    ids_list.append(ids)
                    positions_list.append(position_ids)
                    sep_list.append(sep)
                    target_list.append(target_ids)
                    mask_list.append(loss_masks)
                if example.label is not None:
                    label = self.label_list.index(example.label)
                else:
                    label = 0
                sample = build_sample(ids_list,
                                      positions=positions_list,
                                      masks=sep_list,
                                      label=label,
                                      logit_mask=mask_list,
                                      target=target_list,
                                      unique_id=example.guid,
                                      prompt_ids=prompt_list)
                return sample

            else:
                # Fast decoding: encode the context once here and build a separate
                # decoder input per candidate answer further below.
                this_parts_a, this_parts_b = copy.deepcopy(
                    parts_a), copy.deepcopy(parts_b)
                self.num_truncated += self.truncate(
                    this_parts_a,
                    this_parts_b,
                    None,
                    max_length=self.max_seq_length)
                tokens_a = [
                    token_id for part, _ in this_parts_a for token_id in part
                ]
                tokens_b = [
                    token_id for part, _ in this_parts_b for token_id in part
                ] if parts_b else None
                data = build_input_from_ids(tokens_a,
                                            tokens_b,
                                            None,
                                            self.max_seq_length,
                                            self.tokenizer,
                                            args=self.args,
                                            add_cls=True,
                                            add_sep=False,
                                            add_piece=False)
                ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
                if example.label is not None:
                    label = self.label_list.index(example.label)
                else:
                    label = 0
                sample = build_sample(ids,
                                      positions=position_ids,
                                      masks=sep,
                                      label=label,
                                      unique_id=example.guid)

                ids_list, positions_list, mask_list, target_list, logit_mask_list = [], [], [], [], []
                for answer in answers:
                    answer_ids = get_verbalization_ids(
                        answer, tokenizer, force_single_token=False)
                    answer_ids = answer_ids + [tokenizer.get_command('eop').Id]
                    answer_ids = answer_ids[:self.max_dec_seq_length]
                    data = build_decoder_input(ids, answer_ids,
                                               self.max_seq_length,
                                               self.max_dec_seq_length,
                                               tokenizer)
                    dec_ids, _, _, dec_position_ids, _, dec_target_ids, dec_loss_masks = data
                    ids_list.append(dec_ids)
                    positions_list.append(dec_position_ids)
                    mask_list.append(sep)
                    target_list.append(dec_target_ids)
                    logit_mask_list.append(dec_loss_masks)

                sample = build_decoder_sample(sample, ids_list, positions_list,
                                              mask_list, target_list,
                                              logit_mask_list)
                return sample

        else:
            # Single-token verbalizers: one input sequence with a mask position
            # scored against the verbalizer ids.
            self.num_truncated += self.truncate(parts_a,
                                                parts_b, [],
                                                max_length=self.max_seq_length)

            tokens_a = [token_id for part, _ in parts_a for token_id in part]
            tokens_b = [token_id for part, _ in parts_b
                        for token_id in part] if parts_b else None
            if priming:
                input_ids = tokens_a
                if tokens_b:
                    input_ids += tokens_b
                if labeled:
                    mask_idx = input_ids.index(self.mask_id)
                    assert mask_idx >= 0, 'sequence of input_ids must contain a mask token'
                    assert len(
                        self.verbalize(example.label)
                    ) == 1, 'priming only supports one verbalization per label'
                    verbalizer = self.verbalize(example.label)[0]
                    verbalizer_id = get_verbalization_ids(
                        verbalizer, self.tokenizer, force_single_token=True)
                    input_ids[mask_idx] = verbalizer_id
                return input_ids
            data = build_input_from_ids(tokens_a,
                                        tokens_b,
                                        None,
                                        self.max_seq_length,
                                        self.tokenizer,
                                        args=self.args,
                                        add_cls=True,
                                        add_sep=False,
                                        add_piece=True)
            ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
            prompt_pos = [
                idx for idx, token in enumerate(ids) if token == prompt_id
            ]
            ids = [token if token != prompt_id else 0 for token in ids]
            target_ids = self.get_verbalizer_ids()
            if example.label is not None:
                label = self.label_list.index(example.label)
            else:
                label = 0
            sample = build_sample(ids=ids,
                                  positions=position_ids,
                                  target=target_ids,
                                  masks=sep,
                                  logit_mask=loss_masks,
                                  label=label,
                                  unique_id=example.guid,
                                  prompt_ids=prompt_pos)
            return sample
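A recurring detail in Example #6 is the bookkeeping for continuous prompt tokens: prompt slots are first filled with a sentinel id one past the vocabulary (prompt_id = tokenizer.num_tokens), their positions are recorded, and the sentinel is then replaced by 0 so the embedding lookup stays in range, presumably so that learned prompt embeddings can be injected at those positions later in the model. A toy illustration of just that step:

num_tokens = 100          # assumed vocabulary size
prompt_id = num_tokens    # sentinel id marking prompt slots
ids = [5, prompt_id, prompt_id, 17, 42, prompt_id]

# Record where the prompt slots sit, then zero out the out-of-vocabulary sentinel.
prompt_pos = [i for i, token in enumerate(ids) if token == prompt_id]
ids = [token if token != prompt_id else 0 for token in ids]

assert prompt_pos == [1, 2, 5]
assert ids == [5, 0, 0, 17, 42, 0]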