def encode(self, example: InputExample, tokenizer, seq_length, args):
    if args.pretrained_bert:
        ids_list, types_list, paddings_list = [], [], []
    else:
        ids_list, positions_list, sep_list = [], [], []
    tokens_a = tokenizer.EncodeAsIds(example.text_a).tokenization
    tokens_b = tokenizer.EncodeAsIds(example.text_b).tokenization if example.text_b else None
    # Build one encoded input per candidate answer; each candidate is appended to tokens_b.
    for answer in example.meta["candidates"]:
        answer_ids = tokenizer.EncodeAsIds(answer).tokenization
        total_length = len(tokens_a) + len(tokens_b) + len(answer_ids)
        total_length += num_special_tokens_to_add(tokens_a, tokens_b + answer_ids, None, add_cls=True,
                                                  add_sep=True, add_piece=False)
        if total_length > seq_length:
            self.num_truncated += 1
        data = build_input_from_ids(tokens_a, tokens_b + answer_ids, None, seq_length, tokenizer, args,
                                    add_cls=True, add_sep=True, add_piece=False)
        ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
        if args.pretrained_bert:
            ids_list.append(ids)
            types_list.append(types)
            paddings_list.append(paddings)
        else:
            ids_list.append(ids)
            positions_list.append(position_ids)
            sep_list.append(sep)
    label = example.label
    label = self.get_labels().index(label)
    if args.pretrained_bert:
        sample = build_sample(ids_list, label=label, types=types_list, paddings=paddings_list,
                              unique_id=example.guid)
    else:
        sample = build_sample(ids_list, positions=positions_list, masks=sep_list, label=label,
                              unique_id=example.guid)
    return sample
def encode(self, example: InputExample, tokenizer, args):
    if args.pretrained_bert:
        ids_list, types_list, paddings_list = [], [], []
    else:
        ids_list, positions_list, sep_list = [], [], []
    # COPA-style: join the premise with "because" (cause questions) or "so" (effect questions),
    # then encode one input per choice.
    question = example.meta['question']
    joiner = 'because' if question == 'cause' else 'so'
    text_a = example.text_a + " " + joiner
    tokens_a = tokenizer.EncodeAsIds(text_a).tokenization
    for choice in [example.meta["choice1"], example.meta["choice2"]]:
        tokens_b = tokenizer.EncodeAsIds(choice).tokenization
        num_special_tokens = num_special_tokens_to_add(tokens_a, tokens_b, None, add_cls=True, add_sep=True,
                                                       add_piece=False)
        if len(tokens_a) + len(tokens_b) + num_special_tokens > args.seq_length:
            self.num_truncated += 1
        data = build_input_from_ids(tokens_a, tokens_b, None, args.seq_length, tokenizer, args,
                                    add_cls=True, add_sep=True, add_piece=False)
        ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
        if args.pretrained_bert:
            ids_list.append(ids)
            types_list.append(types)
            paddings_list.append(paddings)
        else:
            ids_list.append(ids)
            positions_list.append(position_ids)
            sep_list.append(sep)
    label = 0
    if example.label is not None:
        label = example.label
        label = self.get_labels().index(label)
    if args.pretrained_bert:
        sample = build_sample(ids_list, label=label, types=types_list, paddings=paddings_list,
                              unique_id=example.guid)
    else:
        sample = build_sample(ids_list, positions=positions_list, masks=sep_list, label=label,
                              unique_id=example.guid)
    return sample
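# Worked example (illustration only, not part of the repo): how the COPA joiner above pairs a
# premise with each choice. The sentences are a standard COPA example; all names in this demo
# are local to the sketch and do not correspond to repo APIs.
def _copa_joiner_demo():
    premise = "The man broke his toe."
    question = "cause"  # COPA asks for either the cause or the effect of the premise
    choices = ["He got a hole in his sock.", "He dropped a hammer on his foot."]
    joiner = "because" if question == "cause" else "so"
    # Mirrors encode(): text_a = premise + " " + joiner, and each choice becomes tokens_b.
    return [(premise + " " + joiner, choice) for choice in choices]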
def __getitem__(self, idx):
    raw_sample = self.samples[idx]
    ids, types, paddings = build_tokens_types_paddings_from_text(
        raw_sample['text_a'], raw_sample['text_b'],
        self.tokenizer, self.max_seq_length)
    sample = build_sample(ids, types, paddings,
                          raw_sample['label'], raw_sample['uid'])
    return sample
def encode(self, example: InputExample, tokenizer, args):
    text_a, text_b = self.get_classifier_input(example, tokenizer)
    tokens_a = tokenizer.EncodeAsIds(text_a).tokenization
    tokens_b = tokenizer.EncodeAsIds(text_b).tokenization
    num_special_tokens = num_special_tokens_to_add(tokens_a, tokens_b, None, add_cls=True, add_sep=True,
                                                   add_piece=False)
    if len(tokens_a) + len(tokens_b) + num_special_tokens > args.seq_length:
        self.num_truncated += 1
    data = build_input_from_ids(tokens_a, tokens_b, None, args.seq_length, tokenizer, args=args,
                                add_cls=True, add_sep=True, add_piece=False)
    ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
    label = 0
    if example.label is not None:
        label = example.label
        label = self.get_labels().index(label)
    if args.pretrained_bert:
        sample = build_sample(ids, label=label, types=types, paddings=paddings, unique_id=example.guid)
    else:
        sample = build_sample(ids, positions=position_ids, masks=sep, label=label, unique_id=example.guid)
    return sample
def process_single_datapath(datapath, tokenizer, max_qa_length, max_seq_length):
    """Read in RACE files, combine, clean-up, tokenize, and convert to samples."""

    print_rank_0('   > working on {}'.format(datapath))
    start_time = time.time()

    # Get list of files.
    filenames = glob.glob(os.path.join(datapath, '*.txt'))

    samples = []
    num_docs = 0
    num_questions = 0
    num_samples = 0
    # Load all the files.
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                data = json.loads(line)
                num_docs += 1

                context = data["article"]
                questions = data["questions"]
                choices = data["options"]
                answers = data["answers"]
                # Check the length.
                assert len(questions) == len(answers)
                assert len(questions) == len(choices)

                # Context: clean up and convert to ids.
                context = clean_text(context)
                context_ids = tokenizer.tokenize(context)

                # Loop over questions.
                for qi, question in enumerate(questions):
                    num_questions += 1
                    # Label.
                    label = ord(answers[qi]) - ord("A")
                    assert label >= 0
                    assert label < NUM_CHOICES
                    assert len(choices[qi]) == NUM_CHOICES

                    # For each question, build num-choices samples.
                    ids_list = []
                    types_list = []
                    paddings_list = []
                    for ci in range(NUM_CHOICES):
                        choice = choices[qi][ci]
                        # Merge the question with the choice, filling the blank if present.
                        if "_" in question:
                            qa = question.replace("_", choice)
                        else:
                            qa = " ".join([question, choice])
                        # Clean QA.
                        qa = clean_text(qa)
                        # Tokenize.
                        qa_ids = tokenizer.tokenize(qa)
                        # Trim if needed.
                        if len(qa_ids) > max_qa_length:
                            qa_ids = qa_ids[0:max_qa_length]

                        # Build the sample.
                        ids, types, paddings = build_tokens_types_paddings_from_ids(
                            qa_ids, context_ids, max_seq_length,
                            tokenizer.cls, tokenizer.sep, tokenizer.pad)

                        ids_list.append(ids)
                        types_list.append(types)
                        paddings_list.append(paddings)

                    # Convert to numpy and add to samples.
                    samples.append(build_sample(ids_list, types_list,
                                                paddings_list, label,
                                                num_samples))
                    num_samples += 1

    elapsed_time = time.time() - start_time
    print_rank_0('    > processed {} documents, {} questions, and {} samples'
                 ' in {:.2f} seconds'.format(num_docs, num_questions,
                                             num_samples, elapsed_time))
    return samples
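# Sketch (assumption, not the actual Megatron helper): build_tokens_types_paddings_from_ids is
# used above as a BERT-style packer. A minimal version of that packing could look like the
# function below; cls_id, sep_id, and pad_id stand in for the tokenizer.cls, tokenizer.sep,
# and tokenizer.pad values passed in process_single_datapath.
def _pack_pair_sketch(qa_ids, context_ids, max_seq_length, cls_id, sep_id, pad_id):
    # [CLS] qa [SEP] context [SEP], truncating the context so the pair fits.
    max_context = max(0, max_seq_length - len(qa_ids) - 3)
    context_ids = context_ids[:max_context]
    ids = [cls_id] + qa_ids + [sep_id] + context_ids + [sep_id]
    types = [0] * (len(qa_ids) + 2) + [1] * (len(context_ids) + 1)
    paddings = [1] * len(ids)
    # Pad everything out to max_seq_length.
    pad_len = max_seq_length - len(ids)
    ids += [pad_id] * pad_len
    types += [0] * pad_len
    paddings += [0] * pad_len
    return ids, types, paddings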
def encode(self, example: InputExample, priming: bool = False, labeled: bool = False):
    """
    Encode an input example using this pattern-verbalizer pair.

    :param example: the input example to encode
    :param priming: whether to use this example for priming
    :param labeled: if ``priming=True``, whether the label should be appended to this example
    :return: the encoded sample, or the list of input ids if ``priming=True``
    """
    if not priming:
        assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true"

    tokenizer = self.tokenizer
    raw_parts_a, raw_parts_b = self.get_parts(example)

    raw_parts_a = [x if isinstance(x, tuple) else (x, False) for x in raw_parts_a]
    prompt_id = tokenizer.num_tokens

    def encode_input(raw_parts):
        parts, flags = [], []
        for x, s in raw_parts:
            if isinstance(x, str):
                x = tokenizer.EncodeAsIds(x)
                flag = [0] * len(x)
            elif isinstance(x, int):
                # An int marks a block of continuous prompt tokens of that length.
                flag = [1] * x
                x = [prompt_id] * x
            else:
                flag = [0] * len(x)
            parts.append((x, s))
            flags.append((flag, x))
        return parts, flags

    parts_a, flags_a = encode_input(raw_parts_a)
    parts_b, flags_b = None, None
    if raw_parts_b:
        raw_parts_b = [x if isinstance(x, tuple) else (x, False) for x in raw_parts_b]
        parts_b, flags_b = encode_input(raw_parts_b)

    if self.is_multi_token:
        answers = self.get_answers(example)

        if not self.fast_decode:
            # Encode one full input per candidate answer.
            ids_list, positions_list, sep_list, mask_list, target_list, prompt_list = [], [], [], [], [], []
            for idx, answer in enumerate(answers):
                this_parts_a, this_parts_b = copy.deepcopy(parts_a), copy.deepcopy(parts_b)
                answer_ids = get_verbalization_ids(answer, tokenizer, force_single_token=False)
                answer_ids = answer_ids + [tokenizer.get_command('eop').Id]
                self.num_truncated += self.truncate(this_parts_a, this_parts_b, answer_ids,
                                                    max_length=self.max_seq_length)
                tokens_a = [token_id for part, _ in this_parts_a for token_id in part]
                tokens_b = [token_id for part, _ in this_parts_b for token_id in part] if parts_b else None
                data = build_input_from_ids(tokens_a, tokens_b, answer_ids, self.max_seq_length, self.tokenizer,
                                            args=self.args, add_cls=True, add_sep=False, add_piece=True,
                                            mask_id=self.mask_id)
                ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
                prompt_pos = [pos for pos, token in enumerate(ids) if token == prompt_id]
                ids = [token if token != prompt_id else 0 for token in ids]
                prompt_list.append(prompt_pos)
                ids_list.append(ids)
                positions_list.append(position_ids)
                sep_list.append(sep)
                target_list.append(target_ids)
                mask_list.append(loss_masks)
            if example.label is not None:
                label = self.label_list.index(example.label)
            else:
                label = 0
            sample = build_sample(ids_list, positions=positions_list, masks=sep_list, label=label,
                                  logit_mask=mask_list, target=target_list,
                                  unique_id=example.guid, prompt_ids=prompt_list)
            return sample
        else:
            # Fast decode: encode the context once, then build one decoder input per answer.
            this_parts_a, this_parts_b = copy.deepcopy(parts_a), copy.deepcopy(parts_b)
            self.num_truncated += self.truncate(this_parts_a, this_parts_b, None,
                                                max_length=self.max_seq_length)
            tokens_a = [token_id for part, _ in this_parts_a for token_id in part]
            tokens_b = [token_id for part, _ in this_parts_b for token_id in part] if parts_b else None
            data = build_input_from_ids(tokens_a, tokens_b, None, self.max_seq_length, self.tokenizer,
                                        args=self.args, add_cls=True, add_sep=False, add_piece=False)
            ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
            if example.label is not None:
                label = self.label_list.index(example.label)
            else:
                label = 0
            sample = build_sample(ids, positions=position_ids, masks=sep, label=label, unique_id=example.guid)

            ids_list, positions_list, mask_list, target_list, logit_mask_list = [], [], [], [], []
            for answer in answers:
                answer_ids = get_verbalization_ids(answer, tokenizer, force_single_token=False)
                answer_ids = answer_ids + [tokenizer.get_command('eop').Id]
                answer_ids = answer_ids[:self.max_dec_seq_length]
                data = build_decoder_input(ids, answer_ids, self.max_seq_length, self.max_dec_seq_length,
                                           tokenizer)
                dec_ids, _, _, dec_position_ids, _, dec_target_ids, dec_loss_masks = data
                ids_list.append(dec_ids)
                positions_list.append(dec_position_ids)
                mask_list.append(sep)
                target_list.append(dec_target_ids)
                logit_mask_list.append(dec_loss_masks)
            sample = build_decoder_sample(sample, ids_list, positions_list, mask_list, target_list,
                                          logit_mask_list)
            return sample

    else:
        # Single-token verbalizers: one input with a mask position scored against the verbalizer ids.
        self.num_truncated += self.truncate(parts_a, parts_b, [], max_length=self.max_seq_length)

        tokens_a = [token_id for part, _ in parts_a for token_id in part]
        tokens_b = [token_id for part, _ in parts_b for token_id in part] if parts_b else None
        if priming:
            input_ids = tokens_a
            if tokens_b:
                input_ids += tokens_b
            if labeled:
                mask_idx = input_ids.index(self.mask_id)
                assert mask_idx == 1, 'sequence of input_ids must contain a mask token'
                assert len(self.verbalize(example.label)) == 1, \
                    'priming only supports one verbalization per label'
                verbalizer = self.verbalize(example.label)[0]
                verbalizer_id = get_verbalization_ids(verbalizer, self.tokenizer, force_single_token=True)
                input_ids[mask_idx] = verbalizer_id
            return input_ids
        data = build_input_from_ids(tokens_a, tokens_b, None, self.max_seq_length, self.tokenizer,
                                    args=self.args, add_cls=True, add_sep=False, add_piece=True)
        ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
        prompt_pos = [pos for pos, token in enumerate(ids) if token == prompt_id]
        ids = [token if token != prompt_id else 0 for token in ids]
        target_ids = self.get_verbalizer_ids()
        if example.label is not None:
            label = self.label_list.index(example.label)
        else:
            label = 0
        sample = build_sample(ids=ids, positions=position_ids, target=target_ids, masks=sep,
                              logit_mask=loss_masks, label=label, unique_id=example.guid,
                              prompt_ids=prompt_pos)
        return sample
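# Sketch (illustration only): the prompt-placeholder handling used in encode() above. Continuous
# prompt slots are encoded with a sentinel id (prompt_id = tokenizer.num_tokens, one past the
# vocabulary); their positions are collected into prompt_pos / prompt_ids and the sentinel is then
# replaced with 0 so every id stays inside the embedding range. The numbers below are made up
# purely for the demonstration.
def _prompt_placeholder_demo():
    prompt_id = 50000                      # stands in for tokenizer.num_tokens
    ids = [101, 50000, 50000, 7592, 102]   # two prompt slots among real token ids
    prompt_pos = [pos for pos, token in enumerate(ids) if token == prompt_id]  # -> [1, 2]
    ids = [token if token != prompt_id else 0 for token in ids]                # -> [101, 0, 0, 7592, 102]
    return ids, prompt_pos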