def clean_filter_sample_gpt_eval(self, sample):
    """
    Does tokenization for final model evaluation. This should return
    input_ids as the context and labels as the true answer.
    """
    if sample is None:
        return None

    if self.mode_answer == 'eval_peeking':
        return self.clean_filter_sample_peeking_gpt_eval(sample)
    elif self.mode_answer == 'eval_nopack_padding':
        return self.clean_filter_sample_nopackpadding_gpt_eval(sample)

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = _clean_numbers(answer)

    answer_final = last_boxed_only_string(answer)
    assert not answer.isspace()

    question_ids = torch.LongTensor(
        self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
    sep_ids = torch.LongTensor(
        self.tokenizer.encode("\nFULL SOLUTION:\n", verbose=False))
    answer_final_ids = torch.LongTensor(
        self.tokenizer.encode(answer_final, verbose=False))  # Loss only counted on these tokens.

    input_ids = torch.cat([
        question_ids,
        sep_ids,
    ], dim=0)
    label_ids = torch.cat([answer_final_ids.clone()], dim=0)

    # Stop early if this Q,A pair is too long
    if input_ids.shape[0] + label_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
        return None

    return {
        'input_ids_list': input_ids.tolist(),
        'label_ids_list': label_ids.tolist()
    }
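# --- Illustrative sketch (not part of this repo) ---
# At eval time the returned dict keeps the prompt and the gold answer separate:
# 'input_ids_list' holds "\nQUESTION:\n..." + "\nFULL SOLUTION:\n" as context, and
# 'label_ids_list' holds only the tokenized last \boxed{...} answer. A caller would
# typically generate from the context and compare against the labels, roughly like:
#
#   prompt = torch.LongTensor(out['input_ids_list']).unsqueeze(0)
#   generated = model.generate(prompt, max_new_tokens=64)   # `model` is hypothetical here
#   gold = tokenizer.decode(out['label_ids_list'])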
def clean_filter_sample_gpt(self, sample):
    """
    Does the actual tokenization. Should be parallelized because it can be a bit slow.
    """
    if sample is None:
        return None

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = _clean_numbers(answer)

    if self.mode_answer == 'default':
        question_ids = torch.LongTensor(
            self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))

        sep_ids = self.tokenizer.encode("\nFULL SOLUTION:\n", verbose=False)
        sep_ids.append(self.tokenizer.eos_token_id)
        sep_ids = torch.LongTensor(sep_ids)

        answer_ids = self.tokenizer.encode(answer, verbose=False)
        answer_ids.append(self.tokenizer.eos_token_id)
        answer_ids = torch.LongTensor(answer_ids)

        # Use full solution
        input_ids = torch.cat([question_ids, sep_ids, answer_ids], dim=0)
        label_ids = torch.cat([
            torch.ones_like(question_ids) * -100,
            torch.ones_like(sep_ids) * -100,
            answer_ids.clone()
        ], dim=0)
    else:
        raise NotImplementedError()

    # Stop early if this Q,A pair is too long
    if question_ids.shape[0] + sep_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"{self.__class__.__name__} Skipping due to input_ids being too big. question_ids.shape[0] + sep_ids.shape[0] = {question_ids.shape[0] + sep_ids.shape[0]}.")
        return None

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
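# Illustrative sketch (not from this repo): why non-answer label positions are set
# to -100 above. PyTorch's cross-entropy skips targets equal to ignore_index, which
# defaults to -100, so question/separator tokens add nothing to the training loss.
#
#   import torch
#   import torch.nn.functional as F
#
#   logits = torch.randn(6, 50257)                        # fake per-token logits (GPT-2 vocab size)
#   labels = torch.tensor([-100, -100, -100, 42, 7, 50256])
#   loss = F.cross_entropy(logits, labels)                # only the last three positions count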
def clean_filter_sample_nopackpadding_gpt_eval(self, sample):
    if sample is None:
        return None

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = _clean_numbers(answer)

    answer_final = last_boxed_only_string(answer)

    question_ids = torch.LongTensor(
        self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
    sep_ids = torch.LongTensor(
        self.tokenizer.encode("\nFINAL ANSWER:\n", verbose=False))
    final_answer_ids = torch.LongTensor(
        self.tokenizer.encode(answer_final, verbose=False))

    num_to_pad = 32
    # 220 is the token for space in the case of GPT2 models
    padding_tensor = torch.ones(num_to_pad, dtype=torch.long) * 220

    input_ids = torch.cat([
        question_ids,
        padding_tensor,
        sep_ids,
    ], dim=0)

    # Only final_answer_ids contribute to the loss
    label_ids = torch.cat([final_answer_ids.clone()], dim=0)

    # Stop early if this Q,A pair is too long
    if input_ids.shape[0] + label_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
        return None

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
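# Illustrative sketch (assumes `self.tokenizer` wraps the Hugging Face GPT-2 BPE
# tokenizer; not code from this repo). Quick check that id 220 really decodes to a
# single space, which is why it is used here as padding the model has already seen:
#
#   from transformers import GPT2Tokenizer
#   tok = GPT2Tokenizer.from_pretrained("gpt2")
#   print(repr(tok.decode([220])))   # "' '"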
def clean_filter_sample_nopackpadding_gpt(self, sample):
    if sample is None:
        return None

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = _clean_numbers(answer)

    answer_final = last_boxed_only_string(answer)

    question_ids = torch.LongTensor(
        self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
    sep_ids = torch.LongTensor(
        self.tokenizer.encode("\nFINAL ANSWER:\n", verbose=False))
    final_answer_ids = torch.LongTensor(
        self.tokenizer.encode(answer_final, verbose=False))

    num_to_pad = 32
    # 220 is the token for space in the case of GPT2 models
    padding_tensor = torch.ones(num_to_pad, dtype=torch.long) * 220

    input_ids = torch.cat(
        [question_ids, padding_tensor, sep_ids, final_answer_ids], dim=0)

    # Only final_answer_ids contribute to the loss
    label_ids = torch.cat([
        torch.ones_like(question_ids) * -100,
        torch.ones_like(padding_tensor) * -100,
        torch.ones_like(sep_ids) * -100,
        final_answer_ids.clone()
    ], dim=0)

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
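# Illustrative sketch (not from this repo): why the padding tensor above is built
# with dtype=torch.long. torch.ones() defaults to float32, and float ids would
# either fail to concatenate with the LongTensors or leak floats into input_ids_list.
#
#   import torch
#   float_pad = torch.ones(3) * 220                      # float32: [220.0, 220.0, 220.0]
#   long_pad = torch.full((3,), 220, dtype=torch.long)   # int64:   [220, 220, 220]
#   assert long_pad.tolist() == [220, 220, 220]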
def clean_filter_sample_t5(self, sample):
    """
    Does the actual tokenization. Should be parallelized because it can be a bit slow.
    """
    if sample is None:
        return None

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = _clean_numbers(answer)

    if self.mode_answer == 'default':
        question_ids = torch.LongTensor(
            self.tokenizer.encode("\nQUESTION:\n" + question + "\nFINAL ANSWER:\n",
                                  verbose=False))
        answer_ids = torch.LongTensor(
            self.tokenizer.encode(answer, verbose=False))

        input_ids = torch.cat([
            question_ids,
        ], dim=0)
        label_ids = torch.cat([answer_ids], dim=0)
    else:
        raise NotImplementedError()

    # Stop early if this Q,A pair is too long
    if input_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"{self.__class__.__name__} Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
        return None

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
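# Illustrative sketch (hypothetical usage, not code from this repo): unlike the GPT
# variants, the T5 path is a plain encoder-decoder pair, so the question (with its
# trailing "\nFINAL ANSWER:\n" marker) becomes the encoder input and the answer ids
# are the decoder targets, with no -100 masking of a prompt prefix needed, roughly:
#
#   from transformers import T5ForConditionalGeneration, T5Tokenizer
#   tok = T5Tokenizer.from_pretrained("t5-small")
#   model = T5ForConditionalGeneration.from_pretrained("t5-small")
#   enc = tok("\nQUESTION:\n2+2?\nFINAL ANSWER:\n", return_tensors="pt")
#   labels = tok("4", return_tensors="pt").input_ids
#   loss = model(input_ids=enc.input_ids, labels=labels).loss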
def clean_filter_sample_t5(self, sample):
    """
    Does the actual tokenization. Should be parallelized because it can be a bit slow.
    """
    if sample is None:
        return None

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = list(map(_clean_numbers, answer))

    if self.mode_answer == 'mixed_hints':
        answer_full = "".join(answer)
        answer_final = answer[-1]

        if random.random() < 0.5:
            # Use full solution
            question_ids = torch.LongTensor(
                self.tokenizer.encode("\nQUESTION:\n" + question + "\nFULL SOLUTION:\n",
                                      verbose=False))
            answer_ids = torch.LongTensor(
                self.tokenizer.encode(answer_full, verbose=False))
        else:
            # Use only final answer
            question_ids = torch.LongTensor(
                self.tokenizer.encode("\nQUESTION:\n" + question + "\nFINAL ANSWER:\n",
                                      verbose=False))
            answer_ids = torch.LongTensor(
                self.tokenizer.encode(answer_final, verbose=False))
    else:
        raise NotImplementedError()

    input_ids = torch.cat([
        question_ids,
    ], dim=0)
    label_ids = torch.cat([
        answer_ids
    ], dim=0)

    # Stop early if this Q,A pair is too long
    if question_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"{self.__class__.__name__} Skipping due to input_ids being too big. question_ids.shape[0] = {question_ids.shape[0]}.")
        return None

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
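# Illustrative sketch (hypothetical sample, not from this repo): in 'mixed_hints'
# the answer arrives as a list of solution steps, and each example is tokenized
# either against the concatenated steps or against the final step alone:
#
#   steps = ["First, 2 + 2 = 4. ", "So the answer is $\\boxed{4}$."]
#   "".join(steps)   # full-solution target for the "\nFULL SOLUTION:\n" prompt
#   steps[-1]        # final-answer target for the "\nFINAL ANSWER:\n" prompt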
def clean_filter_sample_gpt(self, sample):
    """
    Does the actual tokenization. Should be parallelized because it can be a bit slow.
    """
    if sample is None:
        return None

    if self.mode_answer == 'peeking_only':
        return self.clean_filter_sample_peeking_gpt(sample)

    if self.mode_answer == 'mixed_full_and_peeking':
        if random.random() < 0.5:
            return self.clean_filter_sample_peeking_gpt(sample)
        else:
            _mode_answer = 'full'
    elif self.mode_answer == 'mixed_full_and_nopack_padding':
        if random.random() < 0.5:
            return self.clean_filter_sample_nopackpadding_gpt(sample)
        else:
            _mode_answer = 'full'
    elif self.mode_answer == 'mixed_final_boxed_and_full':
        if random.random() < 0.5:
            _mode_answer = 'full'
        else:
            _mode_answer = 'final_boxed'
    elif self.mode_answer == 'full':
        _mode_answer = 'full'
    elif self.mode_answer == 'final_boxed':
        _mode_answer = 'final_boxed'
    else:
        raise NotImplementedError(
            f"self.mode_answer = {self.mode_answer} not recognized.")

    if _mode_answer == 'full':
        question, answer = sample

        if self.clean_numbers:
            question = _clean_numbers(question)
            answer = _clean_numbers(answer)

        answer_final = last_boxed_only_string(answer)

        question_ids = torch.LongTensor(
            self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
        sep_ids_2 = torch.LongTensor(
            self.tokenizer.encode("\nFULL SOLUTION:\n", verbose=False))

        answer_ids = self.tokenizer.encode(answer, verbose=False)
        answer_ids.append(self.tokenizer.eos_token_id)
        answer_ids = torch.LongTensor(answer_ids)

        input_ids = torch.cat([question_ids, sep_ids_2, answer_ids], dim=0)

        # Only answer_ids contribute to the loss
        label_ids = torch.cat([
            torch.ones_like(question_ids) * -100,
            torch.ones_like(sep_ids_2) * -100,
            answer_ids.clone()
        ], dim=0)

    elif _mode_answer == 'final_boxed':
        question, answer = sample

        if self.clean_numbers:
            question = _clean_numbers(question)
            answer = _clean_numbers(answer)

        answer_final = last_boxed_only_string(answer)
        if not answer_final:
            print("ERROR FROM", question, answer)
            return None

        question_ids = torch.LongTensor(
            self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
        sep_ids_1 = torch.LongTensor(
            self.tokenizer.encode("\nFINAL ANSWER:\n", verbose=False))

        answer_final_ids = self.tokenizer.encode(answer_final, verbose=False)
        answer_final_ids.append(self.tokenizer.eos_token_id)
        answer_final_ids = torch.LongTensor(answer_final_ids)

        input_ids = torch.cat([
            question_ids,
            sep_ids_1,
            answer_final_ids,
        ], dim=0)

        # Only answer_final_ids contribute to the loss
        label_ids = torch.cat([
            torch.ones_like(question_ids) * -100,
            torch.ones_like(sep_ids_1) * -100,
            answer_final_ids.clone(),
        ], dim=0)
    else:
        raise NotImplementedError()

    # Stop early if this Q,A pair is too long
    if input_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
        return None

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
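# Illustrative sketch: what the 'final_boxed' branch trains against. The repo's
# last_boxed_only_string() (defined elsewhere) extracts the last \boxed{...} span
# from a solution string; a simplified, hypothetical stand-in for flat (non-nested)
# cases looks like:
#
#   import re
#
#   def extract_last_boxed_simple(solution):
#       matches = re.findall(r"\\boxed\{[^{}]*\}", solution)
#       return matches[-1] if matches else None
#
#   extract_last_boxed_simple("We get $2+2=\\boxed{4}$.")   # "\boxed{4}"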
def clean_filter_sample_peeking_gpt_eval(self, sample):
    """
    Does the actual tokenization. Should be parallelized because it can be a bit slow.
    """
    if sample is None:
        return None

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = _clean_numbers(answer)

    answer_final = last_boxed_only_string(answer)

    question_ids = torch.LongTensor(
        self.tokenizer.encode("\nQUESTION:\n" + question + "\nFULL SOLUTION:\n",
                              verbose=False))

    answer_ids = self.tokenizer.tokenize(answer)
    answer_ids_full = torch.LongTensor(self.tokenizer.encode(answer))

    answer_ids = only_until_first_boxed_from_tokens(answer, answer_ids)
    if len(answer_ids) == 0:
        return None
    answer_ids = torch.LongTensor(
        self.tokenizer.encode(answer_ids, verbose=False))

    # Take a fraction
    if isinstance(self.peek_fraction, tuple):
        final_idx = int(len(answer_ids) * random.uniform(*self.peek_fraction))
    else:
        final_idx = int(len(answer_ids) * self.peek_fraction)

    answer_ids = answer_ids[:final_idx]

    # sep_ids = torch.LongTensor(self.tokenizer.encode("\nFINAL ANSWER\n", verbose=False))

    final_answer_ids = answer_ids_full[final_idx:]
    # print(final_answer_ids)

    input_ids = torch.cat(
        [
            question_ids,
            answer_ids,
            # sep_ids,
        ],
        dim=0)

    # Only final_answer_ids contribute to the loss
    label_ids = torch.cat([final_answer_ids.clone()], dim=0)

    # Stop early if this Q,A pair is too long
    if input_ids.shape[0] + label_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
        return None

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
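# Illustrative sketch (toy ids, not from this repo): the "peeking" evaluation shows
# the model a leading fraction of the gold solution tokens as extra context and
# holds out the remainder as the labels to predict.
#
#   import random
#   import torch
#
#   solution_ids = torch.arange(100)          # stand-in for a tokenized solution
#   peek_fraction = (0.1, 0.5)                # same convention as self.peek_fraction
#   frac = random.uniform(*peek_fraction)
#   final_idx = int(len(solution_ids) * frac)
#   peeked, held_out = solution_ids[:final_idx], solution_ids[final_idx:]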