def memory_merge(prompt, context, tokenizer, maxHistory=1024):
    assert (prompt + context)
    # print(prompt + context)
    # logger.debug('RAW TEXT INPUT IS:`%r`', context)
    # The tokenizer is kind of broken for the first input, especially if it includes white space.
    # Same with any trailing white space on the last output.
    # I'm going with the add_prefix_space option but I'm not sure it's quite right.
    prompt_tokens = tokenizer.encode(prompt,
                                     add_special_tokens=False,
                                     add_prefix_space=True)
    if len(prompt_tokens) >= maxHistory:
        logger.debug("Clamping the amount of prompt tokens.")
        context_tokens = prompt_tokens[-maxHistory:]
    else:
        context_tokens = hackyEncode(tokenizer,
                                     hackyWhiteSpaceCutter(prompt) + context)
        context_tokens = context_tokens[-(maxHistory - len(prompt_tokens)):]
        # logger.debug('DECODED CONTEXT TOKENS: `%r`', tokenizer.convert_ids_to_tokens(context_tokens))
        prompt_tokens.extend(context_tokens)
        context_tokens = prompt_tokens
    # logger.debug('DECODED OUTPUT IS: `%r`', tokenizer.decode(context_tokens, clean_up_tokenization_spaces=False))
    # This is a hack and it should be up to the sampler to deal with max size.
    if len(context_tokens) > maxHistory:
        logger.error("CONTEXT IS TOO LONG ERROR")
        context_tokens = context_tokens[-maxHistory:]
    return context_tokens
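# Hedged usage sketch (not part of the original file): the token-budget arithmetic that
# memory_merge implements, shown standalone with a stock GPT-2 tokenizer so the clamping
# behaviour is easy to verify. The hackyEncode/hackyWhiteSpaceCutter workarounds are omitted
# and the helper name below is hypothetical.
def _memory_merge_budget_demo(prompt, context, max_history=1024):
    from transformers import GPT2TokenizerFast
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
    if len(prompt_ids) >= max_history:
        # The prompt alone already fills the budget; keep only its tail.
        return prompt_ids[-max_history:]
    context_ids = tokenizer.encode(context, add_special_tokens=False)
    # Keep the full prompt, then the most recent context tokens that still fit.
    return prompt_ids + context_ids[-(max_history - len(prompt_ids)):]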
def clean_suggested_action(result_raw, min_length=4):
    result_cleaned = standardize_punctuation(result_raw)
    result_cleaned = cut_trailing_sentence(result_cleaned, allow_action=True)

    # The generated actions carry on into the next prompt, so let's remove the prompt.
    results = result_cleaned.split("\n")
    results = [s.strip() for s in results]
    results = [s for s in results if len(s) > min_length]

    # Sometimes actions are generated with a leading > ! . or ?,
    # likely the model trying to finish the prompt or start an action.
    result = results[0].strip().lstrip(" >!.?") if len(results) else ""
    # result = cut_trailing_quotes(result)
    logger.debug(
        "full suggested action '%r'. Cropped: '%r'. Split '%r'",
        result_raw,
        result,
        results,
    )
    # Often actions are cropped to sentence fragments; let's remove them.
    # Or we could just turn up config_act["generate-number"].
    result = first_to_second_person(result)

    # Sometimes the suggestion starts with "You"; we will add that on later anyway, so remove it here.
    # result = re.sub("^ ?[Yy]ou try to ?", "", result)
    # result = re.sub("^ ?[Yy]ou start to ?", "", result)
    # result = re.sub("^ ?[Yy]ou ", "", result)

    logger.debug("suggested action after cleaning `%r`", result)
    return result
def generate(self, context, prompt='', temperature=None, top_p=None,
             top_k=None, repetition_penalty=None, depth=0):
    assert (top_k is not None)
    assert (temperature is not None)
    assert (top_p)
    assert (repetition_penalty)
    # logger.debug("BEFORE PROMPT_REPLACE: `%r`", prompt)
    # prompt = [self.prompt_replace(p) for p in prompt]
    # logger.debug("AFTER PROMPT_REPLACE is: `%r`", repr(prompt))
    assert (prompt + context)

    text = self.generate_raw(context,
                             prompt,
                             temperature=temperature,
                             top_k=top_k,
                             top_p=top_p,
                             repetition_penalty=repetition_penalty,
                             stop_tokens=torch.tensor(
                                 [[self.tokenizer.eos_token_id]]))

    logger.debug("Generated result is: `%r`", repr(text))

    result = self.result_replace(text)

    if (depth > 6) and len(result) == 0:
        # Sometimes it keeps generating a story starting with an action (">"); if it's tried a few
        # times and it keeps happening, let it keep action text which starts with ">".
        # We could just blacklist that token and force it to generate something else. TODO
        result = self.result_replace(text, allow_action=True)
        logger.info(
            "Model generated empty text after formatting `%r`. Trying to format less with allow_action=True. `%r`",
            text,
            result,
        )

    # Same here as above.
    if len(result) == 0:
        if depth < 20:
            logger.info("Model generated empty text, trying again %r", depth)
            return self.generate(context,
                                 prompt,
                                 temperature=temperature,
                                 top_p=top_p,
                                 top_k=top_k,
                                 repetition_penalty=repetition_penalty,
                                 depth=depth + 1)
        else:
            logger.warning(
                "Model generated empty text %r times. Try another action",
                depth)

    return result
def generate_raw(self, context, prompt='', generate_num=None, temperature=None,
                 top_k=None, top_p=None, repetition_penalty=None, stop_tokens=None):
    assert (top_k is not None)
    assert (temperature is not None)
    assert (top_p)
    assert (repetition_penalty)

    context_tokens = memory_merge(prompt, context, self.tokenizer,
                                  self.max_history_tokens)

    # if os.environ.get("DEBUG_GPT2", False):
    logger.debug(
        "Text passing into model `%r`",
        self.tokenizer.decode(
            context_tokens,
            clean_up_tokenization_spaces=True,
            # skip_special_tokens=True,
        ),
    )

    generated = 0
    for _ in range(self.samples // self.batch_size):
        out = self.sample_sequence(
            context_tokens,
            generate_num=generate_num,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            stop_tokens=stop_tokens,
        )
        out = out[:, len(context_tokens):].tolist()
        for o in out:
            generated += 1
            # Disabled clean-up of spaces; see what effect this has. TODO
            text = self.tokenizer.decode(o,
                                         clean_up_tokenization_spaces=False,
                                         skip_special_tokens=True)
            if self.stop_token:
                index = text.find(self.stop_token)
                if index == -1:
                    index = None
                text = text[:index]
            if stop_tokens is not None:
                for stop_token in stop_tokens:
                    # FIXME: this looks up self.stop_token each time, not the loop's stop_token
                    # (which is a token id, not a string).
                    index = text.find(self.stop_token)
                    if index == -1:
                        index = None
                    text = text[:index]
    return text
def generate_raw(self, context, prompt='', generate_num=None, temperature=None,
                 top_k=None, top_p=None, repetition_penalty=None,
                 repetition_penalty_range=512, repetition_penalty_slope=3.33,
                 stop_tokens=None):
    assert (top_k is not None)
    assert (temperature is not None)
    assert (top_p)
    assert (repetition_penalty)

    context_tokens = memory_merge(prompt, context, self.tokenizer,
                                  self.max_history_tokens)

    logger.debug(
        "Text passing into model `%r`",
        self.tokenizer.decode(
            context_tokens,
            clean_up_tokenization_spaces=True,
            # skip_special_tokens=True,
        ),
    )

    generated = 0
    text = ""
    for _ in range(self.samples // self.batch_size):
        out = self.sample_sequence(
            context_tokens,
            generate_num=generate_num,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            repetition_penalty_range=repetition_penalty_range,
            repetition_penalty_slope=repetition_penalty_slope,
            stop_tokens=stop_tokens,
        )
        text += out.text
        generated += 1
        # Disabled clean-up of spaces; see what effect this has. TODO
        if self.stop_token:
            index = text.find(self.stop_token)
            if index == -1:
                index = None
            text = text[:index]
        if stop_tokens is not None:
            for stop_token in stop_tokens:
                # FIXME: this looks up self.stop_token each time, not the loop's stop_token
                # (which is a token id, not a string).
                index = text.find(self.stop_token)
                if index == -1:
                    index = None
                text = text[:index]
    return text
def generate_raw(self, prompt, generate_num=None, temperature=None, stop_tokens=None):
    # The prompt is a list of strings; encode each one to tokens, then truncate the longest ones.
    context_tokens = [
        self.tokenizer.encode(p,
                              add_special_tokens=False,
                              max_length=self.max_history_tokens)
        for p in prompt
    ]
    truncate_multiple_sequences(context_tokens, self.max_history_tokens)
    context_tokens = list(itertools.chain(*context_tokens))

    # if os.environ.get("DEBUG_GPT2", False):
    logger.debug(
        "Text passing into model `%r`",
        self.tokenizer.decode(
            context_tokens,
            clean_up_tokenization_spaces=True,
            skip_special_tokens=True,
        ),
    )

    generated = 0
    for _ in range(self.samples // self.batch_size):
        out = self.sample_sequence(
            context_tokens,
            generate_num=generate_num,
            temperature=temperature,
            stop_tokens=stop_tokens,
        )
        out = out[:, len(context_tokens):].tolist()
        for o in out:
            generated += 1
            text = self.tokenizer.decode(o,
                                         clean_up_tokenization_spaces=True,
                                         skip_special_tokens=True)
            if self.stop_token:
                index = text.find(self.stop_token)
                if index == -1:
                    index = None
                text = text[:index]
            if stop_tokens is not None:
                for stop_token in stop_tokens:
                    # FIXME: this looks up self.stop_token each time, not the loop's stop_token
                    # (which is a token id, not a string).
                    index = text.find(self.stop_token)
                    if index == -1:
                        index = None
                    text = text[:index]
    return text
def result_replace(self, result, allow_action=False):
    # logger.debug("BEFORE RESULT_REPLACE: `%s`", repr(result))

    result = cut_trailing_sentence(result, allow_action=allow_action)

    if len(result) == 0:
        return ""
    first_letter_capitalized = result[0].isupper()
    result = result.replace('."', '".')
    result = result.replace("#", "")
    result = result.replace("*", "")
    result = result.replace("\n\n", "\n")
    # result = first_to_second_person(result)
    if not first_letter_capitalized:
        result = result[0].lower() + result[1:]

    logger.debug("AFTER RESULT_REPLACE: `%r`. allow_action=%r", repr(result),
                 allow_action)

    return result
def sample_sequence(model, length, context, temperature=1, top_k=0, top_p=0.9,
                    repetition_penalty=1.0, device="cpu", stop_tokens=None,
                    tokenizer=None):
    """Actually generate the tokens"""
    logger.debug('temp: {} top_k: {} top_p: {} rep-pen: {}'.format(
        temperature, top_k, top_p, repetition_penalty))

    max_length = context.shape[1] + length  # check to see if greater than 2048?

    if settings.getboolean('force-cpu'):
        context = context.long().cpu()
    else:
        context = context.long().cuda()

    out = model.generate(
        context,
        do_sample=True,
        min_length=max_length,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        repetition_penalty_range=300,
        repetition_penalty_slope=3.33,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    ).long()

    generated = tokenizer.decode(out[0])
    return generated
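# NOTE (hedged): `repetition_penalty_range` and `repetition_penalty_slope` are not keyword
# arguments of stock Hugging Face `model.generate()`; the call above assumes a patched
# transformers build (such as the finetuneanon fork commonly used with GPT-Neo checkpoints).
# On a vanilla transformers install those two arguments would be rejected as unused
# model_kwargs and would have to be dropped, or the sloped penalty applied manually as in
# the token-by-token sample_sequence variant further down in this file.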
def generate(self, prompt, options=None, seed=None, depth=0):
    logger.debug("BEFORE PROMPT_REPLACE: `%r`", prompt)

    prompt = [self.prompt_replace(p) for p in prompt]

    # logger.debug("AFTER PROMPT_REPLACE is: `%r`", repr(prompt))

    text = self.generate_raw(prompt,
                             stop_tokens=self.tokenizer.encode(
                                 ["<|endoftext|>", ">"]))

    logger.debug("Generated result is: `%r`", repr(text))

    result = self.result_replace(text)

    if (depth > 6) and len(result) == 0:
        # Sometimes it keeps generating a story starting with an action (">"); if it's tried a few
        # times and it keeps happening, let it keep action text which starts with ">".
        result = self.result_replace(text, allow_action=True)
        logger.info(
            "Model generated empty text after formatting `%r`. Trying to format less with allow_action=True. `%r`",
            text,
            result,
        )

    if len(result) == 0:
        if depth < 20:
            logger.info("Model generated empty text, trying again %r", depth)
            return self.generate(prompt + [" {}".format(depth)],
                                 seed=depth,
                                 depth=depth + 1)
        else:
            logger.warning(
                "Model generated empty text %r times. Try another action",
                depth)

    return result
def play(generator):
    print("\n")

    with open(Path("interface", "mainTitle.txt"), "r", encoding="utf-8") as file:
        colPrint(file.read(), colors["title"], wrap=False)

    with open(Path("interface", "subTitle.txt"), "r", encoding="utf-8") as file:
        cols = termWidth
        for line in file:
            line = re.sub(r'\n', '', line)
            line = line[:cols]
            # Fill in the graphic using reverse-video mode substituted into the areas between |'s.
            colPrint(
                re.sub(r'\|[ _]*(\||$)',
                       lambda x: '\x1B[7m' + x.group(0) + '\x1B[27m', line),
                colors['subtitle'], False)

    print()
    colPrint(
        "Go to https://github.com/cloveranon/Clover-Edition/ or email [email protected] for bug reports, help, and feature requests.",
        colors['subsubtitle'])

    while True:
        # May be needed to avoid running out of memory.
        gc.collect()
        torch.cuda.empty_cache()

        print("\n\n")

        colPrint(
            "0: Pick Prompt From File (Default if you type nothing)\n1: Write Custom Prompt",
            colors['menu'])

        if getNumberInput(1) == 1:
            with open(Path("interface", "prompt-instructions.txt"), "r", encoding="utf-8") as file:
                colPrint(file.read(), colors["instructions"], False)
            prompt = colInput("Prompt>", colors["main-prompt"], colors["user-text"])
            context = colInput("Context>", colors["main-prompt"], colors["user-text"])
            filename = colInput(
                "Name to save prompt as? (Leave blank for no save): ",
                colors["query"],
                colors["user-text"],
            )
            filename = re.sub(
                "-$", "",
                re.sub("^-", "", re.sub("[^a-zA-Z0-9_-]+", "-", filename)))
            if filename != "":
                with open(Path("prompts", filename + ".txt"), "w", encoding="utf-8") as f:
                    f.write(context + "\n" + prompt)
        else:
            prompt, context = selectFile()
        assert (prompt + context)

        instructions()

        print()
        colPrint("Generating story...", colors["loading-message"])

        story = newStory(generator, prompt, context)

        while True:
            # Generate suggested actions
            act_alts = settings.getint("action-sugg")
            if act_alts > 0:
                # TODO change this to two messages for different colors
                suggested_actions = []
                colPrint("\nSuggested actions:", colors["selection-value"])
                action_suggestion_lines = 2
                for i in range(act_alts):
                    suggested_action = story.getSuggestion()
                    if len(suggested_action.strip()) > 0:
                        j = len(suggested_actions)
                        suggested_actions.append(suggested_action)
                        suggestion = "{}> {}".format(j, suggested_action)
                        action_suggestion_lines += colPrint(
                            suggestion, colors["selection-value"])
                print()

            bell()
            action = colInput("> You ", colors["main-prompt"], colors["user-text"])

            # Clear suggestions and user input
            if act_alts > 0:
                action_suggestion_lines += 2
                if not IN_COLAB:
                    clear_lines(action_suggestion_lines)

                # Show user input again
                # colPrint("\n> " + action.rstrip(), colors["user-text"], end="")

            setRegex = re.search("^/set ([^ ]+) ([^ ]+)$", action)
            if setRegex:
                if setRegex.group(1) in settings:
                    currentSettingValue = settings[setRegex.group(1)]
                    colPrint(
                        "Current Value of {}: {}     Changing to: {}".format(
                            setRegex.group(1), currentSettingValue,
                            setRegex.group(2)))
                    settings[setRegex.group(1)] = setRegex.group(2)
                    colPrint("Save config file?", colors["query"])
                    colPrint("Saving an invalid option will corrupt file!",
                             colors["error"])
                    if (colInput(
                            "y/n? >",
                            colors["selection-prompt"],
                            colors["selection-value"],
                    ) == "y"):
                        with open("config.ini", "w", encoding="utf-8") as file:
                            config.write(file)
                else:
                    colPrint("Invalid Setting", colors["error"])
                    instructions()
            elif action == "/menu":
                break
            elif action == "/restart":
                print()
                colPrint("Restarting story...", colors["loading-message"])
                story = newStory(generator, story.prompt, context)
                continue
            elif action == "/quit":
                exit()
            elif action == "/help":
                instructions()
            elif action == "/print":
                print("\nPRINTING\n")
                # TODO colorize printed story
                colPrint(str(story), colors["print-story"])
            elif action == '/retry':
                if len(story.story) == 1:
                    print()
                    colPrint("Restarting story...", colors["loading-message"])
                    story = newStory(generator, story.prompt, context)
                    continue
                else:
                    newaction = story.story[-1][0]
                    colPrint(newaction, colors['user-text'], end='')
                    story.story = story.story[:-1]
                    result = "\n" + story.act(newaction)[0]
                    if len(story.story) >= 2:
                        similarity = get_similarity(result, story.story[-2][1][0])
                        if similarity > 0.9:
                            story.story = story.story[:-1]
                            colPrint(
                                "Woops that action caused the model to start looping. Try a different action to prevent that.",
                                colors["error"],
                            )
                            continue
                    colPrint(result, colors["ai-text"])
                continue
            elif action == '/revert':
                if len(story.story) == 1:
                    colPrint("You can't go back any farther. ", colors["error"])
                    continue
                story.story = story.story[:-1]
                colPrint("Last action reverted. ", colors["message"])
                if len(story.story) < 2:
                    colPrint(story.prompt, colors["ai-text"])
                colPrint(story.story[-1][1][0], colors["ai-text"])
                continue
            elif action == "/alter":
                story.story[-1][1][0] = alterText(story.story[-1][1][0])
                if len(story.story) < 2:
                    colPrint(story.prompt, colors["ai-text"])
                else:
                    colPrint("\n" + story.story[-1][0] + "\n",
                             colors["transformed-user-text"])
                colPrint("\n" + story.story[-1][1][0] + "\n\n", colors["ai-text"])
            elif action == "/prompt":
                story.prompt = alterText(story.prompt)
                if len(story.story) < 2:
                    colPrint(story.prompt, colors["ai-text"])
                else:
                    colPrint("\n" + story.story[-1][0] + "\n",
                             colors["transformed-user-text"])
                colPrint("\n" + story.story[-1][1][0] + "\n\n", colors["ai-text"])
            else:
                if act_alts > 0:
                    # Options to select a suggested action
                    if action in [str(i) for i in range(len(suggested_actions))]:
                        action = suggested_actions[int(action)]

                original_action = action
                action = action.strip()
                # TODO debug stuff to delete
                if action != original_action:
                    logger.debug("STRIPPED WHITE SPACE OFF ACTION %r vs %r",
                                 original_action, action)

                # Crop actions to a max length
                # action = action[:4096]

                if action != "":
                    # Roll a 20-sided die to make things interesting.
                    d = random.randint(1, 20)
                    logger.debug("roll d20=%s", d)

                    # If it says 'You say "' then it's still dialogue. Normalise it by removing
                    # `You say `; we will add it again soon.
                    action = re.sub("^ ?[Yy]ou say [\"']", '"', action)
                    if any(action.lstrip().startswith(t) for t in ['"', "'"]):
                        if settings.getboolean("action-d20"):
                            action = d20ify_speech(action, d)
                        else:
                            action = "You say " + action
                        logger.info(
                            "%r. %r, %r", action,
                            any(action.lstrip().startswith(t) for t in ['"', "'"]),
                            settings.getboolean("action-d20"))
                    else:
                        action = first_to_second_person(action)
                        if not action.lower().startswith(
                                "you ") and not action.lower().startswith("i "):
                            action = action[0].lower() + action[1:]
                            # roll a d20
                            if settings.getboolean("action-d20"):
                                action = d20ify_action(action, d)
                            else:
                                action = "You " + action

                        if action[-1] not in [".", "?", "!"]:
                            action = action + "."

                    action = "\n> " + action + "\n"

                colPrint(
                    "\n>" + action.lstrip().lstrip("> \n"),
                    colors["transformed-user-text"],
                )
                # TODO check if leading white space makes sense
                result = "\n" + story.act(action)[0]

                # TODO: Replace all this nonsense
                if len(story.story) >= 2:
                    similarity = get_similarity(result, story.story[-2][1][0])
                    if similarity > 0.9:
                        story.story = story.story[:-1]
                        colPrint(
                            "Woops that action caused the model to start looping. Try a different action to prevent that.",
                            colors["error"],
                        )
                        continue

                if player_won(result):
                    colPrint(result + "\n CONGRATS YOU WIN", colors["message"])
                    break
                elif player_died(result):
                    colPrint(result, colors["ai-text"])
                    colPrint("YOU DIED. GAME OVER", colors["error"])
                    colPrint(
                        "\nOptions:\n0)Start a new game\n1)\"I'm not dead yet!\" (If you didn't actually die)",
                        colors["menu"],
                    )
                    choice = getNumberInput(1)
                    if choice == 0:
                        break
                    else:
                        colPrint("Sorry about that...where were we?",
                                 colors["query"])
                colPrint(result, colors["ai-text"])
def sample_sequence(model, length, context, temperature=1, top_k=0, top_p=0.9,
                    repetition_penalty=1.0, repetition_penalty_range=512,
                    repetition_penalty_slope=3.33, device="cpu",
                    stop_tokens=None, tokenizer=None):
    """Actually generate the tokens"""
    logger.debug(
        'temp: {} top_k: {} top_p: {} rep-pen: {} rep-pen-range: {} rep-pen-slope: {}'
        .format(temperature, top_k, top_p, repetition_penalty,
                repetition_penalty_range, repetition_penalty_slope))

    context_tokens = context
    context = torch.tensor(context, dtype=torch.long, device=device)
    # context = context.repeat(num_samples, 1)
    generated = context
    USE_PAST = True
    next_token = context
    pasts = None
    clines = 0

    penalty = None
    if repetition_penalty_range is not None and repetition_penalty_slope is not None and repetition_penalty_range > 0:
        penalty = (torch.arange(repetition_penalty_range) /
                   (repetition_penalty_range - 1)) * 2. - 1
        penalty = (repetition_penalty_slope * penalty) / (
            1 + torch.abs(penalty) * (repetition_penalty_slope - 1))
        penalty = 1 + ((penalty + 1) / 2) * (repetition_penalty - 1)

    with torch.no_grad():
        for j in range(length):
            # Why would we ever not use past?
            # Are generated and next_token always the same thing?
            if not USE_PAST:
                input_ids_next = generated
                pasts = None
            else:
                input_ids_next = next_token

            # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            model_kwargs = {"past": pasts, "use_cache": True}
            model_inputs = model.prepare_inputs_for_generation(
                generated.unsqueeze(0), **model_kwargs)
            model_outputs = model(**model_inputs, return_dict=True)
            logits, pasts = model_outputs.logits, model_outputs.past_key_values
            logits = logits[0, -1, :].float()

            # Originally the order was Temperature, Repetition Penalty, then top-k/p
            if settings.getboolean('top-p-first'):
                logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)

            logits = logits / (temperature if temperature > 0 else 1.0)

            # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858), plus range limit
            if repetition_penalty != 1.0:
                if penalty is not None:
                    penalty_len = min(generated.shape[0], repetition_penalty_range)
                    penalty_context = generated[-repetition_penalty_range:]
                    score = torch.gather(logits, 0, penalty_context)
                    penalty = penalty.type(score.dtype).to(score.device)
                    penalty_window = penalty[-penalty_len:]
                    score = torch.where(score < 0, score * penalty_window,
                                        score / penalty_window)
                    logits.scatter_(0, penalty_context, score)
                else:
                    score = torch.gather(logits, 0, generated)
                    score = torch.where(score < 0, score * repetition_penalty,
                                        score / repetition_penalty)
                    logits.scatter_(0, generated, score)

            if not settings.getboolean('top-p-first'):
                logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)

            if temperature == 0:  # greedy sampling:
                next_token = torch.argmax(logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(logits, dim=-1),
                                               num_samples=1)
            generated = torch.cat((generated, next_token), dim=-1)

            # Decode into plain text
            o = generated[len(context_tokens):].tolist()
            generated.text = tokenizer.decode(
                o, clean_up_tokenization_spaces=False, skip_special_tokens=True)
            if use_ptoolkit():
                clear_lines(clines)
                generated.text = format_result(generated.text)
                clines = output(generated.text, "ai-text")

            if ((stop_tokens is not None) and (j > 4)
                    and (next_token[0] in stop_tokens)):
                # Why the minimum number of tokens (j > 4)? Because sometimes the model starts with
                # whitespace, which will get stripped away anyway. Having a minimum number of tokens
                # before we stop usually means we don't just stop because of "\n " or similar.
                logger.debug(
                    "Stopping generation as we found stop tokens. One of `%s`, in '%s'. token generated `%s`",
                    stop_tokens,
                    next_token,
                    j,
                )
                break

    clear_lines(clines)
    return generated
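# Hedged sketch (not from the original file): the sloped repetition-penalty window built above,
# isolated so the math can be checked on its own. The window ramps from ~1.0 for the oldest token
# inside the range up to `repetition_penalty` for the most recent token, and is applied by
# dividing positive logits (and multiplying negative logits) of recently generated tokens.
# The function name below is hypothetical.
import torch


def sloped_penalty_window(repetition_penalty, penalty_range, penalty_slope):
    x = (torch.arange(penalty_range) / (penalty_range - 1)) * 2.0 - 1.0  # spread over -1 .. 1
    x = (penalty_slope * x) / (1 + torch.abs(x) * (penalty_slope - 1))   # sigmoid-like ramp
    return 1 + ((x + 1) / 2) * (repetition_penalty - 1)                  # map to 1 .. penalty


# e.g. sloped_penalty_window(1.2, 8, 3.33) ramps monotonically from 1.0 up to 1.2,
# so tokens generated long ago are barely penalised while the most recent ones get the full penalty.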
def sample_sequence(model, length, context, temperature=1, top_k=0, top_p=0.9,
                    repetition_penalty=1.0, device="cpu", stop_tokens=None,
                    tokenizer=None):
    """Actually generate the tokens"""
    logger.debug('temp: {} top_k: {} top_p: {} rep-pen: {}'.format(
        temperature, top_k, top_p, repetition_penalty))

    context_tokens = context
    context = torch.tensor(context, dtype=torch.long, device=device)
    # context = context.repeat(num_samples, 1)
    generated = context
    USE_PAST = True
    next_token = context
    pasts = None
    clines = 0
    with torch.no_grad():
        for j in range(length):
            # Why would we ever not use past?
            # Are generated and next_token always the same thing?
            if not USE_PAST:
                input_ids_next = generated
                pasts = None
            else:
                input_ids_next = next_token

            # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            logits, pasts = model(input_ids=input_ids_next, past=pasts)
            logits = logits[-1, :].float()

            # Rewrite this logic. TODO
            if settings.getboolean('sparse-gen'):
                # Assumes 'sparse-level' is a float option in the config; the original expression
                # `settings.sparse-level` would be parsed as a subtraction.
                probs = entmax_bisect(logits, dim=-1,
                                      alpha=settings.getfloat('sparse-level'))
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                # Originally the order was Temperature, Repetition Penalty, then top-k/p
                if settings.getboolean('top-p-first'):
                    logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)

                logits = logits / (temperature if temperature > 0 else 1.0)

                # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
                for k in set(generated.tolist()):
                    logits[k] /= repetition_penalty

                if not settings.getboolean('top-p-first'):
                    logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)

                if temperature == 0:  # greedy sampling:
                    next_token = torch.argmax(logits, dim=-1).unsqueeze(-1)
                else:
                    next_token = torch.multinomial(F.softmax(logits, dim=-1),
                                                   num_samples=1)

            generated = torch.cat((generated, next_token), dim=-1)

            # Decode into plain text
            o = generated[len(context_tokens):].tolist()
            generated.text = tokenizer.decode(
                o, clean_up_tokenization_spaces=False, skip_special_tokens=True)
            if use_ptoolkit():
                clear_lines(clines)
                generated.text = format_result(generated.text)
                clines = output(generated.text, "ai-text")

            if ((stop_tokens is not None) and (j > 4)
                    and (next_token[0] in stop_tokens)):
                # Why the minimum number of tokens (j > 4)? Because sometimes the model starts with
                # whitespace, which will get stripped away anyway. Having a minimum number of tokens
                # before we stop usually means we don't just stop because of "\n " or similar.
                logger.debug(
                    "Stopping generation as we found stop tokens. One of `%s`, in '%s'. token generated `%s`",
                    stop_tokens,
                    next_token,
                    j,
                )
                break

    clear_lines(clines)
    return generated
def sample_sequence(model, length, context, num_samples=1, temperature=1,
                    top_k=0, top_p=0.9, repetition_penalty=1.0, is_xlnet=False,
                    is_xlm_mlm=False, xlm_mask_token=None, xlm_lang=None,
                    device="cpu", stop_tokens=None, tokenizer=None):
    logger.debug('temp: {} top_k: {} top_p: {} rep-pen: {}'.format(
        temperature, top_k, top_p, repetition_penalty))

    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
    USE_PAST = True
    next_token = context
    outputs = None
    with torch.no_grad():
        for j in range(length):
            # Why would we ever not use past?
            # Are generated and next_token always the same thing?
            if USE_PAST:
                past = outputs[1] if outputs is not None else None
                inputs = {"input_ids": next_token, "past": past}
            else:
                inputs = {"input_ids": generated}

            outputs = model(
                **inputs
            )  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            logits = outputs[0][:, -1, :].float()

            # Originally the order was Temperature, Repetition Penalty, then top-k/p
            if settings.getboolean('top-p-first'):
                logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)

            logits = logits / (temperature if temperature > 0 else 1.0)

            # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
            for i in range(num_samples):
                for k in set(generated[i].tolist()):
                    logits[i, k] /= repetition_penalty

            if not settings.getboolean('top-p-first'):
                logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)

            if temperature == 0:  # greedy sampling:
                next_token = torch.argmax(logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(logits, dim=-1),
                                               num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)

            if ((stop_tokens is not None) and (j > 4)
                    and (next_token[0][0] in stop_tokens)):
                # Why the minimum number of tokens (j > 4)? Because sometimes the model starts with
                # whitespace, which will get stripped away anyway. Having a minimum number of tokens
                # before we stop usually means we don't just stop because of "\n " or similar.
                logger.debug(
                    "Stopping generation as we found stop tokens. One of `%s`, in '%s'. token generated `%s`",
                    stop_tokens,
                    next_token,
                    j,
                )
                break
    return generated
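# Hedged sketch (not the repo's own definition): a minimal 1-D version of the
# top_k_top_p_filtering helper the samplers above call, in the style of the standard
# Hugging Face example implementation. The project's actual helper may differ, e.g. to
# handle the batched (num_samples) case used in the last sample_sequence variant.
import torch
import torch.nn.functional as F


def top_k_top_p_filtering_sketch(logits, top_k=0, top_p=0.0, filter_value=-float("inf")):
    # top-k: keep only the k highest-scoring tokens.
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        kth_value = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_value] = filter_value
    # top-p (nucleus): keep the smallest set of tokens whose cumulative probability exceeds top_p.
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift right so the first token above the threshold is kept as well.
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits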