def load_model(train_steps, num_warmup_steps):
    try:
        # Try to load the fine-tuned model from the local path first.
        tokenizer = load_tokenizer()
        config = GPT2Config.from_pretrained(configs.model_path, return_dict=False)
        model = TFGPT2LMHeadModel.from_pretrained(configs.model_path, return_dict=False)
        print("model loaded from local!")
    except Exception:
        # Fall back to the remote checkpoint on the Hugging Face hub.
        tokenizer = BertTokenizer.from_pretrained("mymusise/gpt2-medium-chinese")
        model = TFGPT2LMHeadModel.from_pretrained(
            "mymusise/gpt2-medium-chinese", return_dict=False)
        print("model loaded from remote!")

    loss = model.compute_loss
    optimizer = nlp.optimization.create_optimizer(
        5e-5, num_train_steps=train_steps, num_warmup_steps=num_warmup_steps)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(
        optimizer=optimizer,
        loss=[loss, *[None] * model.config.n_layer],
        # metrics=[metric]
    )
    return model
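# Note on the `loss=[loss, *[None] * model.config.n_layer]` pattern above (it
# recurs in several snippets below): with return_dict=False and caching
# enabled, the TF GPT-2 model returns the logits followed by one cached
# key/value tensor per layer, and Keras expects one loss entry per model
# output. The None entries tell Keras to compute the loss on the logits only
# and skip the cache outputs. This explanation is an inference from the
# transformers TF API, not a comment from the original source.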
def load_model():
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Use the EOS token as the PAD token to avoid warnings.
    model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
    return tokenizer, model
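# A minimal usage sketch for load_model() above; the prompt string is an
# illustrative assumption, not from the original source:
tokenizer, model = load_model()
input_ids = tokenizer.encode("The quick brown fox", return_tensors="tf")
output = model.generate(input_ids, max_length=30)
print(tokenizer.decode(output[0], skip_special_tokens=True))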
def alternate_sentences(pos, sentence):
    GPT2tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    GPT2model = TFGPT2LMHeadModel.from_pretrained(
        "gpt2", pad_token_id=GPT2tokenizer.eos_token_id)
    # GPT2tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
    # GPT2model = TFGPT2LMHeadModel.from_pretrained("distilgpt2", pad_token_id=GPT2tokenizer.eos_token_id)
    partial_sentence = get_np_vp(pos, sentence)
    input_ids = GPT2tokenizer.encode(partial_sentence, return_tensors='tf')
    maximum_length = len(partial_sentence.split()) + 40

    # Activate top-k and top-p sampling, drawing only from the 80% most
    # likely words (top_p=0.80).
    sample_outputs = GPT2model.generate(
        input_ids,
        do_sample=True,
        max_length=maximum_length,
        top_p=0.80,  # 0.85
        top_k=30,  # 30
        repetition_penalty=10.0,
        num_return_sequences=10)

    generated_sentences = []
    sentence = sentence.replace("\n", "")
    for i, sample_output in enumerate(sample_outputs):
        decoded_sentence = GPT2tokenizer.decode(sample_output, skip_special_tokens=True)
        # final_sentence = decoded_sentence
        # Keep only the first sentence of each sample and strip line breaks.
        final_sentence = tokenize.sent_tokenize(decoded_sentence)[0]
        final_sentence = final_sentence.replace("\r\n", "")
        final_sentence = final_sentence.replace("\n", "")
        generated_sentences.append(final_sentence)
    generated_sentences.append(sentence)
    if len(generated_sentences) > 2:
        return generated_sentences[-2:]
    else:
        return generated_sentences
def load_model(category):
    model_path = f"../models/model_gpt2_{category}"
    tokenizer = GPT2Tokenizer.from_pretrained(model_path, local_files_only=True)
    model = TFGPT2LMHeadModel.from_pretrained(model_path, local_files_only=True)
    return model, tokenizer
def generate_text(self, prefix=None, file_data=True, max_length=512,
                  do_sample=True, top_k=50, top_p=0.9, temperature=0.3,
                  return_sequences=2):
    '''
    Takes initial text and generates the specified amount of additional text
    using top-p sampling.
    :param prefix: initial text to start with
    :param ...: several decoding hyperparameters with the given defaults
    :return: complete generated text
    '''
    if return_sequences < 1:
        raise Exception(
            "return sequences number is less than 1 (need an integer of at least 1)")
    if max_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")

    with NoStdStreams():
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        model = TFGPT2LMHeadModel.from_pretrained(
            "gpt2", pad_token_id=tokenizer.eos_token_id)

    if file_data:
        with open(self.dataset, "r") as f:
            input_ids = tokenizer.encode(f.read(), return_tensors='tf',
                                         max_length=max_length - 1,
                                         truncation=True)
    else:
        input_ids = tokenizer.encode(prefix, return_tensors='tf',
                                     max_length=max_length - 1,
                                     truncation=True)

    logger("Generating text now...")
    tf.random.set_seed(0)
    output = model.generate(input_ids,
                            do_sample=do_sample,
                            max_length=max_length,
                            top_k=top_k,
                            top_p=top_p,
                            temperature=temperature,
                            num_return_sequences=return_sequences)

    total_text = ""
    for i, sample_output in enumerate(output):
        total_text += "{}: {}".format(
            i, tokenizer.decode(sample_output, skip_special_tokens=True))

    self.models['text_generation'] = {"generated_text": total_text}
    return self.models['text_generation']
def __init__(self, inp_context):
    # User input context.
    self.inp_context = inp_context
    # The transformers tokenizer and model; the EOS token doubles as the PAD
    # token to avoid warnings.
    self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    self.model = TFGPT2LMHeadModel.from_pretrained(
        "gpt2", pad_token_id=self.tokenizer.eos_token_id)
def load_model_tokenizer_GPT2():
    """
    Loads the GPT-2 model from local disk. Replace the local path with "gpt2"
    to download the checkpoint from the Hugging Face hub instead.
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    tokenizer = GPT2Tokenizer.from_pretrained(f'{dir_path}\\gpt2_model')
    model = TFGPT2LMHeadModel.from_pretrained(f'{dir_path}\\gpt2_model')
    return tokenizer, model
def load_or_train_model(tokenizer, file_paths, gpt_model_path, cumulative_string_path):
    '''
    Tries to load a previously trained model.
    If there is none, trains one on the generated dataset.
    '''
    if os.path.exists(gpt_model_path):
        print('Loading GPT model')
        # from_pretrained expects the model directory, not the weights file.
        return TFGPT2LMHeadModel.from_pretrained(gpt_model_path)
    else:
        print('GPT model not found, training one')
        # Creating the configuration from which the model can be made
        config = GPT2Config(
            vocab_size=tokenizer.vocab_size,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        # Creating the model
        model = TFGPT2LMHeadModel(config)
        # Defining our optimizer
        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
        # Defining our loss function
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        # Defining the metric which we want to observe
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        # Compiling the model
        model.compile(optimizer=optimizer,
                      loss=[loss, *[None] * model.config.n_layer],
                      metrics=[metric])

        # Prepare the training dataset
        dataset = GPTModel.load_or_generate_dataset(tokenizer, file_paths, cumulative_string_path)

        # Execute training
        num_epoch = 3
        model.fit(dataset, epochs=num_epoch)

        # Create the output directory if it is not present
        os.makedirs(gpt_model_path, exist_ok=True)

        # save_pretrained expects a directory and writes both the weights and
        # the config itself, so no separate config export is needed.
        model.save_pretrained(gpt_model_path)

        return model
def __init__(self, next_node):
    super().__init__(next_node)
    # self.dir_path = r'D:\BaiduNetdiskDownload\huggingface\gpt2-chinese-poem'
    # self.dir_path = r'D:\python\nlp_chat_robot\models\model_file\gpt2-chinese-poem'
    self.dir_path = config.poem_gen_node_dir_path
    self.tokenizer = BertTokenizer.from_pretrained(self.dir_path)
    self.model = TFGPT2LMHeadModel.from_pretrained(self.dir_path)
    self.text_generator = TextGenerationPipeline(self.model, self.tokenizer)
def load(self):
    self._tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Check whether a fine-tuned model exists locally.
    if os.path.exists(self._model_path):
        print('loading')
        # https://huggingface.co/transformers/training.html#fine-tuning-in-native-tensorflow-2
        self._model = TFGPT2LMHeadModel.from_pretrained(self._model_path)
        # self._model.load_weights(self._model_path)
    return self
def __init__(self, flags, model_path=HF_MODEL_PATH):
    if flags.model_type == 'tf':
        from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model = TFGPT2LMHeadModel.from_pretrained(
            model_path, pad_token_id=self.tokenizer.eos_token_id)
    else:
        from transformers import GPT2LMHeadModel, GPT2Tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel.from_pretrained(
            model_path, pad_token_id=self.tokenizer.eos_token_id)
    self.flags = flags
def __init__(self, model_name, device, tf_pt='tf'):
    self.tf_pt = tf_pt
    self.device = device
    self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
    if self.tf_pt == 'tf':
        self.model = TFGPT2LMHeadModel.from_pretrained(
            model_name, pad_token_id=self.tokenizer.eos_token_id)
    else:
        self.model = GPT2LMHeadModel.from_pretrained(
            model_name, pad_token_id=self.tokenizer.eos_token_id)
        # Device placement only applies to the PyTorch model (the original
        # called model.device(device), which is not a method on either model);
        # TensorFlow manages placement itself.
        self.model.to(device)
def init_model(
    tokenizer: BertTokenizer,
    train_steps: int = 20000,
    num_warmup_steps: int = 1000,
    model_path: str = configs.model_path,
) -> TFGPT2LMHeadModel:
    try:
        model = TFGPT2LMHeadModel.from_pretrained(model_path, return_dict=False)
    except EnvironmentError:
        config = GPT2Config(
            architectures=["TFGPT2LMHeadModel"],
            model_type="TFGPT2LMHeadModel",
            tokenizer_class="BertTokenizer",
            vocab_size=tokenizer.vocab_size,
            n_positions=configs.model.n_positions,
            n_ctx=configs.model.n_ctx,
            n_embd=configs.model.n_embd,
            n_layer=configs.model.n_layer,
            n_head=configs.model.n_head,
            d_model=configs.model.n_embd,
            num_heads=configs.model.n_head,
            pad_token_id=tokenizer.pad_token_id,
            task_specific_params={
                "text-generation": {
                    "do_sample": True,
                    "max_length": 120
                }
            },
            return_dict=False,
            output_attentions=False,
            output_hidden_states=False,
            use_cache=False,
        )
        model = TFGPT2LMHeadModel(config)

    loss = model.compute_loss
    optimizer = nlp.optimization.create_optimizer(
        5e-5, num_train_steps=train_steps, num_warmup_steps=num_warmup_steps)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    # metric = Mymetrice('accuracy')
    model.compile(
        optimizer=optimizer,
        loss=[loss, *[None] * model.config.n_layer],
        metrics=[metric]
    )
    return model
def main(highQualityMode=False):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    tokenizer_save_path = dir_path + "/saved_tokenizer"
    model_out_dir = dir_path + "/model/"
    input_dir = dir_path + "/input.txt"
    output_dir = dir_path + "/output.txt"

    try:
        with open(input_dir, 'r') as r:
            text = r.read()
        os.remove(input_dir)
    except OSError:
        raise SystemExit("Input not found!")

    # Translate non-Chinese input into Traditional Chinese for the model.
    translator = Translator()
    lang = translator.detect(text).lang
    if lang != "zh-CN":
        text = translator.translate(text, dest="zh-tw").text

    # Load the pretrained model.
    tokenizer = getTokenizer(tokenizer_save_path)
    model = TFGPT2LMHeadModel.from_pretrained(model_out_dir)

    # Encode the input text.
    start = time.time()
    input_ids = tokenizer.encode(text, return_tensors='tf')

    # Generate output with randomised decoding hyperparameters.
    NUM_SEQUENCE = 3 if highQualityMode else 1
    beam_output = model.generate(input_ids,
                                 max_length=1000,
                                 num_beams=int(random.random() * 10) + 1,
                                 temperature=random.random() * 10 % 5 / 10 + 0.5,
                                 no_repeat_ngram_size=2,
                                 num_return_sequences=NUM_SEQUENCE,
                                 top_k=int(random.random() * 100 % 40),
                                 top_p=1)
    beam_output = getHighQuality(tokenizer.eos_token_id, beam_output)

    with open(output_dir, 'w') as w:
        w.write("Time used: " + str(time.time() - start) + '\n')
        # Translate the output back to the original language if needed.
        if lang != "zh-CN":
            output = translator.translate(tokenizer.decode(beam_output), dest=lang).text
        else:
            output = tokenizer.decode(beam_output)
        w.write(output)
def _initialise_model_and_tokenizer(self):
    """
    Initialise the model and tokenizer.
    ----------
    model_name_or_dir: str, optional
        Either a local dir containing tf_model.h5, vocab.json, config.json and
        merges.txt, OR the model shortcut name 'distilgpt2'.
    """
    self.tokenizer = GPT2Tokenizer.from_pretrained(
        pretrained_model_name_or_path=self.model_name)
    self.tokenizer.pad_token = '[PAD]'
    self.tokenizer.decoder[
        self.tokenizer.pad_token_id] = self.tokenizer.pad_token
    self.keras_model = TFGPT2LMHeadModel.from_pretrained(self.model_name)
    # Warm-up call, because the first prediction is always slow.
    self.get_candidate_word_probs('.', ['warming', 'up'])
def main():
    # Initialize a tokenizer and model.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = TFGPT2LMHeadModel.from_pretrained("gpt2", from_pt=True)

    # Prompt loop. If the input matches the end-of-text string, break out of
    # the loop and exit the program.
    while True:
        user_input = input("prompt:> ")
        if user_input == "<|endoftext|>":
            break

        # Tokenize/encode the input text.
        encoded_input = tokenizer.encode(user_input, return_tensors="tf")

        # Generate samples.
        generated_samples = model.generate(encoded_input,
                                           max_length=150,
                                           num_return_sequences=10,
                                           no_repeat_ngram_size=2,
                                           repetition_penalty=1.5,
                                           top_p=0.92,
                                           temperature=0.85,
                                           do_sample=True,
                                           top_k=125,
                                           early_stopping=True)

        # Print the samples.
        for i, beam in enumerate(generated_samples):
            print("{}: {}".format(
                i, tokenizer.decode(beam, skip_special_tokens=True)))
        print()

    # Exit the program.
    exit(0)
def __init__(self):
    """Possible states:
    1. "await"   (awaiting response)
    2. "proceed" (proceed with the conversation) - used to give the bot
       control over the conversation
    """
    self._state = "await"
    """Possible flags:
    1. "Exec"    (task executed)
    2. "notExec" (task not executed)
    """
    self._FLAG = None
    self._bert_base_case_mrpc_tokenizer = AutoTokenizer.from_pretrained(
        "bert-base-cased-finetuned-mrpc")
    self._bert_base_case_mrpc_model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-cased-finetuned-mrpc")
    self._gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    self._gpt2_model = TFGPT2LMHeadModel.from_pretrained(
        "gpt2", pad_token_id=self._gpt2_tokenizer.eos_token_id)
    self.bert_large_uncased_whole_word_masking_finetuned_squad_tokenizer = \
        AutoTokenizer.from_pretrained(
            "bert-large-uncased-whole-word-masking-finetuned-squad")
    self.bert_large_uncased_whole_word_masking_finetuned_squad_model = \
        TFAutoModelForQuestionAnswering.from_pretrained(
            "bert-large-uncased-whole-word-masking-finetuned-squad")
    self._DialoGP_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    self._DialoGP_model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium")
    self._conversation_started = False
    self._conversation_ended = True
def setup_model_finetuning(path_to_pretrained, tokenizer_en, tokenizer_lng):
    # Load the pre-trained model.
    model = TFGPT2LMHeadModel.from_pretrained(path_to_pretrained)

    # Set up a new embedding matrix for fine-tuning.
    weights = tf.stop_gradient(
        model.transformer.get_input_embeddings().weight.value()).numpy()

    # Mean embedding, used to initialise tokens absent from the old vocabulary.
    mean_weights = tf.reduce_mean(weights, axis=0).numpy()
    new_vocab = tokenizer_lng.get_vocab()
    old_vocab = tokenizer_en.get_vocab()
    new_embeddings = tf.zeros([len(new_vocab), mean_weights.shape[0]]).numpy()
    for word, idx_new in new_vocab.items():
        idx_old = old_vocab.get(word, -1)
        if idx_old >= 0:
            # Copy embeddings of tokens shared between both vocabularies.
            new_embeddings[idx_new, :] = weights[idx_old, :]
        else:
            new_embeddings[idx_new, :] = mean_weights

    # Set the new embeddings.
    model.transformer.set_input_embeddings(tf.constant(new_embeddings))

    # Freeze the transformer blocks; only the token and position embeddings
    # and the final layer norm stay trainable.
    for layer in model.transformer.h:
        layer.trainable = False
    model.transformer.wte.trainable = True
    model.transformer.wpe.trainable = True
    model.transformer.ln_f.trainable = True
    return model
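# A hedged usage sketch for setup_model_finetuning() above. The tokenizer
# classes and the target-language tokenizer path are illustrative assumptions,
# not from the original source:
from transformers import GPT2Tokenizer
tokenizer_en = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer_lng = GPT2Tokenizer.from_pretrained("./my_language_tokenizer")  # hypothetical
model = setup_model_finetuning("gpt2", tokenizer_en, tokenizer_lng)
model.compile(optimizer=tf.keras.optimizers.Adam(3e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))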
def create_model(args, vocab_size):
    """
    :param args: command-line arguments
    :param vocab_size: size of the vocabulary
    :return: the model and its context window size (n_ctx)
    """
    print('Configuring model parameters')
    # model_config = GPT2Config.from_json_file('config/model_config_dialogue_small.json')
    print(vocab_size)
    print('Creating model')
    # model = TFGPT2LMHeadModel.from_pretrained('gpt2')
    if args.pretrained_model:
        # A pretrained GPT-2 model was specified, so load it.
        model = TFGPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:
        # No pretrained model was specified, so initialise one from the config file.
        print('Initialising model')
        model_config = GPT2Config.from_json_file(args.model_config)
        print('config:\n' + model_config.to_json_string())
        model = TFGPT2LMHeadModel(config=model_config)
    print('Model constructed')
    # Resize the GPT-2 embeddings to match the tokenizer's vocabulary if needed:
    # model.resize_token_embeddings(vocab_size)
    return model, model.config.to_dict().get("n_ctx")
return "negative" elif max((model.predict(text))[0]) == model.predict(text)[0][1]: return "neutral" else: return "positive" def about_symbol(text): text = text.replace(".", ". ") text = text.replace(". . .", ". ") return text tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model_gpt = TFGPT2LMHeadModel.from_pretrained( "gpt2", pad_token_id=tokenizer.eos_token_id) with open("train_pos_edit_full.json", encoding='utf-8') as json_file: json_data = json.load(json_file) json_string = json_data['splited_sentence'] seq_length = 1000 start = time.time() text_list = [] output_list = [] for a in tqdm(range(len(json_string))): # print("-" * 100) for b in range(len(json_string[a])): input_text = json_string[a][b]
# Reference: https://nlp.gluon.ai/api/modules/data.html
toked = tokenizer('안녕 하세요')  # The tokenizer seems mis-built: it splits one character at a time?
print(toked)

toked_idx = vocab(toked)
print(toked_idx)

toked = vocab.to_tokens(toked_idx)
print(toked)

detoked = detokenizer(toked)
print(detoked)

''.join(toked).replace('▁', ' ')

model = TFGPT2LMHeadModel.from_pretrained(MODEL_PATH)
model.summary()

# Build the model's seed input sentence.
tok = tokenizer('이때')  # tok = ['▁', '이', '때']
tok_idx = [vocab[vocab.bos_token]] + vocab[tok]  # tok_idx = [0, 47437, 47438, 47675]
input_ids = tf.convert_to_tensor(tok_idx)[None, :]  # convert to a tensor

# Model output.
output = model.generate(input_ids, max_length=50)
output

# Convert the model output back to a string.
out_tok_idx = output.numpy().tolist()[0]  # output token indices
out_tok = vocab.to_tokens(out_tok_idx)  # convert token indices to token strings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from transformers import TFGPT2LMHeadModel, BertTokenizer
import tensorflow as tf
import re
from flask import Flask
from flask import render_template
from flask import request, Response

app = Flask(__name__)
app.config['DEBUG'] = True

model = TFGPT2LMHeadModel.from_pretrained("gpt2-cn-50")


@app.route('/')
def index_main():
    return render_template('index.html')


@app.route('/random', methods=["GET", "POST"])
def get_text():
    if request.method == "GET":
        sentence = request.args.get("message")
        result = test_model(sentence)
        return Response(result)


def test_model(sentence):
    if " " not in sentence:
        # Insert a space between every character (for Chinese input).
        sentence = re.sub("", " ", sentence)[1:]
def run(inp, outString):
    # tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
    tokenizer = GPT2Tokenizer.from_pretrained("ATLA_GPT2", from_pt=True)
    model = TFGPT2LMHeadModel.from_pretrained(
        "ATLA_GPT2", from_pt=True, pad_token_id=tokenizer.eos_token_id)
    stop_token = tokenizer.encode('\n', return_tensors='tf')

    while True:
        i = ""
        server_key = ''
        # Busy-wait until speech results arrive, then drain the queue.
        while inp.empty():
            pass
        while not inp.empty():
            tmp = inp.get()
            server_key = tmp['server_key']
            i += tmp['SpeechResult']

        # Collecting the total prior input from `paragraph` as extra context
        # was tried here but currently throws an error when using beam search.
        input_ids = tokenizer.encode(i, return_tensors='tf')  # batch size 1

        # Alternative decoding strategies (beam search with no_repeat_ngram_size,
        # pure top-p sampling, and greedy decoding) were tried and commented out
        # in the original; plain top-k sampling is what remains active.
        beam_output = model.generate(input_ids,
                                     do_sample=True,
                                     max_length=50,
                                     top_k=50)

        out = tokenizer.decode(beam_output[0], skip_special_tokens=False)
        paragraph.append(out)
        # Keep only the newly generated continuation.
        out = out[len(i):len(out)]
        outString.put(out)
        data = {"GPT2_RESULT": out, "server_key": server_key}
        print(data)
        resp = requests.post(url, json=data)
        config = base_model.config
        tokenizer = BertTokenizer.from_pretrained(vocab_name)
        trainer = Trainer
    elif FLAGS.pretrained_model_name == "bert_mini_uncased":
        pretrained_model_name = "uncased_L-4_H-256_A-4"
        vocab_name = f"{PRETRAINED_MODELS_DIR}/{pretrained_model_name}/"
        config = read_bert_config(pretrained_model_name)
        base_model = TFBertModel(config)
        base_model = ModelManager.load_pretrained_model(
            base_model, f"{pretrained_model_name}/bert_model.ckpt.index")
        tokenizer = BertTokenizer.from_pretrained(vocab_name)
        trainer = Trainer
    elif FLAGS.pretrained_model_name == "gpt2":
        pretrained_model_name = "gpt2"
        vocab_name = "gpt2"
        base_model = TFGPT2LMHeadModel.from_pretrained(pretrained_model_name)
        tokenizer = GPT2Tokenizer.from_pretrained(vocab_name)
        # GPT-2 has no PAD token by default; add one and register it on the model.
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        base_model.config.pad_token_id = tokenizer.pad_token_id
        config = base_model.config
        trainer = GPTTrainer
    else:
        raise NotImplementedError()

    embedder = Embedder(pretrained_model_name, tokenizer)
    model = Transformer(embedder=embedder,
                        model=base_model,
                        hidden_state_size=config.hidden_size,
                        max_sequence_length=150)
    if FLAGS.load_model:
def get_encoder_decoder_models(self):
    encoder_model = TFViTModel.from_pretrained(
        "google/vit-base-patch16-224-in21k", name="encoder")
    decoder_model = TFGPT2LMHeadModel.from_pretrained(
        "../gpt2", config=self.get_decoder_config(), name="decoder")
    return {"encoder": encoder_model, "decoder": decoder_model}
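# A sketch of how the returned pair might be combined into a single
# image-captioning model. This assumes transformers' TFVisionEncoderDecoderModel
# and a decoder config built with add_cross_attention=True, neither of which is
# shown in the original snippet; `builder` is a hypothetical instance of the
# class that owns get_encoder_decoder_models():
from transformers import TFVisionEncoderDecoderModel
models = builder.get_encoder_decoder_models()
captioner = TFVisionEncoderDecoderModel(encoder=models["encoder"],
                                        decoder=models["decoder"])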
def __init__(self):
    self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    self.model = TFGPT2LMHeadModel.from_pretrained(
        'gpt2', pad_token_id=self.tokenizer.eos_token_id)
for i in range(num_gpus):
    devices.append("GPU:" + str(i))
strategy = tf.distribute.MirroredStrategy(devices=devices)
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

print("============================ Loading model from pretrained and compiling ===========================")
with strategy.scope():
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    print("========================= Loading dataset ========================")
    train_dataset = tokenize(get_dataset(train_file), tokenizer, truncate).batch(num_gpus)
    valid_dataset = tokenize(get_dataset(valid_file), tokenizer, truncate).batch(num_gpus)

    model = TFGPT2LMHeadModel.from_pretrained(model_name)
    # Disable past key values during training.
    model.config.use_cache = False
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
    model.compile(optimizer=optimizer,
                  loss=[loss, *[None] * model.config.n_layer],
                  metrics=[metric])

print("========================= Finetuning Model ==================================")
model.fit(train_dataset, batch_size=64, epochs=num_epochs)

print("========================= Evaluating Model ==================================")
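# The snippet ends at the evaluation banner; a likely continuation, assuming
# the valid_dataset prepared above (a hedged guess, not recovered source):
model.evaluate(valid_dataset)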
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

N = 50   # Number of words to generate
k = 10   # Top-k items to select from
p = 0.8  # Top-p cumulative probability to select from

# Load the pre-trained GPT-2 model and tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2LMHeadModel.from_pretrained("gpt2", use_cache=True)

# Input a sentence.
print("Type something for the model:")
input_string = input()

# Tokenize the input and initialize variables.
inputs = tokenizer(input_string, return_tensors="tf")["input_ids"]
past = None
outputs = []
k = tokenizer.vocab_size if not k else k

# Generate the output token by token, reusing the cached past state.
# Note: the tuple unpacking below assumes an older transformers version where
# the model returns (logits, past) directly; on recent versions, use
# out = model(inputs, past_key_values=past) and read out.logits /
# out.past_key_values instead.
for _ in range(N):
    logits, past = model(inputs, past)
    # Top-k filtering: keep only the k items with the largest probabilities.
    logits, indices = tf.math.top_k(logits[:, -1, :], k)
    # Top-p filtering: keep only the top items within cumulative probability p.
    cumsum = tf.math.cumsum(tf.nn.softmax(logits), 1)
    selected = cumsum <= max(p, cumsum[0][0])
    # Make sure at least 1 item is selected
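# The snippet above is cut off inside the generation loop. A hedged sketch of
# how such a loop typically finishes, reusing the same variable names (this
# continuation is an assumption, not recovered source):
#
#     selected = tf.math.logical_or(selected, tf.range(k) == 0)  # keep >= 1 item
#     probs = tf.nn.softmax(logits) * tf.cast(selected, tf.float32)
#     choice = tf.random.categorical(tf.math.log(probs), num_samples=1)
#     token = tf.gather(indices, choice, batch_dims=1)  # map back to vocab ids
#     outputs.append(int(token[0, 0]))
#     inputs = token  # with a past cache, only the new token is fed next step
#
# print(input_string + tokenizer.decode(outputs))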
def __init__(self, dir_path):
    super(GPT2Model, self).__init__()
    self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)
# https://www.kaggle.com/tuckerarrants/text-generation-with-huggingface-gpt2/notebook

# For reproducibility
SEED = 34

# Maximum number of words in the output text
MAX_LEN = 70

input_sequence = "In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English."

# Get transformers
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# Get the large GPT-2 tokenizer and GPT-2 model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
GPT2 = TFGPT2LMHeadModel.from_pretrained("gpt2-large", pad_token_id=tokenizer.eos_token_id)

# tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
# GPT2 = TFGPT2LMHeadModel.from_pretrained("gpt2-medium", pad_token_id=tokenizer.eos_token_id)

# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT2 = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

# View model parameters
GPT2.summary()

# Get deep learning basics
import tensorflow as tf
tf.random.set_seed(SEED)

# Encode the context the generation is conditioned on
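# The snippet ends at the encoding step; a likely continuation following the
# same notebook's sampling pattern (hedged, not verbatim source):
input_ids = tokenizer.encode(input_sequence, return_tensors='tf')
sample_output = GPT2.generate(input_ids,
                              do_sample=True,
                              max_length=MAX_LEN,
                              top_k=50,
                              top_p=0.94)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))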