def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained('gpt2-medium')
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')  # 'gpt2' as other option
    model.eval()
    model.to(device)

    # context is the universal trigger
    trigger = "TH PEOPLEMan goddreams Blacks"
    trigger_tokens = enc.encode(trigger)

    # get samples from the model
    for _ in range(100):
        out = sample_sequence(model=model, length=200,
                              context=trigger_tokens,
                              temperature=1.5, top_k=5)
        out = out[:, len(trigger_tokens):].tolist()
        for i in range(1):
            text = enc.decode(out[i])
            if text.find('<|endoftext|>') > 0:
                text = text[0:text.find('<|endoftext|>')]
            print("Prompt: " + trigger)
            print("Output: " + text)
            print("=" * 80)
def predict_next_word(text):
    import time
    t1 = time.time()

    # Load the pre-trained tokenizer (vocabulary) and encode the input
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    indexed_tokens = tokenizer.encode(text)

    # Convert indexed tokens to a PyTorch tensor
    tokens_tensor = torch.tensor([indexed_tokens])

    # Load pre-trained model (weights)
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Set the model in evaluation mode to deactivate the DropOut modules
    model.eval()

    # If you have a GPU, put everything on cuda
    # tokens_tensor = tokens_tensor.to('cuda')
    # model.to('cuda')

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    # Get the predicted next sub-word
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

    # Print the predicted word and the timing
    print(predicted_text)
    t2 = time.time()
    print("Time taken : ", t2 - t1)
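A minimal usage sketch for the function above; the prompt string is illustrative, not from the source. The function greedily appends the single most likely next token and prints the result with timing.

# Hypothetical call; any short English prompt works.
predict_next_word("The capital of France is")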
def __init__(self, lookup, input_size, top_k, top_p, device):
    """
    Creates a Decoder with attention and a Pointer network,
    see https://nlp.stanford.edu/pubs/see2017get.pdf
    """
    super().__init__()
    self.device = device
    self.gpt2lmheadmodel = GPT2LMHeadModel.from_pretrained('gpt2')
    self.gpt2lmheadmodel.resize_token_embeddings(len(lookup))
    for param in self.gpt2lmheadmodel.parameters():
        param.requires_grad = False

    self.lookup = lookup
    self.emb_dim = 768
    self.hidden_dim = 768
    self.vocab_size = len(lookup)
    self.encoder_size = input_size
    self.top_k = top_k
    self.top_p = top_p

    self.attention = Attention(encoder_size=input_size,
                               decoder_size=self.hidden_dim,
                               vocab_size=self.vocab_size,
                               device=device)

    # overwrite output to allow context from the attention to be added to the output layer
    # (the plain hidden_dim -> vocab_size linear the original defined first was dead code
    # and referenced undefined names, so it is dropped here)
    self.output_linear = nn.Linear(
        self.hidden_dim + self.encoder_size + self.emb_dim,
        int((self.hidden_dim + self.encoder_size + self.emb_dim) / 2))
    self.vocab_linear = nn.Linear(
        int((self.hidden_dim + self.encoder_size + self.emb_dim) / 2),
        self.vocab_size)
    self.p_gen_linear = nn.Linear(
        self.encoder_size + self.hidden_dim * 2 + self.emb_dim, 1)

    self.to(device)
def load(self):
    try:
        self._tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
        self._model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
    except Exception:
        # loading failed; leave the wrapper in an unloaded state
        self._model = None
    return self
def load_model(self, path='model/mini/'):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    device = "cpu"  # force CPU even when CUDA is available
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=path + 'vocab.txt')
    model = GPT2LMHeadModel.from_pretrained(path)
    model.to(device)
    model.eval()
    return model, tokenizer
def __init__(self, **kwargs):
    self.beam_width = kwargs['beam_width']
    self.beam_depth = kwargs['beam_depth']
    self.timeout = kwargs['timeout']
    random.seed(kwargs['seed'])  # call random.seed(); assigning to it would not seed the RNG
    self.model = GPT2LMHeadModel.from_pretrained('gpt2')
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
def __init__(self):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # TODO maybe smaller gpt2 model separately
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    self.model.to(self.device)
    self.model.eval()
def __init__(self, model_path='gpt2', top_k=None, top_p=None, device=None):
    super().__init__(device, top_k=top_k, top_p=top_p)
    self.model_path = model_path
    self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_path)
    self.model.to(device)
    self.model.eval()
def initialize_training(args, device):
    """Initialize the tokenizer, the data loaders, the model and the tools of the optimization process."""
    # Create tokenizer, datasets and loaders
    tokenizer = EpisodeSummaryTokenizer.from_pretrained(
        args.gpt2_version,
        max_num_words=args.max_num_words,
        size_variance_handling=args.size_var_handling)
    train_dataset, val_dataset = create_datasets_from_jsons(
        args.json_paths, tokenizer, args.val_split)
    dataloaders = {
        'train': DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size,
                            collate_fn=tokenizer.pad_batch_to_same_size),
        'val': DataLoader(val_dataset, shuffle=False, batch_size=args.batch_size,
                          collate_fn=tokenizer.pad_batch_to_same_size)
    }

    # Load pre-trained network weights
    model = GPT2LMHeadModel.from_pretrained(args.gpt2_version)
    model = model.to(device)

    # Prepare optimizer and scheduler
    no_decay = ['bias', 'LayerNorm.weight']  # no decay for biases and layer norm
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=args.max_steps)
    model.zero_grad()

    train_state = make_train_state(
        save_path=args.model_save_path,
        early_stopping_patience=args.early_stopping_patience)

    return tokenizer, dataloaders, model, optimizer, scheduler, train_state
def __init__(self, model_path='gpt2', device='cuda'):
    super().__init__()
    self.model_path = model_path
    self.device = device
    self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_path)
    self.model.to(device)
    self.model.eval()
def __init__(self):
    super(GPT2, self).__init__()
    self.model_type = "GPT2"
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Load pre-trained model (weights)
    self.model = GPT2LMHeadModel.from_pretrained("gpt2")
def __init__(self, model_name: str) -> None:
    super().__init__()
    config = GPT2Config.from_pretrained(model_name)
    self.input_dim = config.hidden_size
    self.output_dim = config.vocab_size
    # TODO(mattg): It's possible that we could use some kind of cache like we have in
    # allennlp.modules.token_embedders.bert_token_embedder.PretrainedBertModel. That way, we
    # would only load the GPT2 weights once. Though, it's not clear how to do that here, as we
    # need to load `GPT2LMHeadModel`, not just `GPT2Model`...
    gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)
    self.gpt2_lm_head = gpt2_model.lm_head
def gpt2(self, prep_obj):
    self.vector_corpus = []
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    token_maker = GPT2Tokenizer.from_pretrained('gpt2')
    for tweet in prep_obj.detokenized_corpus:
        text_index = token_maker.encode(tweet)
        # look up the input (wte) embeddings and sum them into a single tweet vector
        vector = model.transformer.wte.weight[text_index, :]
        vector = vector.detach().numpy()
        vector = np.sum(vector, axis=0)
        self.vector_corpus.append(vector)
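A usage sketch under stated assumptions: `Vectorizer` is a hypothetical stand-in for the enclosing class (not named in the snippet), and `prep_obj` only needs a `detokenized_corpus` list of strings.

# Hypothetical harness for the method above; all names here are illustrative.
from types import SimpleNamespace
prep = SimpleNamespace(detokenized_corpus=["hello twitter", "gpt2 embeddings are handy"])
vec = Vectorizer()  # hypothetical enclosing class
vec.gpt2(prep)
print(len(vec.vector_corpus), vec.vector_corpus[0].shape)  # 2 (768,)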
def __init__(self, model_path, tokenizer_path):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    tokenizer = tokenization_bert_word_level.BertTokenizer(vocab_file=tokenizer_path)
    vocab = Gpt2Vocab(tokenizer)
    self.device = device
    self.model = model
    self.vocab = vocab
    self.tokenizer = tokenizer
def __init__(self):
    if not os.path.exists(AGGREGATOR_DIR):
        os.makedirs(AGGREGATOR_DIR)
    if not os.path.isfile(AGGREGATOR_2015_2016):
        print("Downloading aggregators from s3...")
        wget.download(AGGREGATOR_2015_2016_URL, AGGREGATOR_2015_2016,
                      bar=self._download_progress_bar)
    if not os.path.isfile(AGGREGATOR_2015_2017):
        print("Downloading aggregators from s3...")
        wget.download(AGGREGATOR_2015_2017_URL, AGGREGATOR_2015_2017,
                      bar=self._download_progress_bar)
    if not os.path.isfile(AGGREGATOR_2015_2016_8_dim):
        print("Downloading aggregators from s3...")
        wget.download(AGGREGATOR_2015_2016_8_dim_URL, AGGREGATOR_2015_2016_8_dim,
                      bar=self._download_progress_bar)
    if not os.path.isfile(AGGREGATOR_2015_2017_8_dim):
        print("Downloading aggregators from s3...")
        wget.download(AGGREGATOR_2015_2017_8_dim_URL, AGGREGATOR_2015_2017_8_dim,
                      bar=self._download_progress_bar)
    if not os.path.isfile(ROBERTA_STS_PATH + '/checkpoint_best.pt'):
        print("Downloading ROBERTA STS model from s3...")
        wget.download(ROBERTA_STS_URL, ROBERTA_STS_PATH + '/checkpoint_best.pt',
                      bar=self._download_progress_bar)
    if not os.path.isfile(ROBERTA_MNLI_PATH + '/model_mnli.pt'):
        print("Downloading ROBERTA MNLI model from s3...")
        wget.download(ROBERTA_MNLI_URL, ROBERTA_MNLI_PATH + '/model_mnli.pt',
                      bar=self._download_progress_bar)

    self.roberta_STS = RobertaModel.from_pretrained(
        checkpoint_file='checkpoint_best.pt', model_name_or_path=ROBERTA_STS_PATH)
    self.roberta_STS.eval()
    self.roberta_MNLI = RobertaModel.from_pretrained(
        checkpoint_file='model_mnli.pt', model_name_or_path=ROBERTA_MNLI_PATH)
    self.roberta_MNLI.eval()
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')
    self.agg_one = load(AGGREGATOR_2015_2016)
    self.agg_two = load(AGGREGATOR_2015_2017)
    self.agg_one_8_dim = load(AGGREGATOR_2015_2016_8_dim)
    self.agg_two_8_dim = load(AGGREGATOR_2015_2017_8_dim)
def gpt_predictor(n=3):
    if request.method == 'GET':
        return render_template('index.html', value='hi')
    if request.method == 'POST':
        tok = GPT2Tokenizer.from_pretrained("gpt2")
        model = GPT2LMHeadModel.from_pretrained("gpt2")
        text = request.form.get('text')
        n = request.form.get('n')
        for i in range(int(n)):
            pred = get_pred(text, model, tok)
            if pred == "<|endoftext|>":
                break
            else:
                text += pred
        return render_template('result.html', text=text)
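The handler above presumably sits behind a Flask route; the snippet does not show the wiring, so this is a hedged sketch with assumed names (the route path and endpoint name are illustrative).

# Hypothetical wiring; the original route/decorator is not shown in the snippet.
from flask import Flask, request, render_template
app = Flask(__name__)
app.add_url_rule('/', 'gpt_predictor', gpt_predictor, methods=['GET', 'POST'])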
def get_model(seed=1234, model_name='gpt2'):
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(model_name)
    enc.unk_token = None
    enc.bos_token = None
    enc.eos_token = None

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.to(device)
    model.eval()
    # model.double()

    return enc, model
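A short, hedged example of driving the returned pair: greedy next-token prediction with the seeded model. The prompt is illustrative; everything else uses only what `get_model` returns.

# Sketch: predict the single most likely next token (prompt is illustrative).
enc, model = get_model()
ids = torch.tensor([enc.encode("The meaning of life is")]).to(next(model.parameters()).device)
with torch.no_grad():
    logits = model(ids)[0]  # pytorch_transformers returns a tuple; [0] is the logits
print(enc.decode([int(logits[0, -1].argmax())]))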
def predict_next_word(phrase):
    """
    Function to process the phrase using GPT-2
    :param phrase:
    :return:
    """
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Tokenize the input phrase
    tokenized_phrase = tokenizer.encode(phrase)
    print("Tokenized Phrase: {}".format(tokenized_phrase))

    # Convert the tokenized phrase to a PyTorch tensor
    tokenized_phrase_tensor = torch.tensor([tokenized_phrase])
    print("Tokenized Phrase Tensor: {}".format(tokenized_phrase_tensor))

    # Load the pre-trained model. This will have weights and biases
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Set the model in evaluation mode to deactivate dropout
    model.eval()

    try:
        tokenized_phrase_tensor = tokenized_phrase_tensor.to('cuda')
        model.to('cuda')
        print("CUDA present. Running code on GPU")
    except AssertionError:
        print("Torch not compiled with CUDA. Running on CPU.")
    except Exception:
        print("CUDA not present. Running on CPU")

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokenized_phrase_tensor)
        print("Outputs: {}".format(outputs))
        predictions = outputs[0]
        print("Prediction: {}".format(predictions))

    # Get the predicted next sub-word
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(tokenized_phrase + [predicted_index])
    return predicted_text
def load_model(args):
    """
    Load the model and the corresponding tokenizer from pre-trained weights.
    :param args: The command line arguments.
    :return model: The main model.
    :return tokenizer: The tokenizer that comes with the main model.
    """
    USE_CUDA = torch.cuda.is_available()

    # ====== Load GPT2 model ========
    model_dir = '../models/' + args.model_dir
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    # model = GPT2LMHeadModel.from_pretrained('gpt2')
    if USE_CUDA:
        model.cuda()
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    print('Model loaded.')
    return model, tokenizer
def evaluate_ppl_gpt(args):
    """ Evaluate on raw text; use this with GPT, which has its own tokenizer """
    if args.expanded_dataset:
        path = ".data/stories/story_commonsense/torchtext_expanded"
    else:
        path = ".data/stories/story_commonsense/torchtext"

    # Data
    test_src = [line.rstrip('\n') for line in open(path + "/test.src")]
    test_trg = [line.rstrip('\n') for line in open(path + "/test.trg")]

    # Model
    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)
    model.eval()

    loss = 0
    batch_size = 1
    print("Evaluating test set with GPT2")
    for i in trange(len(test_src)):
        src, trg = test_src[i], test_trg[i]
        context = enc.encode(src)
        target = enc.encode(trg)
        length = len(target)

        # Generate prediction
        out = utils.sample_sequence(model, length, batch_size=1,
                                    context=context, top_k=10, device=device)
        out = out[:, len(context):]

        # Get model loss
        target = torch.tensor([target]).to(device)
        with torch.no_grad():
            # pred, past = model(out)
            l = model(out, labels=target)[0]  # first element of the output tuple is the LM loss
        loss += float(l)

    av_loss = loss / len(test_src)  # average over examples; loss is a float, so len(loss) would fail
    print(f"ppl: {math.exp(av_loss):.04f}")
def gpt_predictor(request, n=3):
    tok = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    if request.method == 'GET':
        return "Welcome to GPT predictor"
    if request.method == 'POST':
        data = request.get_json()
        text = data["text"]
        res = []
        n = data["n"]
        for i in range(n):
            pred = get_pred(text, model, tok)
            if pred == "<|endoftext|>":
                break
            else:
                text += pred
        return text
def get_textgen(sentence: str) -> str:
    """
    Runs the text_generation GPT2 model and returns generated text.
    :param sentence: sentence taken from serializer.data.
    :return: Generated text.
    """
    output_dir = './models/text_gen'
    tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
    model = GPT2LMHeadModel.from_pretrained(output_dir)
    tokens = tokenizer.encode(sentence)
    tokens_tensor = torch.tensor([tokens])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokens_tensor = tokens_tensor.to(device)
    model.to(device)
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(tokens + [predicted_index])
    return predicted_text
def main(): parser = argparse.ArgumentParser() parser.add_argument("--lr",default=5e-5, type=float, required=True, help="learning rate") parser.add_argument("--seed",default=42, type=int, required=False, help="seed to replicate results") parser.add_argument("--n_gpu",default=1, type=int, required=False, help="no of gpu available") parser.add_argument("--gradient_accumulation_steps",default=32, type=int, required=True, help="gradient_accumulation_steps") parser.add_argument("--batch_size",default=1, type=int, required=True, help="batch_size") parser.add_argument("--num_workers",default=4, type=int, required=False, help="num of cpus available") parser.add_argument("--device",default=torch.device('cuda'), required=False, help="torch.device object") parser.add_argument("--num_train_epochs",default=5, type=int, required=True, help="no of epochs of training") parser.add_argument("--output_dir",default=./output, type=str, required=True, help="path to save evaluation results") parser.add_argument("--model_dir",default=./weights, type=str, required=True, help="path to save trained model") parser.add_argument("--fp16",default=True, type=bool, required=False, help="whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") parser.add_argument("--fp16_opt_level",default='O0', type=str, required=False, help="apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].") parser.add_argument("--max_grad_norm",default=1.0, type=float, help="max gradient norm.") parser.add_argument("--root_dir",default='./CNN/gpt2_1024_data', type=str, help="location of json dataset.") parser.add_argument("--ids_file",default='./CNN/ids.json', type=str, help="location of train, valid and test file indexes") args = parser.parse_args() train_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='train',length=3000) #training on only 3000 datasets valid_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='valid',length=500) #validation on only 500 datasets tokenizer = add_special_tokens() ignore_idx = tokenizer.pad_token_id model = GPT2LMHeadModel.from_pretrained('gpt2') model.resize_token_embeddings(len(tokenizer)) model.to(args.device) start = time.time() train(args, model, tokenizer, train_data, valid_data, ignore_index) print('total time: ', (time.time()-start)/60, " minutes", end='\n\n') print('Saving trained model...') model_file = os.path.join(args['model_dir'], 'model_{}_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.bin'.format(args['fp16_opt_level'],3000,args['num_train_epochs'])) config_file = os.path.join(args['model_dir'], 'config_{}_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.json'.format(args['fp16_opt_level'],3000,args['num_train_epochs'])) torch.save(model.state_dict(), model_file) model.config.to_json_file(config_file)
def get_model(seed=1234, model_name='gpt2'):
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # locally pre-downloaded pretrained model
    enc = GPT2Tokenizer.from_pretrained(
        'D:/OneDrive - whu.edu.cn/桌面/NeuralSteganography-master1/pretrained_model')
    # enc = GPT2Tokenizer.from_pretrained(model_name)
    enc.unk_token = None
    enc.bos_token = None
    enc.eos_token = None

    model = GPT2LMHeadModel.from_pretrained(
        'D:/OneDrive - whu.edu.cn/桌面/NeuralSteganography-master1/pretrained_model')
    # model = GPT2LMHeadModel.from_pretrained(model_name)
    model.to(device)
    model.eval()
    # model.double()

    return enc, model
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--length', default=-1, type=int, required=False, help='generation length')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='sampling temperature; higher is more random')
    parser.add_argument('--topk', default=8, type=int, required=False, help='top-k sampling: choose among the k most likely tokens')
    parser.add_argument('--topp', default=0, type=float, required=False, help='top-p (nucleus) sampling: cumulative probability threshold')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='path to the model config')
    parser.add_argument('--tokenizer_path', default='cache/bud_vocab.txt', type=str, required=False, help='path to the vocabulary file')
    parser.add_argument('--model_path', default='model_bud/', type=str, required=False, help='path to the model')
    parser.add_argument('--save_path', default='generated/', type=str, required=False, help='directory for the generated files')
    parser.add_argument('--articles_per_title', default=5, type=int, required=False, help='how many articles to generate per title')
    parser.add_argument('--titles', default='萧炎', type=str, required=False, help='list of titles, as a single space-separated string')
    parser.add_argument('--titles_file', default='', type=str, required=False, help='file with one title per line; if set, --titles is ignored')
    parser.add_argument('--no_wordpiece', action='store_true', help='do not apply WordPiece tokenization')
    parser.add_argument('--segment', action='store_true', help='tokenize Chinese at the word level')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses
    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp

    titles = args.titles.split()  # list of titles to generate from (args.title was a typo)
    if args.titles_file:
        with open(args.titles_file, 'r') as f:
            titles = [line.strip('\n') for line in f.readlines()]
    articles_per_title = args.articles_per_title  # how many articles per title
    save_path = args.save_path  # where to save the output

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = pytorch_transformers.GPT2Config.from_json_file(args.model_config)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    if length == -1:
        length = model.config.n_ctx // 2
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    for i, title in enumerate(titles):
        for j in range(articles_per_title):
            # index by title and article so file names don't collide (str(i * j) did)
            with open(save_path + '{}-{}'.format(i, j), 'w') as f:
                context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(title))
                generated = 0
                out = sample_sequence(model=model, length=length, context=context_tokens,
                                      temperature=temperature, top_k=topk, top_p=topp, device=device)
                out = out.tolist()
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[0])
                for k, item in enumerate(text[:-1]):  # make sure English words keep a space around them
                    if is_word(item) and is_word(text[k + 1]):
                        text[k] = item + ' '
                for k, item in enumerate(text):
                    if item == '[MASK]':
                        text[k] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[k] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                # text = ''.join(text.split('\n')[:-1])
                print(text)
                f.write(text)
                print("=" * 80)
def __init__(self):
    self.name = 'GPT2LanguageModel'
    self.trainable_model = False
    self.GPT2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.model = GPT2LMHeadModel.from_pretrained('gpt2').eval()
def __init__(self):
    super(GPT2Generator, self).__init__()
    # TODO: can I make the outputs below large and the knowledge medium?
    self.gpt2_config = GPT2Config.from_pretrained('gpt2-large')
    self.lh_model = GPT2LMHeadModel.from_pretrained('gpt2-large')
import torch
import logging
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode a text input
text = "Who was Jim Henson ? Jim Henson was a"
indexed_tokens = tokenizer.encode(text)

# Convert indexed tokens to a PyTorch tensor
tokens_tensor = torch.tensor([indexed_tokens])

# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# get the predicted next sub-word (in our case, the word 'man')
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
import torch
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.nn import functional as F

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()


def generate(input, n=20):
    text = []
    prev, past = tokenizer.encode(input), None
    text.extend(prev)
    prev = torch.tensor([prev])
    with torch.no_grad():
        for _ in range(n):
            logits, past = model(prev, past=past)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)  # softmax yields probabilities, not log-probs
            # sample a single next token; num_samples=5 would feed five tokens back into the model
            prev = torch.multinomial(probs, num_samples=1)
            text.append(prev[0].item())
    print(tokenizer.decode(text))
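An example call for the sampler above; the prompt string is illustrative and `n` bounds the number of sampled tokens.

# Sample 20 tokens of continuation and print prompt + continuation.
generate("Who was Jim Henson ? Jim Henson was a", n=20)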
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='generation device(s)')
    parser.add_argument('--length', default=-1, type=int, required=False, help='generation length')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='batch size for generation')
    parser.add_argument('--nsamples', default=10, type=int, required=False, help='number of samples to generate')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='sampling temperature')
    parser.add_argument('--topk', default=8, type=int, required=False, help='top-k sampling')
    parser.add_argument('--topp', default=0, type=float, required=False, help='top-p cumulative probability')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='model config')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary path')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='model path')
    parser.add_argument('--prefix', default='萧炎', type=str, required=False, help='prefix to start the generated text with')
    args = parser.parse_args()
    print(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    if length == -1:
        length = model.config.n_ctx // 2
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    while True:
        raw_text = args.prefix
        context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = sample_sequence(model=model, length=length, context=context_tokens,
                                  temperature=temperature, top_k=topk, top_p=topp, device=device)
            out = out.tolist()
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[0])
                for k, item in enumerate(text[:-1]):  # make sure English words keep a space around them
                    if is_word(item) and is_word(text[k + 1]):
                        text[k] = item + ' '
                for k, item in enumerate(text):
                    if item == '[MASK]':
                        text[k] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[k] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                print(text)
        print("=" * 80)