def predict_token(tokens_tensors):
    # Load pre-trained model (weights)
    model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
    model.eval()

    # If you have a GPU, put everything on cuda
    if torch.cuda.is_available():
        for i, tt in enumerate(tokens_tensors):
            tokens_tensors[i] = tokens_tensors[i].to('cuda')
        model.to('cuda')

    with torch.no_grad():
        # Predict all tokens, re-using the memory cells (mems) across segments
        # so that each call can attend to the longer accumulated context.
        mems = None
        all_predictions = []
        for i, tt in enumerate(tokens_tensors):
            predictions, mems = model(tt, mems=mems)
            all_predictions.append(predictions)

    # Get the token predicted after the last segment.
    # Note: `tokenizer` must be a TransfoXLTokenizer defined at module level.
    predicted_index = torch.argmax(all_predictions[-1][0, -1, :]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    # assert predicted_token == 'who'
    return predicted_token
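# A minimal usage sketch for predict_token (assumptions: the missing imports
# come from pytorch_pretrained_bert, `tokenizer` is the module-level
# TransfoXLTokenizer that predict_token reads, and the input sentence is only
# illustrative).
import torch
from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLLMHeadModel

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
segment = torch.tensor([tokenizer.convert_tokens_to_ids(
    tokenizer.tokenize("who was jim henson ?"))])
print(predict_token([segment]))  # prints the single token predicted after the segment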
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', default=1, type=int,
                        help='Batch size for inference')
    parser.add_argument('--model_name', default='transfo-xl-wt103', type=str,
                        help='Pre-trained model name')
    parser.add_argument('--max_seq_length', default=128, type=int,
                        help='Maximum total input sequence length after tokenization')
    args = parser.parse_args()

    # Dummy input ids used to trace the model for the ONNX export.
    input_ids = torch.zeros([args.batch_size, args.max_seq_length], dtype=torch.long)
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
    torch.onnx.export(model, input_ids,
                      'transfoxll_' + 'batch' + str(args.batch_size) + '.onnx')
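# A quick sanity check on the exported file using onnxruntime (an assumption:
# onnxruntime is not part of the original snippet, and whether the export of
# the adaptive-softmax head succeeds depends on the library versions in use).
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('transfoxll_batch1.onnx')
print([(i.name, i.shape) for i in sess.get_inputs()])
dummy = np.zeros((1, 128), dtype=np.int64)
outputs = sess.run(None, {sess.get_inputs()[0].name: dummy})
print([o.shape for o in outputs])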
def TextGenerator(line):
    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
    model.eval()  # disable dropout for deterministic generation

    line_tokenized = tokenizer.tokenize(line)
    line_indexed = tokenizer.convert_tokens_to_ids(line_tokenized)
    tokens_tensor = torch.tensor([line_indexed])

    max_predictions = 30
    mems = None
    l = []
    for i in range(max_predictions):
        predictions, mems = model(tokens_tensor, mems=mems)
        # torch.topk(...)[1] gives the indices of the top-5 candidates;
        # [1] then picks the runner-up token rather than the argmax.
        predicted_index_tensor = torch.topk(predictions[0, -1, :], 5)[1][1]
        predicted_index = predicted_index_tensor.item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        print(predicted_token)
        l.append(predicted_token)
        # Append the new token and feed the extended sequence back to the model.
        tokens_tensor = torch.cat(
            (tokens_tensor, predicted_index_tensor.reshape(1, 1)), dim=1)
    s = " ".join(l)
    return s
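# Hypothetical usage of TextGenerator (the prompt string is illustrative):
continuation = TextGenerator("The meaning of life is")
print(continuation)  # 30 generated tokens joined with spaces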
def __init__(self, args):
    super().__init__()

    if args.transformerxl_model_dir is not None:
        model_name = args.transformerxl_model_dir
        dict_file = model_name
        print("Loading Transformer XL model from {}".format(model_name))
    else:
        model_name = args.transformerxl_model_name
        dict_file = model_name

    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = TransfoXLTokenizer.from_pretrained(dict_file)
    self.vocab = list(self.tokenizer.idx2sym)
    self._init_inverse_vocab()
    self.eos_id = self.inverse_vocab[self.EOS_SYMBOL]
    self.unk_symbol = self.UNK_SYMBOL

    # Load pre-trained model (weights)
    self.txl_model = TransfoXLLMHeadModel.from_pretrained(model_name)
    self.txl_model.eval()
    print(self.txl_model.config)
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch Transformer Language Model')
    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
                        help='pretrained model name')
    parser.add_argument('--split', type=str, default='test',
                        choices=['all', 'valid', 'test'],
                        help='which split to evaluate')
    parser.add_argument('--batch_size', type=int, default=10,
                        help='batch size')
    parser.add_argument('--tgt_len', type=int, default=128,
                        help='number of tokens to predict')
    parser.add_argument('--ext_len', type=int, default=0,
                        help='length of the extended context')
    parser.add_argument('--mem_len', type=int, default=1600,
                        help='length of the retained previous heads')
    parser.add_argument('--clamp_len', type=int, default=1000,
                        help='max positional embedding index')
    parser.add_argument('--no_cuda', action='store_true',
                        help='Do not use CUDA even though CUDA is available')
    parser.add_argument('--work_dir', type=str, required=True,
                        help='path to the work_dir')
    parser.add_argument('--no_log', action='store_true',
                        help='do not log the eval result')
    parser.add_argument('--same_length', action='store_true',
                        help='set same length attention with masking')
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    assert args.ext_len >= 0, 'extended context length must be non-negative'

    if args.server_ip and args.server_port:
        # Distant debugging - see
        # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    logger.info("device: {}".format(device))

    # Load a pre-processed dataset.
    # You can also build the corpus yourself using TransfoXLCorpus methods.
    # The pre-processing involves computing word frequencies to prepare the
    # adaptive input and softmax, and tokenizing the dataset.
    # The pre-processed corpus is a conversion (using the conversion script)
    # of the original Transformer-XL WikiText-103 data.
    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
    ntokens = len(corpus.vocab)

    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
                                  device=device, ext_len=args.ext_len)
    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
                                  device=device, ext_len=args.ext_len)

    # Load a pre-trained model
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
    model = model.to(device)

    logger.info(
        'Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
            args.batch_size, args.tgt_len, args.ext_len, args.mem_len,
            args.clamp_len))

    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    if args.clamp_len > 0:
        model.clamp_len = args.clamp_len
    if args.same_length:
        model.same_length = True

    ###########################################################################
    # Evaluation code
    ###########################################################################
    def evaluate(eval_iter):
        # Turn on evaluation mode which disables dropout.
        model.eval()
        total_len, total_loss = 0, 0.
        start_time = time.time()
        with torch.no_grad():
            mems = None
            for idx, (data, target, seq_len) in enumerate(eval_iter):
                ret = model(data, target, mems)
                loss, mems = ret
                loss = loss.mean()
                total_loss += seq_len * loss.item()
                total_len += seq_len
            total_time = time.time() - start_time
        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
            total_time, 1000 * total_time / (idx + 1)))
        return total_loss / total_len

    # Run on test data.
    if args.split == 'all':
        test_loss = evaluate(te_iter)
        valid_loss = evaluate(va_iter)
    elif args.split == 'valid':
        valid_loss = evaluate(va_iter)
        test_loss = None
    elif args.split == 'test':
        test_loss = evaluate(te_iter)
        valid_loss = None

    def format_log(loss, split):
        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
            split, loss, math.exp(loss))
        return log_str

    log_str = ''
    if valid_loss is not None:
        log_str += format_log(valid_loss, 'valid')
    if test_loss is not None:
        log_str += format_log(test_loss, 'test')

    logger.info('=' * 100)
    logger.info(log_str)
    logger.info('=' * 100)
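# To run the evaluation, the script needs an entry point; a hypothetical
# invocation (the file name and --work_dir value are illustrative assumptions):
#   python run_transfo_xl_eval.py --work_dir ./txl_eval --split test --batch_size 10
if __name__ == '__main__':
    main()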
# `tokenizer`, `text_2`, and `tokenized_text_1` are defined earlier in the
# original script from which this excerpt is taken.
print(tokenizer.tokenize("who was jim henson ?"))
tokenized_text_2 = tokenizer.tokenize(text_2)

indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1); print(indexed_tokens_1)  # [2517, 11, 1666, 12034, 788]
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize("who was jim henson ?")))  # [52, 11, 24, 24, 788]; also case sensitive
indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)

tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])

##################################################################
## TransfoXLModel
model = TransfoXLModel.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/transfo-xl-wt103')
model.eval()

with torch.no_grad():
    hidden_states_1, mems_1 = model(tokens_tensor_1)  # Predict hidden states features for each layer
    hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)  # We can re-use the memory cells in a subsequent call to attend a longer context

##################################################################
## TransfoXLLMHeadModel
model = TransfoXLLMHeadModel.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/transfo-xl-wt103/')
model.eval()

with torch.no_grad():
    predictions_1, mems_1 = model(tokens_tensor_1)  # Predict all tokens
    predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)  # We can re-use the memory cells in a subsequent call to attend a longer context

## get the predicted last token
predicted_index = torch.argmax(predictions_2[0, -1, :]).item(); print(predicted_index)  # 52
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index]); print(predicted_token)  # ['who']
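# The next snippet again assumes a tokenizer and two tokenized example
# sentences. A minimal reconstruction of that preamble, following the standard
# pytorch_pretrained_bert Transformer-XL example (the exact sentences are an
# assumption, chosen so the final 'who' prediction check makes sense):
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
tokenized_text_1 = tokenizer.tokenize(text_1)
tokenized_text_2 = tokenizer.tokenize(text_2)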
indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)

# Convert inputs to PyTorch tensors
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])

# Load pre-trained model (weights)
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
model.eval()

with torch.no_grad():
    # Predict hidden states features for each layer
    hidden_states_1, mems_1 = model(tokens_tensor_1)
    # We can re-use the memory cells in a subsequent call to attend a longer context
    hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)

# Load pre-trained model (weights)
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
model.eval()

with torch.no_grad():
    # Predict all tokens
    predictions_1, mems_1 = model(tokens_tensor_1)
    # We can re-use the memory cells in a subsequent call to attend a longer context
    predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)

# get the predicted last token
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
# assert predicted_token == 'who'