def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
    model = OpenAIGPTLMHeadModel(config)
    model.eval()

    loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

    result = {"loss": loss, "lm_logits": lm_logits}
    self.parent.assertListEqual(list(result["loss"].size()), [])
    self.parent.assertListEqual(
        list(result["lm_logits"].size()),
        [self.batch_size, self.seq_length, self.vocab_size],
    )
def test_lm_generate_openai_gpt(self):
    model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
    input_ids = torch.tensor([[481, 2585, 544, 4957]], dtype=torch.long)  # the dog is cute
    expected_output_ids = [
        481, 2585, 544, 4957, 669, 512, 761, 5990, 271, 645,
        487, 535, 976, 2479, 240, 487, 804, 1296, 2891, 512,
    ]  # the dog is cute when you're annoyed : if he's really stupid, he 'll stop fighting you
    torch.manual_seed(0)
    output_ids = model.generate(input_ids)
    self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
def reset_bot():
    global history, tokenizer, model, personality

    dataset_path = './chatapp/data/counsel_chat_250-tokens_full.json'
    dataset_cache = './chatapp/dataset_cache'
    model_checkpoint = download_pretrained_model()
    device = "cpu"

    seed = random.randrange(0, 100)
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Get pretrained model and tokenizer
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(model_checkpoint)
    model.to(device)
    add_special_tokens_(model, tokenizer)

    # Sample a personality
    dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
    personalities = [
        dialog["personality"]
        for split in dataset.values()
        for dialog in split
    ]
    personality = random.choice(personalities)
    history = []
    return ""
def test_lm_generate_openai_gpt(self):
    model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
    model.to(torch_device)
    input_ids = torch.tensor([[481, 4735, 544]], dtype=torch.long, device=torch_device)  # the president is
    expected_output_ids = [
        481, 4735, 544, 246, 963, 870, 762, 239, 244, 40477,
        244, 249, 719, 881, 487, 544, 240, 244, 603, 481,
    ]  # the president is a very good man. " \n " i'm sure he is, " said the
    output_ids = model.generate(input_ids, do_sample=False)
    self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
def __init__(self, bot):
    self.bot = bot
    self.src_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..'))
    model_path = os.path.join(self.src_dir, "conv_ai/model/")
    self.args = {
        "max_history": 2,
        "device": "cpu",
        "max_length": 20,
        "min_length": 1,
        "temperature": 0.7,
        "top_k": 0,
        "top_p": 0.9,
        "no_sample": 1,
    }
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained(model_path)
    self.model = OpenAIGPTLMHeadModel.from_pretrained(model_path)
    self.model.to('cpu')
    add_special_tokens_(self.model, self.tokenizer)
    dataset = get_dataset(
        self.tokenizer, "",
        os.path.join(self.src_dir, "conv_ai/dataset_cache"))
    self.personalities = [
        dialog["personality"]
        for split in dataset.values()
        for dialog in split
    ]
    self.personality = random.choice(self.personalities)
    self.history = []
    print("Conversational AI model loaded successfully.")
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
    model = OpenAIGPTLMHeadModel(config)
    model.to(torch_device)
    model.eval()

    result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
    self.parent.assertEqual(result.loss.shape, ())
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def get_gpt2_perplexity(sentence):
    global model
    import math
    import torch
    from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
    if model is None:
        model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        model.eval()
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenize_input = tokenizer.tokenize(sentence)
    tensor_input = torch.tensor(
        [tokenizer.convert_tokens_to_ids(tokenize_input)])
    # Passing `labels` (the legacy `lm_labels` kwarg was removed from the
    # library) makes the model return the LM cross-entropy loss first.
    with torch.no_grad():
        outputs = model(tensor_input, labels=tensor_input)
    return math.exp(outputs[0].item())
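# Usage sketch (not in the original): `model` must exist at module scope for
# the `global` statement above; it is then loaded lazily on the first call.
model = None

print(get_gpt2_perplexity("The dog is cute."))   # fluent text -> lower perplexity
print(get_gpt2_perplexity("Dog the cute is."))   # scrambled text -> higher perplexity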
def __init__(self):
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    self.gpt = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt').cuda()
    self.embedder = SentenceTransformer('bert-base-nli-mean-tokens').cuda()
    self.pos_phrase = "I have an undiagnosed disease. "
    self.keywords = [term.strip().lower()
                     for term in open('tweet_crawler/terms.txt').read().split('\n')
                     if term != "" and term != "undiagnosed" and term != "disease"]
    self.udn_examples = list(open('data/UDN_patient_search_TWEET_samples.txt').read().split('\n')) + \
        list(open('data/UDN_patient_search_WEB_samples.txt').read().split('\n'))
    # self.phrase_gpt_score = gpt_log_prob_score([self.phrase], self.gpt, self.tokenizer)
    self.pos_phrase_emb = self.embedder.encode([self.pos_phrase])[0]
triggers = {row[0]: row[1] for row in csv.reader(triggers_file)}

max_history = 2
min_length, max_length = 1, 20
dataset_path = './chatapp/data/counsel_chat_250-tokens_full.json'
dataset_cache = './chatapp/dataset_cache'
model_checkpoint = download_pretrained_model()
device = "cpu"

seed = 0
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Get pretrained model and tokenizer
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_checkpoint)
model = OpenAIGPTLMHeadModel.from_pretrained(model_checkpoint)
model.to(device)
add_special_tokens_(model, tokenizer)

# Sample a personality
dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
personalities = [
    dialog["personality"]
    for split in dataset.values()
    for dialog in split
]
personality = random.choice(personalities)
history = []


@app.route("/")
def home():
try:
    with open(cached_input_file, "rb") as reader:
        eval_inputs = pickle.load(reader)
except Exception:
    # Cache missing or unreadable: rebuild the eval features from raw text
    eval_inputs = read_data(args.data_dir + 'dev.txt', length)
    if args.local_rank == -1:
        logger.info("  Saving eval features into cached file %s", cached_input_file)
        with open(cached_input_file, "wb") as writer:
            pickle.dump(eval_inputs, writer)

eval_dataloader = DataLoader(eval_inputs,
                             sampler=SequentialSampler(eval_inputs),
                             batch_size=args.eval_batch_size)

# Set model
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.to(device)

# Train model
logger.info("***** Run training and evaluating *****")
logger.info("  Num of train examples = %d", len(train_dataloader))
logger.info("  Train batch size = %d", args.train_batch_size)
logger.info("  Num of eval examples = %d", len(eval_dataloader))
logger.info("  Eval batch size = %d", args.eval_batch_size)
model.train()
num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs // args.train_batch_size

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
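# (Not in the original: the snippet above is cut off mid-list. The canonical
# grouping from the transformers examples is sketched below; the weight_decay,
# lr, and warmup values are assumptions, not the author's settings.)
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_optimization_steps)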
import math
import time
import json

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

#!pip install transformers
#!pip install ftfy
#!pip install spacy
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, AdamW

%load_ext tensorboard
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/BaselineModel')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt').to(device)
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')


def add_special_tokens_(model, tokenizer):
    """Add special tokens to the tokenizer and the model if they have not already been added."""
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)  # doesn't add if they are already there
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)


SPECIAL_TOKENS = ["<bos>", "<eos>", "<system>", "<user>", "<slots>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<system>', '<user>', '<slots>']}
MODEL_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]
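# Quick sanity check for add_special_tokens_ (not in the original; assumes the
# cell above has run): after the call, the embedding matrix must cover the
# enlarged vocabulary and every special token must map to a real id.
add_special_tokens_(model, tokenizer)
assert model.get_input_embeddings().num_embeddings == len(tokenizer)
print(tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS))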
    # (Tail of the loss/scoring method; `attn_mask` and `loss_mask` are built
    # earlier in the method, which the snippet truncates.)
    Input = Input.long().to(self.device)
    Output = self.Trans(Input, attention_mask=attn_mask)
    logits = Output[0]
    labels = Input
    # Shift so that tokens < n predict token n, then keep only the positions
    # selected by the loss mask.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    flatten_shift_loss_mask = loss_mask[..., :-1].contiguous().view(-1)
    ids = nonzero(flatten_shift_loss_mask).view(-1)
    fin_logits = shift_logits.view(-1, shift_logits.size(-1))[ids]
    fin_labels = shift_labels.view(-1)[ids]
    return fin_logits, fin_labels

def decode(self, Input, Label, max_length):
    """Decode the given input ids with the model's generate() search."""
    attn_mask = tensor(Label.clone().detach() == 1.0, dtype=uint8, device=self.device)
    Input = Input.long().to(self.device)
    Output = self.Trans.generate(Input, attention_mask=attn_mask, max_length=max_length)
    return Output


if __name__ == "__main__":
    Trans_Config = OpenAIGPTConfig(vocab_size=3002, n_layer=12)
    Trans_Model = OpenAIGPTLMHeadModel(Trans_Config)
    Token_Dir = r"G:\Work Related\Nlc2cmd\Tokenizer_Train\GPTToken/"
    Trans_Tok = GPT2TokenizerFast.from_pretrained(Token_Dir)
    Omni = OmniBash(Trans_Model, "cpu")
    Dataset = OmnibashDataset(r"G:\Work Related\Nlc2cmd\Data\Template.json", Trans_Tok, "train", 100)
    TrainLoader = DataLoader(Dataset, batch_size=10)
    Sample = next(iter(TrainLoader))
    # decode() requires a max_length argument; 100 is an assumed value here.
    X = Omni.decode(Sample[0][0].unsqueeze(0), Sample[1][0].unsqueeze(0), max_length=100)
    print(X)
    Out = Trans_Tok.convert_ids_to_tokens(X[0])
    print(Out)
from .helpers import *
from .models import ElmoSCLSTM
from .util import get_module_or_attr

""" NEW: reranking snippets """
# (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
import torch
from torch.nn import CrossEntropyLoss

HFACE_batch_size = 8
RERANKER = "GPT-2"  # GPT/GPT-2/CTRL/Transformer-XL/XLNet
if RERANKER == "GPT":
    from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
    gpt2Tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    gpt2LMHeadModel = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    # openai-gpt ships without a pad token; add one and resize the embeddings
    gpt2Tokenizer.add_special_tokens({'pad_token': "[PAD]"})
    gpt2LMHeadModel.resize_token_embeddings(len(gpt2Tokenizer))
    assert gpt2Tokenizer.pad_token == '[PAD]'
elif RERANKER == "GPT-2":
    from transformers import GPT2Tokenizer, GPT2LMHeadModel
    gpt2Tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    gpt2LMHeadModel = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    gpt2Tokenizer.pad_token = gpt2Tokenizer.eos_token
elif RERANKER == "Transformer-XL":
    from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
    txlTokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    txlLMHeadModel = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
    txlTokenizer.pad_token = txlTokenizer.eos_token
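# The CrossEntropyLoss import above hints at per-candidate LM scoring. A minimal
# sketch of such a rerank score (an assumption, not this repo's exact
# implementation): the mean LM loss per sentence, negated so higher = more fluent.
def lm_rerank_score(sentences):
    scores = []
    for sent in sentences:
        input_ids = gpt2Tokenizer.encode(sent, return_tensors="pt")
        with torch.no_grad():
            loss = gpt2LMHeadModel(input_ids, labels=input_ids)[0]
        scores.append(-loss.item())
    return scores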
from typing import Tuple

from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer


def load_model(name: str) -> Tuple[OpenAIGPTLMHeadModel, OpenAIGPTTokenizer]:
    model = OpenAIGPTLMHeadModel.from_pretrained(name)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(name)
    model.eval()
    return model, tokenizer
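# Example round trip with load_model (greedy decoding; the prompt text and
# length are illustrative, not from the original):
model, tokenizer = load_model("openai-gpt")
prompt_ids = tokenizer.encode("the president is", return_tensors="pt")
output_ids = model.generate(prompt_ids, do_sample=False, max_length=20)
print(tokenizer.decode(output_ids[0]))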
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer


def setup_gpt(model_name="openai-gpt"):
    model = OpenAIGPTLMHeadModel.from_pretrained(model_name)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name)
    return model, tokenizer