def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
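# Minimal usage sketch for get_tokenizer (illustrative only; assumes the
# tokenizer classes and `log` imported elsewhere in this module are in scope,
# and the model name is just an example):
tokenizer = get_tokenizer("bert-base-uncased")
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))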
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>",
        })
    # TODO: this is another place that can be simplified by the
    # "model-before-preprocess" reorganization; we can pass the tokenizer
    # created in the model here, see issue <TBD>

    # Do not use tokenizer.vocab_size here: it does not include newly added tokens.
    vocab_size = len(tokenizer)
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
    # Due to a quirk in huggingface's file, the last token of RobertaTokenizer is None;
    # remove this workaround when they fix the problem.

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens",
             tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word, input_module_tokenizer_name(tokenizer_name))
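# Hypothetical call (not from the original source). The add_token_to_namespace
# usage above implies an AllenNLP-style Vocabulary, so a sketch under that
# assumption looks like:
from allennlp.data import Vocabulary

vocab = Vocabulary()
add_pytorch_transformers_vocab(vocab, "gpt2")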
def __init__(self, chunck_size=64, max_length=35, device=torch.device('cuda:0')):
    super(GPTClient, self).__init__()
    self.chunck_size = chunck_size
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    self.max_length = max_length
    # load the model
    self.model = OpenAIGPTModel.from_pretrained('openai-gpt')
    self.model.eval()
    self.device = device
    # move model to device
    self.model.to(self.device)
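# A minimal sketch (not part of the original class) of how such a client can
# embed one sentence; `client` is assumed to be a constructed GPTClient.
text = "Hello world"
ids = client.tokenizer.convert_tokens_to_ids(client.tokenizer.tokenize(text))
tensor = torch.tensor([ids], device=client.device)
with torch.no_grad():
    hidden = client.model(tensor)[0]  # last-layer hidden states, shape (1, seq_len, 768)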
def __init__(self, opt, shared=None):
    super(TransformerAgent, self).__init__(opt, shared)

    args = AttrDict(opt)  # to keep most commands identical to the interact.py script
    self.args = args

    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(__file__)
    self.logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if shared is None:
        self.logger.info("Get pretrained model and tokenizer")
        if args.model_checkpoint == "":
            args.model_checkpoint = download_pretrained_model()

        if 'gpt2' in args.model_checkpoint:
            self.tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
            model_class = GPT2DoubleHeadsModel if self.args.eval_type == "hits@1" else GPT2LMHeadModel
        else:
            self.tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
            model_class = OpenAIGPTDoubleHeadsModel if self.args.eval_type == "hits@1" else OpenAIGPTLMHeadModel

        self.model_checkpoint = model_class.from_pretrained(args.model_checkpoint)
        self.model_checkpoint.to(args.device)

        self.logger.info("Build BPE prefix dictionary")
        convai_dict = build_dict()
        assert len(convai_dict) == 19304
        self.prefix2words = self.get_prefix2words(convai_dict)
    else:
        self.model_checkpoint = shared['model']
        self.tokenizer = shared['tokenizer']
        self.prefix2words = shared['prefix2words']

    add_special_tokens_(self.model_checkpoint, self.tokenizer)
    self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)

    self.persona = []
    self.history = []
    self.labels = []

    self.reset()
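# For context, the add_special_tokens_ helper called above typically adds the
# special tokens and then grows the model's embedding matrix to match. This is
# a sketch of that behavior (an assumption, not necessarily the repo's
# verbatim code):
def add_special_tokens_(model, tokenizer):
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)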
def test_special_tokens_checkpoint_behavior(self):
    toks = [
        OpenAIGPTTokenizer.from_pretrained('openai-gpt'),
        GPT2Tokenizer.from_pretrained('gpt2')
    ]
    for tok in toks:
        self.assertEqual(len(tok.added_tokens_encoder), 0)
        tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
        self.assertEqual(len(tok.added_tokens_encoder), 5)
        # Make sure we never split
        self.assertEqual(len(tok.tokenize("<bos> <speaker1>")), 2)
        ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS)
        self.assertTrue(
            all([x > 0 for x in ids]),
            f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}')
        # Need to maintain indices through save. (This is also tested in pytorch-transformers.)
        tok.save_pretrained(self.save_dir)
        tok_loaded = tok.from_pretrained(str(self.save_dir))
        ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS)
        self.assertListEqual(ids, ids2)
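# For orientation: the SPECIAL_TOKENS / ATTR_TO_SPECIAL_TOKEN constants this
# test exercises are defined along these lines in the transfer-learning-conv-ai
# repo (a sketch consistent with the 5-token assertion above; verify against
# the actual source):
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {
    'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
    'additional_special_tokens': ('<speaker1>', '<speaker2>'),
}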
import jieba
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTConfig

special_tokens = ['<bos>', '<del>', '<eos>', '<pad>']
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt',
                                               special_tokens=special_tokens)


def convert_tokens_to_ids(text):
    return tokenizer.convert_tokens_to_ids(text)


class MyData(Dataset):
    def __init__(self, texts, labels, is_train=True):
        self.texts = [jieba.lcut(t) for t in texts]
        self.labels = labels
        # other preprocessing ...

    def __getitem__(self, item):
        token_id = convert_tokens_to_ids(self.texts[item])  # word -> token_id
        label = self.labels[item]
        return torch.LongTensor(token_id), torch.LongTensor([label])

    def __len__(self):
        return len(self.texts)
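# Minimal usage sketch (hypothetical data, not from the original source):
# wrap MyData in a DataLoader. batch_size=1 sidesteps padding; larger batches
# would need a collate_fn that pads the variable-length token_id tensors.
dataset = MyData(["今天天气不错", "例句"], [1, 0])
loader = DataLoader(dataset, batch_size=1, shuffle=True)
for token_ids, label in loader:
    pass  # feed token_ids to a model here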
def __init__(self):
    self.name = 'GPTLanguageModel'
    self.trainable_model = False
    self.GPT_tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    self.model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt').eval()
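# Sketch (not part of the original class): scoring a sentence with the LM head.
# In pytorch_transformers, passing labels=input_ids makes OpenAIGPTLMHeadModel
# return the average token negative log-likelihood as its first output.
# `lm` is assumed to be an instance of the class above.
import torch

ids = lm.GPT_tokenizer.convert_tokens_to_ids(
    lm.GPT_tokenizer.tokenize("the cat sat on the mat"))
tokens = torch.tensor([ids])
with torch.no_grad():
    nll = lm.model(tokens, labels=tokens)[0].item()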
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions "
                             "and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to "
                             "perform. Overrides num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before "
                             "performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see
        # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model.
    # This loading also adds new tokens and embeddings, called `special tokens`;
    # these new embeddings will be fine-tuned on the RocStories dataset.
    special_tokens = ['_start_', '_delimiter_', '_classify_']  # note: unused below; the tokens actually added are <CLS>/<SEP>/<PAD>/<EOS>
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_special_tokens({
        'cls_token': '<CLS>',
        'sep_token': '<SEP>',
        'pad_token': '<PAD>',
        'eos_token': '<EOS>'
    })
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    special_tokens_ids = [
        tokenizer.convert_tokens_to_ids(special_token)
        for special_token in ['<PAD>', '<CLS>', '<SEP>', '<EOS>']
    ]
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """Tokenize and encode a nested object."""
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
        for dataset in encoded_datasets
        for story, cont1, cont2, _ in dataset)
    # Max size of input for the pre-trained model
    input_length = min(input_length, model.config.n_positions)

    # Prepare input tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // \
                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) \
                // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': args.weight_decay
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                # step the optimizer before the scheduler (required in PyTorch >= 1.1)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (loss.item() if exp_average_loss is None
                                    else 0.7 * exp_average_loss + 0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss, scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids,
                                                 lm_labels, mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
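# Example invocation of the fine-tuning script above (the script name and
# dataset paths are hypothetical; the flags are those defined in main()):
#
#   python run_openai_gpt.py --do_train --do_eval \
#       --train_dataset data/train.csv \
#       --eval_dataset data/val.csv \
#       --output_dir out/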
def test_gpt_embeddings():
    gpt_model: str = "openai-gpt"

    tokenizer = OpenAIGPTTokenizer.from_pretrained(gpt_model)
    model = OpenAIGPTModel.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize(s)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #      0           1           2            3         4        5         6        7      8      9        10        11        12
    #
    # 'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>'
    #      |           |           |            |         |        |         |          \      |      /          |         |         |
    #    Berlin       and        Munich        have       a       lot       of             puppeteer             to        see       .
    #
    #      0           1           2            3         4        5         6                 7                 8         9        10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = OpenAIGPTEmbeddings(
            model=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[7].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref
            == puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref
            == puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[0], first_layer[0]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[7], first_layer[9]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref
            == puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[0]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[7], first_layer[8], first_layer[9]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref
            == puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * 768
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * 768
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
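# For reference, the calculate_mean_embedding helper used above is presumably
# just an element-wise mean over the stacked subword vectors (a sketch, not
# necessarily the test suite's exact helper):
def calculate_mean_embedding(subword_embeddings):
    return torch.mean(torch.stack(subword_embeddings), dim=0)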
def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_name", type=str, default='gpt2-medium', choices=["openai-gpt", "gpt2", "gpt2-medium"], help='pretrained model name') parser.add_argument( "--model_dir", type=str, help="path to model's local checkpoint", default= "/home/ouardinik/PycharmProjects/hyperlex/text_generation_GPT/logs/openai-gpt_2019-08-02_10:14:02.631911" ) parser.add_argument("--bin_filename", type=str, help="checkpoint's filename", default="final_pytorch_model.bin") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--length", type=int, default=-1) parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.') parser.add_argument("--nsamples", type=int, default=1) parser.add_argument("--batch_size", type=int, default=-1) parser.add_argument("--temperature", type=float, default=1.0) parser.add_argument("--top_k", type=int, default=0) parser.add_argument("--run_parallel", action='store_true', help='whether to run on GPUs') args = parser.parse_args() assert os.path.exists(args.model_dir), "input model's path" if args.batch_size == -1: args.batch_size = 1 assert args.nsamples % args.batch_size == 0 # Load Transformer logger.info("Load a trained model and vocabulary that you have fine-tuned") # BPE tokenizer and model if args.model_name == "openai-gpt": tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name) model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name) elif args.model_name == "gpt2" or args.model_name == "gpt2-medium": tokenizer = GPT2Tokenizer.from_pretrained(args.model_name) model = GPT2LMHeadModel.from_pretrained(args.model_name) # Device device = torch.device("cuda" if ( torch.cuda.is_available() and args.run_parallel) else "cpu") n_gpu = torch.cuda.device_count() # Load top layer top_layer_path = os.path.join(args.model_dir, args.bin_filename) if device.type == "cpu": top_layer = torch.load(top_layer_path, map_location="cpu") else: top_layer = torch.load(top_layer_path) model.to(device), top_layer.to(device) model.eval(), top_layer.eval() if args.length == -1: args.length = model.config.n_ctx // 2 elif args.length > model.config.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx) while True: context_tokens = [] if not args.unconditional: raw_text = input("Model prompt >>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input("Model prompt >>> ") context_tokens = tokenizer.encode(raw_text) generated = 0 for _ in range(args.nsamples // args.batch_size): out = sample_sequence(top_layer=top_layer, model=model, length=args.length, context=context_tokens, start_token=None, batch_size=args.batch_size, temperature=args.temperature, top_k=args.top_k, device=device) out = out[:, len(context_tokens):].tolist() for i in range(args.batch_size): generated += 1 text = tokenizer.decode(out[i]) print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80) else: generated = 0 for _ in range(args.nsamples // args.batch_size): out = sample_sequence( top_layer=top_layer, model=model, length=args.length, context=None, start_token=tokenizer.encoder['<|endoftext|>'], batch_size=args.batch_size, temperature=args.temperature, top_k=args.top_k, device=device) out = out[:, 1:].tolist() for i in range(args.batch_size): generated += 1 text = tokenizer.decode(out[i]) print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80)
# This file implements GPT as a service, based on a pretrained model from
# https://github.com/huggingface/pytorch-pretrained-BERT
import torch
from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel

# OPTIONAL: if you want to have more information on what's happening,
# activate the logger as follows
import logging
# logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

# Tokenize the input
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
tokenized_text = tokenizer.tokenize(text)

# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
print(tokens_tensor.size())

# Load pre-trained model (weights)
model = OpenAIGPTModel.from_pretrained('openai-gpt')
model.eval()

cuda = torch.device('cuda:1')
# If you have a GPU, put everything on cuda
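# The snippet above ends before actually using `cuda`; a minimal continuation
# under the obvious assumption (move tensors and model, then run a forward pass):
tokens_tensor = tokens_tensor.to(cuda)
model.to(cuda)
with torch.no_grad():
    hidden_states = model(tokens_tensor)[0]  # last-layer hidden states, shape (1, seq_len, 768)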
    return tensor_datasets


def tokenize_and_encode(obj):
    """Tokenize and encode a nested object."""
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    elif isinstance(obj, int):
        return obj
    return list(tokenize_and_encode(o) for o in obj)


# Load tokenizer and model.
# This loading also adds new tokens and embeddings, called `special tokens`.
special_tokens = ['_start_', '_delimiter_', '_classify_']
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name,
                                               special_tokens=special_tokens)
special_tokens_ids = list(
    tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
model = OpenAIGPTDoubleHeadsModel.from_pretrained(
    args.model_name, num_special_tokens=len(special_tokens))
model.to(device)

# Load the SNLI dataset, using a pickle cache when available
path_train = args.data_path + "snli_1.0_train.txt"
path_test = args.data_path + "snli_1.0_test.txt"
path_dev = args.data_path + "snli_1.0_dev.txt"

pkl_path = 'data_in/snli.pkl'
if os.path.exists(pkl_path):
    with open(pkl_path, 'rb') as f:
        encoded_datasets = pickle.load(f)
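# The snippet ends inside the cache-hit branch; the cache-miss branch would
# presumably encode and then persist the datasets, roughly like this (a
# sketch; `datasets` is a hypothetical tuple of the raw SNLI splits):
if not os.path.exists(pkl_path):
    encoded_datasets = tokenize_and_encode(datasets)
    with open(pkl_path, 'wb') as f:
        pickle.dump(encoded_datasets, f)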
def load_pick(file_nm):
    with open(file_nm, 'rb') as f:
        label, df = pickle.load(f)
        print("load completed")
    return label, df


dev_label, dev_df = load_pick('data_in/dev.pkl')
test_label, test_df = load_pick('data_in/test.pkl')
train_label, train_df = load_pick('data_in/train.pkl')

# Load tokenizer and model.
# This loading also adds new tokens and embeddings, called `special tokens`;
# these new embeddings will be fine-tuned on the RocStories dataset.
special_tokens = ['_start_', '_delimiter_', '_classify_']
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name,
                                               special_tokens=special_tokens)
special_tokens_ids = list(
    tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
model = OpenAIGPTDoubleHeadsModel.from_pretrained(
    args.model_name, num_special_tokens=len(special_tokens))

special_tokens = ['<bos>', '<del>', '<eos>', '<pad>']
# load the tokenizer for OpenAI GPT
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name,
                                               special_tokens=special_tokens)
tokenizer.add_tokens(special_tokens)

config = OpenAIGPTConfig.from_pretrained('openai-gpt')
config.num_labels = 3
config.vocab_size = len(tokenizer)
config.summary_type = 'last'

tokenizer.bos_token = '<bos>'
tokenizer.eos_token = '<eos>'
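# Illustrative helper (hypothetical, not in the original code): build one NLI
# input in the "<bos> premise <del> hypothesis <eos>" layout implied by the
# special tokens configured above.
def encode_pair(premise, hypothesis):
    tokens = (['<bos>'] + tokenizer.tokenize(premise)
              + ['<del>'] + tokenizer.tokenize(hypothesis)
              + ['<eos>'])
    return tokenizer.convert_tokens_to_ids(tokens)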
def test_gpt_embeddings():
    gpt_model = 'openai-gpt'

    tokenizer = OpenAIGPTTokenizer.from_pretrained(gpt_model)
    model = OpenAIGPTModel.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s = 'Berlin and Munich have a lot of puppeteer to see .'

    with torch.no_grad():
        tokens = tokenizer.tokenize(s)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[-1]
        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    def embed_sentence(sentence: str,
                       pooling_operation,
                       layers: str = '1',
                       use_scalar_mix: bool = False) -> Sentence:
        embeddings = OpenAIGPTEmbeddings(
            pretrained_model_name_or_path=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix)
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation='first')
    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()
    puppeteer_first_subword_embedding_ref = first_layer[7].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref
            == puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation='last')
    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()
    puppeteer_last_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref
            == puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(sentence=s, pooling_operation='first_last')
    first_token_embedding_ref = torch.cat([first_layer[0], first_layer[0]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()
    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[7], first_layer[9]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref
            == puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation='mean')
    first_token_embedding_ref = calculate_mean_embedding([first_layer[0]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[7], first_layer[8], first_layer[9]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref
            == puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence='Munich',
                                          pooling_operation='first',
                                          layers='1,2,3,4')
    ref_embedding_size = 4 * 768
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(sentence='Berlin',
                                                     pooling_operation='first',
                                                     layers='1,2,3,4',
                                                     use_scalar_mix=True)
    ref_embedding_size = 1 * 768
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size
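# Why the scalar-mix case above yields 1 * 768 rather than 4 * 768: instead of
# concatenating the selected layers, a scalar mix collapses them into a single
# ELMo-style weighted sum. A sketch of the idea (assuming softmax-normalized
# learned weights and a scalar gamma, as in the AllenNLP ScalarMix):
def scalar_mix(layer_tensors, weights, gamma=1.0):
    normed = torch.softmax(torch.tensor(weights), dim=0)
    return gamma * sum(w * t for w, t in zip(normed, layer_tensors))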