def __init__(
    self,
    pretrained_model_name_or_path: str = "gpt2-medium",
    layers: str = "1",
    pooling_operation: str = "first_last",
    use_scalar_mix: bool = False,
):
    """OpenAI GPT-2 embeddings, as proposed in Radford et al. 2019.
    :param pretrained_model_name_or_path: name or path of OpenAI GPT-2 model
    :param layers: comma-separated list of layers
    :param pooling_operation: defines pooling operation for subwords
    :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
    """
    super().__init__()

    self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path)
    self.model = GPT2Model.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        output_hidden_states=True,
    )
    self.name = pretrained_model_name_or_path
    self.layers: List[int] = [int(layer) for layer in layers.split(",")]
    self.pooling_operation = pooling_operation
    self.use_scalar_mix = use_scalar_mix
    self.static_embeddings = True

    dummy_sentence: Sentence = Sentence()
    dummy_sentence.add_token(Token("hello"))
    embedded_dummy = self.embed(dummy_sentence)
    self.__embedding_length: int = len(
        embedded_dummy[0].get_token(1).get_embedding()
    )
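A minimal usage sketch for the constructor above, assuming it belongs to flair's OpenAIGPT2Embeddings class (as in the tests further down); the import path and sentence are illustrative only.

from flair.data import Sentence
from flair.embeddings import OpenAIGPT2Embeddings

# Embed a sentence with the last layer and "first_last" subword pooling.
embeddings = OpenAIGPT2Embeddings(
    pretrained_model_name_or_path="gpt2-medium",
    layers="1",
    pooling_operation="first_last",
)
sentence = Sentence("Berlin and Munich are cities .")
embeddings.embed(sentence)
for token in sentence:
    print(token.text, token.embedding.shape)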
def __init__(self, config):
    super(GPT2ClassificationModel, self).__init__(config)
    self.transformer = GPT2Model(config)
    self.classifier1 = torch.nn.Linear(config.n_embd, config.num_labels)
    self.dropout = torch.nn.Dropout(config.summary_first_dropout)
    self.loss_fct = torch.nn.CrossEntropyLoss()
    self.init_weights()
def __init__(self, model_path):
    super(OnmtGPT2Encoder, self).__init__()
    config = GPT2Config.from_json_file(os.path.join(model_path, "config.json"))
    pretrained_dict = os.path.join(model_path, "pytorch_model.bin")
    if os.path.exists(pretrained_dict):
        model = GPT2Model.from_pretrained(
            pretrained_model_name_or_path=pretrained_dict, config=config
        )
        print("init GPT2 model with {} weights".format(len(model.state_dict())))
    else:
        model = GPT2Model(config)
    model.wte = expandEmbeddingByN(model.wte, 4)
    self.encoder = model
    # print(model)
    print("***" * 20)
def __init__(self, config):
    super(AblationLongGPT2, self).__init__(config)
    self.sequence_len = config.sequence_len
    self.transformer = GPT2Model(config)
    self.dropout = torch.nn.Dropout(config.summary_first_dropout)
    self.classifier1 = torch.nn.Linear(
        config.n_embd * self.sequence_len, config.num_labels
    )
    self.loss_fct = torch.nn.CrossEntropyLoss()
    self.init_weights()
def __init__(self, config):
    super(GPT2ClassHeadsModel, self).__init__(config)
    self.transformer = GPT2Model(config)
    self.classifier = nn.Linear(config.n_embd, 2)
    # self.classifier = nn.Sequential(nn.Linear(config.n_embd, 768), nn.ReLU(), nn.Dropout(p=0.2),
    #                                 nn.Linear(768, 2))
    # self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.init_weights()
def __init__(self, chunck_size=64, max_length=35, device=torch.device('cuda:0')):
    super(GPT2Client, self).__init__()
    self.chunck_size = chunck_size
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.max_length = max_length

    # load the model
    self.model = GPT2Model.from_pretrained('gpt2')
    self.model.eval()
    self.device = device

    # move model to device
    self.model.to(self.device)
def __init__(self, vocab_size, device):
    super().__init__()
    self.hidden_size = 768
    self.gpt2model = GPT2Model.from_pretrained('gpt2')
    self.gpt2model.resize_token_embeddings(vocab_size)
    # freeze the GPT-2 encoder so only downstream layers are trained
    for param in self.gpt2model.parameters():
        param.requires_grad = False
    self.device = device
    self.to(device)
def __init__(self, config):
    super(GPT2_adverse, self).__init__(config)
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(2 * config.n_embd, config.vocab_size, bias=False)
    self.pos_head_norm = nn.Linear(config.n_embd, config.pos_vocab_size, bias=True)
    self.pos_head_adv = nn.Linear(config.n_embd, config.pos_vocab_size, bias=True)
    self.syn_layer = nn.Linear(config.n_embd, config.n_embd, bias=True)
    self.sem_layer = nn.Linear(config.n_embd, config.n_embd, bias=True)
    self.apply(self.init_weights)
def __init__(self, config):
    super(AttentionLongGPT2, self).__init__(config)
    self.sequence_len = config.sequence_len
    self.transformer = GPT2Model(config)
    self.dropout = torch.nn.Dropout(config.summary_first_dropout)
    self.classifier1 = torch.nn.Linear(config.n_embd * 9, config.num_labels)
    self.attention1 = torch.nn.Linear(self.sequence_len, 64)
    self.attention2 = torch.nn.Linear(64, 128)
    self.attention3 = torch.nn.Linear(128 + config.n_embd, 2 * config.n_embd)
    self.leaky = torch.nn.LeakyReLU(0.2)
    self.att = NewAttention(config.n_embd)
    self.loss_fct = torch.nn.CrossEntropyLoss()
    self.init_weights()
def create_and_check_gpt2_model(self, config, input_ids, head_mask, token_type_ids, *args):
    model = GPT2Model(config=config)
    model.eval()

    model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
    model(input_ids, token_type_ids=token_type_ids)
    sequence_output, presents = model(input_ids)

    result = {
        "sequence_output": sequence_output,
        "presents": presents,
    }
    self.parent.assertListEqual(
        list(result["sequence_output"].size()),
        [self.batch_size, self.seq_length, self.hidden_size],
    )
    self.parent.assertEqual(len(result["presents"]), config.n_layer)
def __init__(self, gpt2_model, language, name, loi, cuda=False):
    super(GPT2, self).__init__()
    # Load pre-trained model tokenizer (vocabulary)
    # Crucially, do not do basic tokenization; PTB is tokenized. Just do wordpiece tokenization.
    if gpt2_model not in ['small', 'medium']:
        raise ValueError("GPT2 model must be small or medium")
    self.model = GPT2Model.from_pretrained(
        'gpt2{}'.format('' if gpt2_model == 'small' else '-medium'),
        output_hidden_states=True)
    self.tokenizer = GPT2Tokenizer.from_pretrained(
        'gpt2{}'.format('' if gpt2_model == 'small' else '-medium'))
    self.language = language
    self.LAYER_COUNT = parameters[gpt2_model]['LAYER_COUNT']
    self.FEATURE_COUNT = parameters[gpt2_model]['FEATURE_COUNT']
    self.name = name
    self.loi = np.array(loi) if loi else np.arange(
        parameters[gpt2_model]['LAYER_COUNT'])  # loi: layers of interest
    self.cuda = cuda
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
    # Construct model
    if gpt2_config_file == "":
        config = GPT2Config()
    else:
        config = GPT2Config.from_json_file(gpt2_config_file)
    model = GPT2Model(config)

    # Load weights from numpy
    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(config.to_json_string())
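A possible invocation sketch for the converter above; the checkpoint and output paths are hypothetical placeholders, and an empty config path falls back to the default GPT2Config as in the function body.

# Hypothetical paths -- adjust to wherever the TensorFlow checkpoint actually lives.
convert_gpt2_checkpoint_to_pytorch(
    gpt2_checkpoint_path="./models/117M",      # directory with the TF checkpoint files
    gpt2_config_file="",                       # empty string -> default GPT2Config()
    pytorch_dump_folder_path="./gpt2-pytorch",
)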
import torch
import torch.nn as nn
from pytorch_transformers import GPT2Model, GPT2LMHeadModel, GPT2Tokenizer
from torchbench.language_modelling import WikiText103

new_model = GPT2Model.from_pretrained('gpt2-large')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')


def model_output_transform(output, target, model):
    n_embd = 1280
    vocab_size = 50257
    lm_head = nn.Linear(n_embd, vocab_size, bias=False).cuda()
    # Tie the LM head to the transformer's token embedding matrix.
    if model.config.torchscript:
        lm_head.weight = nn.Parameter(model.wte.weight.clone())
    else:
        lm_head.weight = model.wte.weight
    hidden_states = output[0]
    lm_logits = lm_head(hidden_states)
    return lm_logits


WikiText103.benchmark(
    model=new_model,
    context_length=1024,
    encoder=tokenizer,
    model_output_transform=model_output_transform,
    paper_model_name='GPT-2 Large',
    paper_pwc_id='language-models-are-unsupervised-multitask')
def test_gpt2_embeddings():
    gpt_model: str = "gpt2-medium"

    tokenizer = GPT2Tokenizer.from_pretrained(gpt_model)
    model = GPT2Model.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<|endoftext|>" + s + "<|endoftext|>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #         0            1      2      3       4          5      6     7       8      9       10    11    12      13     14       15
    #
    # '<|endoftext|>', 'Ber', 'lin', 'Ġand', 'ĠMunich', 'Ġhave', 'Ġa', 'Ġlot', 'Ġof', 'Ġpupp', 'ete', 'er', 'Ġto', 'Ġsee', 'Ġ.', '<|endoftext|>'
    #                     \    /       |        |          |      |     |       |        \      |     /      |      |      |
    #                     Berlin      and     Munich      have    a    lot      of        puppeteer          to    see     .
    #
    #                       0          1         2          3     4     5       6             7               8     9     10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = OpenAIGPT2Embeddings(
            model=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (
        puppeteer_first_subword_embedding_ref
        == puppeteer_first_subword_embedding_actual
    )

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    # First token is split into two subwords.
    # As we use "last" as pooling operation, we consider the last subword as "first token" here
    first_token_embedding_ref = first_layer[2].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[11].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (
        puppeteer_last_subword_embedding_ref
        == puppeteer_last_subword_embedding_actual
    )

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[1], first_layer[2]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[9], first_layer[11]]
    ).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (
        puppeteer_first_last_subword_embedding_ref
        == puppeteer_first_last_subword_embedding_actual
    )

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding(
        [first_layer[1], first_layer[2]]
    ).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[9], first_layer[10], first_layer[11]]
    ).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (
        puppeteer_mean_subword_embedding_ref
        == puppeteer_mean_subword_embedding_actual
    )

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(
        sentence="Munich", pooling_operation="first", layers="1,2,3,4"
    )

    ref_embedding_size = 4 * 1024
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * 1024
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
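The test above relies on a calculate_mean_embedding helper that is not shown here; a minimal sketch, assuming it simply averages the subword vectors element-wise:

from typing import List
import torch


def calculate_mean_embedding(subword_embeddings: List[torch.Tensor]) -> torch.Tensor:
    # Stack the per-subword vectors and average them along the subword axis.
    return torch.stack(subword_embeddings, dim=0).mean(dim=0)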
import csv
from random import shuffle

import numpy as np
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import VectorizerMixin
import torch
from pytorch_transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2Model.from_pretrained('gpt2-medium')

import eli5
from eli5.lime import TextExplainer

positives = []
negatives = []
rowcutoff = 10000

with open('bset_automl_2.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    index = -1
    for line in csv_reader:
        # skipping header row
        index += 1
        if index > 0:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode some inputs
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)

# Convert inputs to PyTorch tensors
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])

# Load pre-trained model (weights)
model = GPT2Model.from_pretrained('gpt2')
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor_1 = tokens_tensor_1.to('cuda')
tokens_tensor_2 = tokens_tensor_2.to('cuda')
model.to('cuda')

# Predict hidden states features for each layer
with torch.no_grad():
    hidden_states_1, past = model(tokens_tensor_1)
    print(len(hidden_states_1))
    print(hidden_states_1[-1].size())
    # past can be used to reuse precomputed hidden states in subsequent predictions
    # (see the beam-search examples in the run_gpt2.py example).
    # hidden_states_2, past = model(tokens_tensor_2, past=past)
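The commented-out line above hints at incremental decoding; a short sketch of reusing past, assuming the pytorch_transformers GPT2Model forward pass, which accepts a past keyword and returns (hidden_states, presents):

with torch.no_grad():
    # First segment: the forward pass also returns the cached key/value states as `past`.
    hidden_states_1, past = model(tokens_tensor_1)
    # Second segment: feed only the new tokens together with the cached states,
    # so attention can look back at the first segment without recomputing it.
    hidden_states_2, past = model(tokens_tensor_2, past=past)
    print(hidden_states_2.size())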
print(tokenized_text_1)
print(indexed_tokens1)
print(tokens_tensor_1)

print(tokenized_text_2)
print(indexed_tokens2)
print(tokens_tensor_2)

"""
print("Encode:")
text = "What is the fastest car in the "
indexed_tokens = tokenizer.encode(text)

# Convert indexed tokens in a PyTorch tensor
tokens_tensor = torch.tensor([indexed_tokens])
print(indexed_tokens)
"""

encoder = GPT2Model.from_pretrained('gpt2')

with torch.no_grad():
    last_hidden_states_1, past = encoder(tokens_tensor_1)
    print(last_hidden_states_1.size())

with torch.no_grad():
    last_hidden_states_2, past = encoder(tokens_tensor_2)
    print(last_hidden_states_2.size())

# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Set the model in evaluation mode to deactivate the DropOut modules
model.eval()

# Predict all tokens
def test_model_from_pretrained(self):
    cache_dir = "/tmp/pytorch_transformers_test/"
    for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
        shutil.rmtree(cache_dir)
        self.assertIsNotNone(model)
import torch
from pytorch_transformers import GPT2Model
import numpy as np

sequence_length = 3
input_sequence = torch.tensor(np.zeros(sequence_length), dtype=torch.long).unsqueeze(0)
GPT2Model.from_pretrained('gpt2')(input_sequence)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, help="pretrained_model.")
    parser.add_argument("--model_option", type=str, default='gpt-2-2', help="pretrained_model.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_probe", action='store_true', help="Whether to run probing.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--data_dir', type=str,
                        default='/home/xiongyi/dataxyz/repos/SemSynLSTM/word_language_model/data/wikitext-2/')
    parser.add_argument('--seed', type=int, default=12)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    timenow = datetime.datetime.now().strftime("%b%d%H%M")
    model_option = 'gpt_2_2'
    outdir = model_option + timenow
    args = parser.parse_args(['--output_dir', outdir, '--do_probe',
                              '--num_train_epochs', '10', '--model_option', model_option])
    # args = parser.parse_args(['--output_dir', './tmp', '--do_eval', '--model_name', 'gpt2'])
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    # special_tokens = ['_start_', '_delimiter_']
    # special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)

    # Compute the max input length for the Transformer
    input_length = 128
    data_dir = '../SemSynLSTM/word_language_model/data/wikitext-2/' if args.data_dir is None else args.data_dir
    train_set, val_set, test_set, dictionary, pos_dictionary = load_tokenize_and_batchify(data_dir, input_length)

    # Prepare input tensors and dataloaders
    train_data = TensorDataset(*train_set)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)
    eval_data = TensorDataset(*val_set)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=32)

    config = GPT2Config(n_positions=256, n_ctx=256, n_layer=8, n_head=8, n_embd=256)
    config.vocab_size = dictionary.__len__()
    config.pos_vocab_size = pos_dictionary.__len__()
    config.n_ctx = input_length
    config.n_positions = input_length
    model1 = GPT2Model(config=config)  # TODO: GPTWithPOSPredicting
    model2 = GPT2Model(config=config)  # TODO: Wrap 2 Transformers together and add a LM head
    model = WrapperLMHead(model1, model2, config, args.model_option)
    model.to(device)
    # TODO: Load and encode the datasets
    logger.info("Encoding dataset...")

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          # max_grad_norm=args.max_grad_norm,
                          weight_decay=args.weight_decay)
                          # t_total=num_train_optimization_steps)

    if args.do_train:
        train_results = {}
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            # eval on eval set
            model.eval()
            nb_eval_steps, nb_eval_examples = 0, 0
            log_probs_sum = 0
            perp = 0.0
            average_loss = np.array([0.0, 0.0, 0.0])
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch
                with torch.no_grad():
                    loss, loss_lm, loss_pos = model(input_ids, pos_ids=input_pos_ids, labels=input_ids)[0]
                    loss = loss.detach().cpu().numpy()
                    loss_lm = loss_lm.detach().cpu().numpy()
                    loss_pos = loss_pos.detach().cpu().numpy()
                    perp_batch = np.exp(loss_lm)
                    perp += perp_batch
                    average_loss += np.array([loss, loss_lm, loss_pos])
                nb_eval_steps += 1
            perp /= nb_eval_steps
            average_loss /= nb_eval_steps
            print('loss, loss_lm, loss_pos', average_loss, 'perp ', perp, 'epoch ', epoch)
            train_results[epoch] = (perp, average_loss)

            model.train()
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch
                loss = model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[0][0]
                # breakpoint()
                # loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e}".format(exp_average_loss)

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        # tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2LMHeadModel.from_pretrained(args.output_dir)
        # tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)
        print(train_results)

    if args.do_eval:
        model.eval()
        nb_eval_steps, nb_eval_examples = 0, 0
        log_probs_sum = 0
        perp = 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_pos_ids = batch

            with torch.no_grad():
                loss = model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[0].detach().cpu().numpy()
                perp_batch = np.exp(loss)
                perp += perp_batch
            nb_eval_steps += 1

        perp /= nb_eval_steps
        # perp_word = perp / 128
        print(perp)
        result = {'eval_perp': perp}
logger.info("***** Eval results *****") logger.info("'eval_perp' = %s", str(result['eval_perp'])) # output_eval_file = os.path.join(args.output_dir, "eval_results.txt") # with open(output_eval_file, "w") as writer: # logger.info("***** Eval results *****") # for key in sorted(result.keys()): # logger.info(" %s = %s", key, str(result[key])) # writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_probe: ##load model (how???) model_path = '/home/xiongyi/dataxyz/repos/pytorch-pretrained-BERT/examples/gpt2_2_jul22/pytorch_model.bin_double' model.load_state_dict(torch.load(model_path)) ##Add a mlp to the representation probe_model = ProbeModel(model, config) probe_model.to(device) ##train and eval all_param = list(probe_model.named_parameters()) param_probe = [(n, p) for n, p in all_param if 'probe_cls' in n] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_probe if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_probe if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, # max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) # t_total=num_train_optimization_steps) train_results = {} nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None for epoch in trange(int(args.num_train_epochs), desc="Epoch"): ###eval on eval set probe_model.eval() nb_eval_steps, nb_eval_examples = 0, 0 average_loss = 0 average_acc = 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch with torch.no_grad(): #breakpoint() loss = probe_model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[0].detach().cpu().numpy() pos_logits = probe_model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[1].detach().cpu().numpy() predicted_labels = np.argmax(pos_logits, -1) correct_rate = np.mean(predicted_labels == input_pos_ids.detach().cpu().numpy()[:,1:]) average_acc += correct_rate average_loss += loss nb_eval_steps += 1 average_loss /= nb_eval_steps average_acc /= nb_eval_steps print('loss', average_loss,' acc_rate ', average_acc, ' epoch ', epoch) train_results[epoch] = (average_loss, average_acc) probe_model.train() tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch loss = probe_model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[0] # breakpoint() # loss = args.lm_coef * losses[0] + losses[1] loss.backward() optimizer.step() optimizer.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item() nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e}".format(exp_average_loss)
from pytorch_transformers import GPT2Model, GPT2Tokenizer
import numpy as np
import torch
import math
import torch.nn as nn

sequence_length = 3

model = GPT2Model.from_pretrained("gpt2")

input_ids = torch.tensor(np.zeros(sequence_length), dtype=torch.long)
position_ids = torch.tensor(np.arange(sequence_length).astype(np.float), dtype=torch.long)

# Output of the embeddings addition
embeddings = model.wpe(position_ids) + model.wte(input_ids)

# Output of the first Attention LayerNorm layer
ln_1 = model.h[0].ln_1(embeddings)

# Output of the attention dense layer for Q, K, V
c_attn = model.h[0].attn.c_attn(ln_1).reshape((-1, sequence_length, 2304))

# Splitting the QKV vector
query, key, value = c_attn.split(model.h[0].attn.split_size, dim=2)

# Splitting the heads
split_query = model.h[0].attn.split_heads(query)
split_key = model.h[0].attn.split_heads(key, k=True)
split_value = model.h[0].attn.split_heads(value)

# QK Matmul
w = torch.matmul(split_query, split_key)
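A hedged continuation of the same walkthrough, following the pytorch_transformers GPT-2 attention code: scale by the square root of the per-head dimension, apply the causal mask stored in the attn.bias buffer, take the softmax, then project back through merge_heads and c_proj.

# Scale by the per-head dimension (applied when the attention module is built with scale=True)
w = w / math.sqrt(split_value.size(-1))

# Causal mask: the `bias` buffer is a lower-triangular matrix of ones
nd, ns = w.size(-2), w.size(-1)
causal_mask = model.h[0].attn.bias[:, :, ns - nd:ns, :ns]
w = w * causal_mask - 1e4 * (1 - causal_mask)

# Attention weights and weighted sum over the value vectors
w = nn.Softmax(dim=-1)(w)
attn_output = torch.matmul(w, split_value)

# Merge the heads back and apply the output projection
attn_output = model.h[0].attn.merge_heads(attn_output)
attn_output = model.h[0].attn.c_proj(attn_output)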
def test_gpt2_embeddings():
    gpt_model = 'gpt2-medium'

    tokenizer = GPT2Tokenizer.from_pretrained(gpt_model)
    model = GPT2Model.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s = 'Berlin and Munich have a lot of puppeteer to see .'

    with torch.no_grad():
        tokens = tokenizer.tokenize('<|endoftext|>' + s + '<|endoftext|>')
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[-1]
        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    def embed_sentence(sentence: str, pooling_operation, layers: str = '1',
                       use_scalar_mix: bool = False) -> Sentence:
        embeddings = OpenAIGPT2Embeddings(
            pretrained_model_name_or_path=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix)
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation='first')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()
    puppeteer_first_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_first_subword_embedding_ref == puppeteer_first_subword_embedding_actual

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation='last')
    first_token_embedding_ref = first_layer[2].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()
    puppeteer_last_subword_embedding_ref = first_layer[11].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(sentence=s, pooling_operation='first_last')
    first_token_embedding_ref = torch.cat([first_layer[1], first_layer[2]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()
    puppeteer_first_last_subword_embedding_ref = torch.cat([first_layer[9], first_layer[11]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_first_last_subword_embedding_ref == puppeteer_first_last_subword_embedding_actual

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation='mean')
    first_token_embedding_ref = calculate_mean_embedding([first_layer[1], first_layer[2]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[9], first_layer[10], first_layer[11]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual

    # Embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence='Munich', pooling_operation='first',
                                          layers='1,2,3,4')
    ref_embedding_size = 4 * 1024
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size

    # Embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(sentence='Berlin', pooling_operation='first',
                                                     layers='1,2,3,4', use_scalar_mix=True)
    ref_embedding_size = 1 * 1024
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size