def chance_reply(history: List[Tuple[bool, str]],
                 tokenizer: OpenAIGPTTokenizer,
                 model: OpenAIGPTDoubleHeadsModel, device):
    """Returns the model's probability that the next message is sent by the user."""
    model.to(device)
    # Build the network inputs: a bos token, then each message prefixed with
    # its speaker token, with token types tracking the current speaker.
    inputs = [bos]
    token_types = [speaker_other if len(history) > 0 and not history[0][0] else speaker_self]
    for user, text in history:
        inputs.append(speaker_self if user else speaker_other)
        token_types.append(speaker_self if user else speaker_other)
        for token in tokenizer.tokenize(text):
            inputs.append(token)
            token_types.append(speaker_self if user else speaker_other)
    cutoff = 500
    input_ids = tokenizer.convert_tokens_to_ids(inputs)
    token_type_ids = tokenizer.convert_tokens_to_ids(token_types)

    model.eval()
    model_out = model(
        torch.tensor([input_ids[-cutoff:]], dtype=torch.long).to(device),
        token_type_ids=torch.tensor([token_type_ids[-cutoff:]], dtype=torch.long).to(device))
    logits = model_out.logits[0, -1, :] / config["eval"]["temperature"]
    # Restrict the prediction to the two speaker tokens.
    logits = filter_logits(logits, tokenizer, True, whitelist=[speaker_self, speaker_other])
    probs = F.softmax(logits, dim=-1)
    speaker_self_token = tokenizer.convert_tokens_to_ids(speaker_self)
    return probs[speaker_self_token].item()
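
# `filter_logits` is called above (whitelist mode) and in `generate_from_history`
# below (blacklist mode) but is not defined in this section. A minimal sketch
# consistent with both call sites; the exact original signature and behavior
# are assumptions.
def filter_logits(logits, tokenizer, use_whitelist, whitelist=None, blacklist=None):
    """Masks logits so sampling is restricted to (or excludes) the given tokens."""
    if use_whitelist:
        # Keep only the whitelisted tokens; everything else becomes -inf.
        allowed = tokenizer.convert_tokens_to_ids(whitelist or [])
        masked = torch.full_like(logits, float("-inf"))
        masked[allowed] = logits[allowed]
        return masked
    # Blacklist mode: remove the banned tokens, keep the rest.
    banned = tokenizer.convert_tokens_to_ids(blacklist or [])
    logits = logits.clone()
    logits[banned] = float("-inf")
    return logits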
def evaluate_model(model: OpenAIGPTDoubleHeadsModel,
                   test_loader: torch.utils.data.DataLoader,
                   device, num_tests: int = 100):
    num_tests = min(num_tests, len(test_loader))
    print("Evaluating on {} tests".format(num_tests))
    test_num = 0
    mc_correct = 0
    lm_tested = 0
    lm_correct = 0
    for batch in test_loader:
        if test_num == num_tests:
            break
        if test_num % 20 == 0:
            print("Test number {}/{}".format(test_num, num_tests))
        model.eval()
        input_ids = batch["input_ids"].to(device)
        mc_token_ids = batch["mc_token_ids"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        lm_labels = batch["lm_labels"].to(device)
        mc_labels = batch["correct"].to(device)
        try:
            with torch.no_grad():  # no gradients needed during evaluation
                model_output = model(input_ids, token_type_ids=token_type_ids,
                                     mc_token_ids=mc_token_ids)
        except Exception as e:
            print(input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels, sep="\n")
            raise e
        mc_logits = model_output.mc_logits
        mc_guess = torch.topk(mc_logits[0], 1).indices[0].item()
        mc_answer = mc_labels[0].item()
        # Score LM accuracy on the correct candidate only; the logits at
        # position i - 1 predict the token at position i.
        lm_logits = model_output.logits[0][mc_answer]
        lm_answer = lm_labels[0][mc_answer]
        for i in range(len(lm_answer)):
            if lm_answer[i] == -100 or i == 0:
                continue
            guess = torch.topk(lm_logits[i - 1], 1).indices[0].item()
            if guess == lm_answer[i]:
                lm_correct += 1
            lm_tested += 1
        if mc_guess == mc_answer:
            mc_correct += 1
        test_num += 1
    print("MC: {}/{}, LM: {}/{}".format(mc_correct, num_tests, lm_correct, lm_tested))
    return {
        "mc_correct": mc_correct,
        "num_tests": num_tests,
        "lm_correct": lm_correct,
        "lm_tested": lm_tested,
    }
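
# Usage sketch for `evaluate_model`. The batch layout follows the keys accessed
# above; `load_model_and_tokenizer` and `get_data_loader` are the helpers that
# appear elsewhere in this section, and the checkpoint path is hypothetical.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, tokenizer = load_model_and_tokenizer("checkpoints/model.pt")  # hypothetical path
test_loader = get_data_loader(test_dataset, tokenizer, 1, False, 0)  # as in train() below
results = evaluate_model(model, test_loader, device, num_tests=100)
print("MC accuracy: {:.2%}".format(results["mc_correct"] / results["num_tests"]))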
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
    model = OpenAIGPTDoubleHeadsModel(config)
    model.eval()
    loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
    result = {"loss": loss, "lm_logits": lm_logits}
    self.parent.assertListEqual(list(result["loss"].size()), [])
    self.parent.assertListEqual(
        list(result["lm_logits"].size()),
        [self.batch_size, self.seq_length, self.vocab_size])
def load_model_and_tokenizer(file_path: str) -> Tuple[OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer]:
    """Loads a fine-tuned checkpoint on top of the base `openai-gpt` weights.

    The embedding matrix must be resized for the special tokens before the
    saved state dict can be loaded, or the shapes will not match."""
    model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)
    model.load_state_dict(torch.load(file_path))
    return model, tokenizer
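
# Sketch of the `special_tokens` module imported elsewhere in this section.
# Only the names (bos, eos, speaker_self, speaker_other, lsep, pad,
# SPECIAL_TOKENS) appear in the source; the token strings and the dict layout
# are assumptions, chosen to satisfy `tokenizer.add_special_tokens`.
bos = "<bos>"
eos = "<eos>"
speaker_self = "<speaker_self>"
speaker_other = "<speaker_other>"
lsep = "<lsep>"
pad = "<pad>"

# `add_special_tokens` takes a dict of named special tokens; tokens without a
# dedicated slot go under `additional_special_tokens`.
SPECIAL_TOKENS = {
    "bos_token": bos,
    "eos_token": eos,
    "pad_token": pad,
    "additional_special_tokens": [speaker_self, speaker_other, lsep],
}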
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
    model = OpenAIGPTDoubleHeadsModel(config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
    self.parent.assertEqual(result.loss.shape, ())
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model.
    # This loading function also adds new tokens and embeddings, called `special tokens`.
    # These new embeddings will be fine-tuned on the RocStories dataset.
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load and encode the datasets.
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """Tokenize and encode a nested object."""
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer.
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
        for dataset in encoded_datasets
        for story, cont1, cont2, _ in dataset
    )
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare input tensors and dataloaders.
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer.
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()  # step the schedule after the optimizer, not before
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (
                    loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])

    # Save a trained model.
    if args.do_train:
        # Save a trained model, configuration and tokenizer.
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        # If we save using the predefined names, we can load using `from_pretrained`.
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load the trained model and vocabulary that you have fine-tuned.
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(
                    input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
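
# `pre_process_datasets` is called above but not defined in this section. A
# sketch of what it plausibly does, following the standard double-heads input
# layout (two candidates per story: [_start_] story [_delimiter_] ending
# [_classify_]); the fill value -100 matches the ignore index used by
# `evaluate_model` above, and numpy/torch are assumed imported as np/torch.
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
    tensor_datasets = []
    for dataset in encoded_datasets:
        n_batch = len(dataset)
        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
        mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
        lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
        mc_labels = np.zeros((n_batch,), dtype=np.int64)
        for i, (story, cont1, cont2, mc_label) in enumerate(dataset):
            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
            input_ids[i, 0, :len(with_cont1)] = with_cont1
            input_ids[i, 1, :len(with_cont2)] = with_cont2
            # The mc head classifies from the position of the _classify_ token.
            mc_token_ids[i, 0] = len(with_cont1) - 1
            mc_token_ids[i, 1] = len(with_cont2) - 1
            lm_labels[i, 0, :len(with_cont1)] = with_cont1
            lm_labels[i, 1, :len(with_cont2)] = with_cont2
            mc_labels[i] = mc_label
        all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
    return tensor_datasets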
from typing import *

import torch
from transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer
from itertools import chain

from special_tokens import bos, eos, speaker_self, speaker_other, lsep, pad, SPECIAL_TOKENS

model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

# history = [[(True, "hello"), (True, "how"), (True, "are"), (True, "you"), (True, "?")],
#            [(False, "i"), (False, "am"), (False, "fine"), (False, "thanks"), (False, ".")]]
history = [(True, tokenizer.tokenize("hello how are you?")),
           (False, tokenizer.tokenize("i am fine thanks."))]
reply = (True, ["good", "to", "hear", "."])

orig_num_tokens = len(tokenizer.encoder)
print(orig_num_tokens)
num_added_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)
model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)


def build_inputs(history: List[Tuple[bool, List[str]]], reply: Tuple[bool, List[str]]):
    history = history + [reply]
    # Prefix each utterance with its speaker token.
    sequence = list(map(lambda x: [speaker_self if x[0] else speaker_other] + x[1], history))
    # print(sequence)
    sequence[0] = [bos] + sequence[0]
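    # [`build_inputs` is cut off in the source at this point. A plausible
    # continuation, matching the token-type scheme used in
    # `generate_from_history` below and the otherwise-unused `chain` import;
    # this completion is an assumption, not the original code.]
    sequence[-1] = sequence[-1] + [eos]
    words = list(chain(*sequence))
    # Every token in an utterance shares that utterance's speaker token as its
    # token type; the first utterance starts with bos, so skip past it.
    token_types = []
    for seq in sequence:
        speaker = seq[1] if seq[0] == bos else seq[0]
        token_types.extend([speaker] * len(seq))
    return words, token_types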
def generate_from_history(history: List[Tuple[bool, str]],
                          tokenizer: OpenAIGPTTokenizer,
                          model: OpenAIGPTDoubleHeadsModel,
                          device,
                          token_blacklist: Optional[List[str]] = None) -> List[str]:
    """Generates an utterance given the messages preceding it.

    :argument history: a list of (user, message) tuples; `user` is True when
        the sender is the user, and `message` is the raw message string
    :argument tokenizer: the tokenizer
    :argument model: the model
    :argument device: the pytorch device to run on
    :argument token_blacklist: tokens the network must not generate"""
    model.to(device)
    # Build the network inputs.
    output = []
    inputs = [bos]
    token_types = [speaker_other if len(history) > 0 and not history[0][0] else speaker_self]
    for user, text in history:
        inputs.append(speaker_self if user else speaker_other)
        token_types.append(speaker_self if user else speaker_other)
        for token in tokenizer.tokenize(text):
            inputs.append(token)
            token_types.append(speaker_self if user else speaker_other)
    inputs.append(speaker_self)
    token_types.append(speaker_self)
    input_ids = tokenizer.convert_tokens_to_ids(inputs)
    token_type_ids = tokenizer.convert_tokens_to_ids(token_types)

    model.eval()
    eos_token = tokenizer.convert_tokens_to_ids(eos)
    speaker_self_token = tokenizer.convert_tokens_to_ids(speaker_self)
    speaker_other_token = tokenizer.convert_tokens_to_ids(speaker_other)
    cutoff = config["bot"]["max_token_history"]
    for i in range(config["bot"]["token_limit"]):
        model_out = model(
            torch.tensor([input_ids[-cutoff:]], dtype=torch.long).to(device),
            token_type_ids=torch.tensor([token_type_ids[-cutoff:]], dtype=torch.long).to(device))
        logits = model_out.logits[0, -1, :] / config["eval"]["temperature"]
        # Guard against `token_blacklist=None` before concatenating.
        blacklist = [bos, eos, pad] + (token_blacklist or [])
        logits = filter_logits(logits, tokenizer, False, blacklist=blacklist)
        logits = top_p_sample(logits, config["eval"]["top_p"])
        # print("{} -> {}".format(tokenizer.convert_ids_to_tokens(output[-5:]),
        #                         tokenizer.convert_ids_to_tokens(torch.topk(logits, 5)[1])))
        probs = F.softmax(logits, dim=-1)
        prev = torch.multinomial(probs, 1).item()
        input_ids.append(prev)
        token_type_ids.append(speaker_self_token)
        output.append(prev)
        if prev in (speaker_other_token, eos_token):
            break

    # Split the generated tokens back into messages at speaker/eos boundaries.
    output = tokenizer.convert_ids_to_tokens(output)
    current_msg = []
    messages = []
    for i in output:
        if i in (speaker_self, eos, speaker_other):
            messages.append(tokenizer.convert_tokens_to_string(current_msg))
            current_msg = []
        else:
            current_msg.append(i)
    if len(current_msg) > 0:
        messages.append(tokenizer.convert_tokens_to_string(current_msg))
    return messages
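
# `top_p_sample` is referenced above but not defined in this section. A minimal
# nucleus-filtering sketch: it keeps the smallest set of tokens whose
# cumulative probability exceeds `top_p` and masks the rest to -inf, leaving
# the actual draw to `torch.multinomial` in the caller. The original
# implementation's details are an assumption.
def top_p_sample(logits: torch.Tensor, top_p: float) -> torch.Tensor:
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    # Mask tokens once the cumulative probability passes top_p, shifting the
    # mask right so the first token above the threshold is kept, and always
    # keeping at least the most likely token.
    sorted_mask = cumulative_probs > top_p
    sorted_mask[1:] = sorted_mask[:-1].clone()
    sorted_mask[0] = False
    logits = logits.clone()
    logits[sorted_indices[sorted_mask]] = float("-inf")
    return logits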
import argparse  # needed by main() below
import csv
import logging
import math
import os
import random

import ingest
from tqdm import tqdm, trange
from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, AdamW,
                          cached_path, WEIGHTS_NAME, CONFIG_NAME,
                          get_linear_schedule_with_warmup)
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
import transformers
import numpy as np
import torch

logger = logging.getLogger(__name__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = OpenAIGPTDoubleHeadsModel.from_pretrained("log/")
tokenizer = OpenAIGPTTokenizer.from_pretrained("log/")
special_tokens = ['_start_', '_delimiter_', '_classify_']
special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


def load_rocstories_dataset(dataset_path, loadLabel=False):
    """Output a list of tuples (story, 1st continuation, 2nd continuation, label)."""
    with open(dataset_path, encoding='utf_8') as f:
        f = csv.reader(f)
        output = []
        next(f)  # skip the header line
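        # [Truncated in the source. A plausible completion for the Story Cloze
        # CSV layout (id, four story sentences, two candidate endings, 1-based
        # answer); the exact column handling is an assumption.]
        for line in tqdm(f):
            output.append((" ".join(line[1:5]), line[5], line[6],
                           int(line[-1]) - 1 if loadLabel else 0))
    return output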
def train(dataset_path: str):
    device = torch.device(config["train"]["device"])
    print("Device: {}".format(device))
    # device = torch.device("cpu")  # gpu not enough memory :(
    model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
    model.to(device)
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)

    # dataloader = get_data_loader(dataset_path, tokenizer, batch_size=4, shuffle=False, num_workers=1)
    full_dataset = get_dataset(dataset_path, tokenizer)
    assert len(full_dataset) > 0
    train_size = int(len(full_dataset) * config["train"]["train_dataset_proportion"] + 1)
    test_size = len(full_dataset) - train_size
    print("Full dataset has {} dialogs. Splitting into train: {} and test: {}".format(
        len(full_dataset), train_size, test_size))
    train_dataset, test_dataset = random_split(
        full_dataset, [train_size, test_size], torch.Generator().manual_seed(42))
    print(len(train_dataset), len(test_dataset))
    train_loader = get_data_loader(train_dataset, tokenizer, config["train"]["batch_size"], True, 0)
    test_loader = get_data_loader(test_dataset, tokenizer, 1, False, 0)

    lr = config["train"]["learning_rate"]
    print("lr: {}".format(lr))
    optimizer = AdamW(model.parameters(), lr=lr)

    # Init logging.
    start_time = datetime.datetime.now()
    save_path = os.path.join(
        os.path.dirname(__file__),
        "log/log-{}.txt".format(start_time.strftime("%y-%m-%d-%H-%M-%S")))
    print(os.path.dirname(__file__), save_path)
    f = open(save_path, "w+")
    f.close()

    epochs = config["train"]["num_epochs"]
    eval_every = config["train"]["evaluate_interval_iters"]
    num_tests = config["train"]["num_tests"]
    last_model_save = datetime.datetime.now()
    iteration = 0
    for epoch in range(epochs):
        print("Starting epoch {}/{}".format(epoch, epochs))
        for batch in train_loader:
            if iteration % eval_every == 0:
                results = evaluate_model(model, test_loader, device, num_tests)
                add_log(save_path,
                        "test,{0},{1},{2[mc_correct]},{2[num_tests]},{2[lm_correct]},{2[lm_tested]}\n"
                        .format(iteration, epoch, results))
            model.train()
            input_ids = batch["input_ids"].to(device)
            mc_token_ids = batch["mc_token_ids"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            lm_labels = batch["lm_labels"].to(device)
            mc_labels = batch["correct"].to(device)
            try:
                model_output = model(input_ids, token_type_ids=token_type_ids,
                                     mc_token_ids=mc_token_ids, mc_labels=mc_labels,
                                     labels=lm_labels)
            except Exception as e:
                print(input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels, sep="\n")
                raise e
            # print("input_ids: {}\ntoken_type_ids: {}\nmc_token_ids: {}\nlm_labels: {}\nmc_labels: {}"
            #       .format(input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels))
            # print(model_output.loss.item(), model_output.mc_loss.item())
            lm_loss = model_output.loss
            mc_loss = model_output.mc_loss
            loss = lm_loss * config["train"]["lm_coeff"] + mc_loss * config["train"]["mc_coeff"]
            add_log(save_path, "train,{},{},{},{},{}\n".format(iteration, epoch, loss, lm_loss, mc_loss))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), config["train"]["max_norm"])
            optimizer.step()
            optimizer.zero_grad()
            if iteration % 50 == 0:
                print("Time: {} Epoch: {}/{} Iteration: {}/{} Loss: {} ({} {})".format(
                    datetime.datetime.now() - start_time, epoch, epochs, iteration,
                    epochs * (len(train_dataset) // config["train"]["batch_size"]),
                    loss.item(), lm_loss.item(), mc_loss.item()))
            if datetime.datetime.now() - last_model_save > datetime.timedelta(
                    minutes=config["train"]["save_interval_mins"]):
                print("Saving model...")
                torch.save(
                    model.state_dict(),
                    os.path.join(os.path.dirname(__file__),
                                 "checkpoints/model-{}-iter{}.pt".format(
                                     start_time.strftime("%y-%m-%d-%H-%M-%S"), iteration)))
                last_model_save = datetime.datetime.now()
            iteration += 1

    print("Saving model...")
    torch.save(
        model.state_dict(),
        os.path.join(os.path.dirname(__file__),
                     "checkpoints/model-{}-iter{}.pt".format(
                         start_time.strftime("%y-%m-%d-%H-%M-%S"), iteration)))
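
# `add_log` is used by `train` but not defined in this section. A minimal
# sketch under the obvious assumption that it appends one CSV line to the log
# file created above.
def add_log(save_path: str, line: str) -> None:
    with open(save_path, "a") as log_file:
        log_file.write(line)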
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_test", action="store_true", help="Find the theoretical lowest loss.")
    parser.add_argument("--do_save", action="store_true", help="Save the model.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--train_dataset", type=str,
                        default="/cloze_test_val__spring2016 - cloze_test_ALL_val.csv")
    parser.add_argument("--eval_dataset", type=str, default="")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_train_epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=8)
    parser.add_argument("--eval_batch_size", type=int, default=16)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", type=int, default=1)
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--lm_coef", type=float, default=0.9)
    parser.add_argument("--n_valid", type=int, default=374)
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()
    # print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError("At least one of `do_train`, `do_eval` or `do_test` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model.
    # This loading function also adds new tokens and embeddings, called `special tokens`.
    # These new embeddings will be fine-tuned on the RocStories dataset.
    special_tokens = ["_start_", "_delimiter_", "_classify_"]
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load and encode the datasets.
    def tokenize_and_encode(obj):
        """Tokenize and encode a nested object."""
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer.
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
        for dataset in encoded_datasets
        for story, cont1, cont2, _ in dataset
    )
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare input tensors and dataloaders.
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer.
    if args.do_train or args.do_test:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        global optimizer_grouped_parameters
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": args.weight_decay},
            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            # The original wrapped `tqdm_bar` around itself before it was
            # defined; wrap the dataloader and iterate over the bar instead.
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (
                    loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])

    if args.do_test:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        # for _ in (0,):
        #     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lambda x: 1e-2 ** x, -1)
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(train_dataloader, desc="Testing")
        maxloop = 0
        avrgloops = 0
        loop = 0
        prog = ""
        for step, batch in enumerate(tqdm_bar):
            stage = 0
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
            loss = args.lm_coef * losses[0] + losses[1]
            loss.backward()
            lowloss = loss.item()
            tqdm.write("resetting lowloss")
            tqdm_bar.set_description("Testing {} loss:{}".format(loop, lowloss))
            scheduler.step(-1)
            optimizer.step()
            optimizer.zero_grad()
            if loop > maxloop:
                maxloop = loop
            avrgloops += loop
            loop = 0
            newloss = loss.item()
            intloss = math.inf
            oldloss = intloss
            bad = 0
            if math.isnan(loss.item()):
                tqdm_bar.write("beeping NaN")
            # Re-optimize the same batch until the loss stops improving.
            while True:
                tqdm_bar.set_description("Testing {} loss:{}".format(loop, newloss))
                loop = loop + 1
                if intloss < newloss:
                    bad += 1
                    tqdm_bar.write("{} counter productive: {} > {}".format(bad, newloss, intloss))
                    scheduler.step()
                if intloss > lowloss:
                    tqdm_bar.write("this run didn't beat the old loss {}".format(lowloss))
                    stage = 1
                if oldloss == newloss:
                    tqdm_bar.write("\nlooped {} as good as it gets: {}".format(loop, loss))
                    break
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()  # without a step the loop could never change the loss
                optimizer.zero_grad()
                oldloss = intloss
                intloss = newloss
                newloss = loss.item()
                if newloss < lowloss:
                    bad = 0
                    lowloss = newloss
            tr_loss += lowloss
            avrgloops += loop
            exp_average_loss = (
                loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
            )
    # Save a trained model.
    if args.do_train or args.do_save:
        # Save a trained model, configuration and tokenizer.
        model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself
        # If we save using the predefined names, we can load using `from_pretrained`.
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load the trained model and vocabulary that you have fine-tuned.
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(
                    input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
                )

            # [The source is cut off here; this accumulation mirrors the
            # identical eval loop in the first script above.]
            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to("cpu").numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1