def __init__(self, opt, shared=None):
    super(TransformerAgent, self).__init__(opt, shared)

    # Wrap the options to keep most commands identical to the interact.py script
    args = AttrDict(opt)
    self.args = args

    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(__file__)
    self.logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if shared is None:
        self.logger.info("Get pretrained model and tokenizer")
        if args.model_checkpoint == "":
            args.model_checkpoint = download_pretrained_model()

        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        if self.args.eval_type == "hits@1":
            self.model_checkpoint = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_checkpoint)
        else:
            self.model_checkpoint = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
        self.model_checkpoint.to(args.device)
        self.model_checkpoint.eval()

        self.logger.info("Build BPE prefix dictionary")
        convai_dict = build_dict()
        assert len(convai_dict) == 19304
        self.prefix2words = self.get_prefix2words(convai_dict)
    else:
        self.model_checkpoint = shared['model']
        self.tokenizer = shared['tokenizer']
        self.prefix2words = shared['prefix2words']
    # self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)

    self.persona = []
    self.history = []
    self.labels = []
    self.reset()
def __init__(self, config: Config, embed_dim: int, *args, **kwargs) -> None:
    super().__init__(config)
    self.representation_dim = embed_dim

    self.gptmode = 'gpt2'
    # self.gptmode = 'openai-gpt'
    if self.gptmode == 'gpt2':
        self.model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
        self.model.resize_token_embeddings(self.model.config.vocab_size + config.num_special_tokens)
    else:
        self.model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
        self.model.set_num_special_tokens(len(SPECIAL_TOKENS))

    # Decoding hyperparameters
    self.temperature = 0.9
    self.top_k = 0
    self.top_p = 0.7
    self.min_length = 1
    self.max_length = 300
    self.no_sample = True
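# The temperature / top_k / top_p fields above are decoding hyperparameters; the
# generation loop that consumes them is not part of this snippet. Below is a
# minimal sketch of the standard top-k / nucleus filter such settings usually
# feed (modelled on the widely used `top_filtering` helper; its presence here
# is an assumption, not part of the original class):
import torch
import torch.nn.functional as F

def top_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    """Filter a 1D tensor of next-token logits with top-k and/or nucleus (top-p) filtering."""
    if top_k > 0:
        # Remove all tokens whose logit is below the k-th largest logit
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        # Keep the smallest set of tokens whose cumulative probability exceeds top_p
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift right so the first token above the threshold is also kept
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits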
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model.
    # This loading function also adds new tokens and embeddings, called `special tokens`.
    # These new embeddings will be fine-tuned on the RocStories dataset.
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """Tokenize and encode a nested object"""
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
                       for dataset in encoded_datasets
                       for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare input tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        config = model.config
        torch.save(model_to_save.state_dict(), output_model_file)

        # Load a trained model that you have fine-tuned
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTDoubleHeadsModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
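# The `accuracy` helper called in the eval loop above is not defined in this
# excerpt. A minimal sketch, assuming it returns the *count* of correct argmax
# predictions in a batch (it is summed, then divided by nb_eval_examples above):
import numpy as np

def accuracy(out, labels):
    # Count how many multiple-choice predictions match the gold labels
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)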
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer

model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

# We will use 5 special tokens:
# - <bos> to indicate the start of the sequence
# - <eos> to indicate the end of the sequence
# - <speaker1> to indicate the beginning and the tokens of an utterance from the user
# - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
# - <pad> as a padding token to build batches of sequences
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]

# We can add these special tokens to the vocabulary and the embeddings of the model:
tokenizer.set_special_tokens(SPECIAL_TOKENS)
model.set_num_special_tokens(len(SPECIAL_TOKENS))

################################################################################

from itertools import chain

# Let's define our contexts and special tokens
persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]
bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"

def build_inputs(persona, history, reply):
    # (The original excerpt stops at this signature; the body below is a minimal
    # completion following the delimiter scheme described in the comments above.)
    # Build our sequence by adding delimiters and concatenating
    sequence = [[bos] + list(chain(*persona))] + history + [reply + [eos]]
    sequence = [sequence[0]] + [[speaker2 if (len(sequence) - i) % 2 else speaker1] + s
                                for i, s in enumerate(sequence[1:])]
    # Build our word, segment and position inputs from the sequence
    words = list(chain(*sequence))                 # word tokens
    segments = [speaker2 if i % 2 else speaker1    # segment tokens
                for i, s in enumerate(sequence) for _ in s]
    position = list(range(len(words)))             # position tokens
    return words, segments, position, sequence
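# A short usage sketch (hedged; not part of the original excerpt): build the
# inputs for the toy example above and map word and segment tokens to
# vocabulary ids before feeding the model.
words, segments, position, sequence = build_inputs(persona, history, reply)
# `words` interleaves persona, history and reply with the special tokens;
# `segments` repeats <speaker1>/<speaker2> for every token of each utterance.
words = tokenizer.convert_tokens_to_ids(words)
segments = tokenizer.convert_tokens_to_ids(segments)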
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--answer_only", default=False, action='store_true',
                        help="Whether to run with answers only (blank out question).")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--load_model_from", default=None, type=str,
                        help="The saved model file to load before doing any training or eval "
                             "(if both --do_train and --do_eval are specified, the saved model will be loaded, "
                             "then trained, then the trained model will be evaluated).")
    parser.add_argument('--train_filename', type=str, default='train.csv',
                        help="Filename to load train data from (relative to data_dir)")
    parser.add_argument('--eval_filename', type=str, default='val.csv',
                        help="File to load eval data from (relative to data_dir)")
    parser.add_argument('--data_format', type=str, choices=['swag', 'codah'], default='swag',
                        help="Format of the train and eval files (original SWAG CSV format vs our TSV format)")
    parser.add_argument('--model_labels_save_filename', type=str, default='model_labels.json',
                        help="JSON file to save model outputs/labels to (relative to output_dir)")
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=32)
    parser.add_argument('--eval_batch_size', type=int, default=8)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=8,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.do_eval and (not args.do_train) and args.load_model_from is None:
        args.load_model_from = os.path.join(args.output_dir, 'pytorch_model.bin')

    # Load tokenizer and model.
    # This loading function also adds new tokens and embeddings, called `special tokens`.
    # These new embeddings will be fine-tuned on the SWAG/CODAH data.
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    config = model.config
    if args.load_model_from:
        model_state_dict = torch.load(args.load_model_from)
        model = OpenAIGPTDoubleHeadsModel(config)
        model.load_state_dict(model_state_dict)
    model.to(device)

    # Load and encode the datasets
    logger.info("Loading datasets...")
    datasets = []
    dataset_keys = dict()
    if args.do_train:
        train_dataset = read_swag_examples(os.path.join(args.data_dir, args.train_filename),
                                           is_training=True,
                                           answer_only=args.answer_only,
                                           data_format=args.data_format)
        train_dataset = [EncodedSwagExample(ex, tokenizer)
                         for ex in tqdm(train_dataset, desc='Encoding train')]
        dataset_keys['train'] = len(datasets)
        datasets.append(train_dataset)

    if args.do_eval:
        eval_dataset = read_swag_examples(os.path.join(args.data_dir, args.eval_filename),
                                          is_training=True,
                                          answer_only=args.answer_only,
                                          data_format=args.data_format)
        eval_dataset = [EncodedSwagExample(ex, tokenizer)
                        for ex in tqdm(eval_dataset, desc='Encoding eval')]
        dataset_keys['eval'] = len(datasets)
        datasets.append(eval_dataset)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(swagex.context_tokens[:max_length])
                       + len(swagex.start_ending_tokens[:max_length])
                       + max(len(ending[:max_length]) for ending in swagex.endings_tokens) + 3
                       for dataset in datasets for swagex in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
    print('---')
    print('Input length: {}\n'.format(input_length))
    print('---')

    # Prepare input tensors and dataloaders
    tensor_datasets = pre_process_datasets(datasets, input_length, max_length, *special_tokens_ids)
    if args.do_train:
        train_tensor_dataset = tensor_datasets[dataset_keys['train']]
    if args.do_eval:
        eval_tensor_dataset = tensor_datasets[dataset_keys['eval']]

    # Prepare optimizer
    if args.do_train:
        train_data = TensorDataset(*train_tensor_dataset)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        # num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
        num_train_optimization_steps = int(
            len(train_data) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_steps += 1
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

    # Save a trained model
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_eval:
        eval_data = TensorDataset(*eval_tensor_dataset)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Load a trained model that you have fine-tuned
        if args.do_train:
            model_state_dict = torch.load(output_model_file)
            model = OpenAIGPTDoubleHeadsModel(config)
            model.load_state_dict(model_state_dict)
            model.to(device)
        model.eval()

        all_model_outputs = []
        data_index = 0
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            for i in range(input_ids.size(0)):
                output_obj = dict()
                output_obj['logits'] = [float(x) for x in mc_logits[i]]
                output_obj['true_label'] = int(mc_labels[i])
                output_obj['model_label'] = int(np.argmax(mc_logits[i]))
                output_obj['swag_data'] = datasets[dataset_keys['eval']][data_index].raw_example.to_dict()
                all_model_outputs.append(output_obj)
                data_index += 1

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        with open(os.path.join(args.output_dir, args.model_labels_save_filename), 'w') as f:
            json.dump(all_model_outputs, f)
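# `pre_process_datasets` is not shown in any of the scripts above. Below is a
# hedged sketch of the two-candidate (ROCStories-style) variant, modelled on
# the helper of the same name in the original run_openai_gpt.py example; the
# SWAG/CODAH and CSQA scripts would generalize the choice dimension from 2 to
# 4 or 3 candidates respectively.
import numpy as np
import torch

def pre_process_datasets(encoded_datasets, input_len, cap_length,
                         start_token, delimiter_token, clf_token):
    """Build (input_ids, mc_token_ids, lm_labels, mc_labels) tensors per dataset."""
    tensor_datasets = []
    for dataset in encoded_datasets:
        n_batch = len(dataset)
        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
        mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
        lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)  # -1 is ignored by the LM loss
        mc_labels = np.zeros((n_batch,), dtype=np.int64)
        for i, (story, cont1, cont2, mc_label) in enumerate(dataset):
            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
            input_ids[i, 0, :len(with_cont1)] = with_cont1
            input_ids[i, 1, :len(with_cont2)] = with_cont2
            mc_token_ids[i, 0] = len(with_cont1) - 1  # position of the _classify_ token
            mc_token_ids[i, 1] = len(with_cont2) - 1
            # The double-heads model is assumed to shift lm_labels internally
            lm_labels[i, 0, :len(with_cont1)] = with_cont1
            lm_labels[i, 1, :len(with_cont2)] = with_cont2
            mc_labels[i] = mc_label
        all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
    return tensor_datasets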
epochs = 10
max_len = 100
batch_size = 128
learning_rate = 6.25e-5
warmup_proportion = 0.002
max_grad_norm = 1
weight_decay = 0.01
train_path = '/data/users/kyle.shaffer/dialog_data/cornell_movie/cakechat_model/corpora_processed/train_no_tok.txt'
valid_path = '/data/users/kyle.shaffer/dialog_data/cornell_movie/cakechat_model/corpora_processed/valid_no_tok.txt'
model_name = 'openai-gpt'
lm_coef = 0.9

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

special_tokens = ['_start_', '_delimiter_', '_classify_']
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, special_tokens=special_tokens)
special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
model = OpenAIGPTDoubleHeadsModel.from_pretrained(model_name, num_special_tokens=len(special_tokens))

x_train_arr, y_train_arr = load_data_with_tok(train_path, tokenizer, max_len)
x_valid_arr, y_valid_arr = load_data_with_tok(valid_path, tokenizer, max_len)
train_dataloader, valid_dataloader = data_to_torch(x_train_arr, y_train_arr,
                                                   x_valid_arr, y_valid_arr, batch_size)

opt = prep_optimizer(model=model, epochs=epochs, learning_rate=learning_rate,
                     warmup_proportion=warmup_proportion, max_grad_norm=max_grad_norm,
                     weight_decay=weight_decay)
train_model(model=model, train_dataloader=train_dataloader, opt=opt, lm_coef=lm_coef,
            epochs=epochs, valid_dataloader=valid_dataloader)
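# `prep_optimizer` is not defined in this snippet. A plausible sketch, assuming
# it mirrors the OpenAIAdam setup used in the fine-tuning scripts above. The
# real helper presumably derives t_total from the dataloader length and epochs;
# with t_total=-1, OpenAIAdam falls back to a constant learning rate.
from pytorch_pretrained_bert import OpenAIAdam

def prep_optimizer(model, epochs, learning_rate, warmup_proportion,
                   max_grad_norm, weight_decay):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    return OpenAIAdam(grouped_parameters,
                      lr=learning_rate,
                      warmup=warmup_proportion,
                      max_grad_norm=max_grad_norm,
                      weight_decay=weight_decay,
                      t_total=-1)  # assumption: schedule length set elsewhere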
def train():
    EPOCHS = 3
    SAVE_ITR = 3
    LM_COEF = 1.0
    MC_COEF = 1.0
    DEVICE = 0
    FP16 = True
    MAX_NORM = 1.0  # Clipping gradient norm
    GRAD_ACCUM_STEPS = 6  # 4
    train_batch_size = 3  # 4

    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
    add_special_tokens_(model, tokenizer)
    model = model.cuda(DEVICE)

    optimizer = AdamW(model.parameters(), lr=6.25e-5, correct_bias=True)
    if FP16:
        # O1/O2 - https://nvidia.github.io/apex/amp.html
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    train_dataset = torch.load('train_dataset.pyobj')
    train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)

    def update(b, batch):
        model.train()
        batch = [input_tensor.to(DEVICE) for input_tensor in batch]
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids)
        loss = (lm_loss * LM_COEF + mc_loss * MC_COEF) / GRAD_ACCUM_STEPS
        if FP16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), MAX_NORM)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_NORM)
        if b % GRAD_ACCUM_STEPS == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    E, B = EPOCHS, len(train_loader)
    for e in range(EPOCHS):
        for b, batch in enumerate(train_loader):
            loss = update(b, batch)
            if b % (B // 300) == 0:
                print(e, str(b) + '/' + str(B), loss)
                torch.cuda.empty_cache()
        if (e + 1) % SAVE_ITR == 0:
            torch.save(model.state_dict(), '/media/sec/conv_ai_weights/' + str(e + 1) + '.pth')
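# `add_special_tokens_` is not defined in this snippet. A sketch modelled on the
# helper of the same name in HuggingFace's transfer-learning-conv-ai repository;
# the exact token set below is an assumption.
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<speaker1>', '<speaker2>']}

def add_special_tokens_(model, tokenizer):
    """Add special tokens to the tokenizer and resize the model's embeddings if needed."""
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)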
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model.
    # This loading function also adds new tokens and embeddings, called `special tokens`.
    # These new embeddings will be fine-tuned on the CommonsenseQA data.
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """Tokenize and encode a nested object"""
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_csqa_dataset(args.train_dataset)
    print("Splitting train 90-10 into train-dev.")
    dev_dataset = train_dataset[int(len(train_dataset) * 0.9):]
    train_dataset = train_dataset[:int(len(train_dataset) * 0.9)]
    test_dataset = load_csqa_dataset(args.eval_dataset)
    datasets = (train_dataset, dev_dataset, test_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(question[:max_length])
                       + max(len(answer1[:max_length]), len(answer2[:max_length]), len(answer3[:max_length])) + 3
                       for dataset in encoded_datasets
                       for question, answer1, answer2, answer3, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare input tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset = tensor_datasets[0]
    dev_tensor_dataset = tensor_datasets[1]
    test_tensor_dataset = tensor_datasets[2]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    dev_data = TensorDataset(*dev_tensor_dataset)
    dev_sampler = RandomSampler(dev_data)
    dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.train_batch_size)

    test_data = TensorDataset(*test_tensor_dataset)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        best_dev_accuracy = 0
        test_acc_best_dev = 0
        best_dev_epoch = 0
        no_up = 0
        tqdm_epoch = tqdm(range(args.num_train_epochs), desc="Epoch")
        for epoch in tqdm_epoch:
            model.train()
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])

            # train_loss, train_accuracy = evaluate(model, device, train_dataloader, desc="Evaluate Train")
            dev_loss, dev_accuracy = evaluate(model, device, dev_dataloader, desc="Evaluate Dev")
            test_loss, test_accuracy = evaluate(model, device, test_dataloader, desc="Evaluate Test")
            train_loss = tr_loss / nb_tr_steps if args.do_train else None

            if dev_accuracy >= best_dev_accuracy:
                # New best model.
                best_dev_accuracy = dev_accuracy
                test_acc_best_dev = test_accuracy
                best_dev_epoch = epoch + 1
                no_up = 0
                # Save the new best model.
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
            else:
                no_up += 1

            tqdm.write("\t ***** Eval results (Epoch %s) *****" % str(epoch + 1))
            # tqdm.write("\t train_accuracy = %s" % str(train_accuracy))
            tqdm.write("\t dev_accuracy = %s" % str(dev_accuracy))
            tqdm.write("")
            tqdm.write("\t best_dev_accuracy = %s" % str(best_dev_accuracy))
            tqdm.write("\t test_acc_best_dev = %s" % str(test_acc_best_dev))
            tqdm.write("\t best_dev_epoch = %s" % str(best_dev_epoch))
            tqdm.write("\t no_up = %s" % str(no_up))
            tqdm.write("")

            if no_up >= 10:
                tqdm_epoch.close()
                break
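# `evaluate` is not shown in this excerpt. A hedged sketch consistent with how
# it is called above (returns mean multiple-choice loss and accuracy over a
# dataloader); the internals mirror the eval loops of the other scripts in this
# collection.
import torch
from tqdm import tqdm

def evaluate(model, device, dataloader, desc="Evaluating"):
    model.eval()
    eval_loss, n_correct, n_examples, n_steps = 0.0, 0, 0, 0
    for batch in tqdm(dataloader, desc=desc):
        batch = tuple(t.to(device) for t in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels = batch
        with torch.no_grad():
            _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
            _, mc_logits = model(input_ids, mc_token_ids)
        preds = mc_logits.detach().cpu().numpy().argmax(axis=1)
        n_correct += int((preds == mc_labels.cpu().numpy()).sum())
        eval_loss += mc_loss.mean().item()
        n_examples += input_ids.size(0)
        n_steps += 1
    return eval_loss / n_steps, n_correct / n_examples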
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    # if args.model_checkpoint == "":
    #     args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    # logger.info("Sample a personality")
    # personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    # personality = random.choice(personalities)
    # logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    # examplepara = "Evidence of prehistoric activity in the area comes from Ashton Moss – a 107-hectare (260-acre) peat bog – and is the only one of Tameside's 22 Mesolithic sites not located in the hilly uplands in the north east of the borough. A single Mesolithic flint tool has been discovered in the bog,[6][7] along with a collection of nine Neolithic flints.[8] There was further activity in or around the bog in the Bronze Age. In about 1911, an adult male skull was found in the moss; it was thought to belong to the Romano-British period – similar to the Lindow Man bog body – until radiocarbon dating revealed that it dated from 1,320–970 BC"
    # examplepara = tokenizer.encode(examplepara)

    search = Searcher()
    raw_text = input(">>> ")
    start_time = time.time()
    while not raw_text:
        print('Prompt should not be empty!')
        raw_text = input(">>> ")
        start_time = time.time()

    articlelist = search.searchandsplit(raw_text)
    query = tokenizer.encode(raw_text)
    toplist = []
    topresults = 3
    topmcs = [0.01] * topresults
    threshold = 0.01
    with torch.no_grad():
        for arti in articlelist:
            for para in arti:
                txtpara = para
                para = tokenizer.encode(para)
                out_ids, mc = sample_sequence(query, para, tokenizer, model, args, threshold=threshold)
                out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
                mcs = mc.item()
                if mcs > topmcs[0]:
                    toplist.append([mcs, out_text, txtpara])
                    print(f"Answer probability: {mcs}\n")
                    print(out_text)
                    topmcs.append(mcs)
                    topmcs.sort()
                    del topmcs[0]

    sortedresults = sorted(toplist, key=lambda x: x[0], reverse=True)
    toprange = min([topresults, len(sortedresults)])
    for i in range(toprange):
        print("\n\n")
        print(f"Top {i}\n")
        print(f"Answer probability: {sortedresults[i][0]}\n")
        print("Answer: " + sortedresults[i][1] + "\n")
        print("Paragraph for this answer: " + sortedresults[i][2])
    print("Number of paragraphs searched")
    print(len(sortedresults))
    finaltime = time.time() - start_time
    print(f"Processing finished after {finaltime} seconds")