def meta_train(args, gold_ratio):
    """ Train the model """
    best_acc = 0.
    best_f1 = 0.
    best_loss_val = 100000
    val_acc_and_f1 = 0.
    best_cm = ""
    fake_acc_and_f1 = 0.
    fake_best_f1 = 0.
    fake_best_acc = 0.
    writer = None

    tokenizer, model = build_model(args)

    g_dataset = load_fake_news(args,
                               tokenizer,
                               evaluate=False,
                               train_path=args.gold_train_path)
    s_dataset = load_fake_news(args,
                               tokenizer,
                               evaluate=False,
                               train_path=args.silver_train_path,
                               is_weak=True,
                               weak_type=args.weak_type)
    val_dataset = load_fake_news(args,
                                 tokenizer,
                                 evaluate=False,
                                 train_path=args.val_path)
    eval_dataset = copy.deepcopy(val_dataset)

    # make a copy of train and test towards similar size as the weak source
    if True:
        max_length = max(len(g_dataset), len(s_dataset), len(val_dataset))
        g_dataset = torch.utils.data.ConcatDataset(
            [g_dataset] * int(max_length / len(g_dataset)))
        s_dataset = torch.utils.data.ConcatDataset(
            [s_dataset] * int(max_length / len(s_dataset)))
        val_dataset = torch.utils.data.ConcatDataset(
            [val_dataset] * int(max_length / len(val_dataset)))

    g_sampler = RandomSampler(val_dataset)
    g_dataloader = DataLoader(val_dataset,
                              sampler=g_sampler,
                              batch_size=args.g_train_batch_size)
    train_sampler = RandomSampler(g_dataset)
    train_dataloader = DataLoader(g_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.g_train_batch_size)
    s_sampler = RandomSampler(s_dataset)
    s_dataloader = DataLoader(s_dataset,
                              sampler=s_sampler,
                              batch_size=args.s_train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // len(g_dataloader) + 1
    else:
        if gold_ratio == 0:
            t_total = min(len(g_dataloader),
                          len(s_dataloader)) * args.num_train_epochs
        else:
            t_total = min(len(g_dataloader), len(train_dataloader),
                          len(s_dataloader)) * args.num_train_epochs

    if args.clf_model != "cnn":
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay) and p.requires_grad
                ],
                "weight_decay": args.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay) and p.requires_grad
                ],
                "weight_decay": 0.0
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min")
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, t_total / args.num_train_epochs)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if args.use_group_weight or args.use_group_net:
        # if args.use_group_weight:
        group_weight = GroupWeightModel(n_groups=args.multi_head)
    else:
        group_weight = FullWeightModel(n_groups=args.multi_head,
                                       hidden_size=args.hidden_size)
    group_weight = group_weight.to(args.device)
    parameters = [i for i in group_weight.parameters() if i.requires_grad]

    if "adam" in args.group_opt.lower():
        if "w" in args.group_opt.lower():
            group_optimizer = AdamW(parameters,
                                    lr=args.group_lr,
                                    eps=args.group_adam_epsilon,
                                    weight_decay=args.group_weight_decay)
        else:
            group_optimizer = torch.optim.Adam(
                parameters,
                lr=args.group_lr,
                eps=args.group_adam_epsilon,
                weight_decay=args.group_weight_decay)
        group_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            group_optimizer, t_total / args.num_train_epochs)
        # group_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
        #                                                   num_training_steps=t_total)
    elif args.group_opt.lower() == "sgd":
        group_optimizer = torch.optim.SGD(
            parameters,
            lr=args.group_lr,
            momentum=args.group_momentum,
            weight_decay=args.group_weight_decay)
        group_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            group_optimizer, 'min')

    if args.fp16:
        group_weight, group_optimizer = amp.initialize(
            group_weight, group_optimizer, opt_level=args.fp16_opt_level)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num Gold examples = %d, Silver Examples = %d",
                len(val_dataset), len(s_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d, %d",
        args.g_train_batch_size, args.s_train_batch_size)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    g_loss, logging_g_loss, logging_s_loss, s_loss = 0.0, 0.0, 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch")
    set_seed(args)  # Added here for reproducibility
    temp_output = open(args.flat_output_file + "_step", "w+", 1)

    for _ in train_iterator:
        be_changed = False
        for step, (g_batch, s_batch, train_batch) in enumerate(
                zip(g_dataloader, s_dataloader, train_dataloader)):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            g_batch = tuple(t.to(args.device) for t in g_batch)
            g_input = {
                "input_ids": g_batch[0],
                "attention_mask": g_batch[1],
                "labels": g_batch[2]
            }
            s_batch = tuple(t.to(args.device) for t in s_batch)
            s_input = {
                "input_ids": s_batch[0],
                "attention_mask": s_batch[1],
                "labels": s_batch[2],
                "reduction": 'none'
            }
            train_batch = tuple(t.to(args.device) for t in train_batch)
            train_input = {
                "input_ids": train_batch[0],
                "attention_mask": train_batch[1],
                "labels": train_batch[2]
            }
            # ATTENTION: RoBERTa does not need token type ids
            if args.multi_head > 1:
                s_input.update({"is_gold": False})

            if (global_step + 1) % args.logging_steps == 0:
                step_input = global_step
            else:
                step_input = None
            info = {"gold_ratio": gold_ratio, "step": step_input}

            if args.use_group_net:
                outputs = step_l2w_group_net(model, optimizer, scheduler,
                                             g_input, s_input, train_input,
                                             args, group_weight,
                                             group_optimizer, group_scheduler,
                                             gold_ratio)
                loss_g, loss_s, instance_weight = outputs
            else:
                outputs = step_l2w(model, optimizer, scheduler, g_input,
                                   s_input, train_input, args, gold_ratio)
                loss_g, loss_s = outputs

            g_loss += loss_g.item()
            s_loss += loss_s.item()
            global_step += 1

            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                logs = {}
                results = {}
                if (args.evaluate_during_training) or True:
                    results = evaluate(args,
                                       model,
                                       tokenizer,
                                       gold_ratio,
                                       eval_dataset=eval_dataset)
                    results = {
                        key + "_val": value
                        for key, value in results.items()
                    }
                    results.update({"type": "val"})
                    print(json.dumps(results))

                    if val_acc_and_f1 < results['acc_and_f1_val']:
                        be_changed = True
                        best_loss_val = results['loss_val']
                        val_acc_and_f1 = results['acc_and_f1_val']
                        test_results = evaluate(args, model, tokenizer,
                                                gold_ratio)
                        best_acc = test_results['acc']
                        best_f1 = test_results['f1']
                        best_cm = test_results['c_m']
                        best_acc_and_f1 = test_results["acc_and_f1"]
                        temp_output.write(
                            "Step: {}, Test F1: {}, Test ACC: {}; Val Acc_and_F1: {}, Val Loss: {}\n"
                            .format(global_step, best_f1, best_acc,
                                    val_acc_and_f1, best_loss_val))
                        temp_output.flush()

                        # save the model
                        if args.save_model:
                            save_path = args.flat_output_file + "_save_model"
                            save_dic = {
                                "BaseModel": model,
                                "LWN": group_weight,
                                "step": global_step,
                                "tokenizer": tokenizer
                            }
                            torch.save(save_dic, save_path)

                        test_results = {
                            key + "_test": value
                            for key, value in test_results.items()
                        }
                        test_results.update({"type": "test"})
                        print(json.dumps(test_results))

                for key, value in results.items():
                    eval_key = "eval_{}".format(key)
                    logs[eval_key] = value

                loss_scalar = (g_loss - logging_g_loss) / args.logging_steps
                learning_rate_scalar = optimizer.defaults.get("lr", 0)
                logs["train_learning_rate"] = learning_rate_scalar
                logs["train_g_loss"] = loss_scalar
                logs["train_s_loss"] = (s_loss -
                                        logging_s_loss) / args.logging_steps
                logging_g_loss = g_loss
                logging_s_loss = s_loss
                # writer.add_scalar("Loss/g_train_{}".format(gold_ratio), logs['train_g_loss'], global_step)
                # writer.add_scalar("Loss/s_train_{}".format(gold_ratio), logs['train_s_loss'], global_step)
                # writer.add_scalar("Loss/val_train_{}".format(gold_ratio), results['loss_val'], global_step)

                if args.use_group_weight:
                    try:
                        eta_group = group_optimizer.get_lr()
                    except Exception:
                        eta_group = group_optimizer.defaults.get("lr", 0)
                    # writer.add_scalar("Loss/group_lr_{}".format(gold_ratio), eta_group, global_step)

                print(json.dumps({**{"step": global_step}, **logs}))

            if args.max_steps > 0 and global_step > args.max_steps:
                break

        if (args.use_group_net or args.use_group_weight) and isinstance(
                group_scheduler, torch.optim.lr_scheduler.CosineAnnealingLR):
            group_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                group_optimizer, t_total / args.num_train_epochs)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, t_total / args.num_train_epochs)
        print("EPOCH Finish")

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    temp_output.close()
    # return cache_instance_weight
    return global_step, g_loss / global_step, (best_f1, best_acc, best_cm)
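
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the training loop above): the weighting
# modules (GroupWeightModel / FullWeightModel) are assumed to produce one
# non-negative weight per silver example, and step_l2w_group_net is assumed to
# fold those weights into the silver loss roughly as shown below.  The helper
# name and its arguments are hypothetical and exist purely for documentation.
# ---------------------------------------------------------------------------
def _weighted_silver_loss_sketch(logits, labels, instance_weight):
    """Reduce a per-example cross-entropy loss with learned instance weights."""
    import torch.nn.functional as F

    per_example_loss = F.cross_entropy(logits, labels, reduction="none")
    weights = instance_weight.view(-1)
    # Normalise by the total weight so the loss scale does not depend on the
    # magnitude of the learned weights.
    return (weights * per_example_loss).sum() / weights.sum().clamp(min=1e-8)
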
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(story[:max_length]) +
        max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
        for dataset in encoded_datasets
        for story, cont1, cont2, _ in dataset)
    # Max size of input for the pre-trained model
    input_length = min(input_length, model.config.n_positions)

    # Prepare input tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        num_train_optimization_steps = len(
            train_dataloader) * args.num_train_epochs
        # AdamW (unlike the legacy OpenAIAdam) does not accept warmup, max_grad_norm
        # or t_total arguments, so warmup is handled by a scheduler and gradient
        # clipping is applied explicitly in the training loop below.
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          weight_decay=args.weight_decay)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(args.warmup_proportion *
                                 num_train_optimization_steps),
            num_training_steps=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (
                    loss.item() if exp_average_loss is None else
                    0.7 * exp_average_loss + 0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss, optimizer.param_groups[0]['lr'])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels,
                                   mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
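
if __name__ == '__main__':
    # Entry-point guard, assuming this module is meant to be executed as a
    # script; the guard itself does not appear in the excerpt above.
    main()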