def init_optimizer(args, model, optimization_step):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=int(optimization_step * args.warmup),
                                     t_total=optimization_step)
    return optimizer, scheduler

def get_optimizers(num_training_steps: int, model):
    # relies on a module-level `args` namespace for the hyperparameters below
    # no_decay = ['bias', 'gamma', 'beta']
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    # warmup must be an integer number of optimizer steps
    warmup_steps = int(args.warmup_proportion * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=num_training_steps)
    return optimizer, scheduler

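# A minimal sketch (not part of the original scripts) showing what the
# no-decay grouping above actually does. The `Toy` module and its attribute
# names are illustrative assumptions; the attribute is literally called
# `LayerNorm` so the substring match mirrors HuggingFace BERT naming.
import torch.nn as nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(4, 4)
        self.LayerNorm = nn.LayerNorm(4)

toy = Toy()
toy_no_decay = ["bias", "LayerNorm.weight"]
decay_names = [n for n, _ in toy.named_parameters()
               if not any(nd in n for nd in toy_no_decay)]
no_decay_names = [n for n, _ in toy.named_parameters()
                  if any(nd in n for nd in toy_no_decay)]
print(decay_names)     # ['dense.weight'] -- only the Linear weight is decayed
print(no_decay_names)  # ['dense.bias', 'LayerNorm.weight', 'LayerNorm.bias']
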
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'align_mask': batch[2],
                'labels': batch[4]
            }
            inputs['token_type_ids'] = batch[3]
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # clip once the gradients for a full effective batch have been
                # accumulated, right before the optimizer step
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    if args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            with open(os.path.join(args.output_dir, "{}.txt".format(key)), 'a+') as w:
                                w.write("%d\t%f\n" % (global_step, value))
                    with open(os.path.join(args.output_dir, "loss.txt"), 'a+') as w:
                        w.write("%d\t%f\n" % (global_step,
                                              (tr_loss - logging_loss) / args.logging_steps))
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step

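# `set_seed` is called in the training loops above but not defined in this
# file. A minimal sketch of the helper these scripts usually rely on (the
# exact original may differ):
import random
import numpy as np
import torch

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
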
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--xlnet_model", default=None, type=str, required=True,
                        help="Either one of the two: 'xlnet-large-cased', 'xlnet-base-cased'.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps", default=100, type=int,
                        help="Number of steps to perform linear learning rate warmup for.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    processor = dreamProcessor()
    label_list = processor.get_labels()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r",
                device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    os.makedirs(args.output_dir, exist_ok=True)

    ## only use cased model
    tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model, do_lower_case=False)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / n_class / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    ## prepare model
    model = XLNetForSequenceClassification.from_pretrained(
        args.xlnet_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
        num_choices=3)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # and would otherwise produce None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.weight']
    ## note: no weight decay according to XLNet paper
    # build the groups from the filtered param_optimizer list so the pooler
    # hack above actually takes effect
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        # Adam epsilon fixed at 1e-6 according to XLNet paper
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    warmup_steps = args.warmup_steps
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        input_ids = []
        input_mask = []
        segment_ids = []
        label_id = []
        for f in train_features:
            input_ids.append([])
            input_mask.append([])
            segment_ids.append([])
            for i in range(n_class):
                ## put the three input sequences together
                input_ids[-1].append(f[i].input_ids)
                input_mask[-1].append(f[i].input_mask)
                segment_ids[-1].append(f[i].segment_ids)
            label_id.append([f[0].label_id])

        all_input_ids = torch.tensor(input_ids, dtype=torch.long)
        all_input_mask = torch.tensor(input_mask, dtype=torch.long)
        all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        all_label_ids = torch.tensor(label_id, dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        max_score = 0  # best dev accuracy so far, tracked across epochs
        for ep in range(int(args.num_train_epochs)):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _, _ = model(input_ids=input_ids,
                                   token_type_ids=segment_ids,
                                   attention_mask=input_mask,
                                   labels=label_ids,
                                   n_class=n_class)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # We have accumulated enough gradients; step the optimizer
                    # first, then update the learning rate schedule.
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1
                if step % 800 == 0:
                    logger.info("Training loss: {}, global step: {}".format(
                        tr_loss / nb_tr_steps, global_step))

            if args.do_eval:
                eval_examples = processor.get_dev_examples(args.data_dir)
                eval_features = convert_examples_to_features(eval_examples, label_list,
                                                             args.max_seq_length, tokenizer)
                logger.info("***** Running Dev Evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                input_ids = []
                input_mask = []
                segment_ids = []
                label_id = []
                for f in eval_features:
                    input_ids.append([])
                    input_mask.append([])
                    segment_ids.append([])
                    for i in range(n_class):
                        input_ids[-1].append(f[i].input_ids)
                        input_mask[-1].append(f[i].input_mask)
                        segment_ids[-1].append(f[i].segment_ids)
                    label_id.append([f[0].label_id])

                all_input_ids = torch.tensor(input_ids, dtype=torch.long)
                all_input_mask = torch.tensor(input_mask, dtype=torch.long)
                all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
                all_label_ids = torch.tensor(label_id, dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label_ids)
                if args.local_rank == -1:
                    eval_sampler = SequentialSampler(eval_data)
                else:
                    eval_sampler = DistributedSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                             batch_size=args.eval_batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                logits_all = []
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits, _ = model(input_ids=input_ids,
                                                         token_type_ids=segment_ids,
                                                         attention_mask=input_mask,
                                                         labels=label_ids,
                                                         n_class=n_class)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    for i in range(len(logits)):
                        logits_all += [logits[i]]

                    tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = eval_accuracy / nb_eval_examples

                if args.do_train:
                    result = {'eval_loss': eval_loss,
                              'eval_accuracy': eval_accuracy,
                              'global_step': global_step,
                              'loss': tr_loss / nb_tr_steps}
                else:
                    result = {'eval_loss': eval_loss,
                              'eval_accuracy': eval_accuracy}

                output_eval_file = os.path.join(args.output_dir, "eval_results_test.txt")
                with open(output_eval_file, "a+") as writer:
                    logger.info(" Epoch: %d", (ep + 1))
                    logger.info("***** Eval results *****")
                    writer.write(" Epoch: " + str(ep + 1))
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))

                # output_eval_file = os.path.join(args.output_dir, "logits_test.txt")
                # with open(output_eval_file, "w") as f:
                #     for i in range(len(logits_all)):
                #         for j in range(len(logits_all[i])):
                #             f.write(str(logits_all[i][j]))
                #             if j == len(logits_all[i]) - 1:
                #                 f.write("\n")
                #             else:
                #                 f.write(" ")

                if eval_accuracy > max_score:
                    max_score = eval_accuracy

                ## save trained model after each epoch; max_score only tracks
                ## the best dev accuracy
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model_{}epoch.bin".format(ep + 1))
                torch.save(model_to_save.state_dict(), output_model_file)

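# `accuracy` is used in the evaluation loop above but not defined in this
# snippet. A plausible minimal sketch for multiple-choice logits, returning a
# correct-prediction count that is later divided by nb_eval_examples (the
# original helper may differ):
import numpy as np

def accuracy(logits, labels):
    # logits: (batch, n_class) numpy array; labels: (batch,) numpy array
    preds = np.argmax(logits, axis=1)
    return int((preds == labels).sum())
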
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # here warmup_steps and save_steps are proportions of t_total
    warm_up_steps = int(args.warmup_steps * t_total)
    save_steps = int(args.save_steps * t_total)

    # Prepare optimizer and schedule (linear warmup and decay).
    # Newly initialized head parameters (classifier / linear_r / linear_g) get
    # a larger fixed learning rate (2e-3); the pretrained body keeps the
    # optimizer default, with the usual no-decay split for biases/LayerNorm.
    no_decay = ['bias', 'LayerNorm.weight']
    head_no_decay, head_decay, body_no_decay, body_decay = [], [], [], []
    for n, p in model.named_parameters():
        if 'classifier' in n or 'linear_r' in n or 'linear_g' in n:
            if any(nd in n for nd in no_decay):
                head_no_decay.append(p)
            else:
                head_decay.append(p)
        else:
            if any(nd in n for nd in no_decay):
                body_no_decay.append(p)
            else:
                body_decay.append(p)
    optimizer_grouped_parameters = [
        {"params": head_no_decay, "weight_decay": 0.0, "lr": 2e-3},
        {"params": head_decay, "weight_decay": args.weight_decay, "lr": 2e-3},
        {"params": body_no_decay, "weight_decay": 0.0},
        {"params": body_decay, "weight_decay": args.weight_decay},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warm_up_steps, t_total=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'align_mask': batch[2],
                'labels': batch[4]
            }
            inputs['token_type_ids'] = None
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics: lr_n is the new-head group, lr_o the pretrained body
                    tb_writer.add_scalar('lr_n', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('lr_o', scheduler.get_lr()[2], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps,
                                         global_step)
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    if args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()
    return global_step, tr_loss / global_step

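# A minimal sketch (illustrative, not from the original script) showing that a
# per-group "lr" entry overrides the optimizer-level default, which is how the
# head groups above train at 2e-3 while the body stays at args.learning_rate.
# This uses torch.optim.AdamW; the scripts above use the transformers AdamW,
# which handles param groups the same way.
import torch
from torch.optim import AdamW

w_head = torch.nn.Parameter(torch.zeros(2))
w_body = torch.nn.Parameter(torch.zeros(2))
opt = AdamW([{"params": [w_head], "lr": 2e-3},
             {"params": [w_body]}], lr=5e-5)
print([g["lr"] for g in opt.param_groups])  # [0.002, 5e-05]
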
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--output_dir", default='output', type=str,
                        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument("--checkpoint", default='pretrain_ckpt/bert_small_ckpt.bin',
                        type=str, help="checkpoint")
    parser.add_argument("--model_config", default='data/bert_small.json', type=str)

    # Other parameters
    parser.add_argument("--train_file", default='data/KorQuAD_v1.0_train.json', type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=96, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=8.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                             "of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O2',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}, 16-bits training: {}".format(device, n_gpu, args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer('data/ko_vocab_32k.txt',
                              max_len=args.max_seq_length,
                              do_basic_tokenize=True)

    # Prepare model
    config = Config.from_json_file(args.model_config)
    model = QuestionAnswering(config)
    model.bert.load_state_dict(torch.load(args.checkpoint))
    num_params = count_parameters(model)
    logger.info("Total Parameter: %d" % num_params)
    model.to(device)
    model = torch.nn.DataParallel(model)

    cached_train_features_file = args.train_file + '_{0}_{1}_{2}'.format(
        str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
    train_examples = read_squad_examples(input_file=args.train_file,
                                         is_training=True,
                                         version_2_with_negative=False)
    try:
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader)
    except (OSError, EOFError, pickle.UnpicklingError):
        # cache is rebuilt on any load failure
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("  Saving train features into cached file %s", cached_train_features_file)
        with open(cached_train_features_file, "wb") as writer:
            pickle.dump(train_features, writer)

    # cast to int so the scheduler sees an integer step count
    num_train_optimization_steps = int(
        len(train_features) / args.train_batch_size * args.num_train_epochs)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=int(num_train_optimization_steps * args.warmup_proportion),
        t_total=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(train_examples))
    logger.info("  Num split examples = %d", len(train_features))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    num_train_step = num_train_optimization_steps

    input_ids = np.load('input_ids2.npy')
    input_mask = np.load('input_mask.npy')
    input_segments = np.load('input_segments.npy')
    start_prob = np.load('start_prob.npy')
    end_prob = np.load('end_prob.npy')
    start_label = np.load('input_start.npy')
    stop_label = np.load('input_stop.npy')

    """
    for i in range(1000):
        print(input_ids[i])
        print(max(start_prob[i]))
        print(sum(start_prob[i]))
        input()
    """

    paragraph = torch.tensor(input_ids.astype(np.int64)).type(dtype=torch.long).cuda()
    paragraph_mask = torch.tensor(input_mask.astype(np.int64)).type(dtype=torch.long).cuda()
    paragraph_segments = torch.tensor(input_segments.astype(np.int64)).type(dtype=torch.long).cuda()
    start_prob = torch.tensor(start_prob.astype(np.float32)).type(dtype=torch.float32).cuda()
    end_prob = torch.tensor(end_prob.astype(np.float32)).type(dtype=torch.float32).cuda()
    start_label = torch.tensor(start_label.astype(np.int64)).type(dtype=torch.long).cuda()
    stop_label = torch.tensor(stop_label.astype(np.int64)).type(dtype=torch.long).cuda()

    train_data = TensorDataset(paragraph, paragraph_mask, paragraph_segments,
                               start_label, stop_label, start_prob, end_prob)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    model.train()
    global_step = 0
    epoch = 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        iter_bar = tqdm(train_dataloader,
                        desc="Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)")
        tr_step, total_loss, mean_loss = 0, 0., 0.
        for step, batch in enumerate(iter_bar):
            if n_gpu == 1:
                batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
            input_ids, input_mask, segment_ids, start_positions, end_positions, start_probs, end_probs = batch
            loss = model(input_ids, segment_ids, input_mask,
                         start_positions, end_positions, start_probs, end_probs)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            # step the optimizer before the schedule
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

            tr_step += 1
            total_loss += loss.item()  # .item() so the graph is not retained
            mean_loss = total_loss / tr_step
            iter_bar.set_description("Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" %
                                     (global_step, num_train_step, mean_loss, loss.item()))

        logger.info("** ** * Saving file * ** **")
        model_checkpoint = "korquad_%d.bin" % (epoch)
        logger.info(model_checkpoint)
        output_model_file = os.path.join(args.output_dir, model_checkpoint)
        if n_gpu > 1:
            torch.save(model.module.state_dict(), output_model_file)
        else:
            torch.save(model.state_dict(), output_model_file)
        epoch += 1

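# `count_parameters` is used above (and in the SampleCNN script below) but not
# defined here. A common minimal sketch; the original helper may differ:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
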
def main(args):
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        logger.info("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    args.data_dir = os.path.join(args.data_dir, args.task_name)
    args.output_dir = os.path.join(args.output_dir, args.task_name)
    logger.info("args = %s", args)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        logger.info("Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        try:
            os.makedirs(args.output_dir)
        except OSError:
            logger.info("Failed to create the output directory (it may already exist).")

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # use bert to aug train_examples
    ori_train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)
    if args.double_ori == 0:
        num_train_optimization_steps = int(
            len(ori_train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
    else:
        num_train_optimization_steps = int(
            len(ori_train_examples) * 2 / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    config_class, tokenizer_class = (RobertaConfig, RobertaTokenizer)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    if args.use_saved == 1:
        bert_saved_dir = args.ckpt
        if args.co_training:
            model_class = RobertaForNSP_co
            model = model_class.from_pretrained(bert_saved_dir, args=args)
        elif args.only_bert:
            model_class = RobertaForSequenceClassification
            model = model_class.from_pretrained(bert_saved_dir)
            tokenizer = tokenizer_class.from_pretrained(bert_saved_dir)
        else:
            model_class = RobertaForNSPAug
            model = model_class.from_pretrained(bert_saved_dir, args=args)
    else:
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=task_name,
            cache_dir=args.cache_dir if args.cache_dir else None)
        if args.only_bert:
            model_class = RobertaForSequenceClassification
            model = model_class.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None)
        else:
            model_class = RobertaForNSPAug
            model = model_class.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None,
                args=args)
    model.cuda()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # parameter count in millions
    cnt = sum(np.prod(v.size()) for name, v in model.named_parameters()) / 1e6
    logger.info("cnt %s", str(cnt))

    if args.do_first_eval:
        args.do_train = False
        res_file = os.path.join(args.output_dir, "first_test.tsv")
        eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
            do_evaluate(args, processor, label_list, tokenizer, model, 0,
                        output_mode, num_labels, task_name, eval_examples, type="dev")
        eval_res.update(res_parts)
        for key in sorted(eval_res.keys()):
            logger.info("first evaluation: %s = %s", key, str(eval_res[key]))

        idx, preds = do_test(args, label_list, task_name, processor, tokenizer, output_mode, model)
        dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
        dataframe.to_csv(res_file, index=False, sep='\t')
        logger.info("  Num test length = %d", idx)
        logger.info("  Done ")

        # write mm test results
        if task_name == "mnli":
            res_file = os.path.join(args.output_dir, "first_test_mm.tsv")
            idx, preds = do_test(args, label_list, task_name, processor, tokenizer,
                                 output_mode, model, do_mm=True)
            dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
            dataframe.to_csv(res_file, index=False, sep='\t')
            logger.info("  Num test length = %d", idx)
            logger.info("  Done write mm")

    if args.do_train:
        # Prepare optimizer
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        }, {
            'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(args.warmup_rate * num_train_optimization_steps),
            num_training_steps=num_train_optimization_steps)

        global_step = 0
        best_val_acc = 0.0
        first_time = time.time()

        logger.info("***** Running training *****")
        logger.info("  Num original examples = %d", len(ori_train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        aug_ratio = 0.0
        aug_seed = np.random.randint(0, 1000)
        for epoch in range(int(args.num_train_epochs)):
            if args.only_bert:
                train_features = convert_examples_to_features(
                    ori_train_examples, label_list, args.max_seq_length, tokenizer,
                    num_show=args.num_show, output_mode=output_mode, args=args,
                    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                    do_roberta=1)
            else:
                logger.info("epoch=%d, aug_ratio = %f, aug_seed=%d", epoch, aug_ratio, aug_seed)
                train_examples = Aug_each_ckpt(
                    ori_train_examples, label_list, model, tokenizer, args=args,
                    num_show=args.num_show, output_mode=output_mode, seed=aug_seed,
                    aug_ratio=aug_ratio, use_bert=False, do_roberta=1, ssa_roberta=1,
                    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0])
                if aug_ratio + args.aug_ratio_each < 1.0:
                    aug_ratio += args.aug_ratio_each
                aug_seed += 1
                train_features = convert_examples_to_features(
                    train_examples, label_list, args.max_seq_length, tokenizer,
                    num_show=args.num_show, output_mode=output_mode, args=args,
                    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                    do_roberta=1)
            logger.info("Done convert features")

            all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
            token_real_label = torch.tensor([f.token_real_label for f in train_features], dtype=torch.long)

            train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                       all_label_ids, token_real_label)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size)

            logger.info("begin training")
            tr_loss, tr_seq_loss, tr_aug_loss, train_seq_accuracy, train_aug_accuracy = 0, 0, 0, 0, 0
            nb_tr_examples, nb_tr_steps, nb_tr_tokens = 0, 0, 0
            preds = []
            all_labels = []
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, segment_ids, label_ids, token_real_label = batch
                if args.only_bert:
                    outputs = model(input_ids, input_mask)
                    seq_logits = outputs[0]
                else:
                    seq_logits, aug_logits, aug_loss = model(input_ids, input_mask, labels=None,
                                                             token_real_label=token_real_label)
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    seq_loss = loss_fct(seq_logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    seq_loss = loss_fct(seq_logits.view(-1), label_ids.view(-1))

                token_real_label = token_real_label.detach().cpu().numpy()

                w = args.aug_loss_weight
                if args.only_bert:
                    loss = seq_loss
                else:
                    loss = (1 - w) * seq_loss + w * aug_loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10000.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                tr_seq_loss += seq_loss.mean().item()
                seq_logits = seq_logits.detach().cpu().numpy()
                label_ids = label_ids.detach().cpu().numpy()
                if len(preds) == 0:
                    preds.append(seq_logits)
                    all_labels.append(label_ids)
                else:
                    preds[0] = np.append(preds[0], seq_logits, axis=0)
                    all_labels[0] = np.append(all_labels[0], label_ids, axis=0)

                if args.only_bert == 0:
                    aug_logits = aug_logits.detach().cpu().numpy()
                    tmp_train_aug_accuracy, tmp_tokens = accuracy(aug_logits, token_real_label, type="aug")
                    train_aug_accuracy += tmp_train_aug_accuracy
                    nb_tr_tokens += tmp_tokens
                    tr_aug_loss += aug_loss.mean().item()

                if global_step % 20 == 0:
                    loss = tr_loss / nb_tr_steps
                    seq_loss = tr_seq_loss / nb_tr_steps
                    aug_loss = tr_aug_loss / nb_tr_steps

                    tmp_pred = preds[0]
                    tmp_labels = all_labels[0]
                    if output_mode == "classification":
                        tmp_pred = np.argmax(tmp_pred, axis=1)
                    elif output_mode == "regression":
                        tmp_pred = np.squeeze(tmp_pred)
                    res = accuracy(tmp_pred, tmp_labels, task_name=task_name)

                    if nb_tr_tokens != 0:
                        aug_avg = train_aug_accuracy / nb_tr_tokens
                    else:
                        aug_avg = 0.0
                    log_string = ""
                    log_string += "epoch={:<5d}".format(epoch)
                    log_string += " step={:<9d}".format(global_step)
                    log_string += " total_loss={:<9.7f}".format(loss)
                    log_string += " seq_loss={:<9.7f}".format(seq_loss)
                    log_string += " aug_loss={:<9.7f}".format(aug_loss)
                    log_string += " lr={:<9.7f}".format(scheduler.get_lr()[0])
                    log_string += " |g|={:<9.7f}".format(total_norm)
                    # log_string += " tr_seq_acc={:<9.7f}".format(seq_avg)
                    log_string += " tr_aug_acc={:<9.7f}".format(aug_avg)
                    log_string += " mins={:<9.2f}".format(float(time.time() - first_time) / 60)
                    for key in sorted(res.keys()):
                        log_string += "  " + key + "= " + str(res[key])
                    logger.info(log_string)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

            train_loss = tr_loss / nb_tr_steps

            if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and epoch % 1 == 0:
                eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
                    do_evaluate(args, processor, label_list, tokenizer, model, epoch,
                                output_mode, num_labels, task_name, eval_examples, type="dev")

                if "acc" in eval_res:
                    tmp_acc = eval_res["acc"]
                elif "mcc" in eval_res:
                    tmp_acc = eval_res["mcc"]
                else:
                    tmp_acc = eval_res["corr"]

                if tmp_acc >= best_val_acc:
                    best_val_acc = tmp_acc
                    dev_test = "dev"

                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    output_model_dir = os.path.join(args.output_dir, "dev_" + str(tmp_acc))
                    if not os.path.exists(output_model_dir):
                        os.makedirs(output_model_dir)
                    model_to_save.save_pretrained(output_model_dir)
                    tokenizer.save_pretrained(output_model_dir)
                    output_model_file = os.path.join(output_model_dir, 'pytorch_model.bin')
                    torch.save(model_to_save.state_dict(), output_model_file)

                    result = {
                        'eval_total_loss': eval_loss,
                        'eval_seq_loss': eval_seq_loss,
                        'eval_aug_loss': eval_aug_loss,
                        'eval_aug_accuracy': eval_aug_accuracy,
                        'global_step': global_step,
                        'train_loss': train_loss,
                        'best_epoch': epoch,
                        'train_batch_size': args.train_batch_size,
                        'args': args
                    }
                    result.update(eval_res)
                    result.update(res_parts)

                    output_eval_file = os.path.join(args.output_dir,
                                                    dev_test + "_results_" + str(tmp_acc) + ".txt")
                    with open(output_eval_file, "w") as writer:
                        logger.info("***** Test results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    # write test results
                    if args.do_test:
                        res_file = os.path.join(args.output_dir, "test_" + str(tmp_acc) + ".tsv")
                        idx, preds = do_test(args, label_list, task_name, processor,
                                             tokenizer, output_mode, model)
                        dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
                        dataframe.to_csv(res_file, index=False, sep='\t')
                        logger.info("  Num test length = %d", idx)
                        logger.info("  Done ")

                        # write mm test results
                        if task_name == "mnli":
                            res_file = os.path.join(args.output_dir,
                                                    "mm_roberta_results_b_" + str(tmp_acc) + ".tsv")
                            idx, preds = do_test(args, label_list, task_name, processor,
                                                 tokenizer, output_mode, model, do_mm=True)
                            dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
                            dataframe.to_csv(res_file, index=False, sep='\t')
                            logger.info("  Num test length = %d", idx)
                            logger.info("  Done write mm")
                else:
                    logger.info("  tmp_val_acc = %f", tmp_acc)

def train(self):
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    task = self.args.task
    tb_writer = SummaryWriter(log_dir='./runs/' + task + "/" + current_time + self.args.prefix,
                              comment=self.args.prefix)

    vocabs, lexical_mapping = self._build_model()
    train_data = DataLoader(self.args, vocabs, lexical_mapping, self.args.train_data,
                            self.args.batch_size, for_train=True)
    dev_data = DataLoader(self.args, vocabs, lexical_mapping, self.args.dev_data,
                          self.args.batch_size, for_train=False)
    test_data = DataLoader(self.args, vocabs, lexical_mapping, self.args.test_data,
                           self.args.batch_size, for_train='Eval')
    train_data.set_unk_rate(self.args.unk_rate)

    # WRITE PARAMETERS
    with open('./' + 'param' + '.txt', 'w') as f:
        for name, param in self.model.named_parameters():
            f.writelines('name:' + name + "\n")
            f.writelines(str(param))
            f.writelines('size:' + str(param.size()) + '\n')

    # note: both groups below use zero weight decay, so the no-decay split is
    # currently a no-op; kept for symmetry with the other scripts
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in self.model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.
    }, {
        'params': [p for n, p in self.model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    gradient_accumulation_steps = 1
    t_total = len(train_data) // gradient_accumulation_steps * self.args.epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=self.args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.args.warmup_steps, t_total=t_total)

    self.model.zero_grad()
    set_seed(42, self.args.gpus)
    batches_acm, loss_acm = 0, 0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Task: %s", self.args.task)
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Num Epochs = %d", self.args.epochs)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  Running Language Model = %s", self.args.lm_model)
    logger.info("  Running Model = %s", self.args.encoder_type)

    best_acc = 0
    best_model_wts = copy.deepcopy(self.model.state_dict())
    total_steps = 0
    train_iterator = trange(int(self.args.epochs), desc="Epoch")

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_data, desc="Iteration")
        running_loss = 0.0
        running_corrects = 0
        batch_count = self.args.batch_multiplier
        # Turn on the train mode
        for step, batch in enumerate(epoch_iterator):
            self.model.train()
            batch = move_to_cuda(batch, self.device)
            logits, labels, ans_ids = self.model(batch, train=True)
            logits_for_pred = logits.clone().detach()
            loss = self.criterion(logits, labels)
            loss_value = loss.item()

            pred_values, pred_indices = torch.max(logits_for_pred, 1)
            labels = labels.tolist()
            pred = pred_indices.tolist()
            corrects = [i for i, j in zip(labels, pred) if i == j]

            # Statistics
            running_loss += loss.item()
            running_corrects += len(corrects)

            if batch_count == 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                total_steps += 1
                optimizer.zero_grad()
                self.model.zero_grad()
                batch_count = self.args.batch_multiplier

            loss_acm += loss_value
            loss.backward()
            batch_count -= 1

            if (batches_acm % (self.args.batch_multiplier * self.args.batch_size) == 0) \
                    & (batches_acm != 0) & (step != 0):
                logger.info('Train Epoch %d, Batch %d, loss %.3f, Accuracy %.3f',
                            _, batches_acm, loss_acm / batches_acm,
                            running_corrects / (self.args.batch_size * step))
                tb_writer.add_scalar('Training_loss', loss_acm / batches_acm, batches_acm)
                tb_writer.add_scalar('Training_Accuracy',
                                     running_corrects / (self.args.batch_size * step),
                                     batches_acm)
                torch.cuda.empty_cache()
            batches_acm += 1

        epoch_loss = running_loss / batches_acm
        epoch_acc = running_corrects / len(train_data)
        print('{} Loss: {:.4f} Acc: {:.4f}'.format(_, epoch_loss, epoch_acc))
        tb_writer.add_scalar('Training_Epoch_loss', epoch_loss, _)
        tb_writer.add_scalar('Training_Epoch_Accuracy', epoch_acc, _)

        # Evaluate on Development Set
        eval_epoch_acc, eval_epoch_loss = self._run_evaluate(dev_data, _, write_answer=False)
        print('Overall_Dev Acc: {:.4f}'.format(eval_epoch_acc))
        tb_writer.add_scalar('Dev_Epoch_Accuracy', eval_epoch_acc, _)

        ##################################
        # Evaluate on Test Set
        test_epoch_acc, test_epoch_loss = self._run_evaluate(test_data, _, write_answer=True)
        print('Overall_Test Acc: {:.4f}'.format(test_epoch_acc))
        tb_writer.add_scalar('Test_Epoch_Accuracy', test_epoch_acc, _)

        # Save only best accuracy model on dev set
        if eval_epoch_acc > best_acc:
            best_acc = eval_epoch_acc
            best_model_wts = copy.deepcopy(self.model.state_dict())

        # early_stopping needs the validation accuracy to check if it has
        # decreased, and if it has, it will make a checkpoint of the current model
        early_stopping(eval_epoch_acc, self.model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        self.model.train()

    logger.info('Best val Acc: {:4f}'.format(best_acc))
    torch.save({'args': self.save_args, 'model': best_model_wts},
               '%s/epoch%d_batch%d_model_best_%s' %
               (self.args.ckpt, self.args.epochs, batches_acm, self.args.prefix))

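# `EarlyStopping` is imported from elsewhere in the original project. A
# minimal sketch of the interface assumed above (patience counting on a
# monitored metric, checkpointing on improvement); the real class may differ:
import torch

class EarlyStopping:
    def __init__(self, patience=7, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.best = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, metric, model):
        if self.best is None or metric > self.best:
            self.best = metric
            self.counter = 0
            torch.save(model.state_dict(), 'checkpoint.pt')  # checkpoint on improvement
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
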
def train(config):
    # create the ELECTRA config object
    electra_config = ElectraConfig.from_pretrained(config["train_model_path"],
                                                   num_labels=config["num_labels"],
                                                   cache_dir=config["cache_dir_path"])
    # create the ELECTRA tokenizer object
    electra_tokenizer = ElectraTokenizer.from_pretrained(config["train_model_path"],
                                                         do_lower_case=False,
                                                         cache_dir=config["cache_dir_path"])
    # create the ELECTRA model object
    electra_model = ElectraForSequenceClassification.from_pretrained(config["train_model_path"],
                                                                     config=electra_config,
                                                                     cache_dir=config["cache_dir_path"])
    # electra_model.cuda()

    # read the training data
    train_datas = read_data(file_path=config["train_data_path"])

    # preprocess the training data
    train_dataset = convert_data2dataset(datas=train_datas, tokenizer=electra_tokenizer,
                                         max_length=config["max_length"])

    # DataLoader object that yields the training data in batches
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=config["batch_size"])

    # read the evaluation data
    test_datas = read_data(file_path=config["test_data_path"])

    # preprocess the evaluation data
    test_dataset = convert_data2dataset(datas=test_datas, tokenizer=electra_tokenizer,
                                        max_length=config["max_length"])

    # DataLoader object that yields the evaluation data in batches
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100)

    # total number of training steps (in batches)
    t_total = len(train_dataloader) // config["gradient_accumulation_steps"] * config["epoch"]

    # optimizer for model training
    optimizer = AdamW(electra_model.parameters(), lr=config["learning_rate"])
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=config["warmup_steps"],
                                                num_training_steps=t_total)

    if os.path.isfile(os.path.join(config["model_dir_path"], "optimizer.pt")) and os.path.isfile(
            os.path.join(config["model_dir_path"], "scheduler.pt")):
        # load previously trained optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(config["model_dir_path"], "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(config["model_dir_path"], "scheduler.pt")))

    global_step = 0
    electra_model.zero_grad()
    max_test_accuracy = 0
    for epoch in range(config["epoch"]):
        electra_model.train()

        # accuracy and average loss on the training data
        train_accuracy, average_loss, global_step = do_train(config=config,
                                                             electra_model=electra_model,
                                                             optimizer=optimizer,
                                                             scheduler=scheduler,
                                                             train_dataloader=train_dataloader,
                                                             epoch=epoch + 1,
                                                             global_step=global_step)
        print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4),
                                                                round(average_loss, 4)))

        electra_model.eval()

        # accuracy on the evaluation data
        test_accuracy = do_evaluate(electra_model=electra_model,
                                    test_dataloader=test_dataloader,
                                    mode=config["mode"])
        print("test_accuracy : {}\n".format(round(test_accuracy, 4)))

        # save the model checkpoint when the current accuracy beats the best so far
        if max_test_accuracy < test_accuracy:
            max_test_accuracy = test_accuracy
            output_dir = os.path.join(config["model_dir_path"], "checkpoint-{}".format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print("save model in checkpoint-{}\n".format(global_step))
            electra_config.save_pretrained(output_dir)
            electra_tokenizer.save_pretrained(output_dir)
            electra_model.save_pretrained(output_dir)
            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

def main(): parser = argparse.ArgumentParser() # Required Parameters parser.add_argument( "--output_dir", default='output', type=str, help= "The output directory where the model checkpoints and predictions will be written." ) parser.add_argument("--checkpoint", default='pretrain_ckpt/bert_small_ckpt.bin', type=str, help="checkpoint") parser.add_argument("--resume_checkpoint", default=False, type=bool, help="resume") parser.add_argument('--log_dir', default='./runs', type=str) # Other Parameters parser.add_argument("--train_feature", default='./rsc/train_features.hdf5', type=str, help="SQuAD corpus for post-training.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=4.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument("--num_workers", default=8, type=int, help="Proportion of workers of DataLoader") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O2', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") args = parser.parse_args() summary_writer = SummaryWriter(args.log_dir) device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {} n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Prepare model model = SampleCNN() # Multi-GPU Setting # if n_gpu > 1: # model = nn.DataParallel(model) num_params = count_parameters(model) logger.info("Total Parameter: %d" % num_params) model.to(device) post_training_dataset = SpeechDataset('./rsc/train.hdf5') num_train_optimization_steps = int( len(post_training_dataset) / args.train_batch_size) * args.num_train_epochs # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=num_train_optimization_steps * 0.1, t_total=num_train_optimization_steps) loss_fn = nn.KLDivLoss(reduction='batchmean').to(device) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(post_training_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) num_train_step = num_train_optimization_steps train_dataloader = DataLoader(post_training_dataset, batch_size=args.train_batch_size, num_workers=16, pin_memory=True) model.train() global_step = 0 epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): iter_bar = tqdm( train_dataloader, desc="Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)") tr_step, total_loss, mean_loss = 0, 0., 0. 
        for step, batch in enumerate(iter_bar):
            feature = batch['feature'].float().to(device)
            label = batch['label'].float().to(device)

            output = model(feature)
            # loss = -F.kl_div(output, label, reduction='batchmean')
            loss = loss_fn(output, label)
            # if n_gpu > 1:
            #     loss = loss.mean()

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1
            tr_step += 1
            # .item() detaches the value; accumulating the tensor itself would
            # keep every step's autograd graph alive and leak memory.
            total_loss += loss.item()
            mean_loss = total_loss / tr_step
            iter_bar.set_description(
                "Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" %
                (global_step, num_train_step, mean_loss, loss.item()))

            if global_step % 100 == 0:
                print('output ', output)  # debug: inspect raw model output
                summary_writer.add_scalar('Train/Total_Mean_Loss', mean_loss,
                                          global_step)
                summary_writer.add_scalar('Train/Total_Loss', loss.item(),
                                          global_step)

        logger.info("***** Saving file *****")
        if args.resume_checkpoint:
            model_checkpoint = "pt_bert_from_checkpoint_%d.bin" % epoch
        else:
            model_checkpoint = "pt_scnn_%d.bin" % epoch
        logger.info(model_checkpoint)
        output_model_file = os.path.join(args.output_dir, model_checkpoint)
        # if n_gpu > 1:
        #     torch.save(model.module.state_dict(), output_model_file)
        # else:
        torch.save(model.state_dict(), output_model_file)
        epoch += 1
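# count_parameters() is called above but not defined in this snippet. A
# minimal sketch of the usual helper (an assumed reconstruction, not the
# original source):
def count_parameters(model):
    # Sum the element counts of all trainable parameters.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)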
print(" >> Distribute the model") else: model_ft.cuda() params_to_update = model_ft.parameters() # Observe that all parameters are being optimized if optim_type == 'Adam': optimizer_ft = optim.Adam(params_to_update, lr=begin_lr) if optim_type == 'SGD': optimizer_ft = optim.SGD(params_to_update, lr=begin_lr, momentum=0.9) else: optimizer_ft = AdamW(params_to_update, lr=begin_lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2) if lr_schedule == 'plateau': # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_ft, # mode='min', # factor=0.2, # patience=1, # verbose=False, # threshold=1e-4, # threshold_mode='rel', # cooldown=2, # min_lr=0, # eps=1e-8) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_ft,
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
    warm_up_steps = int(args.warmup_steps * t_total)
    save_steps = int(args.save_steps * t_total)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warm_up_steps,
                                     t_total=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            inputs['token_type_ids'] = None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # assumed to be the *unreduced* per-example loss; TSA below needs one loss per example

            # Training Signal Annealing: drop examples the model already
            # predicts with probability exp(-loss) above the current threshold.
            tsa_start = 0.5
            tsa_threshold = get_tsa_threshold("exp_schedule", global_step,
                                              t_total, tsa_start, end=1)
            larger_than_threshold = torch.exp(-loss) > tsa_threshold
            # Build the mask on the per-example loss itself (the original
            # built it on one-hot labels, whose shape does not broadcast
            # against the loss vector).
            loss_mask = torch.ones_like(loss) * (1 - larger_than_threshold.float())
            loss = torch.sum(loss * loss_mask) / torch.max(
                torch.sum(loss_mask), torch.tensor(1.0, device=loss.device))

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    if args.evaluate_during_training:
                        # Only evaluate on a single GPU; otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()
    return global_step, tr_loss / global_step
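# get_tsa_threshold() is called in train() above but not defined in this
# snippet. A sketch of the linear/exp/log schedules from the UDA paper
# (Xie et al., 2019), matching the signature the call site implies; an
# assumed reconstruction, not the original helper:
import math

def get_tsa_threshold(schedule, global_step, num_train_steps, start, end):
    progress = global_step / num_train_steps  # training progress in [0, 1]
    if schedule == "linear_schedule":
        coeff = progress
    elif schedule == "exp_schedule":
        # Releases high-confidence examples slowly: the threshold stays near
        # `start` for most of training, then rises sharply toward `end`.
        coeff = math.exp((progress - 1) * 5)
    elif schedule == "log_schedule":
        coeff = 1 - math.exp(-progress * 5)
    else:
        raise ValueError("unknown TSA schedule: %s" % schedule)
    return coeff * (end - start) + start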