def train(args, train_dataset, dev_dataset, model, tokenizer): """ Train the model """ tb_writer = SummaryWriter(os.path.join(args.output_dir, 'TB_writer')) if args.dynamic_batching: train_sampler = CustomBatchSampler(train_dataset, args.train_batch_size) train_dataloader = DataLoader(train_dataset, batch_sampler=train_sampler, num_workers=1, collate_fn=dynamic_padding_collate_fn) else: train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=1) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_path, "scheduler.pt"))) if args.fp16: try: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) # Train! 
logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") model.train() model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch") # Added here for reproductibility set_seed(args) loss_cum = None # torch.autograd.set_detect_anomaly(True) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", smoothing=0.05) for step, batch_cpu in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue batch = tuple(t.to(args.device) for t in batch_cpu) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "start_positions": batch[2].squeeze(-1), "end_positions": batch[3].squeeze(-1), "max_ans_length": args.max_ans_length, } outputs = model(**inputs) loss = outputs[0] if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if loss_cum is None: loss_cum = loss.detach() else: loss_cum += loss.detach() else: loss.backward() if loss_cum is None: loss_cum = loss.detach() else: loss_cum += loss.detach() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log train metrics if (not global_step % args.train_logging_steps ) and args.train_logging_steps > 0: tb_writer.add_scalar( 'train_loss', loss_cum.item() / args.train_logging_steps, global_step) loss_cum = None # Log dev metrics if args.dev_logging_steps > 0 and global_step % args.dev_logging_steps == 0 and args.evaluate_during_training: dev_loss = evaluate(args, dev_dataset, model) tb_writer.add_scalar("dev_loss", dev_loss, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) # Save model checkpoint if args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) logger.info("Saving model checkpoint to %s", output_dir) model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, 
os.path.join(output_dir, "training_args.bin")) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break tb_writer.close()
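# --- Illustrative sketch (not part of the original sources) ---
# The train() above wires a project-specific CustomBatchSampler and dynamic_padding_collate_fn
# into its DataLoader, but neither helper is shown here. The function below is a minimal sketch
# of what such a dynamic-padding collate function usually does; it assumes each dataset item is a
# tuple of (input_ids, attention_mask, start_position, end_position) tensors, which may not match
# the real project layout.
import torch

def dynamic_padding_collate_fn_sketch(batch, pad_token_id=0):
    """Pad input_ids/attention_mask only up to the longest example in this batch."""
    input_ids, attention_masks, starts, ends = zip(*batch)
    max_len = max(ids.size(0) for ids in input_ids)
    padded_ids = torch.full((len(batch), max_len), pad_token_id, dtype=torch.long)
    padded_masks = torch.zeros((len(batch), max_len), dtype=torch.long)
    for i, (ids, mask) in enumerate(zip(input_ids, attention_masks)):
        padded_ids[i, :ids.size(0)] = ids
        padded_masks[i, :mask.size(0)] = mask
    return padded_ids, padded_masks, torch.stack(starts), torch.stack(ends)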
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Train! print("***** Running training *****") print(" Num examples = %d" % len(train_dataset)) print(" Num Epochs = %d" % args.num_train_epochs) print(" Instantaneous batch size per GPU = %d" % args.per_gpu_train_batch_size) print( " Total train batch size (w. parallel, distributed & accumulation) = %d" % (args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)), ) print(" Gradient Accumulation steps = %d" % args.gradient_accumulation_steps) print(" Total optimization steps = %d" % t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to the global_step of the last saved checkpoint from the model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) print(" Continuing training from checkpoint, will skip to saved global_step") print(" Continuing training from epoch %d" % epochs_trained) print(" Continuing training from global step %d" % global_step) print(" Will skip the first %d steps in the first epoch" % steps_trained_in_current_epoch) except ValueError: print(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] ) # Added here for reproducibility set_seed(args) for _ in train_iterator: training_pbar = tqdm(total=len(train_dataset), position=0, leave=True, file=sys.stdout, bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)) for step, batch in enumerate(train_dataloader): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } outputs = model(**inputs) # model outputs are always tuple in transformers (see doc)
loss = outputs[0] if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() training_pbar.update(batch[0].size(0)) # hiepnh if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) os.makedirs(output_dir, exist_ok=True) # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) print("Saving model checkpoint to %s" % output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) print("Saving optimizer and scheduler states to %s" % output_dir) if args.max_steps > 0 and global_step > args.max_steps: training_pbar.close() # hiepnh break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if (args.model_name_or_path and os.path.isfile( os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] 
and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
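# --- Illustrative sketch (not part of the original sources) ---
# The language-model fine-tuning loop above calls mask_tokens(batch, tokenizer, args) when
# args.mlm is set, but that helper is defined elsewhere. The sketch below shows the standard
# BERT-style masking recipe it presumably follows: mask args.mlm_probability of the tokens;
# of those, 80% become [MASK], 10% become a random token, 10% stay unchanged. The
# mlm_probability attribute and the -100 ignore index are assumptions.
import torch

def mask_tokens_sketch(inputs, tokenizer, args):
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, getattr(args, "mlm_probability", 0.15))
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only compute loss on masked positions
    # 80% of masked positions are replaced with the [MASK] token
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    # 10% are replaced with a random token; the remaining 10% are left unchanged
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    inputs[indices_random] = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)[indices_random]
    return inputs, labels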
def train(args, train_dataset, model, tokenizer, teacher=None): """Train the model""" if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) # Added here for reproductibility set_seed(args) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() if teacher is not None: teacher.eval() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type != "distilbert": inputs[ "token_type_ids"] = None if args.model_type == "xlm" else batch[ 2] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) outputs = model(**inputs) loss, start_logits_stu, end_logits_stu = outputs # Distillation loss if teacher is not None: if "token_type_ids" not in inputs: inputs[ "token_type_ids"] = None if args.teacher_type == "xlm" else batch[ 2] with torch.no_grad(): start_logits_tea, end_logits_tea = teacher( input_ids=inputs["input_ids"], token_type_ids=inputs["token_type_ids"], attention_mask=inputs["attention_mask"], ) assert start_logits_tea.size() == start_logits_stu.size() assert end_logits_tea.size() == end_logits_stu.size() loss_fct = nn.KLDivLoss(reduction="batchmean") loss_start = loss_fct( nn.functional.log_softmax( start_logits_stu / args.temperature, dim=-1), nn.functional.softmax(start_logits_tea / args.temperature, dim=-1), ) * (args.temperature**2) loss_end = loss_fct( nn.functional.log_softmax( end_logits_stu / args.temperature, dim=-1), nn.functional.softmax(end_logits_tea / args.temperature, dim=-1), ) * (args.temperature**2) loss_ce = (loss_start + loss_end) / 2.0 loss = args.alpha_ce * loss_ce + args.alpha_squad * loss if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with 
amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
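# --- Illustrative sketch (not part of the original sources) ---
# The distillation branch above blends a temperature-scaled KL term between student and teacher
# logits with the regular QA loss. The helper below reproduces that computation for a single pair
# of logit tensors; in the loop above it is applied to the start and end logits separately and the
# two terms are averaged. The default values mirror the args used above but are only placeholders.
import torch.nn as nn
import torch.nn.functional as F

def distillation_loss_sketch(student_logits, teacher_logits, hard_loss,
                             temperature=2.0, alpha_ce=0.5, alpha_squad=0.5):
    kl = nn.KLDivLoss(reduction="batchmean")
    soft_loss = kl(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits / temperature, dim=-1),
    ) * (temperature ** 2)  # rescale so gradient magnitudes stay comparable across temperatures
    return alpha_ce * soft_loss + alpha_squad * hard_loss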
def train( self, model, train_dataloader, dev_dataloader, dev_dataset, device, n_gpu, eval_fn, output_dir, save_optimizer, eval_params, bert_model, ): results = {} best_score = 0.0 t_total = (len(train_dataloader) // self.gradient_accumulation_steps * self.num_train_epochs) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": self.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW( optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon, ) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=t_total, ) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(self.num_train_epochs), desc="Epoch", ) for epoch in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(device) for t in batch) inputs = { "input_ids": batch[0], "position_ids": batch[1], "token_type_ids": batch[2], "bbox": batch[3], "labels": batch[4], } outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if n_gpu > 1: loss = ( loss.mean() ) # mean() to average on multi-gpu parallel training if self.gradient_accumulation_steps > 1: loss = loss / self.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % self.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if self.logging_steps > 0 and global_step % self.logging_steps == 0: loss_scalar = (tr_loss - logging_loss) / self.logging_steps learning_rate_scalar = scheduler.get_lr()[0] epoch_iterator.set_description( f"Loss :{loss_scalar} LR: {learning_rate_scalar}") logging_loss = tr_loss score = self.eval( model, dev_dataloader, dev_dataset, device, n_gpu, eval_fn, eval_params, mode="dev", bert_model=bert_model, ) results[epoch] = score with torch.no_grad(): if score >= best_score: logger.info(f"Storing the new model with score: {score}") if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) logger.info(f"Saving model checkpoint to {output_dir}") if save_optimizer: torch.save( optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"), ) torch.save( scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"), ) logger.info( "Saving optimizer and scheduler states to %s", output_dir) best_score = score return results
def train(args, train_dataset, model, tokenizer): """ Train the model """ train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=int(t_total * args.warmup_proportion), num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Train batch size per GPU = %d", args.train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() mb = master_bar(range(int(args.num_train_epochs))) # Added here for reproductibility set_seed(args) for epoch in mb: epoch_iterator = progress_bar(train_dataloader, parent=mb) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type in [ "xlm", "roberta", "distilbert", "distilkobert", "xlm-roberta" ]: del inputs["token_type_ids"] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) if hasattr(model, "config") and hasattr( model.config, "lang2id"): inputs.update({ "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device) }) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.evaluate_during_training: results = evaluate(args, model, tokenizer, global_step=global_step) for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) logging_loss = tr_loss # Save model checkpoint if args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 
"training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.save_optimizer: torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info( "Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: break mb.write("Epoch {} done".format(epoch + 1)) if args.max_steps > 0 and global_step > args.max_steps: break return global_step, tr_loss / global_step
class NeuralRstParserCoref(object): def __init__(self, clf, coref_trainer, data_helper, config): self.config = config self.data_helper = data_helper self.clf = clf self.coref_trainer = coref_trainer if self.config[MODEL_TYPE] in [2, 3]: self.clf.bert = self.coref_trainer.model.encoder.bert self.loss = CrossEntropyLoss(reduction='mean').to(config[DEVICE]) self.optim = None def get_optim_scheduler(self, train_loader): no_decay = ['bias', 'LayerNorm.weight'] self.optim = AdamW( params=[ { 'params': [ p for n, p in self.clf.bert.named_parameters() if not any(nd in n for nd in no_decay) ], 'lr': 1e-05 }, # Bert params outside no_decay { 'params': [ p for n, p in self.clf.bert.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0, 'lr': 1e-05 }, #Bert params in no_decay { 'params': [ p for n, p in self.clf.named_parameters() # Clf outside no_decay if ("bert" not in n and not any(nd in n for nd in no_decay)) ] }, { 'params': [ p for n, p in self.clf.named_parameters() if ("bert" not in n and any(nd in n for nd in no_decay)) ], 'weight_decay': 0.0 }, # Clf params in no_decay ], lr=0.0002, weight_decay=0.01) self.num_batches = 34685 / self.clf.config[BATCH_SIZE] train_steps = int(20 * self.num_batches) if self.config[MODEL_TYPE] > 1: self.task_p = self.num_batches / ( self.num_batches + len(self.coref_trainer.train_corpus)) self.scheduler = get_linear_schedule_with_warmup( self.optim, num_warmup_steps=int(train_steps * 0.1), num_training_steps=train_steps) def train_classifier(self, train_loader): # Initialize optimizer and scheduler self.get_optim_scheduler(train_loader) if os.path.isfile("../data/model/" + self.config[MODEL_NAME]): epoch_start = self.load("../data/model/" + self.config[MODEL_NAME]) else: epoch_start = 0 for epoch in range(epoch_start + 1, 21): cost_acc = 0 self.clf.train() if self.config[MODEL_TYPE] > 1: self.coref_trainer.model.train() print("============ epoch: ", epoch, " ============") for i, data in tqdm(enumerate(train_loader)): # if 0, train on random datapoint from coref corpus while self.config[MODEL_TYPE] > 1 and binomial( 1, self.task_p) == 0: cost_acc += self.coref_trainer.train_epoch(i, 1) cost_acc += self.train_sample_rst(data) print("Total cost for epoch %d is %f" % (epoch, cost_acc)) print("============ Evaluating on the dev set ============") self.save(self.config[MODEL_NAME], epoch) self.evaluate() def train_sample_rst(self, sample): docs, batched_clusters, action_feats, neural_feats, all_actions, all_relations, rel_mask = sample self.optim.zero_grad() # Forward pass if self.clf.config[MODEL_TYPE] in [0, 3]: span_embeds = self.clf.get_edus_bert_coref(docs, [None] * len(docs), neural_feats) elif self.clf.config[MODEL_TYPE] in [1, 2]: span_embeds = self.clf.get_edus_bert_coref(docs, batched_clusters, neural_feats) # Compute action loss action_probs, rel_probs = self.clf.decode_action_coref( span_embeds, action_feats) cost = self.loss(action_probs.to(self.config[DEVICE]), all_actions.to(self.config[DEVICE])) # Compute relation loss rel_probs, rel_labels = rel_probs[rel_mask], all_relations[rel_mask] if rel_labels.shape[0] > 0: cost += self.loss(rel_probs.to(self.config[DEVICE]), rel_labels.to(self.config[DEVICE])) # Update the model cost.backward() nn.utils.clip_grad_norm_(self.clf.parameters(), 1.0) self.optim.step() self.scheduler.step() return cost.item() def sr_parse(self, doc, gold_actions, gold_rels): # Generate coref clusters for the document if self.clf.config[MODEL_TYPE] in [1, 2]: with torch.no_grad(): clusters, _ = 
self.coref_trainer.predict_clusters(doc) else: clusters = None # Stack/Queue state conf = ParsingState([], [], self.clf.config) conf.init(doc) all_action_probs, all_rel_probs = [], [] # Until the tree is built while not conf.end_parsing(): # Get features for the current stack/queue state, and span boundaries stack, queue = conf.get_status() fg = ActionFeatureGenerator(stack, queue, [], doc, self.data_helper, self.config) action_feat, span_boundary = fg.gen_features() span_embeds = self.clf.get_edus_bert_coref([doc], [clusters], [span_boundary]) action_probs, rel_probs = self.clf.decode_action_coref( span_embeds, [action_feat]) all_action_probs.append(action_probs.squeeze()) sorted_action_idx = torch.argsort(action_probs, descending=True) sorted_rel_idx = torch.argsort(rel_probs, descending=True) # Select Shift/Reduce action (shift/reduce-nn/...) action_idx = 0 pred_action, pred_nuc = xidx_action_map[int( sorted_action_idx[0, action_idx])] while not conf.is_action_allowed( (pred_action, pred_nuc, None), doc): action_idx += 1 pred_action, pred_nuc = xidx_action_map[int( sorted_action_idx[0, action_idx])] # Select Relation annotation pred_rel = None if pred_action != "Shift": all_rel_probs.append(rel_probs.squeeze()) pred_rel_idx = int(sorted_rel_idx[0, 0]) pred_rel = xidx_relation_map[pred_rel_idx] #assert not (pred_action == "Reduce" and pred_rel is None) if (pred_action == "Reduce" and pred_rel is None): print( "Warning: got a Reduce with a None relation. Replacing with Elaboration" ) pred_rel = "Elaboration" predictions = (pred_action, pred_nuc, pred_rel) conf.operate(predictions) # Shift/Reduce loss cost = self.loss(torch.stack(all_action_probs), gold_actions) # Relation annotation loss if all_rel_probs != []: cost_relation = self.loss(torch.stack(all_rel_probs), gold_rels) cost += cost_relation tree = conf.get_parse_tree() rst_tree = RstTree() rst_tree.assign_tree(tree) rst_tree.assign_doc(doc) rst_tree.back_prop(tree, doc) return rst_tree, cost.item() def evaluate(self): self.clf.eval() if self.config[MODEL_TYPE] > 1: self.coref_trainer.model.eval() with torch.no_grad(): eval = Evaluator(self, self.data_helper, self.config) eval.eval_parser(self.data_helper.val_trees) def save(self, model_name, epoch): """Save models """ save_dict = { 'epoch': epoch, 'model_state_dict': self.clf.state_dict(), 'optimizer_state_dict': self.optim.state_dict(), 'scheduler_state_dict': self.scheduler.state_dict() } if self.clf.config[MODEL_TYPE] in [2, 3]: save_dict.update({ 'coref_state_dict': self.coref_trainer.model.state_dict(), 'coref_optimizer_state_dict': self.coref_trainer.optimizer.state_dict(), 'coref_scheduler_state_dict': self.coref_trainer.scheduler.state_dict(), }) torch.save(save_dict, os.path.join("../data/model/", model_name)) def load(self, model_dir): """ Load models """ model_save = torch.load(model_dir) cleanup_load_dict(model_save) self.clf.load_state_dict(model_save['model_state_dict']) self.clf.eval() if self.optim is not None: self.optim.load_state_dict(model_save['optimizer_state_dict']) self.scheduler.load_state_dict(model_save['scheduler_state_dict']) if self.config[MODEL_TYPE] in [2, 3]: self.coref_trainer.model.load_state_dict( model_save['coref_state_dict']) self.coref_trainer.model.eval() self.coref_trainer.optimizer.load_state_dict( model_save['coref_optimizer_state_dict']) self.coref_trainer.scheduler.load_state_dict( model_save['coref_scheduler_state_dict']) self.clf.bert = self.coref_trainer.model.encoder.bert return model_save['epoch']
def _train(task, logger, tb_writer, model, tokenizer, dataset, max_steps, num_train_epochs, gradient_accumulation_steps, weight_decay, learning_rate, adam_epsilon, max_grad_norm, warmup_steps, fp16, fp16_opt_level, n_gpu, local_rank, evaluate_during_training, evaluate_func, per_gpu_train_batch_size, device, output_dir, model_type, model_name_or_path, configs, seed, logging_steps, save_steps, **kwargs): """ The basic training process function """ train_batch_size = per_gpu_train_batch_size * max(1, n_gpu) train_sampler = RandomSampler( dataset) if local_rank == -1 else DistributedSampler(dataset) train_dataloader = DataLoader( dataset, sampler=train_sampler, batch_size=train_batch_size) # could also just pass shuffle=True, which uses a RandomSampler by default if max_steps > 0: # determine the total number of optimization steps t_total = max_steps num_train_epochs = max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader) // gradient_accumulation_steps * num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": float(weight_decay), # the default weight decay is 0 }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=float(learning_rate), eps=float(adam_epsilon)) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) if os.path.isfile(os.path.join( model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(model_name_or_path, "scheduler.pt"))) if fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(dataset)) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", per_gpu_train_batch_size) logger.info( " Total train batch size (w.
parallel, distributed & accumulation) = %d", train_batch_size * gradient_accumulation_steps * (torch.distributed.get_world_size() if local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from checkpoint if os.path.exists(model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path if re.match("checkpoint-\d+", model_name_or_path): global_step = int(model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(num_train_epochs), desc="Epoch", disable=local_rank not in [-1, 0]) set_seed(seed, n_gpu) # Added here for reproductibility for _ in train_iterator: # epoch epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # dataitor # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(device) for t in batch) # decide inputs based one task type inputs = _decide_inputs(task, batch, model_type) outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % gradient_accumulation_steps == 0: if fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if local_rank in [ -1, 0 ] and logging_steps > 0 and global_step % logging_steps == 0: # Log metrics if ( local_rank == -1 and evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results, _ = evaluate_func(mode="dev", model=model, tokenizer=tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / logging_steps, global_step) logging_loss = tr_loss if local_rank in [ -1, 0 ] and save_steps > 0 and global_step % save_steps == 0: # Save model checkpoint outputdir = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(outputdir): os.makedirs(outputdir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained( outputdir) # TODO: check save_pretrained method 
tokenizer.save_pretrained(outputdir) torch.save(configs, os.path.join(outputdir, "training_args.bin")) logger.info("Saving model checkpoint to %s", outputdir) torch.save(optimizer.state_dict(), os.path.join(outputdir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(outputdir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", outputdir) if max_steps > 0 and global_step > max_steps: epoch_iterator.close() break if max_steps > 0 and global_step > max_steps: train_iterator.close() break if local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def main(): # argument parsing parser = argparse.ArgumentParser() parser.add_argument('--max-epochs', type=int, default=2) parser.add_argument('--batch-size', type=int, default=4) parser.add_argument('--max-sequence-length', type=int, default=128) parser.add_argument('--seed', type=int, default=None) parser.add_argument('--data-dir', type=str, default='data') parser.add_argument('--real-dataset', type=str, default='webtext') parser.add_argument('--fake-dataset', type=str, default='xl-1542M-nucleus') parser.add_argument('--save-dir', type=str, default='bert_logs') parser.add_argument('--learning-rate', type=float, default=2e-5) parser.add_argument('--weight-decay', type=float, default=0) parser.add_argument('--model-name', type=str, default='bert-base-cased') parser.add_argument('--wandb', type=bool, default=True) args = parser.parse_args() if args.wandb: wandb.init(project=args.model_name) device = "cuda" if torch.cuda.is_available() else "cpu" # config, tokenizer, model config = AutoConfig.from_pretrained( args.model_name, num_labels=2 ) tokenizer = AutoTokenizer.from_pretrained(args.model_name) tokenization_utils.logger.setLevel('DEBUG') model = AutoModelForSequenceClassification.from_pretrained( args.model_name, config=config ) model.to(device) # load data train_loader, validation_loader, test_loader = load_datasets(args, tokenizer) # my model optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) best_val = 0. for epoch in range(args.max_epochs): train(model, optimizer, train_loader, args, device) val_acc = validation(model, validation_loader, args, device) test_acc = test(model, test_loader, args, device) print(f"Epoch {epoch + 1} | val_acc: {val_acc} test_acc: {test_acc}") if val_acc > best_val: os.makedirs(args.save_dir, exist_ok=True) model_name = 'baseline_' + args.model_name + '.pt' model_to_save = model.module if hasattr(model, 'module') else model torch.save(dict( epoch=epoch+1, model_state_dict=model_to_save.state_dict(), optimizer_state_dict=optimizer.state_dict(), args=args ), os.path.join(args.save_dir, model_name) ) print("Model saved to", args.save_dir) best_val = val_acc
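# --- Illustrative sketch (not part of the original sources) ---
# main() above calls train(model, optimizer, train_loader, args, device) once per epoch, but that
# helper (and validation/test) is not shown. A minimal training epoch consistent with the call
# signature might look like the sketch below; the batch layout of (input_ids, attention_mask,
# labels) is an assumption.
from tqdm import tqdm

def train_epoch_sketch(model, optimizer, loader, args, device):
    model.train()
    running_loss = 0.0
    for input_ids, attention_mask, labels in tqdm(loader, desc="train"):
        optimizer.zero_grad()
        outputs = model(input_ids.to(device),
                        attention_mask=attention_mask.to(device),
                        labels=labels.to(device))
        loss = outputs[0]  # loss comes first when labels are provided
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / max(len(loader), 1)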
ins_accum += BATCH_SIZE if niter % 1000 == 0: print("experiment on val...") model.eval() val_loss, val_acc = evaluate(model, val_dataset, val_labels) model.train() train_log['Val/Loss'].append((epoch, niter, val_loss)) train_log['Val/Acc'].append((epoch, niter, val_acc)) print(train_log['Val/Loss'][-1]) print(train_log['Val/Acc'][-1]) if val_acc > best_val_acc: best_val_acc = val_acc state = { 'net': model.state_dict(), 'optimizer': optim.state_dict(), 'epoch': epoch } train_log['Save'].append((epoch, niter, best_val_acc)) print('saving best model at epoch {} iter {}'.format( epoch, niter)) torch.save(state, 'state_bert_base_best.pth') if niter % 100 == 0: # print(niter, input_ids, outputs, labels) train_log['Train/Loss'].append( (epoch, niter, loss_accum / ins_accum)) train_log['Train/Acc'].append( (epoch, niter, correct_accum / ins_accum)) print(train_log['Train/Loss'][-1]) print(train_log['Train/Acc'][-1])
def main(): print('Start') parser = argparse.ArgumentParser() # Add the arguments to the parser parser.add_argument("--model_name", required=True) parser.add_argument("--checkpoint_input_path", required=False) parser.add_argument("--checkpoint_output_path", required=True) parser.add_argument("--bioasq_path", required=True) parser.add_argument("--seed", default=1995) parser.add_argument("--learning_rate", default=5e-5, type=float) parser.add_argument("--batch_size", default=16, type=int) parser.add_argument("--epochs", default=3, type=int) args = vars(parser.parse_args()) random.seed(args['seed']) with open(args['bioasq_path'], 'rb') as f: bio_list_raw = json.load(f)['questions'] bio_list_raw = [ question for question in bio_list_raw if question['type'] == 'list' ] bio_list_questions = [question['body'] for question in bio_list_raw] bio_list_ids = [question['id'] for question in bio_list_raw] bio_list_answers = [question['exact_answer'] for question in bio_list_raw] bio_snippets = { question['id']: [snippet['text'] for snippet in question['snippets']] for question in bio_list_raw } print(f'Number of questions: {len(bio_list_questions)}') ids = [] snippets = [] for key, value in bio_snippets.items(): for snippet in value: ids.append(key) snippets.append(snippet) snippets_df = pd.DataFrame({'id': ids, 'snippet': snippets}) questions_df = pd.DataFrame({ 'id': bio_list_ids, 'question': bio_list_questions, 'label': bio_list_answers }) val_df = pd.merge(snippets_df, questions_df, how='left', on='id') ids = [] labels = [] snippets = [] questions = [] for index, row in val_df.iterrows(): ids += [row['id'] + f'_{i}' for i in range(len(row['label']))] labels += [row['label'][i][0] for i in range(len(row['label']))] snippets += [row['snippet'] for i in range(len(row['label']))] questions += [row['question'] for i in range(len(row['label']))] list_df = pd.DataFrame({ 'id': ids, 'question': questions, 'snippet': snippets, 'label': labels }) list_df = list_df.sample(16) def get_start_answer(row): label = row['label'].lower() context = row['snippet'].lower() if label in context: return context.index(label) return None list_df['answer_start'] = list_df.apply(get_start_answer, axis=1) clean_df = list_df[~list_df.answer_start.isnull()] bio_list_questions = list(clean_df.question) bio_list_contexts = list(clean_df.snippet) bio_list_answers = [{ 'text': row['label'], 'answer_start': int(row['answer_start']) } for index, row in clean_df.iterrows()] from transformers import BertTokenizerFast tokenizer_fast = BertTokenizerFast.from_pretrained( args['model_name'], do_lower_case=True, padding=True, truncation=True, add_special_tokens=True, model_max_length=1000000000) # In[26]: from squad_processing import add_end_idx, add_token_positions add_end_idx(bio_list_answers, bio_list_contexts) # In[27]: list_encodings = tokenizer_fast(bio_list_contexts, bio_list_questions, add_special_tokens=True, truncation=True, padding=True, max_length=500) # In[29]: add_token_positions(list_encodings, bio_list_answers, tokenizer_fast) # In[30]: from torch.utils.data import Dataset class SquadDataset(Dataset): def __init__(self, encodings): self.encodings = encodings def __getitem__(self, idx): #print(self.encodings['start_positions'][idx]) #{key: torch.tensor(val[idx], dtype = torch.long) for key, val in self.encodings.items()} return { 'input_ids': torch.tensor(self.encodings['input_ids'][idx], dtype=torch.long), 'attention_mask': torch.tensor(self.encodings['attention_mask'][idx], dtype=torch.long), 'start_positions': 
torch.tensor(self.encodings['start_positions'][idx], dtype=torch.long), 'end_positions': torch.tensor(self.encodings['end_positions'][idx], dtype=torch.long) } def __len__(self): return len(self.encodings.input_ids) # In[32]: train_bio_list = SquadDataset(list_encodings) # In[46]: from transformers import BertPreTrainedModel, BertModel from torch import nn from torch.utils.data import DataLoader from transformers import AdamW from transformers.modeling_outputs import QuestionAnsweringModelOutput import torch from torch.nn import CrossEntropyLoss # In[47]: class BertForQuestionAnswering(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = (start_logits, end_logits) + outputs[2:] return ((total_loss, ) + output) if total_loss is not None else output return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) # In[48]: model = BertForQuestionAnswering.from_pretrained(args['model_name']) # In[49]: device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') checkpoint = torch.load(args['checkpoint_input_path'], map_location=device) model.load_state_dict({ key.replace('module.', ''): value for key, value in checkpoint.items() }) # In[50]: model.to(device) model.train() from torch.nn import DataParallel model = DataParallel(model) train_loader = DataLoader(train_bio_list, batch_size=args['batch_size'], shuffle=True) optim = AdamW(model.parameters(), lr=args['learning_rate']) # In[51]: # Train on BioAsq from barbar import Bar for epoch in range(args['epochs']): for i, batch in enumerate(Bar(train_loader)): optim.zero_grad() input_ids = batch['input_ids'].to(device, dtype=torch.long) attention_mask = batch['attention_mask'].to(device, dtype=torch.long) start_positions = batch['start_positions'].to(device, dtype=torch.long) end_positions = batch['end_positions'].to(device, dtype=torch.long) outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] loss.sum().backward() optim.step() model.eval() # In[ ]: torch.save( { 'epoch': 3, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optim.state_dict(), 'loss': loss, }, args['checkpoint_output_path'] + '/checkpoint_list.pt')
def train(args, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() # Initial train dataloader if args.use_random_candidates: train_dataset, _, _= load_and_cache_examples(args, tokenizer) elif args.use_hard_negatives or args.use_hard_and_random_negatives: train_dataset, _, _ = load_and_cache_examples(args, tokenizer, model) else: train_dataset, _, _ = load_and_cache_examples(args, tokenizer) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if args.resume_path is not None and os.path.isfile(os.path.join(args.resume_path, "optimizer.pt")) \ and os.path.isfile(os.path.join(args.resume_path, "scheduler.pt") ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.resume_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.resume_path, "scheduler.pt"))) logger.info("INFO: Optimizer and scheduler state loaded successfully.") if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # For debugging: Register backward hooks to check gradient # def hook(self, grad_in, grad_out): # print(self) # print('grad_in') # print([_grad_in for _grad_in in grad_in if _grad_in is not None]) # print('grad_out') # print([_grad_out for _grad_out in grad_out if _grad_out is not None]) # # for module in model.modules(): # module.register_backward_hook(hook) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.resume_path is not None: # set global_step to global_step of last saved checkpoint from model path # global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) global_step = int(args.resume_path.split("/")[-2].split("-")[-1]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] ) set_seed(args) # Added here for reproductibility for epoch_num in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) ner_inputs = {"args": args, "mention_token_ids": batch[0], "mention_token_masks": batch[1], "mention_start_indices": batch[7], "mention_end_indices": batch[8], "mode": 'ner', } if args.use_hard_and_random_negatives: ned_inputs = {"args": args, "last_hidden_states": None, "mention_start_indices": batch[7], "mention_end_indices": batch[8], "candidate_token_ids_1": batch[2], "candidate_token_masks_1": batch[3], "candidate_token_ids_2": batch[4], "candidate_token_masks_2": batch[5], "labels": batch[6], "mode": 'ned', } else: ned_inputs = {"args": args, "mention_token_ids": batch[0], "mention_token_masks": batch[1], "mention_start_indices": batch[7], "mention_end_indices": batch[8], "candidate_token_ids_1": batch[2], "candidate_token_masks_1": batch[3], "labels": batch[6], "mode": 'ned', } if args.ner: loss, _ = model.forward(**ner_inputs) elif args.alternate_batch: # Randomly choose whether to do tagging or NED for the current batch if random.random() <= 0.5: loss = model.forward(**ner_inputs) else: loss, _ = model.forward(**ned_inputs) elif args.ner_and_ned: ner_loss, last_hidden_states = model.forward(**ner_inputs) ned_inputs["last_hidden_states"] = last_hidden_states ned_loss, _ = model.forward(**ned_inputs) loss = ner_loss + ned_loss else: logger.info(" Specify a training protocol from (ner, alternate_batch, ner_and_ned)") if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: 
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break # New data loader for the next epoch if args.use_random_candidates: # New data loader at every epoch for random sampler if we use random negative samples train_dataset, _, _= load_and_cache_examples(args, tokenizer) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) elif args.use_hard_negatives or args.use_hard_and_random_negatives: # New data loader at every epoch for hard negative sampler if we use hard negative mining train_dataset, _, _= load_and_cache_examples(args, tokenizer, model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) # Anneal the lamba_1 nd lambda_2 weights args.lambda_1 = args.lambda_1 - 1 / (epoch_num + 1) args.lambda_2 = args.lambda_2 + 1 / (epoch_num + 1) if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
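# The annealing at the end of the epoch loop above shifts weight from lambda_1 to
# lambda_2 by 1/(epoch_num + 1) per epoch. A quick sketch of the resulting schedule
# (initial values of 1.0 and 0.0 are assumptions; the real defaults come from the
# argument parser).
lambda_1, lambda_2 = 1.0, 0.0
for epoch_num in range(5):
    lambda_1 -= 1 / (epoch_num + 1)
    lambda_2 += 1 / (epoch_num + 1)
    print(epoch_num, round(lambda_1, 3), round(lambda_2, 3))
# epoch 0: 0.0, 1.0   epoch 1: -0.5, 1.5   epoch 2: -0.833, 1.833 ...
# note that with these initial values lambda_1 goes negative after the first epoch.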
        predicted = torch.max(logits, 1)[1]
        # print("labels:")
        num_total += labels.size(0)
        num_correct += (predicted == labels).sum().item()
        running_loss += loss.item()
        print("predicted, labels:", predicted.cpu().detach().numpy(),
              labels.cpu().detach().numpy())
    print('Validation Accuracy: {}'.format(num_correct / num_total),
          'Average Loss: {}'.format(running_loss / len(valid_loader)))


model.to(device)
lr = 3e-5
# optimizer = optim.SGD(model.parameters(), lr=lr)
optimizer = AdamW(model.parameters(), lr=lr)
num_epochs = 25

train(model, train_dataloader, valid_dataloader, optimizer, num_epochs)

torch.save(model.state_dict(), 'model.npy')
torch.save(optimizer.state_dict(), 'optimizer.npy')

# num_epochs = 15
# train(model, train_dataloader, valid_dataloader, optimizer, criterion, num_epochs)
#
# torch.save(model.state_dict(), 'model.npy')
# torch.save(optimizer.state_dict(), 'optimizer.npy')
def main(): print('Start') parser = argparse.ArgumentParser() # Add the arguments to the parser parser.add_argument("--model_name", required=True) parser.add_argument("--checkpoint_input_path", required=False) parser.add_argument("--checkpoint_output_path", required=True) parser.add_argument("--bioasq_path", required=True) parser.add_argument("--seed", default=1995) parser.add_argument("--learning_rate", default=5e-5, type=float) parser.add_argument("--batch_size", default=16, type=int) parser.add_argument("--epochs", default=3, type=int) parser.add_argument('--mid_layer', dest='mid_layer', action='store_true') parser.add_argument('--no-mid_layer', dest='mid_layer', action='store_false') parser.set_defaults(mid_layer=True) parser.add_argument('--balance', dest='balance', action='store_true') parser.add_argument('--no-balance', dest='balance', action='store_false') parser.set_defaults(balance=True) parser.add_argument("--mid_layer_size", default=256, type=int) args = vars(parser.parse_args()) print(args['mid_layer']) random.seed(args['seed']) device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') with open(args['bioasq_path'], 'rb') as f: bio_yn_raw = json.load(f)['questions'] bio_yn = [ question for question in bio_yn_raw if question['type'] == 'yesno' ] bio_yn_questions = [question['body'] for question in bio_yn] bio_yn_ids = [question['id'] for question in bio_yn] bio_yn_answers = [question['exact_answer'] for question in bio_yn] bio_snippets = { question['id']: [snippet['text'] for snippet in question['snippets']] for question in bio_yn } ids = [] snippets = [] for key, value in bio_snippets.items(): for snippet in value: ids.append(key) snippets.append(snippet) snippets_df = pd.DataFrame({'id': ids, 'snippet': snippets}) questions_df = pd.DataFrame({ 'id': bio_yn_ids, 'question': bio_yn_questions, 'label': bio_yn_answers }) bio_yn_df = pd.merge(snippets_df, questions_df, how='left', on='id') bio_yn_df = bio_yn_df.sample(32) no_size = bio_yn_df[bio_yn_df.label == 'no'].shape[0] yes_index = bio_yn_df[bio_yn_df.label == 'yes'].index random_index = np.random.choice(yes_index, no_size, replace=False) yes_sample = bio_yn_df.loc[random_index] bio_yn_balanced = pd.concat( [yes_sample, bio_yn_df[bio_yn_df.label == 'no']]) bio_yn_balanced = bio_yn_balanced.sample(frac=1) if args['balance']: train_a = list(bio_yn_balanced.question) train_b = list(bio_yn_balanced.snippet) train_labels = [ int(answer == 'yes') for answer in bio_yn_balanced.label ] else: train_a = list(bio_yn_df.question) train_b = list(bio_yn_df.snippet) train_labels = [int(answer == 'yes') for answer in bio_yn_df.label] from transformers import BertTokenizer # Load the BERT tokenizer. 
tokenizer = BertTokenizer.from_pretrained(args['model_name'], do_lower_case=True) # In[39]: train_tokens = tokenizer(train_a, train_b, add_special_tokens=True, max_length=500, truncation=True, padding=True) train_tokens['labels'] = train_labels # In[40]: from torch.utils.data import Dataset, DataLoader class MnliDataset(Dataset): def __init__(self, encodings): self.encodings = encodings def __getitem__(self, idx): #print(self.encodings['start_positions'][idx]) #{key: torch.tensor(val[idx], dtype = torch.long) for key, val in self.encodings.items()} return { 'input_ids': torch.tensor(self.encodings['input_ids'][idx], dtype=torch.long), 'attention_mask': torch.tensor(self.encodings['attention_mask'][idx], dtype=torch.long), 'token_type_ids': torch.tensor(self.encodings['token_type_ids'][idx], dtype=torch.long), 'labels': torch.tensor(self.encodings['labels'][idx], dtype=torch.long) } def __len__(self): return len(self.encodings.input_ids) train_dataset = MnliDataset(train_tokens) # In[5]: # In[4]: from transformers import BertForSequenceClassification model = BertForSequenceClassification.from_pretrained(args['model_name'], num_labels=3) checkpoint = torch.load(args['checkpoint_input_path'], map_location=device) model.load_state_dict({ key.replace('module.', ''): value for key, value in checkpoint.items() }) # freeze all the parameters #for param in model.parameters(): # param.requires_grad = False # In[73]: class BERT_Arch(nn.Module): def __init__(self, model): super(BERT_Arch, self).__init__() self.model = model # dropout layer self.dropout = nn.Dropout(0.1) # relu activation function self.relu = nn.ReLU() # dense layer 1 if args['mid_layer']: self.fc1 = nn.Linear(3, args['mid_layer_size']) self.fc2 = nn.Linear(args['mid_layer_size'], 2) else: self.fc1 = nn.Linear(3, 2) #softmax activation function self.softmax = nn.LogSoftmax(dim=1) #define the forward pass def forward(self, input_ids, attention_mask, token_type_ids, labels): #pass the inputs to the model outputs = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels) cls_hs = outputs.logits if args['mid_layer']: x = self.fc1(cls_hs) x = self.relu(x) x = self.dropout(x) # output layer x = self.fc2(x) # apply softmax activation x = self.softmax(x) else: x = self.dropout(cls_hs) # output layer x = self.fc1(x) # apply softmax activation x = self.softmax(x) return x # In[74]: model_full = BERT_Arch(model) # In[81]: from torch.utils.data import DataLoader from transformers import AdamW from torch.nn import DataParallel model_full.to(device) model_full.train() model_full = DataParallel(model_full) train_loader = DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True) optim = AdamW(model.parameters(), lr=args['learning_rate']) # In[83]: cross_entropy = nn.NLLLoss() for epoch in range(args['epochs']): for i, batch in enumerate(Bar(train_loader)): optim.zero_grad() input_ids = batch['input_ids'].to(device, dtype=torch.long) attention_mask = batch['attention_mask'].to(device, dtype=torch.long) token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long) labels = batch['labels'].to(device, dtype=torch.long) outputs = model_full(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels) #loss = outputs.loss loss = cross_entropy(outputs, labels) loss.backward() optim.step() model_full.eval() # In[ ]: if args['mid_layer']: checkpoint_output = args[ 'checkpoint_output_path'] + '/checkpoint_yn_' + str( args['mid_layer_size']) + '.pt' else: checkpoint_output 
= args[ 'checkpoint_output_path'] + '/checkpoint_yn_direct.pt' torch.save( { 'epoch': args['epochs'], 'model_state_dict': model_full.state_dict(), 'optimizer_state_dict': optim.state_dict(), 'loss': loss, }, checkpoint_output)
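# Sketch of reloading the yes/no checkpoint saved above. Because model_full was
# wrapped in DataParallel before saving, the state_dict keys carry a 'module.'
# prefix and are stripped here, mirroring how this script loads its
# checkpoint_input_path. BERT_Arch and the mid_layer settings must match the
# training run; the example path assumes the default mid_layer_size of 256.
import torch
from transformers import BertForSequenceClassification


def load_yesno_model(model_name, ckpt_path, device):
    base = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
    model_yn = BERT_Arch(base)  # BERT_Arch as defined in main() above
    ckpt = torch.load(ckpt_path, map_location=device)
    state_dict = {
        key.replace('module.', ''): value
        for key, value in ckpt['model_state_dict'].items()
    }
    model_yn.load_state_dict(state_dict)
    model_yn.to(device)
    model_yn.eval()
    return model_yn

# e.g. load_yesno_model(args['model_name'],
#                       args['checkpoint_output_path'] + '/checkpoint_yn_256.pt',
#                       device)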
class BertForSeqFinetune(): def __init__(self, model_name, config, num_labels, model_desc, save_ckpt_path, save_flag, hf_model_class, hf_token_class, model_class_type, vocab_file=None, model_weights=None, from_tf=False): """ Params ------ model_name: model_name e.g. "bert-base-uncased" or path config: BertConfig object that is initialised with the same model_name num_labels: number of classes for finetuning categorical data model_desc: description of model used to name checkpoints that are saved with training save_ckpt_path: base path for saving checkpoints. save_flag: Whether to save, not used for model evaluation, i.e. using validation data. hf_model_class: HuggingFace model class hf_model_class: HuggingFace token class vocab_file: The vocabulary from a pretrained model model_weights: The Pytorch binary file from a pretrained model from_tf: Whether the model_name is a path pointing to a model pre-trained in Tensorflow. (Not tested) """ # super(BertForSeqFinetune, self).__init__(config) ### MODEL VARIABLES ### # self.args_loaded = False self.device = None self.model_name = model_name self.config = config # initialised outside of class self.model = None self.tokenizer = None self.hf_model_class = hf_model_class self.hf_token_class = hf_token_class self.model_class_type = model_class_type ### DATA VARIABLES ### self.training_data_loader = None self.testing_data_loader = None self.validating_data_loader = None ### TRAINING VARIABLES ### self.num_labels = num_labels self.max_token_len = 128 self.lr = 2e-5 # self.TEST_SIZE = 0.2 # self.EPOCHS = 3 # self.BATCH_SIZE = 8 # Save file names self.optimizer_pt = "optimizer.pt" self.scheduler_pt = "scheduler.pt" self.save_steps = 10 self.warmup_steps = None self.total_steps = None self.gradient_accumulation_steps = 1 self.logging_steps = 50 self.max_grad_norm = 1.0 self.loss_over_time = [] self.random_state = 2018 ### EVALUATION VARIABLES ### self.validation_accuracy = None # Precision-recall by topic # self.pr_dict = defaultdict(lambda: defaultdict(int)) self.preds_arr = None self.labels_arr = None self.topics_eval_arr = None self.doc_id_eval_arr = None ### SAVE PATH VARIABLES ### self.cache_dir = None self.save_flag = save_flag if self.save_flag: self.output_dir = f"./{save_ckpt_path}/{model_desc}" if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) self._specify_model(self.model_name, self.config, self.num_labels, vocab_file=vocab_file, model_weights=model_weights) def load_dataloader_train_and_test(self, training_data_loader, testing_data_loader): self.training_data_loader = training_data_loader self.testing_data_loader = testing_data_loader def load_dataloader_validate(self, validating_data_loader): self.validating_data_loader = validating_data_loader def _specify_model(self, model_name, config, num_labels, vocab_file=None, model_weights=None, from_tf=False): """ The naming conventions for loading a pretrained model is: "config.json" "vocab.txt" "pytorch_model.bin" To be explicit, we'll force the user to specify their files. The `config.json` file specified outside of the class, so account for the remaining two. 
If we are loading the files from Tensorflow, then we need to pass in a boolean (in this case from_tf) """ if (model_weights is not None) and (from_tf == False): self.model = self.hf_model_class.from_pretrained( f"{model_name}", config=self.config) elif from_tf: self.model = self.hf_model_class.from_pretrained( f"{model_name}", from_tf=from_tf, config=self.config) else: self.model = self.hf_model_class.from_pretrained( f"{model_name}", config=self.config) if vocab_file is not None: self.tokenizer = self.hf_token_class.from_pretrained( f"{model_name}/{vocab_file}", do_lower_case=True) else: self.tokenizer = self.hf_token_class.from_pretrained( f"{model_name}", do_lower_case=True) def train(self, epochs, batch_size, use_gpu): """ use_gpu: int """ if self.model is None: raise ValueError("Model has not been specified!") if torch.cuda.is_available() and use_gpu: print("Using GPU") self.device = torch.device("cuda") torch.cuda.empty_cache() if use_gpu > 1: self.model = nn.DataParallel(self.model) else: print("CUDA not available. Using CPU") self.device = torch.device("cpu") self.model.to(self.device) self.total_steps = len(self.training_data_loader) // ( self.gradient_accumulation_steps * epochs) self.warmup_steps = int(self.total_steps / 10) self.optimizer = AdamW(self.model.parameters(), lr=2e-5, correct_bias=False) self.scheduler = get_linear_schedule_with_warmup( self.optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=self.total_steps) print(f"{self.output_dir}/optimizer.pt") if os.path.isfile( f"{self.output_dir}/optimizer.pt") and os.path.isfile( f"{self.output_dir}/scheduler.pt"): print("loading saved optimiser and scheduler") self.optimizer.load_state_dict( torch.load(f"{self.output_dir}/{self.optimizer_pt}")) self.scheduler.load_state_dict( torch.load(f"{self.output_dir}/{self.scheduler_pt}")) global_steps = 0 tr_loss, tr_loss_prev = 0.0, 0.0 nb_tr_examples = 0 for epoch in trange(epochs, desc="EPOCHS"): epoch_iterator = tqdm(self.training_data_loader, desc="Iteration") for step, batch in enumerate(epoch_iterator): self.model.train() self.model.zero_grad() batch = tuple(t.to(self.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3] } inputs['token_type_ids'] = (batch[2] if self.model_class_type in ["bert", "xlnet", "albert" ] else None) # Rewrite this code to check for model_type more easily. 
# if args.model_type != 'distilbert': # inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None outputs = self.model(**inputs) loss = outputs[0] print(f"loss: {loss}") loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm) self.optimizer.step() self.scheduler.step() tr_loss += loss.item() self.loss_over_time.append(tr_loss) nb_tr_examples += inputs["input_ids"].size(0) global_steps += 1 # @TODO: Find suitable way to record this information if global_steps % self.logging_steps == 0: avg_loss = (tr_loss - tr_loss_prev) / self.logging_steps tr_loss_prev = tr_loss print( f"Statistics over the last {self.logging_steps} steps:" ) print(f"\t global_steps: {global_steps}") print(f"\t average loss: {avg_loss}") print(f"\t loss.item(): {loss.item()}") print(f"\t tr_loss: {tr_loss}") print(f"\t nb_tr_examples: {nb_tr_examples}") if self.save_flag: output_dir = os.path.join(self.output_dir, 'checkpoint-{}'.format(global_steps)) if not os.path.exists(output_dir): os.makedirs(output_dir) if (epoch % 1) == 0: tmp_eval = self.evaluate( {"precision_recall_by_topic": eval_pr_per_topics}, use_ids=True, validate=False, use_gpu=True) # Convert the defaultdict to dict, for JSON # print(tmp_eval[4]) for key in tmp_eval[4].keys(): tmp_eval[4][key] = dict(tmp_eval[4][key]) # print(tmp_eval[4]) pr_dict_tmp = dict(tmp_eval[4]) # print(pr_dict_tmp) output_dict = { "y_truth": tmp_eval[0].tolist(), "y_pred": tmp_eval[1].tolist(), "topics_arr": tmp_eval[2].tolist(), "doc_ids_arr": tmp_eval[3].tolist(), "pr_dict": pr_dict_tmp, } # print(output_dict["pr_dict"]) # print(type(output_dict["pr_dict"])) pd.to_pickle(output_dict, f"{output_dir}/ckpt_eval.pickle") # with open(f"{output_dir}/ckpt_eval.json", "w") as f: # pickle.dump(pr_dict_tmp, f) self.save_model(output_dir) # Take care of distributed/parallel training # model_to_save = self.model.module if hasattr(self.model, 'module') else self.model # model_to_save.save_pretrained(output_dir) # @TODO: Do we want to implement a way to save the arguments? # torch.save(args, os.path.join(output_dir, 'training_args.bin')) return global_steps, tr_loss / global_steps def evaluate(self, eval_metrics_dict, use_ids, validate, use_gpu): """ Format of the batch is different depending on use_ids: IF use_ids IS True: ( [ tensor([ [a1], ..., [an] ]) tensor([ [b1], ..., [bn] ]) ... tensor([ [j1], ..., [jn] ]) ], [id1, ..., idn] ) ELSE: ( tensor([ [a1], ..., [an] ]) ... tensor([ [j1], ..., [jn] ]) ) Params ------ eval_metric_dict: { "accuracy": num_correctly_classified, "precision_recall_by_topic": eval_pr_per_topics, "roc_curve": calc_roc } use_ids: If true, we use unique IDs are used to track the individual data points validate: If true, then this is a validation set with no labels. """ eval_loss = 0.0 nb_eval_steps = 0 acc_test_loss = 0.0 self.pr_dict = defaultdict(lambda: defaultdict(int)) if torch.cuda.is_available() and use_gpu: print("Using GPU") self.device = torch.device("cuda") torch.cuda.empty_cache() if use_gpu > 1: self.model = nn.DataParallel(self.model) else: print("CUDA not available. 
Using CPU") self.device = torch.device("cpu") self.model.to(self.device) self.model.eval() if validate: test_data = self.validating_data_loader else: test_data = self.testing_data_loader for batch in tqdm(test_data, desc="EVALUATING"): with torch.no_grad(): # print(batch) if use_ids: doc_ids_batch = batch[1] batch = tuple(t.to(self.device) for t in batch[0]) topics_batch = batch[4].detach().cpu().numpy() else: batch = tuple(t.to(self.device) for t in batch) topics_batch = batch[4].detach().cpu().numpy() doc_ids_batch = batch[5].detach().cpu().numpy() inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3] } inputs['token_type_ids'] = (batch[2] if self.model_class_type in ["bert", "xlnet", "albert" ] else None) # if args.model_type != 'distilbert': # inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids # What does this do? if validate: inputs.pop("labels") outputs = self.model(**inputs) logits = outputs[0] # print(logits) else: outputs = self.model(**inputs) tmp_test_loss, logits = outputs[:2] # What does this do? eval_loss += tmp_test_loss.mean().item() nb_eval_steps += 1 ################ UPDATE TOTAL LOSS ################ logits_batch = logits.detach().cpu().numpy() if not validate: labels_batch = inputs["labels"].cpu().numpy() if "accuracy" in eval_metrics_dict.keys(): batch_test_loss = eval_metrics_dict["accuracy"](logits_batch, labels_batch) acc_test_loss += batch_test_loss if "precision_recall_by_topic" in eval_metrics_dict.keys(): eval_metrics_dict["precision_recall_by_topic"](logits_batch, labels_batch, topics_batch, self.pr_dict) # We're going to save this and return it later if self.preds_arr is None: self.preds_arr = logits_batch if not validate: self.labels_arr = labels_batch self.topics_eval_arr = topics_batch self.doc_id_eval_arr = doc_ids_batch else: self.preds_arr = np.append(self.preds_arr, logits_batch, axis=0) if not validate: # print(inputs["labels"]) self.labels_arr = np.append(self.labels_arr, labels_batch, axis=0) self.topics_eval_arr = np.append(self.topics_eval_arr, topics_batch, axis=0) self.doc_id_eval_arr = np.append(self.doc_id_eval_arr, doc_ids_batch, axis=0) ################ DISPLAY RESULTS ################ # previous metric_function function accuracy percentage for each batch # self.validation_accuracy = acc_test_loss/nb_eval_steps if not validate: eval_loss = eval_loss / nb_eval_steps print(f"eval_loss: {eval_loss}") if validate: num_test_points = len(self.validating_data_loader.dataset) else: num_test_points = len(self.testing_data_loader.dataset) print(f"acc_test_loss: {acc_test_loss}") print(f"num_test_points: {num_test_points}") if "accuracy" in eval_metrics_dict.keys(): self.validation_accuracy = acc_test_loss / num_test_points print("Validation Accuracy: {}".format(self.validation_accuracy)) if "precision_recall_by_topic" in eval_metrics_dict.keys(): for topic in self.pr_dict.keys(): if (self.pr_dict[topic]["false_positive"] + self.pr_dict[topic]["true_positive"]) == 0: print(f"FP + TP = 0") precision = 0 else: precision = self.pr_dict[topic]["true_positive"] / ( self.pr_dict[topic]["false_positive"] + self.pr_dict[topic]["true_positive"]) if (self.pr_dict[topic]["false_negative"] + self.pr_dict[topic]["true_positive"]) == 0: print(f"FN + TP = 0") recall = 0 else: recall = self.pr_dict[topic]["true_positive"] / ( self.pr_dict[topic]["false_negative"] + self.pr_dict[topic]["true_positive"]) self.pr_dict[topic]["precision"] = precision 
                self.pr_dict[topic]["recall"] = recall

        if "roc_curve" in eval_metrics_dict.keys():
            eval_metrics_dict["roc_curve"](self.preds_arr,
                                           self.labels_arr,
                                           num_classes=self.num_labels)

        # self.labels_arr is None if we are evaluating with validation data.
        return self.labels_arr, self.preds_arr, self.topics_eval_arr, \
            self.doc_id_eval_arr, self.pr_dict, self.validation_accuracy

    def save_model(self, output_dir):
        # Take care of distributed/parallel training
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        torch.save(self.optimizer.state_dict(), f"{output_dir}/{self.optimizer_pt}")
        torch.save(self.scheduler.state_dict(), f"{output_dir}/{self.scheduler_pt}")
        # @TODO: Implement dict of args
        # torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load the fine-tuned model and vocabulary back from disk
        self.model = self.hf_model_class.from_pretrained(output_dir)
        self.tokenizer = self.hf_token_class.from_pretrained(output_dir)
        self.model.to(self.device)
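# A usage sketch for BertForSeqFinetune. All values below are illustrative
# assumptions: the dataloaders (batches of input_ids, masks, segment ids, labels,
# topics, doc ids) and the metric callbacks such as eval_pr_per_topics used above
# must be built elsewhere.
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer

config = BertConfig.from_pretrained("bert-base-uncased", num_labels=2)
finetuner = BertForSeqFinetune(
    model_name="bert-base-uncased",
    config=config,
    num_labels=2,
    model_desc="bert_base_topic_clf",
    save_ckpt_path="checkpoints",
    save_flag=True,
    hf_model_class=BertForSequenceClassification,
    hf_token_class=BertTokenizer,
    model_class_type="bert",
)
# train_loader / test_loader are assumed to exist with the batch layout described above
finetuner.load_dataloader_train_and_test(train_loader, test_loader)
global_steps, avg_loss = finetuner.train(epochs=3, batch_size=8, use_gpu=1)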
def train(model, tokenizer, checkpoint, round): if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) else: amp = None train_data = Multi_task_dataset_eng(data_file=args.train_file, max_length=args.max_length, tokenizer=tokenizer, model_type=args.model_type) train_dataloader = DataLoader(dataset=train_data, batch_size=args.batch_size, shuffle=True) t_total = len(train_dataloader) * args.epochs warmup_steps = int(args.warmup_steps * t_total) optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) if args.fp16: model, optimizer = amp.initialize(model, optimizer, opt_level=args.fptype) # 读取断点 optimizer、scheduler checkpoint_dir = args.save_dir + "/checkpoint-" + str( checkpoint) + '-' + str(round) if os.path.isfile(os.path.join(checkpoint_dir, "optimizer.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(checkpoint_dir, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(checkpoint_dir, "scheduler.pt"))) if args.fp16: amp.load_state_dict( torch.load(os.path.join(checkpoint_dir, "amp.pt"))) # 开始训练 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataloader)) logger.info(" Num Epochs = %d", args.epochs) logger.info(" Batch size = %d", args.batch_size) logger.info(" learning_rate = %s", str(args.learning_rate)) logger.info(" Total steps = %d", t_total) logger.info(" warmup steps = %d", warmup_steps) logger.info(" Model_type = %s", args.model_type) logger.info(" Decoder_type = %s", args.decoder_type) logger.info(" vice_loss_weight = %s", str(args.vice_weight)) # 没有历史断点,则从0开始 if checkpoint < 0: checkpoint = 0 round = 0 else: checkpoint += 1 round += 1 max_test_acc = 0 max_test_f1 = 0 logger.debug(" Start Batch = %d", checkpoint) for epoch in range(checkpoint, args.epochs): model.train() epoch_loss = [] step = 0 for batch in tqdm(train_dataloader, desc="Iteration", ncols=50): model.zero_grad() # 设置tensor gpu运行 batch = tuple(t.to(args.device) for t in batch) if 'roberta' in args.model_type: input_ids, attention_mask, labels_main, labels_vice1, labels_vice2 = batch outputs = model(input_ids=input_ids.long(), attention_mask=attention_mask.long(), labels_main=labels_main, labels_vice1=labels_vice1, labels_vice2=labels_vice2, model_type='roberta') else: input_ids, token_type_ids, attention_mask, labels_main, labels_vice1, labels_vice2 = batch outputs = model(input_ids=input_ids.long(), token_type_ids=token_type_ids.long(), attention_mask=attention_mask.long(), labels_main=labels_main, labels_vice1=labels_vice1, labels_vice2=labels_vice2) loss = outputs[0] if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # 计算出梯度 epoch_loss.append(loss.item()) optimizer.step() scheduler.step() step += 1 if step % 500 == 0: logger.debug("loss:" + str(np.array(epoch_loss).mean())) logger.debug( 'learning_rate:' + str(optimizer.state_dict()['param_groups'][0]['lr'])) if step % args.saving_steps == 0: round += 1 dev_loss, dev_acc, dev_f1 = test(model=model, tokenizer=tokenizer, test_file=args.dev_file, checkpoint=epoch, round=round) logger.info( '【DEV】Train Epoch %d, round %d: train_loss=%.4f, acc=%.4f, f1=%.4f' % (epoch, round, dev_loss, dev_acc, dev_f1)) test_loss, test_acc, test_f1 = test(model=model, 
tokenizer=tokenizer, test_file=args.test_file, checkpoint=epoch, round=round) logger.info( '【TEST】Train Epoch %d, round %d: train_loss=%.4f, acc=%.4f, f1=%.4f' % (epoch, round, test_loss, test_acc, test_f1)) output_dir = args.save_dir + "/checkpoint-" + str( epoch) + '-' + str(round) if test_acc > max_test_acc or test_f1 > max_test_f1: max_test_acc = max(test_acc, max_test_acc) max_test_f1 = max(test_f1, max_test_f1) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = (model.module if hasattr(model, "module") else model) model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.debug("Saving model checkpoint to %s", output_dir) if args.fp16: torch.save(amp.state_dict(), os.path.join(output_dir, "amp.pt")) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.debug("Saving optimizer and scheduler states to %s", output_dir) model.train() # 保存模型 output_dir = args.save_dir + "/checkpoint-" + str(epoch) + '-' + str( round) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = (model.module if hasattr(model, "module") else model) model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.debug("Saving model checkpoint to %s", output_dir) if args.fp16: torch.save(amp.state_dict(), os.path.join(output_dir, "amp.pt")) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.debug("Saving optimizer and scheduler states to %s", output_dir) dev_loss, dev_acc, dev_f1 = test(model=model, tokenizer=tokenizer, test_file=args.dev_file, checkpoint=epoch, round=round) test_loss, test_acc, test_f1 = test(model=model, tokenizer=tokenizer, test_file=args.test_file, checkpoint=epoch, round=round) #print(test_loss, test_acc) logger.info( '【DEV】Train Epoch %d, round %d: train_loss=%.4f, acc=%.4f, f1=%.4f' % (epoch, round, dev_loss, dev_acc, dev_f1)) logger.info( '【TEST】Train Epoch %d, round %d: train_loss=%.4f, acc=%.4f, f1=%.4f' % (epoch, round, test_loss, test_acc, test_f1)) if test_acc > max_test_acc or test_f1 > max_test_f1: max_test_acc = max(test_acc, max_test_acc) max_test_f1 = max(test_f1, max_test_f1) logger.info('【BEST TEST ACC】: %.4f, 【BEST TEST F1】: %.4f' % (max_test_acc, max_test_f1))
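# In the training function above, args.warmup_steps is used as a ratio:
# warmup_steps = int(args.warmup_steps * t_total). A small standalone sketch
# (illustrative numbers, dummy parameters) of the resulting linear warmup/decay
# learning-rate schedule.
import torch
from transformers import get_linear_schedule_with_warmup

t_total, warmup_ratio, base_lr = 1000, 0.1, 2e-5
warmup_steps = int(warmup_ratio * t_total)  # 100 warmup steps
dummy = torch.nn.Linear(2, 2)
opt = torch.optim.AdamW(dummy.parameters(), lr=base_lr)
sched = get_linear_schedule_with_warmup(opt,
                                        num_warmup_steps=warmup_steps,
                                        num_training_steps=t_total)
for step in range(t_total):
    opt.step()
    sched.step()
    if step in (0, 99, 499, 999):
        # lr rises to 2e-5 by the end of warmup, then decays linearly to 0
        print(step, sched.get_last_lr()[0])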
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler_total = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) subset_quantity = args.div_subset # notice 难度划分 curriculum_sets_temp = [] # done 如何保证课程被采样了 diff_eval_result = Difficulty_Evaluation(args, train_dataset) for i,subset in enumerate(diff_eval_result): gate = int((len(train_dataset)/args.train_batch_size)/(subset_quantity)) print("第",i,"个 num:",len(subset)," 阈值 ",gate) random.shuffle(subset) # 如果subset过于小,就不采样了 if len(subset) > gate: # subset = list(subset) # 决定没一个采样的长度 curriculum_sets_temp.append(subset[0:int( gate /subset_quantity)]) # elif(len(subset) <= int(gate/subset_quantity)): # for i in range(subset_quantity): # curriculum_sets_temp.append(subset) else: curriculum_sets_temp.append(subset) # curriculum_sets_temp.append(subset) # 不采样的 # diff_eval_result = Difficulty_Evaluation(args, train_dataset) # for _ in range(int(args.num_train_epochs)): # for i, subset in enumerate(diff_eval_result): # random.shuffle(subset) # curriculum_sets_temp.append(subset) # 随机划分 # curriculum_sets_temp = Difficulty_Evaluation_Randomly(args,train_dataset) # 先添加全部任务 curriculum_sets = [] total_train_dataloader = DataLoader(train_dataset, sampler=train_sampler_total, batch_size=args.train_batch_size) for i in range(int(args.num_train_epochs)): curriculum_sets.append(total_train_dataloader) # 再添加课程任务 # notice 课程任务顺序 curriculum_sets += curriculum_sets_temp # CL阶段训练 if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(curriculum_sets[0]) // args.gradient_accumulation_steps) + 1 else: t_total = len(curriculum_sets[0]) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] # notice 添加L2正则化 optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon,weight_decay=0.01) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt") ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) 
# Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(curriculum_sets[0])) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(curriculum_sets[0]) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(curriculum_sets[0]) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( # epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] epochs_trained, int(len(curriculum_sets)), desc = "Epoch", disable = args.local_rank not in [-1, 0] ) # Added here for reproductibility set_seed(args) current_stage = 0 for _ in train_iterator: epoch_iterator = tqdm(curriculum_sets[current_stage], desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) # print("batch_size",batch[0].shape) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]: del inputs["token_type_ids"] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) if hasattr(model, "config") and hasattr(model.config, "lang2id"): inputs.update( {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} ) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] # notice 添加KL的loss 或者 wgan的那个w # pa = 100 # loss += (pa * (cal_diff(outputs.hidden_states[0], outputs.hidden_states[-1],norm="line",criterion="wd"))) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 
0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break current_stage += 1 if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
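# Difficulty_Evaluation is not shown in this file, and the loop above iterates the
# difficulty subsets directly. The sketch below is only an alternative illustration
# of the staging it consumes: every regular epoch sees the full dataset, followed by
# one extra stage per difficulty bucket (bucket order and DataLoader wrapping are
# assumptions, not the author's exact mechanism).
from torch.utils.data import DataLoader, RandomSampler, Subset


def build_curriculum(train_dataset, difficulty_buckets, num_train_epochs, batch_size):
    full_loader = DataLoader(train_dataset,
                             sampler=RandomSampler(train_dataset),
                             batch_size=batch_size)
    stages = [full_loader] * int(num_train_epochs)
    for bucket_indices in difficulty_buckets:  # e.g. index lists per difficulty level
        subset = Subset(train_dataset, bucket_indices)
        stages.append(DataLoader(subset, shuffle=True, batch_size=batch_size))
    return stages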
def train(args, data_generator, model, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter(log_dir=args.run_name) train_dataset = data_generator.instance_a_train_dataset() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) criterion = nn.BCEWithLogitsLoss() def collate(batch): # if tokenizer._pad_token is None: # return pad_sequence(examples, batch_first=True) # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) tokens = [b[0] for b in batch] features = [b[1] for b in batch] targets = [b[2] for b in batch] inputs = [b[3] for b in batch] lens = [len(x) for x in inputs] inputs = pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id) attention_mask = (inputs != tokenizer.pad_token_id).int() tokens, features, targets = [ torch.tensor(x) for x in [tokens, features, targets] ] return tokens, features, targets, inputs, attention_mask, torch.tensor( lens).unsqueeze(1) if args.use_bucket_iterator: print("\n\n\n\n USING THE BUCKET ITERATOR \n\n\n\n") bucket_boundaries = [0, 20, 40, 60, 80, 101] train_sampler = BySequenceLengthSampler( train_dataset, bucket_boundaries, batch_size=args.train_batch_size, drop_last=False) train_dataloader = DataLoader(train_dataset, batch_size=1, batch_sampler=train_sampler, collate_fn=collate) else: train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [{ "params": [ p for n, p in model.bert.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, "lr": args.learning_rate, }, { "params": [ p for n, p in model.bert.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, "lr": args.learning_rate, }, { "params": [ p for n, p in model.mlp_net.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, "lr": args.mlp_learning_rate, }, { "params": [ p for n, p in model.mlp_net.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, "lr": args.mlp_learning_rate, }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # # Check if saved optimizer or scheduler states exist # TODO if ( # args.model_name_or_path # and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) # and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")) # ): # # Load in optimizer and scheduler states # optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) # scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) # if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint # if args.model_name_or_path and os.path.exists(args.model_name_or_path): # try: # # set global_step to gobal_step of last saved checkpoint from model path # checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] # global_step = int(checkpoint_suffix) # epochs_trained = global_step // ((len(train_dataset)//args.train_batch_size) // args.gradient_accumulation_steps) # steps_trained_in_current_epoch = global_step % ((len(train_dataset)//args.train_batch_size) // args.gradient_accumulation_steps) # logger.info(" Continuing training from checkpoint, will skip to saved global_step") # logger.info(" Continuing training from epoch %d", epochs_trained) # logger.info(" Continuing training from global step %d", global_step) # logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) # except ValueError: # logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 # model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training # model_to_resize.resize_token_embeddings(len(tokenizer)) if args.continue_training: model.load_state_dict( torch.load(args.continue_training_path + "model.bin")) optimizer.load_state_dict( torch.load(args.continue_training_path + "optimizer.pt")) scheduler.load_state_dict( torch.load(args.continue_training_path + "scheduler.pt")) print("\n loaded model/optimizer/scheduler") model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproducibility evaluate(args, data_generator, tb_writer, model, tokenizer, global_step) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue # training loop tokens, features, targets, inputs, attention_mask, lens = batch tokens, features, targets, inputs, attention_mask, lens = [ x.to(args.device) for x in [tokens, features, targets, inputs, attention_mask, lens] ] tokens, features, targets = [ x.float() for x in [tokens, features, targets] ] model.train() logit = model(tokens, features, inputs, attention_mask, lens) 
loss = criterion(logit, targets) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if step % 100 == 0: tb_writer.add_scalar("training_loss", loss.item(), global_step) print("{}".format(loss.item())) with open("./train_log.txt", "a") as f: f.write("{} \n".format(loss.item())) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well # results = evaluate(args, tb_writer, model, tokenizer) evaluate(args, data_generator, tb_writer, model, tokenizer, global_step) # for key, value in results.items(): # tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) # model_to_save = ( # model.module if hasattr(model, "module") else model # ) # Take care of distributed/parallel training # model_to_save.save_pretrained(output_dir) torch.save(model.state_dict(), os.path.join(output_dir, "model.bin")) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
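# ---------------------------------------------------------------------------
# Illustrative sketch only: the train() above relies on a BySequenceLengthSampler
# whose implementation is not shown here. Assuming it groups example indices into
# the length buckets given by bucket_boundaries and yields shuffled fixed-size
# batches per bucket, a minimal stand-in could look like the class below. The
# class name and the use of item[3] as the token-id field mirror the collate()
# above but are otherwise assumptions, not the original code.
# ---------------------------------------------------------------------------
import numpy as np
from torch.utils.data import Sampler


class SimpleBucketBatchSampler(Sampler):
    """Batch sampler that buckets examples by sequence length before batching."""

    def __init__(self, dataset, bucket_boundaries, batch_size, drop_last=False):
        super().__init__(dataset)
        self.batch_size = batch_size
        self.drop_last = drop_last
        # Group example indices by the length bucket their input ids fall into.
        self.buckets = {b: [] for b in range(len(bucket_boundaries) - 1)}
        for idx in range(len(dataset)):
            length = len(dataset[idx][3])  # item[3] assumed to hold the token ids
            for b in range(len(bucket_boundaries) - 1):
                if bucket_boundaries[b] <= length < bucket_boundaries[b + 1]:
                    self.buckets[b].append(idx)
                    break

    def __iter__(self):
        batches = []
        for idxs in self.buckets.values():
            np.random.shuffle(idxs)
            for i in range(0, len(idxs), self.batch_size):
                batch = idxs[i:i + self.batch_size]
                if len(batch) == self.batch_size or not self.drop_last:
                    batches.append(batch)
        np.random.shuffle(batches)  # interleave buckets across the epoch
        return iter(batches)

    def __len__(self):
        full, rest = 0, 0
        for idxs in self.buckets.values():
            full += len(idxs) // self.batch_size
            if not self.drop_last and len(idxs) % self.batch_size:
                rest += 1
        return full + rest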
def train(train_dataset, model, tokenizer, n_epochs=2, eval_every=2500, save_every=5000, output_folder='SQUAD_data', checkpoint='-1', bs=2, w_checkpoint=True, tensordir='runs', acc_steps=1): """ Train the model """ tb_writer = SummaryWriter(tensordir) train_batch_size = bs train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size) t_total = len(train_dataloader) // acc_steps * n_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": 0, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8) # scheduler = get_linear_schedule_with_warmup( # optimizer, num_warmup_steps=0, num_training_steps=t_total # ) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(output_folder, checkpoint, "optimizer.pt")): # and os.path.isfile( # os.path.join(output_folder, checkpoint, "scheduler.pt") # ): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(output_folder, checkpoint, "optimizer.pt"))) # scheduler.load_state_dict(torch.load(os.path.join(output_folder, checkpoint, "scheduler.pt"))) print('Optimizer and scheduler found !\n') # Train! print("***** Running training *****") print(" Num examples = %s" % len(train_dataset)) print(" Num Epochs = %s" % n_epochs) print(" Instantaneous batch size per GPU = %s" % bs) print(" Total optimization steps = %s" % t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(os.path.join(output_folder, checkpoint)): try: # set global_step to gobal_step of last saved checkpoint from model path if checkpoint == '': t = glob.glob(os.path.join(output_folder, '*.txt'))[0] global_step = int(t[len(output_folder) + 1:-4]) else: checkpoint_suffix = checkpoint.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // acc_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // acc_steps) print( " Continuing training from checkpoint, will skip to saved global_step" ) print(" Continuing training from epoch %d" % epochs_trained) print(" Continuing training from global step %d" % global_step) print(" Will skip the first %d steps in the first epoch" % steps_trained_in_current_epoch) except ValueError: print(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 optimizer.zero_grad() train_iterator = trange(epochs_trained, n_epochs, desc="Epoch", disable=False) # Added here for reproductibility set_seed() for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) inputs.update({"is_impossible": batch[7]}) if hasattr(model, "config") and hasattr(model.config, 
"lang2id"): inputs.update({ "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(device) }) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if acc_steps > 1: loss = loss / acc_steps loss.backward() tr_loss += loss.item() if (step + 1) % acc_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() # scheduler.step() # Update learning rate schedule optimizer.zero_grad() global_step += 1 #if global_step % 5000 == 0: drive.mount("/content/gdrive", force_remount=True) # Log metrics if (global_step % eval_every == 0) and (eval_every != -1): # Only evaluate when single GPU otherwise metrics may not average well output_dir = os.path.join( output_folder, "checkpoint-{}".format( global_step)) if w_checkpoint else output_folder results = evaluate(model, tokenizer, output_dir, bs=train_batch_size) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) # tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / eval_every, global_step) logging_loss = tr_loss with open( os.path.join( output_dir, 'results_{}.json'.format(global_step)), 'w') as f: json.dump(results, f) # Save model checkpoint if (global_step % save_every == 0) and (save_every != -1): output_dir = os.path.join( output_folder, "checkpoint-{}".format( global_step)) if w_checkpoint else output_folder if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) #torch.save(args, os.path.join(output_dir, "training_args.bin")) print("Saving model checkpoint to %s" % output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) # torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) print("Saving optimizer and scheduler states to %s" % output_dir) if checkpoint == '': if global_step == save_every: pass else: t = glob.glob(os.path.join(output_dir, '*.txt'))[0] os.remove(t) with open( os.path.join(output_dir, '{}.txt'.format(global_step)), 'w') as f: f.write(' ') if save_every == -1: output_dir = os.path.join(output_folder, "checkpoint-{}".format( global_step)) if w_checkpoint else output_folder if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) #torch.save(args, os.path.join(output_dir, "training_args.bin")) print("Saving model checkpoint to %s" % output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) # torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) print("Saving optimizer and scheduler states to %s" % output_dir) if checkpoint == '': if global_step == save_every: pass else: t = glob.glob(os.path.join(output_dir, '*.txt'))[0] os.remove(t) with open( os.path.join(output_dir, '{}.txt'.format(global_step)), 'w') as f: f.write('') if eval_every == -1: # Only evaluate when single GPU otherwise metrics may not average well output_dir = os.path.join(output_folder, "checkpoint-{}".format( global_step)) if w_checkpoint else output_folder results = evaluate(model, tokenizer, output_dir, bs=train_batch_size) for key, value in results.items(): 
tb_writer.add_scalar("eval_{}".format(key), value, global_step) # tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / 500, global_step) logging_loss = tr_loss with open( os.path.join(output_dir, 'results_{}.json'.format(global_step)), 'w') as f: json.dump(results, f) tb_writer.close() return global_step, tr_loss / global_step
def train(args,input_qnlidata_dir, train_dataset, model, tokenizer,qnlimodel_output_path): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt") ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_this_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_this_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info(f" Continuing training from checkpoint, will skip to saved global_step") logger.info(f" Continuing training from epoch {epochs_trained}") logger.info(f" Continuing training from global step {global_step}") logger.info(f" Will skip the first {steps_trained_in_this_epoch} steps in the first epoch") tr_loss, logging_loss, epoch_loss = 0.0, 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0], mininterval=10, ncols=100) set_seed(args) # Added here for reproductibility best_dev_performance = 0 best_epoch = epochs_trained train_acc = 0.0 for epoch, _ in enumerate(train_iterator): epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0], mininterval=10, ncols=100) train_iterator.set_description(f"train_epoch: {epoch} train_acc: {train_acc:.4f}") train_ids = None train_golds = None train_logits = None train_losses = None for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_this_epoch > 0: steps_trained_in_this_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids if hasattr(torch.cuda, 'empty_cache'): torch.cuda.empty_cache() outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if train_logits is None: # Keep track of training dynamics. 
train_ids = batch[4].detach().cpu().numpy() train_logits = outputs[1].detach().cpu().numpy() train_golds = inputs["labels"].detach().cpu().numpy() train_losses = loss.detach().cpu().numpy() else: train_ids = np.append(train_ids, batch[4].detach().cpu().numpy()) train_logits = np.append(train_logits, outputs[1].detach().cpu().numpy(), axis=0) train_golds = np.append(train_golds, inputs["labels"].detach().cpu().numpy()) train_losses = np.append(train_losses, loss.detach().cpu().numpy()) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if ( args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0 ): epoch_log = {} # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training_epoch: logger.info(f"From within the epoch at step {step}") results, _ = evaluate(args,input_qnlidata_dir,qnlimodel_output_path, model, tokenizer) for key, value in results.items(): eval_key = "eval_{}".format(key) epoch_log[eval_key] = value epoch_log["learning_rate"] = scheduler.get_lr()[0] epoch_log["loss"] = (tr_loss - logging_loss) / args.logging_steps logging_loss = tr_loss for key, value in epoch_log.items(): tb_writer.add_scalar(key, value, global_step) logger.info(json.dumps({**epoch_log, **{"step": global_step}})) if ( args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0 ): # Save model checkpoint output_dir = os.path.join(qnlimodel_output_path, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) epoch_iterator.set_description(f"lr = {scheduler.get_lr()[0]:.8f}, " f"loss = {(tr_loss-epoch_loss)/(step+1):.4f}") if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break #### Post epoch eval #### # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: best_dev_performance, best_epoch = save_model( args,input_qnlidata_dir, model, tokenizer, epoch, best_epoch, best_dev_performance,qnlimodel_output_path) # Keep track of training dynamics. 
log_training_dynamics(output_dir=qnlimodel_output_path, epoch=epoch, train_ids=list(train_ids), train_logits=list(train_logits), train_golds=list(train_golds)) train_result = compute_metrics(args.task_name, np.argmax(train_logits, axis=1), train_golds) train_acc = train_result["acc"] epoch_log = {"epoch": epoch, "train_acc": train_acc, "best_dev_performance": best_dev_performance, "avg_batch_loss": (tr_loss - epoch_loss) / args.per_gpu_train_batch_size, "learning_rate": scheduler.get_lr()[0],} epoch_loss = tr_loss logger.info(f" End of epoch : {epoch}") with open(os.path.join(qnlimodel_output_path, f"eval_metrics_train.json"), "a") as toutfile: toutfile.write(json.dumps(epoch_log) + "\n") for key, value in epoch_log.items(): tb_writer.add_scalar(key, value, global_step) logger.info(f" {key}: {value:.6f}") if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break elif args.evaluate_during_training and epoch - best_epoch >= args.patience: logger.info(f"Ran out of patience. Best epoch was {best_epoch}. " f"Stopping training at epoch {epoch} out of {args.num_train_epochs} epochs.") train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
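# ---------------------------------------------------------------------------
# Illustrative sketch only: log_training_dynamics(...) called above is not
# defined in this file. Assuming it persists the per-example ids, logits and
# gold labels collected during the epoch (in the spirit of dataset cartography),
# a minimal version might write one JSON record per example; the helper name
# and file layout below are hypothetical.
# ---------------------------------------------------------------------------
import json
import os


def log_training_dynamics_sketch(output_dir, epoch, train_ids, train_logits, train_golds):
    """Append one JSON record per training example for the given epoch."""
    dynamics_dir = os.path.join(output_dir, "training_dynamics")
    os.makedirs(dynamics_dir, exist_ok=True)
    out_path = os.path.join(dynamics_dir, "dynamics_epoch_{}.jsonl".format(epoch))
    with open(out_path, "w") as f:
        for guid, logits, gold in zip(train_ids, train_logits, train_golds):
            record = {
                "guid": int(guid),
                "logits_epoch_{}".format(epoch): [float(x) for x in logits],
                "gold": int(gold),
            }
            f.write(json.dumps(record) + "\n")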
def train(args, train_dataset, model, tokenizer, teacher=None): """Train the model""" if args.local_rank in [-1, 0]: tb_writer = SummaryWriter(log_dir=args.output_dir) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if "mask_score" in n and p.requires_grad ], "lr": args.mask_scores_learning_rate, }, { "params": [ p for n, p in model.named_parameters() if "mask_score" not in n and p.requires_grad and not any( nd in n for nd in no_decay) ], "lr": args.learning_rate, "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if "mask_score" not in n and p.requires_grad and any( nd in n for nd in no_decay) ], "lr": args.learning_rate, "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) # Distillation if teacher is not None: logger.info(" Training with distillation") global_step = 0 # Global TopK if args.global_topk: threshold_mem = None epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to global_step of last saved checkpoint from model path try: global_step = int( args.model_name_or_path.split("-")[-1].split("/")[0]) except ValueError: global_step = 0 epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0], ) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) threshold, regu_lambda = schedule_threshold( step=global_step, total_step=t_total, warmup_steps=args.warmup_steps, final_threshold=args.final_threshold, initial_threshold=args.initial_threshold, final_warmup=args.final_warmup, initial_warmup=args.initial_warmup, final_lambda=args.final_lambda, ) # Global TopK if args.global_topk: if threshold == 1.0: threshold = -1e2 # Or an indefinitely low quantity else: if (threshold_mem is None) or ( global_step % args.global_topk_frequency_compute == 0): # Sort all the values to get the global topK concat = torch.cat([ param.view(-1) for name, param in model.named_parameters() if "mask_scores" in name ]) n = concat.numel() kth = max(n - (int(n * threshold) + 1), 1) threshold_mem = concat.kthvalue(kth).values.item() threshold = threshold_mem else: threshold = threshold_mem inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "masked_bert", "xlnet", "albert"] else None ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids if "masked" in args.model_type: inputs["threshold"] = threshold outputs = model(**inputs) loss, logits_stu = outputs # model outputs are always tuple in transformers (see doc) # Distillation loss if teacher is not None: if "token_type_ids" not in inputs: inputs[ "token_type_ids"] = None if args.teacher_type == "xlm" else batch[ 2] with torch.no_grad(): (logits_tea, ) = teacher( input_ids=inputs["input_ids"], token_type_ids=inputs["token_type_ids"], attention_mask=inputs["attention_mask"], ) loss_logits = 
nn.functional.kl_div( input=nn.functional.log_softmax( logits_stu / args.temperature, dim=-1), target=nn.functional.softmax(logits_tea / args.temperature, dim=-1), reduction="batchmean", ) * (args.temperature**2) loss = args.alpha_distil * loss_logits + args.alpha_ce * loss # Regularization if args.regularization is not None: regu_ = regularization(model=model, mode=args.regularization) loss = loss + regu_lambda * regu_ if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps len(epoch_iterator) <= args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator)): if args.fp16: nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: tb_writer.add_scalar("threshold", threshold, global_step) for name, param in model.named_parameters(): if not param.requires_grad: continue tb_writer.add_scalar("parameter_mean/" + name, param.data.mean(), global_step) tb_writer.add_scalar("parameter_std/" + name, param.data.std(), global_step) tb_writer.add_scalar("parameter_min/" + name, param.data.min(), global_step) tb_writer.add_scalar("parameter_max/" + name, param.data.max(), global_step) tb_writer.add_scalar("grad_mean/" + name, param.grad.data.mean(), global_step) tb_writer.add_scalar("grad_std/" + name, param.grad.data.std(), global_step) if args.regularization is not None and "mask_scores" in name: if args.regularization == "l1": perc = (torch.sigmoid(param) > threshold ).sum().item() / param.numel() elif args.regularization == "l0": perc = (torch.sigmoid(param - 2 / 3 * np.log(0.1 / 1.1)) ).sum().item() / param.numel() tb_writer.add_scalar( "retained_weights_perc/" + name, perc, global_step) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr() logs["learning_rate"] = learning_rate_scalar[0] if len(learning_rate_scalar) > 1: for idx, lr in enumerate(learning_rate_scalar[1:]): logs[f"learning_rate/{idx+1}"] = lr logs["loss"] = loss_scalar if teacher is not None: logs["loss/distil"] = loss_logits.item() if args.regularization is not None: logs["loss/regularization"] = regu_.item() if (teacher is not None) or (args.regularization is not None): if (teacher is not None) and (args.regularization is not None): logs["loss/instant_ce"] = ( loss.item() - regu_lambda * logs["loss/regularization"] - args.alpha_distil * logs["loss/distil"]) / args.alpha_ce elif teacher is not None: logs["loss/instant_ce"] = ( loss.item() - args.alpha_distil * logs["loss/distil"]) / args.alpha_ce else: logs["loss/instant_ce"] = 
loss.item( ) - regu_lambda * logs["loss/regularization"] logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
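# ---------------------------------------------------------------------------
# Illustrative sketch only: the distillation term used in the train() above,
# factored into a standalone helper for readability. It is the standard
# temperature-scaled KL divergence with the teacher's softened distribution as
# the target, scaled by T**2; the helper name is ours, not the repo's.
# ---------------------------------------------------------------------------
import torch.nn as nn


def distillation_loss(logits_stu, logits_tea, temperature=2.0):
    """KL(teacher || student) on temperature-softened distributions, scaled by T**2."""
    return nn.functional.kl_div(
        input=nn.functional.log_softmax(logits_stu / temperature, dim=-1),
        target=nn.functional.softmax(logits_tea / temperature, dim=-1),
        reduction="batchmean",
    ) * (temperature ** 2)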
scaled_loss.backward() if (global_step + 1) % accumulation_steps == 0: optimizer.step() scheduler.step() model.zero_grad() global_step += 1 if (time.time() - start_time) / 3600 > 7: break del examples, train_dataset, train_loader gc.collect() torch.save(model.state_dict(), output_model_file) torch.save(optimizer.state_dict(), output_optimizer_file) torch.save(amp.state_dict(), output_amp_file) # %% [code] print(f'trained {global_step * batch_size} samples') print(f'training time: {(time.time() - start_time) / 3600:.1f} hours') # %% [code] def eval_collate_fn( examples: List[Example]) -> Tuple[List[torch.Tensor], List[Example]]: # input tokens max_len = max([len(example.input_ids) for example in examples]) tokens = np.zeros((len(examples), max_len), dtype=np.int64) token_type_ids = np.ones((len(examples), max_len), dtype=np.int64) for i, example in enumerate(examples):
def train(model, tokenizer, checkpoint): if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") else: amp = None # Prepare the training data train_data = DataBert(data_file=args.train_file, doc_file=doc_file, s1_length=args.s1_length, s2_length=args.s2_length, max_length=args.max_length, tokenizer=tokenizer ) train_dataLoader = DataLoader(dataset=train_data, batch_size=args.batch_size, shuffle= not args.pair) attacked_data = AttackedData(attacked_file=args.attacked_file) # adversarial (attacked) examples attack_dataloader = DataLoader(dataset=attacked_data, batch_size=args.batch_size, shuffle=False) print('train_data:', len(train_data)) print('attack_data:', len(attacked_data)) # Initialize optimizer and scheduler t_total = len(train_dataLoader) * args.epochs no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # apex if args.fp16: model, optimizer = amp.initialize(model, optimizer, opt_level=args.fptype) # Restore optimizer/scheduler states from the checkpoint, if present checkpoint_dir = args.save_dir + "/checkpoint-" + str(checkpoint) if os.path.isfile(os.path.join(checkpoint_dir, "optimizer.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(checkpoint_dir, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(checkpoint_dir, "scheduler.pt"))) if args.fp16: amp.load_state_dict(torch.load(os.path.join(checkpoint_dir, "amp.pt"))) # Start training logger.debug("***** Running training *****") logger.debug(" Num examples = %d", len(train_dataLoader)) logger.debug(" Num Epochs = %d", args.epochs) logger.debug(" Set_Batch size = %d", args.batch_size) logger.debug(" Real_Batch_size = %d", args.batch_size * args.accumulate) # No previous checkpoint: start from epoch 0 if checkpoint < 0: checkpoint = 0 else: checkpoint += 1 logger.debug(" Start Batch = %d", checkpoint) for epoch in range(checkpoint, args.epochs): model.train() epoch_loss = [] step = 0 for batch, batch_attack in tqdm(zip(train_dataLoader, attack_dataloader), desc="Iteration", total=len(train_dataLoader)): # Move tensors to the GPU batch = tuple(t.to('cuda') for t in batch[:4]) input_ids, token_type_ids, attention_mask, labels = batch outputs = model(input_ids=input_ids.long(), token_type_ids=token_type_ids.long(), labels=labels) loss_clean = outputs[0] # if args.fp16: # with amp.scale_loss(loss, optimizer) as scaled_loss: # scaled_loss.backward() # else: # loss.backward() # compute the gradients batch_attack = tuple(t.to('cuda') for t in batch_attack) input_ids2, token_type_ids2, attention_mask2, labels2 = batch_attack outputs_attack = model(input_ids=input_ids2.long(), token_type_ids=token_type_ids2.long(), attention_mask=attention_mask2, labels=labels2) loss_adv = outputs_attack[0] loss = (0.5 * loss_clean) + (0.5 * loss_adv) print(loss_clean.item(), loss_adv.item()) if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() epoch_loss.append(loss.item()) if step % args.accumulate == 0: optimizer.step() scheduler.step() model.zero_grad() step += 1 # Save model checkpoint output_dir = args.save_dir + "/checkpoint-" + str(epoch) if not 
os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = (model.module if hasattr(model, "module") else model) model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.debug("Saving model checkpoint to %s", output_dir) if args.fp16: torch.save(amp.state_dict(), os.path.join(output_dir, "amp.pt")) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.debug("Saving optimizer and scheduler states to %s", output_dir) # eval dev eval_loss, eval_map, eval_mrr = evaluate(model, tokenizer, eval_file=args.dev_file, checkpoint=epoch, output_dir=output_dir) # eval test test_eval_loss, test_eval_map, test_eval_mrr = evaluate(model, tokenizer, eval_file=args.test_file, checkpoint=epoch, output_dir=output_dir) # Log and save the dev/test results logger.info('【DEV 】Train Epoch %d: train_loss=%.4f, map=%.4f, mrr=%.4f' % ( epoch, np.array(epoch_loss).mean(), eval_map, eval_mrr)) logger.info('【TEST】Train Epoch %d: train_loss=%.4f, map=%.4f, mrr=%.4f' % ( epoch, np.array(epoch_loss).mean(), test_eval_map, test_eval_mrr))
def train(args): if args.model_path is None: msg = 'Prepare for new run ...' output_dir = os.path.join( args.log_dir, args.run_name + '_' + datetime.datetime.now().strftime('%m%d_%H%M')) if not os.path.exists(output_dir): os.makedirs(output_dir) ckpt_dir = os.path.join( args.ckpt_dir, args.run_name + '_' + datetime.datetime.now().strftime('%m%d_%H%M')) if not os.path.exists(ckpt_dir): os.makedirs(ckpt_dir) else: msg = 'Restart previous run ...\nlogs to save to %s, ckpt to save to %s, model to load from %s' % \ (args.log_dir, args.ckpt_dir, args.model_path) output_dir = args.log_dir ckpt_dir = args.ckpt_dir if not os.path.isdir(output_dir): print('Invalid log dir: %s' % output_dir) return if not os.path.isdir(ckpt_dir): print('Invalid ckpt dir: %s' % ckpt_dir) return set_logger(os.path.join(output_dir, 'outputs.log')) logging.info(msg) global device if args.device is not None: logging.info('Setting device to ' + args.device) device = torch.device(args.device) else: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logging.info('Setting up...') hparams.parse(args.hparams) logging.info(hparams_debug_string()) model = EdgeClassification if hparams.use_roberta: logging.info('Using Roberta...') model = RobertaEdgeClassification global_step = 0 if args.model_path is None: if hparams.load_pretrained: logging.info('Load online pretrained model...' + ( ('cached at ' + args.cache_path) if args.cache_path is not None else '')) if hparams.use_roberta: model = model.from_pretrained('roberta-base', cache_dir=args.cache_path, hparams=hparams) else: model = model.from_pretrained('bert-base-uncased', cache_dir=args.cache_path, hparams=hparams) else: logging.info('Build model from scratch...') if hparams.use_roberta: config = RobertaConfig.from_pretrained('roberta-base') else: config = BertConfig.from_pretrained('bert-base-uncased') model = model(config=config, hparams=hparams) else: if not os.path.isdir(args.model_path): raise OSError(str(args.model_path) + ' not found') logging.info('Load saved model from %s ...' 
% (args.model_path)) model = model.from_pretrained(args.model_path, hparams=hparams) step = args.model_path.split('_')[-1] if step.isnumeric(): global_step = int(step) logging.info('Initial step=%d' % global_step) if hparams.use_roberta: tokenizer = RobertaTokenizer.from_pretrained('roberta-base') else: tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') hparams.parse(args.hparams) logging.info(hparams_debug_string()) if hparams.text_sample_eval: if args.eval_text_path is None: raise ValueError('eval_text_path not given') if ':' not in args.eval_text_path: eval_data_paths = [args.eval_text_path] else: eval_data_paths = args.eval_text_path.split(':') eval_feeder = [] for p in eval_data_paths: name = os.path.split(p)[-1] if name.endswith('.tsv'): name = name[:-4] eval_feeder.append( (name, ExternalTextFeeder(p, hparams, tokenizer, 'dev'))) else: eval_feeder = [('', DataFeeder(args.data_dir, hparams, tokenizer, 'dev'))] tb_writer = SummaryWriter(output_dir) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': hparams.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=hparams.learning_rate, eps=hparams.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=hparams.warmup_steps, lr_decay_step=hparams.lr_decay_step, max_lr_decay_rate=hparams.max_lr_decay_rate) acc_step = global_step * hparams.gradient_accumulation_steps time_window = ValueWindow() loss_window = ValueWindow() acc_window = ValueWindow() model.to(device) model.zero_grad() tr_loss = tr_acc = 0.0 start_time = time.time() if args.model_path is not None: logging.info('Load saved model from %s ...' 
% (args.model_path)) if os.path.exists(os.path.join(args.model_path, 'optimizer.pt')) \ and os.path.exists(os.path.join(args.model_path, 'scheduler.pt')): optimizer.load_state_dict( torch.load(os.path.join(args.model_path, 'optimizer.pt'))) optimizer.load_state_dict(optimizer.state_dict()) scheduler.load_state_dict( torch.load(os.path.join(args.model_path, 'scheduler.pt'))) scheduler.load_state_dict(scheduler.state_dict()) else: logging.warning('Could not find saved optimizer/scheduler') if global_step > 0: logs = run_eval(args, model, eval_feeder) for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) logging.info('Start training...') if hparams.text_sample_train: train_feeder = PrebuiltTrainFeeder(args.train_text_path, hparams, tokenizer, 'train') else: train_feeder = DataFeeder(args.data_dir, hparams, tokenizer, 'train') while True: batch = train_feeder.next_batch() model.train() outputs = model(input_ids=batch.input_ids.to(device), attention_mask=batch.input_mask.to(device), token_type_ids=None if batch.token_type_ids is None else batch.token_type_ids.to(device), labels=batch.labels.to(device)) loss = outputs['loss'] preds = outputs['preds'] acc = torch.mean((preds.cpu() == batch.labels).float()) preds = preds.cpu().detach().numpy() labels = batch.labels.detach().numpy() t_acc = np.sum(np.logical_and(preds == 1, labels == 1)) / np.sum(labels == 1) f_acc = np.sum(np.logical_and(preds == 0, labels == 0)) / np.sum(labels == 0) if hparams.gradient_accumulation_steps > 1: loss = loss / hparams.gradient_accumulation_steps acc = acc / hparams.gradient_accumulation_steps tr_loss += loss.item() tr_acc += acc.item() loss.backward() acc_step += 1 if acc_step % hparams.gradient_accumulation_steps != 0: continue torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.max_grad_norm) optimizer.step() scheduler.step(None) model.zero_grad() global_step += 1 step_time = time.time() - start_time time_window.append(step_time) loss_window.append(tr_loss) acc_window.append(tr_acc) if global_step % args.save_steps == 0: # Save model checkpoint model_to_save = model.module if hasattr(model, 'module') else model cur_ckpt_dir = os.path.join(ckpt_dir, 'checkpoint_%d' % (global_step)) if not os.path.exists(cur_ckpt_dir): os.makedirs(cur_ckpt_dir) model_to_save.save_pretrained(cur_ckpt_dir) torch.save(args, os.path.join(cur_ckpt_dir, 'training_args.bin')) torch.save(optimizer.state_dict(), os.path.join(cur_ckpt_dir, 'optimizer.pt')) torch.save(scheduler.state_dict(), os.path.join(cur_ckpt_dir, 'scheduler.pt')) logging.info("Saving model checkpoint to %s", cur_ckpt_dir) if global_step % args.logging_steps == 0: logs = run_eval(args, model, eval_feeder) learning_rate_scalar = scheduler.get_lr()[0] logs['learning_rate'] = learning_rate_scalar logs['loss'] = loss_window.average logs['acc'] = acc_window.average for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f, acc=%.05f, avg_acc=%.05f, t_acc=%.05f, f_acc=%.05f]' % ( global_step, step_time, tr_loss, loss_window.average, tr_acc, acc_window.average, t_acc, f_acc) logging.info(message) tr_loss = tr_acc = 0.0 start_time = time.time()
def train(args, train_dataset, model, tokenizer): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 tr_loss, logging_loss = 0.0, 0.0 tr_acc, logging_acc = 0.0, 0.0 best_eval_acc = 0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) # Added here for reproducibility set_seed(args) for epoch in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", position=0, leave=True, disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "labels": batch[7], } outputs = model(**inputs) loss, logits = outputs[:2] logits = logits.detach().cpu().numpy() preds = np.argmax(logits, axis=1) gt = inputs["labels"].detach().cpu().numpy() acc = (preds == gt).mean() if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() tr_acc += acc if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.evaluate_during_training: logger.info( "Validation start for epoch {}".format(epoch)) eval_loss, eval_acc = evaluate(args, model, tokenizer, prefix=epoch) is_best = eval_acc > best_eval_acc best_eval_acc = max(eval_acc, best_eval_acc) current_loss = (tr_loss - logging_loss) / args.logging_steps logging_loss = tr_loss current_acc = (tr_acc - logging_acc) / args.logging_steps logging_acc = tr_acc logger.info( "best_eval_acc = {}, eval_acc = {}, eval_loss = {}, acc = {}, loss = {}, global_step = {}, " \ .format(best_eval_acc, eval_acc, eval_loss, current_acc, current_loss, global_step)) if IS_ON_NSML: nsml.report(summary=True, step=global_step, eval_acc=eval_acc, eval_loss=eval_loss, acc=current_acc, loss=current_loss) if is_best: nsml.save(args.model_type + "_best") if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: if IS_ON_NSML: nsml.save(args.model_type + "_gs{}_e{}".format(global_step, epoch)) else: output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save( args, 
os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info( "Saving optimizer and scheduler states to %s", output_dir) if 0 < args.max_steps < global_step: epoch_iterator.close() break if 0 < args.max_steps < global_step: train_iterator.close() break if IS_ON_NSML: nsml.save(args.model_type + "_last") return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer, fh, pool): """ Train the model """ if args.local_rank in [-1, 0]: args.tensorboard_dir = os.path.join(args.output_dir, 'tensorboard') if not os.path.exists(args.tensorboard_dir): os.makedirs(args.tensorboard_dir) tb_writer = SummaryWriter(args.tensorboard_dir) args.batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.batch_size, drop_last=True) total_examples = len(train_dataset) * (torch.distributed.get_world_size() if args.local_rank != -1 else 1) batch_size = args.batch_size * args.gradient_accumulation_steps * ( torch.distributed.get_world_size() if args.local_rank != -1 else 1) # if args.max_steps > 0: # t_total = args.max_steps # args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 if args.num_train_epochs > 0: t_total = total_examples // batch_size * args.num_train_epochs args.max_steps = t_total model.to(args.device) if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last') scheduler_last = os.path.join(checkpoint_last, 'scheduler.pt') optimizer_last = os.path.join(checkpoint_last, 'optimizer.pt') if os.path.exists(scheduler_last): scheduler.load_state_dict( torch.load(scheduler_last, map_location="cpu")) if os.path.exists(optimizer_last): optimizer.load_state_dict( torch.load(optimizer_last, map_location="cpu")) if args.local_rank == 0: torch.distributed.barrier() if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank % args.gpu_per_node], output_device=args.local_rank % args.gpu_per_node, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", total_examples) logger.info(" Num epoch = %d", t_total * batch_size // total_examples) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", batch_size) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = args.start_step tr_loss, logging_loss, avg_loss, tr_nb = 0.0, 0.0, 0.0, 0 # model.resize_token_embeddings(len(tokenizer)) model.zero_grad() set_seed( args) # Added here for reproducibility (even between python 2 and 3) best_bleu = 0.0 for idx in range(args.start_epoch, int(args.num_train_epochs)): for step, (batch, token_labels) in enumerate(train_dataloader): inputs = batch.to(args.device) attn_mask = torch.tensor(token_labels.clone().detach() != 0, dtype=torch.uint8, device=args.device) loss_mask = torch.tensor(token_labels.clone().detach() == 2, dtype=torch.uint8, device=args.device) model.train() # outputs = model(inputs, attention_mask=attn_mask, labels=inputs, loss_mask=loss_mask) # loss = outputs[0] outputs = model(inputs, attention_mask=attn_mask) logits = outputs[0] labels = inputs shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() flatten_shift_loss_mask = loss_mask[..., :-1].contiguous().view(-1) ids = torch.nonzero(flatten_shift_loss_mask).view(-1) loss = loss_fct( shift_logits.view(-1, shift_logits.size(-1))[ids], shift_labels.view(-1)[ids]) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() scheduler.step() global_step += 1 output_flag = True avg_loss = round( np.exp((tr_loss - logging_loss) / (global_step - tr_nb)), 4) if global_step % args.logging_steps == 0: logger.info(" steps: %s ppl: %s", global_step, round(avg_loss, 5)) if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss tr_nb = global_step if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint if args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well # results = evaluate(args, model, tokenizer, eval_when_training=True) # for key, value in results.items(): # tb_writer.add_scalar('eval_{}'.format(key), value, global_step) # logger.info(" %s = %s", key, round(value,4)) # output_dir = os.path.join(args.output_dir, '{}-{}-{}'.format(checkpoint_prefix, global_step, round(results['perplexity'],4))) dev_bleu, dev_EM = eval_bleu(args, model, tokenizer, file_type='dev', num=100) logger.info(f"dev bleu: {dev_bleu}, dev EM: {dev_EM}") output_dir = os.path.join( args.output_dir, '{}-{}-{}'.format(checkpoint_prefix, global_step, round(dev_bleu, 2))) if dev_bleu > best_bleu: best_bleu = dev_bleu logger.info( f"best bleu updated. 
saved in {output_dir}") logger.info(f"best bleu: {best_bleu}") else: output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) # _rotate_checkpoints(args, checkpoint_prefix) last_output_dir = os.path.join(args.output_dir, 'checkpoint-last') if not os.path.exists(last_output_dir): os.makedirs(last_output_dir) model_to_save.save_pretrained(last_output_dir) tokenizer.save_pretrained(last_output_dir) idx_file = os.path.join(last_output_dir, 'idx_file.txt') with open(idx_file, 'w', encoding='utf-8') as idxf: idxf.write(str(0) + '\n') torch.save(optimizer.state_dict(), os.path.join(last_output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(last_output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", last_output_dir) step_file = os.path.join(last_output_dir, 'step_file.txt') with open(step_file, 'w', encoding='utf-8') as stepf: stepf.write(str(global_step) + '\n') # torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) # torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) # logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: break if args.max_steps > 0 and global_step > args.max_steps: break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
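# ---------------------------------------------------------------------------
# Illustrative sketch only: the loss computation in the train() above, pulled
# out into a helper. Cross-entropy is applied with the usual one-token shift
# and restricted to positions whose loss_mask is set (token_labels == 2 in the
# loop above). The helper name and signature are ours, not the repo's.
# ---------------------------------------------------------------------------
import torch
from torch.nn import CrossEntropyLoss


def masked_shift_ce_loss(logits, labels, loss_mask):
    """Next-token cross-entropy restricted to masked-in positions."""
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    flat_mask = loss_mask[..., :-1].contiguous().view(-1)
    ids = torch.nonzero(flat_mask).view(-1)  # indices of positions that count toward the loss
    loss_fct = CrossEntropyLoss()
    return loss_fct(shift_logits.view(-1, shift_logits.size(-1))[ids],
                    shift_labels.view(-1)[ids])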
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) #scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) #choose schedule here scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.load_states == True: folder = [x[0] for x in os.walk(args.output_dir)][1] print(folder) states = torch.load(folder + "/checkp.pth") optimizer.load_state_dict(states["optimizer"]) scheduler.load_state_dict(states["scheduler"]) global_step = states["step"] print("optimizer and schedule were loaded succesfully") del states if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) if args.load_states == False: global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproducibility (even between python 2 and 3) counterL = 1 lossStack = 0 lossHistory = [] for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) counterL += 1 if counterL % 100 == 0: lossHistory.append(lossStack) lossStack = 0 counterL = 1 else: lossStack += loss.item() if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = 'checkpoint' # Save model checkpoint output_dir = os.path.join( args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) checkpoint = { "lossHistory": lossHistory, 'step': global_step, 'scheduler': scheduler.state_dict(), 'optimizer': optimizer.state_dict() } torch.save(checkpoint, os.path.join(output_dir, 'checkp.pth')) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / 
global_step
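# --- Illustrative sketch (not part of the original script) ---------------------
# A minimal version of the single-file resume pattern used above: optimizer,
# scheduler, step counter and a running loss history are bundled into one dict
# and written as "checkp.pth" next to the model checkpoint, then restored when
# args.load_states is set. Function names and the map_location choice are
# assumptions for illustration.
import os
import torch

def save_training_state(output_dir, global_step, optimizer, scheduler, loss_history):
    checkpoint = {
        "lossHistory": loss_history,
        "step": global_step,
        "scheduler": scheduler.state_dict(),
        "optimizer": optimizer.state_dict(),
    }
    torch.save(checkpoint, os.path.join(output_dir, "checkp.pth"))

def load_training_state(output_dir, optimizer, scheduler):
    states = torch.load(os.path.join(output_dir, "checkp.pth"), map_location="cpu")
    optimizer.load_state_dict(states["optimizer"])
    scheduler.load_state_dict(states["scheduler"])
    return states["step"], states.get("lossHistory", [])
# -------------------------------------------------------------------------------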
        # checkpoints instead.
        if save_flag:
            torch.save({"epoch": epoch,
                        "models": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses},
                       os.path.join(target_dir, "best.pth.tar"))

        # Save the models at each epoch.
        if save_flag:
            torch.save({"epoch": epoch,
                        "models": model.state_dict(),
                        "best_score": best_score,
                        "optimizer": optimizer.state_dict(),
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses},
                       os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break

    plt.figure()
    plt.plot(epochs_count, train_losses, "-r")
    plt.plot(epochs_count, valid_losses, "-b")
    plt.xlabel("epoch")
    plt.ylabel("loss")
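# --- Illustrative sketch (not part of the original script) ---------------------
# A minimal helper capturing the patience-based early stopping used above:
# training stops once the validation score has not improved for `patience`
# consecutive epochs. The class name and the "higher score is better" convention
# are assumptions for illustration.
class EarlyStopping:
    def __init__(self, patience=5):
        self.patience = patience
        self.counter = 0
        self.best_score = None

    def step(self, score):
        """Return True when training should stop."""
        if self.best_score is None or score > self.best_score:
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience

# Usage inside an epoch loop:
# stopper = EarlyStopping(patience=patience)
# if stopper.step(valid_score):
#     print("-> Early stopping: patience limit reached, stopping...")
#     break
# -------------------------------------------------------------------------------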
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt") ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0], ) set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % 
args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
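# --- Illustrative sketch (not part of the original script) ---------------------
# A minimal version of the gradient-accumulation update performed above: each
# micro-batch loss is scaled by 1/accumulation_steps, gradients accumulate
# across micro-batches, and clipping plus optimizer/scheduler steps happen only
# every `accumulation_steps` batches. The batch layout (input_ids, attention
# mask, token type ids, labels) mirrors the surrounding script but is an
# assumption here; fp16 and multi-GPU handling are omitted for brevity.
import torch

def train_one_epoch(model, dataloader, optimizer, scheduler,
                    accumulation_steps=4, max_grad_norm=1.0, device="cuda"):
    model.train()
    model.zero_grad()
    for step, batch in enumerate(dataloader):
        batch = tuple(t.to(device) for t in batch)
        outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[3])
        loss = outputs[0]
        (loss / accumulation_steps).backward()
        if (step + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
# -------------------------------------------------------------------------------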
def train(args): # Init device n_gpu = torch.cuda.device_count() if n_gpu == 0: warnings.warn('No GPU detected. Training on CPU will be very slow') elif n_gpu > 1: warnings.warn('This codebase is not optimized for multi GPU usage') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Lambda for filenames example_tag_to_fp = lambda tag: os.path.join(args.examples_dir, '{}.pkl'. format(tag)) out_fn_to_fp = lambda fn: os.path.join(args.train_dir, fn) # Create training dir os.makedirs(args.train_dir, exist_ok=True) resuming = os.path.exists(out_fn_to_fp('step.pkl')) # Create tokenizer tokenizer = ilm.tokenize_util.Tokenizer[args.tokenizer_name.upper()] if tokenizer == ilm.tokenize_util.Tokenizer.CUSTOM: ilm.tokenize_util.set_custom_vocab_fp(args.tokenizer_custom_vocab_fp) # Update tokenizer base_vocab_size = ilm.tokenize_util.vocab_size(tokenizer) start_infill_id = base_vocab_size + 0 end_infill_id = base_vocab_size + 1 additional_ids_to_tokens = { start_infill_id: '<|startofinfill|>', end_infill_id: '<|endofinfill|>' } mask_cls = ilm.mask.util.mask_cls_str_to_type(args.mask_cls) mask_types = mask_cls.mask_types() mask_type_to_id = {} for i, t in enumerate(mask_types): t_id = base_vocab_size + 2 + i t_tok = '<|infill_{}|>'.format(mask_cls.mask_type_serialize(t)) additional_ids_to_tokens[t_id] = t_tok mask_type_to_id[t] = t_id print(additional_ids_to_tokens) vocab_size = ilm.tokenize_util.update_tokenizer(additional_ids_to_tokens, tokenizer) with open(out_fn_to_fp('additional_ids_to_tokens.pkl'), 'wb') as f: pickle.dump(additional_ids_to_tokens, f) # Load training data if not args.eval_only: print('Loading training data') loaded_from_cache = False if args.data_cache: try: train_inputs = np.load(out_fn_to_fp('train_inp.npy')) train_tts = np.load(out_fn_to_fp('train_tts.npy')) with open(out_fn_to_fp('train_num_docs.pkl'), 'rb') as f: train_num_docs = pickle.load(f) loaded_from_cache = True except: pass if not loaded_from_cache: train_inputs, train_tts, train_num_docs = masked_dataset_to_inputs_and_tts( 'train', tokenizer, start_infill_id, end_infill_id, mask_type_to_id, args) if args.data_cache: np.save(out_fn_to_fp('train_inp.npy'), train_inputs) np.save(out_fn_to_fp('train_tts.npy'), train_tts) with open(out_fn_to_fp('train_num_docs.pkl'), 'wb') as f: pickle.dump(train_num_docs, f) train_tt_to_count = { TargetType(k): v for k, v in zip(*np.unique(train_tts, return_counts=True)) } print(train_tt_to_count) num_unmasked = train_tt_to_count.get(TargetType.CONTEXT, 0) num_masked = train_tt_to_count.get(TargetType.INFILL, 0) print('Mask rate (tokens): {:.4f}'.format(num_masked / (num_unmasked + num_masked))) print('{} documents, {} examples'.format(train_num_docs, train_inputs.shape[0])) print(train_inputs.shape, train_inputs.dtype, train_tts.shape, train_tts.dtype) train_data = TensorDataset( torch.from_numpy(train_inputs.astype(np.int64)), torch.from_numpy(train_tts)) del train_inputs del train_tts # Load eval data print('Loading eval data') loaded_from_cache = False if args.data_cache: try: eval_inputs = np.load(out_fn_to_fp('eval_inp.npy')) eval_tts = np.load(out_fn_to_fp('eval_tts.npy')) with open(out_fn_to_fp('eval_num_docs.pkl'), 'rb') as f: eval_num_docs = pickle.load(f) loaded_from_cache = True except: pass if not loaded_from_cache: eval_inputs, eval_tts, eval_num_docs = masked_dataset_to_inputs_and_tts( 'eval', tokenizer, start_infill_id, end_infill_id, mask_type_to_id, args) if args.data_cache: np.save(out_fn_to_fp('eval_inp.npy'), eval_inputs) 
np.save(out_fn_to_fp('eval_tts.npy'), eval_tts) with open(out_fn_to_fp('eval_num_docs.pkl'), 'wb') as f: pickle.dump(eval_num_docs, f) eval_tt_to_count = { TargetType(k): v for k, v in zip(*np.unique(eval_tts, return_counts=True)) } print(eval_tt_to_count) num_unmasked = eval_tt_to_count.get(TargetType.CONTEXT, 0) num_masked = eval_tt_to_count.get(TargetType.INFILL, 0) print('Mask rate (tokens): {:.4f}'.format(num_masked / (num_unmasked + num_masked))) print('{} documents, {} examples'.format(eval_num_docs, eval_inputs.shape[0])) print(eval_inputs.shape, eval_inputs.dtype, eval_tts.shape, eval_tts.dtype) eval_data = TensorDataset(torch.from_numpy(eval_inputs.astype(np.int64)), torch.from_numpy(eval_tts)) del eval_inputs del eval_tts # Calculate number of steps to train for (return if we're just pre-cacheing data) if args.train_num_epochs is not None: train_num_batches = int( float(train_num_docs * args.train_num_epochs) / args.train_batch_size) if train_num_batches == 0: return print('Maximum number of training steps: {}'.format( train_num_batches / args.train_batch_accumulation)) # Create data iterators print('Creating datasets') if not args.eval_only: train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, drop_last=True) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size, drop_last=True) # Load model print('Initializing model...') set_random_seed(args.seed) if args.model_name in ilm.constants.GPT2_MODEL_NAMES: model_type = GPT2LMHeadModel cfg_type = GPT2Config if resuming: print('from saved checkpoint (resuming)') model = model_type.from_pretrained(args.train_dir) else: if args.train_from_scratch: print('from scratch') cfg = cfg_type.from_pretrained(args.model_name) model = model_type(cfg) else: print('from pretrained checkpoint') model = model_type.from_pretrained('data/gpt-2-pytorch') model.resize_token_embeddings(vocab_size) model.to(device) model.train() # Reset random seed in case model init triggered RNG # Initialize optimizers if not args.eval_only: params = list(model.named_parameters()) no_decay = ['bias', 'ln'] optimizer_grouped_parameters = [{ 'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': args.train_weight_decay }, { 'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.train_learning_rate, eps=args.train_adam_epsilon) if resuming: optimizer.load_state_dict(torch.load(out_fn_to_fp('optimizer.pt'))) # Create global step if resuming: try: with open(out_fn_to_fp('step.pkl'), 'rb') as f: step = pickle.load(f) except Exception as e: if args.eval_only: step = None else: raise e else: step = 0 if args.eval_only: print('Evaluating') model.eval() eval_start = time.time() eval_token_counts = defaultdict(int) eval_token_loss_sums = defaultdict(float) for i, eval_batch in enumerate(eval_dataloader): with torch.no_grad(): eval_inputs, eval_tts = tuple(t.to(device) for t in eval_batch) eval_logits, _ = model(eval_inputs) eval_logits_relevant = eval_logits[:, :-1].contiguous().view( -1, eval_logits.shape[-1]) for tag, tts in [ ('context', [TargetType.CONTEXT]), ('infill', [TargetType.INFILL, TargetType.INFILL_SPECIAL]), ('infill_textonly', [TargetType.INFILL]) ]: eval_labels = tts_to_labels(eval_inputs, eval_tts, tts) eval_labels_relevant = eval_labels[:, 1:] eval_labels_relevant_count = 
(eval_labels_relevant != -1).long().sum().item() eval_labels_loss = F.cross_entropy( eval_logits_relevant, eval_labels_relevant.contiguous().view(-1), ignore_index=-1).item() eval_token_counts[tag] += eval_labels_relevant_count eval_token_loss_sums[ tag] += eval_labels_loss * eval_labels_relevant_count eval_dict = {} for tag, count in eval_token_counts.items(): loss = eval_token_loss_sums[tag] if count > 0: loss /= count eval_dict['eval_{}_count'.format(tag)] = count eval_dict['eval_{}_loss'.format(tag)] = loss eval_dict['eval_{}_ppl'.format(tag)] = np.exp(loss) eval_dict['eval_time'] = time.time() - eval_start print('-' * 80) if step is not None: print('(Step {}) Eval'.format(step)) for k, v in eval_dict.items(): print('{}: {}'.format(k, v)) if args.wandb: wandb.log(eval_dict, step=step) else: print('Training') set_random_seed(args.seed) best_eval_loss = None num_save = -1 num_summary = -1 num_batches_complete = step * args.train_batch_accumulation start = time.time() while True: if args.train_num_epochs is not None and num_batches_complete >= train_num_batches: break for batch in train_dataloader: if args.train_num_epochs is not None and num_batches_complete >= train_num_batches: break elapsed = time.time() - start # Evaluate if int(elapsed / args.train_eval_secs) > num_save: num_save = int(elapsed / args.train_eval_secs) model.eval() eval_start = time.time() eval_token_counts = defaultdict(int) eval_token_loss_sums = defaultdict(float) for i, eval_batch in enumerate(eval_dataloader): with torch.no_grad(): eval_inputs, eval_tts = tuple( t.to(device) for t in eval_batch) eval_logits, _ = model(eval_inputs) eval_logits_relevant = eval_logits[:, : -1].contiguous( ).view( -1, eval_logits. shape[-1]) for tag, tts in [('context', [TargetType.CONTEXT]), ('infill', [ TargetType.INFILL, TargetType.INFILL_SPECIAL ]), ('infill_textonly', [TargetType.INFILL])]: eval_labels = tts_to_labels( eval_inputs, eval_tts, tts) eval_labels_relevant = eval_labels[:, 1:] eval_labels_relevant_count = ( eval_labels_relevant != -1).long().sum().item() eval_labels_loss = F.cross_entropy( eval_logits_relevant, eval_labels_relevant.contiguous().view(-1), ignore_index=-1).item() eval_token_counts[ tag] += eval_labels_relevant_count eval_token_loss_sums[ tag] += eval_labels_loss * eval_labels_relevant_count eval_dict = {} for tag, count in eval_token_counts.items(): loss = eval_token_loss_sums[tag] if count > 0: loss /= count eval_dict['eval_{}_count'.format(tag)] = count eval_dict['eval_{}_loss'.format(tag)] = loss eval_dict['eval_time'] = time.time() - eval_start print('-' * 80) print('(Step {}) Eval'.format(step)) for k, v in eval_dict.items(): print('{}: {}'.format(k, v)) if args.wandb: wandb.log(eval_dict, step=step) if best_eval_loss is None or eval_dict[ 'eval_infill_loss'] < best_eval_loss: print('Saving') model_to_save = model.module if hasattr( model, 'module') else model model_to_save.config.to_json_file( out_fn_to_fp(CONFIG_NAME)) torch.save(model_to_save.state_dict(), out_fn_to_fp(WEIGHTS_NAME)) torch.save(optimizer.state_dict(), out_fn_to_fp('optimizer.pt')) with open(out_fn_to_fp('step.pkl'), 'wb') as f: pickle.dump(step, f) best_eval_loss = eval_dict['eval_infill_loss'] model.train() # Train inputs, tts = tuple(t.to(device) for t in batch) # TODO: Option to train on CONTEXT_SPECIAL? labels_context = tts_to_labels(inputs, tts, [TargetType.CONTEXT]) # TODO: Option to skip training on INFILL_REDUNDANT? 
# NOTE: This would give Task.NAIVE/Task.LM less supervision overall but put them more in line with the supervision that Task.ILM and Task.NO_CONTEXT_ILM receive labels_infill = tts_to_labels(inputs, tts, [ TargetType.INFILL, TargetType.INFILL_SPECIAL, TargetType.INFILL_REDUNDANT ]) logits, _ = model(inputs) logits_relevant = logits[:, :-1].contiguous().view( -1, logits.shape[-1]) loss_context = F.cross_entropy( logits_relevant, labels_context[:, 1:].contiguous().view(-1), ignore_index=-1) loss_infill = F.cross_entropy( logits_relevant, labels_infill[:, 1:].contiguous().view(-1), ignore_index=-1) loss_context_item = loss_context.item() loss_infill_item = loss_infill.item() loss = loss_infill if args.train_context: loss += loss_context if args.train_batch_accumulation != 1: loss /= float(args.train_batch_accumulation) loss.backward() # Summarize if int(elapsed / args.train_summary_secs) > num_summary: num_summary = int(elapsed / args.train_summary_secs) print('-' * 80) print('(Step {}) Summary'.format(step)) print(loss_context_item) print(loss_infill_item) with torch.no_grad(): for t in inputs, labels_context, labels_infill: t0 = list(t[0].cpu().numpy()) print('-' * 40) print(t0) for t in inputs, labels_context, labels_infill: t0 = list(t[0].cpu().numpy()) print('-' * 40) print( ilm.tokenize_util.decode( [0 if t == -1 else t for t in t0], tokenizer)) if args.wandb: wandb.log( { 'loss_context': loss_context_item, 'loss_infill': loss_infill_item, }, step=step) if ((num_batches_complete + 1) % args.train_batch_accumulation) == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.train_max_grad_norm) optimizer.step() optimizer.zero_grad() step += 1 num_batches_complete += 1
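# --- Illustrative sketch (not part of the original script) ---------------------
# A minimal version of the combined infill/context loss computed in the training
# step above: logits are shifted by one position, label tensors use -1 at
# positions that should be ignored (as produced by tts_to_labels), and the
# context loss is only added when training on context tokens is enabled. The
# function name and tensor shapes are assumptions for illustration.
import torch
import torch.nn.functional as F

def ilm_style_loss(logits, labels_context, labels_infill, train_context=False):
    # logits: (batch, seq_len, vocab); labels_*: (batch, seq_len) with -1 = ignore
    logits_relevant = logits[:, :-1].contiguous().view(-1, logits.shape[-1])
    loss_infill = F.cross_entropy(
        logits_relevant,
        labels_infill[:, 1:].contiguous().view(-1),
        ignore_index=-1)
    loss = loss_infill
    if train_context:
        loss_context = F.cross_entropy(
            logits_relevant,
            labels_context[:, 1:].contiguous().view(-1),
            ignore_index=-1)
        loss = loss + loss_context
    return loss
# -------------------------------------------------------------------------------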