def train(args, train_dataset, model, tokenizer, optimizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    scheduler = get_linear_schedule_with_warmup(optimizer, args.warmup_steps, t_total)

    checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last')
    scheduler_last = os.path.join(checkpoint_last, 'scheduler.pt')
    if os.path.exists(scheduler_last):
        scheduler.load_state_dict(torch.load(scheduler_last))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = args.start_step
    tr_loss, logging_loss = 0.0, 0.0
    best_acc = 0.0
    model.zero_grad()
    train_iterator = trange(args.start_epoch, int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    model.train()
    for idx, _ in enumerate(train_iterator):
        tr_loss = 0.0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                try:
                    from apex import amp
                except ImportError:
                    raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, checkpoint=str(global_step))
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    logger.info('loss %s', str(tr_loss - logging_loss))
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

            if args.max_steps > 0 and global_step > args.max_steps:
                break

        if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
            results = evaluate(args, model, tokenizer, checkpoint=str(args.start_epoch + idx))

            last_output_dir = os.path.join(args.output_dir, 'checkpoint-last')
            if not os.path.exists(last_output_dir):
                os.makedirs(last_output_dir)
            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(last_output_dir)
            logger.info("Saving model checkpoint to %s", last_output_dir)
            idx_file = os.path.join(last_output_dir, 'idx_file.txt')
            with open(idx_file, 'w', encoding='utf-8') as idxf:
                idxf.write(str(args.start_epoch + idx) + '\n')

            torch.save(optimizer.state_dict(), os.path.join(last_output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(), os.path.join(last_output_dir, "scheduler.pt"))
            logger.info("Saving optimizer and scheduler states to %s", last_output_dir)

            step_file = os.path.join(last_output_dir, 'step_file.txt')
            with open(step_file, 'w', encoding='utf-8') as stepf:
                stepf.write(str(global_step) + '\n')

            if results['acc'] > best_acc:
                best_acc = results['acc']
                output_dir = os.path.join(args.output_dir, 'checkpoint-best')
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_{}.bin'.format(idx)))
                logger.info("Saving model checkpoint to %s", output_dir)
                torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                logger.info("Saving optimizer and scheduler states to %s", output_dir)

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
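# ---------------------------------------------------------------------------
# Added usage sketch (not from the original source): one way the train() above
# could be wired up. Every name here (make_args, build_dataset) is a
# hypothetical placeholder; only the call signature of train() comes from the
# function itself.
#
#   from transformers import AdamW, BertTokenizer, BertForSequenceClassification
#
#   args = make_args()  # namespace with local_rank, n_gpu, device, output_dir,
#                       # per_gpu_train_batch_size, gradient_accumulation_steps, ...
#   tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#   model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(args.device)
#   optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
#   train_dataset = build_dataset(tokenizer)  # TensorDataset: (input_ids, attention_mask, token_type_ids, labels)
#   global_step, avg_loss = train(args, train_dataset, model, tokenizer, optimizer)
# ---------------------------------------------------------------------------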
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
            if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                inputs.update({
                    "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)
                })
            if isinstance(model, torch.nn.DataParallel):
                inputs["return_tuple"] = True

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
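# Added sketch (not from the original): the checkpoint-resume arithmetic used by
# train() above, isolated as a pure function. Given the saved global_step, it
# recovers how many full epochs were completed and how many optimizer steps to
# skip inside the resumed epoch. Names are illustrative.
def _resume_offsets_sketch(global_step: int, num_batches: int, grad_accum_steps: int):
    """E.g. global_step=2500, num_batches=1000, grad_accum_steps=2 -> (5, 0)."""
    steps_per_epoch = num_batches // grad_accum_steps
    epochs_trained = global_step // steps_per_epoch
    steps_trained_in_current_epoch = global_step % steps_per_epoch
    return epochs_trained, steps_trained_in_current_epoch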
def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None, verbose=True):
    """
    Trains the model on train_dataset.

    Utility function to be used by the train_model() method. Not intended to be used directly.
    """
    device = self.device
    model = self.model
    args = self.args

    tb_writer = SummaryWriter(logdir=args["tensorboard_dir"])
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"])

    t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"]

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args["weight_decay"],
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    warmup_steps = math.ceil(t_total * args["warmup_ratio"])
    args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args["warmup_steps"],
                                                num_training_steps=t_total)

    if args["fp16"]:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])

    if args["n_gpu"] > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"], mininterval=0)
    epoch_number = 0
    best_eval_metric = None
    early_stopping_counter = 0
    steps_trained_in_current_epoch = 0
    epochs_trained = 0

    if args["model_name"] and os.path.exists(args["model_name"]):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args["model_name"].split("/")[-1].split("-")
            if len(checkpoint_suffix) > 2:
                checkpoint_suffix = checkpoint_suffix[1]
            else:
                checkpoint_suffix = checkpoint_suffix[-1]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args["gradient_accumulation_steps"])
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args["gradient_accumulation_steps"])

            logger.info("   Continuing training from checkpoint, will skip to saved global_step")
            logger.info("   Continuing training from epoch %d", epochs_trained)
            logger.info("   Continuing training from global step %d", global_step)
            logger.info("   Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("   Starting fine-tuning.")

    if args["evaluate_during_training"]:
        training_progress_scores = self._create_training_progress_scores()
    if args["wandb_project"]:
        wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"])
        wandb.watch(self.model)

    model.train()
    for _ in train_iterator:
        if epochs_trained > 0:
            epochs_trained -= 1
            continue
        for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])):
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            batch = tuple(t.to(device) for t in batch)

            inputs = self._get_inputs_dict(batch)
            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]

            if args["n_gpu"] > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training

            current_loss = loss.item()

            if show_running_loss:
                print("\rRunning loss: %f" % loss, end="")

            if args["gradient_accumulation_steps"] > 1:
                loss = loss / args["gradient_accumulation_steps"]

            if args["fp16"]:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args["gradient_accumulation_steps"] == 0:
                if args["fp16"]:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"])
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                    # Log metrics
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step)
                    logging_loss = tr_loss
                    if args["wandb_project"]:
                        wandb.log({
                            "Training loss": current_loss,
                            "lr": scheduler.get_lr()[0],
                            "global_step": global_step,
                        })

                if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                    # Save model checkpoint
                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))
                    self._save_model(output_dir_current, optimizer, scheduler, model=model)

                if args["evaluate_during_training"] and (
                        args["evaluate_during_training_steps"] > 0
                        and global_step % args["evaluate_during_training_steps"] == 0):
                    # Only evaluate when single GPU otherwise metrics may not average well
                    results, _, _ = self.eval_model(eval_df, verbose=True)
                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value, global_step)

                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))
                    os.makedirs(output_dir_current, exist_ok=True)

                    if args["save_eval_checkpoints"]:
                        self._save_model(output_dir_current, optimizer, scheduler, model=model, results=results)

                    training_progress_scores["global_step"].append(global_step)
                    training_progress_scores["train_loss"].append(current_loss)
                    for key in results:
                        training_progress_scores[key].append(results[key])
                    report = pd.DataFrame(training_progress_scores)
                    report.to_csv(os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False)

                    if args["wandb_project"]:
                        wandb.log(self._get_last_metrics(training_progress_scores))

                    if not best_eval_metric:
                        best_eval_metric = results[args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
                    if best_eval_metric and args["early_stopping_metric_minimize"]:
                        if results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]:
                            best_eval_metric = results[args["early_stopping_metric"]]
                            self._save_model(args["best_model_dir"], optimizer, scheduler, model=model,
                                             results=results)
                            early_stopping_counter = 0
                        else:
                            if args["use_early_stopping"]:
                                if early_stopping_counter < args["early_stopping_patience"]:
                                    early_stopping_counter += 1
                                    if verbose:
                                        logger.info(f" No improvement in {args['early_stopping_metric']}")
                                        logger.info(f" Current step: {early_stopping_counter}")
                                        logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                                else:
                                    if verbose:
                                        logger.info(f" Patience of {args['early_stopping_patience']} steps reached")
                                        logger.info(" Training terminated.")
                                    train_iterator.close()
                                    return global_step, tr_loss / global_step
                    else:
                        if results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]:
                            best_eval_metric = results[args["early_stopping_metric"]]
                            self._save_model(args["best_model_dir"], optimizer, scheduler, model=model,
                                             results=results)
                            early_stopping_counter = 0
                        else:
                            if args["use_early_stopping"]:
                                if early_stopping_counter < args["early_stopping_patience"]:
                                    early_stopping_counter += 1
                                    if verbose:
                                        logger.info(f" No improvement in {args['early_stopping_metric']}")
                                        logger.info(f" Current step: {early_stopping_counter}")
                                        logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                                else:
                                    if verbose:
                                        logger.info(f" Patience of {args['early_stopping_patience']} steps reached")
                                        logger.info(" Training terminated.")
                                    train_iterator.close()
                                    return global_step, tr_loss / global_step

        epoch_number += 1
        output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

        if args["save_model_every_epoch"] or args["evaluate_during_training"]:
            os.makedirs(output_dir_current, exist_ok=True)

        if args["save_model_every_epoch"]:
            self._save_model(output_dir_current, optimizer, scheduler, model=model)

        if args["evaluate_during_training"]:
            results, _, _ = self.eval_model(eval_df, verbose=True)
            self._save_model(output_dir_current, optimizer, scheduler, results=results)

            training_progress_scores["global_step"].append(global_step)
            training_progress_scores["train_loss"].append(current_loss)
            for key in results:
                training_progress_scores[key].append(results[key])
            report = pd.DataFrame(training_progress_scores)
            report.to_csv(os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False)

            if args["wandb_project"]:
                wandb.log(self._get_last_metrics(training_progress_scores))

            if not best_eval_metric:
                best_eval_metric = results[args["early_stopping_metric"]]
                self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
            if best_eval_metric and args["early_stopping_metric_minimize"]:
                if results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
                    early_stopping_counter = 0
            else:
                if results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
                    early_stopping_counter = 0

    return global_step, tr_loss / global_step
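# Added sketch (not from the original): the early-stopping test used repeatedly
# in the trainer above, factored out. It mirrors the comparisons in the loop:
# when minimizing, any value below best + delta counts as an improvement; when
# maximizing, the value must exceed best + delta.
def _early_stopping_improved_sketch(current: float, best: float, delta: float, minimize: bool) -> bool:
    if minimize:
        return current - best < delta
    return current - best > delta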
def train(args, train_dataset, model, tokenizer) -> Tuple[int, float]:
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer.pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(model, "module") else model
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    perplexity = evaluate(args, model, tokenizer)['perplexity']
                    logging_loss = tr_loss / global_step
                    logger.info(f'Step={global_step}, train loss={logging_loss:.4f}, '
                                f'eval perplexity={perplexity:.4f}')

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break

        # Save a checkpoint at the end of each epoch
        if args.save_steps > 0:
            checkpoint_prefix = "checkpoint"
            output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
            os.makedirs(output_dir, exist_ok=True)
            model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

            torch.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to %s", output_dir)

            _rotate_checkpoints(args, checkpoint_prefix)

        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
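# Added sketch (not from the original): what the collate() closure above does.
# pad_sequence stacks variable-length 1-D tensors into a [batch, max_len]
# tensor, right-padding with padding_value (the tokenizer's pad_token_id in
# train()).
import torch
from torch.nn.utils.rnn import pad_sequence


def _pad_collate_sketch():
    examples = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
    batch = pad_sequence(examples, batch_first=True, padding_value=0)
    # batch -> tensor([[5, 6, 7],
    #                  [8, 9, 0]])
    return batch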
def train(self):
    if self.args.method == "clean":
        print("clean data!")
        concatdataset = ConcatDataset([self.train_dataset, self.unlabeled])
        train_sampler = RandomSampler(concatdataset)
        train_dataloader = DataLoader(concatdataset, sampler=train_sampler, batch_size=self.args.batch_size)
    else:
        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.batch_size)

    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        self.args.num_train_epochs = (
            self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1)
    else:
        t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.args.weight_decay,
        },
        {
            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps,
                                                num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(self.train_dataset))
    logger.info("  Num Epochs = %d", self.args.num_train_epochs)
    logger.info("  Total train batch size = %d", self.args.batch_size)
    logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0
    self.model.zero_grad()
    train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")
    set_seed(self.args)
    criterion = nn.KLDivLoss(reduction="batchmean")
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            self.model.train()
            batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3],
            }
            if self.args.task_type == "wic":
                inputs["keys"] = batch[6]
            elif self.args.task_type == "re":
                inputs["e1_mask"] = batch[4]
                inputs["e2_mask"] = batch[5]
            outputs = self.model(**inputs)
            loss1 = outputs[0]  # supervised loss from the model head (not used below)
            logits = outputs[1]
            # KLDivLoss expects log-probabilities as input; pass dim explicitly
            loss = criterion(
                input=F.log_softmax(logits, dim=-1),
                target=self.label_matrix[batch[3]].to(self.device),
            )

            if self.args.gradient_accumulation_steps > 1:
                loss = loss / self.args.gradient_accumulation_steps
            if torch.cuda.device_count() > 1:
                loss = loss.mean()

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % self.args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                self.model.zero_grad()
                global_step += 1
                epoch_iterator.set_description(
                    "iteration:%d, w=%.1f, Loss:%.3f" % (_, self.args.soft_label_weight, tr_loss / global_step))

                if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                    self.evaluate("test", global_step)

                if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                    self.save_model()

            if 0 < self.args.max_steps < global_step:
                epoch_iterator.close()
                break

        if 0 < self.args.max_steps < global_step:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
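# Added sketch (not from the original): the soft-label objective used above, in
# isolation. nn.KLDivLoss expects log-probabilities as input and a probability
# distribution as target; reduction="batchmean" divides the summed divergence by
# batch size. Values are illustrative (uniform soft labels over 3 classes).
import torch
import torch.nn.functional as F
from torch import nn


def _soft_label_kl_sketch():
    criterion = nn.KLDivLoss(reduction="batchmean")
    logits = torch.randn(4, 3)            # [batch, num_labels]
    target = torch.full((4, 3), 1.0 / 3)  # soft labels, each row sums to 1
    return criterion(F.log_softmax(logits, dim=-1), target)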
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
            if args.model_type != 'distilbert':
                # XLM, DistilBERT and RoBERTa don't use segment_ids
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
        inside_validation_dataset),  # Pull out batches sequentially.
    batch_size=batch_size  # Evaluate with this batch size.
)

# ----------------------------------------------------------------------------------------

# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)


def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))


total_t0 = time.time()
model = model.to(device)

for epoch_i in range(0, epochs):
    # ========================================
    #               Training
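# Added sketch (not from the original): the multiplier that
# get_linear_schedule_with_warmup applies to the base learning rate — a linear
# ramp from 0 over warmup_steps, then a linear decay to 0 at total_steps. This
# mirrors the lambda used inside the transformers implementation.
def _linear_warmup_lr_factor_sketch(step: int, warmup_steps: int, total_steps: int) -> float:
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))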
def main(argv):
    parser = argparse.ArgumentParser(description='')
    required = parser.add_argument_group('required arguments')
    required.add_argument('-r', '--retrieval', choices=['IR', 'NSP', 'NN'],
                          help='retrieval solver for the contexts. Options: IR, NSP or NN', required=True)
    parser.add_argument('-d', '--device', default='gpu', choices=['gpu', 'cpu'],
                        help='device to train the model with. Options: cpu or gpu. Default: gpu')
    parser.add_argument('-p', '--pretrainings', default="checkpoints/pretrainings_e4.pth",
                        help='path to the pretrainings model. If empty, the model will be the '
                             'RobertaForSequenceClassification with roberta-large weights. '
                             'Default: checkpoints/pretrainings_e4.pth')
    parser.add_argument('-b', '--batchsize', default=8, type=int, help='size of the batches. Default: 8')
    parser.add_argument('-x', '--maxlen', default=64, type=int, help='max sequence length. Default: 64')
    parser.add_argument('-l', '--lr', default=1e-5, type=float, help='learning rate. Default: 1e-5')
    parser.add_argument('-e', '--epochs', default=2, type=int, help='number of epochs. Default: 2')
    parser.add_argument('-s', '--save', default=False, help='save model at the end of the training',
                        action='store_true')
    args = parser.parse_args()
    print(args)

    model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=2)
    # Load the pretrainings whenever a path is given, as the help text describes
    # (the original only loaded them when the path equalled the default value)
    if args.pretrainings:
        model.roberta = torch.load(args.pretrainings).roberta
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

    if args.device == "gpu":
        device = torch.device("cuda")
        model.cuda()
    if args.device == "cpu":
        device = torch.device("cpu")
        model.cpu()
    model.zero_grad()

    batch_size = args.batchsize
    max_len = args.maxlen
    lr = args.lr
    epochs = args.epochs
    retrieval_solver = args.retrieval
    save_model = args.save

    raw_data_train = get_data_tf("train", retrieval_solver, tokenizer, max_len)
    raw_data_val = get_data_tf("val", retrieval_solver, tokenizer, max_len)
    train_dataloader = process_data_ndq(raw_data_train, batch_size, "train")
    val_dataloader = process_data_ndq(raw_data_val, batch_size, "val")

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    print(total_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    training_tf(model, train_dataloader, val_dataloader, optimizer, scheduler, epochs, retrieval_solver,
                device, save_model)
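# Added example invocation (the script name is an assumption, not from the source):
#   python train_tf.py --retrieval IR --device gpu --batchsize 8 --maxlen 64 --lr 1e-5 --epochs 2 --save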
def train_f1_f2(args, model_f1, model_f2, train_dataset):
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.mini_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    args.num_train_epochs = 1
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    if args.warmup_proportion > 0:
        args.warmup_steps = int(t_total * args.warmup_proportion)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in list(model_f1.named_parameters()) + list(model_f2.named_parameters())
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in list(model_f1.named_parameters()) + list(model_f2.named_parameters())
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        [model_f1, model_f2], optimizer = amp.initialize([model_f1, model_f2], optimizer,
                                                         opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model_f1 = torch.nn.DataParallel(model_f1)
        model_f2 = torch.nn.DataParallel(model_f2)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model_f1 = torch.nn.parallel.DistributedDataParallel(
            model_f1, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)
        model_f2 = torch.nn.parallel.DistributedDataParallel(
            model_f2, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss = 0.0, 0.0
    model_f1.zero_grad()
    model_f2.zero_grad()
    set_seed(args)

    logger.info("***** train f1 f2 ******")
    logger.info("***** Num examples: {} ********".format(len(train_dataset)))
    for _ in range(1):
        epoch_iterator = tqdm(train_dataloader, desc="Iter(loss=X.XXX, lr=X.XXXXXXXX)",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model_f1.train()
            model_f2.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3],
                "label_mask": batch[4],
            }
            outputs1 = model_f1(**inputs)
            loss1 = outputs1
            outputs2 = model_f2(**inputs)
            loss2 = outputs2

            w1 = model_f1.classifier.weight                    # [num_labels, hidden_size] (nn.Linear stores [out, in])
            w2 = model_f2.classifier.weight.transpose(-1, -2)  # [hidden_size, num_labels]
            norm_term = torch.norm(torch.matmul(w1, w2))
            loss = loss1 + loss2 + args.alpha * norm_term

            if args.n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                epoch_iterator.set_description('Iter (loss=%5.3f) lr=%9.7f' % (loss.item(), scheduler.get_lr()[0]))
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model_f1.parameters(), args.max_grad_norm)
                    torch.nn.utils.clip_grad_norm_(model_f2.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model_f1.zero_grad()
                model_f2.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar("f1_f2_lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("f1_f2_loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return model_f1, model_f2
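# Added sketch (not from the original): the cross-classifier regulariser computed
# in train_f1_f2() above, in isolation. It is the norm of W1 @ W2^T, which
# penalises overlap between the two classifier heads' decision directions.
# Shapes follow nn.Linear's [out_features, in_features] storage.
import torch


def _classifier_orthogonality_penalty_sketch(num_labels: int = 3, hidden: int = 8):
    w1 = torch.randn(num_labels, hidden)  # stands in for model_f1.classifier.weight
    w2 = torch.randn(num_labels, hidden)  # stands in for model_f2.classifier.weight
    return torch.norm(torch.matmul(w1, w2.transpose(-1, -2)))  # scalar penalty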
def train():
    writer = SummaryWriter(comment="Relation")
    modelDir = writer.log_dir.replace("runs", "models")
    epochs = 20
    device = "cuda"
    dataset = RelationDataset("albert-base-v2", device="cpu")
    dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn_padd)
    model = AlbertForRelation.from_pretrained("albert-base-v2", num_rel_labels=len(relationTypes))
    model.resize_token_embeddings(len(dataset.dataset.tokenizer))
    model.to(device)
    optim = AdamW([
        {"params": model.albert.parameters(), "lr": 1e-4},
        {"params": model.classifier.parameters(), "lr": 1e-3},
    ])
    # num_training_steps as an integer: ~10000 samples per epoch at batch size 32
    scheduler = get_linear_schedule_with_warmup(optim, 100, epochs * 10000 // 32)
    iTot = 0
    for epoch in range(epochs):
        i = 0
        lossesTrain = []
        lossesVal = []
        for (input_ids, token_type_ids, attention_mask, rel_label, e1_index, e2_index) in dataloader:
            if i % 5 != 0:
                model.train()
                loss, acc = model(
                    input_ids.to(device),
                    token_type_ids.to(device),
                    attention_mask.to(device),
                    rel_label.to(device),
                    e1_index.to(device),
                    e2_index.to(device),
                )
                loss.backward()
                optim.step()
                scheduler.step()
                optim.zero_grad()
                lossesTrain.append(loss.item())
                writer.add_scalar("lossRel/Train", lossesTrain[-1], iTot)
                writer.add_scalar("accRel/Train", acc.item(), iTot)
            else:
                with torch.no_grad():
                    model.eval()
                    loss, acc = model(
                        input_ids.to(device),
                        token_type_ids.to(device),
                        attention_mask.to(device),
                        rel_label.to(device),
                        e1_index.to(device),
                        e2_index.to(device),
                    )
                    lossesVal.append(loss.item())
                    writer.add_scalar("accRel/Eval", acc.item(), iTot)
                    writer.add_scalar("lossRel/Eval", lossesVal[-1], iTot)
            if iTot % 20 == 0:
                for (i2, lr) in enumerate(scheduler.get_lr()):
                    writer.add_scalar("lr/" + str(i2), lr, iTot)
            print(epoch, i)
            i += 1
            iTot += 1
        model.save_pretrained(modelDir + "/" + str(epoch))
        dataset.dataset.tokenizer.save_pretrained(modelDir + "/" + str(epoch))
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0 or (
                    # last step in epoch but step is always smaller than gradient_accumulation_steps
                    len(epoch_iterator) <= args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator)):
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
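# Added note (not from the original): the checkpoint layout written by the
# function above can be restored later with the matching load calls — weights
# via from_pretrained, optimizer/scheduler state via torch.load. The model
# class and paths are illustrative.
#
#   model = BertForSequenceClassification.from_pretrained(output_dir)
#   tokenizer = BertTokenizer.from_pretrained(output_dir)
#   optimizer.load_state_dict(torch.load(os.path.join(output_dir, "optimizer.pt")))
#   scheduler.load_state_dict(torch.load(os.path.join(output_dir, "scheduler.pt")))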
    MODEL_NAME,
    hidden_dropout_prob=DROPOUT,
    attention_probs_dropout_prob=DROPOUT,
    num_labels=len(labels2ind),
    id2label={str(v): k for k, v in labels2ind.items()})

# Prepare optimizer and schedule (linear warmup and decay)
optimizer = get_optimizer_with_weight_decay(model=nerbert,
                                            optimizer=OPTIMIZER,
                                            learning_rate=LEARNING_RATE,
                                            weight_decay=WEIGHT_DECAY)

training_steps = (len(dataloader_tr) // ACUMULATE_GRAD_EVERY) * N_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                            num_warmup_steps=training_steps * RATIO_WARMUP_STEPS,
                                            num_training_steps=training_steps)

# Trainer
trainer = BertTrainer(model=nerbert,
                      tokenizer=tokenizer,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      labels2ind=labels2ind,
                      device=DEVICE,
                      n_epochs=N_EPOCHS,
                      accumulate_grad_every=ACUMULATE_GRAD_EVERY,
                      output_dir='./trained_models')

# Train and validate model
trainer.train(dataloader_train=dataloader_tr, dataloader_val=dataloader_val)
def train(model, tokenizer, train_dataloader, validation_dataloader, index_to_label, pad_token_dict, doc_start_ind_dict, device): def calculate_loss(lm_logits, b_labels, b_input_mask, cls_labels, index_to_label, doc_start_ind_dict, loss_fct): batch_size = lm_logits.shape[0] logits_collected = [] labels_collected = [] for b in range(batch_size): logits_ind = lm_logits[b, :, :] # seq_len x |V| labels_ind = b_labels[b, :] # seq_len mask = b_input_mask[b, :] > 0 maski = mask.unsqueeze(-1).expand_as(logits_ind) # unpad_seq_len x |V| logits_pad_removed = torch.masked_select(logits_ind, maski).view( -1, logits_ind.size(-1)) labels_pad_removed = torch.masked_select(labels_ind, mask) # unpad_seq_len doc_start_ind = doc_start_ind_dict[index_to_label[ cls_labels[b].item()]] shift_logits = logits_pad_removed[doc_start_ind - 1:-1, :].contiguous() shift_labels = labels_pad_removed[doc_start_ind:].contiguous() # Flatten the tokens logits_collected.append( shift_logits.view(-1, shift_logits.size(-1))) labels_collected.append(shift_labels.view(-1)) logits_collected = torch.cat(logits_collected, dim=0) labels_collected = torch.cat(labels_collected, dim=0) loss = loss_fct(logits_collected, labels_collected) return loss optimizer = AdamW( model.parameters(), lr=5e-4, # args.learning_rate - default is 5e-5, our notebook had 2e-5 eps=1e-8 # args.adam_epsilon - default is 1e-8. ) loss_fct = CrossEntropyLoss() sample_every = 100 warmup_steps = 1e2 epochs = 5 total_steps = len(train_dataloader) * epochs scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps) seed_val = 42 random.seed(seed_val) np.random.seed(seed_val) torch.manual_seed(seed_val) torch.cuda.manual_seed_all(seed_val) training_stats = [] total_t0 = time.time() for epoch_i in range(0, epochs): print("", flush=True) print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs), flush=True) print('Training...', flush=True) t0 = time.time() total_train_loss = 0 model.train() for step, batch in enumerate(train_dataloader): if step % sample_every == 0 and not step == 0: elapsed = format_time(time.time() - t0) print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format( step, len(train_dataloader), elapsed), flush=True) model.eval() lbl = random.choice(list(index_to_label.values())) temp_list = ["<|labelpad|>"] * pad_token_dict[lbl] if len(temp_list) > 0: label_str = " ".join( lbl.split("_")) + " " + " ".join(temp_list) else: label_str = " ".join(lbl.split("_")) text = tokenizer.bos_token + " " + label_str + " <|labelsep|> " sample_outputs = model.generate(input_ids=tokenizer.encode( text, return_tensors='pt').to(device), do_sample=True, top_k=50, max_length=200, top_p=0.95, num_return_sequences=1) for i, sample_output in enumerate(sample_outputs): print("{}: {}".format(i, tokenizer.decode(sample_output)), flush=True) model.train() b_input_ids = batch[0].to(device) b_labels = batch[0].to(device) b_input_mask = batch[1].to(device) cls_labels = batch[2].to(device) model.zero_grad() outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) loss = calculate_loss(outputs[1], b_labels, b_input_mask, cls_labels, index_to_label, doc_start_ind_dict, loss_fct) # loss = outputs[0] total_train_loss += loss.item() loss.backward() optimizer.step() scheduler.step() # Calculate the average loss over all of the batches. avg_train_loss = total_train_loss / len(train_dataloader) # Measure how long this epoch took. 
        training_time = format_time(time.time() - t0)
        print("", flush=True)
        print("  Average training loss: {0:.2f}".format(avg_train_loss), flush=True)
        print("  Training epoch took: {:}".format(training_time), flush=True)
        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.
        print("", flush=True)
        print("Running Validation...", flush=True)
        t0 = time.time()
        model.eval()
        total_eval_loss = 0
        nb_eval_steps = 0
        # Evaluate data for one epoch
        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[0].to(device)
            cls_labels = batch[2].to(device)
            with torch.no_grad():
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)
            # Accumulate the validation loss.
            loss = calculate_loss(outputs[1], b_labels, b_input_mask,
                                  cls_labels, index_to_label,
                                  doc_start_ind_dict, loss_fct)
            # loss = outputs[0]
            total_eval_loss += loss.item()
        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)
        print("  Validation Loss: {0:.2f}".format(avg_val_loss), flush=True)
        print("  Validation took: {:}".format(validation_time), flush=True)
        # Record all statistics from this epoch.
        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        })
    print("", flush=True)
    print("Training complete!", flush=True)
    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)), flush=True)
    return model
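# format_time is called throughout the loop above but defined elsewhere; a
# minimal sketch, assuming it renders elapsed seconds as h:mm:ss to match
# the "(h:mm:ss)" note in the final print:
import datetime

def format_time(elapsed):
    # Round to whole seconds, then format as h:mm:ss.
    return str(datetime.timedelta(seconds=int(round(elapsed))))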
def main(params: dict, output_dir: str): import mlflow print("start params={}".format(params)) model_id = "all" logger = get_logger() df = pd.read_pickle( "../input/riiid-test-answer-prediction/train_merged.pickle") # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True) if is_debug: df = df.head(30000) df["prior_question_had_explanation"] = df[ "prior_question_had_explanation"].fillna(-1) df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan) column_config = { ("content_id", "content_type_id"): { "type": "category" }, "user_answer": { "type": "leakage_feature" }, "answered_correctly": { "type": "leakage_feature" }, "part": { "type": "category" }, "prior_question_elapsed_time_bin300": { "type": "category" }, "duration_previous_content_bin300": { "type": "category" }, "prior_question_had_explanation": { "type": "category" }, "rating_diff_content_user_id": { "type": "numeric" }, "task_container_id_bin300": { "type": "category" }, "previous_answer_index_question_id": { "type": "category" }, "previous_answer_question_id": { "type": "category" }, "timediff-elapsedtime_bin500": { "type": "category" }, "timedelta_log10": { "type": "category" } } if not load_pickle or is_debug: feature_factory_dict = {"user_id": {}} feature_factory_dict["user_id"][ "DurationPreviousContent"] = DurationPreviousContent( is_partial_fit=True) feature_factory_dict["user_id"][ "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder() feature_factory_dict["user_id"][ "UserContentRateEncoder"] = UserContentRateEncoder( rate_func="elo", column="user_id") feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2( groupby="user_id", column="question_id", is_debug=is_debug, model_id=model_id, n=300) feature_factory_dict["user_id"][ "StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True) feature_factory_dict["user_id"][ f"MeanAggregatorStudyTimebyUserId"] = MeanAggregator( column="user_id", agg_column="study_time", remove_now=False) feature_factory_dict["user_id"][ "ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder( ) feature_factory_dict["post"] = { "DurationFeaturePostProcess": DurationFeaturePostProcess() } feature_factory_manager = FeatureFactoryManager( feature_factory_dict=feature_factory_dict, logger=logger, split_num=1, model_id=model_id, load_feature=not is_debug, save_feature=not is_debug) print("all_predict") df = feature_factory_manager.all_predict(df) def f(x): x = x // 1000 if x < -100: return -100 if x > 400: return 400 return x df["task_container_id_bin300"] = [ x if x < 300 else 300 for x in df["task_container_id"] ] df["timediff-elapsedtime_bin500"] = [ f(x) for x in df["timediff-elapsedtime"].values ] df["timedelta_log10"] = np.log10( df["duration_previous_content"].values) df["timedelta_log10"] = df["timedelta_log10"].replace( -np.inf, -1).replace(np.inf, -1).fillna(-1).astype("int8") df = df[[ "user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly", "prior_question_elapsed_time_bin300", "duration_previous_content_bin300", "prior_question_had_explanation", "rating_diff_content_user_id", "task_container_id_bin300", "previous_answer_index_question_id", "previous_answer_question_id", "row_id", "timediff-elapsedtime_bin500", "timedelta_log10" ]] print(df.head(10)) print("data preprocess") ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", 
sequence_length=params["max_seq"], logger=logger) ff_for_transformer.make_dict(df=df) n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")]) if not load_pickle or is_debug: df_val_row = pd.read_feather( "../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather" ) if is_debug: df_val_row = df_val_row.head(3000) df_val_row["is_val"] = 1 df = pd.merge(df, df_val_row, how="left", on="row_id") df["is_val"] = df["is_val"].fillna(0) print(df["is_val"].value_counts()) w_df = df[df["is_val"] == 0] w_df["group"] = ( w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"] w_df["user_id"] = w_df["user_id"].astype( str) + "_" + w_df["group"].astype(str) group = ff_for_transformer.all_predict(w_df) dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"]) del w_df gc.collect() ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", sequence_length=params["max_seq"], logger=logger) if not load_pickle or is_debug: group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0]) dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill, max_seq=params["max_seq"]) os.makedirs("../input/feature_engineering/model275_all", exist_ok=True) if not is_debug and not load_pickle: with open(f"../input/feature_engineering/model275_all/train.pickle", "wb") as f: pickle.dump(dataset_train, f) with open(f"../input/feature_engineering/model275_all/val.pickle", "wb") as f: pickle.dump(dataset_val, f) if not is_debug and load_pickle: with open(f"../input/feature_engineering/model275_all/train.pickle", "rb") as f: dataset_train = pickle.load(f) with open(f"../input/feature_engineering/model275_all/val.pickle", "rb") as f: dataset_val = pickle.load(f) print("loaded!") dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True) dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False) model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"], dropout=dropout, cont_emb=params["cont_emb"]) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW( optimizer_grouped_parameters, lr=params["lr"], weight_decay=0.01, ) num_train_optimization_steps = int(len(dataloader_train) * 20) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=params["num_warmup_steps"], num_training_steps=num_train_optimization_steps) criterion = nn.BCEWithLogitsLoss() model.to(device) criterion.to(device) for epoch in range(epochs): loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val, optimizer, criterion, scheduler, epoch, output_dir, device) print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}". 
              format(epoch, loss, auc, auc_val))
        torch.save(
            model.state_dict(),
            f"{output_dir}/transformers_epoch{epoch}_auc{round(auc_val, 4)}.pth"
        )
    # df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")
    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values,
                            df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))
        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10,
                         run_name=os.path.basename(__file__))
        for key, value in params.items():
            mlflow.log_param(key, value)
        # validation AUC from the final training epoch
        mlflow.log_metric("auc_val", auc_val)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)
        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)
        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
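# The "group" construction in the data-preparation step above splits each
# user's interaction history into fixed-length chunks so every chunk fits in
# one transformer window. A self-contained demo of the same arithmetic with
# toy data and max_seq=3:
import pandas as pd

w = pd.DataFrame({"user_id": [1, 1, 1, 1, 1, 2, 2]})
max_seq = 3
# remaining rows for this user (count - position), integer-divided by max_seq
w["group"] = (w.groupby("user_id")["user_id"].transform("count") -
              w.groupby("user_id").cumcount()) // max_seq
w["user_id"] = w["user_id"].astype(str) + "_" + w["group"].astype(str)
print(w["user_id"].tolist())
# -> ['1_1', '1_1', '1_1', '1_0', '1_0', '2_0', '2_0']
# The most recent rows of each user (sorted by timestamp) land in chunk 0.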
def train(coarse_model, fine_model, coarse_tokenizer, fine_tokenizer,
          train_dataloader, validation_dataloader,
          label_to_exclusive_dataloader, doc_start_ind, index_to_label,
          device, secondary_device):
    def calculate_kl_div_loss(batch_fine_probs, batch_coarse_probs,
                              batch_fine_input_masks,
                              batch_coarse_input_masks, batch_fine_input_ids,
                              batch_coarse_input_ids, coarse_tokenizer,
                              fine_tokenizer, doc_start_ind):
        # Remove pad tokens
        # consider from doc_start_ind - 1
        loss_fct = torch.nn.KLDivLoss(reduction="batchmean")
        batch_size = batch_fine_probs.shape[0]
        losses = []
        for b in range(batch_size):
            fine_logits_ind = batch_fine_probs[b, :, :]  # seq_len x |V|
            coarse_logits_ind = batch_coarse_probs[b, :, :]  # seq_len x |V|
            fine_mask = batch_fine_input_masks[b, :] > 0
            coarse_mask = batch_coarse_input_masks[b, :] > 0
            if not torch.all(fine_mask.eq(coarse_mask)):
                print("Fine sentence",
                      fine_tokenizer.decode(batch_fine_input_ids[b, :]))
                print("Coarse sentence",
                      coarse_tokenizer.decode(batch_coarse_input_ids[b, :]))
                raise Exception("Fine and coarse masks are not the same")
            fine_dec_sent = fine_tokenizer.decode(
                batch_fine_input_ids[b, :][doc_start_ind:])
            coarse_dec_sent = coarse_tokenizer.decode(
                batch_coarse_input_ids[b, :][doc_start_ind:])
            if fine_dec_sent != coarse_dec_sent:
                print(
                    "Fine sentence ",
                    fine_tokenizer.decode(
                        batch_fine_input_ids[b, :][doc_start_ind:]))
                print(
                    "Coarse sentence ",
                    coarse_tokenizer.decode(
                        batch_coarse_input_ids[b, :][doc_start_ind:]))
                raise Exception(
                    "Fine and coarse decoded sentences are not the same")
            fine_maski = fine_mask.unsqueeze(-1).expand_as(fine_logits_ind)
            coarse_maski = coarse_mask.unsqueeze(-1).expand_as(
                coarse_logits_ind)  # unpad_seq_len x |V|
            fine_logits_pad_removed = torch.masked_select(
                fine_logits_ind,
                fine_maski).view(-1, fine_logits_ind.size(-1))
            coarse_logits_pad_removed = torch.masked_select(
                coarse_logits_ind,
                coarse_maski).view(-1, coarse_logits_ind.size(-1))
            shift_fine_logits = fine_logits_pad_removed[doc_start_ind -
                                                        1:-1, :].contiguous()
            shift_coarse_logits = coarse_logits_pad_removed[
                doc_start_ind - 1:-1, :].contiguous()
            # Compute loss of shift_fine_logits and shift_coarse_logits, append to losses
            loss = loss_fct(shift_fine_logits,
                            shift_coarse_logits).unsqueeze(0)
            losses.append(loss)
        # Return mean of losses here
        losses = torch.cat(losses, dim=0)
        return losses.mean()

    def calculate_cross_entropy_loss(fine_model,
                                     label_to_exclusive_dataloader,
                                     doc_start_ind, device):
        loss_function = CrossEntropyLoss()
        b_labels_list = []
        b_input_ids_list = []
        b_input_mask_list = []
        scores_list = []
        selected_labs = random.sample(
            list(label_to_exclusive_dataloader.keys()), 6)
        for l in selected_labs:
            # print("Label", l)
            dataloader = label_to_exclusive_dataloader[l]
            it = 0
            for step, batch in enumerate(dataloader):
                # print("Step for exc", step, it)
                b_input_ids = batch[0].to(device)
                b_labels = batch[0].to(device)
                b_input_mask = batch[1].to(device)
                outputs = fine_model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask,
                                     labels=b_labels)
                b_labels_list.append(b_labels)
                b_input_ids_list.append(b_input_ids)
                b_input_mask_list.append(b_input_mask)
                scores_list.append(outputs[1])
                # reporter = MemReporter()
                # reporter.report()
                it += 1
                if it == 1:
                    break
        b_labels_tensor = torch.cat(b_labels_list, dim=0)
        b_input_ids_tensor = torch.cat(b_input_ids_list, dim=0)
        b_input_mask_tensor = torch.cat(b_input_mask_list, dim=0)
        scores_tensor = torch.cat(scores_list, dim=0)
        assert b_labels_tensor.shape[0] == b_input_ids_tensor.shape[0] == b_input_mask_tensor.shape[0] == \
               scores_tensor.shape[0]
        batch_size = 
scores_tensor.shape[0] logits_collected = [] labels_collected = [] for b in range(batch_size): logits_ind = scores_tensor[b, :, :] # seq_len x |V| labels_ind = b_labels_tensor[b, :] # seq_len mask = b_input_mask_tensor[b, :] > 0 maski = mask.unsqueeze(-1).expand_as(logits_ind) # unpad_seq_len x |V| logits_pad_removed = torch.masked_select(logits_ind, maski).view( -1, logits_ind.size(-1)) labels_pad_removed = torch.masked_select(labels_ind, mask) # unpad_seq_len shift_logits = logits_pad_removed[doc_start_ind - 1:-1, :].contiguous() shift_labels = labels_pad_removed[doc_start_ind:].contiguous() # Flatten the tokens logits_collected.append( shift_logits.view(-1, shift_logits.size(-1))) labels_collected.append(shift_labels.view(-1)) logits_collected = torch.cat(logits_collected, dim=0) labels_collected = torch.cat(labels_collected, dim=0) loss = loss_function(logits_collected, labels_collected).to(device) return loss def calculate_loss(batch_fine_probs, batch_coarse_probs, batch_fine_input_masks, batch_coarse_input_masks, batch_fine_input_ids, batch_coarse_input_ids, coarse_tokenizer, fine_tokenizer, fine_model, label_to_exclusive_dataloader, doc_start_ind, device, lambda_1=5, is_val=False): kl_div_loss = calculate_kl_div_loss( batch_fine_probs, batch_coarse_probs, batch_fine_input_masks, batch_coarse_input_masks, batch_fine_input_ids, batch_coarse_input_ids, coarse_tokenizer, fine_tokenizer, doc_start_ind) # del batch_fine_probs # del batch_coarse_probs # del batch_fine_input_masks # del batch_coarse_input_masks # del batch_fine_input_ids # del batch_coarse_input_ids # torch.cuda.empty_cache() if not is_val: cross_ent_loss = calculate_cross_entropy_loss( fine_model, label_to_exclusive_dataloader, doc_start_ind, device) print("KL-loss", kl_div_loss.item(), "CE-loss", cross_ent_loss.item()) else: cross_ent_loss = 0 print("KL-loss", kl_div_loss.item(), "CE-loss", cross_ent_loss) return (1 - lambda_1) * kl_div_loss + lambda_1 * cross_ent_loss def compute_lambda(step, max_steps): temp = 1 - step / max_steps if temp < 0: return 0 else: return temp # epsilon = 1e-20 # Defined to avoid log probability getting undefined. fine_posterior = torch.nn.Parameter( torch.ones(len(index_to_label)).to(device)) optimizer = AdamW( list(fine_model.parameters()) + [fine_posterior], lr=5e-4, # args.learning_rate - default is 5e-5, our notebook had 2e-5 eps=1e-8 # args.adam_epsilon - default is 1e-8. ) sample_every = 100 warmup_steps = 1e2 epochs = 5 total_steps = len(train_dataloader) * epochs scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps) seed_val = 42 random.seed(seed_val) np.random.seed(seed_val) torch.manual_seed(seed_val) torch.cuda.manual_seed_all(seed_val) training_stats = [] total_t0 = time.time() coarse_model.eval() global_step = 0 for epoch_i in range(0, epochs): print("", flush=True) print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs), flush=True) print('Training...', flush=True) t0 = time.time() total_train_loss = 0 fine_model.train() for step, batch in enumerate(train_dataloader): # batch contains -> coarse_input_ids, coarse_attention_masks, fine_input_ids, fine_attention_masks if step % sample_every == 0 and not step == 0: elapsed = format_time(time.time() - t0) print(' Batch {:>5,} of {:>5,}. 
Elapsed: {:}.'.format( step, len(train_dataloader), elapsed), flush=True) fine_model.eval() lbl = random.choice(list(index_to_label.values())) temp_list = ["<|labelpad|>"] * pad_token_dict[lbl] if len(temp_list) > 0: label_str = " ".join( lbl.split("_")) + " " + " ".join(temp_list) else: label_str = " ".join(lbl.split("_")) text = fine_tokenizer.bos_token + " " + label_str + " <|labelsep|> " sample_outputs = fine_model.generate( input_ids=fine_tokenizer.encode( text, return_tensors='pt').to(device), do_sample=True, top_k=50, max_length=200, top_p=0.95, num_return_sequences=1) for i, sample_output in enumerate(sample_outputs): print("{}: {}".format( i, fine_tokenizer.decode(sample_output)), flush=True) fine_model.train() fine_posterior_log_probs = torch.log_softmax(fine_posterior, dim=0) print(torch.softmax(fine_posterior, dim=0), flush=True) b_coarse_input_ids = batch[0].to(secondary_device) b_coarse_labels = batch[0].to(secondary_device) b_coarse_input_mask = batch[1].to(secondary_device) b_size = b_coarse_input_ids.shape[0] b_fine_input_ids_minibatch = batch[2].to(device) b_fine_input_mask_minibatch = batch[3].to(device) coarse_model.zero_grad() # fine_model.zero_grad() optimizer.zero_grad() outputs = coarse_model(b_coarse_input_ids, token_type_ids=None, attention_mask=b_coarse_input_mask, labels=b_coarse_labels) batch_coarse_probs = torch.softmax(outputs[1], dim=-1).to( device) # (b_size, seq_len, |V|) b_coarse_input_ids = b_coarse_input_ids.to(device) b_coarse_input_mask = b_coarse_input_mask.to(device) batch_fine_probs = [] batch_fine_input_masks = [] batch_fine_input_ids = [] for b_ind in range(b_size): fine_label_sum_log_probs = [] for l_ind in index_to_label: b_fine_input_ids = b_fine_input_ids_minibatch[ b_ind, l_ind, :].unsqueeze(0).to(device) b_fine_labels = b_fine_input_ids_minibatch[ b_ind, l_ind, :].unsqueeze(0).to(device) b_fine_input_mask = b_fine_input_mask_minibatch[ b_ind, l_ind, :].unsqueeze(0).to(device) outputs = fine_model(b_fine_input_ids, token_type_ids=None, attention_mask=b_fine_input_mask, labels=b_fine_labels) b_fine_labels = b_fine_labels.to(secondary_device) fine_log_probs = torch.log_softmax(outputs[1], dim=-1) fine_label_sum_log_probs.append( (fine_log_probs + fine_posterior_log_probs[l_ind])) fine_label_sum_log_probs = torch.cat( fine_label_sum_log_probs, dim=0) # (|F|, seq_len, |V|) batch_fine_probs.append(fine_label_sum_log_probs.unsqueeze(0)) batch_fine_input_ids.append(b_fine_input_ids) batch_fine_input_masks.append(b_fine_input_mask) batch_fine_probs = torch.cat(batch_fine_probs, dim=0) # (b_size, |F|, seq_len, |V|) batch_fine_input_masks = torch.cat(batch_fine_input_masks, dim=0) # (b_size, seq_len) batch_fine_input_ids = torch.cat(batch_fine_input_ids, dim=0) # (b_size, seq_len) batch_fine_log_probs = torch.logsumexp( batch_fine_probs, dim=1) # This computes logsum_i P(f_i|c) P(D|f_i) loss = calculate_loss( batch_fine_log_probs, batch_coarse_probs, batch_fine_input_masks, b_coarse_input_mask, batch_fine_input_ids, b_coarse_input_ids, coarse_tokenizer, fine_tokenizer, fine_model, label_to_exclusive_dataloader, doc_start_ind, device, lambda_1=compute_lambda(global_step, max_steps=len(train_dataloader) * epochs)) # loss = criterion(batch_fine_probs.log(), batch_coarse_probs.detach()).sum(dim=-1).mean(dim=-1).mean(dim=-1) total_train_loss += loss.item() print("Loss:", loss.item(), flush=True) loss.backward() optimizer.step() scheduler.step() global_step += 1 # Calculate the average loss over all of the batches. 
avg_train_loss = total_train_loss / len(train_dataloader) # Measure how long this epoch took. training_time = format_time(time.time() - t0) print("", flush=True) print(" Average training loss: {0:.2f}".format(avg_train_loss), flush=True) print(" Training epoch took: {:}".format(training_time), flush=True) # ======================================== # Validation # ======================================== # After the completion of each training epoch, measure our performance on # our validation set. print("", flush=True) print("Running Validation...", flush=True) t0 = time.time() fine_model.eval() total_eval_loss = 0 nb_eval_steps = 0 # Evaluate data for one epoch for batch in validation_dataloader: # batch contains -> coarse_input_ids, coarse_attention_masks, fine_input_ids, fine_attention_masks b_coarse_input_ids = batch[0].to(secondary_device) b_coarse_labels = batch[0].to(secondary_device) b_coarse_input_mask = batch[1].to(secondary_device) b_size = b_coarse_input_ids.shape[0] b_fine_input_ids_minibatch = batch[2].to(device) b_fine_input_mask_minibatch = batch[3].to(device) with torch.no_grad(): fine_posterior_log_probs = torch.log_softmax(fine_posterior, dim=0) outputs = coarse_model(b_coarse_input_ids, token_type_ids=None, attention_mask=b_coarse_input_mask, labels=b_coarse_labels) batch_coarse_probs = torch.softmax(outputs[1], dim=-1).to( device) # (b_size, seq_len, |V|) b_coarse_input_ids = b_coarse_input_ids.to(device) b_coarse_input_mask = b_coarse_input_mask.to(device) batch_fine_probs = [] batch_fine_input_masks = [] batch_fine_input_ids = [] for b_ind in range(b_size): fine_label_sum_log_probs = [] for l_ind in index_to_label: b_fine_input_ids = b_fine_input_ids_minibatch[ b_ind, l_ind, :].unsqueeze(0).to(device) b_fine_labels = b_fine_input_ids_minibatch[ b_ind, l_ind, :].unsqueeze(0).to(device) b_fine_input_mask = b_fine_input_mask_minibatch[ b_ind, l_ind, :].unsqueeze(0).to(device) outputs = fine_model(b_fine_input_ids, token_type_ids=None, attention_mask=b_fine_input_mask, labels=b_fine_labels) fine_log_probs = torch.log_softmax(outputs[1], dim=-1) fine_label_sum_log_probs.append( (fine_log_probs + fine_posterior_log_probs[l_ind])) fine_label_sum_log_probs = torch.cat( fine_label_sum_log_probs, dim=0) # (|F|, seq_len, |V|) batch_fine_probs.append( fine_label_sum_log_probs.unsqueeze(0)) batch_fine_input_ids.append(b_fine_input_ids) batch_fine_input_masks.append(b_fine_input_mask) batch_fine_probs = torch.cat( batch_fine_probs, dim=0) # (b_size, |F|, seq_len, |V|) batch_fine_input_masks = torch.cat(batch_fine_input_masks, dim=0) # (b_size, seq_len) batch_fine_input_ids = torch.cat(batch_fine_input_ids, dim=0) # (b_size, seq_len) batch_fine_log_probs = torch.logsumexp( batch_fine_probs, dim=1) # This computes logsum_i P(f_i|c) P(D|f_i) # Accumulate the validation loss. loss = calculate_loss( batch_fine_log_probs, batch_coarse_probs, batch_fine_input_masks, b_coarse_input_mask, batch_fine_input_ids, b_coarse_input_ids, coarse_tokenizer, fine_tokenizer, fine_model, label_to_exclusive_dataloader, doc_start_ind, device, is_val=True, lambda_1=compute_lambda(global_step, max_steps=len(train_dataloader) * epochs)) total_eval_loss += loss.item() # Calculate the average loss over all of the batches. avg_val_loss = total_eval_loss / len(validation_dataloader) # Measure how long the validation run took. 
validation_time = format_time(time.time() - t0) print(" Validation Loss: {0:.2f}".format(avg_val_loss), flush=True) print(" Validation took: {:}".format(validation_time), flush=True) # Record all statistics from this epoch. training_stats.append({ 'epoch': epoch_i + 1, 'Training Loss': avg_train_loss, 'Valid. Loss': avg_val_loss, 'Training Time': training_time, 'Validation Time': validation_time }) # todo make temp_df, fine_input_ids, fine_attention_masks class variables. # true, preds, _ = test(fine_model, fine_posterior, fine_input_ids, fine_attention_masks, doc_start_ind, # index_to_label, label_to_index, list(temp_df.label.values), device) print("", flush=True) print("Training complete!", flush=True) print("Total training took {:} (h:mm:ss)".format( format_time(time.time() - total_t0)), flush=True) return fine_posterior, fine_model
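# torch.nn.KLDivLoss expects log-probabilities as input and probabilities as
# target, which is why the fine model's scores go through log_softmax while
# the coarse model's scores go through softmax in the loops above. A tiny
# sanity check of that convention:
import torch

loss_fct = torch.nn.KLDivLoss(reduction="batchmean")
fine_logits = torch.randn(4, 10)    # stand-in for fine-model scores
coarse_logits = torch.randn(4, 10)  # stand-in for coarse-model scores
kl = loss_fct(torch.log_softmax(fine_logits, dim=-1),
              torch.softmax(coarse_logits, dim=-1))
# KL(coarse || fine) is non-negative and zero only when the two match.
assert kl.item() >= 0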
def train_ft(args, model_ft, train_dataset): if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.mini_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) args.num_train_epochs = 1 t_total = len(train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs if args.warmup_proportion > 0: args.warmup_steps = int(t_total * args.warmup_proportion) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in list(model_ft.named_parameters()) if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in list(model_ft.named_parameters()) if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model_ft, optimizer = amp.initialize(model_ft, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model_ft = torch.nn.DataParallel(model_ft) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model_ft = torch.nn.parallel.DistributedDataParallel( model_ft, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 tr_loss, logging_loss = 0.0, 0.0 model_ft.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) logger.info("******* train ft *************") for _ in range(1): epoch_iterator = tqdm(train_dataloader, desc="Iter(loss=X.XXX, lr=X.XXXXXXXX)", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model_ft.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "labels": batch[3], "label_mask": batch[4], } outputs = model_ft(**inputs) loss = outputs if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: epoch_iterator.set_description( 'Iter (loss=%5.3f) lr=%9.7f' % (loss.item(), scheduler.get_lr()[0])) if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model_ft.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model_ft.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: tb_writer.close() return model_ft
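# set_seed(args) is called by several of the training functions in this file
# but defined elsewhere; a minimal sketch, assuming args carries .seed and
# .n_gpu as the functions above do:
import random
import numpy as np
import torch

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)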
def train(args, train_dataset, valid_dataset, model, tokenizer, labels):
    # Prepare train data
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)
    train_batch_size = args.train_batch_size
    # Prepare optimizer
    t_total = len(train_dataloader) * args.num_train_epochs
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=t_total // 10,
                                                num_training_steps=t_total)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", train_batch_size)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    best_f1_score = 0
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            outputs = model(**inputs, return_dict=False)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            tr_loss += loss.item()
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1
        # Evaluate on the validation split after every epoch and keep only
        # the checkpoint with the best validation F1.
        results = evaluate(args, model, tokenizer, labels, 'validation')
        if results.get('f1', 0.0) > best_f1_score and args.save_steps > 0:
            best_f1_score = results.get('f1', 0.0)
            model_to_save = model.module if hasattr(model,
                                                    "module") else model
            model_to_save.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)
            torch.save(args, os.path.join(args.output_dir,
                                          "training_args.bin"))
    return global_step, tr_loss / global_step
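# The DataLoader above is built with collate_fn=collate, which is defined
# elsewhere. A hypothetical sketch of such a collator for variable-length
# batches, assuming each example is an (input_ids, attention_mask, labels)
# tuple of 1-D tensors; the pad values are assumptions, with -100 matching
# CrossEntropyLoss's default ignore_index:
import torch
from torch.nn.utils.rnn import pad_sequence

def collate(examples):
    input_ids = pad_sequence([e[0] for e in examples],
                             batch_first=True, padding_value=0)
    attention_mask = pad_sequence([e[1] for e in examples],
                                  batch_first=True, padding_value=0)
    labels = pad_sequence([e[2] for e in examples],
                          batch_first=True, padding_value=-100)
    return input_ids, attention_mask, labels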
def train(args, train_dataset, model, tokenizer, criterion): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate_fn, num_workers=args.num_workers, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
              parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_f1, n_no_improve = 0, 0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            labels = batch[5]
            inputs = {
                "input_ids": batch[0],
                "input_modal": batch[2],
                "attention_mask": batch[1],
                "modal_start_tokens": batch[3],
                "modal_end_tokens": batch[4],
            }
            outputs = model(**inputs)
            logits = outputs[
                0]  # model outputs are always tuple in transformers (see doc)
            loss = criterion(logits, labels)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, criterion)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    torch.save(model_to_save.state_dict(),
                               os.path.join(output_dir, WEIGHTS_NAME))
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

        if args.local_rank == -1:
            results = evaluate(args, model, tokenizer, criterion)
            if results["micro_f1"] > best_f1:
                best_f1 = results["micro_f1"]
                n_no_improve = 0
            else:
                n_no_improve += 1
            if n_no_improve > args.patience:
                train_iterator.close()
                break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
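# The "Total train batch size" logged by the function above is the effective
# batch size: per-GPU batch x GPUs x gradient-accumulation steps x world
# size. A worked example with made-up numbers:
per_gpu_train_batch_size = 8
n_gpu = 2                       # DataParallel GPUs in one process
gradient_accumulation_steps = 4
world_size = 1                  # single process (local_rank == -1)

train_batch_size = per_gpu_train_batch_size * n_gpu
effective_batch_size = (train_batch_size * gradient_accumulation_steps *
                        world_size)
assert effective_batch_size == 64  # one optimizer step sees 64 examples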
def main(params: dict, output_dir: str): import mlflow print("start params={}".format(params)) model_id = "train_0" logger = get_logger() # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle") df = pd.read_pickle( "../input/riiid-test-answer-prediction/split10/train_0.pickle" ).sort_values(["user_id", "timestamp"]).reset_index(drop=True) if is_debug: df = df.head(30000) df["prior_question_had_explanation"] = df[ "prior_question_had_explanation"].fillna(-1) column_config = { ("content_id", "content_type_id"): { "type": "category" }, "user_answer": { "type": "leakage_feature" }, "answered_correctly": { "type": "leakage_feature" }, "part": { "type": "category" }, "prior_question_elapsed_time_bin300": { "type": "category" }, "duration_previous_content_bin300": { "type": "category" }, "prior_question_had_explanation": { "type": "category" }, "rating_diff_content_user_id": { "type": "numeric" }, "task_container_id_bin300": { "type": "category" }, } if not load_pickle or is_debug: feature_factory_dict = {"user_id": {}} feature_factory_dict["user_id"][ "DurationPreviousContent"] = DurationPreviousContent() feature_factory_dict["user_id"][ "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder() feature_factory_dict["user_id"][ "UserContentRateEncoder"] = UserContentRateEncoder( rate_func="elo", column="user_id") feature_factory_dict["user_id"][ "StudyTermEncoder"] = StudyTermEncoder2() feature_factory_manager = FeatureFactoryManager( feature_factory_dict=feature_factory_dict, logger=logger, split_num=1, model_id="train_0", load_feature=not is_debug, save_feature=not is_debug) print("all_predict") df = feature_factory_manager.all_predict(df) df["task_container_id_bin300"] = [ x if x < 300 else 300 for x in df["task_container_id"].values ] def f(x): x = x // 1000 if x > 150: return 150 if x < -150: return -150 return x df["study_time_bin300"] = [f(x) for x in df["study_time"].values] df = df[[ "user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly", "prior_question_elapsed_time_bin300", "duration_previous_content_bin300", "study_time_bin300", "prior_question_had_explanation", "rating_diff_content_user_id", "task_container_id_bin300" ]] print(df.head(10)) print("data preprocess") train_idx = [] val_idx = [] np.random.seed(0) for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"): if np.random.random() < 0.01: # all val val_idx.extend(w_df.index.tolist()) else: train_num = int(len(w_df) * 0.95) train_idx.extend(w_df[:train_num].index.tolist()) val_idx.extend(w_df[train_num:].index.tolist()) ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", sequence_length=params["max_seq"], logger=logger) ff_for_transformer.make_dict(df=df) n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")]) if not load_pickle or is_debug: df["is_val"] = 0 df["is_val"].loc[val_idx] = 1 w_df = df[df["is_val"] == 0] w_df["group"] = ( w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"] w_df["user_id"] = w_df["user_id"].astype( str) + "_" + w_df["group"].astype(str) group = ff_for_transformer.all_predict(w_df) dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"]) del w_df gc.collect() ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", sequence_length=params["max_seq"], logger=logger) if not load_pickle or is_debug: group = 
ff_for_transformer.all_predict(df[df["content_type_id"] == 0]) dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill, max_seq=params["max_seq"]) os.makedirs("../input/feature_engineering/model137", exist_ok=True) if not is_debug and not load_pickle: with open(f"../input/feature_engineering/model137/train.pickle", "wb") as f: pickle.dump(dataset_train, f) with open(f"../input/feature_engineering/model137/val.pickle", "wb") as f: pickle.dump(dataset_val, f) if not is_debug and load_pickle: with open(f"../input/feature_engineering/model137/train.pickle", "rb") as f: dataset_train = pickle.load(f) with open(f"../input/feature_engineering/model137/val.pickle", "rb") as f: dataset_val = pickle.load(f) print("loaded!") dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True, num_workers=1) dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False, num_workers=1) model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"], dropout=dropout, n_encoder_layer=params["n_encoder_layer"], n_decoder_layer=params["n_decoder_layer"], emb1=params["emb1"], emb2=params["emb2"]) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW( optimizer_grouped_parameters, lr=params["lr"], weight_decay=0.01, ) num_train_optimization_steps = int(len(dataloader_train) * epochs) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=params["num_warmup_steps"], num_training_steps=num_train_optimization_steps) criterion = nn.BCEWithLogitsLoss() model.to(device) criterion.to(device) for epoch in range(epochs): loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val, optimizer, criterion, scheduler, device) print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}". 
format(epoch, loss, auc, auc_val)) preds = [] labels = [] with torch.no_grad(): for item in tqdm(dataloader_val): label = item["label"].to(device).float() output = model(item, device) preds.extend(torch.nn.Sigmoid()( output[:, -1]).view(-1).data.cpu().numpy().tolist()) labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist()) auc_transformer = roc_auc_score(labels, preds) print("single transformer: {:.4f}".format(auc_transformer)) df_oof = pd.DataFrame() # df_oof["row_id"] = df.loc[val_idx].index print(len(dataloader_val)) print(len(preds)) df_oof["predict"] = preds df_oof["target"] = labels df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False) """ df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv") df_oof2.columns = ["row_id", "predict_lgbm", "target"] df_oof2 = pd.merge(df_oof, df_oof2, how="inner") auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values) print("lgbm: {:.4f}".format(auc_lgbm)) print("ensemble") max_auc = 0 max_nn_ratio = 0 for r in np.arange(0, 1.05, 0.05): auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r) print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc)) if max_auc < auc: max_auc = auc max_nn_ratio = r print(len(df_oof2)) """ if not is_debug: mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__)) for key, value in params.items(): mlflow.log_param(key, value) mlflow.log_metric("auc_val", auc_transformer) mlflow.end_run() torch.save(model.state_dict(), f"{output_dir}/transformers.pth") del model torch.cuda.empty_cache() with open(f"{output_dir}/transformer_param.json", "w") as f: json.dump(params, f) if is_make_feature_factory: # feature factory feature_factory_dict = {"user_id": {}} feature_factory_dict["user_id"][ "DurationPreviousContent"] = DurationPreviousContent( is_partial_fit=True) feature_factory_dict["user_id"][ "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder() feature_factory_manager = FeatureFactoryManager( feature_factory_dict=feature_factory_dict, logger=logger, split_num=1, model_id="all", load_feature=not is_debug, save_feature=not is_debug) ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", sequence_length=params["max_seq"], logger=logger) df = pd.read_pickle( "../input/riiid-test-answer-prediction/train_merged.pickle") if is_debug: df = df.head(10000) df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True) feature_factory_manager.fit(df) df = feature_factory_manager.all_predict(df) for dicts in feature_factory_manager.feature_factory_dict.values(): for factory in dicts.values(): factory.logger = None feature_factory_manager.logger = None with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f: pickle.dump(feature_factory_manager, f) ff_for_transformer.fit(df) ff_for_transformer.logger = None with open( f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f: pickle.dump(ff_for_transformer, f)
def __setup_model_data(self, dataset, lower_case): """ set up data/language model """ if self.model is not None: return if self.args.is_trained: self.model = transformers.AutoModelForTokenClassification.from_pretrained( self.args.transformers_model) self.transforms = Transforms(self.args.transformers_model, cache_dir=self.cache_dir) self.label_to_id = self.model.config.label2id self.dataset_split, self.label_to_id, self.language, self.unseen_entity_set = get_dataset_ner( dataset, label_to_id=self.label_to_id, fix_label_dict=True, lower_case=lower_case) self.id_to_label = {v: str(k) for k, v in self.label_to_id.items()} else: self.dataset_split, self.label_to_id, self.language, self.unseen_entity_set = get_dataset_ner( dataset, lower_case=lower_case) self.id_to_label = {v: str(k) for k, v in self.label_to_id.items()} config = transformers.AutoConfig.from_pretrained( self.args.transformers_model, num_labels=len(self.label_to_id), id2label=self.id_to_label, label2id=self.label_to_id, cache_dir=self.cache_dir) self.model = transformers.AutoModelForTokenClassification.from_pretrained( self.args.transformers_model, config=config) self.transforms = Transforms(self.args.transformers_model, cache_dir=self.cache_dir) # optimizer no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [{ "params": [ p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": self.args.weight_decay }, { "params": [ p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }] self.optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=1e-8) # scheduler self.scheduler = transformers.get_linear_schedule_with_warmup( self.optimizer, num_warmup_steps=self.args.warmup_step, num_training_steps=self.args.total_step) # GPU allocation self.model.to(self.device) # GPU mixture precision if self.args.fp16: try: from apex import amp # noqa: F401 self.model, self.optimizer = amp.initialize( self.model, self.optimizer, opt_level='O1', max_loss_scale=2**13, min_loss_scale=1e-5) self.master_params = amp.master_params self.scale_loss = amp.scale_loss logging.info('using `apex.amp`') except ImportError: logging.exception( "Skip apex: please install apex from https://www.github.com/nvidia/apex to use fp16" ) # multi-gpus if self.n_gpu > 1: # multi-gpu training (should be after apex fp16 initialization) self.model = torch.nn.DataParallel(self.model.cuda()) logging.info('using `torch.nn.DataParallel`') logging.info('running on %i GPUs' % self.n_gpu)
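# The config above wires label2id/id2label into the transformers config so a
# reloaded checkpoint knows its tag set. A tiny demo of the same mapping
# construction with an illustrative label list:
labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {v: str(k) for k, v in label_to_id.items()}
assert id_to_label[label_to_id["B-LOC"]] == "B-LOC"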
def run(n_epochs, lr, train_batch_size, val_batch_size, base_model, clustering_loss_weight, embedding_extractor, annealing_alphas, dataset, train_idx_file, val_idx_file, result_dir, early_stopping, early_stopping_tol, device, random_state): # Set random states np.random.seed(random_state) torch.manual_seed(random_state) torch.cuda.manual_seed_all(random_state) # load data df = pd.read_csv(dataset) with open(train_idx_file, 'r') as f: train_idx = np.array(list(map(int, f.readlines()))) with open(val_idx_file, 'r') as f: val_idx = np.array(list(map(int, f.readlines()))) all_idx = np.concatenate((train_idx, val_idx)) df_train = df.iloc[all_idx].copy() train_texts = df_train['texts'].to_numpy() train_labels = df_train['labels'].to_numpy() train_data = TextDataset(train_texts, train_labels) train_data_loader = DataLoader(dataset=train_data, batch_size=train_batch_size, shuffle=False) df_val = df.iloc[val_idx].copy() val_texts = df_val['texts'].to_numpy() val_labels = df_val['labels'].to_numpy() val_data = TextDataset(val_texts, val_labels) val_data_loader = DataLoader(dataset=val_data, batch_size=val_batch_size, shuffle=False) # init lm model & tokenizer lm_model = AutoModelForMaskedLM.from_pretrained(base_model, return_dict=True, output_hidden_states=True) tokenizer = AutoTokenizer.from_pretrained(base_model, return_dict=True, output_hidden_states=True) lm_model.to(device) # init clustering model model, initial_centroids, initial_embeddings = init_model( lm_model=lm_model, tokenizer=tokenizer, data_loader=train_data_loader, embedding_extractor=embedding_extractor, n_clusters=np.unique(train_labels).shape[0], device=device) # init optimizer & scheduler opt = torch.optim.RMSprop( params=model.parameters(), lr=lr, # 2e-5, 5e-7, eps=1e-8) total_steps = len(train_data_loader) * n_epochs scheduler = get_linear_schedule_with_warmup( optimizer=opt, num_warmup_steps=int(len(train_data_loader) * 0.5), num_training_steps=total_steps) # train the model hist = train(n_epochs=n_epochs, model=model, optimizer=opt, scheduler=scheduler, annealing_alphas=annealing_alphas, train_data_loader=train_data_loader, clustering_loss_weight=clustering_loss_weight, early_stopping=early_stopping, early_stopping_tol=early_stopping_tol, verbose=True) # do eval run_results = {} predicted_labels, true_labels = evaluate(model=model, eval_data_loader=val_data_loader, verbose=True) best_matching, accuracy = cluster_accuracy(true_labels, predicted_labels) ari = adjusted_rand_score(true_labels, predicted_labels) nmi = normalized_mutual_info_score(true_labels, predicted_labels) purity = purity_score(y_true=true_labels, y_pred=predicted_labels) run_results['best_matching'] = best_matching run_results['accuracy'] = accuracy run_results['ari'] = ari run_results['nmi'] = nmi run_results[ 'purity'] = purity # use purity to compare with microsoft paper # save results & model os.makedirs(result_dir) with open(os.path.join(result_dir, 'train_hist.h'), 'wb') as f: pickle.dump(hist, file=f) result_df = pd.DataFrame.from_records([run_results]) result_df.to_csv(os.path.join(result_dir, f'ag_news_subset5-distilbert.csv'), index=False) torch.save(model, os.path.join(result_dir, 'model.bin'))
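# purity_score is imported from elsewhere; a minimal sketch of the standard
# definition (the fraction of samples assigned to the majority true class of
# their cluster), assuming that is what the helper computes:
import numpy as np
from sklearn.metrics.cluster import contingency_matrix

def purity_score(y_true, y_pred):
    cm = contingency_matrix(y_true, y_pred)  # rows: classes, cols: clusters
    return np.sum(np.amax(cm, axis=0)) / np.sum(cm)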
def evaluate_model(original_data, model_config, eval_config, seed=None): if seed is not None: np.random.seed(seed) torch.manual_seed(seed) skf = StratifiedKFold(n_splits=eval_config['n_folds']) accuracies = [] fold = 1 data = original_data.copy() basic_cols = ['text'] manual_cols = [ 'constructive', 'toxic', 'sarcasm_irony', 'mockery_ridicule', 'insults', 'argument_discussion', 'negative_toxic_lang', 'aggressiveness', 'intolerance' ] manual_transformation = {'sí': 1, 'si': 1, 'no': 0, 'd': 0.5} label_col = 'toxicity_degree' implemented_models = ['Random Forest', 'SVC', 'Logistic Regression'] if eval_config['basic_manual_both'] == 2: for col in manual_cols: data[col] = data[col].str.lower().map(manual_transformation) data = data[basic_cols + manual_cols + [label_col]] elif eval_config['basic_manual_both'] == 1: for col in manual_cols: data[col] = data[col].str.lower().map(manual_transformation) data = data[manual_cols + [label_col]] else: data = data[basic_cols + [label_col]] device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') for train_index, test_index in skf.split(data, data[label_col].values): train = data.loc[train_index, :] test = data.loc[test_index, :] if model_config['name'].startswith('bert'): # Create data loaders based on the data split train_data_loader = create_data_loader(train, model_config['tokenizer'], model_config['max_len'], model_config['batch_size']) test_data_loader = create_data_loader(test, model_config['tokenizer'], model_config['max_len'], model_config['batch_size']) # Create the model and load it into the device model = HateSpeechClassifier(model_config['name'], data[label_col].nunique()) model = model.to(device) # Add the Adam optimizer optimizer = torch.optim.Adam(params=model.parameters(), lr=model_config['learning_rate']) total_steps = len(train_data_loader) * model_config['epochs'] scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) # Use the cross entropy loss weights = torch.Tensor( 1 / train.groupby('toxicity_degree').size().sort_index().values) loss_fn = nn.CrossEntropyLoss(weight=weights).to(device) # Evaluate model on test history = defaultdict(list) for epoch in range(model_config['epochs']): print('Epoch {}/{}'.format(epoch + 1, model_config['epochs'])) print('-' * 10) train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_index)) print(f'Train loss {train_loss} accuracy {train_acc}') history['train_acc'].append(train_acc) history['train_loss'].append(train_loss) y_pred, test_acc = eval_nn(model, test_data_loader, device, len(test)) print(f'Test accuracy {test_acc}') print() # y_pred = eval_nn(model, test_data_loader, device, len(val)) elif model_config['name'] in implemented_models: train_x = train[[c for c in train.columns if c != label_col]] train_y = train[label_col].values test_x = test[[c for c in test.columns if c != label_col]] if eval_config['basic_manual_both'] != 1: bow = TfidfVectorizer( strip_accents=model_config['strip_accents'], stop_words=model_config['stop_words']) train_bow_feats = bow.fit_transform( train_x.text.values).todense() # Perform dimensionality reduction pca = PCA(n_components=model_config['PCA_components'], svd_solver=model_config['svd_solver']) train_bow_feats = pca.fit_transform(train_bow_feats) test_bow_feats = bow.transform(test_x.text.values).todense() test_bow_feats = pca.transform(test_bow_feats) train_x.drop('text', axis=1, inplace=True) test_x.drop('text', axis=1, inplace=True) 
train_x = np.hstack((train_x.values, train_bow_feats)) test_x = np.hstack((test_x.values, test_bow_feats)) if model_config['name'] == 'Random Forest': rfc = RandomForestClassifier( n_estimators=model_config['n_trees'], criterion=model_config['criterion'], max_features=model_config['n_feats'], bootstrap=model_config['bootstrap']) rfc.fit(train_x, train_y) if eval_config['log']: print(rfc.feature_importances_) y_pred = rfc.predict(test_x) elif model_config['name'] == 'SVC': svm = SVC( kernel=model_config['kernel'], decision_function_shape=model_config['decision_func'], gamma=model_config['gamma'], C=model_config['penalty']) svm.fit(train_x, train_y) y_pred = svm.predict(test_x) elif model_config['name'] == 'Logistic Regression': lr = LogisticRegression( penalty=model_config['penalty'], solver=model_config['solver'], multi_class=model_config['multi_class']) lr.fit(train_x, train_y) y_pred = lr.predict(test_x) else: print('No valid model has been selected') return accuracies.append( f1_score(test[label_col].values, y_pred, labels=data[label_col].unique(), average='macro')) # print('Accuracy for Fold', fold, 'is:', np.round(accuracies[-1], 4)) fold += 1 mean_accuracy = np.mean(accuracies) std_accuracy = np.std(accuracies) # print('Total Prediction Accuracy is:', np.round(mean_accuracy, 4), '\u00B1', np.round(std_accuracy, 4)) return mean_accuracy, std_accuracy
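# create_data_loader in the BERT branch above is defined elsewhere; a
# hypothetical sketch, assuming it tokenizes the 'text' column and yields
# batches for HateSpeechClassifier (the dict keys and the CommentsDataset
# class name are illustrative, not taken from the original code):
import torch
from torch.utils.data import DataLoader, Dataset

class CommentsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts = df['text'].tolist()
        self.labels = df['toxicity_degree'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(self.texts[idx],
                             truncation=True,
                             max_length=self.max_len,
                             padding='max_length',
                             return_tensors='pt')
        return {'input_ids': enc['input_ids'].squeeze(0),
                'attention_mask': enc['attention_mask'].squeeze(0),
                'targets': torch.tensor(self.labels[idx], dtype=torch.long)}

def create_data_loader(df, tokenizer, max_len, batch_size):
    return DataLoader(CommentsDataset(df, tokenizer, max_len),
                      batch_size=batch_size, shuffle=True)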
def selftrain(self, soft=True): selftrain_dataset = ConcatDataset([self.train_dataset, self.unlabeled]) ## generating pseudo_labels pseudo_labels = [] train_sampler = RandomSampler(selftrain_dataset) train_dataloader = DataLoader( selftrain_dataset, sampler=train_sampler, batch_size=self.args.batch_size ) if self.args.self_training_max_step > 0: t_total = self.args.self_training_max_step self.args.num_train_epochs = ( self.args.self_training_max_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 ) else: t_total = ( len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs ) # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": self.args.weight_decay, }, { "params": [ p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW( optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon, ) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total, ) self_training_loss = ( nn.KLDivLoss(reduction="none") if soft else nn.CrossEntropyLoss(reduction="none") ) softmax = nn.Softmax(dim=1) update_step = 0 self_training_steps = self.args.self_training_max_step global_step = 0 selftrain_loss = 0 set_seed(self.args) # self.model.zero_grad() for t3 in range(int(self_training_steps / len(train_dataloader)) + 1): epoch_iterator = tqdm(train_dataloader, desc="SelfTrain, Iteration") for step, batch in enumerate(epoch_iterator): if global_step % self.args.self_training_update_period == 0: teacher_model = copy.deepcopy(self.model) # .to("cuda") teacher_model.eval() for p in teacher_model.parameters(): p.requires_grad = False self.model.train() batch = tuple(t.to(self.device) for t in batch) # GPU or CPU inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } # self.model.eval() if self.args.task_type == "wic": inputs["keys"] = batch[6] elif self.args.task_type == "re": inputs["e1_mask"] = batch[4] inputs["e2_mask"] = batch[5] outputs = self.model(**inputs) outputs_pseudo = teacher_model(**inputs) logits = outputs[0] true_labels = batch[-1] loss = self.calc_loss( input=torch.log(softmax(logits)), target=outputs_pseudo[0], loss=self_training_loss, thresh=self.args.self_training_eps, soft=soft, conf="entropy", confreg=self.args.self_training_confreg, ) if self.args.self_training_contrastive_weight > 0: contrastive_loss = self.contrastive_loss( input=torch.log(softmax(logits)), feat=outputs_pseudo[-1], target=outputs_pseudo[0], conf="entropy", thresh=self.args.self_training_eps, distmetric=self.args.distmetric, ) loss = ( loss + self.args.self_training_contrastive_weight * contrastive_loss ) if self.args.gradient_accumulation_steps > 1: loss = loss / self.args.gradient_accumulation_steps if torch.cuda.device_count() > 1: loss = loss.mean() selftrain_loss += loss.item() loss.backward() if (step + 1) % self.args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_( self.model.parameters(), self.args.max_grad_norm ) optimizer.step() scheduler.step() # Update learning rate schedule self.model.zero_grad() teacher_model.zero_grad() global_step += 1 epoch_iterator.set_description( "SelfTrain iter:%d Loss:%.3f m:%.3f" % (step, selftrain_loss / global_step, 0) ) if ( 
self.args.self_train_logging_steps > 0 and global_step % self.args.self_train_logging_steps == 0 ): # self.evaluate("dev", global_step) self.evaluate("test", global_step) if ( self.args.save_steps > 0 and global_step % self.args.save_steps == 0 ): self.save_model() if 0 < self.args.self_training_max_step < global_step: epoch_iterator.close() break if 0 < self.args.self_training_max_step < global_step: break
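# --- Added illustration (not part of the original code) ------------------------
# Hedged sketch of the teacher-student update at the heart of selftrain(),
# reduced to plain tensors: a frozen snapshot of the model provides soft
# pseudo-labels, and the student minimizes the KL divergence against them.
# nn.Linear stands in for the real model; the original additionally applies
# confidence thresholding (self_training_eps) and an optional contrastive term.
import copy
import torch
import torch.nn as nn

student = nn.Linear(8, 3)
teacher = copy.deepcopy(student)  # periodic snapshot, as in the loop above
teacher.eval()
for p in teacher.parameters():
    p.requires_grad = False

x = torch.randn(4, 8)
log_probs = torch.log_softmax(student(x), dim=1)
with torch.no_grad():
    soft_targets = torch.softmax(teacher(x), dim=1)  # soft pseudo-labels
# KLDivLoss expects log-probabilities as input and probabilities as target.
loss = nn.KLDivLoss(reduction="batchmean")(log_probs, soft_targets)
loss.backward()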
def train(args, train_dataset, bert_model, model, tokenizer, labels, pad_token_label_id): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() loss_fct = torch.nn.CrossEntropyLoss() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to global_step of last saved checkpoint from model path try: global_step = int( args.model_name_or_path.split("-")[-1].split("/")[0]) except ValueError: global_step = 0 epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet"] else None ) # XLM and RoBERTa don't use segment_ids hs = bert_model(inputs["input_ids"], attention_mask=inputs["attention_mask"], output_hidden_states=True) avg_emb = 0 for layer in range(1, len(hs.hidden_states)): avg_emb += hs.hidden_states[layer] avg_emb = torch.div(avg_emb, len(hs.hidden_states) - 1) #print(avg_emb.shape, len(hs.hidden_states)) #cls_hs = hs[0] logits = model(avg_emb) active_loss = inputs["attention_mask"].view(-1) == 1 active_logits = logits.view(-1, args.num_labels) active_labels = torch.where( active_loss, inputs["labels"].view(-1), torch.tensor(loss_fct.ignore_index).type_as(inputs["labels"])) loss = loss_fct(active_logits, active_labels) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule (after optimizer.step(), as required since PyTorch 1.1) model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev") for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if
args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training bert_model_to_save = (bert_model.module if hasattr( bert_model, "module") else bert_model) bert_model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save( args, os.path.join(args.output_dir, "training_args.bin")) torch.save( model_to_save.state_dict(), os.path.join(args.output_dir, "bert_lstm.model")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
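# --- Added illustration (not part of the original code) ------------------------
# What the layer-averaging step inside train() computes: with
# output_hidden_states=True, HF models return the embedding output plus one
# tensor per transformer layer, and the code feeds the mean of layers 1..N to a
# separate classification head. Random tensors stand in for real hidden states
# so this sketch runs offline.
import torch

num_layers, batch, seq_len, hidden = 12, 2, 16, 768
# hidden_states[0] is the embedding output; 1..num_layers are the layers.
hidden_states = tuple(torch.randn(batch, seq_len, hidden) for _ in range(num_layers + 1))

avg_emb = 0
for layer in range(1, len(hidden_states)):  # skip the embedding output, as above
    avg_emb = avg_emb + hidden_states[layer]
avg_emb = avg_emb / (len(hidden_states) - 1)
assert avg_emb.shape == (batch, seq_len, hidden)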
def run(): seed_everything(config.SEED) dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True) df_train, df_valid = model_selection.train_test_split( dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values) df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) train_dataset = TweetDataset(tweet=df_train.text.values, sentiment=df_train.sentiment.values, selected_text=df_train.selected_text.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) valid_dataset = TweetDataset(tweet=df_valid.text.values, sentiment=df_valid.sentiment.values, selected_text=df_valid.selected_text.values) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2) device = torch.device("cuda") model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH) model_config.output_hidden_states = True model = TweetModel(conf=model_config) model.to(device) num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.001 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }, ] optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) es = utils.EarlyStopping(patience=2, mode="max") for epoch in range(config.EPOCHS): engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler) jaccard = engine.eval_fn(valid_data_loader, model, device) #print(f"Jaccard Score = {jaccard}") es(jaccard, model, model_path="model.bin") if es.early_stop: print("Early stopping") break
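# --- Added illustration (not part of the original code) ------------------------
# run() relies on utils.EarlyStopping; below is a minimal sketch of such a
# helper in "max" mode (an assumption about its interface, inferred from the
# call sites es(jaccard, model, model_path=...) and es.early_stop): stop when
# the monitored score fails to improve for `patience` epochs, checkpointing
# the best model in between.
import torch

class EarlyStopping:
    def __init__(self, patience=2, mode="max"):
        self.patience, self.mode = patience, mode
        self.best = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score, model, model_path):
        score = score if self.mode == "max" else -score  # normalize to "higher is better"
        if self.best is None or score > self.best:
            self.best = score
            self.counter = 0
            torch.save(model.state_dict(), model_path)  # keep the best weights
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True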
def __init__( self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module ): logger.info("Initializing Distiller") self.params = params self.dump_path = params.dump_path self.multi_gpu = params.multi_gpu self.fp16 = params.fp16 self.student = student self.teacher = teacher self.student_config = student.config self.vocab_size = student.config.vocab_size if params.n_gpu <= 1: sampler = RandomSampler(dataset) else: sampler = DistributedSampler(dataset) if params.group_by_size: groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size) sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size) else: sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False) self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences) self.temperature = params.temperature assert self.temperature > 0.0 self.alpha_ce = params.alpha_ce self.alpha_mlm = params.alpha_mlm self.alpha_clm = params.alpha_clm self.alpha_mse = params.alpha_mse self.alpha_cos = params.alpha_cos self.mlm = params.mlm if self.mlm: logger.info("Using MLM loss for LM step.") self.mlm_mask_prop = params.mlm_mask_prop assert 0.0 <= self.mlm_mask_prop <= 1.0 assert params.word_mask + params.word_keep + params.word_rand == 1.0 self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand]) self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs if self.fp16: self.pred_probs = self.pred_probs.half() self.token_probs = self.token_probs.half() else: logger.info("Using CLM loss for LM step.") self.epoch = 0 self.n_iter = 0 self.n_total_iter = 0 self.n_sequences_epoch = 0 self.total_loss_epoch = 0 self.last_loss = 0 self.last_loss_ce = 0 self.last_loss_mlm = 0 self.last_loss_clm = 0 if self.alpha_mse > 0.0: self.last_loss_mse = 0 if self.alpha_cos > 0.0: self.last_loss_cos = 0 self.last_log = 0 self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean") self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100) if self.alpha_mse > 0.0: self.mse_loss_fct = nn.MSELoss(reduction="sum") if self.alpha_cos > 0.0: self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean") logger.info("--- Initializing model optimizer") assert params.gradient_accumulation_steps >= 1 self.num_steps_epoch = len(self.dataloader) num_train_optimization_steps = ( int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1 ) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad ], "weight_decay": params.weight_decay, }, { "params": [ p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad ], "weight_decay": 0.0, }, ] logger.info( "------ Number of trainable parameters (student): %i" % sum([p.numel() for p in self.student.parameters() if p.requires_grad]) ) logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()])) self.optimizer = AdamW( optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98) ) warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop) self.scheduler = get_linear_schedule_with_warmup( self.optimizer, 
num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps ) if self.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level") self.student, self.optimizer = amp.initialize( self.student, self.optimizer, opt_level=self.params.fp16_opt_level ) self.teacher = self.teacher.half() if self.multi_gpu: if self.fp16: from apex.parallel import DistributedDataParallel logger.info("Using apex.parallel.DistributedDataParallel for distributed training.") self.student = DistributedDataParallel(self.student) else: from torch.nn.parallel import DistributedDataParallel logger.info("Using nn.parallel.DistributedDataParallel for distributed training.") self.student = DistributedDataParallel( self.student, device_ids=[params.local_rank], output_device=params.local_rank, find_unused_parameters=True, ) self.is_master = params.is_master if self.is_master: logger.info("--- Initializing Tensorboard") self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train")) self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0) self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)
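# --- Added illustration (not the Distiller's exact code) ------------------------
# A hedged sketch of the temperature-scaled distillation objective that
# ce_loss_fct (KLDivLoss) is used for: soften both distributions with the
# temperature, take the KL divergence, and rescale by T^2 so gradients keep a
# comparable magnitude (Hinton et al., 2015). Logits here are random stand-ins.
import torch
import torch.nn as nn

temperature = 2.0
student_logits = torch.randn(4, 10, requires_grad=True)
teacher_logits = torch.randn(4, 10)

ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
loss_ce = ce_loss_fct(
    torch.log_softmax(student_logits / temperature, dim=-1),
    torch.softmax(teacher_logits / temperature, dim=-1),
) * temperature ** 2
loss_ce.backward()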
def training(tokenizer, content, n_splits, fold, train_data_loader, val_data_loader, model_type, model_name, hidden_layers, optimizer_name, lr_scheduler_name, lr, warmup_proportions, batch_size, valid_batch_size, num_epoch, start_epoch, accumlation_steps, checkpoint_folder, load_pretrain, seed, loss, extra_token, augment, early_stopping): torch.cuda.empty_cache() strng = "@%s: \n" % os.path.basename(__file__) strng += "\tset random seed = %d \n" % seed strng += "\t cuda environment: \n" strng += "\t torch version is %s \t torch.version.cuda is %s \t torch.backends.cudnn.version() = %s \n" % ( torch.__version__, torch.version.cuda, torch.backends.cudnn.version()) strng += "\t torch.cuda.device_count() is %s \n" % ( torch.cuda.device_count()) if augment: if extra_token: checkpoint_folder = os.path.join( checkpoint_folder, model_type + '/' + model_name + '-' + content + '-' + loss + '-' + optimizer_name + '-' + lr_scheduler_name + '-' + str(n_splits) + '-' + str(seed) + '-' + 'aug_differential_extra_token/') else: checkpoint_folder = os.path.join( checkpoint_folder, model_type + '/' + model_name + '-' + content + '-' + loss + '-' + optimizer_name + '-' + lr_scheduler_name + '-' + str(n_splits) + '-' + str(seed) + '-' + 'aug_differential/') else: if extra_token: checkpoint_folder = os.path.join( checkpoint_folder, model_type + '/' + model_name + '-' + content + '-' + loss + '-' + optimizer_name + '-' + lr_scheduler_name + '-' + str(n_splits) + '-' + str(seed) + '-' + 'extra_token/') else: checkpoint_folder = os.path.join( checkpoint_folder, model_type + '/' + model_name + '-' + content + '-' + loss + '-' + optimizer_name + '-' + lr_scheduler_name + '-' + str(n_splits) + '-' + str(seed) + '-' + '/') checkpoint_filename = 'fold_' + str(fold) + "_checkpoint.pth" checkpoint_filepath = os.path.join(checkpoint_folder, checkpoint_filename) os.makedirs(checkpoint_folder, exist_ok=True) log = Logger() log.open(os.path.join(checkpoint_folder, 'fold_' + str(fold) + '_train_log.txt'), mode='a+') log.write('\t%s\n' % strng) log.write("\t seed = %s, fold = %s, __file__ = %s, out_dir = %s" % (seed, fold, __file__, checkpoint_folder)) def load(model, pretrain_file, skip=[]): pretrain_dict = torch.load(pretrain_file) state_dict = model.state_dict() for key in state_dict.keys(): if any(s in key for s in skip): continue else: state_dict[key] = pretrain_dict[key] model.load_state_dict(state_dict, strict=False) return model if content == "Question_Answer": num_class = 30 elif content == "Question": num_class = 21 elif content == "Answer": num_class = 9 if model_type == "bert": if extra_token: model = QuestNet(model_type=model_name, tokenizer=tokenizer, n_classes=num_class, n_category_classes=num_category_class, n_host_classes=num_host_class, hidden_layers=hidden_layers, extra_token=True) else: model = QuestNet(model_type=model_name, tokenizer=tokenizer, n_classes=num_class, n_category_classes=num_category_class, n_host_classes=num_host_class, hidden_layers=hidden_layers, extra_token=False) elif model_type == "xlnet": if extra_token: model = QuestNet(model_type=model_name, tokenizer=tokenizer, n_classes=num_class, n_category_classes=num_category_class, n_host_classes=num_host_class, hidden_layers=hidden_layers, extra_token=True) else: model = QuestNet(model_type=model_name, tokenizer=tokenizer, n_classes=num_class, n_category_classes=num_category_class, n_host_classes=num_host_class, hidden_layers=hidden_layers, extra_token=False) else: raise NotImplementedError model = model.cuda() if load_pretrain: if content == 
"Answer": model = load(model, checkpoint_filepath, skip=['fc.weight', 'fc.bias']) else: model = load(model, checkpoint_filepath) if model_name == "t5-base": weight_decay = 0.9 else: weight_decay = 0.01 if (model_type == 'bert') or (model_type == 'xlnet'): optimizer_grouped_parameters = [] list_lr = [] if (model_name == 'bert-base-uncased') or (model_name == 'bert-base-cased'): list_layers = [ model.bert_model.embeddings, model.bert_model.encoder.layer[0], model.bert_model.encoder.layer[1], model.bert_model.encoder.layer[2], model.bert_model.encoder.layer[3], model.bert_model.encoder.layer[4], model.bert_model.encoder.layer[5], model.bert_model.encoder.layer[6], model.bert_model.encoder.layer[7], model.bert_model.encoder.layer[8], model.bert_model.encoder.layer[9], model.bert_model.encoder.layer[10], model.bert_model.encoder.layer[11], model.fc_1, model.fc ] elif (model_name == 'bert-large-uncased'): list_layers = [ model.bert_model.embeddings, model.bert_model.encoder.layer[0], model.bert_model.encoder.layer[1], model.bert_model.encoder.layer[2], model.bert_model.encoder.layer[3], model.bert_model.encoder.layer[4], model.bert_model.encoder.layer[5], model.bert_model.encoder.layer[6], model.bert_model.encoder.layer[7], model.bert_model.encoder.layer[8], model.bert_model.encoder.layer[9], model.bert_model.encoder.layer[10], model.bert_model.encoder.layer[11], model.bert_model.encoder.layer[12], model.bert_model.encoder.layer[13], model.bert_model.encoder.layer[14], model.bert_model.encoder.layer[15], model.bert_model.encoder.layer[16], model.bert_model.encoder.layer[17], model.bert_model.encoder.layer[18], model.bert_model.encoder.layer[19], model.bert_model.encoder.layer[20], model.bert_model.encoder.layer[21], model.bert_model.encoder.layer[22], model.bert_model.encoder.layer[23], model.fc_1, model.fc ] elif (model_name == "xlnet-base-cased"): list_layers = [ model.xlnet_model.word_embedding, model.xlnet_model.layer[0], model.xlnet_model.layer[1], model.xlnet_model.layer[2], model.xlnet_model.layer[3], model.xlnet_model.layer[4], model.xlnet_model.layer[5], model.xlnet_model.layer[6], model.xlnet_model.layer[7], model.xlnet_model.layer[8], model.xlnet_model.layer[9], model.xlnet_model.layer[10], model.xlnet_model.layer[11], model.fc_1, model.fc ] elif (model_name == "roberta-base"): list_layers = [ model.roberta_model.embeddings, model.roberta_model.encoder.layer[0], model.roberta_model.encoder.layer[1], model.roberta_model.encoder.layer[2], model.roberta_model.encoder.layer[3], model.roberta_model.encoder.layer[4], model.roberta_model.encoder.layer[5], model.roberta_model.encoder.layer[6], model.roberta_model.encoder.layer[7], model.roberta_model.encoder.layer[8], model.roberta_model.encoder.layer[9], model.roberta_model.encoder.layer[10], model.roberta_model.encoder.layer[11], model.fc_1, model.fc ] elif (model_name == "gpt2"): list_layers = [ # model.gpt2_model.wte, # model.gpt2_model.wpe, model.gpt2_model.h[0], model.gpt2_model.h[1], model.gpt2_model.h[2], model.gpt2_model.h[3], model.gpt2_model.h[4], model.gpt2_model.h[5], model.gpt2_model.h[6], model.gpt2_model.h[7], model.gpt2_model.h[8], model.gpt2_model.h[9], model.gpt2_model.h[10], model.gpt2_model.h[11], model.fc_1, model.fc ] else: raise NotImplementedError ######## Differential LR and optimizer group ############################################################ if model_name == "": for layer in list_layers: list_lr.append(lr) lr = lr * decay_factor list_lr.reverse() else: mult = lr / min_lr step = mult**(1 / (len(list_layers) - 
1)) list_lr = [lr * (step**i) for i in range(len(list_layers))] no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias'] for i in range(len(list_lr)): if isinstance(list_lr[i], list): for list_layer in list_layers[i]: layer_parameters = list(list_layer.named_parameters()) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if not any(nd in n for nd in no_decay) ], 'lr': list_lr[i], 'weight_decay': weight_decay }) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if any(nd in n for nd in no_decay) ], 'lr': list_lr[i], 'weight_decay': 0.0 }) else: layer_parameters = list(list_layers[i].named_parameters()) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if not any(nd in n for nd in no_decay) ], 'lr': list_lr[i], 'weight_decay': weight_decay }) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if any(nd in n for nd in no_decay) ], 'lr': list_lr[i], 'weight_decay': 0.0 }) if extra_token: layer_parameters = list(model.fc_1_category.named_parameters()) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if not any(nd in n for nd in no_decay) ], 'lr': 1e-6, 'weight_decay': weight_decay }) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if any(nd in n for nd in no_decay) ], 'lr': 1e-6, 'weight_decay': 0.0 }) layer_parameters = list(model.fc_1_host.named_parameters()) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if not any(nd in n for nd in no_decay) ], 'lr': 1e-6, 'weight_decay': weight_decay }) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if any(nd in n for nd in no_decay) ], 'lr': 1e-6, 'weight_decay': 0.0 }) layer_parameters = list(model.fc_category.named_parameters()) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if not any(nd in n for nd in no_decay) ], 'lr': 1e-6, 'weight_decay': weight_decay }) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if any(nd in n for nd in no_decay) ], 'lr': 1e-6, 'weight_decay': 0.0 }) layer_parameters = list(model.fc_host.named_parameters()) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if not any(nd in n for nd in no_decay) ], 'lr': 1e-6, 'weight_decay': weight_decay }) optimizer_grouped_parameters.append({ 'params': [ p for n, p in layer_parameters if any(nd in n for nd in no_decay) ], 'lr': 1e-6, 'weight_decay': 0.0 }) else: print("no extra token") else: raise NotImplementedError if optimizer_name == 'Adam': optimizer = torch.optim.Adam(optimizer_grouped_parameters) elif optimizer_name == 'Ranger': optimizer = Ranger(optimizer_grouped_parameters) elif optimizer_name == 'BertAdam': num_optimization_steps = num_epoch * len( train_data_loader) // accumlation_steps optimizer = BertAdam(optimizer_grouped_parameters, warmup=warmup_proportions, t_total=num_optimization_steps) elif optimizer_name == 'AdamW': optimizer = AdamW(optimizer_grouped_parameters, eps=4e-5) elif optimizer_name == 'FusedAdam': optimizer = FusedAdam(optimizer_grouped_parameters, bias_correction=False) else: raise NotImplementedError ######## LR shceduler ############################################################ if lr_scheduler_name == 'CosineAnealing': num_train_optimization_steps = num_epoch * len( train_data_loader) // accumlation_steps scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=int(warmup_proportions * 
num_train_optimization_steps), num_training_steps=num_train_optimization_steps) lr_scheduler_each_iter = False elif lr_scheduler_name == "WarmRestart": scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-6) lr_scheduler_each_iter = False elif lr_scheduler_name == "WarmupLinearSchedule": num_train_optimization_steps = num_epoch * len( train_data_loader) // accumlation_steps scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=int(warmup_proportions * num_train_optimization_steps), num_training_steps=num_train_optimization_steps) lr_scheduler_each_iter = True else: raise NotImplementedError log.write("\t model name: %s \n" % model_name) log.write("\t optimizer name: %s \n" % optimizer_name) log.write("\t scheduler name: %s \n" % lr_scheduler_name) # AMP -automatic mixed precision training for faster training # https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html model, optimizer = amp.initialize(model, optimizer, opt_level='O1') eval_step = len(train_data_loader) log_step = 50 eval_count = 0 count = 0 log.write('\t training starts here!!\n') log.write('\t batch size = %d, accumulation steps = %d \n' % (batch_size, accumlation_steps)) log.write('\t experiment : %s' % str(__file__.split('/')[-2:])) valid_loss = np.zeros(1, np.float32) train_loss = np.zeros(1, np.float32) valid_metric_optimal = -np.inf writer = SummaryWriter() # Define loss if loss == 'mse': criterion = MSELoss() elif loss == 'mse-bce': criterion = MSBCELoss() elif loss == 'focal': criterion = FocalLoss() elif loss == 'bce': if content == 'Question_Answer': weights = torch.tensor(np.array(unbalance_weight), dtype=torch.float64).cuda() elif content == 'Answer': weights = torch.tensor(np.array(a_unbalance_weight), dtype=torch.float64).cuda() elif content == 'Question': weights = torch.tensor(np.array(q_unbalance_weight), dtype=torch.float64).cuda() else: raise NotImplementedError criterion = nn.BCEWithLogitsLoss(weight=weights) criterion_extra = nn.BCEWithLogitsLoss() else: raise NotImplementedError for epoch in range(1, num_epoch + 1): labels_train = None pred_train = None labels_val = None pred_val = None checkpoint_filename_last_epoch = "fold_" + str( fold) + "_checkpoint_last_epoch.pth" checkpoint_filepath_last_epoch = os.path.join( checkpoint_folder, checkpoint_filename_last_epoch) torch.save(model.state_dict(), checkpoint_filepath_last_epoch) if (epoch > 1) and (not lr_scheduler_each_iter): scheduler.step() if epoch < start_epoch: continue log.write("\t epoch is %d and time is %s \n" % (epoch, time.strftime("%H:%M:%S", time.gmtime(time.time())))) prev_time = time.time() sum_train_loss = np.zeros_like(train_loss) sum_train = np.zeros_like(train_loss) torch.cuda.empty_cache() model.zero_grad() if extra_token: for tr_batch_i, (token_ids, seg_ids, labels, labels_category, labels_host) in enumerate(train_data_loader): rate = 0 for param_group in optimizer.param_groups: rate += param_group['lr'] / len(optimizer.param_groups) model.train() token_ids = token_ids.cuda() seg_ids = seg_ids.cuda() labels = labels.cuda().float() labels_category = labels_category.cuda().float() labels_host = labels_host.cuda().float() prediction, prediction_category, prediction_host = model( token_ids, seg_ids) loss = auxiliary_weights[0] * criterion( prediction, labels) + auxiliary_weights[1] * criterion_extra( prediction_category, labels_category ) + auxiliary_weights[2] * criterion_extra( prediction_host, labels_host) with amp.scale_loss(loss / accumlation_steps, optimizer) as 
scaled_loss: scaled_loss.backward() if ((tr_batch_i + 1) % accumlation_steps == 0): torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=5.0, norm_type=2) optimizer.step() model.zero_grad() if lr_scheduler_each_iter: scheduler.step() #Write to tensorboard summary writer writer.add_scalar( "train_loss_" + str(fold), loss.item(), (epoch - 1) * len(train_data_loader) * batch_size + tr_batch_i * batch_size) prediction = torch.sigmoid(prediction) if tr_batch_i == 0: pred_train = prediction.cpu().detach().numpy() labels_train = labels.cpu().detach().numpy() else: pred_train = np.concatenate( (pred_train, prediction.cpu().detach().numpy()), axis=0) labels_train = np.concatenate( (labels_train, labels.cpu().detach().numpy()), axis=0) l = np.array([loss.item() * batch_size]) n = np.array([batch_size]) sum_train_loss += l sum_train += n #log for training if (tr_batch_i + 1) % log_step == 0: train_loss = sum_train_loss / (sum_train + 1e-12) pred_train = np.nan_to_num(pred_train) sp = Spearman(labels_train, pred_train) elapsed_time = time.time() - prev_time prev_time = time.time() log.write( "\t Batch # %d \t perc processed in epoch: %f \t train_loss is %f \t lr is %f \t spearman is %f \t elapsed time: %d\n" % ((tr_batch_i + 1), ((tr_batch_i + 1) / len(train_data_loader)), train_loss[0], rate, sp, elapsed_time)) if (tr_batch_i + 1) % eval_step == 0: eval_count += 1 valid_loss = np.zeros(1, np.float32) valid_num = np.zeros_like(valid_loss) with torch.no_grad(): torch.cuda.empty_cache() for val_batch_i, ( token_ids, seg_ids, labels, labels_category, labels_host) in enumerate(val_data_loader): model.eval() token_ids = token_ids.cuda() seg_ids = seg_ids.cuda() labels = labels.cuda().float() labels_category = labels_category.cuda().float() labels_host = labels_host.cuda().float() prediction, prediction_category, prediction_host = model( token_ids, seg_ids) val_loss = auxiliary_weights[0] * criterion( prediction, labels ) + auxiliary_weights[1] * criterion_extra( prediction_category, labels_category ) + auxiliary_weights[2] * criterion_extra( prediction_host, labels_host) writer.add_scalar( "val_loss_" + str(fold), val_loss.item(), (eval_count - 1) * len(val_data_loader) * valid_batch_size + val_batch_i * valid_batch_size) prediction = torch.sigmoid(prediction) if val_batch_i == 0: pred_val = prediction.cpu().detach().numpy() labels_val = labels.cpu().detach().numpy() else: pred_val = np.concatenate( (pred_val, prediction.cpu().detach().numpy()), axis=0) labels_val = np.concatenate( (labels_val, labels.cpu().detach().numpy()), axis=0) l = np.array([val_loss.item() * valid_batch_size]) n = np.array([valid_batch_size]) valid_loss += l valid_num += n valid_loss = valid_loss / (valid_num + 1e-12) pred_val = np.nan_to_num(pred_val) sp = Spearman(labels_val, pred_val) log.write( "\t Batch # %d perc processed in epoch: %f Validation loss is %f \t spearman is %f \n" % (val_batch_i, (val_batch_i / len(val_data_loader)), valid_loss[0], sp)) else: for tr_batch_i, (token_ids, seg_ids, labels) in enumerate(train_data_loader): rate = 0 for param_group in optimizer.param_groups: rate += param_group['lr'] / len(optimizer.param_groups) model.train() token_ids = token_ids.cuda() seg_ids = seg_ids.cuda() labels = labels.cuda().float() prediction = model(token_ids, seg_ids) loss = criterion(prediction, labels) with amp.scale_loss(loss / accumlation_steps, optimizer) as scaled_loss: scaled_loss.backward() if ((tr_batch_i + 1) % accumlation_steps == 0): torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=5.0, 
norm_type=2) optimizer.step() model.zero_grad() if lr_scheduler_each_iter: scheduler.step() #Write to tensorboard summary writer writer.add_scalar( "train_loss_" + str(fold), loss.item(), (epoch - 1) * len(train_data_loader) * batch_size + tr_batch_i * batch_size) prediction = torch.sigmoid(prediction) if tr_batch_i == 0: pred_train = prediction.cpu().detach().numpy() labels_train = labels.cpu().detach().numpy() else: pred_train = np.concatenate( (pred_train, prediction.cpu().detach().numpy()), axis=0) labels_train = np.concatenate( (labels_train, labels.cpu().detach().numpy()), axis=0) l = np.array([loss.item() * batch_size]) n = np.array([batch_size]) sum_train_loss += l sum_train += n #log for training if (tr_batch_i + 1) % log_step == 0: train_loss = sum_train_loss / (sum_train + 1e-12) pred_train = np.nan_to_num(pred_train) sp = Spearman(labels_train, pred_train) elapsed_time = time.time() - prev_time prev_time = time.time() log.write( "\t Batch # %d \t perc processed in epoch: %f \t train_loss is %f \t lr is %f \t spearman is %f \t elapsed time: %d\n" % ((tr_batch_i + 1), ((tr_batch_i + 1) / len(train_data_loader)), train_loss[0], rate, sp, elapsed_time)) if (tr_batch_i + 1) % eval_step == 0: eval_count += 1 valid_loss = np.zeros(1, np.float32) valid_num = np.zeros_like(valid_loss) with torch.no_grad(): torch.cuda.empty_cache() for val_batch_i, ( token_ids, seg_ids, labels) in enumerate(val_data_loader): model.eval() token_ids = token_ids.cuda() seg_ids = seg_ids.cuda() labels = labels.cuda().float() prediction = model(token_ids, seg_ids) val_loss = criterion(prediction, labels) writer.add_scalar( "val_loss_" + str(fold), val_loss.item(), (eval_count - 1) * len(val_data_loader) * valid_batch_size + val_batch_i * valid_batch_size) prediction = torch.sigmoid(prediction) if val_batch_i == 0: pred_val = prediction.cpu().detach().numpy() labels_val = labels.cpu().detach().numpy() else: pred_val = np.concatenate( (pred_val, prediction.cpu().detach().numpy()), axis=0) labels_val = np.concatenate( (labels_val, labels.cpu().detach().numpy()), axis=0) l = np.array([val_loss.item() * valid_batch_size]) n = np.array([valid_batch_size]) valid_loss += l valid_num += n valid_loss = valid_loss / (valid_num + 1e-12) pred_val = np.nan_to_num(pred_val) sp = Spearman(labels_val, pred_val) log.write( "\t Batch # %d perc processed in epoch: %f \t Validation loss is %f \t spearman is %f \n" % (val_batch_i, (val_batch_i / len(val_data_loader)), valid_loss[0], sp)) val_metric_epoch = sp log.write('Validation metric {:.6f}. Saving model ...'.format( val_metric_epoch)) checkpoint_filename_swa = "fold_" + str(fold) + "_checkpoint_swa.pth" checkpoint_filepath_swa = os.path.join(checkpoint_folder, checkpoint_filename_swa) state_dict_last_epoch = torch.load(checkpoint_filepath_last_epoch) state_dict = model.state_dict() for name, val in state_dict.items(): state_dict[name].data.copy_( (val.data + epoch * state_dict_last_epoch[name].data) / (epoch + 1)) model.load_state_dict(state_dict) torch.save(model.state_dict(), checkpoint_filepath_swa)
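# --- Added illustration (not part of the original code) ------------------------
# The epoch loop above ends with a running average over epoch checkpoints:
# new = (current + epoch * previous) / (epoch + 1), a cumulative mean in the
# spirit of stochastic weight averaging. A compact stand-alone version of that
# update (the tiny model and names are illustrative only):
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
swa_state = {k: v.clone() for k, v in model.state_dict().items()}  # initial snapshot

for epoch in range(1, 4):
    # ... one epoch of training would update `model` here ...
    current = model.state_dict()
    for name in swa_state:
        # running mean: (epoch * previous_average + current) / (epoch + 1)
        swa_state[name].mul_(epoch).add_(current[name]).div_(epoch + 1)

model.load_state_dict(swa_state)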
model.parameters(), lr=5e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 eps=1e-8 # args.adam_epsilon - default is 1e-8. ) from transformers import get_linear_schedule_with_warmup # Number of training epochs (authors recommend between 2 and 4) epochs = 3 # Total number of training steps is number of batches * number of epochs. total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, # Default value in run_glue.py num_training_steps=total_steps) """## 4.3. Training Loop""" import numpy as np from sklearn.metrics import f1_score, matthews_corrcoef # Function to calculate the accuracy of our predictions vs labels def flat_accuracy(preds, labels): pred_flat = np.argmax(preds, axis=1).flatten() labels_flat = labels.flatten() return np.sum(pred_flat == labels_flat) / len(labels_flat)
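# --- Added usage note (not part of the original code) ---------------------------
# flat_accuracy argmaxes the logits over the class axis and compares them with
# the flattened gold labels; with the toy inputs below it returns 2/3.
import numpy as np

preds = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])  # fake logits, batch of 3
labels = np.array([1, 0, 0])
pred_flat = np.argmax(preds, axis=1).flatten()  # -> [1, 0, 1]
print(np.sum(pred_flat == labels.flatten()) / len(labels))  # 0.666...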
def main(): #load cola dataset for our language model finetuning sentences2 = [] labels2 = [] with open("./cola_public/raw/in_domain_train.tsv") as tsvfile2: tsvreader2 = csv.reader(tsvfile2, delimiter="\t") for line in tsvreader2: sentences2 += [line[3]] labels2 += [int(line[1])] #load encoder tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') # quick pass to find the maximum sentence length; we will aim a bit higher than that to be safe. max_len2 = 0 # For every sentence... for sent in sentences2: # Tokenize the text and add `[CLS]` and `[SEP]` tokens. input_ids2 = tokenizer.encode(sent, add_special_tokens=True) # Update the maximum sentence length. max_len2 = max(max_len2, len(input_ids2)) print('Max sentence length: ', max_len2) input_ids2 = [] for sent in sentences2: encoded_dict2 = tokenizer.encode( sent, # Sentence to encode. add_special_tokens=True, # Add '[CLS]' and '[SEP]' max_length=64, # Pad & truncate all sentences. truncation=True, pad_to_max_length=True, return_tensors='pt', # Return pytorch tensors. ) # Add the encoded sentence to the list. input_ids2.append(encoded_dict2) # Convert the lists into tensors. input_ids2 = torch.cat(input_ids2, dim=0) labels2 = torch.tensor(labels2) # Print sentence 0, now as a list of IDs. print('Original: ', sentences2[0]) print('Token IDs:', input_ids2[0]) # Combine the training inputs into a TensorDataset. dataset2 = TensorDataset(input_ids2, labels2) # Create a 90-10 train-validation split. # Calculate the number of samples to include in each set. train_size2 = int(0.9 * len(dataset2)) val_size2 = len(dataset2) - train_size2 # Divide the dataset by randomly selecting samples. train_dataset2, val_dataset2 = random_split(dataset2, [train_size2, val_size2]) # The DataLoader needs to know our batch size for training, so we specify it # here. For fine-tuning BERT on a specific task, the authors recommend a batch #size of 16 or 32. batch_size = 32 # Create the DataLoaders for our training and validation sets. # We'll take training samples in random order. train_dataloader2 = DataLoader( train_dataset2, # The training samples. sampler=RandomSampler(train_dataset2), # Select batches randomly batch_size=batch_size # Trains with this batch size. ) # For validation the order doesn't matter, so we'll just read them sequentially. validation_dataloader2 = DataLoader( val_dataset2, # The validation samples. sampler=SequentialSampler( val_dataset2), # Pull out batches sequentially. batch_size=batch_size # Evaluate with this batch size. ) # Load DistilBertForSequenceClassification, the pretrained DistilBERT model with a # single linear classification layer on top. modd = DistilBertForSequenceClassification.from_pretrained( "./my_pretrained_distil", # our locally pretrained DistilBERT checkpoint. num_labels= 2, # The number of output labels--2 for binary classification. # You can increase this for multi-class tasks. output_attentions=False, # Whether the model returns attention weights. output_hidden_states= False, # Whether the model returns all hidden-states. ) # If there's a GPU available... if torch.cuda.is_available(): # Tell pytorch to run this model on the GPU.
modd.cuda() # see https://mccormickml.com/2019/07/22/BERT-fine-tuning/ from transformers import AdamW # Note: AdamW is a class from the huggingface library (as opposed to pytorch) # I believe the 'W' stands for 'Weight Decay fix" optimizer2 = AdamW( modd.parameters(), lr=2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 eps=1e-8 # args.adam_epsilon - default is 1e-8. ) from transformers import get_linear_schedule_with_warmup # Number of training epochs. The BERT authors recommend between 2 and 4. # We chose to run for 4, but we'll see later that this may be over-fitting the # training data. epochs = 4 # Total number of training steps is [number of batches] x [number of epochs]. # (Note that this is not the same as the number of training samples). total_steps = len(train_dataloader2) * epochs # Create the learning rate scheduler. scheduler2 = get_linear_schedule_with_warmup( optimizer2, num_warmup_steps=0, # Default value in run_glue.py num_training_steps=total_steps) # If there's a GPU available... if torch.cuda.is_available(): # Tell PyTorch to use the GPU. device = torch.device("cuda") print('There are %d GPU(s) available.' % torch.cuda.device_count()) print('We will use the GPU:', torch.cuda.get_device_name(0)) # If not... else: print('No GPU available, using the CPU instead.') device = torch.device("cpu") #second dataset: stanford treebank for our sentiment classification model (which is to be attacked) #extract data sentences = [] labels = [] with open("./glue_data/SST-2/dev.tsv") as tsvfile: tsvreader = csv.reader(tsvfile, delimiter="\t") for i, line in enumerate(tsvreader): if i > 0: sentences += [line[0]] labels += [int(line[1])] #what is sentence maximum lenght? max_len = 0 for sent in sentences: # Tokenize the text and add `[CLS]` and `[SEP]` tokens. input_ids = tokenizer.encode(sent, add_special_tokens=True) # Update the maximum sentence length. max_len = max(max_len, len(input_ids)) print('Max sentence length: ', max_len) #encode sentences: input_ids = [] for sent in sentences: encoded_dict = tokenizer.encode( sent, # Sentence to encode. add_special_tokens=True, # Add '[CLS]' and '[SEP]' max_length=64, # Pad & truncate all sentences. truncation=True, pad_to_max_length=True, return_tensors='pt', # Return pytorch tensors. ) # Add the encoded sentence to the list. input_ids.append(encoded_dict) # Convert the lists into tensors. input_ids = torch.cat(input_ids, dim=0) labels = torch.tensor(labels) # Print sentence 0, now as a list of IDs. print('Original: ', sentences[0]) print('Token IDs:', input_ids[0]) # Combine the training inputs into a TensorDataset. dataset = TensorDataset(input_ids, labels) # Create a 90-10 train-validation split: # Calculate the number of samples to include in each set. train_size = int(0.9 * len(dataset)) val_size = len(dataset) - train_size # Divide the dataset by randomly selecting samples. train_dataset, val_dataset = random_split(dataset, [train_size, val_size]) # The DataLoader needs to know our batch size for training, so we specify it # here. For fine-tuning BERT on a specific task, the authors recommend a batch # size of 16 or 32. batch_size = 32 # Create the DataLoaders for our training and validation sets. # We'll take training samples in random order. train_dataloader = DataLoader( train_dataset, # The training samples. sampler=RandomSampler(train_dataset), # Select batches randomly batch_size=batch_size # Trains with this batch size. ) # For validation the order doesn't matter, so we'll just read them sequentially. 
validation_dataloader = DataLoader( val_dataset, # The validation samples. sampler=SequentialSampler( val_dataset), # Pull out batches sequentially. batch_size=batch_size # Evaluate with this batch size. ) #create model to be finetuned: # Load BertForSequenceClassification, the pretrained BERT model with a single # linear classification layer on top. model = DistilBertForSequenceClassification.from_pretrained( "./my_pretrained_distil", # Use the 12-layer BERT model, with an uncased vocab. num_labels= 2, # The number of output labels--2 for binary classification. # You can increase this for multi-class tasks. output_attentions=False, # Whether the model returns attentions weights. output_hidden_states= False, # Whether the model returns all hidden-states. ) # If there's a GPU available... if torch.cuda.is_available(): # Tell pytorch to run this model on the GPU. model.cuda() # Note: AdamW is a class from the huggingface library (as opposed to pytorch) # I believe the 'W' stands for 'Weight Decay fix" optimizer = AdamW( model.parameters(), lr=2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 eps=1e-8 # args.adam_epsilon - default is 1e-8. ) # Number of training epochs. The BERT authors recommend between 2 and 4. # We chose to run for 4, but we'll see later that this may be over-fitting the # training data. epochs = 4 # Total number of training steps is [number of batches] x [number of epochs]. # (Note that this is not the same as the number of training samples). total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, # Default value in run_glue.py num_training_steps=total_steps) #the two trainings import random import numpy as np # This training code is based on the `run_glue.py` script here: # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128 # Set the seed value all over the place to make this reproducible. seed_val = 42 random.seed(seed_val) np.random.seed(seed_val) torch.manual_seed(seed_val) torch.cuda.manual_seed_all(seed_val) # We'll store a number of quantities such as training and validation loss, # validation accuracy, and timings. training_stats = [] # Measure the total training time for the whole run. total_t0 = time.time() # For each epoch... for epoch_i in range(0, epochs): # ======================================== # Training # ======================================== # Perform one full pass over the training set. print("") print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) print('Training...') # Measure how long the training epoch takes. t0 = time.time() # Reset the total loss for this epoch. total_train_loss = 0 # Put the model into training mode. Don't be mislead--the call to # `train` just changes the *mode*, it doesn't *perform* the training. # `dropout` and `batchnorm` layers behave differently during training # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch) modd.train() # For each batch of training data... for step, batch in enumerate(train_dataloader2): # Progress update every 40 batches. if step % 40 == 0 and not step == 0: # Calculate elapsed time in minutes. elapsed = format_time(time.time() - t0) # Report progress. print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format( step, len(train_dataloader2), elapsed)) # Unpack this training batch from our dataloader. 
# # As we unpack the batch, we'll also copy each tensor to the GPU using the # `to` method. # # `batch` contains two pytorch tensors: # [0]: input ids # [1]: labels b_input_ids = batch[0].to(device) b_labels = batch[1].to(device) # Always clear any previously calculated gradients before performing a # backward pass. PyTorch doesn't do this automatically because # accumulating the gradients is "convenient while training RNNs". # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) modd.zero_grad() # Perform a forward pass (evaluate the model on this training batch). # The documentation for this `model` function is here: # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification # It returns different numbers of parameters depending on what arguments # are given and what flags are set. For our usage here, it returns # the loss (because we provided labels) and the "logits"--the model # outputs prior to activation. loss, logits = modd(b_input_ids, labels=b_labels) # Accumulate the training loss over all of the batches so that we can # calculate the average loss at the end. `loss` is a Tensor containing a # single value; the `.item()` function just returns the Python value # from the tensor. total_train_loss += loss.item() # Perform a backward pass to calculate the gradients. loss.backward() # Clip the norm of the gradients to 1.0. # This is to help prevent the "exploding gradients" problem. torch.nn.utils.clip_grad_norm_(modd.parameters(), 1.0) # Update parameters and take a step using the computed gradient. # The optimizer dictates the "update rule"--how the parameters are # modified based on their gradients, the learning rate, etc. optimizer2.step() # Update the learning rate. scheduler2.step() # Calculate the average loss over all of the batches. avg_train_loss = total_train_loss / len(train_dataloader2) # Measure how long this epoch took. training_time = format_time(time.time() - t0) print("") print(" Average training loss: {0:.2f}".format(avg_train_loss)) print(" Training epoch took: {:}".format(training_time)) # ======================================== # Validation # ======================================== # After the completion of each training epoch, measure our performance on # our validation set. print("") print("Running Validation...") t0 = time.time() # Put the model in evaluation mode--the dropout layers behave differently # during evaluation. modd.eval() # Tracking variables total_eval_accuracy = 0 total_eval_loss = 0 nb_eval_steps = 0 # Evaluate data for one epoch for batch in validation_dataloader2: # Unpack this training batch from our dataloader. # # As we unpack the batch, we'll also copy each tensor to the GPU using # the `to` method. # # `batch` contains two pytorch tensors: # [0]: input ids # [1]: labels b_input_ids = batch[0].to(device) b_labels = batch[1].to(device) # Tell pytorch not to bother with constructing the compute graph during # the forward pass, since this is only needed for backprop (training). with torch.no_grad(): # Forward pass, calculate logit predictions. # token_type_ids is the same as the "segment ids", which # differentiates sentence 1 and 2 in 2-sentence tasks. # The documentation for this `model` function is here: # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification # Get the "logits" output by the model.
The "logits" are the output # values prior to applying an activation function like the softmax. (loss, logits) = modd(b_input_ids, labels=b_labels) # Accumulate the validation loss. total_eval_loss += loss.item() # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() # Calculate the accuracy for this batch of test sentences, and # accumulate it over all batches. total_eval_accuracy += flat_accuracy(logits, label_ids) # Report the final accuracy for this validation run. avg_val_accuracy = total_eval_accuracy / len(validation_dataloader2) print(" Accuracy: {0:.2f}".format(avg_val_accuracy)) # Calculate the average loss over all of the batches. avg_val_loss = total_eval_loss / len(validation_dataloader2) # Measure how long the validation run took. validation_time = format_time(time.time() - t0) print(" Validation Loss: {0:.2f}".format(avg_val_loss)) print(" Validation took: {:}".format(validation_time)) # Record all statistics from this epoch. training_stats.append({ 'epoch': epoch_i + 1, 'Training Loss': avg_train_loss, 'Valid. Loss': avg_val_loss, 'Valid. Accur.': avg_val_accuracy, 'Training Time': training_time, 'Validation Time': validation_time }) print("") print("Training complete!") print("Total training took {:} (h:mm:ss)".format( format_time(time.time() - total_t0))) #save model torch.save(modd.state_dict(), "distil_languagemodel_finetuned.pt") # This training code is based on the `run_glue.py` script here: # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128 # Set the seed value all over the place to make this reproducible. seed_val = 42 random.seed(seed_val) np.random.seed(seed_val) torch.manual_seed(seed_val) torch.cuda.manual_seed_all(seed_val) # We'll store a number of quantities such as training and validation loss, # validation accuracy, and timings. training_stats = [] # Measure the total training time for the whole run. total_t0 = time.time() # For each epoch... for epoch_i in range(0, epochs): # ======================================== # Training # ======================================== # Perform one full pass over the training set. print("") print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) print('Training...') # Measure how long the training epoch takes. t0 = time.time() # Reset the total loss for this epoch. total_train_loss = 0 # Put the model into training mode. Don't be misled--the call to # `train` just changes the *mode*, it doesn't *perform* the training. # `dropout` and `batchnorm` layers behave differently during training # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch) model.train() # For each batch of training data... for step, batch in enumerate(train_dataloader): # Progress update every 40 batches. if step % 40 == 0 and not step == 0: # Calculate elapsed time in minutes. elapsed = format_time(time.time() - t0) # Report progress. print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format( step, len(train_dataloader), elapsed)) # Unpack this training batch from our dataloader. # # As we unpack the batch, we'll also copy each tensor to the GPU using the # `to` method. # # `batch` contains two pytorch tensors: # [0]: input ids # [1]: labels b_input_ids = batch[0].to(device) b_labels = batch[1].to(device) # Always clear any previously calculated gradients before performing a # backward pass.
PyTorch doesn't do this automatically because # accumulating the gradients is "convenient while training RNNs". # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) model.zero_grad() # Perform a forward pass (evaluate the model on this training batch). # The documentation for this `model` function is here: # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification # It returns different numbers of parameters depending on what arguments # are given and what flags are set. For our usage here, it returns # the loss (because we provided labels) and the "logits"--the model # outputs prior to activation. loss, logits = model(b_input_ids, labels=b_labels) # Accumulate the training loss over all of the batches so that we can # calculate the average loss at the end. `loss` is a Tensor containing a # single value; the `.item()` function just returns the Python value # from the tensor. total_train_loss += loss.item() # Perform a backward pass to calculate the gradients. loss.backward() # Clip the norm of the gradients to 1.0. # This is to help prevent the "exploding gradients" problem. torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Update parameters and take a step using the computed gradient. # The optimizer dictates the "update rule"--how the parameters are # modified based on their gradients, the learning rate, etc. optimizer.step() # Update the learning rate. scheduler.step() # Calculate the average loss over all of the batches. avg_train_loss = total_train_loss / len(train_dataloader) # Measure how long this epoch took. training_time = format_time(time.time() - t0) print("") print(" Average training loss: {0:.2f}".format(avg_train_loss)) print(" Training epoch took: {:}".format(training_time)) # ======================================== # Validation # ======================================== # After the completion of each training epoch, measure our performance on # our validation set. print("") print("Running Validation...") t0 = time.time() # Put the model in evaluation mode--the dropout layers behave differently # during evaluation. model.eval() # Tracking variables total_eval_accuracy = 0 total_eval_loss = 0 nb_eval_steps = 0 # Evaluate data for one epoch for batch in validation_dataloader: # Unpack this training batch from our dataloader. # # As we unpack the batch, we'll also copy each tensor to the GPU using # the `to` method. # # `batch` contains two pytorch tensors: # [0]: input ids # [1]: labels b_input_ids = batch[0].to(device) b_labels = batch[1].to(device) # Tell pytorch not to bother with constructing the compute graph during # the forward pass, since this is only needed for backprop (training). with torch.no_grad(): # Forward pass, calculate logit predictions. # token_type_ids is the same as the "segment ids", which # differentiates sentence 1 and 2 in 2-sentence tasks. # The documentation for this `model` function is here: # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification # Get the "logits" output by the model. The "logits" are the output # values prior to applying an activation function like the softmax. (loss, logits) = model(b_input_ids, labels=b_labels) # Accumulate the validation loss.
total_eval_loss += loss.item() # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() # Calculate the accuracy for this batch of test sentences, and # accumulate it over all batches. total_eval_accuracy += flat_accuracy(logits, label_ids) # Report the final accuracy for this validation run. avg_val_accuracy = total_eval_accuracy / len(validation_dataloader) print(" Accuracy: {0:.2f}".format(avg_val_accuracy)) # Calculate the average loss over all of the batches. avg_val_loss = total_eval_loss / len(validation_dataloader) # Measure how long the validation run took. validation_time = format_time(time.time() - t0) print(" Validation Loss: {0:.2f}".format(avg_val_loss)) print(" Validation took: {:}".format(validation_time)) # Record all statistics from this epoch. training_stats.append({ 'epoch': epoch_i + 1, 'Training Loss': avg_train_loss, 'Valid. Loss': avg_val_loss, 'Valid. Accur.': avg_val_accuracy, 'Training Time': training_time, 'Validation Time': validation_time }) print("") print("Training complete!") print("Total training took {:} (h:mm:ss)".format( format_time(time.time() - total_t0))) #save model torch.save(model.state_dict(), "distil_finetuned.pt")
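# --- Added follow-up sketch (assumption, not in the original) -------------------
# The `training_stats` dicts collected by both loops are convenient to inspect
# as a pandas table; the numbers below are placeholders for illustration only.
import pandas as pd

training_stats = [
    {'epoch': 1, 'Training Loss': 0.50, 'Valid. Loss': 0.45, 'Valid. Accur.': 0.80},
    {'epoch': 2, 'Training Loss': 0.32, 'Valid. Loss': 0.41, 'Valid. Accur.': 0.83},
]
print(pd.DataFrame(training_stats).set_index('epoch'))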
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Pad each batch to the length of its longest example.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step