def configure_optimizers(self):
    # Prepare optimizer
    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.hparams.lr, correct_bias=False)
    # warm up lr
    num_workers = (self.hparams.gpus if self.hparams.gpus is not None else 1) * (
        self.hparams.num_nodes if self.hparams.num_nodes is not None else 1)
    data_len = len(self.train_dataloader().dataset)
    logging.info(f'number of workers {num_workers}, data length {data_len}')
    num_train_steps = int(data_len / (self.hparams.batch_size * num_workers)
                          * self.hparams.max_epochs)
    logging.info(f'num_train_steps : {num_train_steps}')
    num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
    logging.info(f'num_warmup_steps : {num_warmup_steps}')
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)
    lr_scheduler = {'scheduler': scheduler, 'monitor': 'loss',
                    'interval': 'step', 'frequency': 1}
    return [optimizer], [lr_scheduler]
def get_cosine_schedule_with_warmup(optimizer, epochs, batch_size, n_samples):
    warmup_proportion = 0.3
    n_steps = int(np.ceil(n_samples / batch_size))
    num_training_steps = n_steps * epochs
    num_warmup_steps = int(warmup_proportion * num_training_steps)
    sch = optimization.get_cosine_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps)
    return sch
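# A minimal usage sketch (not part of the snippets in this listing): it wires the
# helper defined above to a toy model, just to show how epochs/batch_size/n_samples
# turn into scheduler steps. The linear layer and hyperparameter values are
# placeholders, not taken from any of the original repositories.
import numpy as np
import torch
from transformers import optimization

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, epochs=3, batch_size=32, n_samples=10_000)

for _ in range(5):
    optimizer.step()
    scheduler.step()  # one scheduler step per optimizer step (i.e. per batch)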
def set_train(self):
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [
                param for name, param in self.model.named_parameters()
                if not any(nd in name for nd in no_decay)
            ],
            'weight_decay': 0.01
        },
        {
            'params': [
                param for name, param in self.model.named_parameters()
                if any(nd in name for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]
    num_total_train = len(self.train_loader) * self.num_epochs
    warmup_step = int(num_total_train * self.warmup_ratio)
    self.optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate)
    self.criterion = torch.nn.CrossEntropyLoss()
    self.scheduler = get_cosine_schedule_with_warmup(
        self.optimizer,
        num_warmup_steps=warmup_step,
        num_training_steps=num_total_train)
def configure_optimizers(self):
    param_optimizer = list(self.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.hparams.lr,
                      correct_bias=False)
    num_train_steps = len(self.train_dataloader()) * self.hparams.max_epochs
    num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps)
    lr_scheduler = {
        'scheduler': scheduler,
        'name': 'cosine_schedule_with_warmup',
        'monitor': 'loss',
        'interval': 'step',
        'frequency': 1
    }
    return [optimizer], [lr_scheduler]
def init_fn(optimizer, epochs, batch_size, n_samples):
    # `warmup_proportion` is expected to be defined in the enclosing scope
    # (e.g. a module-level constant).
    n_steps = int(np.ceil(n_samples / batch_size))
    num_training_steps = n_steps * epochs
    num_warmup_steps = int(warmup_proportion * num_training_steps)
    sch = optimization.get_cosine_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps)
    # This schedule should be stepped once per batch, not once per epoch.
    update_in_batch, update_in_epoch = True, False
    return sch, update_in_batch, update_in_epoch
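# A hedged sketch (not from any of the snippets above) of how the flags returned
# by init_fn could be consumed: the scheduler is stepped after every optimizer
# step when update_in_batch is True, otherwise once per epoch. The names
# `train_loader`, `model`, and `compute_loss` are placeholders.
def fit(model, train_loader, optimizer, scheduler, update_in_batch, epochs, compute_loss):
    for _ in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            loss = compute_loss(model, batch)
            loss.backward()
            optimizer.step()
            if update_in_batch:
                scheduler.step()  # warmup/cosine schedules expect per-step updates
        if not update_in_batch:
            scheduler.step()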
def run():
    """Train the model."""
    # set the logger
    utils.set_logger(config.log_dir)
    logging.info("device: {}".format(config.device))
    # process the data: separate text and labels
    processor = Processor(config)
    processor.process()
    logging.info("--------Process Done!--------")
    # split off the dev set
    word_train, word_dev, label_train, label_dev = load_dev('train')
    # build dataset
    train_dataset = NERDataset(word_train, label_train, config)
    dev_dataset = NERDataset(word_dev, label_dev, config)
    logging.info("--------Dataset Build!--------")
    # get dataset size
    train_size = len(train_dataset)
    # build data_loader
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                              shuffle=True, collate_fn=train_dataset.collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size,
                            shuffle=True, collate_fn=dev_dataset.collate_fn)
    logging.info("--------Get Dataloader!--------")
    # Prepare model
    device = config.device
    model = BertNER.from_pretrained(config.roberta_model, num_labels=len(config.label2id))
    model.to(device)
    # Prepare optimizer
    if config.full_fine_tuning:
        # model.named_parameters(): [bert, classifier, crf]
        bert_optimizer = list(model.bert.named_parameters())
        classifier_optimizer = list(model.classifier.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': config.weight_decay},
            {'params': [p for n, p in bert_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
            {'params': [p for n, p in classifier_optimizer if not any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': config.weight_decay},
            {'params': [p for n, p in classifier_optimizer if any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': 0.0},
            {'params': model.crf.parameters(), 'lr': config.learning_rate * 5}
        ]
    # only fine-tune the head classifier
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, correct_bias=False)
    train_steps_per_epoch = train_size // config.batch_size
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=(config.epoch_num // 10) * train_steps_per_epoch,
        num_training_steps=config.epoch_num * train_steps_per_epoch)

    # Train the model
    logging.info("--------Start Training!--------")
    train(train_loader, dev_loader, model, optimizer, scheduler, config.model_dir)
def configure_optimizers(self):
    # Prepare optimizer
    param_optimizer = list(self.model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.hparams.lr, correct_bias=False)
    # warm up lr
    num_workers = (self.hparams.gpus if self.hparams.gpus is not None else 1) * (
        self.hparams.num_nodes if self.hparams.num_nodes is not None else 1)
    data_len = len(self.train_dataloader().dataset)
    logging.info(
        f"number of workers {num_workers}, data length {data_len}")
    num_train_steps = int(
        data_len
        / (self.hparams.batch_size * num_workers * self.hparams.accumulate_grad_batches)
        * self.hparams.max_epochs)
    logging.info(f"num_train_steps : {num_train_steps}")
    num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
    logging.info(f"num_warmup_steps : {num_warmup_steps}")
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps,
    )
    lr_scheduler = {
        "scheduler": scheduler,
        "monitor": "loss",
        "interval": "step",
        "frequency": 1,
    }
    return [optimizer], [lr_scheduler]
def train(model, train_data, test_data):
    train_iter = DataSetIter(train_data, batch_size=C.batch_size)
    test_iter = DataSetIter(test_data, batch_size=C.batch_size)

    loss_func = nn.CrossEntropyLoss(ignore_index=0)
    optim = tc.optim.Adam(params=model.parameters(), lr=C.lr,
                          weight_decay=C.weight_decay)
    scheduler = get_cosine_schedule_with_warmup(
        optim,
        num_warmup_steps=C.warmup,
        num_training_steps=train_iter.num_batches * C.epoch_number,
    )

    best_test_loss = -1
    best_test_epoch = -1
    best_step = -1
    try:
        for epoch_n in range(C.epoch_number):
            tra_loss = run(model, train_iter, loss_func, epoch_n, optim, scheduler, True)
            tes_loss = run(model, test_iter, loss_func, epoch_n, None, None, False)

            logger.log("Epoch %d ended. Train loss = %.4f , Valid loss = %.4f" % (
                epoch_n, tra_loss, tes_loss,
            ))
            fitlog.add_metric(tes_loss,
                              step=train_iter.num_batches * (epoch_n + 1),
                              epoch=epoch_n,
                              name="valid loss")

            if best_test_epoch < 0 or tes_loss < best_test_loss:
                best_test_loss = tes_loss
                best_test_epoch = epoch_n
                best_step = fitlog_loss_step["train loss"]
                fitlog.add_best_metric(best_test_loss, name="loss")

                with open(C.model_save, "wb") as fil:  # save the best model so far
                    pickle.dump(model, fil)

        fitlog.add_hyper(name="best_step", value="%d / %d" % (
            best_step, train_iter.num_batches * C.epoch_number,
        ))
    except KeyboardInterrupt:  # manual early stopping
        pass

    logger.log("Train end.")
    logger.log("Got best valid loss %.4f in epoch %d" % (best_test_loss, best_test_epoch))

    return model
def __init__(
    self,
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    last_epoch: int = -1,
) -> None:
    lr_scheduler = get_cosine_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        num_cycles=num_cycles,
        last_epoch=last_epoch,
    )
    super().__init__(lr_scheduler)
def _create_lr_scheduler(self) -> Dict:
    """Returns one of three default schedulers

    Possibilities: constant/linear/cosine schedule with or without warmup
    """
    steps_per_epoch = math.ceil(
        len(self._train_instances) / self._trainer_config.batch_size
    )
    try:
        training_steps = min(
            self._trainer_config.max_steps,
            self._trainer_config.max_epochs * steps_per_epoch,
        )
    # One or both of the max_* is None:
    except TypeError:
        training_steps = (
            self._trainer_config.max_steps
            # 1000 is the default of the lightning trainer
            or (self._trainer_config.max_epochs or 1000) * steps_per_epoch
        )

    if self._trainer_config.lr_decay == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer=self._pipeline.model.optimizer,
            num_warmup_steps=self._trainer_config.warmup_steps,
            num_training_steps=training_steps,
        )
    elif self._trainer_config.lr_decay == "cosine":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer=self._pipeline.model.optimizer,
            num_warmup_steps=self._trainer_config.warmup_steps,
            num_training_steps=training_steps,
        )
    else:
        scheduler = get_constant_schedule_with_warmup(
            optimizer=self._pipeline.model.optimizer,
            num_warmup_steps=self._trainer_config.warmup_steps,
        )

    return {
        "scheduler": scheduler,
        "interval": "step",
        "name": "learning_rate",
    }
def train( self, train_dataset, output_dir, show_running_loss=True, eval_data=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ model = self.model args = self.args device = self.device tb_writer = SummaryWriter(logdir=args.tensorboard_dir) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=self.args.dataloader_num_workers, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = ( args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 ) else: t_total = ( len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs ) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] custom_parameter_names = set() for group in self.args.custom_parameter_groups: params = group.pop("params") custom_parameter_names.update(params) param_group = {**group} param_group["params"] = [ p for n, p in model.named_parameters() if n in params ] optimizer_grouped_parameters.append(param_group) for group in self.args.custom_layer_parameters: layer_number = group.pop("layer") layer = f"layer.{layer_number}." group_d = {**group} group_nd = {**group} group_nd["weight_decay"] = 0.0 params_d = [] params_nd = [] for n, p in model.named_parameters(): if n not in custom_parameter_names and layer in n: if any(nd in n for nd in no_decay): params_nd.append(p) else: params_d.append(p) custom_parameter_names.add(n) group_d["params"] = params_d group_nd["params"] = params_nd optimizer_grouped_parameters.append(group_d) optimizer_grouped_parameters.append(group_nd) if not self.args.train_custom_parameters_only: optimizer_grouped_parameters.extend( [ { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] ) warmup_steps = math.ceil(t_total * args.warmup_ratio) args.warmup_steps = ( warmup_steps if args.warmup_steps == 0 else args.warmup_steps ) if args.optimizer == "AdamW": optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, ) elif args.optimizer == "Adafactor": optimizer = Adafactor( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adafactor_eps, clip_threshold=args.adafactor_clip_threshold, decay_rate=args.adafactor_decay_rate, beta1=args.adafactor_beta1, weight_decay=args.weight_decay, scale_parameter=args.adafactor_scale_parameter, relative_step=args.adafactor_relative_step, warmup_init=args.adafactor_warmup_init, ) print("Using Adafactor for T5") else: raise ValueError( "{} is not a valid optimizer class. 
Please use one of ('AdamW', 'Adafactor') instead.".format( args.optimizer ) ) if args.scheduler == "constant_schedule": scheduler = get_constant_schedule(optimizer) elif args.scheduler == "constant_schedule_with_warmup": scheduler = get_constant_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps ) elif args.scheduler == "linear_schedule_with_warmup": scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, ) elif args.scheduler == "cosine_schedule_with_warmup": scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup": scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "polynomial_decay_schedule_with_warmup": scheduler = get_polynomial_decay_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, lr_end=args.polynomial_decay_schedule_lr_end, power=args.polynomial_decay_schedule_power, ) else: raise ValueError("{} is not a valid scheduler.".format(args.scheduler)) if ( args.model_name and os.path.isfile(os.path.join(args.model_name, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name, "optimizer.pt")) ) scheduler.load_state_dict( torch.load(os.path.join(args.model_name, "scheduler.pt")) ) if args.n_gpu > 1: model = torch.nn.DataParallel(model) logger.info(" Training started") global_step = 0 training_progress_scores = None tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( int(args.num_train_epochs), desc="Epoch", disable=args.silent, mininterval=0 ) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 steps_trained_in_current_epoch = 0 epochs_trained = 0 if args.model_name and os.path.exists(args.model_name): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name.split("/")[-1].split("-") if len(checkpoint_suffix) > 2: checkpoint_suffix = checkpoint_suffix[1] else: checkpoint_suffix = checkpoint_suffix[-1] global_step = int(checkpoint_suffix) epochs_trained = global_step // ( len(train_dataloader) // args.gradient_accumulation_steps ) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps ) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info( " Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch, ) except ValueError: logger.info(" Starting fine-tuning.") if args.evaluate_during_training: training_progress_scores = self._create_training_progress_scores(**kwargs) if args.wandb_project: wandb.init( project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs, ) wandb.run._label(repo="simpletransformers") wandb.watch(self.model) if args.fp16: from torch.cuda import amp scaler = amp.GradScaler() for current_epoch in train_iterator: model.train() if epochs_trained > 0: epochs_trained -= 1 continue train_iterator.set_description( f"Epoch 
{epoch_number + 1} of {args.num_train_epochs}" ) batch_iterator = tqdm( train_dataloader, desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}", disable=args.silent, mininterval=0, ) for step, batch in enumerate(batch_iterator): if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs = self._get_inputs_dict(batch) if args.fp16: with amp.autocast(): outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] else: outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = ( loss.mean() ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: batch_iterator.set_description( f"Epochs {epoch_number}/{args.num_train_epochs}. Running Loss: {current_loss:9.4f}" ) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: scaler.unscale_(optimizer) if args.optimizer == "AdamW": torch.nn.utils.clip_grad_norm_( model.parameters(), args.max_grad_norm ) if args.fp16: scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar( "lr", scheduler.get_last_lr()[0], global_step ) tb_writer.add_scalar( "loss", (tr_loss - logging_loss) / args.logging_steps, global_step, ) logging_loss = tr_loss if args.wandb_project or self.is_sweeping: wandb.log( { "Training loss": current_loss, "lr": scheduler.get_last_lr()[0], "global_step": global_step, } ) if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step) ) self.save_model( output_dir_current, optimizer, scheduler, model=model ) if args.evaluate_during_training and ( args.evaluate_during_training_steps > 0 and global_step % args.evaluate_during_training_steps == 0 ): # Only evaluate when single GPU otherwise metrics may not average well results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) for key, value in results.items(): try: tb_writer.add_scalar( "eval_{}".format(key), value, global_step ) except (NotImplementedError, AssertionError): pass output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step) ) if args.save_eval_checkpoints: self.save_model( output_dir_current, optimizer, scheduler, model=model, results=results, ) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join( args.output_dir, "training_progress_scores.csv" ), index=False, ) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) if best_eval_metric and args.early_stopping_metric_minimize: if ( results[args.early_stopping_metric] - best_eval_metric < 
args.early_stopping_delta ): best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) early_stopping_counter = 0 else: if args.use_early_stopping: if ( early_stopping_counter < args.early_stopping_patience ): early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) else: if ( results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta ): best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) early_stopping_counter = 0 else: if args.use_early_stopping: if ( early_stopping_counter < args.early_stopping_patience ): early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) model.train() epoch_number += 1 output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number) ) if args.save_model_every_epoch or args.evaluate_during_training: os.makedirs(output_dir_current, exist_ok=True) if args.save_model_every_epoch: self.save_model(output_dir_current, optimizer, scheduler, model=model) if args.evaluate_during_training and args.evaluate_each_epoch: results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) if args.save_eval_checkpoints: self.save_model( output_dir_current, optimizer, scheduler, results=results ) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args.output_dir, "training_progress_scores.csv"), index=False, ) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) if best_eval_metric and args.early_stopping_metric_minimize: if ( results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta ): best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) early_stopping_counter = 0 else: if ( args.use_early_stopping and args.early_stopping_consider_epochs ): if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) 
logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) else: if ( results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta ): best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) early_stopping_counter = 0 else: if ( args.use_early_stopping and args.early_stopping_consider_epochs ): if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, )
# `param_optimizer` (a list of (name, parameter) pairs) and `train_dataloader`
# are assumed to be defined earlier in the script.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01
}, {
    'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay': 0.0
}]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, correct_bias=False)
# Warm up over the first 10% of steps; the schedule here spans a single pass
# over the training dataloader.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(len(train_dataloader) * 0.1),
    num_training_steps=len(train_dataloader))


def F1_scores(preds, golds, eps=1e-9):
    tp = (preds * golds).sum()
    tn = ((1 - golds) * (1 - preds)).sum()
    fp = ((1 - golds) * preds).sum()
    fn = (golds * (1 - preds)).sum()
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    return 2 * (precision * recall) / (precision + recall + eps)
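# A hedged sketch (not taken from the snippet above) of the step-count arithmetic
# most of these examples repeat: spreading the cosine schedule over several epochs
# while accounting for gradient accumulation. `train_dataloader`, `num_epochs`,
# `accumulation_steps`, and `warmup_ratio` are placeholder names.
import math
from transformers import get_cosine_schedule_with_warmup


def cosine_schedule_for(optimizer, train_dataloader, num_epochs,
                        accumulation_steps=1, warmup_ratio=0.1):
    # One optimizer step per `accumulation_steps` batches, repeated each epoch.
    steps_per_epoch = math.ceil(len(train_dataloader) / accumulation_steps)
    num_training_steps = steps_per_epoch * num_epochs
    num_warmup_steps = int(num_training_steps * warmup_ratio)
    return get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps)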
def train(self): wandb.init(entity='samjkwong', project='gmt') train_loader, val_loader, test_loader = self.load_dataloader() # Load Model & Optimizer self.model = self.load_model() self.optimizer = torch.optim.Adam(self.model.parameters(), lr = self.args.lr, weight_decay = self.args.weight_decay) self.cls_criterion = torch.nn.BCEWithLogitsLoss() self.reg_criterion = torch.nn.MSELoss() if self.args.lr_schedule: self.scheduler = get_cosine_schedule_with_warmup(self.optimizer, self.args.patience * len(train_loader), self.args.num_epochs * len(train_loader)) logger, t_start = self.set_log() for epoch in trange(0, (self.args.num_epochs), desc = '[Epoch]', position = 1): self.model.train() total_loss = 0 for _, data in enumerate(tqdm(train_loader, desc="[Iteration]")): if data.x.shape[0] == 1 or data.batch[-1] == 0: pass self.optimizer.zero_grad() data = data.to(self.args.device) out = self.model(data) is_labeled = data.y == data.y if "classification" in self.args.task_type: loss = self.cls_criterion(out.to(torch.float32)[is_labeled], data.y.to(torch.float32)[is_labeled]) else: loss = self.reg_criterion(out.to(torch.float32)[is_labeled], data.y.to(torch.float32)[is_labeled]) loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.grad_norm) total_loss += loss.item() * num_graphs(data) self.optimizer.step() if self.args.lr_schedule: self.scheduler.step() total_loss = total_loss / len(train_loader.dataset) train_perf, valid_perf, test_perf = self.eval(train_loader), self.eval(val_loader), self.eval(test_loader) self.organize_log(logger, train_perf, valid_perf, test_perf, total_loss, epoch) # WANDB logging wandb.log({ 'Epoch': epoch, 'Train Loss': total_loss, 'Train ROC-AUC': train_perf, 'Val ROC-AUC': valid_perf, 'Test ROC-AUC': test_perf }) t_end = time.perf_counter() if 'classification' in self.dataset.task_type: best_val_epoch = np.argmax(np.array(self.valid_curve)) best_train = max(self.train_curve) else: best_val_epoch = np.argmin(np.array(self.valid_curve)) best_train = min(self.train_curve) best_val = self.valid_curve[best_val_epoch] test_score = self.test_curve[best_val_epoch] logger.log("Train: {} Valid: {} Test: {} with Time: {}".format(best_train, best_val, test_score, (t_end - t_start))) result_file = "./results/{}/{}-results.txt".format(self.log_folder_name, self.exp_name) with open(result_file, 'a+') as f: f.write("{}: {} {} {} {}\n".format(self.args.seed, best_train, self.train_curve[best_val_epoch], best_val, test_score)) torch.save({ 'model_state_dict': self.model.state_dict(), 'Val': best_val, 'Train': self.train_curve[best_val_epoch], 'Test': test_score, 'BestTrain': best_train }, './checkpoints/{}/best-model_{}.pth'.format(self.log_folder_name, self.args.seed))
def train(self, training_data, student_model, student_optimizer, student_scheduler=None, tensorboard=None, num_epochs=20, log_interval=1e2, checkpoint_interval=1e5, iterations=0): # Parameters eta = 0.95 #scaling constant from MLE to RL loss learning_rate = 6e-4 # Variables total_loss = 0. current_epoch, model, optimizer, scheduler = self.from_checkpoint_if_exists( student_model, student_optimizer, student_scheduler) if model is not None: student_model = model if optimizer is not None: student_optimizer = optimizer if scheduler is not None: student_scheduler = scheduler student_model.train() for epoch in range(current_epoch, num_epochs): total_mle_loss = 0.0 num_chars_total = 0.0 num_chars_correct = 0.0 all_rewards = [] optimizer = AdamW(student_model.parameters(), lr=learning_rate) if self.use_mle or self.use_rl: scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=4e4, num_training_steps=len(training_data)) for batch_idx, batch in enumerate( tqdm(training_data, mininterval=2, leave=False)): batch_qs, batch_as = map(lambda x: x.to(self.device), batch) student_optimizer.zero_grad() if not self.use_mle: policy_losses, batch_rewards = self.policy_batch_loss( batch_qs, batch_as, student_model, gamma=0.9) if not self.use_rl: mle_loss, num_correct, num_chars = self.mle_batch_loss( batch_qs, batch_as, student_model.action_transformer) if self.use_mle: loss = mle_loss elif self.use_rl: loss = policy_losses else: #TODO: why is there an /2 in the next line? eta_linear_decay = eta - eta * ( iterations / (float(len(training_data) * num_epochs) / 2)) loss = (1 - eta_linear_decay ) * policy_losses + eta_linear_decay * mle_loss iterations += batch_qs.shape[0] total_loss += loss loss.backward() # Gradient clipping torch.nn.utils.clip_grad_norm_(student_model.parameters(), 0.1) student_optimizer.step() if scheduler and self.use_mle: scheduler.step() if not self.use_rl: num_chars_total += num_chars num_chars_correct += num_correct total_mle_loss += mle_loss if not self.use_mle: all_rewards.append(batch_rewards.cpu().numpy()) if tensorboard is not None and batch_idx % log_interval == 0: if self.use_mle: self.tb_mle_batch(tensorboard, total_mle_loss, num_chars_total, num_chars_correct, epoch, batch_idx, len(training_data)) # TODO: Fix missing value_losses #elif self.use_rl: # self.tb_policy_batch(tensorboard, batch_rewards, value_losses, epoch, batch_idx, len(training_data)) else: self.tb_mle_policy_batch(tensorboard, total_mle_loss, num_chars_total, num_chars_correct, batch_rewards, epoch, batch_idx, len(training_data))
def train( self, train_dataloader, output_dir, show_running_loss=True, eval_dataloader=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args.tensorboard_dir) t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] custom_parameter_names = set() for group in self.args.custom_parameter_groups: params = group.pop("params") custom_parameter_names.update(params) param_group = {**group} param_group["params"] = [p for n, p in model.named_parameters() if n in params] optimizer_grouped_parameters.append(param_group) for group in self.args.custom_layer_parameters: layer_number = group.pop("layer") layer = f"layer.{layer_number}." group_d = {**group} group_nd = {**group} group_nd["weight_decay"] = 0.0 params_d = [] params_nd = [] for n, p in model.named_parameters(): if n not in custom_parameter_names and layer in n: if any(nd in n for nd in no_decay): params_nd.append(p) else: params_d.append(p) custom_parameter_names.add(n) group_d["params"] = params_d group_nd["params"] = params_nd optimizer_grouped_parameters.append(group_d) optimizer_grouped_parameters.append(group_nd) if not self.args.train_custom_parameters_only: optimizer_grouped_parameters.extend( [ { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] ) warmup_steps = math.ceil(t_total * args.warmup_ratio) args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps if args.optimizer == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) elif args.optimizer == "Adafactor": optimizer = Adafactor( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adafactor_eps, clip_threshold=args.adafactor_clip_threshold, decay_rate=args.adafactor_decay_rate, beta1=args.adafactor_beta1, weight_decay=args.weight_decay, scale_parameter=args.adafactor_scale_parameter, relative_step=args.adafactor_relative_step, warmup_init=args.adafactor_warmup_init, ) print("Using Adafactor for T5") else: raise ValueError( "{} is not a valid optimizer class. 
Please use one of ('AdamW', 'Adafactor') instead.".format( args.optimizer ) ) if args.scheduler == "constant_schedule": scheduler = get_constant_schedule(optimizer) elif args.scheduler == "constant_schedule_with_warmup": scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps) elif args.scheduler == "linear_schedule_with_warmup": scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) elif args.scheduler == "cosine_schedule_with_warmup": scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup": scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "polynomial_decay_schedule_with_warmup": scheduler = get_polynomial_decay_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, lr_end=args.polynomial_decay_schedule_lr_end, power=args.polynomial_decay_schedule_power, ) else: raise ValueError("{} is not a valid scheduler.".format(args.scheduler)) if args.n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 training_progress_scores = None tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 if args.evaluate_during_training: training_progress_scores = self._create_training_progress_scores(**kwargs) if args.wandb_project: wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs) wandb.watch(self.model) if args.fp16: from torch.cuda import amp scaler = amp.GradScaler() for _ in train_iterator: model.train() train_iterator.set_description(f"Epoch {epoch_number + 1} of {args.num_train_epochs}") batch_iterator = tqdm( train_dataloader, desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}", disable=args.silent, mininterval=0, ) for step, batch in enumerate(batch_iterator): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, labels, mc_labels, token_type_ids = batch if args.fp16: with amp.autocast(): outputs = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, labels=labels, ) lm_loss, mc_loss = outputs[:2] # model outputs are always tuple in pytorch-transformers (see doc) loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef else: outputs = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, labels=labels, ) lm_loss, mc_loss = outputs[:2] # model outputs are always tuple in pytorch-transformers (see doc) loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % current_loss, end="") if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: scaler.unscale_(optimizer) if args.optimizer == "AdamW": torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if args.fp16: 
scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.wandb_project or self.is_sweeping: wandb.log( { "Training loss": current_loss, "lr": scheduler.get_last_lr()[0], "global_step": global_step, } ) if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) self.save_model(output_dir_current, model=model) if args.evaluate_during_training and ( args.evaluate_during_training_steps > 0 and global_step % args.evaluate_during_training_steps == 0 ): # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model( eval_dataloader, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if args.save_eval_checkpoints: self.save_model(output_dir_current, model=model, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args.output_dir, "training_progress_scores.csv"), index=False, ) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of {args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) else: if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of 
{args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) epoch_number += 1 output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args.save_model_every_epoch or args.evaluate_during_training: os.makedirs(output_dir_current, exist_ok=True) if args.save_model_every_epoch: self.save_model(output_dir_current, model=model) if args.evaluate_during_training and args.evaluate_each_epoch: results, _, _ = self.eval_model( eval_dataloader, verbose=verbose and args.evaluate_during_training_verbose, silent=True, **kwargs, ) self.save_model(output_dir_current, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 else: if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 model.train() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, )
def bert_train(opt):
    device = torch.device('cuda:{}'.format(opt.device))
    bertmodel, vocab = get_pytorch_kobert_model()
    dataset_train = nlp.data.TSVDataset('{}'.format(opt.source),
                                        field_indices=[1, 2],
                                        num_discard_samples=1)
    # dataset_test = nlp.data.TSVDataset('/content/tst.txt', field_indices=[1,2], num_discard_samples=1)

    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    max_len = 256  # tokens beyond this length are not seen by BERT
    batch_size = opt.batch
    warmup_ratio = 0.1
    num_epochs = 2
    max_grad_norm = 1
    log_interval = 200
    learning_rate = 5e-5

    data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)

    # use the standard PyTorch DataLoader
    train_dataloader = torch.utils.data.DataLoader(data_train,
                                                   batch_size=batch_size,
                                                   num_workers=5)
    # test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

    model = BERTClassifier(bertmodel, dr_rate=0.2).to(device)
    # model = torch.load('weights/last_kobert_pytorch_model_big2s.pt')
    # if torch.cuda.device_count() > 1:
    #     model = nn.DataParallel(model)
    model = nn.DataParallel(model, output_device=[0, 1])
    # model.to(device)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]

    # declare the optimizer
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    # loss function for softmax outputs <- also usable for binary classification
    loss_fn = nn.CrossEntropyLoss()

    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)

    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)
    # lr_scheduler = optims.lr_scheduler.CosineAnnealingLR(optimizer, T_max=0.1, eta_min=0.0001)

    # accuracy, the training metric: how many targets were predicted correctly
    def calc_accuracy(X, Y):
        max_vals, max_indices = torch.max(X, 1)
        train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
        return train_acc

    # start training the model
    for e in range(num_epochs):
        train_acc = 0.0
        test_acc = 0.0
        best_acc = 0.0
        model.train()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # gradient clipping
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            # lr_scheduler.step()
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(
                    e + 1, batch_id + 1,
                    loss.data.cpu().numpy(),
                    train_acc / (batch_id + 1)))
            if train_acc > best_acc:
                best_acc = train_acc
                torch.save(model, '{}.pt'.format(opt.save_weights_name))
        print("epoch {} train acc {}".format(e + 1, train_acc / (batch_id + 1)))
    torch.save(model, '{}.pt'.format(opt.save_weights_name))
def train(self): self.overall_results = { 'val_loss': [], 'val_acc': [], 'test_loss': [], 'test_acc': [], 'durations': [] } train_fold_iter = tqdm(range(1, 11), desc='Training') val_fold_iter = [i for i in range(1, 11)] for fold_number in train_fold_iter: val_fold_number = val_fold_iter[fold_number - 2] train_loader, val_loader, test_loader = self.load_dataloader( fold_number, val_fold_number) # Load Model & Optimizer self.model = self.load_model() self.optimizer = torch.optim.Adam( self.model.parameters(), lr=self.args.lr, weight_decay=self.args.weight_decay) if self.args.lr_schedule: self.scheduler = get_cosine_schedule_with_warmup( self.optimizer, self.args.patience * len(train_loader), self.args.num_epochs * len(train_loader)) logger, t_start = self.set_log(fold_number) # K-Fold Training for epoch in trange(0, (self.args.num_epochs), desc='[Epoch]', position=1): self.model.train() total_loss = 0 for _, data in enumerate(train_loader): self.optimizer.zero_grad() data = data.to(self.args.device) out = self.model(data) loss = F.nll_loss(out, data.y) loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.grad_norm) total_loss += loss.item() * num_graphs(data) self.optimizer.step() if self.args.lr_schedule: self.scheduler.step() total_loss = total_loss / len(train_loader.dataset) # Validation val_acc, val_loss = self.eval(val_loader) self.organize_val_log(logger, total_loss, val_loss, val_acc, fold_number, epoch) train_fold_iter.set_description( '[Fold %d] TrL: %.2f VaL: %.2f VaAcc: %.2f%%' % (fold_number, total_loss, val_loss, val_acc)) train_fold_iter.refresh() if self.patience > self.args.patience: break t_end = time.perf_counter() checkpoint = torch.load( './checkpoints/{}/experiment-{}_fold-{}_seed-{}_best-model.pth' .format(self.log_folder_name, self.exp_name, fold_number, self.args.seed)) self.model.load_state_dict(checkpoint) test_acc, test_loss = self.eval(test_loader) self.organize_test_log(logger, test_loss, test_acc, t_start, t_end, fold_number) final_result_file = "./results/{}/{}-total_results.txt".format( self.log_folder_name, self.exp_name) with open(final_result_file, 'a+') as f: f.write("{}: {} {} {} {}\n".format( self.args.seed, np.array(self.overall_results['val_acc']).mean(), np.array(self.overall_results['val_acc']).std(), np.array(self.overall_results['test_acc']).mean(), np.array(self.overall_results['test_acc']).std()))
def train(data_dir, model_dir, args): seed_everything(args.seed) s_dir = args.model + str(args.num_hidden_layers) + '-' + args.preprocess + '-epoch' + str(args.epochs) + \ '-' + args.criterion + '-' + args.scheduler + '-' + args.optimizer + '-' + args.dataset + '-' + args.tokenize if args.name: s_dir += '-' + args.name save_dir = increment_path(os.path.join(model_dir, s_dir)) use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") print("This notebook use [%s]." % (device)) # load model and tokenizer MODEL_NAME = args.model if MODEL_NAME == "monologg/kobert": tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME) else: tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # load dataset dataset = load_data("/opt/ml/input/data/train/train.tsv") labels = dataset['label'].values # setting model hyperparameter bert_config = BertConfig.from_pretrained(MODEL_NAME) bert_config.num_labels = args.num_labels bert_config.num_hidden_layers = args.num_hidden_layers model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config) model.dropout = nn.Dropout(p=args.drop) model.to(device) summary(model) # loss & optimizer if args.criterion == 'f1' or args.criterion == 'label_smoothing' or args.criterion == 'f1cross': criterion = create_criterion(args.criterion, classes=args.num_labels, smoothing=0.1) else: criterion = create_criterion(args.criterion) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if args.optimizer == 'AdamP': optimizer = AdamP(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, betas=(0.9, 0.999), weight_decay=args.weight_decay) else: opt_module = getattr(import_module("torch.optim"), args.optimizer) # default: SGD optimizer = opt_module( optimizer_grouped_parameters, lr=args.lr, ) # logging logger = SummaryWriter(log_dir=save_dir) with open(os.path.join(save_dir, 'config.json'), 'w', encoding='utf-8') as f: json.dump(vars(args), f, ensure_ascii=False, indent=4) set_neptune(save_dir, args) # preprocess dataset if args.preprocess != 'no': pre_module = getattr(import_module("preprocess"), args.preprocess) dataset = pre_module(dataset, model, tokenizer) # train, val split kfold = StratifiedKFold(n_splits=5) for train_idx, val_idx in kfold.split(dataset, labels): train_dataset, val_dataset = dataset.loc[train_idx], dataset.loc[ val_idx] break tok_module = getattr(import_module("load_data"), args.tokenize) train_tokenized = tok_module(train_dataset, tokenizer, max_len=args.max_len) val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len) # make dataset for pytorch. 
RE_train_dataset = RE_Dataset( train_tokenized, train_dataset['label'].reset_index(drop='index')) RE_val_dataset = RE_Dataset(val_tokenized, val_dataset['label'].reset_index(drop='index')) train_loader = DataLoader( RE_train_dataset, batch_size=args.batch_size, num_workers=4, shuffle=True, pin_memory=use_cuda, ) val_loader = DataLoader( RE_val_dataset, batch_size=12, num_workers=1, shuffle=False, pin_memory=use_cuda, ) if args.scheduler == 'cosine': scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-6) elif args.scheduler == 'reduce': scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=5) elif args.scheduler == 'step': scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5) elif args.scheduler == 'cosine_warmup': t_total = len(train_loader) * args.epochs warmup_step = int(t_total * args.warmup_ratio) scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total) else: scheduler = None print("Training Start!!!") best_val_acc = 0 best_val_loss = np.inf for epoch in range(args.epochs): # train loop model.train() train_loss, train_acc = AverageMeter(), AverageMeter() for idx, train_batch in enumerate(train_loader): optimizer.zero_grad() try: inputs, token_types, attention_mask, labels = train_batch.values( ) inputs = inputs.to(device) token_types = token_types.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, token_type_ids=token_types, attention_mask=attention_mask) except: inputs, attention_mask, labels = train_batch.values() inputs = inputs.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, attention_mask=attention_mask) preds = torch.argmax(outs.logits, dim=-1) loss = criterion(outs.logits, labels) acc = (preds == labels).sum().item() / len(labels) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 0.7) optimizer.step() if scheduler: scheduler.step() neptune.log_metric('learning_rate', get_lr(optimizer)) train_loss.update(loss.item(), len(labels)) train_acc.update(acc, len(labels)) if (idx + 1) % args.log_interval == 0: current_lr = get_lr(optimizer) print( f"Epoch[{epoch + 1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || " f"training loss {train_loss.avg:.4f} || training accuracy {train_acc.avg:4.2%} || lr {current_lr}" ) logger.add_scalar("Train/loss", train_loss.avg, epoch * len(train_loader) + idx) logger.add_scalar("Train/accuracy", train_acc.avg, epoch * len(train_loader) + idx) neptune.log_metric(f'Train_loss', train_loss.avg) neptune.log_metric(f'Train_avg', train_acc.avg) neptune.log_metric('learning_rate', current_lr) val_loss, val_acc = AverageMeter(), AverageMeter() # val loop with torch.no_grad(): print("Calculating validation results...") model.eval() for val_batch in val_loader: try: inputs, token_types, attention_mask, labels = val_batch.values( ) inputs = inputs.to(device) token_types = token_types.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, token_type_ids=token_types, attention_mask=attention_mask) except: inputs, attention_mask, labels = val_batch.values() inputs = inputs.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, attention_mask=attention_mask) preds = torch.argmax(outs.logits, dim=-1) loss = criterion(outs.logits, labels) acc = (preds == labels).sum().item() / len(labels) val_loss.update(loss.item(), len(labels)) 
val_acc.update(acc, len(labels)) if val_acc.avg > best_val_acc: print( f"New best model for val acc : {val_acc.avg:4.2%}! saving the best model.." ) torch.save(model.state_dict(), f"{save_dir}/best.pth") best_val_acc = val_acc.avg best_val_loss = min(best_val_loss, val_loss.avg) print( f"[Val] acc : {val_acc.avg:4.2%}, loss : {val_loss.avg:.4f} || " f"best acc : {best_val_acc:4.2%}, best loss : {best_val_loss:.4f}" ) logger.add_scalar("Val/loss", val_loss.avg, epoch) logger.add_scalar("Val/accuracy", val_acc.avg, epoch) neptune.log_metric(f'Val_loss', val_loss.avg) neptune.log_metric(f'Val_avg', val_acc.avg) print()
def run_KoBERT():
    torch.multiprocessing.freeze_support()
    device = None
    # use the GPU if one is available
    if torch.cuda.is_available():
        print("Using GPU...")
        device = torch.device("cuda")
    # otherwise fall back to the CPU
    else:
        print("Using CPU...")
        device = torch.device("cpu")

    bertmodel, vocab = get_pytorch_kobert_model()

    # load the train/test text data
    dataset_train = nlp.data.TSVDataset("txt/alertMsg_train_top10.txt",
                                        field_indices=[1, 2], num_discard_samples=1)
    dataset_test = nlp.data.TSVDataset("txt/alertMsg_test_top10.txt",
                                       field_indices=[1, 2], num_discard_samples=1)

    # use the default BERT tokenizer
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    # Setting Hyper Parameters
    max_len = 64         # maximum tokens per sentence: 64
    batch_size = 64      # batch size: 64
    warmup_ratio = 0.1
    num_epochs = 2       # number of epochs
    max_grad_norm = 1
    log_interval = 200
    learning_rate = 5e-5

    data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
    data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

    train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

    model = BERTClassifier(bertmodel, dr_rate=0.1).to(device)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)

    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)

    # if several GPUs are available, wrap the model in torch.nn.DataParallel
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        print("Using DataParallel...")
        model = DataParallel(model)

    for e in range(num_epochs):
        train_acc = 0.0
        test_acc = 0.0
        model.train()
        # torch.cuda.empty_cache()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(
                    e + 1, batch_id + 1, loss.data.cpu().numpy(), train_acc / (batch_id + 1)))
        print("epoch {} train acc {}".format(e + 1, train_acc / (batch_id + 1)))

        # torch.save(model, "model_covid-classification.pt")
        torch.save(model.module.state_dict(), "model/kobert_model-classification_state-dict8.pt")
        # torch.cuda.empty_cache()

        model.eval()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
        print("epoch {} test acc {}".format(e + 1, test_acc / (batch_id + 1)))
def train(self, training_data, model, optimizer, scheduler=None, tb=None,
          epochs=20, log_interval=100, checkpoint_interval=10000):
    curr_epoch, model, optimizer, scheduler = self.from_checkpoint_if_exists(
        model, optimizer, scheduler)
    model.train()

    # Note: value-loss terms from an earlier actor-critic variant are disabled throughout.
    eta = 0.95  # initial weight of the MLE loss in the mixed objective
    iterations = 0
    for epoch in range(curr_epoch, epochs):
        total_mle_loss = 0.0
        n_char_total = 0.0
        n_char_correct = 0.0
        all_rewards = []
        optimizer = AdamW(model.parameters(), lr=6e-4)
        if self.use_mle or self.use_rl:
            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=40000,
                num_training_steps=len(training_data))
        for batch_idx, batch in enumerate(tqdm(training_data, mininterval=2, leave=False)):
            batch_qs, batch_as = map(lambda x: x.to(self.device), batch)
            optimizer.zero_grad()
            if not self.use_mle:
                policy_losses, batch_rewards = self.policy_batch_loss(
                    batch_qs, batch_as, model, gamma=0.9)
            if not self.use_rl:
                mle_loss, n_correct, n_char = self.mle_batch_loss(
                    batch_qs, batch_as, model.action_transformer)
            if self.use_mle:
                loss = mle_loss
            elif self.use_rl:
                loss = policy_losses
            else:
                # Linearly decay eta so the objective shifts from MLE to policy loss;
                # eta_ld reaches zero halfway through training
                eta_ld = eta - eta * (iterations / (float(len(training_data) * epochs) / 2))
                loss = (1 - eta_ld) * policy_losses + eta_ld * mle_loss
            iterations += batch_qs.shape[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # clip gradients
            optimizer.step()
            if scheduler and self.use_mle:
                scheduler.step()
            if not self.use_rl:
                n_char_total += n_char
                n_char_correct += n_correct
                total_mle_loss += mle_loss
            if not self.use_mle:
                all_rewards.append(batch_rewards.cpu().numpy())
            if tb is not None and batch_idx % log_interval == 0:
                if self.use_mle:
                    self.tb_mle_batch(tb, total_mle_loss, n_char_total, n_char_correct,
                                      epoch, batch_idx, len(training_data))
                elif self.use_rl:
                    self.tb_policy_batch(tb, batch_rewards, epoch, batch_idx,
                                         len(training_data))
                else:
                    self.tb_mle_policy_batch(tb, total_mle_loss, n_char_total,
                                             n_char_correct, batch_rewards,
                                             epoch, batch_idx, len(training_data))
            if batch_idx != 0 and epoch % checkpoint_interval == 0:
                self.save_checkpoint(epoch, model, optimizer, scheduler,
                                     suffix=str(epoch) + "-ml_rle")
        print("average rewards " + str(all_rewards))
        # Guard against division by zero when running pure RL (no MLE batches)
        loss_per_char = total_mle_loss / n_char_total if n_char_total else 0.0
        accuracy = n_char_correct / n_char_total if n_char_total else 0.0
        if self.use_rl:
            average_rewards = np.mean(all_rewards)
        if tb is not None:
            if self.use_mle:
                self.tb_mle_epoch(tb, loss_per_char, accuracy, epoch)
            elif self.use_rl:
                self.tb_policy_epoch(tb, average_rewards, epoch)
            else:
                self.tb_mle_policy_epoch(tb, loss_per_char, accuracy,
                                         average_rewards, epoch)
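The eta schedule above anneals the objective from mostly-MLE toward pure policy loss. A tiny worked sketch (with an assumed total count) showing that the MLE weight hits zero exactly halfway through training:

eta = 0.95
total_iterations = 10_000  # assumed: len(training_data) * epochs
for it in (0, 2_500, 5_000):
    eta_ld = eta - eta * (it / (total_iterations / 2.0))
    print(it, round(eta_ld, 3))  # -> 0.95, 0.475, 0.0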
def train(_type, config, load='tmp_vocab.pt'):
    dev_id = 0
    device = torch.device(dev_id)
    config['g2t']['device'] = device
    config['t2g']['device'] = device
    pool, vocab = prep_data(config['main'], load=load)
    model_g2t, model_t2g = prep_model(config, vocab)
    model_g2t.to(device)
    model_t2g.to(device)

    optimizerG2T = torch.optim.Adam(model_g2t.parameters(),
                                    lr=config['g2t']['lr'],
                                    weight_decay=config['g2t']['weight_decay'])
    schedulerG2T = get_cosine_schedule_with_warmup(
        optimizer=optimizerG2T,
        num_warmup_steps=400,
        num_training_steps=800 * config['main']['epoch'],
    )
    optimizerT2G = torch.optim.Adam(model_t2g.parameters(),
                                    lr=config['t2g']['lr'],
                                    weight_decay=config['t2g']['weight_decay'])
    schedulerT2G = get_cosine_schedule_with_warmup(
        optimizer=optimizerT2G,
        num_warmup_steps=400,
        num_training_steps=800 * config['main']['epoch'],
    )
    loss_t2g, loss_g2t = [], []
    best_g2t, best_t2g = 0., 0.

    # Reweight relation classes by inverse (smoothed) frequency
    t2g_weight = [vocab['relation'].wf.get(x, 0) for x in vocab['relation'].i2s]
    t2g_weight[0] = 0
    max_w = max(t2g_weight)
    t2g_weight = np.array(t2g_weight).astype('float32')
    t2g_weight = (max_w + 1000) / (t2g_weight + 1000)

    for i in range(0, config['main']['epoch']):
        _data_g2t = list(pool.draw_with_type(config['main']['batch_size'], True, _type + '_g2t'))
        _data_t2g = list(pool.draw_with_type(config['main']['batch_size'], True, _type + '_t2g'))
        _data = list(zip(_data_g2t, _data_t2g))
        with tqdm.tqdm(_data, disable=not config['main']['display']) as tqb:
            for j, (batch_g2t, batch_t2g) in enumerate(tqb):
                if i < config['main']['pre_epoch'] and config['main']['mode'] == 'warm_unsup':
                    _loss1, _loss2 = warmup_step1(batch_g2t, batch_t2g, model_g2t, model_t2g,
                                                  optimizerG2T, optimizerT2G, config, t2g_weight, vocab)
                if i == config['main']['pre_epoch'] + 1 and config['main']['mode'] == 'warm_unsup':
                    _loss1, _loss2 = warmup_step2(batch_g2t, batch_t2g, model_g2t, model_t2g,
                                                  optimizerG2T, optimizerT2G, config, t2g_weight, vocab)
                if config['main']['mode'] == 'sup':
                    _loss1, _loss2 = supervise(batch_g2t, batch_t2g, model_g2t, model_t2g,
                                               optimizerG2T, optimizerT2G, config, t2g_weight, vocab)
                if (i >= config['main']['pre_epoch'] + 1 and config['main']['mode'] == 'warm_unsup') \
                        or (config['main']['mode'] == 'cold_unsup'):
                    _loss1, _loss2 = back_translation(batch_g2t, batch_t2g, model_g2t, model_t2g,
                                                      optimizerG2T, optimizerT2G, config, t2g_weight, vocab)
                loss_t2g.append(_loss1)
                schedulerT2G.step()
                loss_g2t.append(_loss2)
                schedulerG2T.step()
                tqb.set_postfix({'t2g loss': np.mean(loss_t2g), 'g2t loss': np.mean(loss_g2t)})
        logging.info('Epoch ' + str(i))
        if i % 5 == 0:
            if i < config['main']['pre_epoch'] and config['main']['mode'] == 'warm_unsup':
                model_g2t.blind, model_t2g.blind = True, True
            else:
                model_g2t.blind, model_t2g.blind = False, False
            if model_t2g.blind:
                e = eval_t2g(pool, 'dev_t2g_blind', vocab, model_t2g, config['t2g'],
                             display=config['main']['display'])
            else:
                e = eval_t2g(pool, 'dev', vocab, model_t2g, config['t2g'],
                             display=config['main']['display'])
            if e > best_t2g:
                best_t2g = e
                torch.save(model_t2g.state_dict(), config['t2g']['save'] + 'X' + 'best')
            e = eval_g2t(pool, 'dev', vocab, model_g2t, config['g2t'],
                         display=config['main']['display'])
            if e > best_g2t:
                best_g2t = e
                torch.save(model_g2t.state_dict(), config['g2t']['save'] + 'X' + 'best')
        if i == config['main']['pre_epoch']:
            torch.save(model_t2g.state_dict(), config['t2g']['save'] + 'X' + 'mid')
            torch.save(model_g2t.state_dict(), config['g2t']['save'] + 'X' + 'mid')
    model_g2t.load_state_dict(torch.load(config['g2t']['save'] + 'X' + 'best'))
    model_t2g.load_state_dict(torch.load(config['t2g']['save'] + 'X' + 'best'))
    logging.info('Final Test mode {0:}'.format(config['main']['mode']))
    e = eval_t2g(pool, 'test', vocab, model_t2g, config['t2g'])
    e = eval_g2t(pool, 'test', vocab, model_g2t, config['g2t'])
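The scheduler horizons above (400 warmup steps, 800 steps per epoch) are hard-coded; if the pool yields a different number of batches per epoch, the cosine phase ends too early or never completes. A hedged sketch of deriving both horizons from the actual batch count instead (variable names follow the example, the 5% warmup ratio is an assumption):

# Assumed: the loop makes one scheduler step per drawn batch, as above
steps_per_epoch = len(_data)  # actual number of (g2t, t2g) batch pairs
num_training_steps = steps_per_epoch * config['main']['epoch']
num_warmup_steps = int(0.05 * num_training_steps)  # e.g. 5% linear warmup
schedulerG2T = get_cosine_schedule_with_warmup(
    optimizer=optimizerG2T,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)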
def train():
    seed_everything(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Set up the model config; the config itself carries no trained weights,
    # so from_pretrained is used to load them
    # bert_config = BertConfig.from_pretrained(MODEL_NAME)
    # bert_config.num_labels = 42
    # model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config)
    model_config = XLMRobertaConfig.from_pretrained(args.model_name)
    model_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        args.model_name, config=model_config)

    # Load tokenizer
    # MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
    # roberta: https://huggingface.co/transformers/model_doc/xlmroberta.html
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=0.2,
                                                  random_state=args.seed)
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_val = tokenized_dataset(val_dataset, tokenizer)
    tokenized_train_label = train_dataset['label'].values
    tokenized_val_label = val_dataset['label'].values

    RE_train_dataset = RE_Dataset(tokenized_train, tokenized_train_label)
    RE_val_dataset = RE_Dataset(tokenized_val, tokenized_val_label)

    model.to(device)
    model = torch.nn.DataParallel(model)

    train_loader = DataLoader(
        RE_train_dataset,
        batch_size=args.batch_size,
        pin_memory=torch.cuda.is_available(),
        shuffle=True,
    )
    val_loader = DataLoader(
        RE_val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        pin_memory=torch.cuda.is_available(),
    )

    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    loss_fn = LabelSmoothingLoss(smoothing=0.5)
    # loss_fn = nn.CrossEntropyLoss()

    # The scheduler is stepped once per epoch (see the end of the epoch loop),
    # so the horizon is measured in epochs rather than batches
    # t_total = len(train_loader) * args.epoch
    t_total = args.epoch
    warmup_step = int(t_total * args.warmup_steps)
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)

    # Pick the next free ./logs/expN directory for TensorBoard
    log_list = glob("./logs/*")
    if len(log_list) == 0:
        log_dir = "./logs/exp1"
    else:
        log_list = [int(log[-1]) for log in log_list]
        log_dir = "./logs/exp" + str(max(log_list) + 1)
    logger = SummaryWriter(log_dir=log_dir)

    scaler = GradScaler()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    best_acc = 0.0  # track the best validation accuracy across epochs
    for epoch in tqdm(range(args.epoch)):
        train_acc = 0.0
        train_loss = 0.0
        val_acc = 0.0
        val_loss = 0.0
        model.train()
        for batch_id, batch in enumerate(tqdm(train_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            optimizer.zero_grad()

            # Mixed-precision forward pass; scale the loss before backward and
            # unscale before clipping so the threshold applies to true gradients
            with autocast():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = loss_fn(outputs.logits, labels)
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()

            train_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
            train_loss += loss
            if (batch_id + 1) % args.logging_steps == 0:
                train_loss = train_loss.data.cpu().numpy()
                print(
                    f"[Train] epoch {epoch + 1} | batch_id {batch_id + 1} | "
                    f"loss {train_loss / args.logging_steps:.4f} | "
                    f"train_acc {train_acc / args.logging_steps:.4f}"
                )
logger.add_scalar("Train/loss", train_loss / args.logging_steps, epoch * len(train_loader) + batch_id) logger.add_scalar("Train/acc", train_acc / args.logging_steps, epoch * len(train_loader) + batch_id) train_acc = 0.0 train_loss = 0.0 # scheduler.step() print("\nStart Validation Step!") with torch.no_grad(): model.eval() for batch_id, batch in enumerate(tqdm(val_loader)): input_ids = batch["input_ids"].to(device) attention_mask = batch["attention_mask"].to(device) labels = batch["labels"].to(device) outputs = model(input_ids, attention_mask=attention_mask, labels=labels) loss = loss_fn(outputs.logits, labels) val_acc += compute_acc(outputs.logits.cpu(), labels.cpu()) val_loss += loss print( f"[Val] epoch {epoch + 1} | val_acc {val_acc / (batch_id + 1):.4f}" ) logger.add_scalar("Val/loss", val_loss / (batch_id + 1), epoch) logger.add_scalar("Val/acc", val_acc / (batch_id + 1), epoch) if val_acc >= best_acc: best_acc = val_acc # torch.save(model.state_dict(), os.path.join(args.output_dir, "saved_" + str(epoch) + ".pth")) torch.save(model.state_dict(), os.path.join(args.output_dir, "best.pth")) print("Saved best acc model...") scheduler.step() torch.save(model.state_dict(), os.path.join(args.output_dir, "last.pth"))
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriterP(args.output_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 \
        else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps \
            * args.num_train_epochs

    # Prepare optimizer and schedule (cosine warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if p.requires_grad and not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters()
                   if p.requires_grad and any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    warmup_steps = args.warmup_samples // args.train_batch_size
    if args.lr_decay:
        scheduler = tfopt.get_cosine_schedule_with_warmup(optimizer, warmup_steps, t_total)
    else:
        scheduler = tfopt.get_constant_schedule(optimizer)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    # Resume the global step counter if the checkpoint recorded one
    try:
        with open(os.path.join(args.model_name_or_path, 'step.txt'), 'r') as c:
            global_step = int(c.readline())
    except OSError:
        global_step = 0

    tr_loss, logging_loss = 0.0, 0.0
    moving_loss = MovingLoss(10000 // args.logging_steps)
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    try:
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                                  disable=args.local_rank not in [-1, 0])
            for step, batch in enumerate(epoch_iterator):
                inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
                inputs = inputs.to(args.device)
                labels = labels.to(args.device)
                model.train()
                outputs = model(inputs, masked_lm_labels=labels) if args.mlm \
                    else model(inputs, labels=labels)
                loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                moving_loss.add(loss.item())
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                       args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training \
                            and global_step % args.eval_steps == 0:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, f"checkpoint-{global_step}")
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    if args.local_rank in [-1, 0] and args.logging_steps > 0 \
                            and global_step % args.logging_steps == 0:
                        tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps,
                                             global_step)
                        logging_loss = tr_loss
                        epoch_iterator.set_postfix(
                            MovingLoss=f'{moving_loss.loss:.2f}',
                            Perplexity=f'{torch.exp(torch.tensor(moving_loss.loss)):.2f}')
                    if args.local_rank in [-1, 0] and args.save_steps > 0 \
                            and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        save_state(args, model, tokenizer, global_step)

                if args.max_steps > 0 and global_step > args.max_steps:
                    epoch_iterator.close()
                    break
            print_sample(model, tokenizer, args.device, args)
            if args.max_steps > 0 and global_step > args.max_steps:
                train_iterator.close()
                break
    except (KeyboardInterrupt, SystemExit):
        save_state(args, model, tokenizer, global_step)
        raise

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
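`MovingLoss` above is another undefined helper; judging by its use (`add(value)` and a `.loss` property, with a window of `10000 // args.logging_steps`), it tracks a windowed moving average of the training loss. A minimal sketch under that assumption:

from collections import deque

class MovingLoss:
    """Minimal sketch (assumed helper): moving average over the last `window` values."""
    def __init__(self, window):
        self.values = deque(maxlen=max(1, window))

    def add(self, value):
        self.values.append(value)

    @property
    def loss(self):
        return sum(self.values) / len(self.values) if self.values else 0.0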
# Prepare optimizer and schedule (cosine warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# Declare the optimizer
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# Cross-entropy over the softmax outputs; the same loss also covers binary classification
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_step,
                                            num_training_steps=t_total)

# Accuracy metric: fraction of targets predicted correctly
def calc_accuracy(X, Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
    return train_acc

# Start training
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
def train(**kwargs):
    global id_map, label_map
    if kwargs["dryrun"]:
        os.environ["WANDB_MODE"] = "dryrun"
    wandb.init(project="APAuT", name=kwargs["name"])

    train_texts, train_labels = read_mgb(
        kwargs["train_set"],
        kwargs["train_set_start_index"],
        kwargs["train_set_length"],
        kwargs["train_set_max_words"],
    )
    test_texts, test_labels = read_mgb(kwargs["test_set"])

    label_types = np.unique(np.array(flatten(train_labels)))
    print(label_types)

    # Map each label string to a 1-based id (0 is reserved for padding below)
    encode = lambda l: [(np.where(label_types == item)[0][0] + 1) for item in l]
    encode_all = lambda l: [encode(item) for item in l]
    join_all = lambda l: [" ".join(item) for item in l]

    train_labels = encode_all(train_labels)
    test_labels = encode_all(test_labels)
    train_texts = join_all(train_texts)
    test_texts = join_all(test_texts)

    tokenizer = AutoTokenizer.from_pretrained(kwargs["model"], use_fast=True)
    train_encodings = tokenizer(
        train_texts,
        truncation=True,
        padding=True,
        pad_to_multiple_of=kwargs["pad_length"],
    )
    test_encodings = tokenizer(
        test_texts,
        truncation=True,
        padding=True,
        pad_to_multiple_of=kwargs["pad_length"],
    )
    if len(train_encodings["input_ids"][0]) != len(test_encodings["input_ids"][0]):
        raise ValueError(f"""
train length with padding is {len(train_encodings['input_ids'][0])}
while test length is {len(test_encodings['input_ids'][0])}
""")
    else:
        length = len(train_encodings["input_ids"][0])

    train_dataset = MGBDataset(train_encodings, train_labels, length)
    test_dataset = MGBDataset(test_encodings, test_labels, length)

    kwargs["effective_batch_size"] = (kwargs["gradient_accumulation_steps"] *
                                      kwargs["batch_size"])
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=kwargs["epochs"],
        per_device_train_batch_size=kwargs["batch_size"],
        per_device_eval_batch_size=kwargs["batch_size"],
        logging_dir="./logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=100,  # 50_000 // kwargs["effective_batch_size"]
        gradient_accumulation_steps=kwargs["gradient_accumulation_steps"],
        fp16=kwargs["fp16"],
    )

    label_map = {i + 1: label for i, label in enumerate(label_types)}
    label_map[0] = "<pad>"
    id_map = {label: i + 1 for i, label in enumerate(label_types)}
    id_map["<pad>"] = 0

    config = AutoConfig.from_pretrained(
        kwargs["model"],
        num_labels=len(label_types),
        id2label=label_map,
        label2id=id_map,
    )
    model = AutoModelForTokenClassification.from_pretrained(kwargs["model"], config=config)

    optimizer = AdamW(
        [
            {"params": model.base_model.parameters()},
            {"params": model.classifier.parameters()},  # optionally add 'lr': 1e-3 here
        ],
        lr=kwargs["learning_rate"],
        weight_decay=kwargs["weight_decay"],
    )
    total_steps = len(train_dataset) // kwargs["effective_batch_size"]
    total_steps = total_steps * kwargs["epochs"]
    schedule = get_cosine_schedule_with_warmup(optimizer, kwargs["warmup_steps"], total_steps)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, schedule),
    )
    wandb.config.update(kwargs)
    trainer.train()

    if kwargs["save"]:
        model_path = f'./models/{kwargs["name"]}'
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)

    trainer.evaluate()
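`compute_metrics` is handed to the Trainer above but not shown. A minimal sketch of what such a token-classification metric function could look like, assuming the 0/`<pad>` label convention from this example (the function body is an illustration, not the original author's code):

import numpy as np

def compute_metrics(eval_pred):
    # Token-level accuracy, ignoring positions labelled with the 0/<pad> id
    preds = np.argmax(eval_pred.predictions, axis=-1)
    labels = eval_pred.label_ids
    mask = labels != 0  # 0 is the <pad> id in the mapping above
    accuracy = float((preds[mask] == labels[mask]).mean())
    return {"accuracy": accuracy}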
def __new__(cls, optimizer, *args, **kwargs):
    return get_cosine_schedule_with_warmup(optimizer, *args, **kwargs)
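For context, a `__new__` like this normally lives in a thin factory class: code can instantiate a named class (e.g. from a config string) while actually receiving the Hugging Face `LambdaLR` scheduler. A minimal sketch; the class name `CosineWarmup` is assumed, not from the original:

from transformers import get_cosine_schedule_with_warmup

class CosineWarmup:
    # Instantiating the class returns the scheduler itself, not a CosineWarmup object
    def __new__(cls, optimizer, *args, **kwargs):
        return get_cosine_schedule_with_warmup(optimizer, *args, **kwargs)

# usage sketch (assumed parameters):
# scheduler = CosineWarmup(optimizer, num_warmup_steps=100, num_training_steps=1000)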
def train(_type, config, load="tmp_vocab.pt"): dev_id = 0 device = torch.device( dev_id) if torch.cuda.is_available() else torch.device('cpu') config["g2t"]["device"] = device config["t2g"]["device"] = device pool, vocab = prep_data(config["main"], load=load) model_g2t, model_t2g = prep_model(config, vocab) model_g2t.to(device) model_t2g.to(device) from transformers.optimization import ( get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, ) optimizerG2T = torch.optim.Adam( model_g2t.parameters(), lr=config["g2t"]["lr"], weight_decay=config["g2t"]["weight_decay"], ) schedulerG2T = get_cosine_schedule_with_warmup( optimizer=optimizerG2T, num_warmup_steps=400, num_training_steps=800 * config["main"]["epoch"], ) optimizerT2G = torch.optim.Adam( model_t2g.parameters(), lr=config["t2g"]["lr"], weight_decay=config["t2g"]["weight_decay"], ) schedulerT2G = get_cosine_schedule_with_warmup( optimizer=optimizerT2G, num_warmup_steps=400, num_training_steps=800 * config["main"]["epoch"], ) loss_t2g, loss_g2t = [], [] best_g2t, best_t2g = 0.0, 0.0 klds = [] t2g_weight = [ vocab["relation"].wf.get(x, 0) for x in vocab["relation"].i2s ] t2g_weight[0] = 0 max_w = max(t2g_weight) t2g_weight = np.array(t2g_weight).astype("float32") t2g_weight = (max_w + 1000) / (t2g_weight + 1000) for i in range(0, config["main"]["epoch"]): _data_g2t = list( pool.draw_with_type(config["main"]["batch_size"], True, _type + "_g2t")) _data_t2g = list( pool.draw_with_type(config["main"]["batch_size"], True, _type + "_t2g")) data_list = list(zip(_data_g2t, _data_t2g)) _data = data_list with tqdm.tqdm(_data, disable=True if not config["main"]["display"] else False) as tqb: for j, (batch_g2t, batch_t2g) in enumerate(tqb): if (i < config["main"]["pre_epoch"] and config["main"]["mode"] == "warm_unsup"): _loss1, _loss2, kld = warmup_step1( batch_g2t, batch_t2g, model_g2t, model_t2g, optimizerG2T, optimizerT2G, config, t2g_weight, vocab, ) if (i == config["main"]["pre_epoch"] + 1 and config["main"]["mode"] == "warm_unsup"): _loss1, _loss2, kld = warmup_step2( batch_g2t, batch_t2g, model_g2t, model_t2g, optimizerG2T, optimizerT2G, config, t2g_weight, vocab, ) if config["main"]["mode"] == "sup": _loss1, _loss2, kld = supervise( batch_g2t, batch_t2g, model_g2t, model_t2g, optimizerG2T, optimizerT2G, config, t2g_weight, vocab, ) if (i >= config["main"]["pre_epoch"] + 1 and config["main"]["mode"] == "warm_unsup") or ( config["main"]["mode"] == "cold_unsup"): _loss1, _loss2, kld = back_translation( batch_g2t, batch_t2g, model_g2t, model_t2g, optimizerG2T, optimizerT2G, config, t2g_weight, vocab, ) loss_t2g.append(_loss1) schedulerT2G.step() loss_g2t.append(_loss2) schedulerG2T.step() klds.append(kld) tqb.set_postfix({ "t2g loss": np.mean(loss_t2g), "g2t loss": np.mean(loss_g2t), "kld loss": np.mean(klds), }) logging.info("Epoch " + str(i)) if i % 1 == 0: if (i < config["main"]["pre_epoch"] and config["main"]["mode"] == "warm_unsup"): model_g2t.blind, model_t2g.blind = True, True else: model_g2t.blind, model_t2g.blind = False, False if model_t2g.blind: e = eval_t2g( pool, "dev_t2g_blind", vocab, model_t2g, config["t2g"], display=config["main"]["display"], ) else: e = eval_t2g( pool, "dev", vocab, model_t2g, config["t2g"], display=config["main"]["display"], ) if e > best_t2g: best_t2g = max(best_t2g, e) torch.save(model_t2g.state_dict(), config["t2g"]["save"] + "X" + "best") e = eval_g2t( pool, "dev", vocab, model_g2t, config["g2t"], display=config["main"]["display"], ) if e > best_g2t: best_g2t = max(best_g2t, e) 
                torch.save(model_g2t.state_dict(), config["g2t"]["save"] + "X" + "best")
        if i == config["main"]["pre_epoch"]:
            torch.save(model_t2g.state_dict(), config["t2g"]["save"] + "X" + "mid")
            torch.save(model_g2t.state_dict(), config["g2t"]["save"] + "X" + "mid")

    model_g2t.load_state_dict(torch.load(config["g2t"]["save"] + "X" + "best"))
    model_t2g.load_state_dict(torch.load(config["t2g"]["save"] + "X" + "best"))
    logging.info("Final Test mode {0:}".format(config["main"]["mode"]))
    e = eval_t2g(pool, "test", vocab, model_t2g, config["t2g"])
    e = eval_g2t(pool, "test", vocab, model_g2t, config["g2t"])