def attach_test(validation_engine, verbose=VERBOSE_BATCH_WISE): # Attaching would be repeated for several metrics. # Thus, we can reduce the repeated code by using this helper. def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name, ) # If the verbosity is set, a progress bar is shown for mini-batch iterations. # Without ignite, you could use tqdm to implement the progress bar. validation_metric_names = ['loss', 'accuracy'] for metric_name in validation_metric_names: attach_running_average(validation_engine, metric_name) # Do the same for the validation engine. if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(validation_engine, validation_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @validation_engine.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): print('Test - loss={:.4e} accuracy={:.4f}'.format( engine.state.metrics['loss'], engine.state.metrics['accuracy']))
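# A minimal, self-contained sketch of how attach_test above is typically wired up. The
# VERBOSE_* levels and the dummy step function are assumptions (0 = silent, 1 = epoch-wise,
# 2 = batch-wise); the original project defines its own constants, step function, and the
# RunningAverage/ProgressBar/Events imports used inside attach_test.
from ignite.engine import Engine

VERBOSE_SILENT, VERBOSE_EPOCH_WISE, VERBOSE_BATCH_WISE = 0, 1, 2

def dummy_validate_step(engine, mini_batch):
    # Return the dict shape attach_test expects: one key per attached metric.
    return {'loss': 0.0, 'accuracy': 1.0}

validation_engine = Engine(dummy_validate_step)
attach_test(validation_engine, verbose=VERBOSE_EPOCH_WISE)
validation_engine.run([0, 1, 2], max_epochs=1)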
def main(config): device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") train_loader, valid_loader, test_loader = load_dataloader_for_featureNet( config) model = DeepSleepNet(input_dim=1, n_classes=5, is_train=True, use_dropout=config.use_dropout, use_rnn=config.use_rnn).to(device) optimizer = optim.Adam(model.parameters()) crit = nn.CrossEntropyLoss() data = torch.load("./folder0_model.pth") model.load_state_dict(data["model"]) def validate(engine, mini_batch): engine.model.eval() with torch.no_grad(): x, y = mini_batch x, y = x.to(engine.device), y.to(engine.device) y_hat = engine.model(x) loss = engine.crit(y_hat, y) if isinstance(y, torch.LongTensor) or isinstance( y, torch.cuda.LongTensor): accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float( y.size(0)) else: accuracy = 0 return {'loss': float(loss), 'accuracy': float(accuracy)} test_engine = MyEngine(validate, model, crit, optimizer, config) if config.verbose >= 2: print(model) print(optimizer) print(crit) def log_metrics(engine, title): print(f"{title} accuracy: {engine.state.metrics['accuracy']:.2f}") test_engine.add_event_handler(Events.EPOCH_COMPLETED, log_metrics, 'test') RunningAverage(output_transform=lambda x: x['accuracy']).attach( test_engine, 'accuracy') pbar = ProgressBar() pbar.attach(test_engine, ['accuracy']) test_engine.run(test_loader, max_epochs=1)
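# The snippet above relies on a MyEngine subclass that stores the model, criterion and
# optimizer on the engine so the process function can reach them as engine.model etc.
# A minimal sketch (an assumption; the original class may differ):
from ignite.engine import Engine

class MyEngine(Engine):
    def __init__(self, func, model, crit, optimizer, config):
        self.model = model
        self.crit = crit
        self.optimizer = optimizer
        self.config = config
        self.device = next(model.parameters()).device  # assumption: infer device from the model
        super().__init__(func)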
def attach_pbar_and_metrics(trainer, evaluator): loss_metric = Average(output_transform=lambda output: output["loss"]) accuracy_metric = Accuracy( output_transform=lambda output: (output["logit"], output["label"])) pbar = ProgressBar() loss_metric.attach(trainer, "loss") accuracy_metric.attach(trainer, "accuracy") accuracy_metric.attach(evaluator, "accuracy") pbar.attach(trainer)
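# attach_pbar_and_metrics above expects each engine's process function to return a dict with
# "loss", "logit" and "label" entries. A self-contained sketch of a conforming step (the toy
# model and criterion are illustrative, not the original):
import torch
import torch.nn as nn

model = nn.Linear(10, 3)
criterion = nn.CrossEntropyLoss()

def train_step(engine, batch):
    x, y = batch
    logit = model(x)
    loss = criterion(logit, y)
    # backward/optimizer logic omitted; only the returned dict shape matters here
    return {"loss": loss.item(), "logit": logit, "label": y}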
def create_supervised_trainer_skipgram(model, optimizer, prepare_batch, metrics=None, device=None, log_dir='output/log/', checkpoint_dir='output/checkpoints/', checkpoint_every=None, tensorboard_every=50) -> Engine: def _prepare_batch(batch): return batch def _update(engine, batch): model.train() optimizer.zero_grad() batch = _prepare_batch(batch) batch_loss = model._loss(batch) loss = batch_loss.mean() loss.backward() optimizer.step() # Only the loss is available here; the update step computes no predictions or targets. return {'loss': loss.item()} model.to(device) engine = Engine(_update) # Metrics RunningAverage(output_transform=lambda x: x['loss']).attach( engine, 'average_loss') # TQDM pbar = ProgressBar(persist=True) pbar.attach(engine, ['average_loss']) # Checkpoint saving # to_save = {'model': model, 'optimizer': optimizer, 'engine': engine} final_checkpoint_handler = Checkpoint({'model': model}, DiskSaver(checkpoint_dir, create_dir=True), n_saved=None, filename_prefix='final') engine.add_event_handler(Events.COMPLETED, final_checkpoint_handler) @engine.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): metrics = engine.state.metrics # Only 'average_loss' is attached above; accuracy metrics would need their own attach. print(f"Epoch results - Avg loss: {metrics['average_loss']:.6f}") return engine
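# Hedged usage sketch for the factory above. SkipGramModel is a hypothetical class (any
# module exposing a _loss(batch) method that returns per-example losses fits the update
# step), and pairs_loader is likewise a placeholder for an iterable of skip-gram batches.
import torch
from torch.optim import SparseAdam

model = SkipGramModel(vocab_size=10000, dim=128)  # hypothetical, defined elsewhere
optimizer = SparseAdam(model.parameters(), lr=1e-3)
trainer = create_supervised_trainer_skipgram(model, optimizer,
                                             prepare_batch=lambda b: b,
                                             device='cpu')
# trainer.run(pairs_loader, max_epochs=5)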
def inference(cfg, model, val_loader, num_query): device = cfg.MODEL.DEVICE logger = logging.getLogger("reid_baseline.inference") logger.info("Enter inferencing") if cfg.TEST.RE_RANKING == 'no': print("Create evaluator") if 'test_all' in cfg.TEST.TEST_MODE: if len(val_loader.dataset.dataset[0]) == 4: # mask, no new eval evaluator = create_supervised_all_evaluator_with_mask(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, seq_len=cfg.INPUT.SEQ_LEN, device=device) elif len(val_loader.dataset.dataset[0]) == 6: # mask, new eval evaluator = create_supervised_all_evaluator_with_mask_new_eval(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM, new_eval=True)}, seq_len=cfg.INPUT.SEQ_LEN, device=device) else: evaluator = create_supervised_all_evaluator(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, seq_len=cfg.INPUT.SEQ_LEN, device=device) else: if len(val_loader.dataset.dataset[0]) == 6: # mask, new eval evaluator = create_supervised_evaluator_with_mask_new_eval(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM, new_eval=True)}, device=device) elif len(val_loader.dataset.dataset[0]) == 4: # mask, no new eval evaluator = create_supervised_evaluator_with_mask(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, device=device) else: evaluator = create_supervised_evaluator(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, device=device) elif cfg.TEST.RE_RANKING == 'yes': # not implemented with mask yet print("Create evaluator for reranking") if 'test_all' in cfg.TEST.TEST_MODE: evaluator = create_supervised_all_evaluator(model, metrics={'r1_mAP': R1_mAP_reranking(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, seq_len=cfg.INPUT.SEQ_LEN, device=device) else: evaluator = create_supervised_evaluator(model, metrics={'r1_mAP': R1_mAP_reranking(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, device=device) else: # raise instead of falling through to an undefined evaluator raise ValueError("Unsupported re_ranking config. Only supports 'no' or 'yes', but got {}.".format(cfg.TEST.RE_RANKING)) pbar = ProgressBar(persist=True, ncols=120) pbar.attach(evaluator) evaluator.run(val_loader) cmc, mAP = evaluator.state.metrics['r1_mAP'] logger.info('Validation Results') logger.info("mAP: {:.1%}".format(mAP)) for r in [1, 5, 10]: logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(r, cmc[r - 1]))
def train(self): """ Full training logic """ if self.train_logger is not None: self.train_logger.watch(self.model) engine = Engine(self._train_update_func) @engine.on(Events.EPOCH_STARTED) def reset_total_loss(engine): engine.state.total_loss = 0 @engine.on(Events.ITERATION_COMPLETED) def accumulate_loss(engine): engine.state.total_loss += engine.state.output['loss'] for name, metric in self.metrics.items(): metric.attach(engine, name) pbar = ProgressBar() pbar.attach(engine) if self.valid: # TODO proper implementation: currently handled only in subclass evaluator = self._prepare_evaluator() engine.add_event_handler(Events.EPOCH_COMPLETED, self.run_validate, evaluator) @engine.on(Events.EPOCH_COMPLETED) def mk_checkpoints(engine): # TODO use checkpointing/scheduling from ignite log = { 'epoch': engine.state.epoch, 'loss': engine.state.total_loss / len(engine.state.dataloader), 'metrics': engine.state.metrics } if hasattr(engine.state, 'validation_result'): log['val_loss'] = engine.state.validation_result.total_loss / len( engine.state.validation_result.dataloader) self._prepare_checkpoint(log=log) self._reschedule_lr(epoch=engine.state.epoch) if self.train_logger is not None: self.train_logger.add_entry(log) if self.verbosity >= 1: for key, value in log.items(): self.logger.info(' {:15s}: {}'.format( str(key), value)) engine.run( self.data_loader, max_epochs=self.epochs ) # TODO restore resume logic of range(self.start_epoch, self.epochs + 1):
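# The checkpointing TODO above maps onto ignite's built-in Checkpoint handler. A minimal
# sketch, assuming `engine` and a `model` from the surrounding trainer (the directory name
# and n_saved are illustrative choices):
from ignite.engine import Events
from ignite.handlers import Checkpoint, DiskSaver

checkpoint_handler = Checkpoint({'model': model},
                                DiskSaver('checkpoints', create_dir=True),
                                n_saved=2)
engine.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler)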
def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE): # Attaching would be repeated for several metrics. # Thus, we can reduce the repeated code by using this function. def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name, ) # RunningAverage: given the value returned for each mini-batch, it tracks the running statistic automatically. training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|'] for metric_name in training_metric_names: attach_running_average(train_engine, metric_name) # If the verbosity is set, a progress bar is shown for mini-batch iterations. # Without ignite, you could use tqdm to implement the progress bar. if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, training_metric_names) # display a progress bar for these metrics # If the verbosity is set, statistics are shown after each epoch. if verbose >= VERBOSE_EPOCH_WISE: @train_engine.on(Events.EPOCH_COMPLETED) # print when each epoch completes def print_train_logs(engine): print( 'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}' .format( engine.state.epoch, engine.state.metrics['|param|'], engine.state.metrics['|g_param|'], engine.state.metrics['loss'], engine.state.metrics['accuracy'], )) validation_metric_names = ['loss', 'accuracy'] for metric_name in validation_metric_names: attach_running_average(validation_engine, metric_name) # Do the same for the validation engine. if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(validation_engine, validation_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @validation_engine.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): print( 'Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}' .format( engine.state.metrics['loss'], engine.state.metrics['accuracy'], engine.best_loss, ))
def train(self, epochs: int, train_loader, test_loader=None, trainsize=None, valsize=None): self.model.train() train_engine = Engine(lambda e, b: self.train_step(b)) @train_engine.on(Events.EPOCH_COMPLETED(every=self.track_loss_freq)) def eval_test(engine): if self.track_loss: self.tb_log(train_loader, engine.state.epoch, is_train=True, eval_length=valsize) if test_loader is not None: self.tb_log(test_loader, engine.state.epoch, is_train=False, eval_length=valsize) @train_engine.on(Events.EPOCH_COMPLETED) def save_state(engine): torch.save(self.model.state_dict(), self.snail_path) torch.save(self.opt.state_dict(), self.snail_opt_path) @train_engine.on( Events.ITERATION_COMPLETED(every=self.track_params_freq)) def tb_log_histogram_params(engine): if self.track_layers: for name, params in self.model.named_parameters(): self.logger.add_histogram(name.replace('.', '/'), params, engine.state.iteration) if params.grad is not None: self.logger.add_histogram( name.replace('.', '/') + '/grad', params.grad, engine.state.iteration) if self.trainpbar: RunningAverage(output_transform=lambda x: x).attach( train_engine, 'loss') p = ProgressBar() p.attach(train_engine, ['loss']) train_engine.run(train_loader, max_epochs=epochs, epoch_length=trainsize)
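# Note on the RunningAverage attach above: train_step returns the bare loss value, so the
# identity output_transform (lambda x: x) suffices; step functions that return dicts use
# lambda x: x['loss'] instead, as in the other snippets in this section.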
def attach(train_engine, validation_engine, training_metric_names=[ 'actor', 'baseline', 'risk', '|param|', '|g_param|' ], validation_metric_names=[ 'BLEU', ], verbose=VERBOSE_BATCH_WISE): # Attaching would be repeated for several metrics. # Thus, we can reduce the repeated code by using this function. def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name, ) for metric_name in training_metric_names: attach_running_average(train_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, training_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @train_engine.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_reward = engine.state.metrics['actor'] print('Epoch {} - |param|={:.2e} |g_param|={:.2e} BLEU={:.2f}'. format( engine.state.epoch, avg_p_norm, avg_g_norm, avg_reward, )) for metric_name in validation_metric_names: attach_running_average(validation_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(validation_engine, validation_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @validation_engine.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): avg_bleu = engine.state.metrics['BLEU'] print('Validation - BLEU={:.2f} best_BLEU={:.2f}'.format( avg_bleu, -engine.best_loss, ))
def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE): '''Attach metric tracking and status reporting to both engines.''' def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name) ''' Train Attach Process ''' training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|'] for metric_name in training_metric_names: attach_running_average(train_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, training_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @train_engine.on(Events.EPOCH_COMPLETED) def print_train_tag(engine): print( 'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}' .format( engine.state.epoch, engine.state.metrics['|param|'], engine.state.metrics['|g_param|'], engine.state.metrics['loss'], engine.state.metrics['accuracy'], )) ''' Validate Attach Process ''' validation_metric_names = ['loss', 'accuracy'] for metric_name in validation_metric_names: attach_running_average(validation_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(validation_engine, validation_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @validation_engine.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): print( 'Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}' .format( engine.state.metrics['loss'], engine.state.metrics['accuracy'], engine.best_loss, ))
def set_handlers(trainer: Engine, evaluator: Engine, valloader: DataLoader, model: nn.Module, optimizer: optim.Optimizer, args: Namespace) -> None: ROC_AUC( output_transform=lambda output: (output.logit, output.label)).attach( engine=evaluator, name='roc_auc') Accuracy(output_transform=lambda output: ( (output.logit > 0).long(), output.label)).attach(engine=evaluator, name='accuracy') Loss(loss_fn=nn.BCEWithLogitsLoss(), output_transform=lambda output: (output.logit, output.label.float())).attach(engine=evaluator, name='loss') ProgressBar(persist=True, desc='Epoch').attach( engine=trainer, output_transform=lambda output: {'loss': output.loss}) ProgressBar(persist=False, desc='Eval').attach(engine=evaluator) ProgressBar(persist=True, desc='Eval').attach( engine=evaluator, metric_names=['roc_auc', 'accuracy', 'loss'], event_name=Events.EPOCH_COMPLETED, closing_event_name=Events.COMPLETED) @trainer.on(Events.ITERATION_COMPLETED(every=args.evaluation_interval)) def _evaluate(trainer: Engine): evaluator.run(valloader, max_epochs=1) evaluator.add_event_handler( event_name=Events.EPOCH_COMPLETED, handler=Checkpoint( to_save={ 'model': model, 'optimizer': optimizer, 'trainer': trainer }, save_handler=DiskSaver(dirname=args.checkpoint_dir, atomic=True, create_dir=True, require_empty=False), filename_prefix='best', score_function=lambda engine: engine.state.metrics['roc_auc'], score_name='val_roc_auc', n_saved=1, global_step_transform=global_step_from_engine(trainer)))
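# The output_transforms in set_handlers above assume the engines' process functions return
# an object exposing .logit, .label and .loss; a plausible minimal output type (an
# assumption, not the original definition):
from typing import NamedTuple
import torch

class StepOutput(NamedTuple):
    loss: torch.Tensor   # scalar training loss
    logit: torch.Tensor  # raw model scores, shape (batch,)
    label: torch.Tensor  # binary targets, shape (batch,)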
def attach(trainer, evaluator, verbose=VERBOSE_BATCH_WISE): from ignite.engine import Events from ignite.metrics import RunningAverage from ignite.contrib.handlers.tqdm_logger import ProgressBar RunningAverage(output_transform=lambda x: x[0]).attach( trainer, 'actor') RunningAverage(output_transform=lambda x: x[1]).attach( trainer, 'baseline') RunningAverage(output_transform=lambda x: x[2]).attach( trainer, 'reward') RunningAverage(output_transform=lambda x: x[3]).attach( trainer, '|param|') RunningAverage(output_transform=lambda x: x[4]).attach( trainer, '|g_param|') if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach( trainer, ['|param|', '|g_param|', 'actor', 'baseline', 'reward']) if verbose >= VERBOSE_EPOCH_WISE: @trainer.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_reward = engine.state.metrics['actor'] print('Epoch {} - |param|={:.2e} |g_param|={:.2e} BLEU={:.2f}'. format( engine.state.epoch, avg_p_norm, avg_g_norm, avg_reward, )) RunningAverage(output_transform=lambda x: x).attach(evaluator, 'BLEU') if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(evaluator, ['BLEU']) if verbose >= VERBOSE_EPOCH_WISE: @evaluator.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): avg_bleu = engine.state.metrics['BLEU'] print('Validation - BLEU={:.2f} best_BLEU={:.2f}'.format( avg_bleu, -engine.best_loss, ))
def attach(trainer, evaluator, verbose=VERBOSE_BATCH_WISE): from ignite.engine import Events from ignite.metrics import RunningAverage from ignite.contrib.handlers.tqdm_logger import ProgressBar RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss') RunningAverage(output_transform=lambda x: x[1]).attach( trainer, '|param|') RunningAverage(output_transform=lambda x: x[2]).attach( trainer, '|g_param|') if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(trainer, ['|param|', '|g_param|', 'loss']) if verbose >= VERBOSE_EPOCH_WISE: @trainer.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_loss = engine.state.metrics['loss'] print( 'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} ppl={:.2f}' .format( engine.state.epoch, avg_p_norm, avg_g_norm, avg_loss, np.exp(avg_loss), )) RunningAverage(output_transform=lambda x: x).attach(evaluator, 'loss') if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(evaluator, ['loss']) if verbose >= VERBOSE_EPOCH_WISE: @evaluator.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): avg_loss = engine.state.metrics['loss'] print( 'Validation - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}' .format( avg_loss, np.exp(avg_loss), engine.best_loss, np.exp(engine.best_loss), ))
def attach( train_engine, validation_engine, training_metric_names=['loss', 'ppl', '|param|', '|g_param|'], validation_metric_names=['loss', 'ppl'], verbose=VERBOSE_BATCH_WISE, ): def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name, ) for metric_name in training_metric_names: attach_running_average(train_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, training_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @train_engine.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_loss = engine.state.metrics['loss'] print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} ppl={:.2f}'.format( engine.state.epoch, avg_p_norm, avg_g_norm, avg_loss, np.exp(avg_loss), )) for metric_name in validation_metric_names: attach_running_average(validation_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(validation_engine, validation_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @validation_engine.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): avg_loss = engine.state.metrics['loss'] print('Validation - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}'.format( avg_loss, np.exp(avg_loss), engine.best_loss, np.exp(engine.best_loss), ))
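# Note on the 'ppl' metrics above: perplexity is the exponential of the average
# cross-entropy loss (measured in nats), which is why the epoch handlers print
# np.exp(avg_loss). For example, avg_loss = 4.6052 gives ppl = exp(4.6052) ~ 100.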
def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name, ) training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|'] for metric_name in training_metric_names: attach_running_average(train_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, training_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @train_engine.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}'.format( engine.state.epoch, engine.state.metrics['|param|'], engine.state.metrics['|g_param|'], engine.state.metrics['loss'], engine.state.metrics['accuracy'], )) validation_metric_names = ['loss', 'accuracy'] for metric_name in validation_metric_names: attach_running_average(validation_engine, metric_name) if verbose >= VERBOSE_EPOCH_WISE: @validation_engine.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): print('Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'.format( engine.state.metrics['loss'], engine.state.metrics['accuracy'], engine.best_loss, )) @staticmethod
def attach( train_engine, # validation_engine, training_metric_names=['loss', 'ppl', '|param|', '|g_param|'], # validation_metric_names = ['loss', 'ppl'], verbose=VERBOSE_BATCH_WISE, ): # Attaching would be repeated for several metrics. # Thus, we can reduce the repeated code by using this function. def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name, ) for metric_name in training_metric_names: attach_running_average(train_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, training_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @train_engine.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_loss = engine.state.metrics['loss'] print( 'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} ppl={:.2f}' .format( engine.state.epoch, avg_p_norm, avg_g_norm, avg_loss, np.exp(avg_loss), ))
def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE): def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name) training_metric_names = ["loss", "accuracy", "|param|", "|g_param|"] for metric_name in training_metric_names: attach_running_average(train_engine, metric_name) # print at every iteration if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, training_metric_names) # print when each epoch completes if verbose >= VERBOSE_EPOCH_WISE: @train_engine.on(Events.EPOCH_COMPLETED) def print_train_loss(engine): print( "Epoch {} - |param| = {:.2e} |g_param| = {:.2e} loss = {:.4e} accuracy = {:.4f}" .format(engine.state.epoch, engine.state.metrics["|param|"], engine.state.metrics["|g_param|"], engine.state.metrics["loss"], engine.state.metrics["accuracy"])) validation_metric_names = ["loss", "accuracy"] for metric_name in validation_metric_names: attach_running_average(validation_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(validation_engine, validation_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @validation_engine.on(Events.EPOCH_COMPLETED) def print_valid_loss(engine): print( "Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}" .format(engine.state.metrics["loss"], engine.state.metrics["accuracy"], engine.best_loss))
def __init__(self, config): """ Args: config (object): task configuration object """ self._validate(config) config_dict = config.asdict() config_dict['config_filepath'] = config.config_filepath self._update_attributes(config_dict) self.log_dir = None self.log_filepath = None self.logger = logging.getLogger(self.name) self.pbar = ProgressBar() self.log_level = logging.INFO self.writer = None
def __init__(self, name, model, log_dir, lr, lr_decay_step, adam=False): """ Initialize to train the given model. :param name: The name of the model to be trained. :param model: The model to be trained. :param log_dir: String. The log directory of the tensorboard. :param lr: Float. The learning rate. :param lr_decay_step: Integer. The amount of steps the learning rate decays. :param adam: Bool. Whether to use adam optimizer or not. """ super(Trainer, self).__init__(self.update_model) self.model = model # tqdm ProgressBar(persist=True).attach(self) # Optimizer params = [p for p in model.parameters() if p.requires_grad] if adam: self.optimizer = torch.optim.Adam(params, lr=lr) else: self.optimizer = torch.optim.SGD(params, lr=lr, momentum=0.9) # Scheduler if lr_decay_step > 0: self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=lr_decay_step, gamma=0.1) self.add_event_handler(Events.EPOCH_COMPLETED, lambda e: e.scheduler.step()) else: self.scheduler = None # Terminate if nan values found self.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) # Tensorboard logging self.tb_logger = TensorboardLogger(log_dir=os.path.join(log_dir, name)) self.add_event_handler(Events.COMPLETED, lambda x: self.tb_logger.close()) self.tb_logger.attach(self, log_handler=OptimizerParamsHandler(self.optimizer), event_name=Events.EPOCH_COMPLETED) self.tb_logger.attach(self, log_handler=OutputHandler(tag='training', output_transform=lambda x: { 'rpn_box_loss': round(self.state.output['loss_rpn_box_reg'].item(), 4), 'rpn_cls_loss': round(self.state.output['loss_objectness'].item(), 4), 'roi_box_loss': round(self.state.output['loss_box_reg'].item(), 4), 'roi_cls_loss': round(self.state.output['loss_classifier'].item(), 4) }), event_name=Events.EPOCH_COMPLETED) # Run on GPU (cuda) if available if torch.cuda.is_available(): torch.cuda.set_device(int(get_free_gpu())) model.cuda(torch.cuda.current_device())
def attach(trainer, evaluator, verbose=2): from ignite.engine import Events from ignite.metrics import RunningAverage from ignite.contrib.handlers.tqdm_logger import ProgressBar RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss') RunningAverage(output_transform=lambda x: x[1]).attach( trainer, '|param|') RunningAverage(output_transform=lambda x: x[2]).attach( trainer, '|g_param|') if verbose >= 2: pbar = ProgressBar() pbar.attach(trainer, ['|param|', '|g_param|', 'loss']) if verbose >= 1: @trainer.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_loss = engine.state.metrics['loss'] print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e}'. format(engine.state.epoch, avg_p_norm, avg_g_norm, avg_loss)) RunningAverage(output_transform=lambda x: x).attach(evaluator, 'loss') if verbose >= 2: pbar = ProgressBar() pbar.attach(evaluator, ['loss']) if verbose >= 1: @evaluator.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): avg_loss = engine.state.metrics['loss'] print('Validation - loss={:.4e} lowest_loss={:.4e}'.format( avg_loss, engine.lowest_loss))
def __init__(self, model, tb_logger): """ Initialize to evaluate the given model. :param model: The model to be evaluated. :param tb_logger: The tensorboard to be logged to. """ super(Evaluator, self).__init__(self.predict_on_batch) self.model = model # FROC avg_fps = list(range(1, 26)) avg_fps.append(0.5) avg_fps.sort() tags = ['froc_{}fp'.format(fp) for fp in avg_fps] for avg_fp, tag in zip(avg_fps, tags): FROC([avg_fp], iou_threshold=0.5).attach(self, tag) # tqdm ProgressBar(persist=True).attach(self) # Tensorboard logging tb_logger.attach(self, log_handler=OutputHandler(tag='validation', metric_names=tags, global_step_transform=lambda engine, name: engine.state.epoch), event_name=Events.EPOCH_COMPLETED)
def get_trainer(model: Module, learning_rate: float, loss: Callable) -> Engine: """ construct a trainer ``ignite`` engine with pre-attached progress bar and loss running average :param model: a network to train :param learning_rate: the learning rate for training :param loss: a loss to minimise during training :returns: an ``ignite`` trainer """ trainer = create_supervised_trainer( model, Adam(model.parameters(), lr=learning_rate), loss, CURRENT_DEVICE, ) RunningAverage(output_transform=lambda x: x).attach( trainer, "running_loss") ProgressBar().attach( trainer, output_transform=lambda x: x, event_name=Events.EPOCH_COMPLETED, closing_event_name=Events.COMPLETED, ) return trainer
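# A minimal usage sketch for get_trainer above; the toy model and data are illustrative
# assumptions, and CURRENT_DEVICE is presumed to be defined alongside the original helper.
import torch
from torch.nn import Linear
from torch.nn.functional import mse_loss

model = Linear(4, 1)
data = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(10)]
trainer = get_trainer(model, learning_rate=1e-3, loss=mse_loss)
trainer.run(data, max_epochs=2)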
def attach(train_engine, validation_engine, training_metric_names=[ 'x2y', 'y2x', 'reg', '|param|', '|g_param|' ], validation_metric_names=['x2y', 'y2x'], verbose=VERBOSE_BATCH_WISE): # Attaching would be repeated for several metrics. # Thus, we can reduce the repeated code by using this function. def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name, ) for metric_name in training_metric_names: attach_running_average(train_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, training_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @train_engine.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_x2y = engine.state.metrics['x2y'] avg_y2x = engine.state.metrics['y2x'] avg_reg = engine.state.metrics['reg'] print( 'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss_x2y={:.4e} ppl_x2y={:.2f} loss_y2x={:.4e} ppl_y2x={:.2f} dual_loss={:.4e}' .format( engine.state.epoch, avg_p_norm, avg_g_norm, avg_x2y, np.exp(avg_x2y), avg_y2x, np.exp(avg_y2x), avg_reg, )) for metric_name in validation_metric_names: attach_running_average(validation_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(validation_engine, validation_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @validation_engine.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): avg_x2y = engine.state.metrics['x2y'] avg_y2x = engine.state.metrics['y2x'] print( 'Validation X2Y - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}' .format( avg_x2y, np.exp(avg_x2y), engine.best_x2y, np.exp(engine.best_x2y), )) print( 'Validation Y2X - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}' .format( avg_y2x, np.exp(avg_y2x), engine.best_y2x, np.exp(engine.best_y2x), ))
class BasicTrainTask(BaseTask): name = "Train Task" def _validate(self, config): """ Method to check if specific configuration is correct. Raises AssertError if is incorrect. """ assert isinstance(config, BasicTrainConfig), \ "Configuration should be instance of `BasicTrainConfig`, but given {}".format(type(config)) def _start(self): """Method to run the task """ if 'cuda' in self.device: self.model = self.model.to(self.device) mlflow.log_param("model", get_object_name(self.model)) self.logger.debug("Setup criterion") if "cuda" in self.device: self.criterion = self.criterion.to(self.device) mlflow.log_param("criterion", get_object_name(self.criterion)) mlflow.log_param("optimizer", get_object_name(self.optimizer)) self.logger.debug("Setup ignite trainer") trainer = self._setup_trainer() self._setup_trainer_handlers(trainer) metrics = {'loss': Loss(self.criterion)} metrics.update(self.metrics) self.logger.debug("Input data info: ") msg = "- train data loader: {} number of batches".format( len(self.train_dataloader)) if isinstance(self.train_dataloader, DataLoader): msg += " | {} number of samples".format( len(self.train_dataloader.sampler)) self.logger.debug(msg) if isinstance(self.train_dataloader, DataLoader): write_model_graph(self.writer, model=self.model, data_loader=self.train_dataloader, device=self.device) self.pbar_eval = None if self.train_eval_dataloader is not None: self.pbar_eval = ProgressBar() self._setup_offline_train_metrics_computation(trainer, metrics) if self.val_dataloader is not None: if self.val_metrics is None: self.val_metrics = metrics if self.pbar_eval is None: self.pbar_eval = ProgressBar() val_evaluator = self._setup_val_metrics_computation(trainer) if self.reduce_lr_on_plateau is not None: assert self.reduce_lr_on_plateau_var in self.val_metrics, \ "Monitor variable {} is not found in metrics {}" \ .format(self.reduce_lr_on_plateau_var, metrics) @val_evaluator.on(Events.COMPLETED) def update_reduce_on_plateau(engine): val_var = engine.state.metrics[ self.reduce_lr_on_plateau_var] self.reduce_lr_on_plateau.step(val_var) def default_score_function(engine): val_loss = engine.state.metrics['loss'] # Objects with highest scores will be retained. 
return -val_loss # Setup early stopping: if self.early_stopping_kwargs is not None: if 'score_function' in self.early_stopping_kwargs: es_score_function = self.early_stopping_kwargs[ 'score_function'] else: es_score_function = default_score_function self._setup_early_stopping(trainer, val_evaluator, es_score_function) # Setup model checkpoint: if self.model_checkpoint_kwargs is None: self.model_checkpoint_kwargs = { "filename_prefix": "model", "score_name": "val_loss", "score_function": default_score_function, "n_saved": 3, "atomic": True, "create_dir": True, "save_as_state_dict": True } self._setup_best_model_checkpointing(val_evaluator) self.logger.debug("Setup other handlers") if self.lr_scheduler is not None: @trainer.on(Events.ITERATION_STARTED) def update_lr_scheduler(engine): self.lr_scheduler.step() self._setup_log_learning_rate(trainer) self.logger.info("Start training: {} epochs".format(self.num_epochs)) mlflow.log_param("num_epochs", self.num_epochs) trainer.run(self.train_dataloader, max_epochs=self.num_epochs) self.logger.debug("Training is ended") def _setup_trainer(self): trainer = create_supervised_trainer(self.model, self.optimizer, self.criterion, device=self.device, non_blocking='cuda' in self.device) return trainer def _setup_trainer_handlers(self, trainer): # Setup timer to measure training time timer = setup_timer(trainer) self._setup_log_training_loss(trainer) @trainer.on(Events.EPOCH_COMPLETED) def log_training_time(engine): self.logger.info("One epoch training time (seconds): {}".format( timer.value())) last_model_saver = ModelCheckpoint( self.log_dir.as_posix(), filename_prefix="checkpoint", save_interval=self.trainer_checkpoint_interval, n_saved=1, atomic=True, create_dir=True, save_as_state_dict=True) model_name = get_object_name(self.model) to_save = { model_name: self.model, "optimizer": self.optimizer, } if self.lr_scheduler is not None: to_save['lr_scheduler'] = self.lr_scheduler trainer.add_event_handler(Events.ITERATION_COMPLETED, last_model_saver, to_save) trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) def _setup_log_training_loss(self, trainer): self.avg_output = RunningAverage(output_transform=lambda out: out) self.avg_output.attach(trainer, 'running_avg_loss') self.pbar.attach(trainer, ['running_avg_loss']) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iteration = (engine.state.iteration - 1) % len( self.train_dataloader) + 1 if iteration % self.log_interval == 0: # self.logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(engine.state.epoch, iteration, # len(self.train_dataloader), # engine.state.output)) self.writer.add_scalar("training/loss_vs_iterations", engine.state.output, engine.state.iteration) mlflow.log_metric("training_loss_vs_iterations", engine.state.output) def _setup_log_learning_rate(self, trainer): @trainer.on(Events.EPOCH_STARTED) def log_lrs(engine): if len(self.optimizer.param_groups) == 1: lr = float(self.optimizer.param_groups[0]['lr']) self.logger.debug("Learning rate: {}".format(lr)) self.writer.add_scalar("learning_rate", lr, engine.state.epoch) mlflow.log_metric("learning_rate", lr) else: for i, param_group in enumerate(self.optimizer.param_groups): lr = float(param_group['lr']) self.logger.debug("Learning rate (group {}): {}".format( i, lr)) self.writer.add_scalar("learning_rate_group_{}".format(i), lr, engine.state.epoch) mlflow.log_metric("learning_rate_group_{}".format(i), lr) def _setup_offline_train_metrics_computation(self, trainer, metrics): train_eval_loader = 
self.train_eval_dataloader msg = "- train evaluation data loader: {} number of batches".format( len(train_eval_loader)) if isinstance(train_eval_loader, DataLoader): msg += " | {} number of samples".format( len(train_eval_loader.sampler)) self.logger.debug(msg) train_evaluator = create_supervised_evaluator(self.model, metrics=metrics, device=self.device, non_blocking="cuda" in self.device) self.pbar_eval.attach(train_evaluator) @trainer.on(Events.EPOCH_COMPLETED) def log_training_metrics(engine): epoch = engine.state.epoch if epoch % self.val_interval_epochs == 0: self.logger.debug("Compute training metrics") metrics_results = train_evaluator.run( train_eval_loader).metrics self.logger.info("Training Results - Epoch: {}".format(epoch)) for name in metrics_results: self.logger.info("\tAverage {}: {:.5f}".format( name, metrics_results[name])) self.writer.add_scalar("training/avg_{}".format(name), metrics_results[name], epoch) mlflow.log_metric("training_avg_{}".format(name), metrics_results[name]) return train_evaluator def _setup_val_metrics_computation(self, trainer): val_evaluator = create_supervised_evaluator(self.model, metrics=self.val_metrics, device=self.device, non_blocking="cuda" in self.device) self.pbar_eval.attach(val_evaluator) msg = "- validation data loader: {} number of batches".format( len(self.val_dataloader)) if isinstance(self.val_dataloader, DataLoader): msg += " | {} number of samples".format( len(self.val_dataloader.sampler)) self.logger.debug(msg) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): epoch = engine.state.epoch if epoch % self.val_interval_epochs == 0: self.logger.debug("Compute validation metrics") metrics_results = val_evaluator.run( self.val_dataloader).metrics self.logger.info( "Validation Results - Epoch: {}".format(epoch)) for name in metrics_results: self.logger.info("\tAverage {}: {:.5f}".format( name, metrics_results[name])) self.writer.add_scalar("validation/avg_{}".format(name), metrics_results[name], epoch) mlflow.log_metric("validation_avg_{}".format(name), metrics_results[name]) return val_evaluator def _setup_early_stopping(self, trainer, val_evaluator, score_function): kwargs = dict(self.early_stopping_kwargs) if 'score_function' not in kwargs: kwargs['score_function'] = score_function handler = EarlyStopping(trainer=trainer, **kwargs) setup_logger(handler._logger, self.log_filepath, self.log_level) val_evaluator.add_event_handler(Events.COMPLETED, handler) def _setup_best_model_checkpointing(self, val_evaluator): model_name = get_object_name(self.model) best_model_saver = ModelCheckpoint(self.log_dir.as_posix(), **self.model_checkpoint_kwargs) val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver, {model_name: self.model})
def run(root_path, log_path, student_class_module, teacher_class_module, student_class_name, teacher_class_name, init_interval, hard_ratio, train_targets, test_targets, test_camera_base, augmentation_types, batch_size, n_workers, save_interval, n_saved, gpu_ids, max_epochs=150, init_lr_student_conv=.01, init_lr_teacher_conv=.01, init_lr_student_classifier=.01, init_lr_teacher_classifier=.1, lr_decay_step=100, lr_decay_rate=.1): device = 'cuda:{}'.format(gpu_ids[0]) train_transformer = Transformer(True, augmentation_types) test_transformer = Transformer(False, []) train_dataset = TrainDatasetWrapper(root_path, train_targets, train_transformer) train_loader = utils.data.DataLoader(train_dataset, batch_size, shuffle=True, num_workers=n_workers, pin_memory=True) test_datasets = [] for test_target in test_targets: test_datasets.append( TestDatasetWrapper(root_path, test_target, test_transformer, test_camera_base)) loader_caller = _get_test_data_loader_caller(batch_size, n_workers) student_class_module = importlib.import_module(student_class_module) student_model_class = getattr(student_class_module, student_class_name) teacher_class_module = importlib.import_module(teacher_class_module) teacher_model_class = getattr(teacher_class_module, teacher_class_name) models = { 'student': student_model_class(train_dataset.n_classes), 'teacher': teacher_model_class(train_dataset.n_classes), 'generator': teacher_model_class(train_dataset.n_classes) } loss_functions = { 'student': SoftLabelLoss(), 'teacher': nn.CrossEntropyLoss() } student_classifier_parameters = list( models['student'].classifier.parameters()) student_classifier_parameters_ids = [] for p in student_classifier_parameters: student_classifier_parameters_ids.append(id(p)) student_conv_parameters = [] for p in models['student'].parameters(): if id(p) not in student_classifier_parameters_ids: student_conv_parameters.append(p) teacher_classifier_parameters = list( models['teacher'].classifier.parameters()) teacher_classifier_parameters_ids = [] for p in teacher_classifier_parameters: teacher_classifier_parameters_ids.append(id(p)) teacher_conv_parameters = [] for p in models['teacher'].parameters(): if id(p) not in teacher_classifier_parameters_ids: teacher_conv_parameters.append(p) optimizers = { 'student_conv': optim.SGD(student_conv_parameters, init_lr_student_conv, momentum=.9, weight_decay=5e-4, nesterov=True), 'student_classifier': optim.SGD(student_classifier_parameters, init_lr_student_classifier, momentum=.9, weight_decay=5e-4, nesterov=True), 'teacher_conv': optim.SGD(teacher_conv_parameters, init_lr_teacher_conv, momentum=.9, weight_decay=5e-4, nesterov=True), 'teacher_classifier': optim.SGD(teacher_classifier_parameters, init_lr_teacher_classifier, momentum=.9, weight_decay=5e-4, nesterov=True), } schedulers = { 'student_conv': optim.lr_scheduler.StepLR(optimizers['student_conv'], lr_decay_step, gamma=lr_decay_rate), 'student_classifier': optim.lr_scheduler.StepLR(optimizers['student_classifier'], lr_decay_step, gamma=lr_decay_rate), 'teacher_conv': optim.lr_scheduler.StepLR(optimizers['teacher_conv'], lr_decay_step, gamma=lr_decay_rate), } writer = SummaryWriter(log_dir=log_path) trainer = create_supervised_soft_label_trainer( models, optimizers, loss_functions, hard_ratio, init_interval, device=device, non_blocking=True, output_transform=lambda x, y, y_pred_student, y_pred_teacher, loss_student, loss_teacher: (y, y_pred_student, y_pred_teacher, loss_student, loss_teacher)) RunningAverage(output_transform=lambda output: 
output[3].item()).attach( trainer, 'loss_student') RunningAverage(output_transform=lambda output: output[4].item()).attach( trainer, 'loss_teacher') Accuracy(output_transform=lambda output: (output[1], output[0])).attach( trainer, 'accuracy_student') Accuracy(output_transform=lambda output: (output[2], output[0])).attach( trainer, 'accuracy_teacher') progress_bar = ProgressBar() progress_bar.attach(trainer, ['loss_student', 'loss_teacher']) checkpointer = ModelCheckpoint(log_path, 'checkpoint', save_interval=save_interval, n_saved=n_saved) rank_accuracy = RankAccuracy(n_workers) evaluator = create_supervised_evaluator(models['student'], metrics={'rank': rank_accuracy}, device=device, non_blocking=True) trainer.add_event_handler( Events.EPOCH_COMPLETED, _get_result_write_function(rank_accuracy, test_datasets, loader_caller, evaluator, writer)) trainer.add_event_handler( Events.EPOCH_COMPLETED, _get_init_classifier_function(models, optimizers['teacher_classifier'], init_interval)) trainer.add_event_handler(Events.ITERATION_COMPLETED, _get_loss_write_function(writer)) trainer.add_event_handler(Events.EPOCH_COMPLETED, _get_lr_decay_function(schedulers)) trainer.add_event_handler(Events.EPOCH_COMPLETED, _get_lr_write_function(optimizers, writer)) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpointer, { 'student_model': models['student'], 'teacher_model': models['teacher'], 'generator_model': models['generator'] }) trainer.run(train_loader, max_epochs=max_epochs) writer.close()
def main(): args = get_args() if 'e-SNLI-VE' in args.data_path: args.no_image = False else: args.no_image = True if not args.no_image: args.no_premise = True args.with_expl = True '''Setup''' t = datetime.today() output_dir = os.path.join(args.output_folder, f"{t.month}_{t.day}_{t.hour}_{t.minute}_{t.second}") if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes logging.basicConfig(filename=os.path.join(output_dir, 'app.log'), filemode='a', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) # This is a logger.warning: it will be printed by all distributed processes logger.warning(f"Running process {args.local_rank}") logger.info(f"Arguments: {pformat(args)}") logger.info(f'Image not used:{args.no_image}') logger.info(f'Premise not used:{args.no_premise}') logger.info(f'Explanations used:{args.with_expl}') '''Initialize distributed training if needed''' args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info( "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint) tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT) if args.no_image: model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint) else: import image_gpt2_291 model = image_gpt2_291.GPT2LMHeadModel.from_pretrained( args.model_checkpoint) model.resize_token_embeddings(len(tokenizer)) model.to(args.device) optimizer = AdamW(model.parameters(), lr=args.lr) ''' Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) ''' if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) model = model.module logger.info("Prepare datasets") train_loader, val_loader = get_data_loaders(args, tokenizer) '''Training function and trainer''' def train(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) if args.no_image: input_ids, lm_label, label, input_mask = batch else: image, input_ids, lm_label, label, input_mask = batch if args.no_image: output = model(input_ids=input_ids, # attention_mask=input_mask, labels=lm_label) else: output = model(input_ids=input_ids, images=image, # attention_mask=input_mask, labels=lm_label) loss, logits, _ = output loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() if not args.with_expl: lbl_accuracy = torch.eq(label, logits.argmax( dim=1)).float().sum() / len(label) return { 'loss': loss.item(), 'lbl_accuracy': lbl_accuracy.item() } else: if engine.state.iteration % (args.gradient_accumulation_steps * 500) == 0: input_output = list(zip(input_ids, logits)) random_item = 
random.choice(input_output) in_sent = tokenizer.decode(list(filter( lambda x: x != tokenizer.eos_token_id, random_item[0]))) out_expl = tokenizer.decode(random_item[1].argmax(dim=1), skip_special_tokens=True) logger.info(f'MODEL INPUT: {in_sent}') logger.info(f'GEN. EXPL {out_expl}') logger.info('--------------------------------') return { 'loss': loss.item(), } '''Validation function and validator (validator output is the input of the metrics)''' def validation(engine, batch): model.eval() with torch.no_grad(): batch = tuple(input_tensor.to(args.device) for input_tensor in batch) if args.no_image: input_ids, lm_label, label, input_mask = batch else: image, input_ids, lm_label, label, input_mask = batch if args.no_image: output = model(input_ids=input_ids, # attention_mask=input_mask ) else: output = model(input_ids=input_ids, images=image, # attention_mask=input_mask ) logits, _ = output logits_shifted = logits[..., :-1, :].contiguous().view(-1, logits.size(-1)) labels_shifted = lm_label[..., 1:].contiguous().view(-1) return logits_shifted, labels_shifted '''Engines''' trainer = Engine(train) validator = Engine(validation) # t_total = len( # train_loader) // args.gradient_accumulation_steps * args.n_epochs # scheduler = get_linear_schedule_with_warmup( # optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) '''Linearly decrease the learning rate from lr to zero''' scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) ''' Attach validation to trainer: we evaluate when we start the training and at the end of each epoch ''' trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: validator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: validator.run(val_loader)) '''Prepare metrics - note how we compute distributed metrics''' RunningAverage(output_transform=lambda x: x['loss']).attach( trainer, "loss") RunningAverage(output_transform=lambda x: math.exp( average_distributed_scalar(x['loss'], args))).attach(trainer, "ppl") if not args.with_expl: RunningAverage(output_transform=lambda x: 100 * x['lbl_accuracy']).attach( trainer, "lbl_accuracy") metrics = {} metrics["lbl_loss"] = Loss(torch.nn.CrossEntropyLoss(), output_transform=lambda x: (x[0], x[1])) metrics["loss"] = MetricsLambda( lambda l, a: average_distributed_scalar( l / a.gradient_accumulation_steps, a), metrics["lbl_loss"], args) metrics["ppl"] = MetricsLambda(math.exp, metrics["loss"]) if not args.with_expl: metrics["lbl_accuracy"] = 100 * \ Accuracy(output_transform=lambda x: (x[0], x[1])) for name, metric in metrics.items(): metric.attach(validator, name) ''' On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train ''' if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss", 'ppl'] if args.with_expl else ["loss", 'lbl_accuracy', 'ppl']) validator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(validator.state.metrics))) tb_logger = TensorboardLogger(log_dir=output_dir) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(trainer, log_handler=OutputHandler( tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OutputHandler( tag="training", 
metric_names=["ppl"] if args.with_expl else ["lbl_accuracy", "ppl"]), event_name=Events.EPOCH_COMPLETED) tb_logger.attach(validator, log_handler=OutputHandler( tag="validation", metric_names=[ 'ppl', 'loss'] if args.with_expl else['ppl', 'loss', 'lbl_accuracy'], global_step_transform=lambda *args, **kwargs: trainer.state.iteration), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(output_dir, 'checkpoint', n_saved=8, require_empty=False) trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1), checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation torch.save(args, os.path.join(output_dir, 'model_training_args.bin')) getattr(model, 'module', model).config.to_json_file( os.path.join(output_dir, CONFIG_NAME)) tokenizer.save_vocabulary(output_dir) '''Run the training''' trainer.run(train_loader, max_epochs=args.n_epochs)
def evaluate(self, dl): self.tester = Engine(self.test_step) self.attach_metrics(self.tester, self.test_metrics) if self.hparams.add_pbar: ProgressBar(persist=False).attach(self.tester) self.tester.run(dl, epoch_length=self.hparams.val_length)
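# attach_metrics above is project-specific; a plausible minimal implementation (an
# assumption, not the original) attaches each named ignite metric to the given engine:
def attach_metrics(self, engine, metrics):
    for name, metric in metrics.items():
        metric.attach(engine, name)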
def setup(self): self._init_distribution() self.trainer = Engine(self.train_step) self.trainer.logger = setup_logger(name="trainer", distributed_rank=self.local_rank) self.log_basic_info(self.trainer.logger) self.load_trainer_from_checkpoint() if self.scheduler: self.scheduler_event = self.trainer.add_event_handler( Events.ITERATION_STARTED, self.scheduler) else: self.scheduler_event = None self.attach_metrics(self.trainer, self.train_metrics) if idist.get_world_size() > 1: def set_epoch(engine): self.train_loader.sampler.set_epoch(engine.state.epoch) self.trainer.add_event_handler(Events.EPOCH_STARTED, set_epoch) common.setup_common_training_handlers( self.trainer, train_sampler=self.train_loader.sampler, to_save=None, save_every_iters=0, output_path=None, lr_scheduler=None, output_names=None, with_pbars=self.hparams.add_pbar, clear_cuda_cache=True, stop_on_nan=False) self.evaluator = Engine(self.eval_step) self.evaluator.logger = setup_logger("evaluator", distributed_rank=self.local_rank) if self.hparams.add_pbar: ProgressBar(persist=False).attach(self.evaluator) def complete_clear(engine): engine.state.batch = None engine.state.output = None import gc gc.collect() self.trainer.add_event_handler(Events.EPOCH_COMPLETED, complete_clear) self.validation_handler_event = self.trainer.add_event_handler( Events.EPOCH_COMPLETED(every=self.hparams.eval_every), self.validate(self.valid_loader)) self.evaluator.add_event_handler(Events.EPOCH_COMPLETED, complete_clear) train_handler_params = { "model": self.model, "optimizer": self.optimizer, "scheduler": self.scheduler } eval_handler_params = { "model": self.model, "optimizer": self.optimizer, "scheduler": self.scheduler } to_save = { "model": self.model, "trainer": self.trainer, "optimizer": self.optimizer } if self.scheduler is not None: to_save["scheduler"] = self.scheduler if USE_AMP: to_save["amp"] = amp self.attach_metrics(self.evaluator, self.validation_metrics) self.setup_checkpoint_saver(to_save) if self.rank == 0: self._init_logger() if self.logger: self.logger._init_logger(self.trainer, self.evaluator) self.logger._add_train_events(**train_handler_params) self.logger._add_eval_events(**eval_handler_params)
# In[41]: handler = EarlyStopping( patience=6, score_function=lambda engine: engine.state.metrics['accuracy'], trainer=trainer) val_evaluator.add_event_handler(Events.COMPLETED, handler) # In[42]: checkpoints = ModelCheckpoint('models', f'Model_{model_name}_3channels', save_interval=3, n_saved=15, create_dir=True) trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoints, {f'{learning_rate_str}': model}) # In[43]: pbar = ProgressBar(bar_format='') pbar.attach(trainer) # In[44]: print('Training started') trainer.run(loader, max_epochs=50) # In[ ]: