Example #1
    def attach_test(validation_engine, verbose=VERBOSE_BATCH_WISE):
        # Attaching would be repeated for several metrics.
        # Thus, we can reduce the repeated code by using this function.
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )

        # If the verbosity is set, a progress bar is shown for mini-batch iterations.
        # Without ignite, you would use tqdm directly to implement the progress bar.

        validation_metric_names = ['loss', 'accuracy']

        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)

        # Do same things for validation engine.
        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(validation_engine, validation_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:

            @validation_engine.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                print('Test - loss={:.4e} accuracy={:.4f} '.format(
                    engine.state.metrics['loss'],
                    engine.state.metrics['accuracy']))
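The helper above assumes the VERBOSE_* constants, the ignite imports, and an already-built validation engine, none of which appear in the snippet. A minimal sketch of that missing context, assuming the verbosity levels are plain integers and that the step function returns the 'loss' and 'accuracy' keys the helper reads:

from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage
from ignite.contrib.handlers.tqdm_logger import ProgressBar

# Assumed verbosity levels; higher values mean more frequent output.
VERBOSE_SILENT = 0
VERBOSE_EPOCH_WISE = 1
VERBOSE_BATCH_WISE = 2

def validate(engine, mini_batch):
    # Hypothetical step function; it must return the keys that attach_test()
    # reads through its output_transform lambdas.
    return {'loss': 0.0, 'accuracy': 0.0}

validation_engine = Engine(validate)
attach_test(validation_engine, verbose=VERBOSE_BATCH_WISE)
# validation_engine.run(valid_loader, max_epochs=1)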
Example #2
    def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE):
        # Attaching would be repeated for several metrics.
        # Thus, we can reduce the repeated code by using this function.
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )
            # RunningAverage: given the per-mini-batch value returned by the
            # step function, it keeps and reports a running statistic.

        training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|']

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        # If the verbosity is set, a progress bar is shown for mini-batch iterations.
        # Without ignite, you would use tqdm directly to implement the progress bar.
        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, training_metric_names)
            # This tells ignite to render the progress bar.

        # If the verbosity is set, statistics are shown after each epoch.
        if verbose >= VERBOSE_EPOCH_WISE:

            @train_engine.on(Events.EPOCH_COMPLETED)
            # Print when each epoch completes.
            def print_train_logs(engine):
                print(
                    'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}'
                    .format(
                        engine.state.epoch,
                        engine.state.metrics['|param|'],
                        engine.state.metrics['|g_param|'],
                        engine.state.metrics['loss'],
                        engine.state.metrics['accuracy'],
                    ))

        validation_metric_names = ['loss', 'accuracy']

        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)

        # Do same things for validation engine.
        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(validation_engine, validation_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:

            @validation_engine.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                print(
                    'Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'
                    .format(
                        engine.state.metrics['loss'],
                        engine.state.metrics['accuracy'],
                        engine.best_loss,
                    ))
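print_valid_logs reads engine.best_loss, which is not an ignite attribute; the surrounding code (not shown) has to initialize and update it. A minimal sketch of one common way to do that, with assumed names:

validation_engine.best_loss = float('inf')

@validation_engine.on(Events.EPOCH_COMPLETED)
def check_best(engine):
    # Hypothetical handler: remember the lowest validation loss so that
    # print_valid_logs() can report engine.best_loss.
    loss = float(engine.state.metrics['loss'])
    if loss <= engine.best_loss:
        engine.best_loss = loss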
Example #3
def attach_pbar_and_metrics(trainer, evaluator):
    loss_metric = Average(output_transform=lambda output: output["loss"])
    accuracy_metric = Accuracy(
        output_transform=lambda output: (output["logit"], output["label"]))
    pbar = ProgressBar()
    loss_metric.attach(trainer, "loss")
    accuracy_metric.attach(trainer, "accuracy")
    accuracy_metric.attach(evaluator, "accuracy")
    pbar.attach(trainer)
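Average and Accuracy above pull their inputs from the engine output via output_transform, so the step functions behind trainer and evaluator must return a dict with 'loss', 'logit' and 'label' entries. A sketch of a compatible training step, using a hypothetical stand-in model:

import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(10, 5)  # hypothetical stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

def train_step(engine, batch):
    # The keys of the returned dict are the contract the metrics above rely on.
    x, label = batch
    logit = model(x)
    loss = F.cross_entropy(logit, label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return {"loss": loss, "logit": logit, "label": label}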
Example #4
def main(config):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    train_loader, valid_loader, test_loader = load_dataloader_for_featureNet(
        config)

    model = DeepSleepNet(input_dim=1,
                         n_classes=5,
                         is_train=True,
                         use_dropout=config.use_dropout,
                         use_rnn=config.use_rnn).to(device)
    optimizer = optim.Adam(model.parameters())
    crit = nn.CrossEntropyLoss()

    data = torch.load("./folder0_model.pth")
    model.load_state_dict(data["model"])

    def validate(engine, mini_batch):
        engine.model.eval()

        with torch.no_grad():
            x, y = mini_batch
            x, y = x.to(engine.device), y.to(engine.device)

            y_hat = engine.model(x)

            loss = engine.crit(y_hat, y)

            if isinstance(y, torch.LongTensor) or isinstance(
                    y, torch.cuda.LongTensor):
                accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(
                    y.size(0))
            else:
                accuracy = 0

        return {'loss': float(loss), 'accuracy': float(accuracy)}

    test_engine = MyEngine(validate, model, crit, optimizer, config)

    if config.verbose >= 2:
        print(model)
        print(optimizer)
        print(crit)

    def log_metrics(engine, title):
        print(engine.state.metrics.items())
        print(f"{title} accuracy: {engine.state.metrics['accuracy']:.2f}")

    test_engine.add_event_handler(Events.EPOCH_COMPLETED, log_metrics, 'test')

    RunningAverage(output_transform=lambda x: x['accuracy']).attach(
        test_engine, 'accuracy')
    pbar = ProgressBar()
    pbar.attach(test_engine, ['accuracy'])

    test_engine.run(test_loader, max_epochs=1)
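MyEngine itself is not shown; validate() only relies on it exposing model, crit and device attributes. A minimal stand-in under that assumption:

from ignite.engine import Engine

class MyEngine(Engine):
    # Hypothetical minimal version of the class used above: it stores the
    # objects that the step function reads back from the engine.
    def __init__(self, func, model, crit, optimizer, config):
        super().__init__(func)
        self.model = model
        self.crit = crit
        self.optimizer = optimizer
        self.config = config
        self.device = next(model.parameters()).device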
Example #5
    def attach(train_engine,
               validation_engine,
               training_metric_names=[
                   'actor', 'baseline', 'risk', '|param|', '|g_param|'
               ],
               validation_metric_names=[
                   'BLEU',
               ],
               verbose=VERBOSE_BATCH_WISE):
        # Attaching would be repeated for several metrics.
        # Thus, we can reduce the repeated code by using this function.
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, training_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:

            @train_engine.on(Events.EPOCH_COMPLETED)
            def print_train_logs(engine):
                avg_p_norm = engine.state.metrics['|param|']
                avg_g_norm = engine.state.metrics['|g_param|']
                avg_reward = engine.state.metrics['actor']

                print('Epoch {} - |param|={:.2e} |g_param|={:.2e} BLEU={:.2f}'.
                      format(
                          engine.state.epoch,
                          avg_p_norm,
                          avg_g_norm,
                          avg_reward,
                      ))

        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(validation_engine, validation_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:

            @validation_engine.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                avg_bleu = engine.state.metrics['BLEU']
                print('Validation - BLEU={:.2f} best_BLEU={:.2f}'.format(
                    avg_bleu,
                    -engine.best_loss,
                ))
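Note that best_BLEU is printed as -engine.best_loss. A plausible reading (an assumption about code not shown here) is that the validation handler stores the negated BLEU so the usual lower-is-better bookkeeping can be reused:

@validation_engine.on(Events.EPOCH_COMPLETED)
def check_best(engine):
    # Hypothetical: BLEU is maximized, so its negation is tracked like a loss.
    bleu = float(engine.state.metrics['BLEU'])
    if -bleu <= engine.best_loss:
        engine.best_loss = -bleu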
Example #6
    def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE):
        '''Attach metrics and report/print the current training status.'''
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine, metric_name)

        '''
        Train Attach Process
        '''
        training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|']

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, training_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:

            @train_engine.on(Events.EPOCH_COMPLETED)
            def print_train_tag(engine):
                print(
                    'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}'
                    .format(
                        engine.state.epoch,
                        engine.state.metrics['|param|'],
                        engine.state.metrics['|g_param|'],
                        engine.state.metrics['loss'],
                        engine.state.metrics['accuracy'],
                    ))

        '''
        Validate Attach Process
        '''
        validation_metric_names = ['loss', 'accuracy']

        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(validation_engine, validation_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:

            @validation_engine.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                print(
                    'Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'
                    .format(
                        engine.state.metrics['loss'],
                        engine.state.metrics['accuracy'],
                        engine.best_loss,
                    ))
Example #7
def create_supervised_trainer_skipgram(model,
                                       optimizer,
                                       prepare_batch,
                                       metrics={},
                                       device=None,
                                       log_dir='output/log/',
                                       checkpoint_dir='output/checkpoints/',
                                       checkpoint_every=None,
                                       tensorboard_every=50) -> Engine:
    def _prepare_batch(batch):

        return batch

    def _update(engine, batch):
        model.train()
        optimizer.zero_grad()

        batch = _prepare_batch(batch)
        batch_loss = model._loss(batch)
        loss = batch_loss.mean()

        loss.backward()
        optimizer.step()

        # Only the loss is available here; predictions and targets would be
        # needed to attach accuracy metrics (see the sketch below).
        return {'loss': loss.item()}

    model.to(device)
    engine = Engine(_update)

    # Metrics
    RunningAverage(output_transform=lambda x: x['loss']).attach(
        engine, 'average_loss')

    # TQDM
    pbar = ProgressBar(persist=True)
    pbar.attach(engine, ['average_loss'])

    # Checkpoint saving
    # to_save = {'model': model, 'optimizer': optimizer, 'engine': engine}
    final_checkpoint_handler = Checkpoint({'model': model},
                                          DiskSaver(checkpoint_dir,
                                                    create_dir=True),
                                          n_saved=None,
                                          filename_prefix='final')

    engine.add_event_handler(Events.COMPLETED, final_checkpoint_handler)

    @engine.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        metrics = engine.state.metrics
        # Only 'average_loss' is attached above, so only that metric is printed.
        print(f"Epoch results - Avg loss: {metrics['average_loss']:.6f}")

    return engine
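The epoch log above can only report metrics that were attached. If _update() also returned predictions and targets, an accuracy metric could be attached inside the factory in the same style; a hedged sketch (the dict keys are assumptions):

from ignite.metrics import Accuracy

# Assumes _update() returns {'loss': ..., 'y_pred': logits, 'y': targets}.
Accuracy(output_transform=lambda out: (out['y_pred'], out['y'])).attach(
    engine, 'accuracy')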
Example #8
    def attach(trainer, evaluator, verbose=VERBOSE_BATCH_WISE):
        from ignite.engine import Events
        from ignite.metrics import RunningAverage
        from ignite.contrib.handlers.tqdm_logger import ProgressBar

        RunningAverage(output_transform=lambda x: x[0]).attach(
            trainer, 'actor')
        RunningAverage(output_transform=lambda x: x[1]).attach(
            trainer, 'baseline')
        RunningAverage(output_transform=lambda x: x[2]).attach(
            trainer, 'reward')
        RunningAverage(output_transform=lambda x: x[3]).attach(
            trainer, '|param|')
        RunningAverage(output_transform=lambda x: x[4]).attach(
            trainer, '|g_param|')

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(
                trainer,
                ['|param|', '|g_param|', 'actor', 'baseline', 'reward'])

        if verbose >= VERBOSE_EPOCH_WISE:

            @trainer.on(Events.EPOCH_COMPLETED)
            def print_train_logs(engine):
                avg_p_norm = engine.state.metrics['|param|']
                avg_g_norm = engine.state.metrics['|g_param|']
                avg_reward = engine.state.metrics['actor']

                print('Epoch {} - |param|={:.2e} |g_param|={:.2e} BLEU={:.2f}'.
                      format(
                          engine.state.epoch,
                          avg_p_norm,
                          avg_g_norm,
                          avg_reward,
                      ))

        RunningAverage(output_transform=lambda x: x).attach(evaluator, 'BLEU')

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(evaluator, ['BLEU'])

        if verbose >= VERBOSE_EPOCH_WISE:

            @evaluator.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                avg_bleu = engine.state.metrics['BLEU']
                print('Validation - BLEU={:.2f} best_BLEU={:.2f}'.format(
                    avg_bleu,
                    -engine.best_loss,
                ))
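The output_transform lambdas above index the engine output positionally, so the trainer's step function must return a tuple in exactly that order. A sketch of a compatible step function (the values are placeholders; only the ordering matters):

def train_step(engine, mini_batch):
    # Hypothetical step; real code would run the actor-critic update here.
    actor_loss, baseline_loss, reward = 0.0, 0.0, 0.0
    p_norm, g_norm = 0.0, 0.0
    # Must match: x[0]='actor', x[1]='baseline', x[2]='reward',
    #             x[3]='|param|', x[4]='|g_param|'
    return actor_loss, baseline_loss, reward, p_norm, g_norm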
Example #9
    def attach(
        train_engine, validation_engine,
        training_metric_names=['loss', 'ppl', '|param|', '|g_param|'],
        validation_metric_names=['loss', 'ppl'],
        verbose=VERBOSE_BATCH_WISE,
    ):
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, training_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:
            @train_engine.on(Events.EPOCH_COMPLETED)
            def print_train_logs(engine):
                avg_p_norm = engine.state.metrics['|param|']
                avg_g_norm = engine.state.metrics['|g_param|']
                avg_loss = engine.state.metrics['loss']

                print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} ppl={:.2f}'.format(
                    engine.state.epoch,
                    avg_p_norm,
                    avg_g_norm,
                    avg_loss,
                    np.exp(avg_loss),
                ))

        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(validation_engine, validation_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:
            @validation_engine.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                avg_loss = engine.state.metrics['loss']

                print('Validation - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}'.format(
                    avg_loss,
                    np.exp(avg_loss),
                    engine.best_loss,
                    np.exp(engine.best_loss),
                ))
Example #10
    def attach(trainer, evaluator, verbose=VERBOSE_BATCH_WISE):
        from ignite.engine import Events
        from ignite.metrics import RunningAverage
        from ignite.contrib.handlers.tqdm_logger import ProgressBar

        RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss')
        RunningAverage(output_transform=lambda x: x[1]).attach(
            trainer, '|param|')
        RunningAverage(output_transform=lambda x: x[2]).attach(
            trainer, '|g_param|')

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(trainer, ['|param|', '|g_param|', 'loss'])

        if verbose >= VERBOSE_EPOCH_WISE:

            @trainer.on(Events.EPOCH_COMPLETED)
            def print_train_logs(engine):
                avg_p_norm = engine.state.metrics['|param|']
                avg_g_norm = engine.state.metrics['|g_param|']
                avg_loss = engine.state.metrics['loss']

                print(
                    'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} ppl={:.2f}'
                    .format(
                        engine.state.epoch,
                        avg_p_norm,
                        avg_g_norm,
                        avg_loss,
                        np.exp(avg_loss),
                    ))

        RunningAverage(output_transform=lambda x: x).attach(evaluator, 'loss')

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(evaluator, ['loss'])

        if verbose >= VERBOSE_EPOCH_WISE:

            @evaluator.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                avg_loss = engine.state.metrics['loss']
                print(
                    'Validation - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}'
                    .format(
                        avg_loss,
                        np.exp(avg_loss),
                        engine.best_loss,
                        np.exp(engine.best_loss),
                    ))
Example #11
def inference(
        cfg,
        model,
        val_loader,
        num_query
):
    device = cfg.MODEL.DEVICE

    logger = logging.getLogger("reid_baseline.inference")
    logger.info("Enter inferencing")
    if cfg.TEST.RE_RANKING == 'no':
        print("Create evaluator")
        if 'test_all' in cfg.TEST.TEST_MODE:
            if len(val_loader.dataset.dataset[0]) == 4: # mask no new eval
                evaluator = create_supervised_all_evaluator_with_mask(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)},
                        seq_len=cfg.INPUT.SEQ_LEN,device=device)
            elif len(val_loader.dataset.dataset[0]) == 6: # mask , new eval
                evaluator = create_supervised_all_evaluator_with_mask_new_eval(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM,new_eval=True)},
                        seq_len=cfg.INPUT.SEQ_LEN,device=device)
            else:
                evaluator = create_supervised_all_evaluator(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)},
                        seq_len=cfg.INPUT.SEQ_LEN,device=device)
        else:
            if len(val_loader.dataset.dataset[0]) == 6: # mask , new eval
                evaluator = create_supervised_evaluator_with_mask_new_eval(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM,new_eval=True)},
                        device=device)
            elif len(val_loader.dataset.dataset[0]) == 4 : # mask, no new eval
                evaluator = create_supervised_evaluator_with_mask(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)},
                        device=device)
            else:
                evaluator = create_supervised_evaluator(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)},
                                                device=device)
    elif cfg.TEST.RE_RANKING == 'yes':  # not yet implemented with mask
        print("Create evaluator for reranking")
        if 'test_all' in cfg.TEST.TEST_MODE:
            evaluator = create_supervised_all_evaluator(model, metrics={'r1_mAP': R1_mAP_reranking(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)},
                        seq_len=cfg.INPUT.SEQ_LEN,device=device)
        else:
            evaluator = create_supervised_evaluator(model, metrics={'r1_mAP': R1_mAP_reranking(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)},
                                                device=device)
    else:
        raise ValueError(
            "Unsupported re_ranking config. Only support for no or yes, but got {}.".format(cfg.TEST.RE_RANKING))

    pbar = ProgressBar(persist=True, ncols=120)
    pbar.attach(evaluator)

    evaluator.run(val_loader)
    cmc, mAP = evaluator.state.metrics['r1_mAP']
    logger.info('Validation Results')
    logger.info("mAP: {:.1%}".format(mAP))
    for r in [1, 5, 10]:
        logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(r, cmc[r - 1]))
Example #12
    def train(self):
        """
        Full training logic
        """
        if self.train_logger is not None:
            self.train_logger.watch(self.model)

        engine = Engine(self._train_update_func)

        @engine.on(Events.EPOCH_STARTED)
        def reset_total_loss(engine):
            engine.state.total_loss = 0

        @engine.on(Events.ITERATION_COMPLETED)
        def accumulate_loss(engine):
            engine.state.total_loss += engine.state.output['loss']

        for name, metric in self.metrics.items():
            metric.attach(engine, name)

        pbar = ProgressBar()
        pbar.attach(engine)

        if self.valid:  # TODO proper implementation: currently handled only in subclass
            evaluator = self._prepare_evaluator()
            engine.add_event_handler(Events.EPOCH_COMPLETED, self.run_validate,
                                     evaluator)

        @engine.on(Events.EPOCH_COMPLETED)
        def mk_checkpoints(
                engine):  # TODO use checkpointing/scheduling from ignite
            log = {
                'epoch': engine.state.epoch,
                'loss': engine.state.total_loss / len(engine.state.dataloader),
                'metrics': engine.state.metrics
            }
            if hasattr(engine.state, 'validation_result'):
                log['val_loss'] = engine.state.validation_result.total_loss / len(
                    engine.state.validation_result.dataloader)

            self._prepare_checkpoint(log=log)
            self._reschedule_lr(epoch=engine.state.epoch)
            if self.train_logger is not None:
                self.train_logger.add_entry(log)
                if self.verbosity >= 1:
                    for key, value in log.items():
                        self.logger.info('    {:15s}: {}'.format(
                            str(key), value))

        engine.run(
            self.data_loader, max_epochs=self.epochs
        )  # TODO return resume logic of range(self.start_epoch, self.epochs + 1):
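The TODO in mk_checkpoints points at ignite's own checkpointing. A hedged sketch of what that replacement could look like, in the same spirit as Example #7, assuming the trainer exposes self.model and self.optimizer and that 'checkpoints/' is an acceptable target directory:

from ignite.handlers import Checkpoint, DiskSaver

checkpoint_handler = Checkpoint(
    {'model': self.model, 'optimizer': self.optimizer},
    DiskSaver('checkpoints/', create_dir=True),
    n_saved=2,
)
engine.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler)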
Example #13
    def train(self,
              epochs: int,
              train_loader,
              test_loader=None,
              trainsize=None,
              valsize=None):
        self.model.train()
        train_engine = Engine(lambda e, b: self.train_step(b))

        @train_engine.on(Events.EPOCH_COMPLETED(every=self.track_loss_freq))
        def eval_test(engine):
            if self.track_loss:
                self.tb_log(train_loader,
                            engine.state.epoch,
                            is_train=True,
                            eval_length=valsize)
                if test_loader is not None:
                    self.tb_log(test_loader,
                                engine.state.epoch,
                                is_train=False,
                                eval_length=valsize)

        @train_engine.on(Events.EPOCH_COMPLETED)
        def save_state(engine):
            torch.save(self.model.state_dict(), self.snail_path)
            torch.save(self.opt.state_dict(), self.snail_opt_path)

        @train_engine.on(
            Events.ITERATION_COMPLETED(every=self.track_params_freq))
        def tb_log_histogram_params(engine):
            if self.track_layers:
                for name, params in self.model.named_parameters():
                    self.logger.add_histogram(name.replace('.', '/'), params,
                                              engine.state.iteration)
                    if params.grad is not None:
                        self.logger.add_histogram(
                            name.replace('.', '/') + '/grad', params.grad,
                            engine.state.iteration)

        if self.trainpbar:
            RunningAverage(output_transform=lambda x: x).attach(
                train_engine, 'loss')
            p = ProgressBar()
            p.attach(train_engine, ['loss'])
        train_engine.run(train_loader,
                         max_epochs=epochs,
                         epoch_length=trainsize)
Example #14
    def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE):
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine, metric_name)

        training_metric_names = ["loss", "accuracy", "|param|", "|g_param|"]

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        # Print at every iteration.
        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, training_metric_names)

        # Print when each epoch completes.
        if verbose >= VERBOSE_EPOCH_WISE:

            @train_engine.on(Events.EPOCH_COMPLETED)
            def print_train_loss(engine):
                print(
                    "Epoch {} - |param| = {:.2e} |g_param| = {:.2e} loss = {:.4e} accuracy = {:.4f}"
                    .format(engine.state.epoch,
                            engine.state.metrics["|param|"],
                            engine.state.metrics["|g_param|"],
                            engine.state.metrics["loss"],
                            engine.state.metrics["accuracy"]))

        validation_metric_names = ["loss", "accuracy"]

        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(validation_engine, validation_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:

            @validation_engine.on(Events.EPOCH_COMPLETED)
            def print_valid_loss(engine):
                print(
                    "Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}"
                    .format(engine.state.metrics["loss"],
                            engine.state.metrics["accuracy"],
                            engine.best_loss))
Example #15
    def attach(trainer, evaluator, verbose=2):
        from ignite.engine import Events
        from ignite.metrics import RunningAverage
        from ignite.contrib.handlers.tqdm_logger import ProgressBar

        RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss')
        RunningAverage(output_transform=lambda x: x[1]).attach(
            trainer, '|param|')
        RunningAverage(output_transform=lambda x: x[2]).attach(
            trainer, '|g_param|')

        if verbose >= 2:
            pbar = ProgressBar()
            pbar.attach(trainer, ['|param|', '|g_param|', 'loss'])

        if verbose >= 1:

            @trainer.on(Events.EPOCH_COMPLETED)
            def print_train_logs(engine):
                avg_p_norm = engine.state.metrics['|param|']
                avg_g_norm = engine.state.metrics['|g_param|']
                avg_loss = engine.state.metrics['loss']

                print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e}'.
                      format(engine.state.epoch, avg_p_norm, avg_g_norm,
                             avg_loss))

        RunningAverage(output_transform=lambda x: x).attach(evaluator, 'loss')

        if verbose >= 2:
            pbar = ProgressBar()
            pbar.attach(evaluator, ['loss'])

        if verbose >= 1:

            @evaluator.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                avg_loss = engine.state.metrics['loss']
                print('Validation - loss={:.4e} lowest_loss={:.4e}'.format(
                    avg_loss, engine.lowest_loss))
Example #16
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )

        training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|']

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, training_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:
            @train_engine.on(Events.EPOCH_COMPLETED)
            def print_train_logs(engine):
                print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}'.format(
                    engine.state.epoch,
                    engine.state.metrics['|param|'],
                    engine.state.metrics['|g_param|'],
                    engine.state.metrics['loss'],
                    engine.state.metrics['accuracy'],
                ))

        validation_metric_names = ['loss', 'accuracy']

        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)

        if verbose >= VERBOSE_EPOCH_WISE:
            @validation_engine.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                print('Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'.format(
                    engine.state.metrics['loss'],
                    engine.state.metrics['accuracy'],
                    engine.best_loss,
                ))
Example #17
    def attach(
        train_engine,  # validation_engine,
        training_metric_names=['loss', 'ppl', '|param|', '|g_param|'],
        # validation_metric_names = ['loss', 'ppl'],
        verbose=VERBOSE_BATCH_WISE,
    ):
        # Attaching would be repeated for several metrics.
        # Thus, we can reduce the repeated code by using this function.
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, training_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:

            @train_engine.on(Events.EPOCH_COMPLETED)
            def print_train_logs(engine):
                avg_p_norm = engine.state.metrics['|param|']
                avg_g_norm = engine.state.metrics['|g_param|']
                avg_loss = engine.state.metrics['loss']

                print(
                    'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} ppl={:.2f}'
                    .format(
                        engine.state.epoch,
                        avg_p_norm,
                        avg_g_norm,
                        avg_loss,
                        np.exp(avg_loss),
                    ))
Example #18
    def attach(train_engine, val_engine, verbose=EPOCHWISE):
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )

        training_metric_names = ['loss', 'accuracy']

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        if verbose == BATCHWISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, ['loss', 'accuracy'])

        if verbose == EPOCHWISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, ['loss', 'accuracy'])

            @train_engine.on(Events.EPOCH_COMPLETED)
            def print_logs(engine):
                print('Epoch {} Train - Accuracy: {:.4f} Loss: {:.4f}'.format(
                    engine.state.epoch,
                    engine.state.metrics['accuracy'],
                    engine.state.metrics['loss'],
                ))

        validation_metric_names = ['loss', 'accuracy']

        for metric_name in validation_metric_names:
            attach_running_average(val_engine, metric_name)

        if verbose == BATCHWISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(val_engine, ['loss', 'accuracy'])

        if verbose == EPOCHWISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(val_engine, ['loss', 'accuracy'])
Example #19
    def attach(train_engine,
               validation_engine,
               training_metric_names=[
                   'x2y', 'y2x', 'reg', '|param|', '|g_param|'
               ],
               validation_metric_names=['x2y', 'y2x'],
               verbose=VERBOSE_BATCH_WISE):
        # Attaching would be repeated for several metrics.
        # Thus, we can reduce the repeated code by using this function.
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, training_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:

            @train_engine.on(Events.EPOCH_COMPLETED)
            def print_train_logs(engine):
                avg_p_norm = engine.state.metrics['|param|']
                avg_g_norm = engine.state.metrics['|g_param|']
                avg_x2y = engine.state.metrics['x2y']
                avg_y2x = engine.state.metrics['y2x']
                avg_reg = engine.state.metrics['reg']

                print(
                    'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss_x2y={:.4e} ppl_x2y={:.2f} loss_y2x={:.4e} ppl_y2x={:.2f} dual_loss={:.4e}'
                    .format(
                        engine.state.epoch,
                        avg_p_norm,
                        avg_g_norm,
                        avg_x2y,
                        np.exp(avg_x2y),
                        avg_y2x,
                        np.exp(avg_y2x),
                        avg_reg,
                    ))

        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(validation_engine, validation_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:

            @validation_engine.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                avg_x2y = engine.state.metrics['x2y']
                avg_y2x = engine.state.metrics['y2x']

                print(
                    'Validation X2Y - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}'
                    .format(
                        avg_x2y,
                        np.exp(avg_x2y),
                        engine.best_x2y,
                        np.exp(engine.best_x2y),
                    ))
                print(
                    'Validation Y2X - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}'
                    .format(
                        avg_y2x,
                        np.exp(avg_y2x),
                        engine.best_y2x,
                        np.exp(engine.best_y2x),
                    ))
Example #20
def run(root_path,
        log_path,
        student_class_module,
        teacher_class_module,
        student_class_name,
        teacher_class_name,
        init_interval,
        hard_ratio,
        train_targets,
        test_targets,
        test_camera_base,
        augmentation_types,
        batch_size,
        n_workers,
        save_interval,
        n_saved,
        gpu_ids,
        max_epochs=150,
        init_lr_student_conv=.01,
        init_lr_teacher_conv=.01,
        init_lr_student_classifier=.01,
        init_lr_teacher_classifier=.1,
        lr_decay_step=100,
        lr_decay_rate=.1):
    device = 'cuda:{}'.format(gpu_ids[0])

    train_transformer = Transformer(True, augmentation_types)
    test_transformer = Transformer(False, [])

    train_dataset = TrainDatasetWrapper(root_path, train_targets,
                                        train_transformer)
    train_loader = utils.data.DataLoader(train_dataset,
                                         batch_size,
                                         shuffle=True,
                                         num_workers=n_workers,
                                         pin_memory=True)

    test_datasets = []
    for test_target in test_targets:
        test_datasets.append(
            TestDatasetWrapper(root_path, test_target, test_transformer,
                               test_camera_base))

    loader_caller = _get_test_data_loader_caller(batch_size, n_workers)

    student_class_module = importlib.import_module(student_class_module)
    student_model_class = getattr(student_class_module, student_class_name)
    teacher_class_module = importlib.import_module(teacher_class_module)
    teacher_model_class = getattr(teacher_class_module, teacher_class_name)
    models = {
        'student': student_model_class(train_dataset.n_classes),
        'teacher': teacher_model_class(train_dataset.n_classes),
        'generator': teacher_model_class(train_dataset.n_classes)
    }

    loss_functions = {
        'student': SoftLabelLoss(),
        'teacher': nn.CrossEntropyLoss()
    }

    student_classifier_parameters = list(
        models['student'].classifier.parameters())
    student_classifier_parameters_ids = []
    for p in student_classifier_parameters:
        student_classifier_parameters_ids.append(id(p))
    student_conv_parameters = []
    for p in models['student'].parameters():
        if id(p) not in student_classifier_parameters_ids:
            student_conv_parameters.append(p)
    teacher_classifier_parameters = list(
        models['teacher'].classifier.parameters())
    teacher_classifier_parameters_ids = []
    for p in teacher_classifier_parameters:
        teacher_classifier_parameters_ids.append(id(p))
    teacher_conv_parameters = []
    for p in models['teacher'].parameters():
        if id(p) not in teacher_classifier_parameters_ids:
            teacher_conv_parameters.append(p)

    optimizers = {
        'student_conv':
        optim.SGD(student_conv_parameters,
                  init_lr_student_conv,
                  momentum=.9,
                  weight_decay=5e-4,
                  nesterov=True),
        'student_classifier':
        optim.SGD(student_classifier_parameters,
                  init_lr_student_classifier,
                  momentum=.9,
                  weight_decay=5e-4,
                  nesterov=True),
        'teacher_conv':
        optim.SGD(teacher_conv_parameters,
                  init_lr_teacher_conv,
                  momentum=.9,
                  weight_decay=5e-4,
                  nesterov=True),
        'teacher_classifier':
        optim.SGD(teacher_classifier_parameters,
                  init_lr_teacher_classifier,
                  momentum=.9,
                  weight_decay=5e-4,
                  nesterov=True),
    }

    schedulers = {
        'student_conv':
        optim.lr_scheduler.StepLR(optimizers['student_conv'],
                                  lr_decay_step,
                                  gamma=lr_decay_rate),
        'student_classifier':
        optim.lr_scheduler.StepLR(optimizers['student_classifier'],
                                  lr_decay_step,
                                  gamma=lr_decay_rate),
        'teacher_conv':
        optim.lr_scheduler.StepLR(optimizers['teacher_conv'],
                                  lr_decay_step,
                                  gamma=lr_decay_rate),
    }

    writer = SummaryWriter(log_dir=log_path)

    trainer = create_supervised_soft_label_trainer(
        models,
        optimizers,
        loss_functions,
        hard_ratio,
        init_interval,
        device=device,
        non_blocking=True,
        output_transform=lambda x, y, y_pred_student, y_pred_teacher,
        loss_student, loss_teacher:
        (y, y_pred_student, y_pred_teacher, loss_student, loss_teacher))

    RunningAverage(output_transform=lambda output: output[3].item()).attach(
        trainer, 'loss_student')
    RunningAverage(output_transform=lambda output: output[4].item()).attach(
        trainer, 'loss_teacher')
    Accuracy(output_transform=lambda output: (output[1], output[0])).attach(
        trainer, 'accuracy_student')
    Accuracy(output_transform=lambda output: (output[2], output[0])).attach(
        trainer, 'accuracy_teacher')

    progress_bar = ProgressBar()
    progress_bar.attach(trainer, ['loss_student', 'loss_teacher'])

    checkpointer = ModelCheckpoint(log_path,
                                   'checkpoint',
                                   save_interval=save_interval,
                                   n_saved=n_saved)

    rank_accuracy = RankAccuracy(n_workers)
    evaluator = create_supervised_evaluator(models['student'],
                                            metrics={'rank': rank_accuracy},
                                            device=device,
                                            non_blocking=True)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED,
        _get_result_write_function(rank_accuracy, test_datasets, loader_caller,
                                   evaluator, writer))

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED,
        _get_init_classifier_function(models, optimizers['teacher_classifier'],
                                      init_interval))

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              _get_loss_write_function(writer))
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              _get_lr_decay_function(schedulers))
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              _get_lr_write_function(optimizers, writer))
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, checkpointer, {
            'student_model': models['student'],
            'teacher_model': models['teacher'],
            'generator_model': models['generator']
        })

    trainer.run(train_loader, max_epochs=max_epochs)

    writer.close()
Example #21
class BasicTrainTask(BaseTask):

    name = "Train Task"

    def _validate(self, config):
        """
        Method to check if specific configuration is correct. Raises AssertError if is incorrect.
        """
        assert isinstance(config, BasicTrainConfig), \
            "Configuration should be instance of `BasicTrainConfig`, but given {}".format(type(config))

    def _start(self):
        """Method to run the task
        """
        if 'cuda' in self.device:
            self.model = self.model.to(self.device)

        mlflow.log_param("model", get_object_name(self.model))

        self.logger.debug("Setup criterion")
        if "cuda" in self.device:
            self.criterion = self.criterion.to(self.device)

        mlflow.log_param("criterion", get_object_name(self.criterion))
        mlflow.log_param("optimizer", get_object_name(self.optimizer))

        self.logger.debug("Setup ignite trainer")
        trainer = self._setup_trainer()
        self._setup_trainer_handlers(trainer)

        metrics = {'loss': Loss(self.criterion)}
        metrics.update(self.metrics)

        self.logger.debug("Input data info: ")
        msg = "- train data loader: {} number of batches".format(
            len(self.train_dataloader))
        if isinstance(self.train_dataloader, DataLoader):
            msg += " | {} number of samples".format(
                len(self.train_dataloader.sampler))
        self.logger.debug(msg)

        if isinstance(self.train_dataloader, DataLoader):
            write_model_graph(self.writer,
                              model=self.model,
                              data_loader=self.train_dataloader,
                              device=self.device)

        self.pbar_eval = None
        if self.train_eval_dataloader is not None:
            self.pbar_eval = ProgressBar()
            self._setup_offline_train_metrics_computation(trainer, metrics)

        if self.val_dataloader is not None:
            if self.val_metrics is None:
                self.val_metrics = metrics

            if self.pbar_eval is None:
                self.pbar_eval = ProgressBar()

            val_evaluator = self._setup_val_metrics_computation(trainer)

            if self.reduce_lr_on_plateau is not None:
                assert self.reduce_lr_on_plateau_var in self.val_metrics, \
                    "Monitor variable {} is not found in metrics {}" \
                    .format(self.reduce_lr_on_plateau_var, metrics)

                @val_evaluator.on(Events.COMPLETED)
                def update_reduce_on_plateau(engine):
                    val_var = engine.state.metrics[
                        self.reduce_lr_on_plateau_var]
                    self.reduce_lr_on_plateau.step(val_var)

            def default_score_function(engine):
                val_loss = engine.state.metrics['loss']
                # Objects with highest scores will be retained.
                return -val_loss

            # Setup early stopping:
            if self.early_stopping_kwargs is not None:
                if 'score_function' in self.early_stopping_kwargs:
                    es_score_function = self.early_stopping_kwargs[
                        'score_function']
                else:
                    es_score_function = default_score_function
                self._setup_early_stopping(trainer, val_evaluator,
                                           es_score_function)

            # Setup model checkpoint:
            if self.model_checkpoint_kwargs is None:
                self.model_checkpoint_kwargs = {
                    "filename_prefix": "model",
                    "score_name": "val_loss",
                    "score_function": default_score_function,
                    "n_saved": 3,
                    "atomic": True,
                    "create_dir": True,
                    "save_as_state_dict": True
                }
            self._setup_best_model_checkpointing(val_evaluator)

        self.logger.debug("Setup other handlers")

        if self.lr_scheduler is not None:

            @trainer.on(Events.ITERATION_STARTED)
            def update_lr_scheduler(engine):
                self.lr_scheduler.step()

        self._setup_log_learning_rate(trainer)

        self.logger.info("Start training: {} epochs".format(self.num_epochs))
        mlflow.log_param("num_epochs", self.num_epochs)
        trainer.run(self.train_dataloader, max_epochs=self.num_epochs)
        self.logger.debug("Training is ended")

    def _setup_trainer(self):
        trainer = create_supervised_trainer(self.model,
                                            self.optimizer,
                                            self.criterion,
                                            device=self.device,
                                            non_blocking='cuda' in self.device)
        return trainer

    def _setup_trainer_handlers(self, trainer):
        # Setup timer to measure training time
        timer = setup_timer(trainer)
        self._setup_log_training_loss(trainer)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_time(engine):
            self.logger.info("One epoch training time (seconds): {}".format(
                timer.value()))

        last_model_saver = ModelCheckpoint(
            self.log_dir.as_posix(),
            filename_prefix="checkpoint",
            save_interval=self.trainer_checkpoint_interval,
            n_saved=1,
            atomic=True,
            create_dir=True,
            save_as_state_dict=True)

        model_name = get_object_name(self.model)

        to_save = {
            model_name: self.model,
            "optimizer": self.optimizer,
        }

        if self.lr_scheduler is not None:
            to_save['lr_scheduler'] = self.lr_scheduler

        trainer.add_event_handler(Events.ITERATION_COMPLETED, last_model_saver,
                                  to_save)
        trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

    def _setup_log_training_loss(self, trainer):

        self.avg_output = RunningAverage(output_transform=lambda out: out)
        self.avg_output.attach(trainer, 'running_avg_loss')
        self.pbar.attach(trainer, ['running_avg_loss'])

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_training_loss(engine):
            iteration = (engine.state.iteration - 1) % len(
                self.train_dataloader) + 1
            if iteration % self.log_interval == 0:
                # self.logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(engine.state.epoch, iteration,
                #                                                                   len(self.train_dataloader),
                #                                                                   engine.state.output))
                self.writer.add_scalar("training/loss_vs_iterations",
                                       engine.state.output,
                                       engine.state.iteration)
                mlflow.log_metric("training_loss_vs_iterations",
                                  engine.state.output)

    def _setup_log_learning_rate(self, trainer):
        @trainer.on(Events.EPOCH_STARTED)
        def log_lrs(engine):
            if len(self.optimizer.param_groups) == 1:
                lr = float(self.optimizer.param_groups[0]['lr'])
                self.logger.debug("Learning rate: {}".format(lr))
                self.writer.add_scalar("learning_rate", lr, engine.state.epoch)
                mlflow.log_metric("learning_rate", lr)
            else:
                for i, param_group in enumerate(self.optimizer.param_groups):
                    lr = float(param_group['lr'])
                    self.logger.debug("Learning rate (group {}): {}".format(
                        i, lr))
                    self.writer.add_scalar("learning_rate_group_{}".format(i),
                                           lr, engine.state.epoch)
                    mlflow.log_metric("learning_rate_group_{}".format(i), lr)

    def _setup_offline_train_metrics_computation(self, trainer, metrics):

        train_eval_loader = self.train_eval_dataloader
        msg = "- train evaluation data loader: {} number of batches".format(
            len(train_eval_loader))
        if isinstance(train_eval_loader, DataLoader):
            msg += " | {} number of samples".format(
                len(train_eval_loader.sampler))
        self.logger.debug(msg)

        train_evaluator = create_supervised_evaluator(self.model,
                                                      metrics=metrics,
                                                      device=self.device,
                                                      non_blocking="cuda"
                                                      in self.device)

        self.pbar_eval.attach(train_evaluator)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_metrics(engine):
            epoch = engine.state.epoch
            if epoch % self.val_interval_epochs == 0:
                self.logger.debug("Compute training metrics")
                metrics_results = train_evaluator.run(
                    train_eval_loader).metrics
                self.logger.info("Training Results - Epoch: {}".format(epoch))
                for name in metrics_results:
                    self.logger.info("\tAverage {}: {:.5f}".format(
                        name, metrics_results[name]))
                    self.writer.add_scalar("training/avg_{}".format(name),
                                           metrics_results[name], epoch)
                    mlflow.log_metric("training_avg_{}".format(name),
                                      metrics_results[name])

        return train_evaluator

    def _setup_val_metrics_computation(self, trainer):
        val_evaluator = create_supervised_evaluator(self.model,
                                                    metrics=self.val_metrics,
                                                    device=self.device,
                                                    non_blocking="cuda"
                                                    in self.device)
        self.pbar_eval.attach(val_evaluator)

        msg = "- validation data loader: {} number of batches".format(
            len(self.val_dataloader))
        if isinstance(self.val_dataloader, DataLoader):
            msg += " | {} number of samples".format(
                len(self.val_dataloader.sampler))
        self.logger.debug(msg)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(engine):
            epoch = engine.state.epoch
            if epoch % self.val_interval_epochs == 0:
                self.logger.debug("Compute validation metrics")
                metrics_results = val_evaluator.run(
                    self.val_dataloader).metrics
                self.logger.info(
                    "Validation Results - Epoch: {}".format(epoch))
                for name in metrics_results:
                    self.logger.info("\tAverage {}: {:.5f}".format(
                        name, metrics_results[name]))
                    self.writer.add_scalar("validation/avg_{}".format(name),
                                           metrics_results[name], epoch)
                    mlflow.log_metric("validation_avg_{}".format(name),
                                      metrics_results[name])

        return val_evaluator

    def _setup_early_stopping(self, trainer, val_evaluator, score_function):
        kwargs = dict(self.early_stopping_kwargs)
        if 'score_function' not in kwargs:
            kwargs['score_function'] = score_function
        handler = EarlyStopping(trainer=trainer, **kwargs)
        setup_logger(handler._logger, self.log_filepath, self.log_level)
        val_evaluator.add_event_handler(Events.COMPLETED, handler)

    def _setup_best_model_checkpointing(self, val_evaluator):
        model_name = get_object_name(self.model)
        best_model_saver = ModelCheckpoint(self.log_dir.as_posix(),
                                           **self.model_checkpoint_kwargs)
        val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver,
                                        {model_name: self.model})
Example #22
def main():
    args = get_args()
    if 'e-SNLI-VE' in args.data_path:
        args.no_image = False
    else:
        args.no_image = True
    if not args.no_image:
        args.no_premise = True
    args.with_expl = True

    '''Setup'''
    t = datetime.today()
    output_dir = os.path.join(args.output_folder,
                              f"{t.month}_{t.day}_{t.hour}_{t.minute}_{t.second}")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # Logging is set to INFO for the main process and WARN for auxiliary processes:
    # logger.info logs only from the main process, logger.warning logs from all processes.
    logging.basicConfig(filename=os.path.join(output_dir, 'app.log'),
                        filemode='a',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # This is a logger.warning: it will be printed by all distributed processes
    logger.warning(f"Running process {args.local_rank}")
    logger.info(f"Arguments: {pformat(args)}")
    logger.info(f'Image not used: {args.no_image}')
    logger.info(f'Premise not used: {args.no_premise}')
    logger.info(f'Explanations used: {args.with_expl}')

    '''Initialize distributed training if needed'''
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
    tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT)
    if args.no_image:
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        import image_gpt2_291
        model = image_gpt2_291.GPT2LMHeadModel.from_pretrained(
            args.model_checkpoint)
    model.resize_token_embeddings(len(tokenizer))
    model.to(args.device)
    optimizer = AdamW(model.parameters(), lr=args.lr)

    '''
    Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    '''
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)
        model = model.module

    logger.info("Prepare datasets")
    train_loader, val_loader = get_data_loaders(args, tokenizer)

    '''Training function and trainer'''
    def train(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        if args.no_image:
            input_ids, lm_label, label, input_mask = batch
        else:
            image, input_ids, lm_label, label, input_mask = batch

        if args.no_image:
            output = model(input_ids=input_ids,
                           #    attention_mask=input_mask,
                           labels=lm_label)
        else:
            output = model(input_ids=input_ids,
                           images=image,
                           #    attention_mask=input_mask,
                           labels=lm_label)
        loss, logits, _ = output

        loss = loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(
                amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        if not args.with_expl:
            lbl_accuracy = torch.eq(label, logits.argmax(
                dim=1)).float().sum() / len(label)
            return {
                'loss': loss.item(),
                'lbl_accuracy': lbl_accuracy.item()
            }
        else:
            if engine.state.iteration % (args.gradient_accumulation_steps * 500) == 0:
                input_output = list(zip(input_ids, logits))
                random_item = random.choice(input_output)
                in_sent = tokenizer.decode(list(filter(
                    lambda x: x != tokenizer.eos_token_id,
                    random_item[0])))
                out_expl = tokenizer.decode(random_item[1].argmax(dim=1),
                                            skip_special_tokens=True)
                logger.info(f'MODEL INPUT: {in_sent}')
                logger.info(f'GEN. EXPL {out_expl}')
                logger.info('--------------------------------')
            return {
                'loss': loss.item(),
            }

    '''Validation function and validator (validator output is the input of the metrics)'''
    def validation(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device)
                          for input_tensor in batch)
            if args.no_image:
                input_ids, lm_label, label, input_mask = batch
            else:
                image, input_ids, lm_label, label, input_mask = batch

            if args.no_image:
                output = model(input_ids=input_ids,
                               #    attention_mask=input_mask
                               )
            else:
                output = model(input_ids=input_ids,
                               images=image,
                               #    attention_mask=input_mask
                               )
            logits, _ = output

            logits_shifted = logits[..., :-1, :].contiguous().view(-1,
                                                                   logits.size(-1))
            labels_shifted = lm_label[..., 1:].contiguous().view(-1)
            return logits_shifted, labels_shifted

    '''Engines'''
    trainer = Engine(train)
    validator = Engine(validation)

    # t_total = len(
    #     train_loader) // args.gradient_accumulation_steps * args.n_epochs
    # scheduler = get_linear_schedule_with_warmup(
    #     optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    '''Linearly decrease the learning rate from lr to zero'''
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    '''
    Attach validation to trainer: we evaluate when we start the training and at the end of each epoch
    '''
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: validator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: validator.run(val_loader))

    '''Prepare metrics - note how we compute distributed metrics'''
    RunningAverage(output_transform=lambda x: x['loss']).attach(
        trainer, "loss")
    RunningAverage(output_transform=lambda x: math.exp(
        average_distributed_scalar(x['loss'], args))).attach(trainer, "ppl")
    if not args.with_expl:
        RunningAverage(output_transform=lambda x: 100 * x['lbl_accuracy']).attach(
            trainer, "lbl_accuracy")

    metrics = {}
    metrics["lbl_loss"] = Loss(torch.nn.CrossEntropyLoss(),
                               output_transform=lambda x: (x[0], x[1]))
    metrics["loss"] = MetricsLambda(
        lambda l, a: average_distributed_scalar(
            l / a.gradient_accumulation_steps, a), metrics["lbl_loss"], args)
    metrics["ppl"] = MetricsLambda(math.exp, metrics["loss"])
    if not args.with_expl:
        metrics["lbl_accuracy"] = 100 * \
            Accuracy(output_transform=lambda x: (x[0], x[1]))
    for name, metric in metrics.items():
        metric.attach(validator, name)

    '''
    On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    '''
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer,
                    metric_names=["loss", 'ppl'] if args.with_expl else ["loss", 'lbl_accuracy', 'ppl'])
        validator.add_event_handler(Events.COMPLETED,
                                    lambda _: pbar.log_message(
                                        "Validation: %s" % pformat(validator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=output_dir)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(
                             tag="training",
                             metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(
                             tag="training",
                             metric_names=["ppl"] if args.with_expl else ["lbl_accuracy", "ppl"]),
                         event_name=Events.EPOCH_COMPLETED)

        tb_logger.attach(validator,
                         log_handler=OutputHandler(
                             tag="validation",
                             metric_names=[
                                 'ppl', 'loss'] if args.with_expl else ['ppl', 'loss', 'lbl_accuracy'],
                             global_step_transform=lambda *args, **kwargs: trainer.state.iteration),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(output_dir,
                                             'checkpoint',
                                             n_saved=8,
                                             require_empty=False)
        trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1),
                                  checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})

        # "getattr" take care of distributed encapsulation
        torch.save(args, os.path.join(output_dir, 'model_training_args.bin'))
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(output_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(output_dir)

    '''Run the training'''
    trainer.run(train_loader, max_epochs=args.n_epochs)
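
# A minimal sketch, isolated from the script above, of the PiecewiseLinear schedule
# that linearly decays the learning rate over the run. The toy model, the 1e-4
# initial lr and the 1000-iteration horizon are illustrative assumptions.
import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import PiecewiseLinear

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

def noop_step(engine, batch):
    return 0.0

trainer = Engine(noop_step)
# lr goes from 1e-4 at iteration 0 to 0.0 at iteration 1000, updated before every iteration
scheduler = PiecewiseLinear(optimizer, "lr", [(0, 1e-4), (1000, 0.0)])
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
trainer.run([None] * 10, max_epochs=2)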
Example #23
def main(args):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    cfg = load_config(args.config)

    path, config_name = os.path.split(args.config)
    copyfile(args.config, os.path.join(cfg.workdir, config_name))
    copyfile(os.path.join(path, "model.py"), os.path.join(cfg.workdir, "model.py"))

    pbar = ProgressBar()
    tb_logger = TensorboardLogger(log_dir=os.path.join(cfg.workdir, "tb_logs"))
    checkpointer = ModelCheckpoint(os.path.join(cfg.workdir, "checkpoints"), '', 
                                   save_interval=1, n_saved=cfg.n_epochs, create_dir=True, atomic=True)

    def _update(engine, batch):
        cfg.model.train()
        cfg.optimizer.zero_grad()
        x, y = cfg.prepare_train_batch(batch)
        y_pred = cfg.model(**x)
        loss = cfg.loss_fn(y_pred, y)
        loss['loss'].backward()
        cfg.optimizer.step()
        for k in loss:
            loss[k] = loss[k].item()
        return loss
    trainer = Engine(_update)

    pbar.attach(trainer, output_transform=lambda x: {k: "{:.5f}".format(v) for k, v in x.items()})
    trainer.add_event_handler(Events.ITERATION_STARTED, cfg.scheduler)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'model': cfg.model, 'optimizer': cfg.optimizer})
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag="training", output_transform=lambda x: x),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(cfg.optimizer),
                     event_name=Events.ITERATION_STARTED)
    # tb_logger.attach(trainer,
    #                  log_handler=WeightsScalarHandler(cfg.model),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(trainer,
    #                  log_handler=WeightsHistHandler(cfg.model),
    #                  event_name=Events.EPOCH_COMPLETED)
    # tb_logger.attach(trainer,
    #                  log_handler=GradsScalarHandler(cfg.model),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(trainer,
    #                  log_handler=GradsHistHandler(cfg.model),
    #                  event_name=Events.EPOCH_COMPLETED)

    def _evaluate(engine, batch):
        cfg.model.eval()
        x, y = cfg.prepare_train_batch(batch)
        batch_size = len(batch[list(batch.keys())[0]])
        with torch.no_grad():
            y_pred = cfg.model(**x)
            loss = cfg.loss_fn(y_pred, y)
        for k in loss:
            loss[k] = loss[k].item()
            if k not in engine.state.metrics:
                engine.state.metrics[k] = 0.0
            engine.state.metrics[k] += loss[k] * batch_size / len(cfg.valid_ds)
        return loss
    evaluator = Engine(_evaluate)

    pbar.attach(evaluator, output_transform=lambda x: {k: "{:.5f}".format(v) for k, v in x.items()})

    @trainer.on(Events.EPOCH_COMPLETED)
    def evaluate_on_valid_dl(engine):
        evaluator.run(cfg.valid_dl)

    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(tag="validation",
                                               metric_names=['loss', 'rot_loss_cos', 'rot_loss_l1', 'trans_loss', 'true_distance', 'cls_loss'],
                                               global_step_transform=global_step_from_engine(trainer)),
                     event_name=Events.EPOCH_COMPLETED)

    trainer.run(cfg.train_dl, cfg.n_epochs)
    tb_logger.close()
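
# A minimal sketch of reporting an evaluator's metrics to TensorBoard against the
# trainer's progress, as done above with global_step_from_engine(trainer); here a
# plain lambda is used instead to stay version-agnostic. The dummy engines, the
# 'nll' metric name and the log directory are illustrative assumptions.
from ignite.engine import Engine, Events
from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler

trainer = Engine(lambda engine, batch: None)
evaluator = Engine(lambda engine, batch: None)

tb_logger = TensorboardLogger(log_dir="/tmp/tb_logs")
tb_logger.attach(evaluator,
                 log_handler=OutputHandler(
                     tag="validation",
                     metric_names=["nll"],
                     # plot validation metrics against the trainer's epoch counter
                     global_step_transform=lambda *args, **kwargs: trainer.state.epoch),
                 event_name=Events.EPOCH_COMPLETED)
tb_logger.close()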
Example #24
        # sample from the prior

        prior_sample_args = {}
        prior_sample_args.update(svi_args)
        prior_sample_args["cond"] = False
        prior_sample_args["cond_label"] = False
        fwd_trace = poutine.trace(forward_model).get_trace(x,
                                                           y,
                                                           N=x.shape[0],
                                                           **prior_sample_args)
        prior_sample = fwd_trace.nodes["pixels"]["fn"].mean
        prior_canonical_sample = fwd_trace.nodes["canonical_view"]["value"]
        tb.add_image("prior_samples",
                     torchvision.utils.make_grid(prior_sample), epoch)

        tb.add_image(
            "canonical_prior_samples",
            torchvision.utils.make_grid(prior_canonical_sample),
            epoch,
        )

    pbar = ProgressBar()
    pbar.attach(train_engine)

    @train_engine.on(Events.EPOCH_COMPLETED(every=eval_every))
    def eval(engine):
        eval_engine.run(test_dl, seed=engine.state.epoch)

    train_engine.run(train_dl, max_epochs=50)
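
# A short sketch of the filtered event used above: with ignite 0.4 or newer,
# Events.EPOCH_COMPLETED(every=n) fires the handler only every n-th epoch. The
# dummy engine, data and eval_every value are illustrative assumptions.
from ignite.engine import Engine, Events

eval_every = 5
train_engine = Engine(lambda engine, batch: None)

@train_engine.on(Events.EPOCH_COMPLETED(every=eval_every))
def run_eval_sketch(engine):
    print(f"running evaluation after epoch {engine.state.epoch}")

train_engine.run([0, 1, 2], max_epochs=20)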
Example #25
    def setup(self, training_metrics: Dict):
        def metric_name(n) -> str:
            if n.endswith('Accuracy'):
                n = 'acc'
            else:
                n = n[:-6] if n.endswith('Metric') else n
            return n

        def print_metrics(metrics) -> str:
            rv = ''
            metric_keys = sorted(k for k in metrics)
            for k in metric_keys:
                if k == 'Accuracy':
                    rv += f'{metric_name(k)}: {metrics[k]:.3} | '
                else:
                    rv += f'{metric_name(k)}: {metrics[k]} | '
            return rv

        def store_metrics(metrics: Dict, mode: str):
            metric_keys = sorted(k for k in metrics)
            for k in metric_keys:
                self.metrics_history[mode][metric_name(k)].append(metrics[k])

        if self.seed:
            set_seed_everywhere(self.seed, self.cuda)

        pbar = ProgressBar(persist=True)

        names = []
        for k, v in training_metrics.items():
            name = f'r{k}'
            names.append(name)
            RunningAverage(v).attach(self.trainer, name)
        RunningAverage(None, output_transform=lambda x: x[-1]).attach(
            self.trainer, 'rloss')

        names.append('rloss')
        pbar.attach(self.trainer, names)

        ProgressBar(persist=True).attach(engine=self.evaluator,
                                         metric_names=names)

        # A few event handlers. To add or modify event handlers, extend the __init__ method of RunnerABC.
        # Ignite provides the necessary abstractions and a well-stocked collection of useful tools.

        @self.trainer.on(Events.EPOCH_COMPLETED)
        def log_training_validation_results(trainer):

            self.evaluator.run(self.dataset_splits.train_data_loader())
            metrics = self.evaluator.state.metrics
            store_metrics(metrics=metrics, mode="training")
            logger.info(
                f"Training Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}"
            )

            self.evaluator.run(self.dataset_splits.val_data_loader())
            metrics = self.evaluator.state.metrics
            store_metrics(metrics=metrics, mode="validation")
            logger.info(
                f"Validation Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}"
            )

            metrics = self.trainer.state.metrics
            if self.scheduler:
                self.scheduler.step(metrics["rloss"])
                # self.scheduler.step(metrics[self.loss_metric.__class__.__name__])

        @self.trainer.on(Events.COMPLETED)
        def log_test_results(trainer):
            if self.dataset_splits.test_set:
                self.evaluator.run(self.dataset_splits.test_data_loader())
                metrics = self.evaluator.state.metrics
                store_metrics(metrics=metrics, mode="test")
                logger.info(
                    f"Test Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}"
                )
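
# A compact sketch of the RunningAverage pattern used in setup() above: a raw
# value taken from the engine output is smoothed and exposed under a metric
# name. The dummy engine, whose output ends with a fake loss, is an assumption
# for illustration; wrapping a Metric (e.g. RunningAverage(Accuracy())) works
# the same way.
from ignite.engine import Engine
from ignite.metrics import RunningAverage

trainer = Engine(lambda engine, batch: (0.5,))  # last output element plays the role of the loss

RunningAverage(None, output_transform=lambda x: x[-1]).attach(trainer, 'rloss')
trainer.run([0] * 4, max_epochs=1)
print(trainer.state.metrics['rloss'])  # smoothed loss; constant 0.5 here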
Example #26
    def __call__(self, model, train_dataset, val_dataset=None, **_):
        """Train a PyTorch model.

        Args:
            model (torch.nn.Module): PyTorch model to train.
            train_dataset (torch.utils.data.Dataset): Dataset used to train.
            val_dataset (torch.utils.data.Dataset, optional): Dataset used to validate.

        Returns:
            trained_model (torch.nn.Module): Trained PyTorch model.
        """
        assert train_dataset is not None
        train_params = self.train_params
        mlflow_logging = self.mlflow_logging

        if mlflow_logging:
            try:
                import mlflow  # NOQA
            except ImportError:
                log.warning(
                    "Failed to import mlflow. MLflow logging is disabled.")
                mlflow_logging = False

        loss_fn = train_params.get("loss_fn")
        assert loss_fn
        epochs = train_params.get("epochs")
        seed = train_params.get("seed")
        optimizer = train_params.get("optimizer")
        assert optimizer
        optimizer_params = train_params.get("optimizer_params", dict())
        train_dataset_size_limit = train_params.get("train_dataset_size_limit")
        if train_dataset_size_limit:
            train_dataset = PartialDataset(train_dataset,
                                           train_dataset_size_limit)
            log.info("train dataset size is set to {}".format(
                len(train_dataset)))

        val_dataset_size_limit = train_params.get("val_dataset_size_limit")
        if val_dataset_size_limit and (val_dataset is not None):
            val_dataset = PartialDataset(val_dataset, val_dataset_size_limit)
            log.info("val dataset size is set to {}".format(len(val_dataset)))

        train_data_loader_params = train_params.get("train_data_loader_params",
                                                    dict())
        val_data_loader_params = train_params.get("val_data_loader_params",
                                                  dict())
        evaluation_metrics = train_params.get("evaluation_metrics")
        evaluate_train_data = train_params.get("evaluate_train_data")
        evaluate_val_data = train_params.get("evaluate_val_data")
        progress_update = train_params.get("progress_update")

        scheduler = train_params.get("scheduler")
        scheduler_params = train_params.get("scheduler_params", dict())

        model_checkpoint = train_params.get("model_checkpoint")
        model_checkpoint_params = train_params.get("model_checkpoint_params")
        early_stopping_params = train_params.get("early_stopping_params")
        time_limit = train_params.get("time_limit")

        cudnn_deterministic = train_params.get("cudnn_deterministic")
        cudnn_benchmark = train_params.get("cudnn_benchmark")

        if seed:
            torch.manual_seed(seed)
            np.random.seed(seed)
        if cudnn_deterministic:
            torch.backends.cudnn.deterministic = cudnn_deterministic
        if cudnn_benchmark:
            torch.backends.cudnn.benchmark = cudnn_benchmark

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        optimizer_ = optimizer(model.parameters(), **optimizer_params)
        trainer = create_supervised_trainer(model,
                                            optimizer_,
                                            loss_fn=loss_fn,
                                            device=device)

        train_data_loader_params.setdefault("shuffle", True)
        train_data_loader_params.setdefault("drop_last", True)
        train_data_loader_params["batch_size"] = _clip_batch_size(
            train_data_loader_params.get("batch_size", 1), train_dataset,
            "train")
        train_loader = DataLoader(train_dataset, **train_data_loader_params)

        # Exponentially smoothed loss ("ema_loss"); alpha=0.98 keeps a long memory.
        RunningAverage(output_transform=lambda x: x,
                       alpha=0.98).attach(trainer, "ema_loss")

        # With alpha this close to zero the running average effectively tracks
        # the raw loss of the latest batch.
        RunningAverage(output_transform=lambda x: x,
                       alpha=2**(-1022)).attach(trainer, "batch_loss")

        if scheduler:

            class ParamSchedulerSavingAsMetric(
                    ParamSchedulerSavingAsMetricMixIn, scheduler):
                pass

            cycle_epochs = scheduler_params.pop("cycle_epochs", 1)
            scheduler_params.setdefault("cycle_size",
                                        int(cycle_epochs * len(train_loader)))
            scheduler_params.setdefault("param_name", "lr")
            scheduler_ = ParamSchedulerSavingAsMetric(optimizer_,
                                                      **scheduler_params)
            trainer.add_event_handler(Events.ITERATION_STARTED, scheduler_)

        if evaluate_train_data:
            evaluator_train = create_supervised_evaluator(
                model, metrics=evaluation_metrics, device=device)

        if evaluate_val_data:
            val_data_loader_params["batch_size"] = _clip_batch_size(
                val_data_loader_params.get("batch_size", 1), val_dataset,
                "val")
            val_loader = DataLoader(val_dataset, **val_data_loader_params)
            evaluator_val = create_supervised_evaluator(
                model, metrics=evaluation_metrics, device=device)

        if model_checkpoint_params:
            assert isinstance(model_checkpoint_params, dict)
            minimize = model_checkpoint_params.pop("minimize", True)
            save_interval = model_checkpoint_params.get("save_interval", None)
            if not save_interval:
                model_checkpoint_params.setdefault(
                    "score_function",
                    get_score_function("ema_loss", minimize=minimize))
            model_checkpoint_params.setdefault("score_name", "ema_loss")
            mc = model_checkpoint(**model_checkpoint_params)
            trainer.add_event_handler(Events.EPOCH_COMPLETED, mc,
                                      {"model": model})

        if early_stopping_params:
            assert isinstance(early_stopping_params, dict)
            metric = early_stopping_params.pop("metric", None)
            assert (metric is None) or (metric in evaluation_metrics)
            minimize = early_stopping_params.pop("minimize", False)
            if metric:
                assert (
                    "score_function" not in early_stopping_params
                ), "Remove either 'metric' or 'score_function' from early_stopping_params: {}".format(
                    early_stopping_params)
                early_stopping_params["score_function"] = get_score_function(
                    metric, minimize=minimize)

            es = EarlyStopping(trainer=trainer, **early_stopping_params)
            if evaluate_val_data:
                evaluator_val.add_event_handler(Events.COMPLETED, es)
            elif evaluate_train_data:
                evaluator_train.add_event_handler(Events.COMPLETED, es)
            else:
                log.warning(
                    "Early Stopping is disabled because neither "
                    "evaluate_val_data nor evaluate_train_data is set True.")

        if time_limit:
            assert isinstance(time_limit, (int, float))
            tl = TimeLimit(limit_sec=time_limit)
            trainer.add_event_handler(Events.ITERATION_COMPLETED, tl)

        pbar = None
        if progress_update:
            if not isinstance(progress_update, dict):
                progress_update = dict()
            progress_update.setdefault("persist", True)
            progress_update.setdefault("desc", "")
            pbar = ProgressBar(**progress_update)
            pbar.attach(trainer, ["ema_loss"])

        else:

            def log_train_metrics(engine):
                log.info("[Epoch: {} | {}]".format(engine.state.epoch,
                                                   engine.state.metrics))

            trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                      log_train_metrics)

        if evaluate_train_data:

            def log_evaluation_train_data(engine):
                evaluator_train.run(train_loader)
                train_report = _get_report_str(engine, evaluator_train,
                                               "Train Data")
                if pbar:
                    pbar.log_message(train_report)
                else:
                    log.info(train_report)

            eval_train_event = (Events[evaluate_train_data] if isinstance(
                evaluate_train_data, str) else Events.EPOCH_COMPLETED)
            trainer.add_event_handler(eval_train_event,
                                      log_evaluation_train_data)

        if evaluate_val_data:

            def log_evaluation_val_data(engine):
                evaluator_val.run(val_loader)
                val_report = _get_report_str(engine, evaluator_val, "Val Data")
                if pbar:
                    pbar.log_message(val_report)
                else:
                    log.info(val_report)

            eval_val_event = (Events[evaluate_val_data] if isinstance(
                evaluate_val_data, str) else Events.EPOCH_COMPLETED)
            trainer.add_event_handler(eval_val_event, log_evaluation_val_data)

        if mlflow_logging:
            mlflow_logger = MLflowLogger()

            logging_params = {
                "train_n_samples": len(train_dataset),
                "train_n_batches": len(train_loader),
                "optimizer": _name(optimizer),
                "loss_fn": _name(loss_fn),
                "pytorch_version": torch.__version__,
                "ignite_version": ignite.__version__,
            }
            logging_params.update(_loggable_dict(optimizer_params,
                                                 "optimizer"))
            logging_params.update(
                _loggable_dict(train_data_loader_params, "train"))
            if scheduler:
                logging_params.update({"scheduler": _name(scheduler)})
                logging_params.update(
                    _loggable_dict(scheduler_params, "scheduler"))

            if evaluate_val_data:
                logging_params.update({
                    "val_n_samples": len(val_dataset),
                    "val_n_batches": len(val_loader),
                })
                logging_params.update(
                    _loggable_dict(val_data_loader_params, "val"))

            mlflow_logger.log_params(logging_params)

            batch_metric_names = ["batch_loss", "ema_loss"]
            if scheduler:
                batch_metric_names.append(scheduler_params.get("param_name"))

            mlflow_logger.attach(
                trainer,
                log_handler=OutputHandler(
                    tag="step",
                    metric_names=batch_metric_names,
                    global_step_transform=global_step_from_engine(trainer),
                ),
                event_name=Events.ITERATION_COMPLETED,
            )

            if evaluate_train_data:
                mlflow_logger.attach(
                    evaluator_train,
                    log_handler=OutputHandler(
                        tag="train",
                        metric_names=list(evaluation_metrics.keys()),
                        global_step_transform=global_step_from_engine(trainer),
                    ),
                    event_name=Events.COMPLETED,
                )
            if evaluate_val_data:
                mlflow_logger.attach(
                    evaluator_val,
                    log_handler=OutputHandler(
                        tag="val",
                        metric_names=list(evaluation_metrics.keys()),
                        global_step_transform=global_step_from_engine(trainer),
                    ),
                    event_name=Events.COMPLETED,
                )

        trainer.run(train_loader, max_epochs=epochs)

        try:
            if pbar and pbar.pbar:
                pbar.pbar.close()
        except Exception as e:
            log.error(e, exc_info=True)

        model = load_latest_model(model_checkpoint_params)(model)

        return model
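
# get_score_function above is a project-specific helper; a plausible sketch of
# what it may return is shown below. ignite's EarlyStopping and ModelCheckpoint
# treat a larger score as better, so a metric that should be minimized (such as
# ema_loss) has to be negated.
def get_score_function_sketch(metric_name, minimize=True):
    def score_function(engine):
        value = engine.state.metrics[metric_name]
        return -value if minimize else value
    return score_function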
Example #27
                pbar.log_message(name + ' is unfrozen')
                for param in child.parameters():
                    param.requires_grad = True
            else:
                pbar.log_message(name + ' is frozen')
                for param in child.parameters():
                    param.requires_grad = False
    elif epoch > 1:
        pbar.log_message("Turn on all the layers")
        for name, child in model.named_children():
            for param in child.parameters():
                param.requires_grad = True


pbar = ProgressBar(bar_format='')
pbar.attach(trainer, output_transform=lambda x: {'loss': x})
trainer.run(loader, max_epochs=500)


# with torch.no_grad():
#     preds = np.empty(0)
#     for x, _ in tqdm_notebook(tloader):
#         x = x.to(device)
#         output = model_resnet_18(x)
#         idx = output.max(dim=-1)[1].cpu().numpy()
#         preds = np.append(preds, idx, axis=0)
#
#
# submission = pd.read_csv(path_data + '/test.csv')
# submission['sirna'] = preds.astype(int)
# submission.to_csv('submission_1.csv', index=False, columns=['id_code', 'sirna'])
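
# A minimal sketch of how a gradual-unfreezing handler like the (truncated) one
# above can be attached to a trainer; the epoch threshold, toy model and dummy
# data are illustrative assumptions.
import torch
from ignite.engine import Engine, Events

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 2))
trainer = Engine(lambda engine, batch: 0.0)

@trainer.on(Events.EPOCH_STARTED)
def unfreeze_after_first_epoch(engine):
    # freeze everything during the first epoch, then train all layers
    requires_grad = engine.state.epoch > 1
    for name, child in model.named_children():
        for param in child.parameters():
            param.requires_grad = requires_grad

trainer.run([0], max_epochs=3)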
Example #28
    ewc_loss = 0
    if 'ewc_loss' in engine.state.output:
        ewc_loss = engine.state.output['ewc_loss']
    metric = model.get_metrics()
    lr = optimizer.param_groups[0]['lr']
    e = engine.state.epoch
    n = engine.state.max_epochs
    i = engine.state.iteration
    print(
        "Epoch {}/{} : {} - batch loss: {}, ewc loss: {}, lr: {}, accuracy: {}, average: {} "
        .format(e, n, i, batch_loss, ewc_loss, lr, metric['accuracy'],
                metric['average']))


pbar = ProgressBar()
pbar.attach(itrainer, ['loss'])
current_task = None


@itrainer.on(Events.EPOCH_COMPLETED)
def run_validation(engine):
    val_iterator = BucketIterator(batch_size=args.bs,
                                  sorting_keys=[("tokens", "num_tokens")])
    val_iterator.index_with(vocabulary[current_task])
    raw_val_generator = iterator(dev_data[current_task], num_epochs=1)
    val_groups = list(raw_val_generator)
    model.get_metrics(True)
    ievaluator.run(val_groups)
    batch_loss = ievaluator.state.metrics['loss']
    metric = ievaluator.state.metrics
    lr = optimizer.param_groups[0]['lr']
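
# A minimal sketch of how the 'loss' entry read from ievaluator.state.metrics
# above can be produced: attach a Loss metric to the evaluation engine. The
# evaluation step, criterion and dummy batch below are illustrative assumptions.
import torch
from ignite.engine import Engine
from ignite.metrics import Loss

def eval_step(engine, batch):
    y_pred, y = batch  # assume each batch already carries predictions and targets
    return y_pred, y

ievaluator_sketch = Engine(eval_step)
Loss(torch.nn.CrossEntropyLoss()).attach(ievaluator_sketch, 'loss')

y_pred = torch.randn(4, 3)
y = torch.tensor([0, 1, 2, 1])
ievaluator_sketch.run([(y_pred, y)])
print(ievaluator_sketch.state.metrics['loss'])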
Example #29
    def setup(self, training_metrics):
        def metric_name(n) -> str:
            if n.endswith('Accuracy'):
                n = 'acc'
            else:
                n = n[:-6] if n.endswith('Metric') else n
            return n

        def print_metrics(metrics) -> str:
            rv = ''
            metric_keys = sorted(k for k in metrics)
            for k in metric_keys:
                if k == 'Accuracy':
                    rv += f'{metric_name(k)}: {metrics[k]:.3}'
                else:
                    rv += f'{metric_name(k)}: {metrics[k]:.6}'
            return rv

        if self.seed:
            set_seed_everywhere(self.seed, self.cuda)

        pbar = ProgressBar()

        names = []
        for k, v in training_metrics.items():
            name = f'r{k}'
            names.append(name)
            RunningAverage(v).attach(self.trainer, name)
        RunningAverage(
            None,
            output_transform=lambda x: x[-1] * self.loss_accumulation_steps,
        ).attach(self.trainer, 'rloss')
        names.append('rloss')
        pbar.attach(self.trainer, names)

        pbar = ProgressBar()
        pbar.attach(self.evaluator)

        # A few event handlers. To add or modify event handlers, extend the __init__ method of RunnerABC.
        # Ignite provides the necessary abstractions and a well-stocked collection of useful tools.

        @self.trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(trainer):
            self.evaluator.run(self.dataset_splits.val_data_loader())
            metrics = self.evaluator.state.metrics
            logger.info(
                f"Validation Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}"
            )

            if self.scheduler:
                self.scheduler.step(
                    metrics[self.loss_metric.__class__.__name__])

        @self.trainer.on(Events.COMPLETED)
        def log_test_results(trainer):
            self.evaluator.run(self.dataset_splits.test_data_loader())
            metrics = self.evaluator.state.metrics
            logger.info(
                f"Test Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}"
            )

        if self.tensorboard_logs:
            tb_logger = TensorboardLogger(log_dir=self.tensorboard_logs)
            tb_logger.attach(self.trainer,
                             log_handler=OutputHandler(
                                 tag="training",
                                 output_transform=lambda loss: {'loss': loss}),
                             event_name=Events.ITERATION_COMPLETED)
            tb_logger.attach(self.evaluator,
                             log_handler=OutputHandler(
                                 tag="validation",
                                 metric_names=["LossMetric"],
                                 another_engine=self.trainer),
                             event_name=Events.EPOCH_COMPLETED)
            tb_logger.attach(self.trainer,
                             log_handler=OptimizerParamsHandler(
                                 self.optimizer),
                             event_name=Events.ITERATION_STARTED)
            tb_logger.attach(self.trainer,
                             log_handler=WeightsScalarHandler(self.model),
                             event_name=Events.ITERATION_COMPLETED)
            tb_logger.attach(self.trainer,
                             log_handler=WeightsHistHandler(self.model),
                             event_name=Events.EPOCH_COMPLETED)
            tb_logger.attach(self.trainer,
                             log_handler=GradsScalarHandler(self.model),
                             event_name=Events.ITERATION_COMPLETED)

            # This is important to close the tensorboard file logger
            @self.trainer.on(Events.COMPLETED)
            def end_tensorboard(trainer):
                logger.info("Training completed")
                tb_logger.close()

        if self.embeddings_name:

            @self.trainer.on(Events.COMPLETED)
            def log_embeddings(trainer):
                if hasattr(self.model, self.embeddings_name) and hasattr(
                        self.dataset_splits, "vectorizer"):
                    logger.info(
                        f"Logging embeddings ({self.embeddings_name}) to Tensorboard!"
                    )
                    embeddings = getattr(self.model,
                                         self.embeddings_name).weight.data
                    metadata = [
                        str(self.dataset_splits.vectorizer.data_vocab.
                            _id2token[token_index]).encode('utf-8')
                        for token_index in range(embeddings.shape[0])
                    ]
                    self.writer.add_embedding(
                        mat=embeddings,
                        metadata=metadata,
                        global_step=self.trainer.state.epoch)
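
# A small standalone sketch of the embedding logging done in log_embeddings
# above, using torch.utils.tensorboard directly; the embedding matrix, the
# token labels and the log directory are illustrative assumptions.
import torch
from torch.utils.tensorboard import SummaryWriter

embeddings = torch.randn(3, 8)          # 3 tokens, 8-dimensional embeddings
metadata = ['<pad>', 'hello', 'world']  # one label per embedding row
writer = SummaryWriter(log_dir='/tmp/tb_embeddings')
writer.add_embedding(mat=embeddings, metadata=metadata, global_step=1)
writer.close()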
Example #30
def main(local_rank):
    params = init_parms(local_rank)
    device = params.get('device')
    model = ASRModel(input_features=config.num_mel_banks,
                     num_classes=config.vocab_size).to(device)
    logger.info(
        f'Model initialized with {get_model_size(model):.3f}M parameters')
    optimizer = Ranger(model.parameters(), lr=config.lr, eps=1e-5)
    model = DistributedDataParallel(model,
                                    device_ids=[local_rank],
                                    output_device=local_rank,
                                    check_reduction=True)
    load_checkpoint(model, optimizer, params)
    print(f"Loaded model on {local_rank}")
    start_epoch = params['start_epoch']
    sup_criterion = CustomCTCLoss()
    unsup_criterion = UDALoss()
    if args.local_rank == 0:
        tb_logger = TensorboardLogger(log_dir=log_path)
        pbar = ProgressBar(persist=True, desc="Training")
        pbar_valid = ProgressBar(persist=True, desc="Validation Clean")
        pbar_valid_other = ProgressBar(persist=True, desc="Validation Other")
        pbar_valid_airtel = ProgressBar(persist=True, desc="Validation Airtel")
        pbar_valid_airtel_payments = ProgressBar(
            persist=True, desc="Validation Airtel Payments")
        timer = Timer(average=True)
        best_meter = params.get('best_stats', BestMeter())

    trainCleanPath = os.path.join(lmdb_root_path, 'train-labelled')
    trainOtherPath = os.path.join(lmdb_root_path, 'train-unlabelled')
    trainCommonVoicePath = os.path.join(lmdb_commonvoice_root_path,
                                        'train-labelled-en')
    trainAirtelPath = os.path.join(lmdb_airtel_root_path, 'train-labelled-en')
    trainAirtelPaymentsPath = os.path.join(lmdb_airtel_payments_root_path,
                                           'train-labelled-en')
    testCleanPath = os.path.join(lmdb_root_path, 'test-clean')
    testOtherPath = os.path.join(lmdb_root_path, 'test-other')
    testAirtelPath = os.path.join(lmdb_airtel_root_path, 'test-labelled-en')
    testAirtelPaymentsPath = os.path.join(lmdb_airtel_payments_root_path,
                                          'test-labelled-en')
    devOtherPath = os.path.join(lmdb_root_path, 'dev-other')

    train_clean = lmdbMultiDataset(roots=[
        trainCleanPath, trainOtherPath, trainCommonVoicePath, trainAirtelPath,
        trainAirtelPaymentsPath
    ],
                                   transform=image_train_transform)
    train_other = lmdbMultiDataset(roots=[devOtherPath],
                                   transform=image_train_transform)

    test_clean = lmdbMultiDataset(roots=[testCleanPath],
                                  transform=image_val_transform)
    test_other = lmdbMultiDataset(roots=[testOtherPath],
                                  transform=image_val_transform)
    test_airtel = lmdbMultiDataset(roots=[testAirtelPath],
                                   transform=image_val_transform)
    test_payments_airtel = lmdbMultiDataset(roots=[testAirtelPaymentsPath],
                                            transform=image_val_transform)

    logger.info(
        f'Loaded Train & Test Datasets, train_labbeled={len(train_clean)}, train_unlabbeled={len(train_other)}, test_clean={len(test_clean)}, test_other={len(test_other)}, test_airtel={len(test_airtel)}, test_payments_airtel={len(test_payments_airtel)} examples'
    )

    def train_update_function(engine, _):
        optimizer.zero_grad()
        # Supervised gt, pred
        imgs_sup, labels_sup, label_lengths = next(
            engine.state.train_loader_labbeled)
        imgs_sup = imgs_sup.cuda(local_rank, non_blocking=True)
        probs_sup = model(imgs_sup)

        # Unsupervised gt, pred
        # imgs_unsup, augmented_imgs_unsup = next(engine.state.train_loader_unlabbeled)
        # with torch.no_grad():
        #     probs_unsup = model(imgs_unsup.to(device))
        # probs_aug_unsup = model(augmented_imgs_unsup.to(device))

        sup_loss = sup_criterion(probs_sup, labels_sup, label_lengths)
        # unsup_loss = unsup_criterion(probs_unsup, probs_aug_unsup)

        # Blend supervised and unsupervised losses till unsupervision_warmup_epoch
        # alpha = get_alpha(engine.state.epoch)
        # final_loss = ((1 - alpha) * sup_loss) + (alpha * unsup_loss)

        # final_loss = sup_loss
        sup_loss.backward()
        optimizer.step()

        return sup_loss.item()

    @torch.no_grad()
    def validate_update_function(engine, batch):
        img, labels, label_lengths = batch
        y_pred = model(img.cuda(local_rank, non_blocking=True))
        if np.random.rand() > 0.99:
            pred_sentences = get_most_probable(y_pred)
            labels_list = labels.tolist()
            idx = 0
            for i, length in enumerate(label_lengths.cpu().tolist()):
                pred_sentence = pred_sentences[i]
                gt_sentence = get_sentence(labels_list[idx:idx + length])
                idx += length
                print(f"Pred sentence: {pred_sentence}, GT: {gt_sentence}")
        return (y_pred, labels, label_lengths)

    train_sampler_labbeled = torch.utils.data.distributed.DistributedSampler(
        train_clean, num_replicas=3, rank=args.local_rank)
    train_sampler_unlabbeled = torch.utils.data.distributed.DistributedSampler(
        train_other, num_replicas=3, rank=args.local_rank)
    test_sampler_clean = torch.utils.data.distributed.DistributedSampler(
        test_clean, num_replicas=3, rank=args.local_rank, shuffle=False)
    test_sampler_other = torch.utils.data.distributed.DistributedSampler(
        test_other, num_replicas=3, rank=args.local_rank, shuffle=False)
    test_sampler_airtel = torch.utils.data.distributed.DistributedSampler(
        test_airtel, num_replicas=3, rank=args.local_rank, shuffle=False)
    test_sampler_airtel_payments = torch.utils.data.distributed.DistributedSampler(
        test_payments_airtel,
        num_replicas=3,
        rank=args.local_rank,
        shuffle=False)

    train_loader_labbeled_loader = torch.utils.data.DataLoader(
        train_clean,
        batch_size=train_batch_size // 3,
        sampler=train_sampler_labbeled,
        num_workers=config.workers // 3,
        pin_memory=True,
        collate_fn=allign_collate)
    train_loader_unlabbeled_loader = torch.utils.data.DataLoader(
        train_other,
        batch_size=train_batch_size * 4,
        sampler=train_sampler_unlabbeled,
        num_workers=config.workers // 3,
        pin_memory=True,
        collate_fn=allign_collate)
    test_loader_clean = torch.utils.data.DataLoader(
        test_clean,
        batch_size=1,
        sampler=test_sampler_clean,
        num_workers=config.workers // 3,
        pin_memory=True,
        collate_fn=allign_collate)
    test_loader_other = torch.utils.data.DataLoader(
        test_other,
        batch_size=1,
        sampler=test_sampler_other,
        num_workers=config.workers // 3,
        pin_memory=True,
        collate_fn=allign_collate)
    test_loader_airtel = torch.utils.data.DataLoader(
        test_airtel,
        batch_size=1,
        sampler=test_sampler_airtel,
        num_workers=config.workers // 3,
        pin_memory=True,
        collate_fn=allign_collate)
    test_loader_airtel_payments = torch.utils.data.DataLoader(
        test_payments_airtel,
        batch_size=1,
        sampler=test_sampler_airtel_payments,
        num_workers=config.workers // 3,
        pin_memory=True,
        collate_fn=allign_collate)
    trainer = Engine(train_update_function)
    iteration_log_step = int(0.33 * len(train_loader_labbeled_loader))
    evaluator_clean = Engine(validate_update_function)
    evaluator_other = Engine(validate_update_function)
    evaluator_airtel = Engine(validate_update_function)
    evaluator_airtel_payments = Engine(validate_update_function)
    metrics = {'wer': WordErrorRate(), 'cer': CharacterErrorRate()}
    for name, metric in metrics.items():
        metric.attach(evaluator_clean, name)
        metric.attach(evaluator_other, name)
        metric.attach(evaluator_airtel, name)
        metric.attach(evaluator_airtel_payments, name)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=config.lr_gamma,
        patience=int(config.epochs * 0.05),
        verbose=True,
        threshold_mode="abs",
        cooldown=int(config.epochs * 0.025),
        min_lr=1e-5)
    if args.local_rank == 0:
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(
                             tag="training",
                             output_transform=lambda loss: {'loss': loss}),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(trainer,
                         log_handler=WeightsHistHandler(model),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=WeightsScalarHandler(model),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=GradsScalarHandler(model),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=GradsHistHandler(model),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(evaluator_clean,
                         log_handler=OutputHandler(tag="validation_clean",
                                                   metric_names=["wer", "cer"],
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(evaluator_other,
                         log_handler=OutputHandler(tag="validation_other",
                                                   metric_names=["wer", "cer"],
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(evaluator_airtel,
                         log_handler=OutputHandler(tag="validation_airtel",
                                                   metric_names=["wer", "cer"],
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(evaluator_airtel_payments,
                         log_handler=OutputHandler(
                             tag="validation_airtel_payments",
                             metric_names=["wer", "cer"],
                             another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)
        pbar.attach(trainer, output_transform=lambda x: {'loss': x})
        pbar_valid.attach(evaluator_clean, ['wer', 'cer'],
                          event_name=Events.EPOCH_COMPLETED,
                          closing_event_name=Events.COMPLETED)
        pbar_valid_other.attach(evaluator_other, ['wer', 'cer'],
                                event_name=Events.EPOCH_COMPLETED,
                                closing_event_name=Events.COMPLETED)
        pbar_valid_airtel.attach(evaluator_airtel, ['wer', 'cer'],
                                 event_name=Events.EPOCH_COMPLETED,
                                 closing_event_name=Events.COMPLETED)
        pbar_valid_airtel_payments.attach(evaluator_airtel_payments,
                                          ['wer', 'cer'],
                                          event_name=Events.EPOCH_COMPLETED,
                                          closing_event_name=Events.COMPLETED)
        timer.attach(trainer)

    @trainer.on(Events.STARTED)
    def set_init_epoch(engine):
        engine.state.epoch = params['start_epoch']
        logger.info(f'Initial epoch for trainer set to {engine.state.epoch}')

    @trainer.on(Events.EPOCH_STARTED)
    def set_model_train(engine):
        if hasattr(engine.state, 'train_loader_labbeled'):
            del engine.state.train_loader_labbeled
        engine.state.train_loader_labbeled = iter(train_loader_labbeled_loader)
        # engine.state.train_loader_unlabbeled = iter(train_loader_unlabbeled_loader)

    @trainer.on(Events.ITERATION_COMPLETED)
    def iteration_completed(engine):
        if (engine.state.iteration % iteration_log_step
                == 0) and (engine.state.iteration > 0):
            engine.state.epoch += 1
            train_clean.set_epochs(engine.state.epoch)
            train_other.set_epochs(engine.state.epoch)
            model.eval()
            logger.info('Model set to eval mode')
            evaluator_clean.run(test_loader_clean)
            evaluator_other.run(test_loader_other)
            evaluator_airtel.run(test_loader_airtel)
            evaluator_airtel_payments.run(test_loader_airtel_payments)
            model.train()
            logger.info('Model set back to train mode')

    if args.local_rank == 0:

        @evaluator_other.on(Events.EPOCH_COMPLETED)
        def save_checkpoints(engine):
            metrics = engine.state.metrics
            wer = metrics['wer']
            cer = metrics['cer']
            epoch = trainer.state.epoch
            scheduler.step(wer)
            save_checkpoint(model, optimizer, best_meter, wer, cer, epoch)
            best_meter.update(wer, cer, epoch)

        @trainer.on(Events.EPOCH_COMPLETED)
        def after_complete(engine):
            logger.info('Epoch {} done. Time per batch: {:.3f}[s]'.format(
                engine.state.epoch, timer.value()))
            timer.reset()

    trainer.run(train_loader_labbeled_loader, max_epochs=epochs)
    if args.local_rank == 0:
        tb_logger.close()
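
# A stripped-down sketch of the scheduler pattern above: ReduceLROnPlateau is
# stepped with the validation word error rate after each evaluation run, so the
# learning rate drops once the WER stops improving. The toy optimizer and the
# WER values are illustrative assumptions.
import torch

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2)

for wer in [0.42, 0.40, 0.41, 0.41, 0.41, 0.41]:
    scheduler.step(wer)  # lr is reduced after the WER plateaus for `patience` steps
print(optimizer.param_groups[0]['lr'])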