Example #1
    def _test(n_epochs, metric_device):
        n_iters = 100
        s = 16
        n_classes = 10

        offset = n_iters * s
        y_true = torch.randint(0, n_classes, size=(offset * idist.get_world_size(),)).to(device)
        y_preds = torch.rand(offset * idist.get_world_size(), n_classes).to(device)

        def update(engine, i):
            return (
                y_preds[i * s + rank * offset : (i + 1) * s + rank * offset, :],
                y_true[i * s + rank * offset : (i + 1) * s + rank * offset],
            )

        engine = Engine(update)

        k = 5
        acc = TopKCategoricalAccuracy(k=k, device=metric_device)
        acc.attach(engine, "acc")

        data = list(range(n_iters))
        engine.run(data=data, max_epochs=n_epochs)

        assert "acc" in engine.state.metrics
        res = engine.state.metrics["acc"]
        if isinstance(res, torch.Tensor):
            res = res.cpu().numpy()

        true_res = top_k_accuracy(y_true.cpu().numpy(), y_preds.cpu().numpy(), k=k)

        assert pytest.approx(res) == true_res
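The top_k_accuracy reference the assertion compares against is assumed to be a small NumPy helper along the following lines (a sketch, not the test suite's actual implementation): a sample counts as correct when its true label appears among the k highest-scoring classes.

import numpy as np

def top_k_accuracy(y_true, y_probas, k=5):
    # indices of the k highest-scoring classes for each sample
    top_k_preds = np.argsort(y_probas, axis=1)[:, -k:]
    # a hit when the true label is among those k indices
    hits = np.any(top_k_preds == y_true[:, None], axis=1)
    return hits.mean()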
Example #2
def test_zero_div():
    acc = TopKCategoricalAccuracy(2)
    with pytest.raises(
        NotComputableError,
        match=r"TopKCategoricalAccuracy must have at least one example before it can be computed",
    ):
        acc.compute()
def test_compute():
    acc = TopKCategoricalAccuracy(2)

    y_pred = torch.FloatTensor([[0.2, 0.4, 0.6, 0.8], [0.8, 0.6, 0.4, 0.2]])
    y = torch.ones(2).type(torch.LongTensor)
    acc.update((y_pred, y))
    assert acc.compute() == 0.5

    acc.reset()
    y_pred = torch.FloatTensor([[0.4, 0.8, 0.2, 0.6], [0.8, 0.6, 0.4, 0.2]])
    y = torch.ones(2).type(torch.LongTensor)
    acc.update((y_pred, y))
    assert acc.compute() == 1.0
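Why the first assertion is 0.5: with k=2 the metric checks whether the true class (index 1 for both rows) is among each row's two highest scores. A quick check of the top-2 indices:

import torch

y_pred = torch.FloatTensor([[0.2, 0.4, 0.6, 0.8], [0.8, 0.6, 0.4, 0.2]])
# top-2 indices per row: [[3, 2], [0, 1]] -> only the second row contains class 1
print(torch.topk(y_pred, k=2, dim=1).indices)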
def create_supervised_classification_trainer(model,
                                             loss_fn,
                                             optimizer,
                                             val_loader,
                                             learning_rate_scheduler,
                                             callback=None,
                                             use_cuda=None):
    """
    Todo: Add description
    :param model:
    :param loss_fn:
    :param optimizer:
    :param val_loader:
    :param learning_rate_scheduler:
    :param callback:
    :param use_cuda:
    :return:
    """

    if use_cuda and not torch.cuda.is_available():
        raise RuntimeError(
            'Trying to run with CUDA, but CUDA is not available')

    if use_cuda and torch.cuda.is_available():
        device = torch.device('cuda:0')
        torch.backends.cudnn.benchmark = True
        if torch.cuda.device_count() > 1 and not isinstance(
                model, nn.DataParallel):
            model = nn.DataParallel(model)
            print("Using {} gpus for training".format(
                torch.cuda.device_count()))
    else:
        device = torch.device('cpu')

    trainer = create_trainer(model=model,
                             optimizer=optimizer,
                             loss_fn=loss_fn,
                             metrics={
                                 'top_1_accuracy': CategoricalAccuracy(),
                                 'top_5_accuracy': TopKCategoricalAccuracy(),
                                 'loss': Loss(loss_fn),
                             },
                             device=device)

    evaluator = create_supervised_classification_evaluator(
        model, loss_fn, use_cuda)

    if learning_rate_scheduler:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda _: learning_rate_scheduler.step())

    if callback is not None:
        trainer.add_event_handler(Events.ITERATION_COMPLETED, callback, model)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results,
                              optimizer)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, run_evaluation,
                              evaluator, val_loader)

    return trainer, evaluator
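A hedged usage sketch for the factory above; model, loss_fn, optimizer, the loaders and the scheduler are placeholders assumed to be defined elsewhere:

trainer, evaluator = create_supervised_classification_trainer(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    val_loader=val_loader,
    learning_rate_scheduler=scheduler,
    use_cuda=torch.cuda.is_available())
trainer.run(train_loader, max_epochs=10)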
def create_supervised_classification_evaluator(model, loss_fn, use_cuda):
    """
    Create an evaluator
    :param model:
    :param loss_fn:
    :param use_cuda:
    :return:
    """

    if use_cuda and torch.cuda.is_available():
        device = torch.device('cuda:0')
        # enable cudnn autotuning; wrap in DataParallel when multiple GPUs are available
        torch.backends.cudnn.benchmark = True
        if torch.cuda.device_count() > 1 and not isinstance(
                model, nn.DataParallel):
            model = nn.DataParallel(model)
            logger.info("Using %d gpus for training",
                        torch.cuda.device_count())
    else:
        device = torch.device('cpu')

    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'top_1_accuracy': CategoricalAccuracy(),
                                                'top_5_accuracy': TopKCategoricalAccuracy(),
                                                'loss': Loss(loss_fn)
                                            },
                                            device=device)
    return evaluator
Example #6
def create_classification_evaluator(
    model,
    device,
    non_blocking=True,
):
    from ignite.metrics import Accuracy, TopKCategoricalAccuracy
    from ignite.engine import create_supervised_evaluator

    def _prepare_batch(batch, device, non_blocking):
        image = batch['image'].to(device, non_blocking=non_blocking)
        label = batch['label'].to(device, non_blocking=non_blocking)
        return image, label

    metrics = {
        'accuracy': Accuracy(),
        'top5': TopKCategoricalAccuracy(k=5),
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics,
                                            device,
                                            non_blocking=non_blocking,
                                            prepare_batch=_prepare_batch)

    return evaluator
Example #7
def metrics_selector(mode, loss):
    mode = mode.lower()
    if mode == "classification":
        metrics = {
            "loss": loss,
            "accuracy": Accuracy(),
            "accuracy_topk": TopKCategoricalAccuracy(),
            "precision": Precision(average=True),
            "recall": Recall(average=True)
        }
    elif mode == "multiclass-multilabel":
        metrics = {
            "loss": loss,
            "accuracy": Accuracy(),
        }
    elif mode == "regression":
        metrics = {
            "loss": loss,
            "mse": MeanSquaredError(),
            "mae": MeanAbsoluteError()
        }
    else:
        raise RuntimeError(
            "Invalid task mode: choose classification, multiclass-multilabel or regression")

    return metrics
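A minimal usage sketch of metrics_selector, assuming a model is already defined; the loss argument is expected to already be an ignite metric such as Loss:

import torch.nn as nn
from ignite.metrics import Loss
from ignite.engine import create_supervised_evaluator

metrics = metrics_selector("classification", Loss(nn.CrossEntropyLoss()))
evaluator = create_supervised_evaluator(model, metrics=metrics)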
Example #8
def accuracy_metrics(ks: Iterable[int],
                     output_transform=lambda x: x,
                     prefix="") -> Dict[str, Metric]:
    return {
        f"{prefix}accuracy@{k}":
        TopKCategoricalAccuracy(k=k, output_transform=output_transform)
        for k in ks
    }
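A small usage sketch, assuming an already created ignite evaluator engine; after a run, evaluator.state.metrics will contain "val_accuracy@1" and "val_accuracy@5":

for name, metric in accuracy_metrics(ks=(1, 5), prefix="val_").items():
    metric.attach(evaluator, name)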
Example #9
    def __init__(self, prefix, loss_type: str, threshold=0.5, top_k=[1, 5, 10], n_classes: int = None,
                 multilabel: bool = None, metrics=["precision", "recall", "top_k", "accuracy"]):
        super().__init__()

        self.loss_type = loss_type.upper()
        self.threshold = threshold
        self.n_classes = n_classes
        self.multilabel = multilabel
        self.top_ks = top_k
        self.prefix = prefix

        self.metrics = {}
        for metric in metrics:
            if "precision" == metric:
                self.metrics[metric] = Precision(average=True, is_multilabel=multilabel)
            elif "recall" == metric:
                self.metrics[metric] = Recall(average=True, is_multilabel=multilabel)

            elif "top_k" in metric:
                if n_classes:
                    top_k = [k for k in top_k if k < n_classes]

                if multilabel:
                    self.metrics[metric] = TopKMultilabelAccuracy(k_s=top_k)
                else:
                    self.metrics[metric] = TopKCategoricalAccuracy(k=max(int(np.log(n_classes)), 1),
                                                                   output_transform=None)
            elif "macro_f1" in metric:
                self.metrics[metric] = F1(num_classes=n_classes, average="macro", multilabel=multilabel)
            elif "micro_f1" in metric:
                self.metrics[metric] = F1(num_classes=n_classes, average="micro", multilabel=multilabel)
            elif "mse" == metric:
                self.metrics[metric] = MeanSquaredError()
            elif "auroc" == metric:
                self.metrics[metric] = AUROC(num_classes=n_classes)
            elif "avg_precision" in metric:
                self.metrics[metric] = AveragePrecision(num_classes=n_classes)

            elif "accuracy" in metric:
                self.metrics[metric] = Accuracy(top_k=int(metric.split("@")[-1]) if "@" in metric else None)

            elif "ogbn" in metric:
                self.metrics[metric] = OGBNodeClfMetrics(NodeEvaluator(metric))
            elif "ogbg" in metric:
                self.metrics[metric] = OGBNodeClfMetrics(GraphEvaluator(metric))
            elif "ogbl" in metric:
                self.metrics[metric] = OGBLinkPredMetrics(LinkEvaluator(metric))
            else:
                print(f"WARNING: metric {metric} doesn't exist")

            # Register torchmetrics metrics as nn.Module attributes so they are moved to the correct CUDA device during training
            if isinstance(self.metrics[metric], torchmetrics.metric.Metric):
                setattr(self, metric, self.metrics[metric])

        self.reset_metrics()
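Note on the k chosen above for TopKCategoricalAccuracy: it is the natural log of the class count, floored and clamped to at least 1, e.g.

import numpy as np
# 10 classes -> k=2, 1000 classes -> k=6
print(max(int(np.log(10)), 1), max(int(np.log(1000)), 1))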
def _test_distrib_accumulator_device(device):

    metric_devices = [torch.device("cpu")]
    if device.type != "xla":
        metric_devices.append(idist.device())
    for metric_device in metric_devices:

        acc = TopKCategoricalAccuracy(2, device=metric_device)
        assert acc._device == metric_device
        assert acc._num_correct.device == metric_device, "{}:{} vs {}:{}".format(
            type(acc._num_correct.device), acc._num_correct.device,
            type(metric_device), metric_device)

        y_pred = torch.tensor([[0.2, 0.4, 0.6, 0.8], [0.8, 0.6, 0.4, 0.2]])
        y = torch.ones(2).long()
        acc.update((y_pred, y))

        assert acc._num_correct.device == metric_device, "{}:{} vs {}:{}".format(
            type(acc._num_correct.device), acc._num_correct.device,
            type(metric_device), metric_device)
Example #11
    def _create_evaluator_engine(self):
        """Create the evaluator engine for this trainer."""
        return create_hog_gcn_evaluator(
            self.model,
            device=self.device,
            metrics={
                "Accuracy": Accuracy(),
                "Loss": Loss(self.loss),
                "Recall": Recall(average=True),
                "Top K Categorical Accuracy": TopKCategoricalAccuracy(k=10),
            },
        )
Example #12
    def run(self, logging_dir=None, best_model_only=True):

        #assert self.model is not None, '[ERROR] No model object loaded. Please load a PyTorch model torch.nn object into the class object.'
        #assert (self.train_loader is not None) or (self.val_loader is not None), '[ERROR] You must specify data loaders.'

        for key in self.trainer_status.keys():
            assert self.trainer_status[key], \
                '[ERROR] The {} has not been generated and you cannot proceed.'.format(key)
        print('[INFO] Trainer pass OK for training.')

        # TRAIN ENGINE
        # Create the objects for training
        self.train_engine = self.create_trainer()

        # METRICS AND EVALUATION
        # Metrics - running average
        RunningAverage(output_transform=lambda x: x).attach(
            self.train_engine, 'loss')

        # Metrics - epochs
        metrics = {
            'accuracy': Accuracy(),
            'recall': Recall(average=True),
            'precision': Precision(average=True),
            'f1': Fbeta(beta=1),
            'topKCatAcc': TopKCategoricalAccuracy(k=5),
            'loss': Loss(self.criterion)
        }

        # Create evaluators
        self.evaluator = self.create_evaluator(metrics=metrics)
        self.train_evaluator = self.create_evaluator(metrics=metrics,
                                                     tag='train')

        # LOGGING
        # Create logging to terminal
        self.add_logging()

        # Create Tensorboard logging
        self.add_tensorboard_logging(logging_dir=logging_dir)

        ## CALLBACKS
        self.create_callbacks(best_model_only=best_model_only)

        ## TRAIN
        # Train the model
        print('[INFO] Executing model training...')
        self.train_engine.run(self.train_loader,
                              max_epochs=self.config.TRAIN.NUM_EPOCHS)
        print('[INFO] Model training is complete.')
Example #13
    def _create_evaluator_engine(self):
        """Create the evaluator engine for this trainer."""
        return create_classification_gcn_evaluator(
            self.model,
            self.dataset.classes_dataframe,
            device=self.device,
            processes=self.processes,
            metrics={
                "Accuracy": Accuracy(),
                "Loss": Loss(self.loss),
                "Recall": Recall(average=True),
                "Top K Categorical Accuracy": TopKCategoricalAccuracy(k=10),
            },
        )
Example #14
def test_zero_div():
    acc = TopKCategoricalAccuracy(2)
    with pytest.raises(NotComputableError):
        acc.compute()
Example #15
def test_compute():
    acc = TopKCategoricalAccuracy(2)

    y_pred = torch.FloatTensor([[0.2, 0.4, 0.6, 0.8], [0.8, 0.6, 0.4, 0.2]])
    y = torch.ones(2).long()
    acc.update((y_pred, y))
    assert isinstance(acc.compute(), float)
    assert acc.compute() == 0.5

    acc.reset()
    y_pred = torch.FloatTensor([[0.4, 0.8, 0.2, 0.6], [0.8, 0.6, 0.4, 0.2]])
    y = torch.ones(2).long()
    acc.update((y_pred, y))
    assert isinstance(acc.compute(), float)
    assert acc.compute() == 1.0
Example #16
    engine.state.iteration = resume_epoch * len(engine.state.dataloader)
    engine.state.epoch = resume_epoch
    print('The iterations are')
    print(engine.state.iteration)
    print('The current epoch is')
    print(engine.state.epoch)


#trainer.add_event_handler(Events.STARTED, resume_training)

metrics = {
    'Loss': Loss(criterion),
    'Accuracy': Accuracy(),
    'Precision': Precision(average=True),
    'Recall': Recall(average=True),
    'Top-5 Accuracy': TopKCategoricalAccuracy(k=5)
}

evaluator = create_supervised_evaluator(model,
                                        metrics=metrics,
                                        device=device,
                                        prepare_batch=zca_prepare_batch,
                                        non_blocking=True)
train_evaluator = create_supervised_evaluator(model,
                                              metrics=metrics,
                                              device=device,
                                              prepare_batch=zca_prepare_batch,
                                              non_blocking=True)

cpe = CustomPeriodicEvent(n_epochs=3)
cpe.attach(trainer)
def evaluate(net, test_dataloader):

    with torch.no_grad():
        net.eval()
        preds_all = torch.empty((len(test_dataloader), 256))
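        # NOTE: one row is stored per batch, so this assumes batch_size == 1 and 256 beam classes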
        top_1 = TopKCategoricalAccuracy(k=1)
        top_5 = TopKCategoricalAccuracy(k=5)
        top_10 = TopKCategoricalAccuracy(k=10)
        for i, data in enumerate(test_dataloader):
            lidar, beams = data
            lidar = lidar.cuda()
            beams = beams.cuda()
            preds = net(lidar)
            preds = F.softmax(preds, dim=1)
            preds_all[i, :] = preds
            top_1.update((preds, torch.argmax(beams)))
            top_5.update((preds, torch.argmax(beams)))
            top_10.update((preds, torch.argmax(beams)))
        net.train()

        print("Top-1: {:.4f} Top-5: {:.4f} Top-10: {:.4f}".format(
            top_1.compute(), top_5.compute(), top_10.compute()))
    return preds_all
Example #18
def run(args, use_gpu=True):

    # saving
    save_path = os.path.join(os.getcwd(), 'models')
    if not os.path.isdir(save_path):
        os.mkdir(save_path)

    model = lipnext(inputDim=256,
                    hiddenDim=512,
                    nClasses=args.nClasses,
                    frameLen=29,
                    alpha=args.alpha)
    model = reload_model(model, args.path).to(device)

    dset_loaders, dset_sizes = data_loader(args)

    train_loader = dset_loaders['train']
    val_loader = dset_loaders['test']

    train_size = dset_sizes['train']
    val_size = dset_sizes['val']

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.)
    scheduler = AdjustLR(optimizer, [args.lr],
                         sleep_epochs=5,
                         half=5,
                         verbose=1)
    # TQDM
    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    # Ignite trainer
    trainer = create_supervised_trainer(model, optimizer, F.cross_entropy,
                                        device=device,
                                        prepare_batch=prepare_train_batch)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'cross_entropy': Loss(F.cross_entropy),
                                                'top-3': TopKCategoricalAccuracy(3)
                                            },
                                            device=device,
                                            prepare_batch=prepare_val_batch)

    # call backs
    @evaluator.on(Events.EPOCH_STARTED)
    def start_val(engine):
        tqdm.write("Evaluation in progress")

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1

        if iter % args.interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(args.interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['cross_entropy']
        top_acc = metrics['top-3']
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f}, Top3: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, top_acc, avg_loss))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):

        # large dataset so saving often
        tqdm.write("saving model ..")
        torch.save(
            model.state_dict(),
            os.path.join(save_path,
                         'epoch' + str(engine.state.epoch + 1) + '.pt'))
        # saving to ONNX format
        dummy_input = torch.randn(args.batch_size, 1, 29, 88, 88)
        torch.onnx.export(model, dummy_input, "lipnext.onnx")

        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        top_acc = metrics['top-3']
        avg_loss = metrics['cross_entropy']
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f}, Top3: {:.2f} Avg loss: {:.2f} "
            .format(engine.state.epoch, avg_accuracy, top_acc, avg_loss))

        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED)
    def update_lr(engine):
        scheduler.step(engine.state.epoch)

    trainer.run(train_loader, max_epochs=args.epochs)
    pbar.close()
Example #19
def training(config,
             local_rank=None,
             with_mlflow_logging=False,
             with_plx_logging=False):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=getattr(
                                          config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)
    criterion = config.criterion.to(device)

    prepare_batch = getattr(config, "prepare_batch", _prepare_batch)
    non_blocking = getattr(config, "non_blocking", True)

    # Setup trainer
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    def train_update_function(engine, batch):

        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y) / accumulation_steps

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return {
            'supervised batch loss': loss.item(),
        }

    trainer = Engine(train_update_function)
    common.setup_common_distrib_training_handlers(
        trainer,
        train_sampler,
        to_save={
            'model': model,
            'optimizer': optimizer
        },
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        with_gpu_stats=True,
        output_names=[
            'supervised batch loss',
        ],
        with_pbars=True,
        with_pbar_on_iters=with_mlflow_logging,
        log_every_iters=1)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config,
                                               "benchmark_dataflow_num_iters",
                                               1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=prepare_batch,
                          device=device).attach(trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(device=device),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5, device=device),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    evaluator_args = dict(model=model,
                          metrics=val_metrics,
                          device=device,
                          non_blocking=non_blocking,
                          prepare_batch=prepare_batch,
                          output_transform=lambda x, y, y_pred: (
                              model_output_transform(y_pred),
                              y,
                          ))
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_mlflow_logging:
        ProgressBar(persist=False,
                    desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(_):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)),
        run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(),
                                            trainer,
                                            optimizer,
                                            evaluators={
                                                "training": train_evaluator,
                                                "validation": evaluator
                                            })
        if with_mlflow_logging:
            common.setup_mlflow_logging(trainer,
                                        optimizer,
                                        evaluators={
                                            "training": train_evaluator,
                                            "validation": evaluator
                                        })

        if with_plx_logging:
            common.setup_plx_logging(trainer,
                                     optimizer,
                                     evaluators={
                                         "training": train_evaluator,
                                         "validation": evaluator
                                     })

        common.save_best_model_by_val_score(config.output_path.as_posix(),
                                            evaluator,
                                            model,
                                            metric_name=score_metric_name,
                                            trainer=trainer)

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        tb_logger.attach(train_evaluator,
                         log_handler=predictions_gt_images_handler(
                             img_denormalize_fn=config.img_denormalize,
                             n_images=15,
                             another_engine=trainer,
                             prefix_tag="training"),
                         event_name=Events.ITERATION_COMPLETED(
                             once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
Example #20
def run_training(model, train, valid, optimizer, loss, lr_find=False):
    print_file(f'Experiment: {rcp.experiment}\nDescription:{rcp.description}',
               f'{rcp.base_path}description.txt')
    print_file(model, f'{rcp.models_path}model.txt')
    print_file(get_transforms(), f'{rcp.models_path}transform_{rcp.stage}.txt')
    # Data
    train.transform = get_transforms()
    valid.transform = get_transforms()
    train.save_csv(f'{rcp.base_path}train_df_{rcp.stage}.csv')
    valid.save_csv(f'{rcp.base_path}valid_df_{rcp.stage}.csv')
    train_loader = DataLoader(train,
                              batch_size=rcp.bs,
                              num_workers=8,
                              shuffle=rcp.shuffle_batch)
    valid_loader = DataLoader(valid,
                              batch_size=rcp.bs,
                              num_workers=8,
                              shuffle=rcp.shuffle_batch)

    if lr_find: lr_finder(model, optimizer, loss, train_loader, valid_loader)

    one_batch = next(iter(train_loader))
    dot = make_dot(model(one_batch[0].to(cfg.device)),
                   params=dict(model.named_parameters()))
    dot.render(f'{rcp.models_path}graph', './', format='png', cleanup=True)
    summary(model,
            one_batch[0].shape[-3:],
            batch_size=rcp.bs,
            device=cfg.device,
            to_file=f'{rcp.models_path}summary_{rcp.stage}.txt')

    # Engines
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        loss,
                                        device=cfg.device)
    t_evaluator = create_supervised_evaluator(
        model,
        metrics={
            'accuracy': Accuracy(),
            'nll': Loss(loss),
            'precision': Precision(average=True),
            'recall': Recall(average=True),
            'topK': TopKCategoricalAccuracy()
        },
        device=cfg.device)
    v_evaluator = create_supervised_evaluator(
        model,
        metrics={
            'accuracy': Accuracy(),
            'nll': Loss(loss),
            'precision_avg': Precision(average=True),
            'recall_avg': Recall(average=True),
            'topK': TopKCategoricalAccuracy(),
            'conf_mat': ConfusionMatrix(num_classes=len(valid.classes), average=None),
        },
        device=cfg.device)

    # Tensorboard
    tb_logger = TensorboardLogger(log_dir=f'{rcp.tb_log_path}{rcp.stage}')
    tb_writer = tb_logger.writer
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer, "lr"),
                     event_name=Events.EPOCH_STARTED)
    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=WeightsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=GradsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=GradsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)

    @trainer.on(Events.EPOCH_COMPLETED)
    def tb_and_log_training_stats(engine):
        t_evaluator.run(train_loader)
        v_evaluator.run(valid_loader)
        tb_and_log_train_valid_stats(engine, t_evaluator, v_evaluator,
                                     tb_writer)

    @trainer.on(
        Events.ITERATION_COMPLETED(every=int(1 + len(train_loader) / 100)))
    def print_dash(engine):
        print('-', sep='', end='', flush=True)

    if cfg.show_batch_images:

        @trainer.on(Events.STARTED)
        def show_batch_images(engine):
            imgs, lbls = next(iter(train_loader))
            denormalize = DeNormalize(**rcp.transforms.normalize)
            for i in range(len(imgs)):
                imgs[i] = denormalize(imgs[i])
            imgs = imgs.to(cfg.device)
            grid = thv.utils.make_grid(imgs)
            tb_writer.add_image('images', grid, 0)
            tb_writer.add_graph(model, imgs)
            tb_writer.flush()

    if cfg.show_top_losses:

        @trainer.on(Events.COMPLETED)
        def show_top_losses(engine, k=6):
            nll_loss = nn.NLLLoss(reduction='none')
            df = predict_dataset(model,
                                 valid,
                                 nll_loss,
                                 transform=None,
                                 bs=rcp.bs,
                                 device=cfg.device)
            df.sort_values('loss', ascending=False, inplace=True)
            df.reset_index(drop=True, inplace=True)
            for i, row in df.iterrows():
                img = cv2.imread(str(row['fname']))
                img = th.as_tensor(img.transpose(2, 0, 1))  # HWC -> CHW
                tag = f'TopLoss_{engine.state.epoch}/{row.loss:.4f}/{row.target}/{row.pred}/{row.pred2}'
                tb_writer.add_image(tag, img, 0)
                if i >= k - 1: break
            tb_writer.flush()

    if cfg.tb_projector:
        images, labels = train.select_n_random(250)
        # get the class labels for each image
        class_labels = [train.classes[lab] for lab in labels]
        # log embeddings
        features = images.view(-1, images.shape[-1] * images.shape[-2])
        tb_writer.add_embedding(features,
                                metadata=class_labels,
                                label_img=images)

    if cfg.log_pr_curve:

        @trainer.on(Events.COMPLETED)
        def log_pr_curve(engine):
            """
            1. gets the probability predictions in a test_size x num_classes Tensor
            2. gets the preds in a test_size Tensor
            takes ~10 seconds to run
            """
            class_probs = []
            class_preds = []
            with th.no_grad():
                for data in valid_loader:
                    imgs, lbls = data
                    imgs, lbls = imgs.to(cfg.device), lbls.to(cfg.device)
                    output = model(imgs)
                    class_probs_batch = [
                        th.softmax(el, dim=0) for el in output
                    ]
                    _, class_preds_batch = th.max(output, 1)
                    class_probs.append(class_probs_batch)
                    class_preds.append(class_preds_batch)
            test_probs = th.cat([th.stack(batch) for batch in class_probs])
            test_preds = th.cat(class_preds)

            for i in range(len(valid.classes)):
                """ Takes in a "class_index" from 0 to 9 and plots the corresponding precision-recall curve"""
                tensorboard_preds = test_preds == i
                tensorboard_probs = test_probs[:, i]

                tb_writer.add_pr_curve(f'{rcp.stage}/{valid.classes[i]}',
                                       tensorboard_preds,
                                       tensorboard_probs,
                                       global_step=engine.state.epoch,
                                       num_thresholds=127)
                tb_writer.flush()

    print()

    if cfg.lr_scheduler:
        # lr_scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, factor=.5, min_lr=1e-7, verbose=True)
        # v_evaluator.add_event_handler(Events.EPOCH_COMPLETED, lambda engine: lr_scheduler.step(v_evaluator.state.metrics['nll']))
        lr_scheduler = DelayedCosineAnnealingLR(optimizer, 10, 5)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED,
            lambda engine: lr_scheduler.step(trainer.state.epoch))

    if cfg.early_stopping:

        def score_function(engine):
            score = -1 * round(engine.state.metrics['nll'], 5)
            # score = engine.state.metrics['accuracy']
            return score

        es_handler = EarlyStopping(patience=10,
                                   score_function=score_function,
                                   trainer=trainer)
        v_evaluator.add_event_handler(Events.COMPLETED, es_handler)

    if cfg.save_last_checkpoint:

        @trainer.on(Events.EPOCH_COMPLETED(every=1))
        def save_last_checkpoint(engine):
            checkpoint = {}
            objects = {'model': model, 'optimizer': optimizer}
            if cfg.lr_scheduler: objects['lr_scheduler'] = lr_scheduler
            for k, obj in objects.items():
                checkpoint[k] = obj.state_dict()
            th.save(checkpoint,
                    f'{rcp.models_path}last_{rcp.stage}_checkpoint.pth')

    if cfg.save_best_checkpoint:

        def score_function(engine):
            score = -1 * round(engine.state.metrics['nll'], 5)
            # score = engine.state.metrics['accuracy']
            return score

        objects = {'model': model, 'optimizer': optimizer}
        if cfg.lr_scheduler: objects['lr_scheduler'] = lr_scheduler

        save_best = Checkpoint(
            objects,
            DiskSaver(f'{rcp.models_path}',
                      require_empty=False,
                      create_dir=True),
            n_saved=4,
            filename_prefix=f'best_{rcp.stage}',
            score_function=score_function,
            score_name='val_loss',
            global_step_transform=global_step_from_engine(trainer))
        v_evaluator.add_event_handler(Events.EPOCH_COMPLETED(every=1),
                                      save_best)
        load_checkpoint = False

        if load_checkpoint:
            resume_epoch = 6
            cp = f'{rcp.models_path}last_{rcp.stage}_checkpoint.pth'
            obj = th.load(f'{cp}')
            Checkpoint.load_objects(objects, obj)

            @trainer.on(Events.STARTED)
            def resume_training(engine):
                engine.state.iteration = (resume_epoch - 1) * len(
                    engine.state.dataloader)
                engine.state.epoch = resume_epoch - 1

    if cfg.save_confusion_matrix:

        @trainer.on(Events.STARTED)
        def init_best_loss(engine):
            engine.state.metrics['best_loss'] = 1e99

        @trainer.on(Events.EPOCH_COMPLETED)
        def confusion_matrix(engine):
            if engine.state.metrics['best_loss'] > v_evaluator.state.metrics[
                    'nll']:
                engine.state.metrics['best_loss'] = v_evaluator.state.metrics[
                    'nll']
                cm = v_evaluator.state.metrics['conf_mat']
                cm_df = pd.DataFrame(cm.numpy(),
                                     index=valid.classes,
                                     columns=valid.classes)
                pretty_plot_confusion_matrix(
                    cm_df,
                    f'{rcp.results_path}cm_{rcp.stage}_{trainer.state.epoch}.png',
                    False)

    if cfg.log_stats:

        class Hook:
            def __init__(self, module):
                self.name = module[0]
                self.hook = module[1].register_forward_hook(self.hook_fn)
                self.stats_mean = 0
                self.stats_std = 0

            def hook_fn(self, module, input, output):
                self.stats_mean = output.mean()
                self.stats_std = output.std()

            def close(self):
                self.hook.remove()

        hookF = [Hook(layer) for layer in list(model.cnn.named_children())]

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_stats(engine):
            std = {}
            mean = {}
            for hook in hookF:
                tb_writer.add_scalar(f'std/{hook.name}', hook.stats_std,
                                     engine.state.iteration)
                tb_writer.add_scalar(f'mean/{hook.name}', hook.stats_mean,
                                     engine.state.iteration)

    cfg.save_yaml()
    rcp.save_yaml()
    print(f'# batches: train: {len(train_loader)}, valid: {len(valid_loader)}')
    trainer.run(data=train_loader, max_epochs=rcp.max_epochs)
    tb_writer.close()
    tb_logger.close()
    return model
def multiclass_train_lstm(
    model: LstmClassifier,
    dataloader_train: DataLoader,
    dataloader_val: DataLoader,
    filename_prefix: str,
):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-4,
                                 weight_decay=1e-3)
    criterion = CrossEntropyLossOneHot()

    def process_function(_engine, batch):
        model.train()
        optimizer.zero_grad()
        x, y = batch
        x = x.to(device)
        y = y.to(device)
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        return y_pred, y, loss.item(),

    def eval_function(_engine, batch):
        model.eval()
        with torch.no_grad():
            x, y = batch
            y = y.to(device)
            x = x.to(device)
            y_pred = model(x)
            return y_pred, y

    def score_function(engine):
        return engine.state.metrics['top3-accuracy']

    model.to(device)

    trainer = Engine(process_function)
    train_evaluator = Engine(eval_function)
    validation_evaluator = Engine(eval_function)

    accuracy_top1 = Accuracy(output_transform=lambda x: (x[0], x[1]),
                             device=device,
                             is_multilabel=True)
    accuracy_top3 = TopKCategoricalAccuracy(output_transform=lambda x: (x[0], x[1]),
                                            k=3,
                                            device=device)

    RunningAverage(accuracy_top1).attach(trainer, 'accuracy')
    RunningAverage(accuracy_top3).attach(trainer, 'top3-accuracy')
    RunningAverage(output_transform=lambda x: x[2]).attach(trainer, 'loss')

    accuracy_top1.attach(train_evaluator, 'accuracy')
    accuracy_top3.attach(train_evaluator, 'top3-accuracy')
    Loss(criterion).attach(train_evaluator, 'loss')

    accuracy_top1.attach(validation_evaluator, 'accuracy')
    accuracy_top3.attach(validation_evaluator, 'top3-accuracy')
    Loss(criterion).attach(validation_evaluator, 'loss')

    pbar = ProgressBar(persist=True, bar_format="")
    pbar.attach(engine=trainer, metric_names='all')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        train_evaluator.run(dataloader_train)
        message = f'Training results - Epoch: {engine.state.epoch}.'
        for metric_name, score in train_evaluator.state.metrics.items():
            message += f' {metric_name}: {score:.2f}.'
        pbar.log_message(message)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        validation_evaluator.run(dataloader_val)
        message = f'Validation results - Epoch: {engine.state.epoch}.'
        for metric_name, score in validation_evaluator.state.metrics.items():
            message += f' {metric_name}: {score:.2f}.'
        pbar.log_message(message)
        pbar.n = pbar.last_print_n = 0

    validation_evaluator.add_event_handler(
        Events.COMPLETED,
        EarlyStopping(patience=5,
                      score_function=score_function,
                      trainer=trainer))

    checkpointer = ModelCheckpoint(dirname=DIR_MODELS,
                                   filename_prefix=filename_prefix,
                                   score_function=score_function,
                                   score_name='top3-accuracy',
                                   n_saved=2,
                                   create_dir=True,
                                   save_as_state_dict=True,
                                   require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer,
                              {'v2': model})

    trainer.run(dataloader_train, max_epochs=20)
Example #22
    )
print('done')

## SETUP TRAINER AND EVALUATOR
# Setup model trainer and evaluator
print('[INFO] Creating Ignite training, evaluation objects and logging...', end='')
trainer = create_trainer(model=model, optimizer=optimizer, criterion=criterion, lr_scheduler=lr_scheduler)
# Metrics - running average
RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')
# Metrics - epochs
metrics = {
    'accuracy':Accuracy(),
    'recall':Recall(average=True),
    'precision':Precision(average=True),
    'f1':Fbeta(beta=1),
    'topKCatAcc':TopKCategoricalAccuracy(k=5),
    'loss':Loss(criterion)
}

# Create evaluators
evaluator = create_evaluator(model, metrics=metrics)
train_evaluator = create_evaluator(model, metrics=metrics, tag='train')

# Add validation logging
trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1), evaluate_model)

# Add step length update at the end of each epoch
trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: lr_scheduler.step())

# Add TensorBoard logging
tb_logger = TensorboardLogger(log_dir=os.path.join(working_dir,'tb_logs'))
Example #23
def training(local_rank, config, logger=None):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    if not hasattr(config, "prepare_batch"):
        config.prepare_batch = _prepare_batch

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler,
                             config, logger)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config,
                                               "benchmark_dataflow_num_iters",
                                               1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=config.prepare_batch).attach(
                              trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    @trainer.on(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1))
        | Events.COMPLETED)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    # Store 3 best models by validation accuracy:
    common.save_best_model_by_val_score(
        config.output_path.as_posix(),
        evaluator,
        model=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )

        exp_tracking_logger = exp_tracking.setup_logging(trainer,
                                                         optimizer,
                                                         evaluators={
                                                             "training": train_evaluator,
                                                             "validation": evaluator
                                                         })

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2),
        )

        tb_logger.attach(
            train_evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="training"),
            event_name=Events.ITERATION_COMPLETED(
                once=len(train_eval_loader) // 2),
        )

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        exp_tracking_logger.close()
    def test_topk_accuracy(self, k: int, y_pred: Tensor, y_true: Tensor,
                           score: float):
        accuracy = TopKCategoricalAccuracy(k=k)
        accuracy.update((y_pred, y_true))
        self.assertEqual(score, accuracy.compute())
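A standalone check of the same behaviour (a sketch, not part of the test class): with k=1 the single sample's highest-scoring class matches the target, so the metric computes 1.0.

import torch
from ignite.metrics import TopKCategoricalAccuracy

acc = TopKCategoricalAccuracy(k=1)
acc.update((torch.tensor([[0.1, 0.9]]), torch.tensor([1])))
assert acc.compute() == 1.0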
Example #25
    def __init__(self,
                 prefix,
                 loss_type: str,
                 threshold=0.5,
                 top_k=[1, 5, 10],
                 n_classes: int = None,
                 multilabel: bool = None,
                 metrics=["precision", "recall", "top_k", "accuracy"]):
        self.loss_type = loss_type.upper()
        self.threshold = threshold
        self.n_classes = n_classes
        self.multilabel = multilabel
        self.top_ks = top_k
        self.prefix = prefix
        add_f1_metric = False

        if n_classes:
            top_k = [k for k in top_k if k < n_classes]

        self.metrics = {}
        for metric in metrics:
            if "precision" == metric:
                self.metrics[metric] = Precision(average=False,
                                                 is_multilabel=multilabel,
                                                 output_transform=None)
                if "micro_f1" in metrics:
                    self.metrics["precision_avg"] = Precision(
                        average=True,
                        is_multilabel=multilabel,
                        output_transform=None)
            elif "recall" == metric:
                self.metrics[metric] = Recall(average=False,
                                              is_multilabel=multilabel,
                                              output_transform=None)
                if "micro_f1" in metrics:
                    self.metrics["recall_avg"] = Recall(
                        average=True,
                        is_multilabel=multilabel,
                        output_transform=None)
            elif "top_k" in metric:
                if multilabel:
                    self.metrics[metric] = TopKMultilabelAccuracy(k_s=top_k)
                else:
                    self.metrics[metric] = TopKCategoricalAccuracy(
                        k=max(int(np.log(n_classes)), 1),
                        output_transform=None)
            elif "f1" in metric:
                add_f1_metric = True
                continue
            elif "accuracy" in metric:
                self.metrics[metric] = Accuracy(is_multilabel=multilabel,
                                                output_transform=None)
            elif "ogbn" in metric:
                self.metrics[metric] = NodeClfEvaluator(NodeEvaluator(metric))
            elif "ogbg" in metric:
                self.metrics[metric] = NodeClfEvaluator(GraphEvaluator(metric))
            elif "ogbl" in metric:
                self.metrics[metric] = LinkPredEvaluator(LinkEvaluator(metric))
            else:
                print(f"WARNING: metric {metric} doesn't exist")

        if add_f1_metric:
            assert "precision" in self.metrics and "recall" in self.metrics

            def macro_f1(precision, recall):
                return (precision * recall * 2 /
                        (precision + recall + 1e-12)).mean()

            self.metrics["macro_f1"] = MetricsLambda(macro_f1,
                                                     self.metrics["precision"],
                                                     self.metrics["recall"])

            if "micro_f1" in metrics:

                def micro_f1(precision, recall):
                    return (precision * recall * 2 /
                            (precision + recall + 1e-12))

                self.metrics["micro_f1"] = MetricsLambda(
                    micro_f1, self.metrics["precision_avg"],
                    self.metrics["recall_avg"])

        self.reset_metrics()
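A small numeric sketch of the macro_f1 combination above, with hypothetical per-class precision/recall tensors:

import torch
precision = torch.tensor([0.5, 1.0])
recall = torch.tensor([1.0, 0.5])
print((precision * recall * 2 / (precision + recall + 1e-12)).mean())  # ~0.6667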