Example #1
        "emb_vecs": [emb_vec1, emb_vec2],
        "cls_pred": cls_pred,
        "cls_true": cls_true,
        "targets": targets
    }
    return ret


# ----------------------------------------
if __name__ == "__main__":
    engine = Engine(_update)
    metrics = {
        "sim_acc":
        SiameseNetSimilarityAccuracy(margin=margin),
        "clsf_acc":
        Accuracy(output_transform=lambda x: (x['cls_pred'], x['cls_true']))
    }

    for name, metric in metrics.items():
        metric.attach(engine, name)

    from ignite.contrib.handlers import ProgressBar
    pbar = ProgressBar()
    pbar.attach(engine,
                output_transform=lambda x: {
                    'con_loss': x['con_loss'],
                    'clsf_loss': x['clsf_loss']
                })

    from ignite.engine import Events
    # @engine.on(Events.ITERATION_COMPLETED)
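
Example #1 is cut off above the return statement, but the pattern is clear: the update function returns a dict, and each metric pulls its inputs out of that dict via output_transform. A minimal runnable sketch of the same pattern (all names below are illustrative stand-ins, not the original model or the project-specific SiameseNetSimilarityAccuracy metric):

import torch
import torch.nn.functional as F
from ignite.engine import Engine
from ignite.metrics import Accuracy
from ignite.contrib.handlers import ProgressBar

def _update(engine, batch):
    x, y = batch
    logits = torch.randn(x.shape[0], 2)  # stand-in for a model forward pass
    loss = F.cross_entropy(logits, y)
    return {"cls_pred": logits, "cls_true": y, "clsf_loss": loss.item()}

engine = Engine(_update)
# The metric reads (y_pred, y) out of the dict returned by _update.
Accuracy(output_transform=lambda x: (x["cls_pred"], x["cls_true"])).attach(engine, "clsf_acc")
ProgressBar().attach(engine, output_transform=lambda x: {"clsf_loss": x["clsf_loss"]})

data = [(torch.randn(4, 8), torch.randint(0, 2, (4,))) for _ in range(10)]
engine.run(data, max_epochs=1)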
Example #2
def train():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="data/korean/",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="gpt2",
                        help="Path, url or short name of the model")
    parser.add_argument("--model_version",
                        type=str,
                        default='v4',
                        help="version of model")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history",
                        type=int,
                        default=30,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=1,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=1,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef",
                        type=float,
                        default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=5,
                        help="Number of training epochs")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    torch.manual_seed(42)

    def get_kogpt2_tokenizer(model_path=None):
        if not model_path:
            model_path = 'taeminlee/kogpt2'
        tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        return tokenizer

    tokenizer = get_kogpt2_tokenizer()
    optimizer_class = AdamW
    model = get_kogpt2_model()
    model.to(args.device)
    optimizer = optimizer_class(model.parameters(), lr=args.lr)

    # tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint, unk_token='<|unkwn|>')
    SPECIAL_TOKENS_DICT = {'additional_special_tokens': SPECIAL_TOKENS}

    # tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT)
    print("SPECIAL TOKENS")
    print(SPECIAL_TOKENS)
    tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT)

    for value in SPECIAL_TOKENS:
        logger.info("Assigning %s to the %s key of the tokenizer", value,
                    value)
        setattr(tokenizer, value, value)
    model.resize_token_embeddings(len(tokenizer))

    s = ' '.join(act_name) + ' '.join(slot_name)
    print(tokenizer.decode(tokenizer.encode(s)))
    print(len(act_name) + len(slot_name), len(tokenizer.encode(s)))
    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer

    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        lm_loss, mc_loss, *_ = model(*batch)
        loss = (lm_loss * args.lm_coef +
                mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)
    trainer.logger.setLevel(logging.INFO)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)

    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  token_type_ids=token_type_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[
                1]  # So we can also use GPT2 outputs
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)
    evaluator.logger.setLevel(logging.INFO)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.writer.log_dir = tb_logger.writer.file_writer.get_logdir()
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        """tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
                                                              global_step_transform=global_step_from_engine(trainer)),
                         event_name=Events.EPOCH_COMPLETED)"""
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag="validation",
            metric_names=list(metrics.keys()),
            global_step_transform=global_step_from_engine(trainer))

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" take care of distributed encapsulation

        torch.save(args, tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
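
Both this example and Example #6 call a helper average_distributed_scalar that is not shown. In the HuggingFace transfer-learning-conv-ai code these snippets are based on, it averages a metric value across distributed processes; a sketch under that assumption:

import torch

def average_distributed_scalar(scalar, args):
    """Average a scalar over distributed nodes; a no-op when not distributed."""
    if args.local_rank == -1:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
    return scalar_t.item()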
Example #3
def _test_distrib_on_metric(device):
    import torch.distributed as dist
    rank = dist.get_rank()
    n_iters = 10
    n_epochs = 3
    batch_size = 10
    n_classes = 10

    data = list(range(n_iters))
    np.random.seed(12)
    all_y_true_batch_values = np.random.randint(0,
                                                n_classes,
                                                size=(dist.get_world_size(),
                                                      n_epochs * n_iters,
                                                      batch_size))
    all_y_pred_batch_values = np.random.rand(dist.get_world_size(),
                                             n_epochs * n_iters, batch_size,
                                             n_classes)

    y_true_batch_values = iter(all_y_true_batch_values[rank, ...])
    y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...])

    def update_fn(engine, batch):
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)

    trainer = Engine(update_fn)
    alpha = 0.98

    acc_metric = RunningAverage(Accuracy(
        output_transform=lambda x: [x[0], x[1]], device=device),
                                alpha=alpha,
                                epoch_bound=False)
    acc_metric.attach(trainer, 'running_avg_accuracy')

    running_avg_acc = [
        None,
    ]
    true_acc_metric = Accuracy(device=device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def manual_running_avg_acc(engine):
        i = engine.state.iteration - 1

        true_acc_metric.reset()
        for j in range(dist.get_world_size()):
            output = (torch.from_numpy(all_y_pred_batch_values[j, i, :, :]),
                      torch.from_numpy(all_y_true_batch_values[j, i, :]))
            true_acc_metric.update(output)

        batch_acc = true_acc_metric._num_correct * 1.0 / true_acc_metric._num_examples

        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (
                1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_acc_values(engine):
        assert engine.state.running_avg_acc == engine.state.metrics['running_avg_accuracy'], \
            "{} vs {}".format(engine.state.running_avg_acc, engine.state.metrics['running_avg_accuracy'])

    trainer.run(data, max_epochs=n_epochs)
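
The manual check above mirrors the exponential moving average that RunningAverage maintains; with epoch_bound=False the average is not reset between epochs. The recurrence, as a standalone sketch:

def ema(prev, new, alpha=0.98):
    # The first value is taken as-is; afterwards RunningAverage blends old and new.
    return new if prev is None else prev * alpha + (1.0 - alpha) * new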
Example #4
def main():
    monai.config.print_config()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/
    image_names = [
        "IXI607-Guys-1097-T1.nii.gz",
        "IXI175-HH-1570-T1.nii.gz",
        "IXI385-HH-2078-T1.nii.gz",
        "IXI344-Guys-0905-T1.nii.gz",
        "IXI409-Guys-0960-T1.nii.gz",
        "IXI584-Guys-1129-T1.nii.gz",
        "IXI253-HH-1694-T1.nii.gz",
        "IXI092-HH-1436-T1.nii.gz",
        "IXI574-IOP-1156-T1.nii.gz",
        "IXI585-Guys-1130-T1.nii.gz",
    ]
    images = [
        os.sep.join(["workspace", "data", "medical", "ixi", "IXI-T1", name])
        for name in image_names
    ]

    # 2 binary labels for gender classification: man and woman
    labels = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64)

    # define transforms for image
    val_transforms = Compose(
        [ScaleIntensity(),
         AddChannel(),
         Resize((96, 96, 96)),
         ToTensor()])
    # define image dataset
    val_ds = ImageDataset(image_files=images,
                          labels=labels,
                          transform=val_transforms,
                          image_only=False)
    # create DenseNet121
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = monai.networks.nets.densenet.densenet121(spatial_dims=3,
                                                   in_channels=1,
                                                   out_channels=2).to(device)

    metric_name = "Accuracy"
    # add evaluation metric to the evaluator engine
    val_metrics = {metric_name: Accuracy()}

    def prepare_batch(batch, device=None, non_blocking=False):
        return _prepare_batch((batch[0], batch[1]), device, non_blocking)

    # Ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration,
    # user can add output_transform to return other values
    evaluator = create_supervised_evaluator(net,
                                            val_metrics,
                                            device,
                                            True,
                                            prepare_batch=prepare_batch)

    # add stats event handler to print validation stats via evaluator
    val_stats_handler = StatsHandler(
        name="evaluator",
        output_transform=lambda x:
        None,  # no need to print loss value, so disable per iteration output
    )
    val_stats_handler.attach(evaluator)

    # for the array data format, assume the 3rd item of batch data is the meta_data
    prediction_saver = ClassificationSaver(
        output_dir="tempdir",
        batch_transform=lambda batch: batch[2],
        output_transform=lambda output: output[0].argmax(1),
    )
    prediction_saver.attach(evaluator)

    # the model was trained by the "densenet_training_array" example
    CheckpointLoader(load_path="./runs_array/net_checkpoint_20.pt",
                     load_dict={
                         "net": net
                     }).attach(evaluator)

    # create a validation data loader
    val_loader = DataLoader(val_ds,
                            batch_size=2,
                            num_workers=4,
                            pin_memory=torch.cuda.is_available())

    state = evaluator.run(val_loader)
    print(state)
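
A note on the custom prepare_batch above: because ImageDataset is built with image_only=False, each batch carries (img, label, meta_data), per the snippet's own comment. The evaluator only needs the first two items, so prepare_batch drops the metadata, while ClassificationSaver reads it back through batch_transform=lambda batch: batch[2].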
Example #5
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators, as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    # - `train_evaluator` only computes metrics on the training set
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # To check resuming of training, we can stop training at a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training at iteration {trainer.state.iteration}")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
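
get_save_handler is not defined in this snippet; in ignite's CIFAR10 example, which this code follows, it returns a ClearML-aware saver when ClearML logging is enabled and a plain DiskSaver otherwise. A sketch under that assumption:

from ignite.handlers import DiskSaver

def get_save_handler(config):
    if config["with_clearml"]:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver
        return ClearMLSaver(dirname=config["output_path"])
    return DiskSaver(config["output_path"], require_empty=False)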
Example #6
def train():
    parser = ArgumentParser()
    parser.add_argument("--run_data", type=bool, default=False)
    parser.add_argument("--eval_freq", type=int, default=200000)
    parser.add_argument("--save_freq", type=int, default=2000)
    parser.add_argument("--data_nuggets", type=str, default=my_dataset)
    parser.add_argument("--dataset_path", type=str, default=my_dataset + 'json.txt', help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default=my_dataset + 'cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="gpt2", help="Path, url or short name of the model")
    parser.add_argument("--model", type=str, default="gpt2")
    parser.add_argument("--eval_before_start", type=bool, default=False, help="If true start with a first evaluation before training")
    parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=1, help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=2, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=2, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=2.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=50, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)


    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    #if args.distributed:
    #model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
    #model = DataParallel(model)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        #print(batch)
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        lm_loss, mc_loss, *_ = model(
            input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
            mc_labels=mc_labels, lm_labels=lm_labels
        )
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        # DATAPARALLEL
        #loss = loss.sum()
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            lm_logits, mc_logits, *_ = model(
                input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
            )
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
                 "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=100)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
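
The add_special_tokens_ helper called near the top of Example #6 comes from the same transfer-learning-conv-ai codebase: it registers the special tokens with the tokenizer and resizes the model embeddings only when new tokens were actually added. A sketch under that assumption (ATTR_TO_SPECIAL_TOKEN shown with the repo's speaker tokens; adjust for your own token set):

ATTR_TO_SPECIAL_TOKEN = {
    "bos_token": "<bos>", "eos_token": "<eos>", "pad_token": "<pad>",
    "additional_special_tokens": ["<speaker1>", "<speaker2>"],
}

def add_special_tokens_(model, tokenizer):
    """Add special tokens to the tokenizer and the model if they have not already been added."""
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)  # returns 0 if already present
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)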
Example #7
def main(dataset_path, batch_size=256, max_epochs=10):
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"

    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(
        dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)

    scaler = GradScaler()

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device, non_blocking=True)
        y = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        # Runs the forward pass with autocasting.
        with autocast():
            y_pred = model(x)
            loss = criterion(y_pred, y)

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same precision that autocast used for corresponding forward ops.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

        return loss.item()

    trainer = Engine(train_step)
    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(
        trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)

    def log_metrics(engine, title):
        for name in metrics:
            print(f"\t{title} {name}: {engine.state.metrics[name]:.2f}")

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print(f"- Mean elapsed time for 1 epoch: {timer.value()}")
        print("- Metrics:")
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics,
                                         "Train"):
            evaluator.run(eval_train_loader)

        with evaluator.add_event_handler(Events.COMPLETED, log_metrics,
                                         "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)
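
A note on the `with evaluator.add_event_handler(...)` blocks above: ignite's add_event_handler returns a RemovableEventHandle that works as a context manager, so the log_metrics handler is attached only for the duration of each evaluator.run call and detached automatically on exit.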
Example #8
def main():
    if not os.path.exists(args.outdir):
        os.mkdir(args.outdir)

    device = torch.device("cuda")
    torch.cuda.set_device(args.gpu)

    logfilename = os.path.join(args.outdir, args.logname)

    log(logfilename, "Hyperparameter List")
    log(logfilename, "Epochs: {:}".format(args.epochs))
    log(logfilename, "Learning Rate: {:}".format(args.lr))
    log(logfilename, "Alpha: {:}".format(args.alpha))
    log(logfilename, "Keep ratio: {:}".format(args.keep_ratio))

    test_acc_list = []
    for _ in range(args.round):
        train_dataset = get_dataset(args.dataset, 'train')
        test_dataset = get_dataset(args.dataset, 'test')
        pin_memory = (args.dataset == "imagenet")
        train_loader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=args.batch,
                                  num_workers=args.workers,
                                  pin_memory=pin_memory)
        test_loader = DataLoader(test_dataset,
                                 shuffle=False,
                                 batch_size=args.batch,
                                 num_workers=args.workers,
                                 pin_memory=pin_memory)

        # Loading the base_classifier
        base_classifier = get_architecture(args.arch, args.dataset, device)
        checkpoint = torch.load(args.savedir)
        base_classifier.load_state_dict(checkpoint['state_dict'])
        base_classifier.eval()
        print("Loaded the base_classifier")

        original_acc = model_inference(base_classifier,
                                       test_loader,
                                       device,
                                       display=True)

        log(logfilename,
            "Original Model Test Accuracy: {:.5}".format(original_acc))
        print("Original Model Test Accuracy, ", original_acc)

        # Creating a fresh copy of network not affecting the original network.
        net = copy.deepcopy(base_classifier)
        net = net.to(device)

        # Generating the mask 'm'
        for layer in net.modules():
            if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                layer.weight_mask = nn.Parameter(torch.ones_like(layer.weight))

                layer.weight.requires_grad = True
                layer.weight_mask.requires_grad = True

            # Monkey-patch layer.forward with a custom function that computes the layer
            # output from the element-wise product of weights 'w' and mask 'm'.
            if isinstance(layer, nn.Linear):
                layer.forward = types.MethodType(mask_forward_linear, layer)

            if isinstance(layer, nn.Conv2d):
                layer.forward = types.MethodType(mask_forward_conv2d, layer)

        criterion = nn.NLLLoss().to(
            device)  # a LogSoftmax layer was added to every architecture
        optimizer = SGD(
            net.parameters(),
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=0)  # weight_decay = 0 for training the mask.

        sparsity, total = 0, 0
        for layer in net.modules():
            if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                boolean_list = layer.weight_mask.data > args.threshold
                sparsity += (boolean_list == 1).sum()
                total += layer.weight.numel()

        # Training the mask with the training set.
        # The loop is capped at a maximum number of epochs in case the sparsity of the
        # auxiliary parameters never drops below the target sparsity.
        for epoch in range(300):
            if epoch % 5 == 0:
                print("Current epochs: ", epoch)
                print("Sparsity: {:}".format(sparsity))
            train_loss = mask_train(train_loader,
                                    net,
                                    criterion,
                                    optimizer,
                                    epoch,
                                    device,
                                    alpha=args.alpha,
                                    display=False)
            acc = model_inference(net, test_loader, device, display=False)
            log(logfilename,
                "Epoch {:}, Mask Update Test Acc: {:.5}".format(epoch, acc))

            sparsity, total = 0, 0
            for layer in net.modules():
                if isinstance(layer, nn.Linear) or isinstance(
                        layer, nn.Conv2d):
                    boolean_list = layer.weight_mask.data > args.threshold
                    sparsity += (boolean_list == 1).sum()
                    total += layer.weight.numel()

            if sparsity <= total * args.keep_ratio:
                print("Current epochs breaking loop at {:}".format(epoch))
                break

        mask_update_acc = model_inference(net,
                                          test_loader,
                                          device,
                                          display=True)
        log(logfilename,
            "Mask Update Test Accuracy: {:.5}".format(mask_update_acc))

        # Calculate the threshold that satisfies the keep_ratio.
        c_abs = []
        for layer in net.modules():
            if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                c_abs.append(torch.abs(layer.weight_mask))

        all_scores = torch.cat([torch.flatten(x) for x in c_abs])
        num_params_to_keep = int(len(all_scores) * args.keep_ratio)
        threshold, _ = torch.topk(all_scores, num_params_to_keep, sorted=True)
        threshold = threshold[-1]

        keep_masks = []
        for c in c_abs:
            keep_masks.append((c >= threshold).float())
        print(
            "Number of ones.",
            torch.sum(torch.cat([torch.flatten(x == 1) for x in keep_masks])))

        # Update the weights with the element-wise product of the learned mask 'c'.
        for layer in net.modules():
            if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                # We update the weight by elementwise multiplication between
                # weight 'w' and mask 'm'.
                layer.weight.data = layer.weight.data * layer.weight_mask.data
                layer.zeros = nn.Parameter(torch.zeros_like(
                    layer.weight))  # Dummy parameter.
                layer.ones = nn.Parameter(torch.ones_like(
                    layer.weight))  # Dummy parameter.
                layer.weight_mask.data = torch.where(
                    torch.abs(layer.weight_mask) <= threshold, layer.zeros,
                    layer.ones
                )  # Updated weight_mask becomes the mask with element
                # 0 and 1 again.

                # Temporarily disabling the backprop for both 'w' and 'm'.
                layer.weight.requires_grad = False
                layer.weight_mask.requires_grad = False

            if isinstance(layer, nn.Linear):
                layer.forward = types.MethodType(mask_forward_linear, layer)

            if isinstance(layer, nn.Conv2d):
                layer.forward = types.MethodType(mask_forward_conv2d, layer)

        weight_update_acc = model_inference(net,
                                            test_loader,
                                            device,
                                            display=True)
        log(logfilename,
            "Weight Update Test Accuracy: {:.5}".format(weight_update_acc))

        # Calculating the sparsity of the network.
        remain = 0
        total = 0
        for layer in net.modules():
            if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                total += torch.norm(torch.ones_like(layer.weight),
                                    p=1)  # Counting total num parameter
                remain += torch.norm(layer.weight_mask.data,
                                     p=1)  # Counting ones in the mask.

                # Disabling backprop except weight 'w' for the finetuning.
                layer.zeros.requires_grad = False
                layer.ones.requires_grad = False
                layer.weight_mask.requires_grad = False
                layer.weight.requires_grad = True

            if isinstance(layer, nn.Linear):
                layer.forward = types.MethodType(mask_forward_linear, layer)

            if isinstance(layer, nn.Conv2d):
                layer.forward = types.MethodType(mask_forward_conv2d, layer)

        log(logfilename, "Sparsity: {:.3}".format(remain / total))
        print("Sparsity: ", remain / total)

        #        --------------------------------
        # We need to transfer the weight we learned from "net" to "base_classifier".
        for (layer1, layer2) in zip(base_classifier.modules(), net.modules()):
            if isinstance(layer1, (nn.Linear, nn.Conv2d)) or isinstance(
                    layer2, (nn.Linear, nn.Conv2d)):
                layer1.weight.data = layer2.weight.data
                if layer1.bias is not None:
                    layer1.bias.data = layer2.bias.data
                    layer1.bias.requires_grad = True

                layer1.weight.requires_grad = True

        # Applying the mask to the base_classifier.
        apply_prune_mask(base_classifier, keep_masks)
        #        --------------------------------

        optimizer = SGD(base_classifier.parameters(),
                        lr=1e-3,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)
        loss = nn.NLLLoss()
        scheduler = StepLR(optimizer,
                           step_size=args.lr_step_size,
                           gamma=args.gamma)

        test_acc = []
        # Finetuning via ignite
        trainer = create_supervised_trainer(base_classifier, optimizer,
                                            nn.NLLLoss(), device)
        evaluator = create_supervised_evaluator(base_classifier, {
            'accuracy': Accuracy(),
            'nll': Loss(loss)
        }, device)

        pbar = ProgressBar()
        pbar.attach(trainer)

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_training_loss(engine):
            iter_in_epoch = (engine.state.iteration -
                             1) % len(train_loader) + 1
            if engine.state.iteration % args.print_freq == 0:
                pbar.log_message("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                                 "".format(engine.state.epoch, iter_in_epoch,
                                           len(train_loader),
                                           engine.state.output))

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_epoch(engine):
            scheduler.step()
            evaluator.run(test_loader)

            metrics = evaluator.state.metrics
            avg_accuracy = metrics['accuracy']
            avg_nll = metrics['nll']

            pbar.log_message(
                "Validation Results - Epoch: {}  Avg accuracy: {:.5f} Avg loss: {:.3f}"
                .format(engine.state.epoch, avg_accuracy, avg_nll))

            log(
                logfilename,
                "Validation  - Epoch: {}  Avg accuracy: {:.5f} Avg loss: {:.3f}"
                .format(engine.state.epoch, avg_accuracy, avg_nll))

            test_acc.append(avg_accuracy)

            if avg_accuracy >= max(test_acc):
                print("Saving the model at Epoch {:}".format(
                    engine.state.epoch))
                torch.save(
                    {
                        'arch': args.arch,
                        'state_dict': base_classifier.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }, os.path.join(args.outdir, 'checkpoint.pth.tar'))

            if engine.state.epoch == args.epochs:
                test_acc_list.append(max(test_acc))
                log(logfilename,
                    "Finetuned Test Accuracy: {:.5f}".format(max(test_acc)))
                print("Finetuned Test Accuracy: ", max(test_acc))

        trainer.run(train_loader, args.epochs)

    log(logfilename, "This is the test accuracy list for args.round.")
    log(logfilename, str(test_acc_list))
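
The monkey-patched forwards mask_forward_linear and mask_forward_conv2d are not shown in this snippet; per the comments above, they run each layer with the element-wise product of the weight 'w' and mask 'm'. A sketch consistent with that description:

import torch.nn.functional as F

def mask_forward_linear(self, x):
    # nn.Linear forward with the weight masked element-wise.
    return F.linear(x, self.weight * self.weight_mask, self.bias)

def mask_forward_conv2d(self, x):
    # nn.Conv2d forward with the weight masked element-wise.
    return F.conv2d(x, self.weight * self.weight_mask, self.bias,
                    self.stride, self.padding, self.dilation, self.groups)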
Example #9
def main():
    monai.config.print_config()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/
    images = [
        "/workspace/data/medical/ixi/IXI-T1/IXI314-IOP-0889-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI249-Guys-1072-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI609-HH-2600-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI173-HH-1590-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI020-Guys-0700-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI342-Guys-0909-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI134-Guys-0780-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI577-HH-2661-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI066-Guys-0731-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI130-HH-1528-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI607-Guys-1097-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI175-HH-1570-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI385-HH-2078-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI344-Guys-0905-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI409-Guys-0960-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI584-Guys-1129-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI253-HH-1694-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI092-HH-1436-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI574-IOP-1156-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI585-Guys-1130-T1.nii.gz",
    ]
    # 2 binary labels for gender classification: man and woman
    labels = np.array(
        [0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0])

    # define transforms
    train_transforms = Compose([
        ScaleIntensity(),
        AddChannel(),
        Resize((96, 96, 96)),
        RandRotate90(),
        ToTensor()
    ])
    val_transforms = Compose(
        [ScaleIntensity(),
         AddChannel(),
         Resize((96, 96, 96)),
         ToTensor()])

    # define nifti dataset, data loader
    check_ds = NiftiDataset(image_files=images,
                            labels=labels,
                            transform=train_transforms)
    check_loader = DataLoader(check_ds,
                              batch_size=2,
                              num_workers=2,
                              pin_memory=torch.cuda.is_available())
    im, label = monai.utils.misc.first(check_loader)
    print(type(im), im.shape, label)

    # create DenseNet121, CrossEntropyLoss and Adam optimizer
    net = monai.networks.nets.densenet.densenet121(
        spatial_dims=3,
        in_channels=1,
        out_channels=2,
    )
    loss = torch.nn.CrossEntropyLoss()
    lr = 1e-5
    opt = torch.optim.Adam(net.parameters(), lr)
    device = torch.device("cuda:0")

    # Ignite trainer expects batch=(img, label) and returns output=loss at every iteration,
    # user can add output_transform to return other values, like: y_pred, y, etc.
    trainer = create_supervised_trainer(net, opt, loss, device, False)

    # adding checkpoint handler to save models (network params and optimizer stats) during training
    checkpoint_handler = ModelCheckpoint("./runs/",
                                         "net",
                                         n_saved=10,
                                         require_empty=False)
    trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                              handler=checkpoint_handler,
                              to_save={
                                  "net": net,
                                  "opt": opt
                              })

    # StatsHandler prints loss at every iteration and prints metrics at every epoch;
    # we don't set metrics for the trainer here, so it just prints loss. Users can customize
    # the print functions and use output_transform to convert engine.state.output if it's not a loss value
    train_stats_handler = StatsHandler(name="trainer")
    train_stats_handler.attach(trainer)

    # TensorBoardStatsHandler plots loss at every iteration and plots metrics at every epoch, same as StatsHandler
    train_tensorboard_stats_handler = TensorBoardStatsHandler()
    train_tensorboard_stats_handler.attach(trainer)

    # set parameters for validation
    validation_every_n_epochs = 1

    metric_name = "Accuracy"
    # add evaluation metric to the evaluator engine
    val_metrics = {metric_name: Accuracy()}
    # Ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration,
    # user can add output_transform to return other values
    evaluator = create_supervised_evaluator(net, val_metrics, device, True)

    # add stats event handler to print validation stats via evaluator
    val_stats_handler = StatsHandler(
        name="evaluator",
        output_transform=lambda x:
        None,  # no need to print loss value, so disable per iteration output
        global_epoch_transform=lambda x: trainer.state.epoch,
    )  # fetch global epoch number from trainer
    val_stats_handler.attach(evaluator)

    # add handler to record metrics to TensorBoard at every epoch
    val_tensorboard_stats_handler = TensorBoardStatsHandler(
        output_transform=lambda x:
        None,  # no need to plot loss value, so disable per iteration output
        global_epoch_transform=lambda x: trainer.state.epoch,
    )  # fetch global epoch number from trainer
    val_tensorboard_stats_handler.attach(evaluator)

    # add early stopping handler to evaluator
    early_stopper = EarlyStopping(
        patience=4,
        score_function=stopping_fn_from_metric(metric_name),
        trainer=trainer)
    evaluator.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                                handler=early_stopper)

    # create a validation data loader
    val_ds = NiftiDataset(image_files=images[-10:],
                          labels=labels[-10:],
                          transform=val_transforms)
    val_loader = DataLoader(val_ds,
                            batch_size=2,
                            num_workers=2,
                            pin_memory=torch.cuda.is_available())

    @trainer.on(Events.EPOCH_COMPLETED(every=validation_every_n_epochs))
    def run_validation(engine):
        evaluator.run(val_loader)

    # create a training data loader
    train_ds = NiftiDataset(image_files=images[:10],
                            labels=labels[:10],
                            transform=train_transforms)
    train_loader = DataLoader(train_ds,
                              batch_size=2,
                              shuffle=True,
                              num_workers=2,
                              pin_memory=torch.cuda.is_available())

    train_epochs = 30
    state = trainer.run(train_loader, train_epochs)
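A note on the early stopper wired up above: EarlyStopping expects a score_function for which larger values mean better, and stopping_fn_from_metric (from monai.handlers) adapts a named metric into that shape. A minimal sketch of the equivalent behavior, assuming the helper simply reads the metric off the evaluator's state:

# Sketch of the score function produced by stopping_fn_from_metric(metric_name);
# assumption: it looks the named metric up in engine.state.metrics.
def stopping_fn_from_metric_sketch(metric_name):
    def stopping_fn(engine):
        return engine.state.metrics[metric_name]
    return stopping_fn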
Example #10
    def _test():
        acc = Accuracy()

        y_pred = torch.rand(10, 4)
        y = torch.randint(0, 4, size=(10, )).long()
        acc.update((y_pred, y))
        np_y_pred = y_pred.numpy().argmax(axis=1).ravel()
        np_y = y.numpy().ravel()
        assert acc._type == "multiclass"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(acc.compute())

        acc.reset()
        y_pred = torch.rand(10, 10, 1)
        y = torch.randint(0, 10, size=(10, 1)).long()
        acc.update((y_pred, y))
        np_y_pred = y_pred.numpy().argmax(axis=1).ravel()
        np_y = y.numpy().ravel()
        assert acc._type == "multiclass"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(acc.compute())

        acc.reset()
        y_pred = torch.rand(10, 18)
        y = torch.randint(0, 18, size=(10, )).long()
        acc.update((y_pred, y))
        np_y_pred = y_pred.numpy().argmax(axis=1).ravel()
        np_y = y.numpy().ravel()
        assert acc._type == "multiclass"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(acc.compute())

        acc.reset()
        y_pred = torch.rand(4, 10)
        y = torch.randint(0, 10, size=(4, )).long()
        acc.update((y_pred, y))
        np_y_pred = y_pred.numpy().argmax(axis=1).ravel()
        np_y = y.numpy().ravel()
        assert acc._type == "multiclass"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(acc.compute())

        # 2-classes
        acc.reset()
        y_pred = torch.rand(4, 2)
        y = torch.randint(0, 2, size=(4, )).long()
        acc.update((y_pred, y))
        np_y_pred = y_pred.numpy().argmax(axis=1).ravel()
        np_y = y.numpy().ravel()
        assert acc._type == "multiclass"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(acc.compute())

        # Batched Updates
        acc.reset()
        y_pred = torch.rand(100, 5)
        y = torch.randint(0, 5, size=(100, )).long()

        batch_size = 16
        n_iters = y.shape[0] // batch_size + 1

        for i in range(n_iters):
            idx = i * batch_size
            acc.update((y_pred[idx:idx + batch_size], y[idx:idx + batch_size]))

        np_y = y.numpy().ravel()
        np_y_pred = y_pred.numpy().argmax(axis=1).ravel()
        assert acc._type == "multiclass"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(acc.compute())
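The shapes exercised above all follow one rule: Accuracy infers "multiclass" whenever y_pred has one more dimension than y, treating dim 1 of y_pred as the class axis. A minimal standalone check of that behavior:

import torch
from ignite.metrics import Accuracy
from sklearn.metrics import accuracy_score

acc = Accuracy()
y_pred = torch.rand(8, 3)                  # (N, C) class scores
y = torch.randint(0, 3, size=(8,)).long()  # (N,) integer labels
acc.update((y_pred, y))
assert acc._type == "multiclass"
ref = accuracy_score(y.numpy(), y_pred.argmax(dim=1).numpy())
assert abs(acc.compute() - ref) < 1e-7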
Example #11
    def _test():
        acc = Accuracy(is_multilabel=True)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
        y = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
        acc.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(
            y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y = to_numpy_multilabel(
            y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

        acc.reset()
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
        y = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
        acc.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(
            y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y = to_numpy_multilabel(
            y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)

        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
        # check that result is not changed
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

        # Batched Updates
        acc.reset()
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()
        y = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()

        batch_size = 16
        n_iters = y.shape[0] // batch_size + 1

        for i in range(n_iters):
            idx = i * batch_size
            acc.update((y_pred[idx:idx + batch_size], y[idx:idx + batch_size]))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(
            y_pred.cpu())  # (N, C, L, ...) -> (N * L * ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, L, ...) -> (N * L * ..., C)

        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
Example #12
def test_no_update():
    acc = Accuracy()
    with pytest.raises(NotComputableError):
        acc.compute()
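Custom metrics are expected to honor the same contract this test checks for Accuracy: compute() must raise NotComputableError when update() has never been called. A toy metric sketching that guard (illustrative only, not from the source):

from ignite.exceptions import NotComputableError
from ignite.metrics import Metric

class SumMetric(Metric):
    # Toy metric: sums the targets over updates, with the
    # no-update guard that test_no_update exercises.
    def reset(self):
        self._total = 0.0
        self._num_examples = 0

    def update(self, output):
        y_pred, y = output
        self._total += float(y.sum())
        self._num_examples += y.shape[0]

    def compute(self):
        if self._num_examples == 0:
            raise NotComputableError("SumMetric must have at least one example before it can be computed.")
        return self._total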
Example #13
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="IMDB-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_dir"]
    if rank == 0:

        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_dir"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_dir']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("IMDB-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "dropout",
                "n_fc",
                "batch_size",
                "max_length",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy":
        Accuracy(output_transform=utils.thresholded_output_transform),
        "Loss": Loss(criterion),
    }

    # We define two evaluators, as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics, config, tag="val")
    train_evaluator = create_evaluator(model, metrics, config, tag="train")

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED | Events.STARTED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(
            output_path,
            trainer,
            optimizer,
            evaluators=evaluators,
            log_every_iters=config["log_every_iters"])

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        utils.get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
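utils.thresholded_output_transform above is project code that is not shown in this snippet. For a binary Accuracy over logits it typically binarizes sigmoid outputs, as in this sketch (the exact behavior is an assumption based on the usual ignite pattern):

import torch

# Assumed shape of utils.thresholded_output_transform: round sigmoid
# outputs to {0, 1} before Accuracy compares them with the labels.
def thresholded_output_transform(output):
    y_pred, y = output
    y_pred = torch.round(torch.sigmoid(y_pred))
    return y_pred, y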
Example #14
def run(output_path, config):

    device = "cuda"
    batch_size = config['batch_size']

    train_loader, test_loader = get_train_test_loaders(
        dataset_name=config['dataset'],
        path=config['data_path'],
        batch_size=batch_size,
        num_workers=config['num_workers'])

    model = get_model(config['model'])
    model = model.to(device)

    optim_fn = optim.SGD
    if config['with_layca']:
        optim_fn = LaycaSGD

    optimizer = optim_fn(model.parameters(),
                         lr=0.0,
                         momentum=config['momentum'],
                         weight_decay=config['weight_decay'],
                         nesterov=True)
    criterion = nn.CrossEntropyLoss()

    le = len(train_loader)
    milestones_values = [(le * m, v)
                         for m, v in config['lr_milestones_values']]
    scheduler = PiecewiseLinear(optimizer,
                                "lr",
                                milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def process_function(engine, batch):

        x, y = _prepare_batch(batch, device=device, non_blocking=True)

        model.train()
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return loss.item()

    trainer = Engine(process_function)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    RunningAverage(output_transform=lambda x: x,
                   epoch_bound=False).attach(trainer, 'batchloss')

    ProgressBar(persist=True,
                bar_format="").attach(trainer,
                                      event_name=Events.EPOCH_STARTED,
                                      closing_event_name=Events.COMPLETED)

    tb_logger = TensorboardLogger(log_dir=output_path)
    tb_logger.attach(trainer,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names='all'),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=tbOptimizerParamsHandler(optimizer,
                                                          param_name="lr"),
                     event_name=Events.ITERATION_STARTED)

    tb_logger.attach(trainer,
                     log_handler=LayerRotationStatsHandler(model),
                     event_name=Events.EPOCH_STARTED)

    metrics = {
        "accuracy": Accuracy(),
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine, val_interval):
        if (engine.state.epoch - 1) % val_interval == 0:
            train_evaluator.run(train_loader)
            evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              run_validation,
                              val_interval=2)
    trainer.add_event_handler(Events.COMPLETED, run_validation, val_interval=1)

    tb_logger.attach(train_evaluator,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names='all',
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=tbOutputHandler(tag="test",
                                                 metric_names='all',
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    def mlflow_batch_metrics_logging(engine, tag):
        step = trainer.state.iteration
        for name, value in engine.state.metrics.items():
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    def mlflow_val_metrics_logging(engine, tag):
        step = trainer.state.epoch
        for name in metrics.keys():
            value = engine.state.metrics[name]
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              mlflow_batch_metrics_logging, "train")
    train_evaluator.add_event_handler(Events.COMPLETED,
                                      mlflow_val_metrics_logging, "train")
    evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging,
                                "test")

    trainer.run(train_loader, max_epochs=config['num_epochs'])
    tb_logger.close()
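The learning-rate schedule above is specified per epoch in config['lr_milestones_values'] and rescaled to iterations via le = len(train_loader), because PiecewiseLinear interpolates between (event_index, value) pairs counted in scheduler events. A hypothetical config value (not from the source) to make the rescaling concrete:

# Hypothetical milestones: warm up to 0.4 over the first 5 epochs,
# then decay linearly back to 0 by epoch 40.
config = {'lr_milestones_values': [(0, 0.0), (5, 0.4), (40, 0.0)]}
le = 391  # e.g. iterations per epoch
milestones_values = [(le * m, v) for m, v in config['lr_milestones_values']]
# -> [(0, 0.0), (1955, 0.4), (15640, 0.0)]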
Example #15
def run(
    train_batch_size,
    val_batch_size,
    epochs,
    lr,
    momentum,
    log_interval,
    log_dir,
    checkpoint_every,
    resume_from,
    crash_iteration=1000,
):

    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    writer = SummaryWriter(logdir=log_dir)
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    criterion = nn.NLLLoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "nll": Loss(criterion)
                                            },
                                            device=device)

    @trainer.on(Events.EPOCH_COMPLETED)
    def lr_step(engine):
        lr_scheduler.step()

    desc = "ITERATION - loss: {:.4f} - lr: {:.4f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0, lr))

    if log_interval is None:
        e = Events.ITERATION_COMPLETED
        log_interval = 1
    else:
        e = Events.ITERATION_COMPLETED(every=log_interval)

    @trainer.on(e)
    def log_training_loss(engine):
        lr = optimizer.param_groups[0]["lr"]
        pbar.desc = desc.format(engine.state.output, lr)
        pbar.update(log_interval)
        writer.add_scalar("training/loss", engine.state.output,
                          engine.state.iteration)
        writer.add_scalar("lr", lr, engine.state.iteration)

    if resume_from is None:

        @trainer.on(Events.ITERATION_COMPLETED(once=crash_iteration))
        def _(engine):
            raise Exception("STOP at {}".format(engine.state.iteration))

    else:

        @trainer.on(Events.STARTED)
        def _(engine):
            pbar.n = engine.state.iteration

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("valdation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("valdation/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler
    }
    training_checkpoint = Checkpoint(to_save=objects_to_checkpoint,
                                     save_handler=DiskSaver(
                                         log_dir, require_empty=False))

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=checkpoint_every),
        training_checkpoint)

    if resume_from is not None:
        tqdm.write("Resume from a checkpoint: {}".format(resume_from))
        checkpoint = torch.load(resume_from)
        Checkpoint.load_objects(to_load=objects_to_checkpoint,
                                checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=epochs)
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    pbar.close()
    writer.close()
Example #16
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval,
        log_dir):
    train_loader, val_loader = get_data_loaders(
        train_batch_size, val_batch_size)  # instantiate the DataLoaders
    model = Net()  # network model
    writer = create_summary_writer(model, train_loader, log_dir)
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)  # optimizer
    # define the trainer: instantiate it with the model, optimizer, loss and device
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "nll": Loss(F.nll_loss)
                                            },
                                            device=device)

    # define the event handlers
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_train_loss(engine):
        it = (engine.state.iteration - 1) % len(train_loader) + 1
        if it % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}".format(
                engine.state.epoch, it, len(train_loader),
                engine.state.output))  # engine.state.output is the per-batch loss
            writer.add_scalar("training/loss", engine.state.output,
                              engine.state.iteration)  # iteration is the total iteration count

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        print(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".
            format(engine.state.epoch, avg_accuracy, avg_nll))

        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("valdation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("valdation/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    trainer.run(train_loader, max_epochs=epochs)
    writer.close()
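The log_train_loss comment above touches on what engine.state.output holds: with create_supervised_trainer it defaults to the per-batch loss as a Python float. Spelling out ignite's documented default output_transform makes this explicit (equivalent to the trainer built above):

# create_supervised_trainer's default output_transform; engine.state.output
# therefore holds loss.item() for the current batch.
trainer = create_supervised_trainer(
    model,
    optimizer,
    F.nll_loss,
    device=device,
    output_transform=lambda x, y, y_pred, loss: loss.item(),
)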
Example #17
        "con_loss": contras_loss.item(),
        "clsf_loss": clsf_loss.item(),
        "emb_vecs": [emb_vec1, emb_vec2],
        "cls_pred": cls_pred,
        "cls_true": cls_true,
        "targets": targets
    }
    return ret


# ----------------------------------------
if __name__ == "__main__":
    engine = Engine(_update)
    metrics = {
        "sim_acc": SiameseNetSimilarityAccuracy(margin=margin, l2_normalize=True),
        "clsf_acc": Accuracy(
            output_transform=lambda x: (x['cls_pred'], x['cls_true']))
    }

    for name, metric in metrics.items():
        metric.attach(engine, name)

    from ignite.contrib.handlers import ProgressBar
    pbar = ProgressBar()
    pbar.attach(engine, output_transform=lambda x: {
        'con_loss': x['con_loss'],
        'clsf_loss': x['clsf_loss']
    })

    from ignite.engine import Events
    # @engine.on(Events.ITERATION_COMPLETED)
    # def log_training_loss(engine):
Example #18
model = UNet(in_channels=params['in_channels'],
             n_classes=params['n_classes'],
             depth=params['depth'])
model.load_state_dict(torch.load(model_path))

model.to(device)

# Create Trainer or Evaluators
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

# Determine metrics for evaluation.
metrics = {
    "accuracy": Accuracy(),
    "loss": Loss(criterion),
    "mean_iou": mIoU(ConfusionMatrix(num_classes=params['n_classes'])),
}


def backprop_step(engine, batch):
    model.eval()
    model.zero_grad()
    batch_x, batch_y = batch
    batch_x = batch_x.to(device)
    batch_y = batch_y.to(device)
    outputs = model(batch_x)
    loss = criterion(outputs[:, :, 127:128, 127:128],
                     batch_y[:, 127:128, 127:128])
    loss.backward()
Example #19
def main(batch_size, epochs):

    # 1. Device setup (in PyTorch the device must be specified explicitly)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)

    train_loader, test_loader = get_data_loaders(batch_size)

    # 2. Create the model
    #    model = net.CNN(num_classes=num_classes).to(device)
    model = net.Net(1000, 10).to(device)
    print(model)  # print the network details for inspection

    # 3. Define the loss function
    criterion = nn.CrossEntropyLoss()

    # 4. Define the optimizer (Adam is chosen here as an example)
    #    optimizer = optim.Adam(model.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters())
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics={
                                                      'accuracy': Accuracy(),
                                                      'loss': Loss(criterion)
                                                  },
                                                  device=device)
    test_evaluator = create_supervised_evaluator(model,
                                                 metrics={
                                                     'accuracy': Accuracy(),
                                                     'loss': Loss(criterion)
                                                 },
                                                 device=device)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    log_interval = 10

    # 5. Logging
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        i = (engine.state.iteration - 1) % len(train_loader) + 1
        if i % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        train_evaluator.run(train_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.3f} Avg loss: {:.4f}"
            .format(engine.state.epoch, avg_accuracy, avg_loss))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        test_evaluator.run(test_loader)
        metrics = test_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.3f} Avg loss: {:.4f}"
            .format(engine.state.epoch, avg_accuracy, avg_loss))
        pbar.n = pbar.last_print_n = 0

    def score_function(engine):
        val_loss = engine.state.metrics['loss']
        return -val_loss

    # 6. Checkpoint setup
    best_handler = ModelCheckpoint(dirname='./checkpoints',
                                   filename_prefix='best',
                                   n_saved=3,
                                   score_name='loss',
                                   score_function=score_function,
                                   create_dir=True,
                                   require_empty=False)
    test_evaluator.add_event_handler(Events.EPOCH_COMPLETED, best_handler,
                                     {'mymodel': model})

    early_handler = EarlyStopping(patience=5,
                                  score_function=score_function,
                                  trainer=trainer)
    # Note: the handler is attached to an *Evaluator* (runs one epoch on validation dataset)
    test_evaluator.add_event_handler(Events.COMPLETED, early_handler)

    # 7. Run
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
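Both ModelCheckpoint and EarlyStopping treat a larger score as better, which is why score_function returns -val_loss above. If checkpointing were keyed on accuracy instead, no sign flip would be needed:

# Alternative score function keyed on accuracy (higher is already better):
def score_function_acc(engine):
    return engine.state.metrics['accuracy']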
Example #20
# create DenseNet121
net = monai.networks.nets.densenet.densenet121(
    spatial_dims=3,
    in_channels=1,
    out_channels=2,
)
device = torch.device('cuda:0')


def prepare_batch(batch, device=None, non_blocking=False):
    return _prepare_batch((batch['img'], batch['label']), device, non_blocking)


metric_name = 'Accuracy'
# add evaluation metric to the evaluator engine
val_metrics = {metric_name: Accuracy()}
# ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration,
# user can add output_transform to return other values
evaluator = create_supervised_evaluator(net, val_metrics, device, True, prepare_batch=prepare_batch)

# add stats event handler to print validation stats via evaluator
val_stats_handler = StatsHandler(
    name='evaluator',
    output_transform=lambda x: None  # no need to print loss value, so disable per iteration output
)
val_stats_handler.attach(evaluator)

# for the array data format, assume the 3rd item of batch data is the meta_data
prediction_saver = ClassificationSaver(output_dir='tempdir', name='evaluator',
                                       batch_transform=lambda batch: {'filename_or_obj': batch['img.filename_or_obj']},
                                       output_transform=lambda output: output[0].argmax(1))
Example #21
def main(tempdir):
    monai.config.print_config()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # create 5 random image/mask pairs in the temporary directory
    print(f"generating synthetic data to {tempdir} (this may take a while)")
    for i in range(5):
        im, seg = create_test_image_3d(128,
                                       128,
                                       128,
                                       num_seg_classes=1,
                                       channel_dim=-1)
        n = nib.Nifti1Image(im, np.eye(4))
        nib.save(n, os.path.join(tempdir, f"im{i:d}.nii.gz"))
        n = nib.Nifti1Image(seg, np.eye(4))
        nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz"))

    images = sorted(glob(os.path.join(tempdir, "im*.nii.gz")))
    segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz")))
    val_files = [{
        "image": img,
        "label": seg
    } for img, seg in zip(images, segs)]

    # model file path
    model_file = glob("./runs/net_key_metric*")[0]

    # define transforms for image and segmentation
    val_transforms = Compose([
        LoadImaged(keys=["image", "label"]),
        AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
        ScaleIntensityd(keys="image"),
        EnsureTyped(keys=["image", "label"]),
    ])

    # create a validation data loader
    val_ds = monai.data.Dataset(data=val_files, transform=val_transforms)
    val_loader = monai.data.DataLoader(val_ds, batch_size=1, num_workers=4)

    # create UNet, DiceLoss and Adam optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = monai.networks.nets.UNet(
        spatial_dims=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)

    val_post_transforms = Compose([
        EnsureTyped(keys="pred"),
        Activationsd(keys="pred", sigmoid=True),
        AsDiscreted(keys="pred", threshold=0.5),
        KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
        SaveImaged(keys="pred",
                   meta_keys="image_meta_dict",
                   output_dir="./runs/")
    ])
    val_handlers = [
        StatsHandler(output_transform=lambda x: None),
        CheckpointLoader(load_path=model_file, load_dict={"net": net}),
    ]

    evaluator = SupervisedEvaluator(
        device=device,
        val_data_loader=val_loader,
        network=net,
        inferer=SlidingWindowInferer(roi_size=(96, 96, 96),
                                     sw_batch_size=4,
                                     overlap=0.5),
        postprocessing=val_post_transforms,
        key_val_metric={
            "val_mean_dice":
            MeanDice(include_background=True,
                     output_transform=from_engine(["pred", "label"]))
        },
        additional_metrics={
            "val_acc":
            Accuracy(output_transform=from_engine(["pred", "label"]))
        },
        val_handlers=val_handlers,
        # AMP evaluation is disabled if the GPU lacks FP16 support or PyTorch < 1.6
        amp=True if monai.utils.get_torch_version_tuple() >= (1, 6) else False,
    )
    evaluator.run()
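from_engine(["pred", "label"]) builds the output_transform that turns the evaluator's dict-style engine.state.output into the (y_pred, y) pair the metrics expect. A simplified stand-in for a plain dict output (MONAI's real helper also handles decollated lists of dicts):

# Simplified sketch of monai.handlers.from_engine for a dict output.
def from_engine_sketch(keys):
    def _transform(output):
        return tuple(output[k] for k in keys)
    return _transform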
Example #22
def run(model, optimizer, scheduler, loss_fn, device, train_loader,
        val_loader, training_history, param_history, model_info,
        start_epoch, path):

    expected_batch_size = model_info["data"]["batch_size"]

    def prep_batch(batch, device=device, non_blocking=False):
        return batch.to(device), batch.y.to(device)
    
    def update(trainer, batch):
        nonlocal expected_batch_size

        model.train()
        optimizer.zero_grad()
        x, y = prep_batch(batch, device=device, non_blocking=False)

        if expected_batch_size != x.num_graphs:
            print(expected_batch_size)
            print(type(x))
            print(x.num_graphs)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        # clip each parameter's gradients to [-1, 1]
        for param in model.parameters():
            if param.grad is None:
                continue
            param.grad.data.clamp_(-1, 1)

        optimizer.step()

        return loss.item()    

    trainer = Engine(update)

    evaluator = create_supervised_evaluator(model,
                                        metrics={'accuracy': Accuracy(),
                                        'nll': Loss(loss_fn)},
                                        device=device,
                                        prepare_batch=prep_batch)

    optimizer_history = []
    from event_handlers.log_lr import log_lr
    trainer.add_event_handler(
        Events.EPOCH_STARTED,
        log_lr,
        optimizer,
        optimizer_history)

    from event_handlers.scheduler import do_scheduler
    trainer.add_event_handler(
        Events.EPOCH_STARTED,
        do_scheduler,
        optimizer, scheduler)
    
    from event_handlers.log_gradient import log_gradient
    trainer.add_event_handler(
            Events.EPOCH_COMPLETED,
            log_gradient,
            model, param_history)

    from event_handlers.log_training import log_training_results
    trainer.add_event_handler(
            Events.EPOCH_COMPLETED,
            log_training_results,
            evaluator, val_loader, training_history)

    from event_handlers.save_model import handler_save_model
    trainer.add_event_handler(
            Events.EPOCH_COMPLETED,
            handler_save_model, 
            model_info["training"]["save_every"], model, optimizer, training_history, param_history, path, start_epoch)

    from event_handlers.save_img import log_img 
    trainer.add_event_handler(
            Events.EPOCH_COMPLETED,
            log_img, 
            model_info["training"]["save_every"], training_history, param_history, path, start_epoch,optimizer_history)

    pbar = tqdm(total=model_info["training"]["max_epochs"])
    @trainer.on(Events.EPOCH_COMPLETED)
    def show_bar(engine):
        pbar.update(1)
    trainer.run(train_loader, max_epochs=model_info["training"]["max_epochs"])
    pbar.close()
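The manual clamp loop inside update() is equivalent to PyTorch's built-in value clipping; the loop could be replaced with the one-liner below:

import torch

# Same effect as iterating over model.parameters() and calling
# param.grad.data.clamp_(-1, 1) on each gradient.
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=1.0)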
Example #23
classes = 1108

model = getattr(models, model_name)(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = torch.nn.Linear(num_ftrs, classes)

# In[6]:

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# In[7]:

metrics = {
    'loss': Loss(criterion),
    'accuracy': Accuracy(),
}

trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
val_evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device)

# In[8]:


@trainer.on(Events.EPOCH_COMPLETED)
def compute_and_display_val_metrics(engine):
    epoch = engine.state.epoch
    metrics = val_evaluator.run(val_loader).metrics
    print("Validation Results - Epoch: {}  Metrics: {}".format(epoch, metrics))
Example #24
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.NLLLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("trainer")

    val_metrics = {"accuracy": Accuracy(), "nll": Loss(criterion)}
    evaluator = create_supervised_evaluator(model,
                                            metrics=val_metrics,
                                            device=device)
    evaluator.logger = setup_logger("evaluator")

    pbar = tqdm(
        initial=0,
        leave=False,
        total=len(train_loader),
        desc=f"ITERATION - loss: {0:.2f}",
    )

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        pbar.desc = f"ITERATION - loss: {engine.state.output:.2f}"
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Training Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Validation Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )

        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time(engine):
        tqdm.write(
            f"{trainer.last_event_name.name} took { trainer.state.times[trainer.last_event_name.name]} seconds"
        )

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
Example #25
def run(tb, vb, lr, epochs, writer):
    device = os.environ['main-device']
    logging.info('Training program start!')
    logging.info('Configuration:')
    logging.info('\n' + json.dumps(INFO, indent=2))

    # ------------------------------------
    # 1. Define dataloader
    train_loader, train4val_loader, val_loader, num_of_images, mapping, _ = get_dataloaders(
        tb, vb)
    # train_loader, train4val_loader, val_loader, num_of_images = get_dataloaders(tb, vb)
    # Adjust weights of unknown
    num_of_images[6] += int(sum(num_of_images) / len(num_of_images))
    weights = (1 / num_of_images) / ((1 / num_of_images).sum().item())
    # weights = (1/num_of_images)/(1/num_of_images + 1/(num_of_images.sum().item()-num_of_images))
    weights = weights.to(device=device)

    # ------------------------------------
    # 2. Define model
    model = EfficientNet.from_pretrained(
        'efficientnet-b0', num_classes=INFO['dataset-info']['num-of-classes'])
    model = carrier(model)

    # ------------------------------------
    # 3. Define optimizer
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    ignite_scheduler = LRScheduler(scheduler)

    # ------------------------------------
    # 4. Define metrics

    class SoftCrossEntropyLoss(nn.Module):
        def __init__(self, weight=None):
            super(SoftCrossEntropyLoss, self).__init__()
            self.class_weights = weight

        def forward(self, input, target):
            softmax = torch.exp(input) / torch.exp(input).sum(1)[:, None]
            onehot_labels = to_onehot(target, input.shape[1])
            soft_labels = torch.zeros_like(onehot_labels)
            soft_labels = torch.where(
                onehot_labels.cpu() == 1, torch.tensor([0.9]),
                torch.tensor([0.1 / (input.shape[1] - 1)])).to(device=device)
            if self.class_weights is not None:
                # print(soft_labels.shape, softmax.shape)
                loss = -torch.sum(
                    torch.log(softmax) * soft_labels * self.class_weights *
                    input.shape[1])
            else:
                loss = -torch.sum(torch.log(softmax) * soft_labels)
            return loss

    class EntropyPrediction(metric.Metric):
        def __init__(self, threshold=1.0):
            super(EntropyPrediction, self).__init__()
            self.threshold = threshold
            self.prediction = torch.tensor([], dtype=torch.int)
            self.y = torch.tensor([], dtype=torch.int)

        def reset(self):
            # self.threshold = 0.3
            self.prediction = torch.tensor([])
            self.y = torch.tensor([])
            super(EntropyPrediction, self).reset()

        def update(self, output):
            y_pred, y = output
            softmax = torch.exp(y_pred) / torch.exp(y_pred).sum(1)[:, None]
            entropy_base = math.log(y_pred.shape[1])
            entropy = (-softmax * torch.log(softmax)).sum(1) / entropy_base
            values, inds = softmax.max(1)
            prediction = torch.where(entropy > self.threshold, inds,
                                     torch.tensor([-1]).to(device=device))
            self.prediction = torch.cat(
                (self.prediction.type(torch.LongTensor).to(device=device),
                 torch.tensor([mapping[x.item()]
                               for x in prediction]).to(device=device)))
            self.y = torch.cat(
                (self.y.type(torch.LongTensor).to(device=device),
                 y.to(device=device)))
            # return self.prediction, self.y

        def compute(self):
            return self.prediction, self.y

    train_metrics = {
        'accuracy':
        Accuracy(),
        'loss':
        Loss(nn.CrossEntropyLoss(weight=weights)),
        'precision_recall':
        MetricsLambda(PrecisionRecallTable, Precision(), Recall(),
                      train_loader.dataset.classes),
        'cmatrix':
        MetricsLambda(CMatrixTable,
                      ConfusionMatrix(INFO['dataset-info']['num-of-classes']),
                      train_loader.dataset.classes)
    }

    val_metrics = {
        'accuracy':
        MetricsLambda(Labels2Acc, EntropyPrediction()),
        'precision_recall':
        MetricsLambda(Labels2PrecisionRecall, EntropyPrediction(),
                      val_loader.dataset.classes),
        'cmatrix':
        MetricsLambda(Labels2CMatrix, EntropyPrediction(),
                      val_loader.dataset.classes)
    }

    # ------------------------------------
    # 5. Create trainer
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        nn.CrossEntropyLoss(weight=weights),
                                        device=device)

    # ------------------------------------
    # 6. Create evaluator
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=train_metrics,
                                                  device=device)
    val_evaluator = create_supervised_evaluator(model,
                                                metrics=val_metrics,
                                                device=device)

    desc = 'ITERATION - loss: {:.4f}'
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    # ------------------------------------
    # 7. Create event hooks

    # Update process bar on each iteration completed.
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        log_interval = 1
        it = (engine.state.iteration - 1) % len(train_loader) + 1
        if it % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_STARTED)
    def refresh_pbar(engine):
        pbar.refresh()
        pbar.n = pbar.last_print_n = 0

    # Compute metrics on train data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        print('Checking on training set.')
        train_evaluator.run(train4val_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
      Training Results - Epoch: {}
      Avg accuracy: {:.4f}
      Avg loss: {:.4f}
      precision_recall: \n{}
      confusion matrix: \n{}
      """.format(engine.state.epoch, avg_accuracy, avg_loss,
                 precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy},
                           engine.state.epoch)
        writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss},
                           engine.state.epoch)

    # Compute metrics on val data on each epoch completed.
    cpe = CustomPeriodicEvent(n_epochs=50)
    cpe.attach(trainer)

    @trainer.on(cpe.Events.EPOCHS_50_COMPLETED)
    def log_validation_results(engine):
        pbar.clear()
        print('* - * - * - * - * - * - * - * - * - * - * - * - *')
        print('Checking on validation set.')
        val_evaluator.run(val_loader)
        metrics = val_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
      Validation Results - Epoch: {}
      Avg accuracy: {:.4f}
      precision_recall: \n{}
      confusion matrix: \n{}
      """.format(engine.state.epoch, avg_accuracy, precision_recall['pretty'],
                 cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Val Acc': avg_accuracy},
                           engine.state.epoch)
        writer.add_scalars(
            'Aggregate/Score', {
                'Val avg precision': precision_recall['data'][0, -1],
                'Val avg recall': precision_recall['data'][1, -1]
            }, engine.state.epoch)

    # Save model ever N epoch.
    save_model_handler = ModelCheckpoint(os.environ['savedir'],
                                         '',
                                         save_interval=10,
                                         n_saved=2)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_model_handler,
                              {'model': model})

    # Update learning-rate due to scheduler.
    trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler)

    # ------------------------------------
    # Run
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
Example #26
def run_training(model, optimizer, scheduler, output_path,
                 train_loader, val_loader, epochs, patience,
                 epochs_pretrain, mixed_precision, classes_weights):

    # trainer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if classes_weights is not None:
        classes_weights = classes_weights.to(device)
    crit = nn.CrossEntropyLoss(weight=classes_weights)
    metrics = {"accuracy": Accuracy(), "loss": Loss(crit)}
    trainer = create_supervised_trainer_with_pretraining(
        model, optimizer, crit, device=device, epochs_pretrain=epochs_pretrain,
        mixed_precision=mixed_precision)
    train_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)

    # Out paths
    path_ckpt = os.path.join(output_path, "model_ckpt")
    log_dir = os.path.join(output_path, "log_dir")
    os.makedirs(log_dir, exist_ok=True)

    # tensorboard
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger.attach(train_evaluator, log_handler=OutputHandler(tag="training", metric_names=[
        "accuracy", "loss"], another_engine=trainer), event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(val_evaluator, log_handler=OutputHandler(tag="validation", metric_names=[
        "accuracy", "loss"], another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

    # training progress
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names="all")

    # @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        train_evaluator.run(train_loader)
        val_evaluator.run(val_loader)
        train_loss = train_evaluator.state.metrics["loss"]
        val_loss = val_evaluator.state.metrics["loss"]
        train_acc = train_evaluator.state.metrics["accuracy"]
        val_acc = val_evaluator.state.metrics["accuracy"]
        pbar.log_message(
            "Training Results - Epoch: {}  Loss: {:.6f}  Accuracy: {:.6f}".format(engine.state.epoch, train_loss, train_acc))
        pbar.log_message(
            "Validation Results - Epoch: {}  Loss: {:.6f}  Accuracy: {:.6f}".format(engine.state.epoch, val_loss, val_acc))

        pbar.n = pbar.last_print_n = 0

    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results)

    # def get_val_loss(engine):
    # 	return -engine.state.metrics['loss']
    def get_val_acc(engine):
        return engine.state.metrics['accuracy']

    # checkpoint and early stopping
    checkpointer = ModelCheckpoint(
        path_ckpt, "model", score_function=get_val_acc, score_name="accuracy", require_empty=False)
    early_stopper = EarlyStopping(patience, get_val_acc, trainer)

    to_save = {'optimizer': optimizer, 'model': model}
    if scheduler is not None:
        to_save["scheduler"] = scheduler
    val_evaluator.add_event_handler(Events.COMPLETED, checkpointer, to_save)
    val_evaluator.add_event_handler(Events.COMPLETED, early_stopper)
    if scheduler is not None:
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # free resources
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())
    train_evaluator.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())
    val_evaluator.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())

    trainer.run(train_loader, max_epochs=epochs)
    tb_logger.close()

    # Evaluation with best model
    model.load_state_dict(torch.load(
        glob.glob(os.path.join(path_ckpt, "*.pth"))[0])["model"])
    train_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)

    train_evaluator.run(train_loader)
    val_evaluator.run(val_loader)

    _pretty_print("Evaluating best model")
    pbar.log_message(
        "Best model on training set - Loss: {:.6f}  Accuracy: {:.6f}"
        .format(train_evaluator.state.metrics["loss"], train_evaluator.state.metrics["accuracy"]))
    pbar.log_message(
        "Best model on validation set - Loss: {:.6f}  Accuracy: {:.6f}"
        .format(val_evaluator.state.metrics["loss"], val_evaluator.state.metrics["accuracy"]))

    return model, train_evaluator.state.metrics, val_evaluator.state.metrics
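_empty_cache is not defined in this snippet; judging by how it is attached after every iteration, it is presumably a small wrapper around torch.cuda.empty_cache() that stays safe on CPU-only runs. A sketch under that assumption:

import torch

# Assumed implementation of the _empty_cache helper used above.
def _empty_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()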
Example #27
    def __init__(self, output_transform=binary_transform):
        self._y = None
        self._pred = None
        super().__init__(output_transform=output_transform)

    def reset(self):
        self._y = list()
        self._pred = list()
        super().reset()

    def update(self, output):
        y_pred, y = output
        self._y.append(y)
        self._pred.append(y_pred)

    def compute(self):
        y_pred = torch.cat(self._pred, 0).cpu()
        y = torch.cat(self._y, 0).cpu()
        score = f1_score(y, y_pred, average='weighted')
        return score


metric = {
    'acc2': Accuracy(output_transform=binary_transform),
    'acc5': Accuracy(output_transform=five_transform),
    'acc7': Accuracy(output_transform=seven_transform),
    'f1': F1(),
    'corr': Pearson(),
    'mae': MeanAbsoluteError()
}
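binary_transform, five_transform and seven_transform are not shown here; the acc2/acc5/acc7 naming suggests a sentiment-regression setup in which continuous predictions are discretized into 2, 5 or 7 classes before Accuracy runs. A hypothetical binary_transform in that spirit:

import torch

# Hypothetical binary_transform: map continuous scores to
# positive/negative classes before computing 2-class accuracy.
def binary_transform(output):
    y_pred, y = output
    return (y_pred > 0).long(), (y > 0).long()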
Example #28
    def _train(
        self,
        train_data,
        val_data,
        test_data,
        writer,
        experiment,
        dry_run: bool = False,
    ) -> None:

        use_cuda = torch.cuda.is_available()

        # Preprocess all datasets.
        logger.info("Preprocessing datasets...")
        train_loader = self._preprocess_for_training("train", train_data, use_cuda)
        val_loader = self._preprocess_for_training("val", val_data, use_cuda)
        test_loader = self._preprocess_for_training("test", test_data, use_cuda)
        logger.info("")

        # Set up model and move it to device.
        logger.info("Creating model...")
        self._create_model(self.num_classes)
        device = torch.device("cuda" if use_cuda else "cpu")
        logger.info(f"    device: {device}")
        self.model = self.model.to(device)

        # Set up optimizer and loss.
        optimizer = self._create_optimizer()
        loss_func = nn.CrossEntropyLoss()
        logger.info(f"    loss function: cross-entropy")
        logger.info("")

        # Dedicate a few images that will be plotted as samples to tensorboard.
        num_samples_to_plot = self.config.get("num_samples_to_plot", 5)

        def get_samples(loader):
            if loader is None:
                return None, None
            else:
                return next(
                    iter(DataLoader(loader.dataset, batch_size=num_samples_to_plot))
                )

        train_sample_images, train_sample_labels = get_samples(train_loader)
        val_sample_images, val_sample_labels = get_samples(val_loader)
        test_sample_images, test_sample_labels = get_samples(test_loader)

        # Configure trainer and metrics.
        accumulate_train_metrics = self.config.get("accumulate_train_metrics", True)

        # We need to transform the output of the trainer and metrics here to accumulate
        # metrics during training (otherwise, we have to re-evaluate on the complete
        # train set which takes a long time). By default, the trainer outputs
        # `loss.item()` and the metrics expect `y_pred, y` (which is what the evaluator
        # outputs). We are now outputting `y_pred, y, loss` from the trainer and then
        # slicing off the `loss` before it goes into the metric.
        # See also the footnote here (note that it is slightly inaccurate):
        # https://pytorch.org/ignite/quickstart.html#
        def trainer_output_transform(x, y, y_pred, loss):
            return y_pred, y, loss.item()

        def metrics_output_transform(output):
            return output[:2]  # use only y_pred, y

        trainer = create_supervised_trainer(
            self.model,
            optimizer,
            loss_func,
            device=device,
            output_transform=trainer_output_transform,
        )
        if accumulate_train_metrics:
            # TODO: Maybe put train_metrics and val_metrics into one dict.
            train_metrics = {
                "accuracy": Accuracy(output_transform=metrics_output_transform),
                "loss": Loss(loss_func, output_transform=metrics_output_transform),
                # "confusion_matrix": ConfusionMatrix(num_classes),
            }
            for name, metric in train_metrics.items():
                # Attach metrics to trainer to accumulate them during training.
                metric.attach(trainer, name)
        val_metrics = {
            "accuracy": Accuracy(),
            "loss": Loss(loss_func),
            # "confusion_matrix": ConfusionMatrix(num_classes),
        }
        evaluator = create_supervised_evaluator(
            self.model, metrics=val_metrics, device=device
        )

        @trainer.on(
            Events.ITERATION_COMPLETED(every=self.config.get("print_every", 100))
        )
        def log_batch(trainer):
            batch = (trainer.state.iteration - 1) % trainer.state.epoch_length + 1
            logger.info(
                f"Epoch {trainer.state.epoch} / {num_epochs}, "
                f"batch {batch} / {trainer.state.epoch_length}: "
                f"Loss: {trainer.state.output[2]:.3f}"
                # f"Loss: {trainer.state.output:.3f}"
            )

        def log_results(name, metrics, epoch):
            """Log results of an epoch to stdout, tensorboard and comet."""
            logger.info(
                f"{name}: Average loss: {metrics['loss']:.3f}, "
                f"Average accuracy: {metrics['accuracy']:.3f}"
            )
            experiment.log_metric(f"{name}_loss", metrics["loss"])
            experiment.log_metric(f"{name}_accuracy", metrics["accuracy"])
            writer.add_scalar(f"{name}_loss", metrics["loss"], epoch)
            writer.add_scalar(f"{name}_accuracy", metrics["accuracy"], epoch)

        # TODO: This iterates over complete train set again, maybe accumulate as in the
        #   example in the footnote here: https://pytorch.org/ignite/quickstart.html#
        @trainer.on(Events.EPOCH_COMPLETED)
        def log_epoch(trainer):
            logger.info("")
            logger.info(f"Epoch {trainer.state.epoch} / {num_epochs} results: ")

            # Train data.
            if accumulate_train_metrics:
                log_results("train", trainer.state.metrics, trainer.state.epoch)
                logger.info("(train metrics are accumulated during training; "
                            "to re-evaluate on the complete train set after training, "
                            "use config parameter 'accumulate_train_metrics': False)")
            else:
                evaluator.run(train_loader)
                log_results("train", evaluator.state.metrics, trainer.state.epoch)

            # Val data.
            if val_loader:
                evaluator.run(val_loader)
                log_results("val", evaluator.state.metrics, trainer.state.epoch)

            # Test data.
            if test_loader:
                evaluator.run(test_loader)
                log_results("test", evaluator.state.metrics, trainer.state.epoch)

            logger.info("")

        @trainer.on(Events.EPOCH_COMPLETED)
        def checkpoint_model(trainer):
            # TODO: Do not checkpoint at every step.
            checkpoint_dir = (
                self.out_dir / "checkpoints" / f"epoch{trainer.state.epoch}"
            )
            checkpoint_dir.mkdir(parents=True, exist_ok=True)
            torch.save(self.model, checkpoint_dir / "model.pt")

        @trainer.on(Events.EPOCH_COMPLETED)
        def plot_samples(trainer):
            """Plot a few sample images and probabilites to tensorboard."""

            def write_samples_plot(name, sample_images, sample_labels):
                # TODO: This can be improved by just using the outputs already
                #   calculated in evaluator.state.output in the functions above.
                #   Problem: At least in the train evaluator, the batches are not equal,
                #   so the plotted images will differ from run to run.
                if sample_images is None:
                    return

                with torch.no_grad():
                    sample_output = self.model(sample_images.to(device))
                    sample_pred = torch.softmax(sample_output, dim=1)

                visualization.plot_samples(
                    writer,
                    f"{name}-samples",
                    trainer.state.epoch,
                    sample_images.to("cpu").numpy(),
                    sample_labels.to("cpu").numpy(),
                    sample_pred.to("cpu").numpy(),
                )

            write_samples_plot("train", train_sample_images, train_sample_labels)
            write_samples_plot("val", val_sample_images, val_sample_labels)
            write_samples_plot("test", test_sample_images, test_sample_labels)

        # Start training.
        num_epochs = 1 if dry_run else self.config.get("num_epochs", 5)
        if dry_run:
            num_batches = 1
            logger.info(f"Training model on device {device}... (DRY RUN, only 1 batch)")
        elif "num_samples" in self.config:
            # TODO: Make sure batch_size doesn't differ from the value extracted during
            #   preprocessing.
            batch_size = self.config.get("batch_size", 128)
            # TODO: This always uses a few more samples than num_samples. Maybe get it
            #   to the correct value.
            num_batches = int(self.config["num_samples"] / batch_size) + 1
            logger.info(
                f"Training model on device {device}... (using "
                f"{self.config['num_samples']} of {len(train_loader.dataset)} samples)"
            )
        else:
            num_batches = None  # all batches
            logger.info(f"Training model on device {device}...")
            logger.info(
                "(if this takes too long, train on less data with the config "
                "parameter 'num_samples')"
            )
        logger.info("(show more steps by setting the config parameter 'print_every')")
        logger.info("")
        trainer.run(train_loader, max_epochs=num_epochs, epoch_length=num_batches)
        logger.info("Training finished!")

        # Save the trained model.
        torch.save(self.model, self.out_dir / "model.pt")
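
The checkpoint_model handler above saves at every epoch (see its TODO). A hedged alternative is ignite's built-in ModelCheckpoint handler, which keeps only the `n_saved` most recent checkpoints; the directory name here is a placeholder, and these lines would replace the manual handler inside `_train`:

# Sketch only: let ignite manage checkpoint rotation instead of the
# manual checkpoint_model handler.
from ignite.engine import Events
from ignite.handlers import ModelCheckpoint

checkpointer = ModelCheckpoint(
    dirname="checkpoints",  # placeholder directory
    filename_prefix="model",
    n_saved=2,              # keep only the 2 latest checkpoints
    create_dir=True,
    require_empty=False,
)
trainer.add_event_handler(
    Events.EPOCH_COMPLETED, checkpointer, {"model": self.model}
)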
Example #29
# Imports needed to run this test stand-alone.
import numpy as np
import torch
from ignite.engine import Engine, Events
from ignite.metrics import Accuracy, RunningAverage


def test_integration():

    n_iters = 100
    batch_size = 10
    n_classes = 10
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    loss_values = iter(range(n_iters))

    def update_fn(engine, batch):
        loss_value = next(loss_values)
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy(
            y_true_batch)

    trainer = Engine(update_fn)
    alpha = 0.98

    acc_metric = RunningAverage(
        Accuracy(output_transform=lambda x: (x[1], x[2])), alpha=alpha)
    acc_metric.attach(trainer, 'running_avg_accuracy')

    avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha)
    avg_output.attach(trainer, 'running_avg_output')

    running_avg_acc = [
        None,
    ]

    @trainer.on(Events.ITERATION_COMPLETED)
    def manual_running_avg_acc(engine):
        _, y_pred, y = engine.state.output
        indices = torch.max(y_pred, 1)[1]
        correct = torch.eq(indices, y).view(-1)
        num_correct = torch.sum(correct).item()
        num_examples = correct.shape[0]
        batch_acc = num_correct * 1.0 / num_examples
        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (
                1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    @trainer.on(Events.EPOCH_STARTED)
    def running_avg_output_init(engine):
        engine.state.running_avg_output = None

    @trainer.on(Events.ITERATION_COMPLETED)
    def running_avg_output_update(engine):
        if engine.state.running_avg_output is None:
            engine.state.running_avg_output = engine.state.output[0]
        else:
            engine.state.running_avg_output = engine.state.running_avg_output * alpha + \
                (1.0 - alpha) * engine.state.output[0]

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_acc_values(engine):
        assert engine.state.running_avg_acc == engine.state.metrics['running_avg_accuracy'], \
            "{} vs {}".format(engine.state.running_avg_acc, engine.state.metrics['running_avg_accuracy'])

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_output_values(engine):
        assert engine.state.running_avg_output == engine.state.metrics['running_avg_output'], \
            "{} vs {}".format(engine.state.running_avg_output, engine.state.metrics['running_avg_output'])

    np.random.seed(10)
    running_avg_acc = [
        None,
    ]
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)

    running_avg_acc = [
        None,
    ]
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)
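
The manual handlers above mirror what RunningAverage computes internally: an exponential moving average, running = alpha * running + (1 - alpha) * new_value. A minimal stand-alone sketch of the same pattern, with a dummy update function standing in for a real training step:

# Sketch only: exponential moving average of a per-iteration "loss",
# as computed by RunningAverage with the given alpha.
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage

def step(engine, batch):
    return float(batch)  # pretend the batch index is the loss

engine = Engine(step)
RunningAverage(output_transform=lambda loss: loss, alpha=0.98).attach(
    engine, "ema_loss")

@engine.on(Events.ITERATION_COMPLETED)
def log_ema(engine):
    print(engine.state.metrics["ema_loss"])

engine.run(range(10), max_epochs=1)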
Example #30
    # Snippet context: method of a training class; assumes imports of torch,
    # tqdm, the ignite helpers used below (create_supervised_trainer,
    # create_supervised_evaluator, Accuracy, Loss, Events) and the project's
    # model modules (GQSEResNeXt and the commented-out alternatives).
    def train_epoch(self):

        device = 'cpu'

        ########### model ###########
        #Model = model.GQNet()
        #Model = GQResnet.resnet18()
        #Model = GQResNeXt.resnext18()
        #Model = GQSEResNet.se_resnet_18()
        Model = GQSEResNeXt.se_resnext_18()
        #Model = GQSKResnet.SKNet18()
        #Model = GQSKResNeXt.SKNet18()

        total_model_parameters = sum(x.numel() for x in Model.parameters())
        print("Model have %.2fM paramerters in total" %
              (total_model_parameters / 1e6))

        print("SE_ResNeXt")

        if torch.cuda.is_available():
            print("pytorch gpu")
            device = 'cuda'
            Model = Model.cuda()

        ########### optimizer, loss ###########
        optimizer = torch.optim.Adam(Model.parameters(),
                                     lr=self.learning_rate,
                                     weight_decay=0.0001)
        loss = torch.nn.CrossEntropyLoss()

        ########### create trainer and evaluator ###########
        trainer = create_supervised_trainer(Model,
                                            optimizer,
                                            loss,
                                            device=device)

        evaluator = create_supervised_evaluator(Model,
                                                metrics={
                                                    "accuracy": Accuracy(),
                                                    "loss": Loss(loss)
                                                },
                                                device=device)

        ########### log ###########
        desc = "ITERATION - loss: {:.2f}"
        pbar = tqdm(initial=0,
                    leave=False,
                    total=len(self.data_loader_train),
                    desc=desc.format(0))

        ########### train loss ###########
        @trainer.on(Events.ITERATION_COMPLETED)
        def log_training_loss(engine):
            it = (engine.state.iteration - 1) % len(
                self.data_loader_train) + 1

            if it % self.log_interval == 0:
                pbar.desc = desc.format(engine.state.output)
                pbar.update(self.log_interval)

        ########### train results ###########
        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_results(engine):
            pbar.refresh()
            evaluator.run(self.data_loader_train)
            metrics = evaluator.state.metrics
            avg_accuracy = metrics['accuracy']
            avg_cross_loss = metrics['loss']

            tqdm.write(
                "Training Results - Epoch: {}  Avg accuracy: {:.2f}  Avg loss: {:.2f}"
                .format(engine.state.epoch, avg_accuracy, avg_cross_loss))

        ########### test results ###########
        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(engine):
            evaluator.run(self.data_loader_valid)
            metrics = evaluator.state.metrics
            avg_accuracy = metrics['accuracy']
            avg_cross_loss = metrics['loss']

            tqdm.write(
                "Validation Results - Epoch: {}  Avg accuracy: {:.2f}  Avg loss: {:.2f}"
                .format(engine.state.epoch, avg_accuracy, avg_cross_loss))

            pbar.n = pbar.last_print_n = 0

        ########### trainer start ###########
        trainer.run(self.data_loader_train, max_epochs=self.num_epochs)

        ############## save model ################
        torch.save(Model, str(self.version_name))
        pbar.close()
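
Since this example already runs the evaluator on the validation set each epoch, early stopping could be bolted on with ignite's EarlyStopping handler. A hedged sketch (the patience value is an arbitrary choice); these lines would go inside train_epoch, after the evaluator is created and before trainer.run:

# Sketch only: stop training when validation accuracy stops improving.
from ignite.engine import Events
from ignite.handlers import EarlyStopping

def score_function(engine):
    # EarlyStopping treats higher scores as better, so accuracy works directly.
    return engine.state.metrics["accuracy"]

early_stopping = EarlyStopping(
    patience=5, score_function=score_function, trainer=trainer)
evaluator.add_event_handler(Events.COMPLETED, early_stopping)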