Example No. 1
def initialize(config):
    model = utils.get_model(config["model"], config["model_dir"],
                            config["dropout"], config["n_fc"],
                            config["num_classes"])

    config["learning_rate"] *= idist.get_world_size()
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=config["learning_rate"],
        weight_decay=config["weight_decay"],
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.BCEWithLogitsLoss()

    le = config["num_iters_per_epoch"]
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    return model, optimizer, criterion, lr_scheduler
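A minimal, self-contained sketch (not taken from any of the examples here) of how a PiecewiseLinear schedule like the one built above is typically attached to an ignite trainer: the scheduler is registered as an event handler fired at the start of every iteration. The model, optimizer, and milestone values are illustrative placeholders.

import torch.nn as nn
import torch.optim as optim
import ignite.distributed as idist
from ignite.engine import Events, create_supervised_trainer
from ignite.handlers import PiecewiseLinear  # lives in ignite.contrib.handlers in older releases

# Tiny placeholder model and optimizer, adapted with the same idist helpers as above
model = idist.auto_model(nn.Linear(10, 2))
optimizer = idist.auto_optim(optim.AdamW(model.parameters(), lr=1e-3))
criterion = nn.CrossEntropyLoss()

# Warm up from 0 to 1e-3 over 200 iterations, then decay linearly to 0 by iteration 2000
lr_scheduler = PiecewiseLinear(
    optimizer, param_name="lr",
    milestones_values=[(0, 0.0), (200, 1e-3), (2000, 0.0)],
)

trainer = create_supervised_trainer(model, optimizer, criterion, device=idist.device())
trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
# trainer.run(train_loader, max_epochs=...) would then drive both the training and the schedule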
Example No. 2
def initialize(config):
    model = utils.get_model(config["model"])
    # Adapt model for distributed settings if configured
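    # find_unused_parameters=True lets DistributedDataParallel tolerate parameters that
    # receive no gradient in a given backward pass, at the cost of some extra overhead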
    model = idist.auto_model(model, find_unused_parameters=True)

    optimizer = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.CrossEntropyLoss().to(idist.device())

    le = config["num_iters_per_epoch"]
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    return model, optimizer, criterion, lr_scheduler
Example No. 3
def initialize(config):

    device = idist.device()

    model = config.model.to(device)
    optimizer = config.optimizer

    # Adapt model to dist config
    model = idist.auto_model(model)

    if idist.backend() == "horovod":
        accumulation_steps = config.get("accumulation_steps", 1)
        # Can not use auto_optim with Horovod: https://github.com/horovod/horovod/issues/2670
        import horovod.torch as hvd

        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            backward_passes_per_step=accumulation_steps,
        )
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        if accumulation_steps > 1:
            # disable manual grads accumulation as it is already done on optimizer's side
            config.accumulation_steps = 1
    else:
        optimizer = idist.auto_optim(optimizer)
    criterion = config.criterion.to(device)

    return model, optimizer, criterion
Example No. 4
def initialize(config):
    model = utils.get_model(config["model"])
    # Adapt model for distributed backend if provided
    model = idist.auto_model(model)

    optimizer = utils.get_optimizer(
        config["optimizer"],
        model,
        learning_rate=config["learning_rate"],
        weight_decay=config["weight_decay"],
    )
    # Adapt optimizer for distributed backend if provided
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.CrossEntropyLoss().to(idist.device())

    le = config["num_iters_per_epoch"]
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    return model, optimizer, criterion, lr_scheduler
Example No. 5
def initialize(
    config: ConfigSchema, wlm: WeakLabelManager
) -> Tuple[nn.Module, Optimizer, nn.Module]:
    model = get_model(config.model)
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    to_decay, not_to_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        elif len(param.shape) == 1 or name.endswith("bias"):
            not_to_decay.append(param)
        else:
            to_decay.append(param)
    optimizer = optim.SGD(
        [
            {"params": to_decay, "weight_decay": config.weight_decay},
            {"params": not_to_deacy, "weight_decay": 0.0},
        ],
        lr=config.learning_rate,
        momentum=config.momentum,
        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = get_weak_label_loss(config, wlm).to(idist.device())

    return model, optimizer, criterion
Example No. 6
    def _init_distribution(self):
        self.rank = idist.get_rank()
        manual_seed(42 + self.rank)
        self.device = idist.device()

        if self.train_ds:
            if self.train_ds.sampler is not None:
                sampler = self.train_ds.sampler(self.train_ds,
                                                self.train_ds.get_label)
                isShuffle = False
            else:
                sampler = None
                isShuffle = True
            self.train_loader = idist.auto_dataloader(
                self.train_ds,
                batch_size=self.hparams.train_bs,
                num_workers=self.hparams.train_num_workers,
                shuffle=isShuffle,
                drop_last=True,
                sampler=sampler,
                **self.train_ds.additional_loader_params)

        if self.valid_ds:
            self.valid_loader = idist.auto_dataloader(
                self.valid_ds,
                batch_size=self.hparams.valid_bs,
                num_workers=self.hparams.valid_num_workers,
                shuffle=False,
                drop_last=False,
                **self.valid_ds.additional_loader_params)

        if self.test_ds:
            self.test_loader = idist.auto_dataloader(
                self.test_ds,
                batch_size=self.hparams.valid_bs,
                num_workers=self.hparams.valid_num_workers,
                shuffle=False,
                drop_last=False,
                **self.test_ds.additional_loader_params)

        if USE_AMP:
            # apex AMP needs the optimizer to exist before amp.initialize wraps model and optimizer
            self._init_optimizer()
            self.model = idist.auto_model(self.model)
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level="O1")
        else:
            self.model = idist.auto_model(self.model)
            self._init_optimizer()

        self.optimizer = idist.auto_optim(self.optimizer)

        self._init_scheduler()

        self.criterion = self.criterion.to(self.device)
Example No. 7
def get_optimizer(model, config):

    assert config["optimizer"] in optim.__dict__, \
        f"Unknown optimizer: {config['optimizer']}"

    optimizer = optim.__dict__[config["optimizer"]](
        model.parameters(),
        lr=config["learning_rate"],
        weight_decay=config["weight_decay"])

    optimizer = idist.auto_optim(optimizer)

    return optimizer
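A brief usage sketch for the lookup above (the model and config values are hypothetical): any optimizer class exposed by torch.optim can be selected by its string name.

import torch.nn as nn

config = {"optimizer": "AdamW", "learning_rate": 3e-4, "weight_decay": 1e-2}
model = nn.Linear(16, 4)
optimizer = get_optimizer(model, config)  # returns an idist-adapted torch.optim.AdamW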
Example No. 8
def initialize(config):
    model = get_model(config["model"])
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("learning_rate", 0.1),
        momentum=config.get("momentum", 0.9),
        weight_decay=config.get("weight_decay", 1e-5),
        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.CrossEntropyLoss().to(idist.device())

    le = config["num_iters_per_epoch"]
    lr_scheduler = StepLR(optimizer, step_size=le, gamma=0.9)

    return model, optimizer, criterion, lr_scheduler
Example No. 9
def initialize(config):
    model = get_model(config.model, config.model_dir, config.dropout, config.n_fc, config.num_classes)

    config.learning_rate *= idist.get_world_size()
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
    optimizer = idist.auto_optim(optimizer)
    loss_fn = nn.BCEWithLogitsLoss()

    le = config.num_iters_per_epoch
    milestones_values = [
        (0, 0.0),
        (le * config.num_warmup_epochs, config.learning_rate),
        (le * config.max_epochs, 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    return model, optimizer, loss_fn, lr_scheduler
Example No. 10
def initialize(config):
    model = utils.get_model(config["model"], config["num_classes"])
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        #        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)

    # criterion = nn.CrossEntropyLoss().to(idist.device())
    criterion = nn.CrossEntropyLoss()

    le = config["num_iters_per_epoch"]
    cl = config["learning_rate"]
    # print("%d, %f" %(le,cl))
    milestones_values = [
        (30 * le, cl),
        (45 * le, 0.5 * cl),
        (46 * le, 0.1 * cl),
        (60 * le, 0.1 * cl),
        (61 * le, 0.01 * cl),
        (90 * le, 0.01 * cl),
        (120 * le, 0.001 * cl),
        # (le * config["num_warmup_epochs"], config["learning_rate"]),
        # (le * config["num_epochs"], 0.0),
    ]
    # print(milestones_values)
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config["lr_step_size"], gamma=config["lr_gamma"])

    return model, optimizer, criterion, lr_scheduler
Example No. 11
def initialize(config: Optional[Any]) -> Tuple[Module, Optimizer, Module, Union[_LRScheduler, ParamScheduler]]:
    """Initializing model, optimizer, loss function, and lr scheduler
    with correct settings.

    Parameters
    ----------
    config:
        config object

    Returns
    -------
    model, optimizer, loss_fn, lr_scheduler
    """
    model = ...
    optimizer = ...
    loss_fn = ...
    lr_scheduler = ...
    model = idist.auto_model(model)
    optimizer = idist.auto_optim(optimizer)
    loss_fn = loss_fn.to(idist.device())

    return model, optimizer, loss_fn, lr_scheduler
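One possible concrete filling of the skeleton above, offered as an illustration rather than the original implementation: any model/optimizer/loss/scheduler combination adapted with the idist helpers satisfies the declared return type.

import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import ignite.distributed as idist

def initialize_example(config: dict):
    model = idist.auto_model(nn.Linear(32, 10))
    optimizer = idist.auto_optim(optim.SGD(model.parameters(), lr=config.get("lr", 0.1)))
    loss_fn = nn.CrossEntropyLoss().to(idist.device())
    lr_scheduler = StepLR(optimizer, step_size=config.get("step_size", 30))
    return model, optimizer, loss_fn, lr_scheduler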
Example No. 12
def initialize(cfg):
    model = setup_model(cfg.model, num_classes=cfg.num_classes)
    ema_model = setup_model(cfg.model, num_classes=cfg.num_classes)

    model.to(idist.device())
    ema_model.to(idist.device())
    setup_ema(ema_model, model)

    model = idist.auto_model(model)

    if isinstance(model, nn.parallel.DataParallel):
        ema_model = nn.parallel.DataParallel(ema_model)

    optimizer = instantiate(cfg.solver.optimizer, model.parameters())
    optimizer = idist.auto_optim(optimizer)

    sup_criterion = instantiate(cfg.solver.supervised_criterion)

    total_num_iters = cfg.solver.num_epochs * cfg.solver.epoch_length
    lr_scheduler = instantiate(cfg.solver.lr_scheduler,
                               optimizer,
                               T_max=total_num_iters)

    return model, ema_model, optimizer, sup_criterion, lr_scheduler
Example No. 13
def training(rank, config):

    # Specific ignite.distributed
    print(
        idist.get_rank(),
        ": run with config:",
        config,
        "- backend=",
        idist.backend(),
        "- world size",
        idist.get_world_size(),
    )

    device = idist.device()

    # Data preparation:
    dataset = RndDataset(nb_samples=config["nb_samples"])

    # Specific ignite.distributed
    train_loader = idist.auto_dataloader(dataset,
                                         batch_size=config["batch_size"])

    # Model, criterion, optimizer setup
    model = idist.auto_model(wide_resnet50_2(num_classes=100))
    criterion = NLLLoss()
    optimizer = idist.auto_optim(SGD(model.parameters(), lr=0.01))

    # Training loop log param
    log_interval = config["log_interval"]

    def _train_step(engine, batch):

        data = batch[0].to(device)
        target = batch[1].to(device)

        optimizer.zero_grad()
        output = model(data)
        # NLLLoss expects log-probabilities, so apply log_softmax over the class dimension
        log_probs = torch.nn.functional.log_softmax(output, dim=1)

        loss_val = criterion(log_probs, target)
        loss_val.backward()
        optimizer.step()

        return loss_val

    # Build the trainer; trainer.run below iterates over the whole train_loader once (max_epochs=1)
    trainer = Engine(_train_step)

    # Add a logger
    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training():
        print("Process {}/{} Train Epoch: {} [{}/{}]\tLoss: {}".format(
            idist.get_rank(),
            idist.get_world_size(),
            trainer.state.epoch,
            trainer.state.iteration * len(trainer.state.batch[0]),
            len(dataset) / idist.get_world_size(),
            trainer.state.output,
        ))

    trainer.run(train_loader, max_epochs=1)
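A launch sketch, not part of the example above: idist.Parallel spawns the worker processes and passes each local rank as the first argument to training, so the function can be driven as follows. The backend choice and config values are illustrative assumptions.

import ignite.distributed as idist

config = {"nb_samples": 128, "batch_size": 16, "log_interval": 5}

if __name__ == "__main__":
    # With backend=None this would simply run a single non-distributed process
    with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
        parallel.run(training, config)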
Example No. 14
def run(
    local_rank: int,
    device: str,
    experiment_name: str,
    gpus: Optional[Union[int, List[int], str]] = None,
    dataset_root: str = "./dataset",
    log_dir: str = "./log",
    model: str = "fasterrcnn_resnet50_fpn",
    epochs: int = 13,
    batch_size: int = 4,
    lr: float = 0.01,
    download: bool = False,
    image_size: int = 256,
    resume_from: Optional[str] = None,
) -> None:
    bbox_params = A.BboxParams(format="pascal_voc")
    train_transform = A.Compose(
        [A.HorizontalFlip(p=0.5), ToTensorV2()],
        bbox_params=bbox_params,
    )
    val_transform = A.Compose([ToTensorV2()], bbox_params=bbox_params)

    download = local_rank == 0 and download
    train_dataset = Dataset(root=dataset_root,
                            download=download,
                            image_set="train",
                            transforms=train_transform)
    val_dataset = Dataset(root=dataset_root,
                          download=download,
                          image_set="val",
                          transforms=val_transform)
    vis_dataset = Subset(val_dataset,
                         random.sample(range(len(val_dataset)), k=16))

    train_dataloader = idist.auto_dataloader(train_dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             collate_fn=collate_fn,
                                             num_workers=4)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                collate_fn=collate_fn,
                                num_workers=4)
    vis_dataloader = DataLoader(vis_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                collate_fn=collate_fn,
                                num_workers=4)

    # `model` is passed in as a torchvision detection model name; build the module before
    # adapting it (construction step assumed here; 21 = 20 Pascal VOC classes + background)
    model = torchvision.models.detection.__dict__[model](num_classes=21)
    model = idist.auto_model(model)
    scaler = GradScaler()
    optimizer = SGD(lr=lr, params=model.parameters())
    optimizer = idist.auto_optim(optimizer)
    scheduler = OneCycleLR(optimizer,
                           max_lr=lr,
                           total_steps=len(train_dataloader) * epochs)

    def update_model(engine, batch):
        model.train()
        images, targets = batch
        images = list(image.to(device) for image in images)
        targets = [{
            k: v.to(device)
            for k, v in t.items() if isinstance(v, torch.Tensor)
        } for t in targets]

        with torch.autocast(device, enabled=True):
            loss_dict = model(images, targets)
            loss = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loss_items = {k: v.item() for k, v in loss_dict.items()}
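        # The hard-coded 4 below averages the four Faster R-CNN loss terms summed above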
        loss_items["loss_average"] = loss.item() / 4

        return loss_items

    @torch.no_grad()
    def inference(engine, batch):
        model.eval()
        images, targets = batch
        images = list(image.to(device) for image in images)
        outputs = model(images)
        outputs = [{k: v.to("cpu") for k, v in t.items()} for t in outputs]
        return {
            "y_pred": outputs,
            "y": targets,
            "x": [i.cpu() for i in images]
        }

    trainer = Engine(update_model)
    evaluator = Engine(inference)
    visualizer = Engine(inference)

    aim_logger = AimLogger(
        repo=os.path.join(log_dir, "aim"),
        experiment=experiment_name,
    )

    CocoMetric(convert_to_coco_api(val_dataset)).attach(evaluator, "mAP")

    @trainer.on(Events.EPOCH_COMPLETED)
    @one_rank_only()
    def log_validation_results(engine):
        evaluator.run(val_dataloader)
        visualizer.run(vis_dataloader)

    @trainer.on(Events.ITERATION_COMPLETED)
    def step_scheduler(engine):
        scheduler.step()
        aim_logger.log_metrics({"lr": scheduler.get_last_lr()[0]},
                               step=engine.state.iteration)

    @visualizer.on(Events.EPOCH_STARTED)
    def reset_vis_images(engine):
        engine.state.model_outputs = []

    @visualizer.on(Events.ITERATION_COMPLETED)
    def add_vis_images(engine):
        engine.state.model_outputs.append(engine.state.output)

    @visualizer.on(Events.ITERATION_COMPLETED)
    def submit_vis_images(engine):
        aim_images = []
        for outputs in engine.state.model_outputs:
            for image, target, pred in zip(outputs["x"], outputs["y"],
                                           outputs["y_pred"]):
                image = (image * 255).byte()
                pred_labels = [
                    Dataset.class2name[label.item()]
                    for label in pred["labels"]
                ]
                pred_boxes = pred["boxes"].long()
                image = draw_bounding_boxes(image,
                                            pred_boxes,
                                            pred_labels,
                                            colors="red")

                target_labels = [
                    Dataset.class2name[label.item()]
                    for label in target["labels"]
                ]
                target_boxes = target["boxes"].long()
                image = draw_bounding_boxes(image,
                                            target_boxes,
                                            target_labels,
                                            colors="green")

                aim_images.append(aim.Image(image.numpy().transpose(
                    (1, 2, 0))))
        aim_logger.experiment.track(aim_images,
                                    name="vis",
                                    step=trainer.state.epoch)

    losses = [
        "loss_classifier", "loss_box_reg", "loss_objectness",
        "loss_rpn_box_reg", "loss_average"
    ]
    for loss_name in losses:
        # Bind loss_name at definition time; a bare closure would capture only the last name
        RunningAverage(output_transform=lambda x, name=loss_name: x[name]).attach(
            trainer, loss_name)
    ProgressBar().attach(trainer, losses)
    ProgressBar().attach(evaluator)

    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": scheduler,
        "scaler": scaler,
    }
    checkpoint = Checkpoint(
        to_save=objects_to_checkpoint,
        save_handler=DiskSaver(log_dir, require_empty=False),
        n_saved=3,
        score_name="mAP",
        global_step_transform=lambda *_: trainer.state.epoch,
    )
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)
    if resume_from:
        Checkpoint.load_objects(objects_to_checkpoint, torch.load(resume_from))

    aim_logger.log_params({
        "lr": lr,
        "image_size": image_size,
        "batch_size": batch_size,
        "epochs": epochs,
    })
    aim_logger.attach_output_handler(trainer,
                                     event_name=Events.ITERATION_COMPLETED,
                                     tag="train",
                                     output_transform=lambda loss: loss)
    aim_logger.attach_output_handler(
        evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="val",
        metric_names=["mAP"],
        global_step_transform=global_step_from_engine(
            trainer, Events.ITERATION_COMPLETED),
    )

    trainer.run(train_dataloader, max_epochs=epochs)
Example No. 15
def training(rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    # Define output folder:
    config.output = "/tmp/output"

    model = idist.auto_model(config.model)
    optimizer = idist.auto_optim(config.optimizer)
    criterion = config.criterion

    train_set, val_set = config.train_set, config.val_set
    train_loader = idist.auto_dataloader(train_set,
                                         batch_size=config.train_batch_size)
    val_loader = idist.auto_dataloader(val_set,
                                       batch_size=config.val_batch_size)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED(every=config.val_interval))
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    if rank == 0:
        tb_logger = TensorboardLogger(log_dir=config.output)

        tb_logger.attach_output_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            tag="training",
            output_transform=lambda loss: {"batchloss": loss},
            metric_names="all",
        )

        for tag, evaluator in [("training", train_evaluator),
                               ("validation", validation_evaluator)]:
            tb_logger.attach_output_handler(
                evaluator,
                event_name=Events.EPOCH_COMPLETED,
                tag=tag,
                metric_names=["loss", "accuracy"],
                global_step_transform=global_step_from_engine(trainer),
            )

        tb_logger.attach_opt_params_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            optimizer=optimizer)

    model_checkpoint = ModelCheckpoint(
        config.output,
        n_saved=2,
        filename_prefix="best",
        score_name="accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if rank == 0:
        tb_logger.close()