Example #1
    def after_step(self):
        """Run after every iteration, see parent for details"""
        self.num_steps += 1
        if self.num_steps % self._period == 0:
            data = next(self._loader)

            if torch.cuda.is_available():
                torch.cuda.synchronize()

            with torch.no_grad():
                loss_dict = self.trainer.model(data)

                losses = sum(loss_dict.values())
                assert torch.isfinite(losses).all(), loss_dict

                loss_dict_reduced = {
                    "val_" + k: v.item()
                    for k, v in comm.reduce_dict(loss_dict).items()
                }
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())
                if comm.is_main_process():
                    self.trainer.storage.put_scalars(
                        total_val_loss=losses_reduced, **loss_dict_reduced)
                comm.synchronize()
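The hook above only defines after_step; a rough sketch of how such a hook could be attached to detectron2's DefaultTrainer is shown below. The class name ValLossHook, the period value, and the choice of cfg.DATASETS.TEST[0] as the validation set are illustrative assumptions, not taken from the example.

import itertools

from detectron2.data import DatasetMapper, build_detection_test_loader
from detectron2.engine import DefaultTrainer, HookBase


class ValLossHook(HookBase):  # hypothetical name, for illustration only
    def __init__(self, cfg, period=100):
        self._period = period
        self.num_steps = 0
        # A loader over the validation set with a training-style mapper, so that
        # model(data) returns a loss dict instead of predictions. itertools.cycle
        # keeps next(self._loader) from ever raising StopIteration.
        loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0], DatasetMapper(cfg, True))
        self._loader = itertools.cycle(loader)

    def after_step(self):
        ...  # body as in the example above


def train_with_val_loss_hook(cfg):
    trainer = DefaultTrainer(cfg)
    trainer.register_hooks([ValLossHook(cfg, period=100)])
    trainer.resume_or_load(resume=False)
    trainer.train()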
Example #2
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = default_writers(cfg.OUTPUT_DIR, max_iter) if comm.is_main_process() else []

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.iter = iteration

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter - 1
            ):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (
                (iteration + 1) % 20 == 0 or iteration == max_iter - 1
            ):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
def get_loss(data, model):
    loss_dict = model(data)
    losses = sum(loss_dict.values())
    assert torch.isfinite(losses).all(), loss_dict
    loss_dict_reduced = {
        k: v.item()
        for k, v in comm.reduce_dict(loss_dict).items()
    }
    losses_reduced = sum(loss for loss in loss_dict_reduced.values())
    return loss_dict_reduced, losses, losses_reduced
Example #4
    def after_step(self):
        data = next(self._loader)
        with torch.no_grad():
            loss_dict = self.trainer.model(data)
            
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {"val_" + k: v.item() for k, v in 
                                  comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                self.trainer.storage.put_scalars(total_val_loss=losses_reduced, 
                                                  **loss_dict_reduced)
Example #5
def do_val(cfg, model, val_dataloader):
    with torch.no_grad():
        val_loss = 0
        for idx, inputs in tqdm(enumerate(val_dataloader)):
            outputs = model(inputs)
            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(outputs).items()
            }
            reduced_loss = sum(loss for loss in loss_dict_reduced.values())
            val_loss += reduced_loss

    return val_loss
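Note that do_val above returns the sum of the per-batch reduced losses, not an average; if a number that is independent of the validation set size is wanted, the caller can divide by the number of batches, roughly as follows (a hypothetical caller, assuming val_dataloader has a defined length).

val_loss_sum = do_val(cfg, model, val_dataloader)
mean_val_loss = val_loss_sum / max(len(val_dataloader), 1)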
def _get_val_loss(data, model):
    """
    Return the loss dict.
    """
    # with inference_context(model), torch.no_grad():
    model.train()
    loss_dict = model(data)
    # metrics_dict = {
    #     k: v.detach().cpu().item() if isinstance(v, torch.Tensor) else float(v)
    #     for k, v in metrics_dict.items()
    # }
    # total_losses_reduced = sum(loss for loss in metrics_dict.values())

    # print(loss_dict)
    # loss_dict = model(data)
    # losses = sum(loss_dict.values())
    # assert torch.isfinite(losses).all(), loss_dict

    loss_dict_reduced = {
        k: v.item()
        for k, v in comm.reduce_dict(loss_dict).items()
    }
    return loss_dict_reduced
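As the commented-out inference_context call above hints, detectron2 models only return a loss dict while in training mode, so validation-loss code keeps model.train() and disables gradients instead of calling model.eval(). A minimal, self-contained sketch of that pattern follows; the function name and the EventStorage guard are assumptions for illustration.

import torch
from detectron2.utils.events import EventStorage


@torch.no_grad()
def validation_loss(model, data_loader):
    was_training = model.training
    model.train()  # training mode so the model returns losses, not predictions
    total, num_batches = 0.0, 0
    # Some model components log to the current EventStorage during a training-mode
    # forward pass, so keep one open while computing losses.
    with EventStorage(0):
        for data in data_loader:
            loss_dict = model(data)
            total += sum(v.item() for v in loss_dict.values())
            num_batches += 1
    model.train(was_training)
    return total / max(num_batches, 1)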
Example #7
 def after_step(self):
     if self.trainer.iter > 0 and self.trainer.iter % self.steps == 0:
         list_dict = []
         # self.trainer.model.eval()  # Not called here: eval() changes what model(data) returns.
         start = time.perf_counter()
         with torch.no_grad():
             for _ in range(self.ndata):
                 data = next(self._loader)
                 loss_dict = self.trainer.model(data)
                 list_dict.append({
                     k: v.detach().cpu().item() if isinstance(
                         v, torch.Tensor) else float(v)
                     for k, v in loss_dict.items()
                 })
         loss_dict = {}
         for key in list_dict[0].keys():
             loss_dict[key] = np.mean([dictwk[key] for dictwk in list_dict])
         loss_dict = {
             self.cfg.DATASETS.TRAIN[0] + "_" + k: torch.tensor(v.item())
             for k, v in comm.reduce_dict(loss_dict).items()
         }
         self.loss_dict = loss_dict
         self.data_time = time.perf_counter() - start
def do_test(cfg, model):
    results = OrderedDict()

    for dataset_name in cfg.DATASETS.TEST:

      losses_total = []

      data_loader = build_detection_test_loader(cfg, dataset_name, DatasetMapper(cfg,True))

      for iteration_test, data_test in enumerate(data_loader):
        loss_dict = model(data_test)

        losses = sum(loss for loss in loss_dict.values())

        assert torch.isfinite(losses).all(), loss_dict

        loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}

        #get the total loss
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        losses_total.append(losses_reduced)

      mean_loss = np.mean(losses_total)

      print('mean validation loss: %s' % mean_loss)

      evaluator = get_evaluator(cfg, dataset_name, os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name))

      results_i = inference_on_dataset(model, data_loader, evaluator)

      results[dataset_name] = results_i

    if len(results) == 1:
        results = list(results.values())[0]

    return results, mean_loss
Example #9
def do_train(cfg, model):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = Checkpointer(model, './', optimizer=optimizer, scheduler=scheduler)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

    writers = [CommonMetricPrinter(max_iter)] if d2_comm.is_main_process() else []

    train_mapper = get_dataset_mapper(cfg, is_train=True)
    dataloader, dataset_dicts = build_train_dataloader(cfg, mapper=train_mapper)
    LOG.info("Length of train dataset: {:d}".format(len(dataset_dicts)))
    LOG.info("Starting training")
    storage = get_event_storage()

    if cfg.EVAL_ON_START:
        do_test(cfg, model)
        comm.synchronize()

    # In mixed-precision training, gradients are scaled up to keep them from vanishing
    # due to the limited range of half precision. They are scaled back down before the
    # optimizer uses them to compute updates.
    scaler = amp.GradScaler(enabled=cfg.SOLVER.MIXED_PRECISION_ENABLED)

    # Accumulate gradients for multiple batches (as returned by dataloader) before calling optimizer.step().
    accumulate_grad_batches = cfg.SOLVER.ACCUMULATE_GRAD_BATCHES

    num_images_seen = 0
    # For logging, this stores losses aggregated from all workers in distributed training.
    batch_loss_dict = defaultdict(float)
    optimizer.zero_grad()
    for data, iteration in zip(dataloader, range(max_iter * accumulate_grad_batches)):
        iteration += 1
        # this assumes drop_last=True, so all workers have the same batch size.
        num_images_seen += len(data) * d2_comm.get_world_size()
        if iteration % accumulate_grad_batches == 0:
            storage.step()

        with amp.autocast(enabled=cfg.SOLVER.MIXED_PRECISION_ENABLED):
            loss_dict = model(data)
        # Account for accumulated gradients.
        loss_dict = {name: loss / accumulate_grad_batches for name, loss in loss_dict.items()}
        losses = sum(loss_dict.values())
        # FIXME: First few iterations might give Inf/NaN losses when using mixed precision. What should be done?
        if not torch.isfinite(losses):
            LOG.critical(f"The loss DIVERGED: {loss_dict}")

        # Track total loss for logging.
        loss_dict_reduced = {k: v.item() for k, v in d2_comm.reduce_dict(loss_dict).items()}
        assert torch.isfinite(torch.as_tensor(list(loss_dict_reduced.values()))).all(), loss_dict_reduced
        for k, v in loss_dict_reduced.items():
            batch_loss_dict[k] += v

        # No amp version: leaving this here for legacy:
        # losses.backward()
        scaler.scale(losses).backward()

        if iteration % accumulate_grad_batches > 0:
            # Just accumulate gradients and move on to next batch.
            continue

        # No amp version: leaving this here for legacy:
        # optimizer.step()
        # scheduler.step()
        # optimizer.zero_grad()

        scaler.step(optimizer)
        storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
        scheduler.step()
        scaler.update()

        losses_reduced = sum(loss for loss in batch_loss_dict.values())
        storage.put_scalars(total_loss=losses_reduced, **batch_loss_dict)

        # Reset states.
        batch_loss_dict = defaultdict(float)
        optimizer.zero_grad()

        batch_iter = iteration // accumulate_grad_batches

        # TODO: probably check if the gradients contain any inf or nan, and only proceed if not.
        if batch_iter > 5 and (batch_iter % 20 == 0 or batch_iter == max_iter):
            # if batch_iter > -1 and (batch_iter % 1 == 0 or batch_iter == max_iter):
            for writer in writers:
                writer.write()
            # log epoch / # images seen
            if d2_comm.is_main_process() and cfg.WANDB.ENABLED:
                wandb.log({"epoch": 1 + num_images_seen // len(dataset_dicts)}, step=batch_iter)
                wandb.log({"num_images_seen": num_images_seen}, step=batch_iter)

        if cfg.VIS.DATALOADER_ENABLED and batch_iter % cfg.VIS.DATALOADER_PERIOD == 0 and d2_comm.is_main_process():
            dataset_name = cfg.DATASETS.TRAIN.NAME
            visualizer_names = MetadataCatalog.get(dataset_name).loader_visualizers
            viz_images = defaultdict(dict)
            for viz_name in visualizer_names:
                viz = get_dataloader_visualizer(cfg, viz_name, dataset_name)
                for idx, x in enumerate(data):
                    viz_images[idx].update(viz.visualize(x))

            if cfg.WANDB.ENABLED:
                # per_image_vis = [coalece_viz_images(viz_images[idx])[0] for idx in range(len(data))]
                per_image_vis = [mosaic(list(viz_images[idx].values())) for idx in range(len(data))]
                wandb.log({
                    "dataloader": [wandb.Image(vis, caption=f"idx={idx}") for idx, vis in enumerate(per_image_vis)]
                },
                          step=batch_iter)
            save_vis(viz_images, os.path.join(os.getcwd(), "visualization"), "dataloader", step=batch_iter)

        if d2_comm.is_main_process():  # TODO (dennis.park): is this necessary?
            periodic_checkpointer.step(batch_iter - 1)  # (fvcore) model_0004999.pth checkpoints 5000-th iteration

        if batch_iter > 0 and batch_iter % cfg.SYNC_OUTPUT_DIR_S3.PERIOD == 0:
            sync_output_dir_s3(cfg)

        if (cfg.TEST.EVAL_PERIOD > 0 and batch_iter % cfg.TEST.EVAL_PERIOD == 0 and batch_iter != max_iter) or \
            batch_iter in cfg.TEST.ADDITIONAL_EVAL_STEPS:
            do_test(cfg, model)
            d2_comm.synchronize()
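Example #9 above mixes several concerns; stripped of the detectron2-specific logging and checkpointing, the core mixed-precision plus gradient-accumulation update pattern it relies on looks roughly like the sketch below. The function name and arguments are placeholders, not part of the example.

import torch
from torch.cuda import amp


def amp_accumulate_step(model, optimizer, data_loader, accumulate_grad_batches=2, enabled=True):
    scaler = amp.GradScaler(enabled=enabled)
    optimizer.zero_grad()
    for i, data in enumerate(data_loader, start=1):
        with amp.autocast(enabled=enabled):
            loss_dict = model(data)
        # Divide so the accumulated gradient matches one large batch.
        loss = sum(loss_dict.values()) / accumulate_grad_batches
        scaler.scale(loss).backward()
        if i % accumulate_grad_batches != 0:
            continue  # keep accumulating gradients
        scaler.step(optimizer)  # unscales gradients, then calls optimizer.step()
        scaler.update()
        optimizer.zero_grad()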
Example #10
def do_train(cfg, model, resume=False, val_set='firevysor_val'):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, min_lr=1e-6)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1, last_epoch=-1)
    metric = 0
    print_every = 50

    tensorboard_dir = osp.join(cfg.OUTPUT_DIR, 'tensorboard')
    checkpoint_dir = osp.join(cfg.OUTPUT_DIR, 'checkpoints')
    create_dir(tensorboard_dir)
    create_dir(checkpoint_dir)

    checkpointer = AdetCheckpointer(model,
                                    checkpoint_dir,
                                    optimizer=optimizer,
                                    scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        # JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(tensorboard_dir),
    ] if comm.is_main_process() else [])
    data_loader = build_detection_train_loader(cfg)
    val_dataloader = build_detection_val_loader(cfg, val_set)

    logger.info("Starting training from iteration {}".format(start_iter))

    # [PHAT]: Create a log file
    log_file = open(cfg.MY_CUSTOM.LOG_FILE, 'w')

    best_loss = 1e6
    count_not_improve = 0
    train_size = 2177
    epoch_size = int(train_size / cfg.SOLVER.IMS_PER_BATCH)
    n_early_epoch = 10

    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())

            assert torch.isfinite(losses).all(), loss_dict

            # Update loss dict
            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            # Early stopping
            if (iteration > start_iter) and ((iteration - start_iter) %
                                             epoch_size == 0):
                val_loss = do_val(cfg, model, val_dataloader)

                if val_loss >= best_loss:
                    count_not_improve += 1
                    # stop if the model doesn't improve after <n_early_epoch> epochs
                    if count_not_improve == epoch_size * n_early_epoch:
                        break
                else:
                    count_not_improve = 0
                    best_loss = val_loss
                    periodic_checkpointer.save("best_model_early")

                # print(f"epoch {iteration//epoch_size}, val_loss: {val_loss}")
                log_file.write(
                    f"Epoch {(iteration-start_iter)//epoch_size}, val_loss: {val_loss}\n"
                )
                comm.synchronize()

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            lr = optimizer.param_groups[0]["lr"]
            storage.put_scalar("lr", lr, smoothing_hint=False)
            scheduler.step()

            if iteration - start_iter > 5 and (
                (iteration - start_iter) % print_every == 0
                    or iteration == max_iter):
                for writer in writers:
                    writer.write()

                # Write my log
                log_file.write(
                    f"[iter {iteration}, best_loss: {best_loss}] total_loss: {losses}, lr: {lr}\n"
                )

            periodic_checkpointer.step(iteration)

    log_file.close()
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)

    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get('iteration', -1) + 1)

    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, 'metric.json')),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    data_loader = build_detection_train_loader(cfg)
    logger.info(" Starting training from iteration {}".format(start_iter))

    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
writers = (
    [
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ]
    if comm.is_main_process()
    else []
)

# compared to "train_net.py", we do not support accurate timing and
# precise BN here, because they are not trivial to implement
data_loader = build_detection_train_loader(cfg)
logger.info("Starting training from iteration {}".format(start_iter))
with EventStorage(start_iter) as storage:
    for data, iteration in zip(data_loader, range(start_iter, max_iter)):
        iteration = iteration + 1
        storage.step()
        loss_dict = model(data)
        losses = sum(loss for loss in loss_dict.values())
        assert torch.isfinite(losses).all(), loss_dict

        loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        if comm.is_main_process():
            storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
        scheduler.step()

        if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
            for writer in writers:
                writer.write()
        periodic_checkpointer.step(iteration)
Example #13
File: my_comm.py  Project: hz-ants/GDR-Net
def reduce_dict(input_dict, average=True):
    global _USE_HVD
    if _USE_HVD:
        return reduce_dict_hvd(input_dict, average=average)
    return comm.reduce_dict(input_dict, average=average)
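For reference, the kind of cross-worker reduction that comm.reduce_dict (or its Horovod variant above) performs can be sketched with plain torch.distributed as below; this is a simplified illustration, not the actual detectron2 or GDR-Net implementation.

import torch
import torch.distributed as dist


def reduce_loss_dict(loss_dict, average=True):
    # No-op outside of (initialized) distributed training.
    if not (dist.is_available() and dist.is_initialized()) or dist.get_world_size() < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())
        values = torch.stack([loss_dict[k] for k in names], dim=0)
        dist.reduce(values, dst=0)  # sum the losses onto rank 0
        if dist.get_rank() == 0 and average:
            values /= dist.get_world_size()  # turn the sum into a mean
    return dict(zip(names, values))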
Example #14
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)

    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS,
        resume=resume,
    ).get("iteration", -1) + 1)
    if cfg.SOLVER.RESET_ITER:
        logger.info('Reset loaded iteration. Start training from iteration 0.')
        start_iter = 0
    max_iter = cfg.SOLVER.MAX_ITER if cfg.SOLVER.TRAIN_ITER < 0 else cfg.SOLVER.TRAIN_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])


    mapper = DatasetMapper(cfg, True) if cfg.INPUT.CUSTOM_AUG == '' else \
        DatasetMapper(cfg, True, augmentations=build_custom_augmentation(cfg, True))
    if cfg.DATALOADER.SAMPLER_TRAIN in [
            'TrainingSampler', 'RepeatFactorTrainingSampler'
    ]:
        data_loader = build_detection_train_loader(cfg, mapper=mapper)
    else:
        from centernet.data.custom_dataset_dataloader import build_custom_train_loader
        data_loader = build_custom_train_loader(cfg, mapper=mapper)

    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        step_timer = Timer()
        data_timer = Timer()
        start_time = time.perf_counter()
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            data_time = data_timer.seconds()
            storage.put_scalars(data_time=data_time)
            step_timer.reset()
            iteration = iteration + 1
            storage.step()
            loss_dict = model(data)

            losses = sum(loss for k, loss in loss_dict.items())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() \
                for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)

            step_time = step_timer.seconds()
            storage.put_scalars(time=step_time)
            data_timer.reset()
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                comm.synchronize()

            if iteration - start_iter > 5 and \
                (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

        total_time = time.perf_counter() - start_time
        logger.info("Total training time: {}".format(
            str(datetime.timedelta(seconds=int(total_time)))))
def do_train(cfg, model, cat_heatmap_file, resume=False):
    model.train()

    # select optimizer and learning rate scheduler based on the config
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # create checkpointer
    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    # create output writers. Separate TensorBoard writers are created
    # for train and validation sets. This allows easy overlaying of graphs
    # in TensorBoard.
    train_tb_writer = os.path.join(cfg.OUTPUT_DIR, 'train')
    val_tb_writer = os.path.join(cfg.OUTPUT_DIR, 'val')
    train_writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(train_tb_writer),
        ]
        if comm.is_main_process()
        else []
    )
    val_writers = [TensorboardXWriter(val_tb_writer)]


    train_dataset_name = cfg.DATASETS.TRAIN[0]
    train_data_loader = build_detection_train_loader(cfg)
    train_eval_data_loader = build_detection_test_loader(cfg, train_dataset_name)
    val_dataset_name = cfg.DATASETS.TEST[0]
    val_eval_data_loader = build_detection_test_loader(cfg, val_dataset_name, DatasetMapper(cfg,True))
    logger.info("Starting training from iteration {}".format(start_iter))
    train_storage = EventStorage(start_iter)
    val_storage = EventStorage(start_iter)

    # Create the training and validation evaluator objects.
    train_evaluator = get_evaluator(
        cfg, train_dataset_name, os.path.join(cfg.OUTPUT_DIR, "train_inference", train_dataset_name),
        cat_heatmap_file
    )
    val_evaluator = get_evaluator(
        cfg, val_dataset_name, os.path.join(cfg.OUTPUT_DIR, "val_inference", val_dataset_name),
        cat_heatmap_file
    )

    # initialize the best AP50 value
    best_AP50 = 0
    start_time = time.time()
    for train_data, iteration in zip(train_data_loader, range(start_iter, max_iter)):
         # stop if the file stop_running exists in the running directory
         if os.path.isfile('stop_running'):
             os.remove('stop_running')
             break

         iteration = iteration + 1

         # run a step with the training data
         with train_storage as storage:
            model.train()
            storage.step()

            loss_dict = model(train_data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()


            # periodically evaluate the training set and write the results
            if (cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter):

                train_eval_results = inference_on_dataset(model, train_eval_data_loader,
                                                          train_evaluator)
                flat_results = flatten_results(train_eval_results)
                storage.put_scalars(**flat_results)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in train_writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

         # run a step with the validation set
         with val_storage as storage:
            storage.step()

            # every 20 iterations evaluate the dataset to collect the loss
            if iteration % 20 == 0 or iteration == max_iter:
                with torch.set_grad_enabled(False):
                     for input, i in zip(val_eval_data_loader , range(1)):
                        loss_dict = model(input)
                        losses = sum(loss for loss in loss_dict.values())
                        assert torch.isfinite(losses).all(), loss_dict

                        loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
                        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

                if comm.is_main_process():
                    storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            # periodically evaluate the validation set and write the results
            # check the results against the best results seen and save the parameters for
            # the best result
            if (cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                or iteration == max_iter):
                val_eval_results = inference_on_dataset(model, val_eval_data_loader,
                                                        val_evaluator)
                logger.info('val_eval_results {}'.format(val_eval_results))
                results = val_eval_results.get('segm', None)
                if results is None:
                    results = val_eval_results.get('bbox', None)
                if results is not None and results.get('AP50',-1) > best_AP50:
                    best_AP50 = results['AP50']
                    logger.info('saving best results ({}), iter {}'.format(best_AP50, iteration))
                    checkpointer.save("best_AP50")

                flat_results = flatten_results(val_eval_results)
                storage.put_scalars(**flat_results)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0):
                for writer in val_writers:
                    writer.write()
                elapsed = time.time() - start_time
                time_per_iter = elapsed / (iteration - start_iter)
                time_left = time_per_iter * (max_iter - iteration)
                logger.info("ETA: {}".format(str(datetime.timedelta(seconds=time_left))))
Example #16
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # dataset, mapper, augmentations and sampler are all set up when building the data_loader
    atoms = generate_atom_list(cfg, True)
    black_magic_mapper = BlackMagicMapper(cfg,
                                          is_train=True,
                                          augmentations=atoms)
    data_loader = build_detection_train_loader(cfg, black_magic_mapper)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            if cfg.DATALOADER.SAVE_BLACK_MAGIC_PATH != "":
                save_data_to_disk(cfg, data)
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
def do_train(cfg, model, resume=False):
    # Set model to training mode
    model.train()
    # Create optimizer from config file (returns torch.nn.optimizer.Optimizer)
    optimizer = build_optimizer(cfg, model)
    # Create scheduler for learning rate (returns torch.optim.lr._LR_scheduler)
    scheduler = build_lr_scheduler(cfg, optimizer)
    print(f"Scheduler: {scheduler}")

    # Create checkpointer
    checkpointer = DetectionCheckpointer(model,
                                         save_dir=cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)

    # Create start iteration (references checkpointer) - https://detectron2.readthedocs.io/modules/checkpoint.html#detectron2.checkpoint.Checkpointer.resume_or_load
    start_iter = (
        # This can be 0
        checkpointer.resume_or_load(
            cfg.MODEL.WEIGHTS,  # Use predefined model weights (pretrained model)
            resume=resume).get("iteration", -1) + 1)
    # Set max number of iterations
    max_iter = cfg.SOLVER.MAX_ITER

    # Create periodiccheckpoint
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer=checkpointer,
        # How often to make checkpoints?
        period=cfg.SOLVER.CHECKPOINT_PERIOD,
        max_iter=max_iter)

    # Create writers (for logging metrics such as losses and learning rate)
    writers = ([
        # Print out common metrics such as iteration time, ETA, memory, all losses, learning rate
        CommonMetricPrinter(max_iter=max_iter),
        # Write scalars to a JSON file such as loss values, time and more
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        # Write all scalars such as loss values to a TensorBoard file for easy visualization
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    ### Original note from script: ###
    # compared to "train_net.py", we do not support accurate timing and precise BN
    # here, because they are not trivial to implement

    # Build a training data loader based off the training dataset name in the config
    data_loader = build_detection_train_loader(cfg)

    # Start logging
    logger.info("Starting training from iteration {}".format(start_iter))

    # Store events
    with EventStorage(start_iter) as storage:
        # Loop through zipped data loader and iteration
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()  # update storage with step - https://detectron2.readthedocs.io/modules/utils.html#detectron2.utils.events.EventStorage.step

            # Create loss dictionary by running the model on the data
            loss_dict = model(data)
            losses = sum(loss_dict.values())
            # Are losses infinite? If so, something is wrong
            assert torch.isfinite(losses).all(), loss_dict

            # Average the loss values across all workers (reduce_dict) for logging
            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            # Sum up losses
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            # # TODO: wandb.log()? log the losses
            # wandb.log({
            #         "Total loss": losses_reduced
            # })

            # Update storage
            if comm.is_main_process():
                # Store informate in storage - https://detectron2.readthedocs.io/modules/utils.html#detectron2.utils.events.EventStorage.put_scalars
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            # Start doing PyTorch things
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            # Add learning rate to storage information
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            # This is required for your learning rate to change!!!! (not having this meant my learning rate was staying at 0)
            scheduler.step()

            # Perform evaluation?
            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # TODO - compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            # Log different metrics with writers
            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()

            # Update the periodic_checkpointer
            periodic_checkpointer.step(iteration)
Example #18
def do_relation_train(cfg, model, resume=False):
    model.train()
    for param in model.named_parameters():
        param[1].requires_grad = False
    for param in model.named_parameters():
        for trainable in cfg.MODEL.TRAINABLE:
            if param[0].startswith(trainable):
                param[1].requires_grad = True
                break

        if param[0] == "relation_heads.instance_head.semantic_embed.weight" or \
            param[0] == "relation_heads.pair_head.semantic_embed.weight" or \
            param[0] == "relation_heads.predicate_head.semantic_embed.weight" or \
            param[0] == "relation_heads.triplet_head.ins_embed.weight" or \
            param[0] == "relation_heads.triplet_head.pred_embed.weight" or \
            param[0] == "relation_heads.subpred_head.sub_embed.weight" or \
            param[0] == "relation_heads.subpred_head.pred_embed.weight" or \
            param[0] == "relation_heads.predobj_head.pred_embed.weight" or \
            param[0] == "relation_heads.predobj_head.obj_embed.weight" or \
            param[0].startswith("relation_heads.predicate_head.freq_bias.obj_baseline.weight"):
            param[1].requires_grad = False

    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    metrics_sum_dict = {
        'relation_cls_tp_sum': 0,
        'relation_cls_p_sum': 0.00001,
        'pred_class_tp_sum': 0,
        'pred_class_p_sum': 0.00001,
        'gt_class_tp_sum': 0,
        'gt_class_p_sum': 0.00001,
        'raw_pred_class_tp_sum': 0,
        'raw_pred_class_p_sum': 0.00001,
        'instance_tp_sum':0,
        'instance_p_sum': 0.00001,
        'instance_g_sum':0.00001,
        'subpred_tp_sum': 0,
        'subpred_p_sum': 0.00001,
        'subpred_g_sum': 0.00001,
        'predobj_tp_sum': 0,
        'predobj_p_sum': 0.00001,
        'predobj_g_sum': 0.00001,
        'pair_tp_sum':0,
        'pair_p_sum': 0.00001,
        'pair_g_sum':0.00001,
        'confidence_tp_sum': 0,
        'confidence_p_sum': 0.00001,
        'confidence_g_sum': 0.00001,
        'predicate_tp_sum': 0,
        'predicate_tp20_sum': 0,
        'predicate_tp50_sum': 0,
        'predicate_tp100_sum': 0,
        'predicate_p_sum': 0.00001,
        'predicate_p20_sum': 0.00001,
        'predicate_p50_sum': 0.00001,
        'predicate_p100_sum': 0.00001,
        'predicate_g_sum': 0.00001,
        'triplet_tp_sum': 0,
        'triplet_tp20_sum': 0,
        'triplet_tp50_sum': 0,
        'triplet_tp100_sum': 0,
        'triplet_p_sum': 0.00001,
        'triplet_p20_sum': 0.00001,
        'triplet_p50_sum': 0.00001,
        'triplet_p100_sum': 0.00001,
        'triplet_g_sum': 0.00001,
    }
    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler, metrics_sum_dict=metrics_sum_dict
    )
    start_iter = (checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    # state_dict=torch.load(cfg.MODEL.WEIGHTS).pop("model")
    # model.load_state_dict(state_dict,strict=False)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

    # relation_cls_state_dict=torch.load(cfg.MODEL.WEIGHTS).pop("model")
    # for param in model.named_parameters():
    #     if param[0] not in relation_cls_state_dict:
    #         print(param[0])
    # model.load_state_dict(relation_cls_state_dict,strict=False)

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )
    metrics_pr_dict={}
    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    acumulate_losses=0
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            print(iteration)
            iteration = iteration + 1
            storage.step()
            if True:
            # try:
                pred_instances, results_dict, losses_dict, metrics_dict = model(data,iteration,mode="relation",training=True)
                losses = sum(loss for loss in losses_dict.values())
                assert torch.isfinite(losses).all(), losses_dict
                #print(losses_dict)

                loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(losses_dict).items()}
                losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                acumulate_losses += losses_reduced
                if comm.is_main_process():
                    storage.put_scalars(acumulate_losses=acumulate_losses/(iteration-start_iter),total_loss=losses_reduced, **loss_dict_reduced)

                if 'relation_cls_tp' in metrics_dict:
                    metrics_sum_dict['relation_cls_tp_sum']+=metrics_dict['relation_cls_tp']
                    metrics_sum_dict['relation_cls_p_sum'] += metrics_dict['relation_cls_p']
                    metrics_pr_dict['relation_cls_precision'] = metrics_sum_dict['relation_cls_tp_sum'] / metrics_sum_dict['relation_cls_p_sum']
                if 'pred_class_tp' in metrics_dict:
                    metrics_sum_dict['pred_class_tp_sum']+=metrics_dict['pred_class_tp']
                    metrics_sum_dict['pred_class_p_sum'] += metrics_dict['pred_class_p']
                    metrics_pr_dict['pred_class_precision'] = metrics_sum_dict['pred_class_tp_sum'] / metrics_sum_dict['pred_class_p_sum']
                if 'raw_pred_class_tp' in metrics_dict:
                    metrics_sum_dict['raw_pred_class_tp_sum']+=metrics_dict['raw_pred_class_tp']
                    metrics_sum_dict['raw_pred_class_p_sum'] += metrics_dict['raw_pred_class_p']
                    metrics_pr_dict['raw_pred_class_precision'] = metrics_sum_dict['raw_pred_class_tp_sum'] / metrics_sum_dict['raw_pred_class_p_sum']
                if 'gt_class_tp' in metrics_dict:
                    metrics_sum_dict['gt_class_tp_sum']+=metrics_dict['gt_class_tp']
                    metrics_sum_dict['gt_class_p_sum'] += metrics_dict['gt_class_p']
                    metrics_pr_dict['gt_class_precision'] = metrics_sum_dict['gt_class_tp_sum'] / metrics_sum_dict['gt_class_p_sum']
                if 'instance_tp' in metrics_dict:
                    metrics_sum_dict['instance_tp_sum']+=metrics_dict['instance_tp']
                    metrics_sum_dict['instance_p_sum'] += metrics_dict['instance_p']
                    metrics_sum_dict['instance_g_sum'] += metrics_dict['instance_g']
                    metrics_pr_dict['instance_precision'] = metrics_sum_dict['instance_tp_sum'] / metrics_sum_dict['instance_p_sum']
                    metrics_pr_dict['instance_recall'] = metrics_sum_dict['instance_tp_sum'] / metrics_sum_dict['instance_g_sum']
                if 'subpred_tp' in metrics_dict:
                    metrics_sum_dict['subpred_tp_sum']+=metrics_dict['subpred_tp']
                    metrics_sum_dict['subpred_p_sum'] += metrics_dict['subpred_p']
                    metrics_sum_dict['subpred_g_sum'] += metrics_dict['subpred_g']
                    metrics_pr_dict['subpred_precision'] = metrics_sum_dict['subpred_tp_sum'] / metrics_sum_dict['subpred_p_sum']
                    metrics_pr_dict['subpred_recall'] = metrics_sum_dict['subpred_tp_sum'] / metrics_sum_dict['subpred_g_sum']
                if 'predobj_tp' in metrics_dict:
                    metrics_sum_dict['predobj_tp_sum']+=metrics_dict['predobj_tp']
                    metrics_sum_dict['predobj_p_sum'] += metrics_dict['predobj_p']
                    metrics_sum_dict['predobj_g_sum'] += metrics_dict['predobj_g']
                    metrics_pr_dict['predobj_precision'] = metrics_sum_dict['predobj_tp_sum'] / metrics_sum_dict['predobj_p_sum']
                    metrics_pr_dict['predobj_recall'] = metrics_sum_dict['predobj_tp_sum'] / metrics_sum_dict['predobj_g_sum']

                if 'pair_tp' in metrics_dict:
                    metrics_sum_dict['pair_tp_sum'] += metrics_dict['pair_tp']
                    metrics_sum_dict['pair_p_sum'] += metrics_dict['pair_p']
                    metrics_sum_dict['pair_g_sum'] += metrics_dict['pair_g']
                    metrics_pr_dict['pair_precision'] = metrics_sum_dict['pair_tp_sum'] / metrics_sum_dict['pair_p_sum']
                    metrics_pr_dict['pair_recall'] = metrics_sum_dict['pair_tp_sum'] / metrics_sum_dict['pair_g_sum']
                if 'confidence_tp' in metrics_dict:
                    metrics_sum_dict['confidence_tp_sum']+=metrics_dict['confidence_tp']
                    metrics_sum_dict['confidence_p_sum'] += metrics_dict['confidence_p']
                    metrics_sum_dict['confidence_g_sum'] += metrics_dict['confidence_g']
                    metrics_pr_dict['confidence_precision'] = metrics_sum_dict['confidence_tp_sum'] / metrics_sum_dict['confidence_p_sum']
                    metrics_pr_dict['confidence_recall'] = metrics_sum_dict['confidence_tp_sum'] / metrics_sum_dict['confidence_g_sum']
                if 'predicate_tp' in metrics_dict:
                    metrics_sum_dict['predicate_tp_sum']+=metrics_dict['predicate_tp']
                    metrics_sum_dict['predicate_tp20_sum'] += metrics_dict['predicate_tp20']
                    metrics_sum_dict['predicate_tp50_sum'] += metrics_dict['predicate_tp50']
                    metrics_sum_dict['predicate_tp100_sum'] += metrics_dict['predicate_tp100']
                    metrics_sum_dict['predicate_p_sum'] += metrics_dict['predicate_p']
                    metrics_sum_dict['predicate_p20_sum'] += metrics_dict['predicate_p20']
                    metrics_sum_dict['predicate_p50_sum'] += metrics_dict['predicate_p50']
                    metrics_sum_dict['predicate_p100_sum'] += metrics_dict['predicate_p100']
                    metrics_sum_dict['predicate_g_sum'] += metrics_dict['predicate_g']
                    metrics_pr_dict['predicate_precision'] = metrics_sum_dict['predicate_tp_sum'] / metrics_sum_dict['predicate_p_sum']
                    metrics_pr_dict['predicate_precision20'] = metrics_sum_dict['predicate_tp20_sum'] / metrics_sum_dict['predicate_p20_sum']
                    metrics_pr_dict['predicate_precision50'] = metrics_sum_dict['predicate_tp50_sum'] / metrics_sum_dict['predicate_p50_sum']
                    metrics_pr_dict['predicate_precision100'] = metrics_sum_dict['predicate_tp100_sum'] / metrics_sum_dict['predicate_p100_sum']
                    metrics_pr_dict['predicate_recall'] = metrics_sum_dict['predicate_tp_sum'] / metrics_sum_dict['predicate_g_sum']
                    metrics_pr_dict['predicate_recall20'] = metrics_sum_dict['predicate_tp20_sum'] / metrics_sum_dict['predicate_g_sum']
                    metrics_pr_dict['predicate_recall50'] = metrics_sum_dict['predicate_tp50_sum'] / metrics_sum_dict['predicate_g_sum']
                    metrics_pr_dict['predicate_recall100'] = metrics_sum_dict['predicate_tp100_sum'] / metrics_sum_dict['predicate_g_sum']
                if 'triplet_tp' in metrics_dict:
                    metrics_sum_dict['triplet_tp_sum'] += metrics_dict['triplet_tp']
                    metrics_sum_dict['triplet_tp20_sum'] += metrics_dict['triplet_tp20']
                    metrics_sum_dict['triplet_tp50_sum'] += metrics_dict['triplet_tp50']
                    metrics_sum_dict['triplet_tp100_sum'] += metrics_dict['triplet_tp100']
                    metrics_sum_dict['triplet_p_sum'] += metrics_dict['triplet_p']
                    metrics_sum_dict['triplet_p20_sum'] += metrics_dict['triplet_p20']
                    metrics_sum_dict['triplet_p50_sum'] += metrics_dict['triplet_p50']
                    metrics_sum_dict['triplet_p100_sum'] += metrics_dict['triplet_p100']
                    metrics_sum_dict['triplet_g_sum'] += metrics_dict['triplet_g']
                    metrics_pr_dict['triplet_precision'] = metrics_sum_dict['triplet_tp_sum'] / metrics_sum_dict['triplet_p_sum']
                    metrics_pr_dict['triplet_precision20'] = metrics_sum_dict['triplet_tp20_sum'] / metrics_sum_dict['triplet_p20_sum']
                    metrics_pr_dict['triplet_precision50'] = metrics_sum_dict['triplet_tp50_sum'] / metrics_sum_dict['triplet_p50_sum']
                    metrics_pr_dict['triplet_precision100'] = metrics_sum_dict['triplet_tp100_sum'] / metrics_sum_dict['triplet_p100_sum']
                    metrics_pr_dict['triplet_recall'] = metrics_sum_dict['triplet_tp_sum'] / metrics_sum_dict['triplet_g_sum']
                    metrics_pr_dict['triplet_recall20'] = metrics_sum_dict['triplet_tp20_sum'] / metrics_sum_dict['triplet_g_sum']
                    metrics_pr_dict['triplet_recall50'] = metrics_sum_dict['triplet_tp50_sum'] / metrics_sum_dict['triplet_g_sum']
                    metrics_pr_dict['triplet_recall100'] = metrics_sum_dict['triplet_tp100_sum'] / metrics_sum_dict['triplet_g_sum']

                storage.put_scalars(**metrics_pr_dict, smoothing_hint=False)

                optimizer.zero_grad()
                losses.backward()
                optimizer.step()
                storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
                scheduler.step()

                if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                    for writer in writers:
                        writer.write()
                periodic_checkpointer.step(iteration)
                torch.cuda.empty_cache()
Example #19
    def train(self, stateDict, data, updateStateFun):
        '''
            Main training function.
        '''
        # initialize model
        model, stateDict, _, projectToStateMap = self.initializeModel(
            stateDict, data, False)

        # wrap dataset for usage with Detectron2
        ignoreUnsure = optionsHelper.get_hierarchical_value(
            self.options, ['options', 'train', 'ignore_unsure', 'value'],
            fallback=True)
        transforms = self.initializeTransforms(mode='train')
        indexMap = self._get_labelclass_index_map(stateDict['labelclassMap'],
                                                  False)
        try:
            imageFormat = self.detectron2cfg.INPUT.FORMAT
            assert imageFormat.upper() in ('RGB', 'BGR')
        except:
            imageFormat = 'BGR'
        datasetMapper = Detectron2DatasetMapper(self.project,
                                                self.fileServer,
                                                transforms,
                                                True,
                                                imageFormat,
                                                classIndexMap=indexMap)
        dataLoader = build_detection_train_loader(
            dataset=getDetectron2Data(
                data, stateDict['labelclassMap'], projectToStateMap,
                ignoreUnsure,
                self.detectron2cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS),
            mapper=datasetMapper,
            total_batch_size=self.detectron2cfg.SOLVER.IMS_PER_BATCH *
            comm.get_world_size(),  #TODO: verify
            aspect_ratio_grouping=True,
            num_workers=0)
        numImg = len(data['images'])

        # train
        model.train()
        optimizer = self._build_optimizer(self.detectron2cfg, model)
        scheduler = self._build_lr_scheduler(self.detectron2cfg, optimizer)
        imgCount = 0
        start_iter = 0  #TODO
        tbar = trange(numImg)
        dataLoaderIter = iter(dataLoader)
        with EventStorage(start_iter) as storage:
            for idx in range(numImg):
                batch = next(dataLoaderIter)
                storage.iter = idx  #TODO: start_iter
                loss_dict = model(batch)
                losses = sum(loss_dict.values())
                assert torch.isfinite(losses).all(), \
                    'Model produced Inf and/or NaN values; training was aborted. Try reducing the learning rate.'

                loss_dict_reduced = {
                    k: v.item()
                    for k, v in comm.reduce_dict(loss_dict).items()
                }
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())
                if comm.is_main_process():
                    storage.put_scalars(total_loss=losses_reduced,
                                        **loss_dict_reduced)

                optimizer.zero_grad()
                losses.backward()
                optimizer.step()
                scheduler.step()

                # update worker state
                tbar.update(1)
                imgCount += len(batch)
                updateStateFun(state='PROGRESS',
                               message='training',
                               done=imgCount,
                               total=numImg)

            stats = storage.latest()
            for key in stats:
                if isinstance(stats[key], tuple):
                    stats[key] = stats[key][0]
            tbar.close()

        # all done; return state dict as bytes and stats
        return self.exportModelState(stateDict, model), stats
예제 #20
0
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    forward_pass_end_time = time.perf_counter()
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration_start_time = time.perf_counter()
            if comm.get_rank() == 0:
                print("Approx backwards pass duration: ",
                      iteration_start_time - forward_pass_end_time)
            iteration = iteration + 1
            storage.step()

            if iteration == 500:
                print("Iteration 500. Profiling!")
                with torch.autograd.profiler.profile(
                        use_cuda=True, record_shapes=True) as prof:
                    loss_dict = model(data)
                    losses = sum(loss_dict.values())
                    assert torch.isfinite(losses).all(), loss_dict

                    loss_dict_reduced = {
                        k: v.item()
                        for k, v in comm.reduce_dict(loss_dict).items()
                    }
                    losses_reduced = sum(
                        loss for loss in loss_dict_reduced.values())
                print(prof.key_averages().table(sort_by="self_cpu_time_total"))
                prof.export_chrome_trace("/root/trace.json")
            else:
                loss_dict = model(data)
                losses = sum(loss_dict.values())
                assert torch.isfinite(losses).all(), loss_dict

                loss_dict_reduced = {
                    k: v.item()
                    for k, v in comm.reduce_dict(loss_dict).items()
                }
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            forward_pass_end_time = time.perf_counter()
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
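
The `iteration == 500` branch above profiles a single forward pass with `torch.autograd.profiler` and dumps a Chrome trace. A minimal, self-contained sketch of the same profiler calls (the toy model and the output path are placeholders, not taken from the snippet's repository):

import torch
import torch.nn as nn

# placeholder model and input, only to have something to profile
model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 10))
x = torch.randn(8, 64)

with torch.autograd.profiler.profile(use_cuda=torch.cuda.is_available(),
                                     record_shapes=True) as prof:
    model(x)

# aggregate per-op statistics and export a trace viewable at chrome://tracing
print(prof.key_averages().table(sort_by="self_cpu_time_total"))
prof.export_chrome_trace("trace.json")
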
예제 #21
0
def do_train(cfg1, model1, model2, resume=False):

    model1.train()
    optimizer = build_optimizer(cfg1, model1)
    scheduler = build_lr_scheduler(cfg1, optimizer)

    checkpointer = DetectionCheckpointer(model1,
                                         cfg1.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg1.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg1.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg1.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg1.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg1.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = custom_train_loader(cfg1)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            height = data[0]['image'].shape[1]
            width = data[0]['image'].shape[2]
            second_stream_outputs = inference_second_stream(
                model2, data, height, width)

            loss_dict = model1(data, second_stream_outputs)

            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg1.TEST.EVAL_PERIOD > 0
                    and iteration % cfg1.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg1, model1, model2)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
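
`inference_second_stream` is not defined in this snippet. A plausible sketch, assuming `model2` is a regular detectron2 model that is run frozen in eval mode so its predictions can be fed to `model1` as auxiliary input; the exact interface is an assumption:

import torch

def inference_second_stream(model2, data, height, width):
    # hypothetical helper: run the second model without gradients and return
    # its per-image predictions rescaled to (height, width)
    was_training = model2.training
    model2.eval()
    with torch.no_grad():
        batched_inputs = [
            {"image": d["image"], "height": height, "width": width} for d in data
        ]
        outputs = model2(batched_inputs)  # list of {"instances": Instances}
    if was_training:
        model2.train()
    return outputs
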
예제 #22
0
# compared to "train_net.py", we do not support accurate timing and
# precise BN here, because they are not trivial to implement
data_loader = build_detection_train_loader(cfg)
logger.info("Starting training from iteration {}".format(start_iter))
with EventStorage(start_iter) as storage:
    for data, iteration in zip(data_loader, range(start_iter, max_iter)):
        iteration = iteration + 1
        storage.step()
        loss_dict = model(data)
        losses = sum(loss for loss in loss_dict.values())
        assert torch.isfinite(losses).all(), loss_dict

        loss_dict_reduced = {
            k: v.item()
            for k, v in comm.reduce_dict(loss_dict).items()
        }
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        if comm.is_main_process():
            storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        storage.put_scalar("lr",
                           optimizer.param_groups[0]["lr"],
                           smoothing_hint=False)
        scheduler.step()

        if iteration - start_iter > 5 and (iteration % 20 == 0
                                           or iteration == max_iter):
            for writer in writers:
                writer.write()
예제 #23
0
def reduce_dict(input_dict, average=True):
    return comm.reduce_dict(input_dict, average=average)
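
`comm.reduce_dict` (detectron2.utils.comm) reduces a dict of scalar loss tensors across all workers so the main process sees values that reflect every GPU; with a single process the dict is returned unchanged. A minimal usage sketch with placeholder loss values:

import torch
from detectron2.utils import comm

# placeholder per-worker losses; in training these come from model(data)
loss_dict = {"loss_cls": torch.tensor(0.7), "loss_box_reg": torch.tensor(0.3)}

# with average=True each value becomes the mean over all workers
loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
total = sum(loss_dict_reduced.values())
if comm.is_main_process():
    print(total, loss_dict_reduced)
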
예제 #24
0
def do_train(cfg_source, cfg_target, model, resume=False):

    model.train()
    print(model)

    optimizer = build_optimizer(cfg_source, model)
    scheduler = build_lr_scheduler(cfg_source, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg_source.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg_source.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg_source.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg_source.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg_source.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg_source.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    i = 1
    max_epoch = 41.27  # max iter / min(data_len(data_source, data_target))
    current_epoch = 0
    data_len = 1502

    alpha3 = 0
    alpha4 = 0
    alpha5 = 0

    data_loader_source = build_detection_train_loader(cfg_source)
    data_loader_target = build_detection_train_loader(cfg_target)
    logger.info("Starting training from iteration {}".format(start_iter))

    with EventStorage(start_iter) as storage:
        for data_source, data_target, iteration in zip(
                data_loader_source, data_loader_target,
                range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            if (iteration % data_len) == 0:
                current_epoch += 1
                i = 1

            p = float(i + current_epoch * data_len) / max_epoch / data_len
            alpha = 2. / (1. + np.exp(-10 * p)) - 1
            i += 1

            alpha3 = alpha
            alpha4 = alpha
            alpha5 = alpha

            if alpha3 > 0.5:
                alpha3 = 0.5

            if alpha4 > 0.5:
                alpha4 = 0.5

            if alpha5 > 0.1:
                alpha5 = 0.1

            loss_dict = model(data_source, False, alpha3, alpha4, alpha5)
            loss_dict_target = model(data_target, True, alpha3, alpha4, alpha5)
            loss_dict["loss_r3"] += loss_dict_target["loss_r3"]
            loss_dict["loss_r4"] += loss_dict_target["loss_r4"]
            loss_dict["loss_r5"] += loss_dict_target["loss_r5"]

            loss_dict["loss_r3"] *= 0.5
            loss_dict["loss_r4"] *= 0.5
            loss_dict["loss_r5"] *= 0.5

            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
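
The schedule `alpha = 2 / (1 + exp(-10 p)) - 1` used above is the usual DANN-style warm-up for the domain-adaptation weight: it grows smoothly from 0 towards 1 as the training progress `p` goes from 0 to 1, and the snippet then caps it at 0.5 for the level-3/4 losses and 0.1 for the level-5 loss. A small sketch evaluating the formula at a few points (values rounded):

import numpy as np

def adaptation_alpha(p, cap=None):
    # DANN-style warm-up: 0 at p=0, approaches 1 as p -> 1
    alpha = 2.0 / (1.0 + np.exp(-10.0 * p)) - 1.0
    return min(alpha, cap) if cap is not None else alpha

for p in (0.0, 0.1, 0.25, 0.5, 1.0):
    print(p, round(adaptation_alpha(p), 3), round(adaptation_alpha(p, cap=0.1), 3))
# p=0.1 gives roughly 0.46, p=0.25 roughly 0.85, p=0.5 roughly 0.99 before capping
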
예제 #25
0
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )
    min_size = cfg.INPUT.MIN_SIZE_TRAIN 
    max_size = cfg.INPUT.MAX_SIZE_TRAIN
    sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
    data_loader = build_detection_train_loader(
        cfg,
        mapper=DatasetMapper(
            cfg,
            is_train=True,
            augmentations=[
                T.ResizeShortestEdge(min_size, max_size, sample_style),
                T.RandomApply(T.RandomFlip(prob=1, vertical=False), prob=0.5),
                T.RandomApply(T.RandomRotation(angle=[180], sample_style='choice'), prob=0.1),
                T.RandomApply(T.RandomRotation(angle=[-10, 10], sample_style='range'), prob=0.9),
                T.RandomApply(T.RandomBrightness(0.5, 1.5), prob=0.5),
                T.RandomApply(T.RandomContrast(0.5, 1.5), prob=0.5),
            ]))
    best_model_weight = copy.deepcopy(model.state_dict())
    best_val_loss = None
    data_val_loader = build_detection_test_loader(cfg,
                                                  cfg.DATASETS.TEST[0],
                                                  mapper = DatasetMapper(cfg, True))
    logger.info("Starting training from iteration {}".format(start_iter))

    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration += 1
            start = time.time()
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                logger.setLevel(logging.CRITICAL)
                print('validating')
                val_total_loss = do_val_monitor(cfg, model, data_val_loader)
                logger.setLevel(logging.DEBUG)
                logger.info(f"validation loss of iteration {iteration}th: {val_total_loss}")
                storage.put_scalar(name = 'val_total_loss', value = val_total_loss)
                
                if best_val_loss is None or val_total_loss < best_val_loss:
                    best_val_loss = val_total_loss
                    best_model_weight = copy.deepcopy(model.state_dict())

                comm.synchronize()
            
            # add an extra checkpointer that saves the best model based on the val loss
            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            
    model.load_state_dict(best_model_weight)
    experiment_name = os.getenv('MLFLOW_EXPERIMENT_NAME')
    checkpointer.save(f'model_{experiment_name}')
    return model
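
`do_val_monitor` is not shown in this snippet. A plausible sketch, assuming it returns the mean total loss over the validation loader; detectron2 models only return a loss dict in training mode, so the model is left in train mode and wrapped in `torch.no_grad()`. This implementation is an assumption, not the original helper:

import torch
from detectron2.utils import comm

def do_val_monitor(cfg, model, data_val_loader):
    # hypothetical helper: mean total loss over the validation set
    # (cfg is accepted only to mirror the call site above and is unused here)
    total, num_batches = 0.0, 0
    with torch.no_grad():
        for batch in data_val_loader:
            loss_dict = model(batch)  # model must be in train mode to return losses
            loss_dict_reduced = comm.reduce_dict(loss_dict)
            total += sum(v.item() for v in loss_dict_reduced.values())
            num_batches += 1
    return total / max(num_batches, 1)
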
예제 #26
0
def do_train(cfg, model, resume=False):
    # set the model to training mode
    model.train()
    # build the optimizer
    optimizer = build_optimizer(cfg, model)
    # build the learning-rate scheduler
    scheduler = build_lr_scheduler(cfg, optimizer)

    # checkpoint manager
    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    # starting iteration, which allows training to be resumed
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    # maximum number of iterations
    max_iter = cfg.SOLVER.MAX_ITER

    # PeriodicCheckpointer (a class from fvcore.common.checkpoint) saves and loads the model at the configured checkpoint period
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),  # prints loss and other metrics to the terminal
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # build the batched training data loader
    data_loader = build_detection_train_loader(cfg)
    # build the test data loaders used to compute test loss
    test_data_loaders = []
    for dataset_name in cfg.DATASETS.TEST:
        test_data_loaders.append({
            "name":
            dataset_name,
            "data_loader":
            build_detection_test_loader(cfg, dataset_name,
                                        DatasetMapper(cfg, True))
        })
    logger.info("从第{}轮开始训练".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            # called at the start of every iteration to advance the storage cursor
            storage.step()

            loss_dict = model(data)

            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                # store this iteration's forward-pass losses in the storage histories (storage.histories()), which the writers later read for printing
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            # backward pass
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            # store this iteration's learning rate in the storage
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            # if iteration % 21 == 0:
            #     do_loss_eval(cfg, storage, model, test_data_loaders)
            #     for writer in writers:
            #         writer.write()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                do_loss_eval(cfg, storage, model, test_data_loaders)
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
예제 #27
0
def start_train(al_cfg, cfg, model, resume=False):
    early_stopping = EarlyStopping(patience=al_cfg.EARLY_STOP.PATIENCE,
                                   delta=al_cfg.EARLY_STOP.DELTA,
                                   verbose=True)
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                results = do_test(cfg, model)
                bbox_results = results['bbox']
                AP = bbox_results['AP']
                comm.synchronize()
                print('AP:', AP, '\tValue:', 1 - (AP / 100))
                early_stopping(1 - (AP / 100))
                storage.put_scalars(**bbox_results)
                if early_stopping.counter < 1:
                    checkpointer.save('model_final')

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

            if early_stopping.early_stop:
                print("EARLY STOPPING INITIATED AT ITERATION:", iteration)
                # checkpointer.save('model_final')
                break
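
The `EarlyStopping` class used above is not part of detectron2 and its implementation is not shown. A minimal sketch matching only the interface the snippet relies on (`patience`, `delta`, `verbose`, `counter`, `early_stop`, and being called with a score where lower is better); the internals are an assumption:

class EarlyStopping:
    # hypothetical implementation matching the calls in the snippet above
    def __init__(self, patience=5, delta=0.0, verbose=False):
        self.patience = patience
        self.delta = delta
        self.verbose = verbose
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score):
        # score is 1 - AP/100 in the snippet, so lower is better
        if self.best_score is None or score < self.best_score - self.delta:
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter}/{self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
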
예제 #28
0
def do_train(cfg, model, resume=False):
    """

    # TODO: Write docstring
    """
    # Set the model to train
    model.train()

    # Create torch optimiser & schedulers
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # Create a torch checkpointer
    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )

    # Create starting checkpoint i.e. pre-trained model using weights from config
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )

    # Define the number of iterations
    max_iter = cfg.SOLVER.MAX_ITER

    # Create a periodic checkpointer at the configured period
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    # Export checkpoint data to terminal, JSON & tensorboard files
    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )

    # Create a data loader to supply the model with training data
    data_loader = build_detection_train_loader(cfg)

    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)
          
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            # If eval period has been set, run test at defined interval
            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                do_test(cfg, model)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                logger.debug('Logging iteration and loss to Weights & Biases')
                wandb.log({"iteration": iteration})
                wandb.log({"total_loss": losses_reduced})
                wandb.log(loss_dict_reduced)

                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
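
The snippet above calls `wandb.log` directly, which assumes a Weights & Biases run was initialised elsewhere before training started. A minimal sketch of that setup; the project name, run name, and config keys are placeholders:

import wandb

# hypothetical setup preceding do_train(); project/name/config are placeholders
wandb.init(project="detectron2-training", name="baseline-run",
           config={"max_iter": 10000, "base_lr": 0.00025})

# inside the loop the snippet then logs, e.g.:
# wandb.log({"iteration": iteration, "total_loss": losses_reduced})
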
예제 #29
0
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    checkpointer_spot = DetectionCheckpointer(model,
                                              '/opt/ml/checkpoints',
                                              optimizer=optimizer,
                                              scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)
    periodic_checkpointer_spot = PeriodicCheckpointer(
        checkpointer_spot, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    #     data_loader = build_detection_train_loader(cfg)
    data_loader = build_detection_train_loader(
        cfg,
        #    mapper=DatasetMapper(cfg, is_train=True
        #                         , augmentations=[
        #        T.Resize((1024, 1024)),
        #        T.RandomBrightness(.75,1.25),
        #        T.RandomFlip(),
        #        T.RandomSaturation(.75,1.25)
        #    ]
    )
    #     )
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()
            if iteration % 500 == 0:
                try:
                    torch.save(model.state_dict(),
                               f'{cfg.OUTPUT_DIR}/model_{iteration}.pth')
                except Exception:
                    print('save failed')

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
            periodic_checkpointer_spot.step(iteration)
예제 #30
0
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # checkpointer = DetectionCheckpointer(
    #     model, cfg.OUTPUT_DIR,
    #     optimizer=optimizer,
    #     scheduler=scheduler
    # )
    #do not load checkpointer's optimizer and scheduler
    checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)

    #model.load_state_dict(optimizer)

    max_iter = cfg.SOLVER.MAX_ITER

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    train_data_loader = build_detection_train_loader(
        cfg, mapper=PathwayDatasetMapper(cfg, True))

    # epoch_data_loader = build_detection_test_loader(cfg=cfg, dataset_name= cfg.DATASETS.TRAIN[0],
    #                                           mapper=PathwayDatasetMapper(cfg, True))

    val_data_loader = build_detection_validation_loader(
        cfg=cfg,
        dataset_name=cfg.DATASETS.TEST[0],
        mapper=PathwayDatasetMapper(cfg, False))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        epoch_num = (train_data_loader.dataset.sampler._size //
                     cfg.SOLVER.IMS_PER_BATCH) + 1
    else:
        epoch_num = train_data_loader.dataset.sampler._size // cfg.SOLVER.IMS_PER_BATCH

    # periodic_checkpointer = PeriodicCheckpointer(
    #     checkpointer,
    #     #cfg.SOLVER.CHECKPOINT_PERIOD,
    #     epoch_num,
    #     max_iter=max_iter
    # )

    logger.info("Starting training from iteration {}".format(start_iter))
    loss_weights = {'loss_cls': 1, 'loss_box_reg': 1}
    with EventStorage(start_iter) as storage:
        loss_per_epoch = 0.0
        best_loss = 99999.0
        best_val_loss = 99999.0
        better_train = False
        better_val = False
        for data, iteration in zip(train_data_loader,
                                   range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item() * loss_weights[k]
                for k, v in comm.reduce_dict(loss_dict).items()
            }

            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            # prevent gradient explosion
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            #if comm.is_main_process():
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            # if (
            #     # cfg.TEST.EVAL_PERIOD > 0
            #     # and
            #         iteration % epoch_num == 0
            #         #iteration % cfg.TEST.EVAL_PERIOD == 0
            #     and iteration != max_iter
            # ):
            #     do_test(cfg, model)
            #     # Compared to "train_net.py", the test results are not dumped to EventStorage
            #     comm.synchronize()

            loss_per_epoch += losses_reduced
            if iteration % epoch_num == 0 or iteration == max_iter:
                #one complete epoch
                epoch_loss = loss_per_epoch / epoch_num
                #do validation
                #epoch_loss, epoch_cls_loss, epoch_box_reg_loss = do_validation(epoch_data_loader, model, loss_weights)
                #val_loss, val_cls_loss, val_box_reg_loss = do_validation(val_data_loader, model, loss_weights)
                checkpointer.save("model_{:07d}".format(iteration),
                                  **{"iteration": iteration})
                # calculate epoch_loss and push to history cache
                #if comm.is_main_process():
                storage.put_scalar("epoch_loss",
                                   epoch_loss,
                                   smoothing_hint=False)
                # storage.put_scalar("epoch_cls_loss", epoch_cls_loss, smoothing_hint=False)
                # storage.put_scalar("epoch_box_reg_loss", epoch_box_reg_loss, smoothing_hint=False)
                # storage.put_scalar("val_loss", val_loss, smoothing_hint=False)
                # storage.put_scalar("val_cls_loss", val_cls_loss, smoothing_hint=False)
                # storage.put_scalar("val_box_reg_loss", val_box_reg_loss, smoothing_hint=False)

                for writer in writers:
                    writer.write()

                # only save improved checkpoints on epoch_loss
                # if best_loss > epoch_loss:
                #     best_loss = epoch_loss
                #     better_train = True
                # if best_val_loss > val_loss:
                #     best_val_loss = val_loss
                #     better_val = True
                #if better_val:
                #checkpointer.save("model_{:07d}".format(iteration),  **{"iteration": iteration})
                #comm.synchronize()
                #reset loss_per_epoch
                loss_per_epoch = 0.0
                # better_train = False
                # better_val = False
            del loss_dict, losses, losses_reduced, loss_dict_reduced
            torch.cuda.empty_cache()
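
The commented-out `do_validation` calls in this last snippet suggest a helper returning the weighted total, classification, and box-regression losses over a loader. A sketch consistent with that call signature; since the original helper is not shown, everything here is an assumption, including the fallback weight of 1.0 for loss keys missing from `loss_weights`:

import torch
from detectron2.utils import comm

def do_validation(data_loader, model, loss_weights):
    # hypothetical helper matching do_validation(loader, model, loss_weights);
    # the model is assumed to be in train mode so it returns a loss dict
    total, cls_total, box_total, n = 0.0, 0.0, 0.0, 0
    with torch.no_grad():
        for batch in data_loader:
            loss_dict = comm.reduce_dict(model(batch))
            weighted = {k: v.item() * loss_weights.get(k, 1.0) for k, v in loss_dict.items()}
            total += sum(weighted.values())
            cls_total += weighted.get("loss_cls", 0.0)
            box_total += weighted.get("loss_box_reg", 0.0)
            n += 1
    n = max(n, 1)
    return total / n, cls_total / n, box_total / n
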