Example #1
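The excerpts below are shown without their import headers; they assume roughly the standard detectron2 plain-training-loop imports sketched here (individual examples additionally pull in extras such as ReduceLROnPlateau, matplotlib and PathManager):

import logging
import os
import time

import numpy as np
import torch

import detectron2.utils.comm as comm
from detectron2.checkpoint import Checkpointer, DetectionCheckpointer, PeriodicCheckpointer
from detectron2.data import build_detection_train_loader
from detectron2.solver import build_lr_scheduler, build_optimizer
from detectron2.utils.events import (
    CommonMetricPrinter,
    EventStorage,
    JSONWriter,
    TensorboardXWriter,
)

logger = logging.getLogger("detectron2")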
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    checkpointer_spot = DetectionCheckpointer(model,
                                              '/opt/ml/checkpoints',
                                              optimizer=optimizer,
                                              scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)
    periodic_checkpointer_spot = PeriodicCheckpointer(
        checkpointer_spot, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
            periodic_checkpointer_spot.step(iteration)
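A minimal sketch (not part of the original) of how the do_train above might be invoked; setup() and do_test() are assumed to exist in the surrounding script, as in detectron2's plain_train_net.py:

from detectron2.modeling import build_model

def main(args):
    cfg = setup(args)                      # hypothetical setup() helper that builds and freezes the cfg
    model = build_model(cfg)
    do_train(cfg, model, resume=args.resume)
    return do_test(cfg, model)             # do_test() assumed, as in the original script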
# The opening of this call is truncated in the excerpt; save dir assumed to be cfg.OUTPUT_DIR.
checkpointer = DetectionCheckpointer(model,
                                     cfg.OUTPUT_DIR,
                                     optimizer=optimizer,
                                     scheduler=scheduler)
start_iter = (checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=False).get(
    "iteration", -1) + 1)
ckpt = Checkpointer(model)
ckpt.load("./frcn_attn_0/model_0044999.pth")
max_iter = cfg.SOLVER.MAX_ITER

periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                             cfg.SOLVER.CHECKPOINT_PERIOD,
                                             max_iter=max_iter)

writers = ([
    CommonMetricPrinter(max_iter),
    JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
    TensorboardXWriter(cfg.OUTPUT_DIR),
] if comm.is_main_process() else [])

# compared to "train_net.py", we do not support accurate timing and
# precise BN here, because they are not trivial to implement
data_loader = build_detection_train_loader(cfg)
logger.info("Starting training from iteration {}".format(start_iter))
with EventStorage(start_iter) as storage:
    for data, iteration in zip(data_loader, range(start_iter, max_iter)):
        iteration = iteration + 1
        storage.step()
        loss_dict = model(data)
        losses = sum(loss for loss in loss_dict.values())
        assert torch.isfinite(losses).all(), loss_dict

        loss_dict_reduced = {
            k: v.item()
            for k, v in comm.reduce_dict(loss_dict).items()
        }
def do_train(cfg, model, resume=False, patience=20):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    scheduler2 = ReduceLROnPlateau(optimizer, mode="max")

    # warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period=200)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    best_ap50 = 0
    best_iteration = 0
    patience_counter = 0
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()
            # warmup_scheduler.dampen(iteration)

            if (cfg.TEST.EVAL_PERIOD > 0
                    and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter - 1):
                test_results = do_test(cfg, model)

                # scheduler2.step(test_results["bbox"]["AP50"])
                # early stopping.

                # save checkpoint to disk

                checkpointer.save(f"model_{iteration}")

                # TODO: restore from best model
                if test_results["bbox"]["AP50"] > best_ap50:
                    best_ap50 = test_results["bbox"]["AP50"]
                    best_iteration = iteration
                    # reset patience counter
                    patience_counter = 0
                    logger.info(f"Patience counter reset.")
                else:
                    patience_counter += 1
                    logger.info(
                        f"Patience counter increased to {patience_counter}, will be terminated at {patience}"
                    )
                    if patience_counter > patience:
                        for writer in writers:
                            writer.write()
                        # restore to best checkpoint

                        checkpointer.load(
                            f"{cfg.OUTPUT_DIR}/model_{best_iteration}.pth")

                        break
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and ((iteration + 1) % 20 == 0
                                               or iteration == max_iter - 1):
                for writer in writers:
                    writer.write()
            # periodic_checkpointer.step(iteration)
        checkpointer.save(f"model_final")
Example #4
def do_train(cfg, model, resume=False, evaluate=False):
    """
    training loop.
    """

    # Build optimizer and scheduler from configuration and model
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # Build checkpointers
    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)
    # Build writers
    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # Build dataloader
    data_loader = build_classification_train_loader(cfg)

    # training loop
    validation_losses = []
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        start = time.perf_counter()
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):

            data_time = time.perf_counter() - start
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)

            # compute losses
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalar("data_time", data_time)
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            # backward
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            #validation
            if ((cfg.TEST.EVAL_PERIOD > 0
                 and iteration % cfg.TEST.EVAL_PERIOD == 0)
                    or (iteration == max_iter)):
                # evaluate on the validation dataset
                res = do_test(cfg, model, evaluate=evaluate)
                validation = {}
                for k, v in res.items():
                    print(v, flush=True)
                    validation[k] = v['loss_cls']
                    storage.put_scalars(
                        **validation
                    )  # dump also validation loss into Tensorboard
                    validation['iteration'] = iteration
                validation_losses.append(validation)

            # logging/checkpoint
            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

            # Try to get an accurate measurement of time
            start = time.perf_counter()

    # save validations metrics
    if evaluate:
        print(validation_losses, flush=True)
        file_path = os.path.join(cfg.OUTPUT_DIR, "validations_losses.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(validation_losses, f)
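The saved validations_losses.pth can be inspected afterwards with torch.load; a minimal sketch, assuming the structure built above (one dict per evaluation with per-dataset 'loss_cls' entries plus the iteration):

import torch

# path corresponds to os.path.join(cfg.OUTPUT_DIR, "validations_losses.pth") above
validation_losses = torch.load("./output/validations_losses.pth")
for entry in validation_losses:
    it = entry["iteration"]
    losses = {k: v for k, v in entry.items() if k != "iteration"}
    print(f"iter {it}: {losses}")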
def do_train(cfg, model, resume=False):

    #start the training
    model.train()

    #configuration of the model based on the cfg
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # checkpoint configuration
    checkpointer = DetectionCheckpointer(model,cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler)

    # the initial iteration differs depending on whether we resume from a checkpoint
    if not resume:
        start_iter = 1
    else:
        start_iter = (checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)

    #Number of iterations
    max_iter = cfg.SOLVER.MAX_ITER

    #checkpoints configurations
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)
    checkpointer_best= DetectionCheckpointer(model,cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler)
    periodic_checkpointer_best= PeriodicCheckpointer(checkpointer_best, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

    #writer:
    writers = ([CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),] if comm.is_main_process() else [])

    #create the dataloader that get information from cfg.training set
    data_loader = build_detection_train_loader(cfg)

    #information about the current situation in the training process
    logger.info("Starting training from iteration {}".format(start_iter))

    #start iteration process (epochs)
    if resume:
        print('Obtaining best val from previous session')
        best_loss = np.loadtxt(os.path.join(cfg.OUTPUT_DIR, "best_validation_loss.txt"))
        print('Previous best total val loss is %s' % best_loss)
    else:
        best_loss = float("inf")

    # the patience list stores the validation losses during the training process
    patience_list=[]
    patience_list.append(best_loss)

    dataset_size=cfg.NUMBER_IMAGES_TRAINING
    print("training set size is %s" %dataset_size)
    iteration_batch_ratio=int(round(float(dataset_size/cfg.SOLVER.IMS_PER_BATCH)))
    print ("%s Minibatches are cosidered as an entire epoch" %iteration_batch_ratio)

    with EventStorage(start_iter) as storage:
        if resume == True:
          iteration=start_iter
        else:
          start_iter=1
          iteration=1

        minibatch=0

        for data, miniepoch in zip(data_loader, range(start_iter*iteration_batch_ratio, max_iter*iteration_batch_ratio)):

            minibatch= minibatch +1
            if minibatch == iteration_batch_ratio:
              minibatch=0
              iteration = iteration + 1


            storage.step()

            loss_dict = model(data)
            #print (loss_dict)
            #print ('SPACE')

            losses = sum(loss for loss in loss_dict.values())
            #print (losses)
            #print ('SPACE')

            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            #print ('SPACE')

            #get the total loss
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            if minibatch == 0:
                print ("Minibatch %s / %s" %(minibatch, iteration_batch_ratio))
                print ("iteration %s / %s" %(iteration, max_iter))
                print ('Total losses %s \n' %losses_reduced)
                print (loss_dict_reduced)

            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)

            scheduler.step()

            #Test the validation score of the model
            if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter and minibatch ==0 ):

                results, loss_val =do_test(cfg, model)
                patience_list.append(loss_val)
                #Compared to "train_net.py", the test results are not dumped to EventStorage

                if loss_val < best_loss:
                  print ('saving best model')
                  best_loss=loss_val
                  array_loss=np.array([best_loss])

                  #save best model
                  checkpointer_best.save('best_model')
                  np.savetxt(cfg.OUTPUT_DIR+"/"+"best_validation_loss.txt", array_loss, delimiter=',')


                if len(patience_list) > cfg.patience + cfg.warm_up_patience:
                  print('Checking val losses .......')

                  #Item obtained (patience) iterations ago
                  item_patience=patience_list[-cfg.patience]
                  continue_training=False

                  #Check whether the val loss has improved
                  # start at 1: patience_list[-0] would index the first element, not the most recent
                  for i in range(1, cfg.patience):
                    item_to_check = patience_list[-i]
                    if item_to_check < item_patience:
                      continue_training = True

                  if continue_training:
                    print ('The val loss has improved')

                  else:
                    print ('The val loss has not improved. Stopping training')
                    #print the validation losses
                    print (patience_list)

                    #Plot validation loss error evolution
                    plt.plot(range(1,len(patience_list)+1,1),patience_list)
                    plt.xlabel('iterations')
                    plt.ylabel('validation loss')
                    plt.title('Evolution validation loss: \n min val loss: '
                    +str(min(patience_list)))

                    #save the plot
                    plt.savefig(os.path.join(cfg.OUTPUT_DIR,'evolution_val_loss.png'))
                    break


                comm.synchronize()

            # if iteration - start_iter > cfg.TEST.EVAL_PERIOD and (iteration % cfg.TEST.EVAL_PERIOD == 0 or iteration == max_iter):
            #   for writer in writers:
            #     writer.write()

            if minibatch == 1:
              periodic_checkpointer.step(iteration)
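The early-stopping condition above can be restated as a small standalone helper; a sketch, not from the original, of the same check over patience_list:

def val_loss_improved(losses, patience):
    # True if any of the most recent losses beats the loss recorded `patience` evaluations ago.
    reference = losses[-patience]
    return any(losses[-i] < reference for i in range(1, patience))

# Example: recent losses keep rising, so training would stop here.
history = [0.90, 0.50, 0.45, 0.47, 0.48, 0.49]
print(val_loss_improved(history, patience=4))   # False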
Example #6
def main(args):
    print('_' * 60 + f'\nmain <- {args}')
    if 'setup(args)':  # non-empty string is always truthy; used here as a labeled block
        cfg = get_cfg()
        cfg.merge_from_file(args.config_file)
        cfg.merge_from_list(args.opts)
        cfg.freeze()
        default_setup(
            cfg, args
        )  # if you don't like any of the default setup, write your own setup code
        global CONFIG
        CONFIG = cfg

    if True:  # N_GPU > 0:
        # __________________ For Debug _____________________________
        # mem_stats_df.record('Before-Build-Model')
        if 'build_model(cfg)':
            meta_arch = cfg.MODEL.META_ARCHITECTURE
            model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
            # for param in model.backbone.parameters():
            #     param.requires_grad = False
            model.to(torch.device(cfg.MODEL.DEVICE))
        # __________________ For Debug _____________________________
        # mem_stats_df.record('After-Build-Model')

    if args.eval_only:
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        return do_test(cfg, model)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )

    if 'do-train':
        dataloader = build_train_dataloader(cfg)

        if N_GPUS > 0:
            cfg, model, resume = cfg, model, args.resume

            model.train()
            optimizer = build_optimizer(cfg, model)
            scheduler = build_lr_scheduler(cfg, optimizer)

            checkpointer = DetectionCheckpointer(
                model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler,
            )
            # "iteration" always be loaded whether resume or not.
            # "model" state_dict will always be loaded whether resume or not.
            start_iter = (
                    checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
            )
            max_iter = cfg.SOLVER.MAX_ITER
            # optimizer and scheduler will be resume to checkpointer.checkpointables[*] if resume is True
            if resume:
                optimizer  = checkpointer.checkpointables['optimizer']
                scheduler  = checkpointer.checkpointables['scheduler']
            else:
                start_iter = 0

            periodic_checkpointer = PeriodicCheckpointer(
                checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
            )

            writers = (
                [
                    CommonMetricPrinter(max_iter),
                    JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
                    TensorboardXWriter(cfg.OUTPUT_DIR),
                ]
                if comm.is_main_process()
                else []
            )
            logger.info("Starting training from iteration {}".format(start_iter))

            with EventStorage(start_iter) as storage:
                for data, itr in zip(dataloader, range(start_iter, max_iter)):
                    iteration = itr + 1
                    storage.step()

                    loss_dict = model(data)
                    losses = sum(loss_dict.values())
                    assert torch.isfinite(losses).all(), loss_dict

                    loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
                    losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                    if comm.is_main_process():
                        storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

                    optimizer.zero_grad()
                    losses.backward()
                    optimizer.step()
                    storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
                    scheduler.step()

                    # __________________ Checkpoint / Test / Metrics ___________
                    periodic_checkpointer.step(iteration)

                    if (
                        cfg.TEST.EVAL_PERIOD > 0
                        and iteration % cfg.TEST.EVAL_PERIOD == 0
                        and iteration != max_iter
                    ):
                        do_test(cfg, model)
                        # Compared to "train_net.py", the test results are not dumped to EventStorage
                        comm.synchronize()

                    if iteration - start_iter > 5 and (iteration % 100 == 0 or iteration == max_iter):
                        for writer in writers:
                            writer.write()
                        # __________________ For Debug _____________________________
                        # mem_summary = torch.cuda.memory_summary()
                        # tcp_sock.send(mem_summary.encode('utf-8'))
                        global TIC
                        if TIC is None:
                            TIC = datetime.datetime.now()
                        else:
                            toc = datetime.datetime.now()
                            logger.info('_' * 35 + f'Time Elapsed: {(toc - TIC).total_seconds()} s')
                            TIC = toc
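The excerpt stops inside the training loop; a typical entry point for such a script (assumed, following detectron2's standard launcher, not shown in the original) would look like:

if __name__ == "__main__":
    from detectron2.engine import default_argument_parser, launch

    args = default_argument_parser().parse_args()
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )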