Example #1
# NOTE: imports reconstructed for this snippet. divs_tensors is a custom
# helper from the source repository (not part of upstream maskrcnn_benchmark),
# and reduce_loss_dict is assumed to be the loss-reduction helper defined
# alongside this trainer.
import datetime
import logging
import time

import torch
from apex import amp
from tqdm import tqdm

from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.engine.inference import inference
from maskrcnn_benchmark.utils.comm import get_world_size, synchronize
from maskrcnn_benchmark.utils.metric_logger import MetricLogger


def do_face_train_triplet(
    cfg,
    model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
    divs_nums,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    dataset_names = cfg.DATASETS.TEST
    for iteration, (img_a, img_p, img_n, label_p,
                    label_n) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration
        img_a_list, _ = divs_tensors(device=device,
                                     tensors=img_a,
                                     targets=None,
                                     divs_nums=divs_nums)
        img_p_list, label_p_list = divs_tensors(device=device,
                                                tensors=img_p,
                                                targets=label_p,
                                                divs_nums=divs_nums)
        img_n_list, label_n_list = divs_tensors(device=device,
                                                tensors=img_n,
                                                targets=label_n,
                                                divs_nums=divs_nums)
        #### ======== Splitting the batch may affect BN layers ======== ####
        optimizer.zero_grad()
        # Accumulate gradients across the micro-batches produced by divs_tensors.
        for img_a, img_p, img_n, label_p, label_n in zip(
                img_a_list, img_p_list, img_n_list, label_p_list,
                label_n_list):
            loss_dict = model(tensors=[img_a, img_p, img_n],
                              targets=[label_p, label_n],
                              batch=iteration,
                              total_batch=None)
            losses = sum(loss for loss in loss_dict.values())
            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(loss=losses_reduced, **loss_dict_reduced)
            # Scale the loss so the accumulated gradient matches a full batch.
            losses /= divs_nums
            with amp.scale_loss(losses, optimizer) as scaled_losses:
                scaled_losses.backward()
        optimizer.step()
        scheduler.step()
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
            if iteration > 40000:
                checkpointer.save_backbone("BACKBONE_{:07d}".format(iteration))
        #### ======== periodic validation ======== ####
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
            meters_val = MetricLogger(delimiter="  ")
            synchronize()
            # NOTE: iou_types is not defined in this snippet; upstream
            # maskrcnn_benchmark builds it from the config, roughly:
            iou_types = ("bbox",)
            if cfg.MODEL.MASK_ON:
                iou_types = iou_types + ("segm",)
            if cfg.MODEL.KEYPOINT_ON:
                iou_types = iou_types + ("keypoints",)
            _ = inference(  # The result can be used for additional logging, e.g. for TensorBoard
                model,
                # inference() changes the segmentation mask format in the data
                # loader, so a fresh data loader is created each time:
                make_data_loader(cfg,
                                 is_train=False,
                                 is_distributed=(get_world_size() > 1),
                                 is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False
                if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=None,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU.
                # NOTE: kept from the upstream detection trainer; this call
                # signature differs from the triplet forward used above.
                for iteration_val, (images_val, targets_val,
                                    _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(
                        loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join([
                    "[Validation]: ",
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)
            checkpointer.save_backbone("model_final")
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
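
Both examples revolve around divs_tensors, a helper that is not part of upstream maskrcnn_benchmark and is not shown on this page. From its call sites it splits a batch (and, when given, the matching targets) into divs_nums micro-batches and moves them to device, so the training loop can accumulate gradients over smaller chunks. Below is a minimal, hypothetical reconstruction, assuming plain stacked tensors; the real helper may differ (e.g. it may handle maskrcnn_benchmark ImageList inputs).

import torch


def divs_tensors(device, tensors, targets, divs_nums):
    """Hypothetical reconstruction of the helper used above.

    Splits `tensors` (and `targets`, when not None) into `divs_nums`
    chunks along the batch dimension and moves each chunk to `device`.
    """
    tensor_chunks = [t.to(device)
                     for t in torch.chunk(tensors, divs_nums, dim=0)]
    if targets is None:
        return tensor_chunks, None
    target_chunks = [t.to(device)
                     for t in torch.chunk(targets, divs_nums, dim=0)]
    return tensor_chunks, target_chunks

Splitting this way trades activation memory for extra forward passes; as the translated comment in the loop warns, it also shrinks the per-forward batch that BatchNorm layers see, which can change their running statistics.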
Example #2
def do_face_train_dk_dist_DIV_FC(
    cfg,
    model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
    divs_nums,
):
    # model, head = model
    optimizer, head_optimizer = optimizer
    scheduler, head_scheduler = scheduler
    checkpointer, head_checkpointer = checkpointer
    # The model argument packs (student backbone, head, teacher).
    teacher = model[2]
    head = model[1]
    model = model[0]
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    dataset_names = cfg.DATASETS.TEST
    teacher.eval()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration
        images_list, targets_list = divs_tensors(device=device,
                                                 tensors=images,
                                                 targets=targets,
                                                 divs_nums=divs_nums)
        #### ======== Splitting the batch may affect BN layers ======== ####
        optimizer.zero_grad()
        # Sync gradients only on the last micro-batch (DDP no_sync-style);
        # see the sketch after this example.
        grad_sync = len(images_list) == 1
        for i, (images, targets) in enumerate(zip(images_list, targets_list)):

            # Teacher forward: produce detached soft targets for distillation.
            with torch.no_grad():
                soft_target = teacher(inputs=images)  # grad_sync=False, grad_params=False
                # Replicate the detached soft target onto each GPU used by the head.
                soft_target = [
                    soft_target.to(GPU).detach() for GPU in head.module.GPUS
                ]
            features = model(inputs=images, grad_sync=grad_sync)
            loss_dict = head(features,
                             targets=targets,
                             batch=iteration,
                             soft_target=soft_target,
                             total_batch=None,
                             grad_sync=grad_sync)  # param_sync=param_sync
            losses = sum(loss for loss in loss_dict.values())
            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(
                torch.mean(loss) for loss in loss_dict_reduced.values())
            meters.update(loss=losses_reduced, **loss_dict_reduced)
            losses /= divs_nums
            with amp.scale_loss(losses, optimizer) as scaled_losses:
                scaled_losses.backward()
            # Re-enable gradient sync so the final micro-batch's backward
            # pass all-reduces the accumulated gradients.
            if i == len(images_list) - 2:
                grad_sync = True
        optimizer.step()
        scheduler.step()
        # head_optimizer.step()
        # head_scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
            checkpointer.save_backbone("BACKBONE_{:07d}".format(iteration))
            head_checkpointer.save("HEAD_{:07d}".format(iteration),
                                   **arguments)
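
Example #2 is cut off on the source page; the logging, validation, and final-checkpoint code that follows mirrors Example #1. Its distinctive piece is the grad_sync flag: gradient all-reduce is suppressed for every micro-batch except the last, so distributed communication happens once per optimizer step rather than once per backward pass. Stock torch.nn.parallel.DistributedDataParallel exposes the same behavior via no_sync(). Here is a minimal sketch of that pattern, independent of the custom grad_sync plumbing above (names like accumulate_ddp, micro_batches, and loss_fn are illustrative, not from the source):

import contextlib

import torch


def accumulate_ddp(model, optimizer, micro_batches, loss_fn):
    """Sketch: sync gradients only on the last micro-batch.

    `model` is assumed to be torch.nn.parallel.DistributedDataParallel;
    `micro_batches` is a list of (inputs, targets) pairs, e.g. the output
    of a helper like divs_tensors.
    """
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(micro_batches):
        last = i == len(micro_batches) - 1
        # no_sync() skips the gradient all-reduce; on the last micro-batch
        # we leave it so backward() reduces the accumulated gradients once.
        ctx = contextlib.nullcontext() if last else model.no_sync()
        with ctx:
            loss = loss_fn(model(inputs), targets) / len(micro_batches)
            loss.backward()
    optimizer.step()

Assuming the custom backbone and head modules implement their grad_sync flag the same way, this is equivalent in effect to the grad_sync=False / grad_sync=True toggle in the loop above.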