Example #1
import math
import sys

import utils


def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()
        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()
        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
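This snippet (and Example #2 below) calls utils.warmup_lr_scheduler, which is not shown here. A minimal sketch, assuming the same semantics as the linear-warmup helper in the torchvision detection reference: the learning-rate multiplier ramps linearly from warmup_factor up to 1 over warmup_iters scheduler steps.

import torch


def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    # linearly ramp the lr multiplier from warmup_factor up to 1.0
    def f(x):
        # x is the number of scheduler steps taken so far
        if x >= warmup_iters:
            return 1
        alpha = float(x) / warmup_iters
        return warmup_factor * (1 - alpha) + alpha

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)

Stepping this scheduler after every batch, as both training loops do during epoch 0, raises the learning rate gradually and helps avoid divergence in the first few hundred iterations.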
Example #2
import math
import random
import sys

import torch
import torch.nn.functional as F
import utils


# `compute_loss` is assumed to come from the surrounding project (not shown here)
def train_one_epoch(model, optimizer, data_loader, device, epoch,
                    print_freq, accumulate, img_size, batch_size,
                    grid_min, grid_max, gs,
                    multi_scale=False, warmup=False):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0 and warmup:  # enable warmup for the first epoch to stabilize early training
        warmup_factor = 5.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    enable_amp = "cuda" in device.type  # enable AMP only when running on GPU
    mloss = torch.zeros(4).to(device)  # mean losses
    now_lr = 0.
    nb = len(data_loader)  # number of batches
    # imgs: [batch_size, 3, img_size, img_size]
    # targets: [num_obj, 6] , that number 6 means -> (img_index, obj_index, x, y, w, h)
    # paths: list of img path
    for i, (imgs, targets, paths, _, _) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
        ni = i + nb * epoch  # number of batches processed since the start of training
        imgs = imgs.to(device).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
        targets = targets.to(device)

        # Multi-Scale
        if multi_scale:
            # Every `accumulate` batches, randomly change the input image size.
            # Labels are stored as relative coordinates, so rescaling the
            # images does not change the label values.
            if ni % accumulate == 0:  # adjust img_size (67% - 150%) every `accumulate` batches
                # pick a random multiple of the grid size gs within the allowed range
                img_size = random.randrange(grid_min, grid_max + 1) * gs
            sf = img_size / max(imgs.shape[2:])  # scale factor

            # if the longest image side differs from img_size, rescale the image
            # and round height/width up to a multiple of gs
            if sf != 1:
                # gs: (pixels) grid size
                ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to 32-multiple)
                imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

        # mixed-precision autocast context; has no effect when running on CPU
        with torch.cuda.amp.autocast(enabled=enable_amp):
            pred = model(imgs)

            # loss
            loss_dict = compute_loss(pred, targets, model)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            # each individual loss is a 1-element tensor, so the concatenation has shape [4]
            loss_items = torch.cat((loss_dict_reduced["box_loss"],
                                    loss_dict_reduced["obj_loss"],
                                    loss_dict_reduced["class_loss"],
                                    losses_reduced)).detach()
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses

            loss_value = losses_reduced.item()
            if not torch.isfinite(losses_reduced):
                print('WARNING: non-finite loss, ending training:', loss_value)
                print("training image path: {}".format(",".join(paths)))
                sys.exit(1)

        # backward: gradients are accumulated, so weights are updated once every 64 images
        losses *= batch_size / 64  # scale loss so the accumulated gradient matches a batch of 64
        losses.backward()
        losses.backward()
        # optimize
        if ni % accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        now_lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=now_lr)

        if lr_scheduler is not None:  # step the warmup scheduler (first epoch only)
            lr_scheduler.step()

    return mloss, now_lr
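Both examples also rely on utils.reduce_dict, which is referenced but not defined here. A minimal sketch, assuming the torchvision reference behavior: it all-reduces the loss tensors across distributed processes so that every rank logs the averaged values, and returns the dict unchanged in single-process runs.

import torch
import torch.distributed as dist


def reduce_dict(input_dict, average=True):
    # average the tensor values of a dict across all processes (for logging only)
    world_size = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = sorted(input_dict.keys())  # fixed order so every rank reduces the same tensors
        values = torch.stack([input_dict[k] for k in names])
        dist.all_reduce(values)
        if average:
            values /= world_size
        return {k: v for k, v in zip(names, values)}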
            if args.weight == "instance_level":
                contributions_shap = compare_shap_and_KG(shap_values_train,
                                                         labels,
                                                         dataset=data)
                shap_coeff = reduce_shap(contributions_shap, is_exponential)
            elif args.weight == "bbox_level":
                shap_weights = get_bbox_weight(shap_values_train,
                                               is_exponential,
                                               dataset=data)
            print("Shap computed")
        print('Test loss: ', loss, '\tTest accuracy: ', accuracy)
    if j < num_epochs_detection:
        # Train detection
        metric_logger = uti.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', uti.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(j + 1)

        index = 0
        detector.train()
        for images, targets in metric_logger.log_every(train_loader, 250,
                                                       header):
            images = list(image.to('cuda') for image in images)
            targets = [{k: v.to('cuda')
                        for k, v in t.items()} for t in targets]
            # SHAP if necessary
            # -------
            if (j > 0 or args.resume) and args.weight == "bbox_level":
                loss_dict = detector(
                    images,
                    targets,