Пример #1
0
def main(parser_data):
    device = torch.device(
        parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    data_transform = {"val": transforms.Compose([transforms.ToTensor()])}

    # read class_indict
    label_json_path = './pascal_voc_classes.json'
    assert os.path.exists(
        label_json_path), "json file {} dose not exist.".format(
            label_json_path)
    json_file = open(label_json_path, 'r')
    class_dict = json.load(json_file)
    category_index = {v: k for k, v in class_dict.items()}

    VOC_root = parser_data.data_path
    # check voc root
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch
    batch_size = parser_data.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using %g dataloader workers' % nw)

    # load validation data set
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], "val.txt")
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=nw,
        pin_memory=True,
        collate_fn=val_data_set.collate_fn)

    # create model num_classes equal background + 20 classes
    # 注意,这里的norm_layer要和训练脚本中保持一致
    backbone = resnet50_fpn_backbone(norm_layer=torch.nn.BatchNorm2d)
    model = FasterRCNN(backbone=backbone,
                       num_classes=parser_data.num_classes + 1)

    # 载入你自己训练好的模型权重
    weights_path = parser_data.weights
    assert os.path.exists(weights_path), "not found {} file.".format(
        weights_path)
    weights_dict = torch.load(weights_path, map_location=device)
    model.load_state_dict(weights_dict['model'])
    # print(model)

    model.to(device)

    # evaluate on the test dataset
    coco = get_coco_api_from_dataset(val_data_set)
    iou_types = ["bbox"]
    coco_evaluator = CocoEvaluator(coco, iou_types)
    cpu_device = torch.device("cpu")

    model.eval()
    with torch.no_grad():
        for image, targets in tqdm(val_data_set_loader, desc="validation..."):
            # 将图片传入指定设备device
            image = list(img.to(device) for img in image)

            # inference
            outputs = model(image)

            outputs = [{k: v.to(cpu_device)
                        for k, v in t.items()} for t in outputs]
            res = {
                target["image_id"].item(): output
                for target, output in zip(targets, outputs)
            }
            coco_evaluator.update(res)

    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()

    coco_eval = coco_evaluator.coco_eval["bbox"]
    # calculate COCO info for all classes
    coco_stats, print_coco = summarize(coco_eval)

    # calculate voc info for every classes(IoU=0.5)
    voc_map_info_list = []
    for i in range(len(category_index)):
        stats, _ = summarize(coco_eval, catId=i)
        voc_map_info_list.append(" {:15}: {}".format(category_index[i + 1],
                                                     stats[1]))

    print_voc = "\n".join(voc_map_info_list)
    print(print_voc)

    # 将验证结果保存至txt文件中
    with open("record_mAP.txt", "w") as f:
        record_lines = [
            "COCO results:", print_coco, "", "mAP(IoU=0.5) for each category:",
            print_voc
        ]
        f.write("\n".join(record_lines))
def main(parser_data):
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    data_transform = {
        "val": transforms.Compose([transforms.Resize(),
                                   transforms.ToTensor(),
                                   transforms.Normalization()])
    }

    # read class_indict
    label_json_path = './pascal_voc_classes.json'
    assert os.path.exists(label_json_path), "json file {} dose not exist.".format(label_json_path)
    json_file = open(label_json_path, 'r')
    class_dict = json.load(json_file)
    json_file.close()
    category_index = {v: k for k, v in class_dict.items()}

    VOC_root = parser_data.data_path
    # check voc root
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError("VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch
    batch_size = parser_data.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)

    # load validation data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_dataset = VOCDataSet(VOC_root, "2012", transforms=data_transform["val"], train_set="val.txt")
    val_dataset_loader = torch.utils.data.DataLoader(val_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=nw,
                                                     pin_memory=True,
                                                     collate_fn=val_dataset.collate_fn)

    # create model num_classes equal background + 20 classes
    backbone = Backbone()
    model = SSD300(backbone=backbone, num_classes=parser_data.num_classes + 1)

    # 载入你自己训练好的模型权重
    weights_path = parser_data.weights
    assert os.path.exists(weights_path), "not found {} file.".format(weights_path)
    model.load_state_dict(torch.load(weights_path, map_location=device)['model'])
    # print(model)

    model.to(device)

    # evaluate on the test dataset
    coco = get_coco_api_from_dataset(val_dataset)
    iou_types = ["bbox"]
    coco_evaluator = CocoEvaluator(coco, iou_types)
    cpu_device = torch.device("cpu")

    model.eval()
    with torch.no_grad():
        for images, targets in tqdm(val_dataset_loader, desc="validation..."):
            # 将图片传入指定设备device
            images = torch.stack(images, dim=0).to(device)

            # inference
            results = model(images)

            outputs = []
            for index, (bboxes_out, labels_out, scores_out) in enumerate(results):
                # 将box的相对坐标信息(0-1)转为绝对值坐标(xmin, ymin, xmax, ymax)
                height_width = targets[index]["height_width"]
                # 还原回原图尺度
                bboxes_out[:, [0, 2]] = bboxes_out[:, [0, 2]] * height_width[1]
                bboxes_out[:, [1, 3]] = bboxes_out[:, [1, 3]] * height_width[0]

                info = {"boxes": bboxes_out.to(cpu_device),
                        "labels": labels_out.to(cpu_device),
                        "scores": scores_out.to(cpu_device)}
                outputs.append(info)

            res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
            coco_evaluator.update(res)

    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()

    coco_eval = coco_evaluator.coco_eval["bbox"]
    # calculate COCO info for all classes
    coco_stats, print_coco = summarize(coco_eval)

    # calculate voc info for every classes(IoU=0.5)
    voc_map_info_list = []
    for i in range(len(category_index)):
        stats, _ = summarize(coco_eval, catId=i)
        voc_map_info_list.append(" {:15}: {}".format(category_index[i + 1], stats[1]))

    print_voc = "\n".join(voc_map_info_list)
    print(print_voc)

    # 将验证结果保存至txt文件中
    with open("record_mAP.txt", "w") as f:
        record_lines = ["COCO results:",
                        print_coco,
                        "",
                        "mAP(IoU=0.5) for each category:",
                        print_voc]
        f.write("\n".join(record_lines))
Пример #3
0
def train(hyp):
    device = torch.device(opt.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    wdir = "weights" + os.sep  # weights dir
    best = wdir + "best.pt"
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    cfg = opt.cfg
    data = opt.data
    epochs = opt.epochs
    batch_size = opt.batch_size
    accumulate = max(round(64 / batch_size),
                     1)  # accumulate n times before optimizer update (bs 64)
    weights = opt.weights  # initial training weights
    imgsz_train = opt.img_size
    imgsz_test = opt.img_size  # test image sizes
    multi_scale = opt.multi_scale

    # Image sizes
    # 图像要设置成32的倍数
    gs = 32  # (pixels) grid size
    assert math.fmod(
        imgsz_test,
        gs) == 0, "--img-size %g must be a %g-multiple" % (imgsz_test, gs)
    grid_min, grid_max = imgsz_test // gs, imgsz_test // gs
    if multi_scale:
        imgsz_min = opt.img_size // 1.5
        imgsz_max = opt.img_size // 0.667

        # 将给定的最大,最小输入尺寸向下调整到32的整数倍
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs
        imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs)
        imgsz_train = imgsz_max  # initialize with max size
        print("Using multi_scale training, image range[{}, {}]".format(
            imgsz_min, imgsz_max))

    # configure run
    init_seeds()  # 初始化随机种子,保证结果可复现
    data_dict = parse_data_cfg(data)
    train_path = data_dict["train"]
    test_path = data_dict["valid"]
    nc = 1 if opt.single_cls else int(
        data_dict["classes"])  # number of classes
    hyp["cls"] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset
    hyp["obj"] *= imgsz_test / 320

    # Remove previous results
    for f in glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg).to(device)

    # 是否冻结权重,只训练predictor的权重
    if opt.freeze_layers:
        # 索引减一对应的是predictor的索引,YOLOLayer并不是predictor
        output_layer_indices = [
            idx - 1 for idx, module in enumerate(model.module_list)
            if isinstance(module, YOLOLayer)
        ]
        # 冻结除predictor和YOLOLayer外的所有层
        freeze_layer_indeces = [
            x for x in range(len(model.module_list))
            if (x not in output_layer_indices) and (
                x - 1 not in output_layer_indices)
        ]
        # Freeze non-output layers
        # 总共训练3x2=6个parameters
        for idx in freeze_layer_indeces:
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)
    else:
        # 如果freeze_layer为False,默认仅训练除darknet53之后的部分
        # 若要训练全部权重,删除以下代码
        darknet_end_layer = 74  # only yolov3spp cfg
        # Freeze darknet53 layers
        # 总共训练21x3+3x2=69个parameters
        for idx in range(darknet_end_layer + 1):  # [0, 74]
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)

    # optimizer
    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg,
                          lr=hyp["lr0"],
                          momentum=hyp["momentum"],
                          weight_decay=hyp["weight_decay"],
                          nesterov=True)

    start_epoch = 0
    best_map = 0.0
    if weights.endswith(".pt") or weights.endswith(".pth"):
        ckpt = torch.load(weights, map_location=device)

        # load model
        try:
            ckpt["model"] = {
                k: v
                for k, v in ckpt["model"].items()
                if model.state_dict()[k].numel() == v.numel()
            }
            model.load_state_dict(ckpt["model"], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        if ckpt["optimizer"] is not None:
            optimizer.load_state_dict(ckpt["optimizer"])
            if "best_map" in ckpt.keys():
                best_map = ckpt["best_map"]

        # load results
        if ckpt.get("training_results") is not None:
            with open(results_file, "w") as file:
                file.write(ckpt["training_results"])  # write results.txt

        # epochs
        start_epoch = ckpt["epoch"] + 1
        if epochs < start_epoch:
            print(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (opt.weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[
        "lrf"]) + hyp["lrf"]  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler.last_epoch = start_epoch  # 指定从哪个epoch开始

    # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, '.-', label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # model.yolo_layers = model.module.yolo_layers

    # dataset
    # 训练集的图像尺寸指定为multi_scale_range中最大的尺寸
    train_dataset = LoadImagesAndLabels(
        train_path,
        imgsz_train,
        batch_size,
        augment=True,
        hyp=hyp,  # augmentation hyperparameters
        rect=opt.rect,  # rectangular training
        cache_images=opt.cache_images,
        single_cls=opt.single_cls)

    # 验证集的图像尺寸指定为img_size(512)
    val_dataset = LoadImagesAndLabels(
        test_path,
        imgsz_test,
        batch_size,
        hyp=hyp,
        rect=True,  # 将每个batch的图像调整到合适大小,可减少运算量(并不是512x512标准尺寸)
        cache_images=opt.cache_images,
        single_cls=opt.single_cls)

    # dataloader
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=nw,
        # Shuffle=True unless rectangular training is used
        shuffle=not opt.rect,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn)

    val_datasetloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        num_workers=nw,
        pin_memory=True,
        collate_fn=val_dataset.collate_fn)

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    # 计算每个类别的目标个数,并计算每个类别的比重
    # model.class_weights = labels_to_class_weights(train_dataset.labels, nc).to(device)  # attach class weights

    # start training
    # caching val_data when you have plenty of memory(RAM)
    # coco = None
    coco = get_coco_api_from_dataset(val_dataset)

    print("starting traning for %g epochs..." % epochs)
    print('Using %g dataloader workers' % nw)
    for epoch in range(start_epoch, epochs):
        mloss, lr = train_util.train_one_epoch(
            model,
            optimizer,
            train_dataloader,
            device,
            epoch,
            accumulate=accumulate,  # 迭代多少batch才训练完64张图片
            img_size=imgsz_train,  # 输入图像的大小
            multi_scale=multi_scale,
            grid_min=grid_min,  # grid的最小尺寸
            grid_max=grid_max,  # grid的最大尺寸
            gs=gs,  # grid step: 32
            print_freq=50,  # 每训练多少个step打印一次信息
            warmup=True)
        # update scheduler
        scheduler.step()

        if opt.notest is False or epoch == epochs - 1:
            # evaluate on the test dataset
            result_info = train_util.evaluate(model,
                                              val_datasetloader,
                                              coco=coco,
                                              device=device)

            coco_mAP = result_info[0]
            voc_mAP = result_info[1]
            coco_mAR = result_info[8]

            # write into tensorboard
            if tb_writer:
                tags = [
                    'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                    'train/loss', "learning_rate", "mAP@[IoU=0.50:0.95]",
                    "mAP@[IoU=0.5]", "mAR@[IoU=0.50:0.95]"
                ]

                for x, tag in zip(
                        mloss.tolist() + [lr, coco_mAP, voc_mAP, coco_mAR],
                        tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # write into txt
            with open(results_file, "a") as f:
                result_info = [str(round(i, 4)) for i in result_info]
                txt = "epoch:{} {}".format(epoch, '  '.join(result_info))
                f.write(txt + "\n")

            # update best mAP(IoU=0.50:0.95)
            if coco_mAP > best_map:
                best_map = coco_mAP

            if opt.savebest is False:
                # save weights every epoch
                with open(results_file, 'r') as f:
                    save_files = {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'training_results': f.read(),
                        'epoch': epoch,
                        'best_map': best_map
                    }
                    torch.save(save_files,
                               "./weights/yolov3spp-{}.pt".format(epoch))
            else:
                # only save best weights
                if best_map == coco_mAP:
                    with open(results_file, 'r') as f:
                        save_files = {
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'training_results': f.read(),
                            'epoch': epoch,
                            'best_map': best_map
                        }
                        torch.save(save_files, best.format(epoch))
def main(parser_data):
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # read class_indict
    label_json_path = './data/pascal_voc_classes.json'
    assert os.path.exists(label_json_path), "json file {} dose not exist.".format(label_json_path)
    json_file = open(label_json_path, 'r')
    class_dict = json.load(json_file)
    category_index = {v: k for k, v in class_dict.items()}

    data_dict = parse_data_cfg(parser_data.data)
    test_path = data_dict["valid"]

    # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch
    batch_size = parser_data.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)

    # load validation data set
    val_dataset = LoadImagesAndLabels(test_path, parser_data.img_size, batch_size,
                                      hyp=parser_data.hyp,
                                      rect=True)  # 将每个batch的图像调整到合适大小,可减少运算量(并不是512x512标准尺寸)

    val_dataset_loader = torch.utils.data.DataLoader(val_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=nw,
                                                     pin_memory=True,
                                                     collate_fn=val_dataset.collate_fn)

    # create model
    model = Darknet(parser_data.cfg, parser_data.img_size)
    model.load_state_dict(torch.load(parser_data.weights, map_location=device)["model"])
    model.to(device)

    # evaluate on the test dataset
    coco = get_coco_api_from_dataset(val_dataset)
    iou_types = ["bbox"]
    coco_evaluator = CocoEvaluator(coco, iou_types)
    cpu_device = torch.device("cpu")

    model.eval()
    with torch.no_grad():
        for imgs, targets, paths, shapes, img_index in tqdm(val_dataset_loader, desc="validation..."):
            imgs = imgs.to(device).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0

            pred = model(imgs)[0]  # only get inference result
            pred = non_max_suppression(pred, conf_thres=0.01, iou_thres=0.6, multi_label=False)

            outputs = []
            for index, p in enumerate(pred):
                if p is None:
                    p = torch.empty((0, 6), device=cpu_device)
                    boxes = torch.empty((0, 4), device=cpu_device)
                else:
                    # xmin, ymin, xmax, ymax
                    boxes = p[:, :4]
                    # shapes: (h0, w0), ((h / h0, w / w0), pad)
                    # 将boxes信息还原回原图尺度,这样计算的mAP才是准确的
                    boxes = scale_coords(imgs[index].shape[1:], boxes, shapes[index][0]).round()

                # 注意这里传入的boxes格式必须是xmin, ymin, xmax, ymax,且为绝对坐标
                info = {"boxes": boxes.to(cpu_device),
                        "labels": p[:, 5].to(device=cpu_device, dtype=torch.int64),
                        "scores": p[:, 4].to(cpu_device)}
                outputs.append(info)

            res = {img_id: output for img_id, output in zip(img_index, outputs)}

            coco_evaluator.update(res)

    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()

    coco_eval = coco_evaluator.coco_eval["bbox"]
    # calculate COCO info for all classes
    coco_stats, print_coco = summarize(coco_eval)

    # calculate voc info for every classes(IoU=0.5)
    voc_map_info_list = []
    for i in range(len(category_index)):
        stats, _ = summarize(coco_eval, catId=i)
        voc_map_info_list.append(" {:15}: {}".format(category_index[i + 1], stats[1]))

    print_voc = "\n".join(voc_map_info_list)
    print(print_voc)

    # 将验证结果保存至txt文件中
    with open("record_mAP.txt", "w") as f:
        record_lines = ["COCO results:",
                        print_coco,
                        "",
                        "mAP(IoU=0.5) for each category:",
                        print_voc]
        f.write("\n".join(record_lines))
Пример #5
0
def main(parser_data):
    device = torch.device(
        parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    if not os.path.exists("save_weights"):
        os.mkdir("save_weights")

    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_transform = {
        "train":
        transforms.Compose([
            transforms.SSDCropping(),
            transforms.Resize(),
            transforms.ColorJitter(),
            transforms.ToTensor(),
            transforms.RandomHorizontalFlip(),
            transforms.Normalization(),
            transforms.AssignGTtoDefaultBox()
        ]),
        "val":
        transforms.Compose([
            transforms.Resize(),
            transforms.ToTensor(),
            transforms.Normalization()
        ])
    }

    VOC_root = parser_data.data_path
    # check voc root
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_dataset = VOCDataSet(VOC_root,
                               "2012",
                               data_transform['train'],
                               train_set='train.txt')
    # 注意训练时,batch_size必须大于1
    batch_size = parser_data.batch_size
    assert batch_size > 1, "batch size must be greater than 1"
    # 防止最后一个batch_size=1,如果最后一个batch_size=1就舍去
    drop_last = True if len(train_dataset) % batch_size == 1 else False
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=nw,
        collate_fn=train_dataset.collate_fn,
        drop_last=drop_last)

    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_dataset = VOCDataSet(VOC_root,
                             "2012",
                             data_transform['val'],
                             train_set='val.txt')
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=nw,
        collate_fn=train_dataset.collate_fn)

    model = create_model(num_classes=args.num_classes + 1)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.0005,
                                momentum=0.9,
                                weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.3)

    # 如果指定了上次训练保存的权重文件地址,则接着上次结果接着训练
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(
            parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []

    # 提前加载验证集数据,以免每次验证时都要重新加载一次数据,节省时间
    val_data = get_coco_api_from_dataset(val_data_loader.dataset)
    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        mean_loss, lr = utils.train_one_epoch(model=model,
                                              optimizer=optimizer,
                                              data_loader=train_data_loader,
                                              device=device,
                                              epoch=epoch,
                                              print_freq=50)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update learning rate
        lr_scheduler.step()

        coco_info = utils.evaluate(model=model,
                                   data_loader=val_data_loader,
                                   device=device,
                                   data_set=val_data)

        # write into txt
        with open(results_file, "a") as f:
            # 写入的数据包括coco指标还有loss和learning rate
            result_info = [
                str(round(i, 4)) for i in coco_info + [mean_loss.item()]
            ] + [str(round(lr, 6))]
            txt = "epoch:{} {}".format(epoch, '  '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch
        }
        torch.save(save_files, "./save_weights/ssd300-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)
Пример #6
0
def main(opt, hyp):
    # 初始化各进程
    init_distributed_mode(opt)

    if opt.rank in [-1, 0]:
        print(opt)
        print(
            'Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/'
        )
        tb_writer = SummaryWriter(comment=opt.name)

    device = torch.device(opt.device)
    if "cuda" not in device.type:
        raise EnvironmentError("not find GPU device for training.")

    # 使用DDP后会对每个device上的gradients取均值,所以需要放大学习率
    hyp["lr0"] *= max(1., opt.world_size * opt.batch_size / 64)

    wdir = "weights" + os.sep  # weights dir
    best = wdir + "best.pt"
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    cfg = opt.cfg
    data = opt.data
    epochs = opt.epochs
    batch_size = opt.batch_size
    # accumulate n times before optimizer update (bs 64)
    accumulate = max(round(64 / (opt.world_size * opt.batch_size)), 1)
    weights = opt.weights  # initial training weights
    imgsz_train = opt.img_size
    imgsz_test = opt.img_size  # test image sizes
    multi_scale = opt.multi_scale

    # Image sizes
    # 图像要设置成32的倍数
    gs = 32  # (pixels) grid size
    assert math.fmod(
        imgsz_test,
        gs) == 0, "--img-size %g must be a %g-multiple" % (imgsz_test, gs)
    grid_min, grid_max = imgsz_test // gs, imgsz_test // gs
    if multi_scale:
        imgsz_min = opt.img_size // 1.5
        imgsz_max = opt.img_size // 0.667

        # 将给定的最大,最小输入尺寸向下调整到32的整数倍
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs
        imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs)
        imgsz_train = imgsz_max  # initialize with max size
        if opt.rank in [-1, 0]:  # 只在第一个进程中显示打印信息
            print("Using multi_scale training, image range[{}, {}]".format(
                imgsz_min, imgsz_max))

    # configure run
    random.seed(0)  # 设置随机种子
    data_dict = parse_data_cfg(data)
    train_path = data_dict["train"]
    test_path = data_dict["valid"]
    nc = 1 if opt.single_cls else int(
        data_dict["classes"])  # number of classes
    hyp["cls"] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset
    hyp["obj"] *= imgsz_test / 320

    if opt.rank in [-1, 0]:
        # Remove previous results
        for f in glob.glob(results_file) + glob.glob("tmp.pk"):
            os.remove(f)

    # Initialize model
    model = Darknet(cfg).to(device)

    start_epoch = 0
    best_map = 0.0
    # 如果指定了预训练权重,则载入预训练权重
    if weights.endswith(".pt"):
        ckpt = torch.load(weights, map_location=device)

        # load model
        try:
            ckpt["model"] = {
                k: v
                for k, v in ckpt["model"].items()
                if model.state_dict()[k].numel() == v.numel()
            }
            model.load_state_dict(ckpt["model"], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        if opt.rank in [-1, 0]:
            # load results
            if ckpt.get("training_results") is not None:
                with open(results_file, "w") as file:
                    file.write(ckpt["training_results"])  # write results.txt

        # epochs
        start_epoch = ckpt["epoch"] + 1
        if epochs < start_epoch:
            print(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (opt.weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    # 是否冻结权重,只训练predictor的权重
    if opt.freeze_layers:
        # 索引减一对应的是predictor的索引,YOLOLayer并不是predictor
        output_layer_indices = [
            idx - 1 for idx, module in enumerate(model.module_list)
            if isinstance(module, YOLOLayer)
        ]
        # 冻结除predictor和YOLOLayer外的所有层
        freeze_layer_indeces = [
            x for x in range(len(model.module_list))
            if (x not in output_layer_indices) and (
                x - 1 not in output_layer_indices)
        ]
        # Freeze non-output layers
        # 总共训练3x2=6个parameters
        for idx in freeze_layer_indeces:
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)
    else:
        # 如果freeze_layer为False,默认仅训练除darknet53之后的部分
        # 若要训练全部权重,删除以下代码
        darknet_end_layer = 74  # only yolov3spp cfg
        # Freeze darknet53 layers
        # 总共训练21x3+3x2=69个parameters
        for idx in range(darknet_end_layer + 1):  # [0, 74]
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)

    # SyncBatchNorm
    # 如果只训练最后的predictor(其中不含bn层),SyncBatchNorm没有作用
    if opt.freeze_layers is False:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)

    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[opt.gpu])
    model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # optimizer
    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg,
                          lr=hyp["lr0"],
                          momentum=hyp["momentum"],
                          weight_decay=hyp["weight_decay"],
                          nesterov=True)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[
        "lrf"]) + hyp["lrf"]  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler.last_epoch = start_epoch  # 指定从哪个epoch开始

    # dataset
    # 训练集的图像尺寸指定为multi_scale_range中最大的尺寸
    # Make sure only the first process in DDP process the dataset first, and the following others can use the cache.
    with torch_distributed_zero_first(opt.rank):
        train_dataset = LoadImagesAndLabels(
            train_path,
            imgsz_train,
            batch_size,
            augment=True,
            hyp=hyp,  # augmentation hyperparameters
            rect=opt.rect,  # rectangular training
            cache_images=opt.cache_images,
            single_cls=opt.single_cls,
            rank=opt.rank)
        # 验证集的图像尺寸指定为img_size(512)
        val_dataset = LoadImagesAndLabels(test_path,
                                          imgsz_test,
                                          batch_size,
                                          hyp=hyp,
                                          cache_images=opt.cache_images,
                                          single_cls=opt.single_cls,
                                          rank=opt.rank)

    # 给每个rank对应的进程分配训练的样本索引
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset)
    val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
    # 将样本索引每batch_size个元素组成一个list
    train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                        batch_size,
                                                        drop_last=True)

    # dataloader
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    if opt.rank in [-1, 0]:
        print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_sampler=train_batch_sampler,
        num_workers=nw,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn)

    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        sampler=val_sampler,
        num_workers=nw,
        pin_memory=True,
        collate_fn=val_dataset.collate_fn)

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)

    # start training
    # caching val_data when you have plenty of memory(RAM)
    with torch_distributed_zero_first(opt.rank):
        if os.path.exists("tmp.pk") is False:
            coco = get_coco_api_from_dataset(val_dataset)
            with open("tmp.pk", "wb") as f:
                pickle.dump(coco, f)
        else:
            with open("tmp.pk", "rb") as f:
                coco = pickle.load(f)

    if opt.rank in [-1, 0]:
        print("starting traning for %g epochs..." % epochs)
        print('Using %g dataloader workers' % nw)

    start_time = time.time()
    for epoch in range(start_epoch, epochs):
        train_sampler.set_epoch(epoch)
        mloss, lr = train_util.train_one_epoch(
            model,
            optimizer,
            train_data_loader,
            device,
            epoch,
            accumulate=accumulate,  # 迭代多少batch才训练完64张图片
            img_size=imgsz_train,  # 输入图像的大小
            multi_scale=multi_scale,
            grid_min=grid_min,  # grid的最小尺寸
            grid_max=grid_max,  # grid的最大尺寸
            gs=gs,  # grid step: 32
            print_freq=50,  # 每训练多少个step打印一次信息
            warmup=True)
        # update scheduler
        scheduler.step()

        if opt.notest is False or epoch == epochs - 1:
            # evaluate on the test dataset
            result_info = train_util.evaluate(model,
                                              val_data_loader,
                                              coco=coco,
                                              device=device)

            # only first process in DDP process to record info and save weights
            if opt.rank in [-1, 0]:
                coco_mAP = result_info[0]
                voc_mAP = result_info[1]
                coco_mAR = result_info[8]

                # write into tensorboard
                if tb_writer:
                    tags = [
                        'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                        'train/loss', "learning_rate", "mAP@[IoU=0.50:0.95]",
                        "mAP@[IoU=0.5]", "mAR@[IoU=0.50:0.95]"
                    ]

                    for x, tag in zip(
                            mloss.tolist() + [lr, coco_mAP, voc_mAP, coco_mAR],
                            tags):
                        tb_writer.add_scalar(tag, x, epoch)

                # write into txt
                with open(results_file, "a") as f:
                    # 记录coco的12个指标加上训练总损失和lr
                    result_info = [
                        str(round(i, 4))
                        for i in result_info + [mloss.tolist()[-1]]
                    ] + [str(round(lr, 6))]
                    txt = "epoch:{} {}".format(epoch, '  '.join(result_info))
                    f.write(txt + "\n")

                # update best mAP(IoU=0.50:0.95)
                if coco_mAP > best_map:
                    best_map = coco_mAP

                if opt.savebest is False:
                    # save weights every epoch
                    with open(results_file, 'r') as f:
                        save_files = {
                            'model': model.module.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'training_results': f.read(),
                            'epoch': epoch,
                            'best_map': best_map
                        }
                        torch.save(save_files,
                                   "./weights/yolov3spp-{}.pt".format(epoch))
                else:
                    # only save best weights
                    if best_map == coco_mAP:
                        with open(results_file, 'r') as f:
                            save_files = {
                                'model': model.module.state_dict(),
                                'optimizer': optimizer.state_dict(),
                                'training_results': f.read(),
                                'epoch': epoch,
                                'best_map': best_map
                            }
                            torch.save(save_files, best.format(epoch))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    if opt.rank in [-1, 0]:
        print('Training time {}'.format(total_time_str))
def evaluate(model, data_loader, device, data_set=None):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = "Test: "

    if data_set is None:
        data_set = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(data_set, iou_types)

    for images, targets in metric_logger.log_every(data_loader, 100, header):
        images = torch.stack(images, dim=0).to(device)

        if device != torch.device("cpu"):
            torch.cuda.synchronize(device)

        model_time = time.time()
        #  list((bboxes_out, labels_out, scores_out), ...)
        results = model(images, targets=None)
        model_time = time.time() - model_time

        outputs = []
        for index, (bboxes_out, labels_out, scores_out) in enumerate(results):
            # 将box的相对坐标信息(0-1)转为绝对值坐标(xmin, ymin, xmax, ymax)
            height_width = targets[index]["height_width"]
            # 还原回原图尺度
            bboxes_out[:, [0, 2]] = bboxes_out[:, [0, 2]] * height_width[1]
            bboxes_out[:, [1, 3]] = bboxes_out[:, [1, 3]] * height_width[0]

            info = {
                "boxes": bboxes_out.to(cpu_device),
                "labels": labels_out.to(cpu_device),
                "scores": scores_out.to(cpu_device)
            }
            outputs.append(info)

        res = {
            target["image_id"].item(): output
            for target, output in zip(targets, outputs)
        }

        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time,
                             evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    torch.set_num_threads(n_threads)

    coco_info = coco_evaluator.coco_eval[
        iou_types[0]].stats.tolist()  # numpy to list

    return coco_info