Example #1
0
def main():
    """Entry point for PSPNet semantic-segmentation training.

    Parses CLI args, builds the PSPNet model and an SGD optimizer (new
    layers at 10x base LR), optionally restores weights / resumes, then
    trains for ``args.epochs`` epochs, logging scalars to TensorBoard and
    saving periodic checkpoints. Supports optional distributed training.
    """
    global args, logger, writer
    args = get_parser().parse_args()
    logger = get_logger()
    writer = SummaryWriter(args.save_path)
    # os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(x) for x in args.gpu)

    if args.dist:
        dist_init(args.port, backend=args.backend)

    # Sync-BN is pointless on a single GPU.
    if len(args.gpu) == 1:
        args.syncbn = False
    logger.info(args)
    assert args.classes > 1
    assert args.zoom_factor in [1, 2, 4, 8]
    # PSPNet requires crop sizes of the form 8n + 1 so feature maps align.
    assert (args.crop_h - 1) % 8 == 0 and (args.crop_w - 1) % 8 == 0

    world_size = 1
    rank = 0
    if args.dist:
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    if rank == 0:
        logger.info('dist:{}'.format(args.dist))
        logger.info("=> creating model ...")
        logger.info("Classes: {}".format(args.classes))

    # bn_group > 1 enables synchronized BN over a communicator that splits
    # the world into world_size // bn_group process groups; note this
    # overrides whatever args.syncbn was set to above.
    if args.bn_group > 1:
        args.syncbn = True
        bn_sync_stats = True
        bn_group_comm = simple_group_split(world_size, rank,
                                           world_size // args.bn_group)
    else:
        args.syncbn = False
        bn_sync_stats = False
        bn_group_comm = None

    model = PSPNet(layers=args.layers,
                   classes=args.classes,
                   zoom_factor=args.zoom_factor,
                   syncbn=args.syncbn,
                   group_size=args.bn_group,
                   group=bn_group_comm,
                   sync_stats=bn_sync_stats)
    if rank == 0:
        logger.info(model)
    # Newly introduced layers (ppm, cls, aux) train at 10x the base LR;
    # backbone layers use the base LR.
    optimizer = torch.optim.SGD([{
        'params': model.layer0.parameters()
    }, {
        'params': model.layer1.parameters()
    }, {
        'params': model.layer2.parameters()
    }, {
        'params': model.layer3.parameters()
    }, {
        'params': model.layer4.parameters()
    }, {
        'params': model.ppm.parameters(),
        'lr': args.base_lr * 10
    }, {
        'params': model.cls.parameters(),
        'lr': args.base_lr * 10
    }, {
        'params': model.aux.parameters(),
        'lr': args.base_lr * 10
    }],
                                lr=args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # .cuda() moves parameters in place, so the optimizer's references
    # (captured above) remain valid.
    model = model.cuda()

    cudnn.enabled = True
    cudnn.benchmark = True
    # NLLLoss: presumably the model emits log-probabilities (log_softmax)
    # — TODO confirm against PSPNet's forward().
    criterion = nn.NLLLoss(ignore_index=args.ignore_label)

    if args.weight:
        # Load initial weights only (no optimizer/epoch state).

        def map_func(storage, location):
            return storage.cuda()

        if os.path.isfile(args.weight):
            logger.info("=> loading weight '{}'".format(args.weight))
            checkpoint = torch.load(args.weight, map_location=map_func)
            model.load_state_dict(checkpoint['state_dict'])
            logger.info("=> loaded weight '{}'".format(args.weight))
        else:
            logger.info("=> no weight found at '{}'".format(args.weight))

    if args.resume:
        # Resume restores model, optimizer, and the starting epoch.
        if os.path.isfile(args.resume):
            logger.info("=> loading checkpoint '{}'".format(args.resume))
            model, optimizer, args.start_epoch = restore_from(
                model, optimizer, args.resume)

            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, args.start_epoch))
        else:
            logger.info("=> no checkpoint found at '{}'".format(args.resume))

    if args.dist:
        # Make every rank start from rank 0's parameters.
        broadcast_params(model)

    # ImageNet mean/std scaled to the 0-255 pixel range.
    value_scale = 255
    mean = [0.485, 0.456, 0.406]
    mean = [item * value_scale for item in mean]
    std = [0.229, 0.224, 0.225]
    std = [item * value_scale for item in std]

    train_transform = transforms.Compose([
        transforms.RandScale([args.scale_min, args.scale_max]),
        transforms.RandRotate([args.rotate_min, args.rotate_max],
                              padding=mean,
                              ignore_label=args.ignore_label),
        transforms.RandomGaussianBlur(),
        transforms.RandomHorizontalFlip(),
        transforms.Crop([args.crop_h, args.crop_w],
                        crop_type='rand',
                        padding=mean,
                        ignore_label=args.ignore_label),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std)
    ])
    train_data = datasets.SegData(split='train',
                                  data_root=args.data_root,
                                  data_list=args.train_list,
                                  transform=train_transform)
    train_sampler = None
    if args.dist:
        train_sampler = DistributedSampler(train_data)

    # DistributedSampler already shuffles, so disable DataLoader shuffling
    # when a sampler is in use (the two are mutually exclusive).
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        shuffle=False if train_sampler else True,
        num_workers=args.workers,
        pin_memory=False,
        sampler=train_sampler)

    if args.evaluate:
        val_transform = transforms.Compose([
            transforms.Crop([args.crop_h, args.crop_w],
                            crop_type='center',
                            padding=mean,
                            ignore_label=args.ignore_label),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)
        ])
        val_data = datasets.SegData(split='val',
                                    data_root=args.data_root,
                                    data_list=args.val_list,
                                    transform=val_transform)
        val_sampler = None
        if args.dist:
            val_sampler = DistributedSampler(val_data)
        val_loader = torch.utils.data.DataLoader(
            val_data,
            batch_size=args.batch_size_val,
            shuffle=False,
            num_workers=args.workers,
            pin_memory=False,
            sampler=val_sampler)

    for epoch in range(args.start_epoch, args.epochs + 1):
        loss_train, mIoU_train, mAcc_train, allAcc_train = train(
            train_loader, model, criterion, optimizer, epoch, args.zoom_factor,
            args.batch_size, args.aux_weight)
        writer.add_scalar('loss_train', loss_train.cpu().numpy(), epoch)
        writer.add_scalar('mIoU_train', mIoU_train, epoch)
        writer.add_scalar('mAcc_train', mAcc_train, epoch)
        writer.add_scalar('allAcc_train', allAcc_train, epoch)
        # Writing parameter histograms costs lots of time:
        # for name, param in model.named_parameters():
        #     writer.add_histogram(name, param, epoch)

        # Only rank 0 validates; metrics here reflect its shard only.
        if args.evaluate and rank == 0:
            loss_val, mIoU_val, mAcc_val, allAcc_val = validate(
                val_loader, model, criterion, args.classes, args.zoom_factor)
            writer.add_scalar('loss_val', loss_val.cpu().numpy(), epoch)
            writer.add_scalar('mIoU_val', mIoU_val, epoch)
            writer.add_scalar('mAcc_val', mAcc_val, epoch)
            writer.add_scalar('allAcc_val', allAcc_val, epoch)

        if epoch % args.save_step == 0 and (rank == 0):
            filename = args.save_path + '/train_epoch_' + str(epoch) + '.pth'
            logger.info('Saving checkpoint to: ' + filename)
            # BUG FIX: the original called model.cpu().state_dict(), which
            # moves the live model's parameters to CPU in place and never
            # moves them back — after the first save, the model no longer
            # matches the CUDA optimizer state and training breaks. Copy
            # each tensor to CPU instead, leaving the model on the GPU.
            cpu_state = {k: v.cpu() for k, v in model.state_dict().items()}
            torch.save(
                {
                    'epoch': epoch,
                    'state_dict': cpu_state,
                    'optimizer': optimizer.state_dict()
                }, filename)
Example #2
0
def load_model(filepath):
    """Instantiate a PSPNet (without pretrained backbone) and restore
    weights from *filepath*.

    The checkpoint is deserialized with ``map_location='cpu'`` and the
    model is kept on the CPU; the caller is responsible for moving it to
    a device.
    """
    state = torch.load(filepath, map_location='cpu')
    model = PSPNet(pretrained=False)
    model.cpu()
    model.load_state_dict(state)
    return model