Example #1
File: val_best.py Project: bmhopkinson/CSN
    def __init__(self,args):
        warnings.filterwarnings('ignore')
        assert torch.cuda.is_available()
        torch.backends.cudnn.benchmark = True
        model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(args.backbone, args.dataset, args.exp)
        if args.dataset == 'pascal':
            raise NotImplementedError
        elif args.dataset == 'cityscapes':
            kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
            dataset_loader, num_classes = dataloaders.make_data_loader(args, **kwargs)
            args.num_classes = num_classes
        elif args.dataset == 'marsh':
            kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
            dataset_loader, val_loader, test_loader, num_classes = dataloaders.make_data_loader(args, **kwargs)
            args.num_classes = num_classes
        else:
            raise ValueError('Unknown dataset: {}'.format(args.dataset))

        if args.backbone == 'autodeeplab':
            model = Retrain_Autodeeplab(args)
            model.load_state_dict(torch.load(r"./run/marsh/deeplab-autodeeplab/model_best.pth.tar")['state_dict'], strict=False)
        else:
            raise ValueError('Unknown backbone: {}'.format(args.backbone))
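
Note that the checkpoint above is restored with strict=False, which silently ignores mismatched keys. A minimal sketch of how such a load could report what was skipped (the helper name is hypothetical; only the return value of load_state_dict is standard PyTorch):

import torch

def load_checkpoint_report(model, ckpt_path):
    # Restore weights with strict=False and surface any key mismatches
    # instead of ignoring them silently.
    state = torch.load(ckpt_path, map_location='cpu')['state_dict']
    result = model.load_state_dict(state, strict=False)
    if result.missing_keys:
        print('missing keys:', result.missing_keys)
    if result.unexpected_keys:
        print('unexpected keys:', result.unexpected_keys)
    return model
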
Example #2
def evaluate():
    # setup

    warnings.filterwarnings('ignore')
    cfg = config_factory['resnet_cityscapes']
    args = obtain_retrain_autodeeplab_args()
    if args.local_rank != -1:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend='nccl',
                                init_method='tcp://127.0.0.1:{}'.format(
                                    cfg.port),
                                world_size=torch.cuda.device_count(),
                                rank=args.local_rank)
        setup_logger(cfg.respth)
    else:
        FORMAT = '%(levelname)s %(filename)s(%(lineno)d): %(message)s'
        log_level = logging.INFO
        if dist.is_initialized() and dist.get_rank() != 0:
            log_level = logging.ERROR
        logging.basicConfig(level=log_level, format=FORMAT, stream=sys.stdout)
    logger = logging.getLogger()

    # model
    logger.info('setup and restore model')
    net = Retrain_Autodeeplab(args)

    save_pth = osp.join(cfg.respth, 'model_final.pth')
    net.load_state_dict(torch.load(save_pth))
    net.cuda()
    net.eval()
    if args.local_rank != -1:
        net = nn.parallel.DistributedDataParallel(
            net, device_ids=[
                args.local_rank,
            ], output_device=args.local_rank)

    # evaluator
    logger.info('compute the mIOU')
    evaluator = MscEval(cfg, args)
    mIOU = evaluator(net)
    logger.info('mIOU is: {:.6f}'.format(mIOU))
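
The mIOU reported above is computed by MscEval, which is not part of this snippet. As a rough, assumed illustration of what such an evaluator reduces to, per-class IoU over flattened prediction and label maps can be sketched as:

import numpy as np

def mean_iou_sketch(pred, label, num_classes, ignore_index=255):
    # pred and label are integer class maps of the same shape.
    pred, label = np.asarray(pred).ravel(), np.asarray(label).ravel()
    valid = label != ignore_index
    pred, label = pred[valid], label[valid]
    ious = []
    for c in range(num_classes):
        inter = np.sum((pred == c) & (label == c))
        union = np.sum((pred == c) | (label == c))
        if union > 0:
            ious.append(inter / union)
    return float(np.mean(ious)) if ious else 0.0
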
Example #3
def main():
    warnings.filterwarnings('ignore')
    assert torch.cuda.is_available()
    torch.backends.cudnn.benchmark = True
    args = obtain_retrain_autodeeplab_args()
    args.data_dict = {}
    model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(
        args.backbone, args.dataset, args.exp)
    if args.dataset == 'pascal':
        raise NotImplementedError
    elif args.dataset == 'cityscapes':
        kwargs = {
            'num_workers': args.workers,
            'pin_memory': True,
            'drop_last': True
        }
        dataset_loader, num_classes = make_data_loader(args, **kwargs)
        args.num_classes = num_classes
    elif args.dataset == '2d':
        args.data_dict, args.num_classes = make_data_loader(args)
    elif args.dataset == '3d':
        args.data_dict, args.num_classes = make_data_loader_3d_patch(args)

    else:
        raise ValueError('Unknown dataset: {}'.format(args.dataset))

    if args.backbone == 'autodeeplab':
        model = Retrain_Autodeeplab(args)
    else:
        raise ValueError('Unknown backbone: {}'.format(args.backbone))

    if args.criterion == 'Ohem':
        args.thresh = 0.7
        args.crop_size = [args.crop_size, args.crop_size] if isinstance(
            args.crop_size, int) else args.crop_size
        args.n_min = int((args.batch_size / len(args.gpu) * args.crop_size[0] *
                          args.crop_size[1]) // 16)
    criterion = build_criterion(args)

    model = nn.DataParallel(model).cuda()
    model.train()
    if args.freeze_bn:
        for m in model.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                m.weight.requires_grad = False
                m.bias.requires_grad = False
    optimizer = optim.SGD(model.module.parameters(),
                          lr=args.base_lr,
                          momentum=0.9,
                          weight_decay=0.0001)

    max_iteration = args.data_dict['num_train'] * args.epochs
    scheduler = Iter_LR_Scheduler(args, max_iteration,
                                  args.data_dict['num_train'])
    start_epoch = 0

    if args.resume:
        if os.path.isfile(args.resume):
            print('=> loading checkpoint {0}'.format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print('=> loaded checkpoint {0} (epoch {1})'.format(
                args.resume, checkpoint['epoch']))
        else:
            raise ValueError('=> no checkpoint found at {0}'.format(
                args.resume))

    for epoch in range(start_epoch, args.epochs):
        losses = AverageMeter()
        for i in range(args.data_dict['num_train']):
            cur_iter = epoch * args.data_dict['num_train'] + i
            scheduler(optimizer, cur_iter)
            inputs = torch.FloatTensor(args.data_dict['train_data'][i]).cuda()
            target = torch.FloatTensor(args.data_dict['train_mask'][i]).cuda()
            outputs = model(inputs)
            loss = criterion(outputs, target)
            if np.isnan(loss.item()) or np.isinf(loss.item()):
                pdb.set_trace()
            losses.update(loss.item(), args.batch_size)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            print('epoch: {0}\t'
                  'iter: {1}/{2}\t'
                  'lr: {3:.6f}\t'
                  'loss: {loss.val:.4f} ({loss.ema:.4f})'.format(
                      epoch + 1,
                      i + 1,
                      args.data_dict['num_train'],
                      scheduler.get_lr(optimizer),
                      loss=losses))

        if epoch < args.epochs - 50:
            if epoch % 50 == 0:
                torch.save(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }, model_fname % (epoch + 1))
        else:
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, model_fname % (epoch + 1))

        if epoch % 2 == 0:
            # run validation and test every other epoch
            validation(epoch, model, args, criterion, args.num_classes)
            validation(epoch,
                       model,
                       args,
                       criterion,
                       args.num_classes,
                       test_tag=True)
        print('reset local total loss!')
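
When args.criterion == 'Ohem', n_min works out to (args.batch_size / len(args.gpu)) * crop_size[0] * crop_size[1] // 16, i.e. at least one sixteenth of the pixels in a step are always kept as hard examples. build_criterion itself is external to this snippet; a minimal sketch of an OHEM cross-entropy loss along those lines (an assumption, not the project's implementation) is:

import torch
import torch.nn as nn
import torch.nn.functional as F

class OhemCrossEntropySketch(nn.Module):
    # Online hard example mining over per-pixel cross-entropy (assumed sketch).
    def __init__(self, thresh=0.7, n_min=100000, ignore_index=255):
        super().__init__()
        # Pixels with loss above -log(thresh) count as "hard".
        self.loss_thresh = -torch.log(torch.tensor(thresh, dtype=torch.float))
        self.n_min = n_min
        self.ignore_index = ignore_index

    def forward(self, logits, labels):
        # Per-pixel losses, sorted hardest first.
        loss = F.cross_entropy(logits, labels, ignore_index=self.ignore_index,
                               reduction='none').reshape(-1)
        loss, _ = torch.sort(loss, descending=True)
        n_min = min(self.n_min, loss.numel() - 1)
        # Keep everything harder than the threshold, but never fewer than n_min pixels.
        if loss[n_min] > self.loss_thresh:
            loss = loss[loss > self.loss_thresh]
        else:
            loss = loss[:n_min]
        return loss.mean()
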
Example #4
def main(start_epoch, epochs):
    assert torch.cuda.is_available(), 'no CUDA device available'
    if not osp.exists('data/'):
        os.mkdir('data/')
    if not osp.exists('log/'):
        os.mkdir('log/')

    args = obtain_evaluate_args()
    torch.backends.cudnn.benchmark = True
    model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(
        'autodeeplab', 'cityscapes', 'bnlr7e-3')

    if args.dataset == 'cityscapes':
        dataset = CityscapesSegmentation(args=args,
                                         root=Path.db_root_dir(args.dataset),
                                         split='reval')
        print(dataset)

    else:
        raise NotImplementedError

    if args.backbone == 'autodeeplab':
        model = Retrain_Autodeeplab(args)
    else:
        raise ValueError('Unknown backbone: {}'.format(args.backbone))

    if not args.train:
        val_dataloader = DataLoader(dataset, batch_size=16, shuffle=False)
        model = torch.nn.DataParallel(model).cuda()
        print("======================start evaluate=======================")

        for epoch in range(0, args.epochs):
            print("evaluate epoch {:}".format(epoch + start_epoch))
            checkpoint_name = model_fname % (epoch + start_epoch)
            print(checkpoint_name)
            checkpoint = torch.load(checkpoint_name)

            ## TODO: figure out why the two commented-out lines below are not used
            #state_dict = {k[1:]: v for k, v in checkpoint['state_dict'].items() if 'tracked' not in k}
            #model.module.load_state_dict(state_dict)

            inter_meter = AverageMeter()
            union_meter = AverageMeter()
            for i, sample in enumerate(val_dataloader):
                inputs, target = sample['image'], sample['label']

                # crop the target to work around CUDA out-of-memory errors
                target = target[:, :200, :400]

                N, H, W = target.shape

                total_outputs = torch.zeros(
                    (N, dataset.NUM_CLASSES, H, W)).cuda()
                with torch.no_grad():
                    for j, scale in enumerate(args.eval_scales):
                        new_scale = [int(H * scale), int(W * scale)]
                        inputs = F.interpolate(inputs,
                                               new_scale,
                                               mode='bilinear',
                                               align_corners=True)
                        inputs = inputs.cuda()
                        outputs = model(inputs)
                        outputs = F.interpolate(outputs, (H, W),
                                                mode='bilinear',
                                                align_corners=True)
                        total_outputs += outputs
                    _, pred = torch.max(total_outputs, 1)
                    pred = pred.detach().cpu().numpy().squeeze().astype(
                        np.uint8)
                    mask = target.numpy().astype(np.uint8)
                    print('eval: {0}/{1}'.format(i + 1, len(val_dataloader)))

                    inter, union = inter_and_union(pred, mask,
                                                   len(dataset.CLASSES))
                    inter_meter.update(inter)
                    union_meter.update(union)
            iou = inter_meter.sum / (union_meter.sum + 1e-10)
            miou = 'epoch: {0} Mean IoU: {1:.2f}'.format(
                epoch,
                iou.mean() * 100)
            with open('log/result.txt', 'a') as f:
                for i, val in enumerate(iou):
                    class_iou = 'IoU {0}: {1:.2f}\n'.format(
                        dataset.CLASSES[i], val * 100)
                    f.write(class_iou)
                f.write('\n')
                f.write(miou)
                f.write('\n')
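
inter_meter, union_meter and the losses meter above rely on an AverageMeter helper that is not shown in these snippets. A hypothetical minimal version (the ema field printed in Example #3 is assumed to be an exponential moving average) could look like:

class AverageMeterSketch:
    # Tracks the latest value, a running sum/count/average, and an EMA (assumed).
    def __init__(self, ema_decay=0.99):
        self.ema_decay = ema_decay
        self.val = 0
        self.sum = 0
        self.count = 0
        self.avg = 0
        self.ema = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum = self.sum + val * n      # works for scalars and numpy arrays alike
        self.count += n
        self.avg = self.sum / self.count
        self.ema = self.ema_decay * self.ema + (1.0 - self.ema_decay) * val
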
Example #5
def main():
    args = obtain_retrain_autodeeplab_args()
    torch.cuda.set_device(args.local_rank)
    cfg = config_factory['resnet_cityscapes']
    if not os.path.exists(cfg.respth):
        os.makedirs(cfg.respth)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:{}'.format(cfg.port),
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
    setup_logger(cfg.respth)
    logger = logging.getLogger()
    rand_seed = random.randint(0, args.manualSeed)
    prepare_seed(rand_seed)
    if args.local_rank == 0:
        log_string = 'seed-{}-time-{}'.format(rand_seed, time_for_file())
        train_logger = Logger(args, log_string)
        train_logger.log('Arguments : -------------------------------')
        for name, value in args._get_kwargs():
            train_logger.log('{:16} : {:}'.format(name, value))
        train_logger.log("Python  version : {}".format(
            sys.version.replace('\n', ' ')))
        train_logger.log("Pillow  version : {}".format(PIL.__version__))
        train_logger.log("PyTorch version : {}".format(torch.__version__))
        train_logger.log("cuDNN   version : {}".format(
            torch.backends.cudnn.version()))
        train_logger.log("random_seed : {}".format(rand_seed))
        if args.checkname is None:
            args.checkname = 'deeplab-' + str(args.backbone)
    # dataset
    kwargs = {
        'num_workers': args.workers,
        'pin_memory': True,
        'drop_last': True
    }
    train_loader, args.num_classes, sampler = make_data_loader(args=args,
                                                               **kwargs)
    # model
    model = Retrain_Autodeeplab(args)
    model.train()
    model.cuda()
    model = nn.parallel.DistributedDataParallel(
        model,
        device_ids=[
            args.local_rank,
        ],
        output_device=args.local_rank,
        find_unused_parameters=True).cuda()
    n_min = cfg.ims_per_gpu * cfg.crop_size[0] * cfg.crop_size[1] // 16
    criterion = OhemCELoss(thresh=cfg.ohem_thresh, n_min=n_min).cuda()
    max_iteration = int(cfg.max_epoch * len(train_loader))
    #     max_iteration = int(1500000 * 4 // cfg.gpus)
    it = 0
    # optimizer
    optimizer = Optimizer(model, cfg.lr_start, cfg.momentum, cfg.weight_decay,
                          cfg.warmup_steps, cfg.warmup_start_lr, max_iteration,
                          cfg.lr_power)
    if dist.get_rank() == 0:
        print('====== optimizer launched successfully, max_iteration {:} ======'.format(
            max_iteration))

    # train loop
    loss_avg = []
    start_time = glob_start_time = time.time()
    # for it in range(cfg.max_iter):
    if args.resume is not None:
        checkpoint = torch.load(args.resume, map_location='cpu')
        if checkpoint['iter'] is not None:
            args.train_mode = 'iter'
            start_iter = checkpoint['iter']
            n_epoch = checkpoint['epoch']
        elif checkpoint['epoch'] is not None:
            args.train_mode = 'epoch'
            # restore the epoch counter so the epoch-mode loop can resume correctly
            start_epoch = checkpoint['epoch']
        model.module.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'], checkpoint['iter'])

    else:
        if args.train_mode == 'iter':
            start_iter = 0
            n_epoch = 0
        elif args.train_mode == 'epoch':
            start_epoch = 0

    if args.train_mode == 'iter':

        diter = iter(train_loader)
        for it in range(start_iter, cfg.max_iter):
            try:
                sample = next(diter)
            except StopIteration:
                n_epoch += 1
                sampler.set_epoch(n_epoch)
                diter = iter(train_loader)
                sample = next(diter)

            im, lb = sample['image'].cuda(), sample['label'].cuda()
            lb = torch.squeeze(lb, 1)

            optimizer.zero_grad()
            logits = model(im)
            loss = criterion(logits, lb)
            loss.backward()
            optimizer.step()

            loss_avg.append(loss.item())
            # print training log message

            if it % cfg.msg_iter == 0 and it != 0 and dist.get_rank() == 0:
                loss_avg = sum(loss_avg) / len(loss_avg)
                lr = optimizer.lr
                ed = time.time()
                t_intv, glob_t_intv = ed - start_time, ed - glob_start_time
                eta = int((max_iteration - it) * (glob_t_intv / it))
                eta = str(datetime.timedelta(seconds=eta))
                msg = ', '.join([
                    'iter: {it}/{max_iteration}',
                    'lr: {lr:4f}',
                    'loss: {loss:.4f}',
                    'eta: {eta}',
                    'time: {time:.4f}',
                ]).format(it=it,
                          max_iteration=max_iteration,
                          lr=lr,
                          loss=loss_avg,
                          time=t_intv,
                          eta=eta)
                # TODO: logger.info errors once iter exceeds 350000, so fall back to print
                if max_iteration > 350000:
                    print(msg)
                else:
                    logger.info(msg)
                loss_avg = []
            it += 1

            if (cfg.msg_iter is not None) and (it % cfg.msg_iter
                                               == 0) and (it != 0):
                if args.verbose:
                    logger.info('evaluating the model of iter:{}'.format(it))
                    model.eval()
                    evaluator = MscEval(cfg, args)
                    mIOU, loss = evaluator(model,
                                           criteria=criterion,
                                           multi_scale=False)
                    logger.info('mIOU is: {}, loss_eval is {}'.format(
                        mIOU, loss))

                model.cpu()
                save_name = 'iter_{}_naive_model.pth'.format(it)
                save_pth = osp.join(cfg.respth, save_name)
                state = model.module.state_dict() if hasattr(
                    model, 'module') else model.state_dict()

                checkpoint = {
                    'state_dict': state,
                    'epoch': n_epoch,
                    'iter': it,
                    'optimizer': optimizer.optim.state_dict()
                }
                if dist.get_rank() == 0:
                    torch.save(state, save_pth)
                logger.info('model of iter {} saved to: {}'.format(
                    it, save_pth))
                model.cuda()
                model.train()

    elif args.train_mode == 'epoch':
        for epoch in range(start_epoch, cfg.max_epoch):
            for i, sample in enumerate(train_loader):
                # keep the global iteration counter in step so the logging and
                # checkpoint code below reports meaningful values
                it = epoch * len(train_loader) + i + 1
                im = sample['image'].cuda()
                lb = sample['label'].cuda()
                lb = torch.squeeze(lb, 1)

                optimizer.zero_grad()
                logits = model(im)
                loss = criterion(logits, lb)
                loss.backward()
                optimizer.step()

                loss_avg.append(loss.item())
                # print training log message

            if i % cfg.msg_iter == 0 and not (i == 0 and epoch
                                              == 0) and dist.get_rank() == 0:
                loss_avg = sum(loss_avg) / len(loss_avg)
                lr = optimizer.lr
                ed = time.time()
                t_intv, glob_t_intv = ed - start_time, ed - glob_start_time
                eta = int((max_iteration - it) * (glob_t_intv / it))
                eta = str(datetime.timedelta(seconds=eta))
                msg = ', '.join([
                    'iter: {it}/{max_iteration}',
                    'lr: {lr:4f}',
                    'loss: {loss:.4f}',
                    'eta: {eta}',
                    'time: {time:.4f}',
                ]).format(it=it,
                          max_iteration=max_iteration,
                          lr=lr,
                          loss=loss_avg,
                          time=t_intv,
                          eta=eta)
                logger.info(msg)
                loss_avg = []

            # save model and optimizer each epoch
            if args.verbose:
                logger.info('evaluating the model of iter:{}'.format(it))
                model.eval()
                evaluator = MscEval(cfg, args)
                mIOU, loss = evaluator(model,
                                       criteria=criterion,
                                       multi_scale=False)
                logger.info('mIOU is: {}, loss_eval is {}'.format(mIOU, loss))

            model.cpu()
            save_name = 'iter_{}_naive_model.pth'.format(it)
            save_pth = osp.join(cfg.respth, save_name)
            state = model.module.state_dict() if hasattr(
                model, 'module') else model.state_dict()

            checkpoint = {
                'state_dict': state,
                'epoch': n_epoch,
                'iter': it,
                'optimizer': optimizer.state_dict()
            }
            if dist.get_rank() == 0:
                torch.save(state, save_pth)
            logger.info('model of iter {} saved to: {}'.format(it, save_pth))
            model.cuda()
            model.train()

    else:
        raise NotImplementedError
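
The Optimizer wrapper used above (cfg.warmup_steps, cfg.warmup_start_lr, cfg.lr_power) is defined elsewhere; it appears to combine a linear warmup with polynomial decay. A hedged sketch of that schedule, with the formula assumed rather than taken from the project:

def poly_lr_sketch(it, lr_start, max_iteration, lr_power,
                   warmup_steps=0, warmup_start_lr=1e-5):
    # Linear warmup from warmup_start_lr to lr_start, then polynomial decay
    # lr_start * (1 - progress) ** lr_power over the remaining iterations.
    if it < warmup_steps:
        return warmup_start_lr + (lr_start - warmup_start_lr) * it / max(warmup_steps, 1)
    progress = (it - warmup_steps) / max(max_iteration - warmup_steps, 1)
    return lr_start * (1.0 - progress) ** lr_power
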
Example #6
    def __init__(self, args):
        warnings.filterwarnings('ignore')
        assert torch.cuda.is_available()
        torch.backends.cudnn.benchmark = True
        model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(
            args.backbone, args.dataset, args.exp)
        if args.dataset == 'pascal':
            raise NotImplementedError
        elif args.dataset == 'cityscapes':
            kwargs = {
                'num_workers': args.workers,
                'pin_memory': True,
                'drop_last': True
            }
            dataset_loader, num_classes = dataloaders.make_data_loader(
                args, **kwargs)
            args.num_classes = num_classes
        elif args.dataset == 'marsh':
            kwargs = {
                'num_workers': args.workers,
                'pin_memory': True,
                'drop_last': True
            }
            dataset_loader, val_loader, test_loader, num_classes = dataloaders.make_data_loader(
                args, **kwargs)
            args.num_classes = num_classes
        else:
            raise ValueError('Unknown dataset: {}'.format(args.dataset))

        if args.backbone == 'autodeeplab':
            model = Retrain_Autodeeplab(args)
        else:
            raise ValueError('Unknown backbone: {}'.format(args.backbone))

        if args.criterion == 'Ohem':
            args.thresh = 0.7
            args.crop_size = [args.crop_size, args.crop_size] if isinstance(
                args.crop_size, int) else args.crop_size
            args.n_min = int((args.batch_size / len(args.gpu) *
                              args.crop_size[0] * args.crop_size[1]) // 16)
        criterion = build_criterion(args)

        model = nn.DataParallel(model).cuda()
        model.train()
        if args.freeze_bn:
            for m in model.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    m.weight.requires_grad = False
                    m.bias.requires_grad = False
        optimizer = optim.SGD(model.module.parameters(),
                              lr=args.base_lr,
                              momentum=0.9,
                              weight_decay=0.0001)

        max_iteration = len(dataset_loader) * args.epochs
        scheduler = Iter_LR_Scheduler(args, max_iteration, len(dataset_loader))

        start_epoch = 0

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume:
            if os.path.isfile(args.resume):
                print('=> loading checkpoint {0}'.format(args.resume))
                checkpoint = torch.load(args.resume)
                start_epoch = checkpoint['epoch']
                model.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                print('=> loaded checkpoint {0} (epoch {1})'.format(
                    args.resume, checkpoint['epoch']))
                self.best_pred = checkpoint['best_pred']
            else:
                raise ValueError('=> no checkpoint found at {0}'.format(
                    args.resume))
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        #kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = dataset_loader, val_loader, test_loader, num_classes

        self.criterion = criterion
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        #self.scheduler = scheduler
        self.scheduler = LR_Scheduler(
            "poly", args.lr, args.epochs,
            len(self.train_loader))  # removed None from the second parameter