Example #1
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss_record = AvgMeter()
    for batch_idx, data in enumerate(train_loader):
        # drop both param-group LRs once epoch reaches a multiple of lr_step
        # (sets fixed values rather than multiplying, so re-running per batch is harmless)
        if epoch % args['lr_step'] == 0 and epoch != 0:
            optimizer.param_groups[0]['lr'] = 2 * args['lr'] / args['lr_decay']
            optimizer.param_groups[1]['lr'] = args['lr'] / args['lr_decay']
        inputs_volume, labels_volume = data
        sub_batch_len = math.ceil(inputs_volume.shape[1] /
                                  args['train_batch_size'])
        for sub_batch_idx in range(sub_batch_len):
            # split the volume into sub-batches
            start = sub_batch_idx * args['train_batch_size']
            end = min((sub_batch_idx + 1) * args['train_batch_size'],
                      inputs_volume.shape[1])
            inputs = inputs_volume[:, start:end, :, :].permute(1, 0, 2, 3)
            # replicate the single channel to 3 to match the network's input channels
            inputs = inputs.expand(inputs.shape[0], 3, inputs.shape[2],
                                   inputs.shape[3])
            # labels = labels_volume[:, start:end, :, :].permute(1, 0, 2, 3)
            labels = labels_volume[:, start:end, :, :].squeeze(0)
            # training trick: mark the borders of the first slice with 255
            # (the ignore index) so only its central 256x256 region is supervised
            tmp = np.array(labels)
            tmp[0, :128, :] = 255
            tmp[0, 384:, :] = 255
            tmp[0, :, :128] = 255
            tmp[0, :, 384:] = 255
            labels = torch.from_numpy(tmp)

            batch_size = inputs.size(0)
            inputs = inputs.cuda()  # Variable is a deprecated no-op wrapper in modern PyTorch
            labels = labels.cuda()
            optimizer.zero_grad()
            outputs = net(inputs)

            # pred=outputs.argmax(dim=1)
            # prediction = np.array(pred.detach().cpu())
            # l = np.array(labels.detach().cpu())
            # index = np.where(l==np.max(l))
            # a=np.count_nonzero(prediction)
            # b=np.count_nonzero(l)
            # print(str(a))
            # print(str(b))

            # OHEM cross-entropy; constructing it once outside the loop would also work
            criterion = MultiClassCriterion('OhemCrossEntropy')
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()
            train_loss_record.update(loss.item(), batch_size)
            log = 'epoch: %d | [Total loss: %.5f], [lr: %.8f]' % \
                  (epoch, train_loss_record.avg, optimizer.param_groups[1]['lr'])
            progress_bar(batch_idx, len(train_loader), log)
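All four examples call the project's MultiClassCriterion factory. Purely for orientation, here is a minimal, hypothetical sketch of such a wrapper; the constructor arguments (a positional loss_type plus ignore_index) are guessed from the call sites above, and plain cross-entropy stands in for the real OHEM variant:

import torch.nn as nn

class MultiClassCriterion(nn.Module):
    # hypothetical sketch: dispatch a segmentation loss by name
    def __init__(self, loss_type='CrossEntropy', ignore_index=255, **kwargs):
        super().__init__()
        if loss_type in ('CrossEntropy', 'OhemCrossEntropy'):
            # a real OHEM variant would keep only the hardest pixels per batch
            self.criterion = nn.CrossEntropyLoss(ignore_index=ignore_index, **kwargs)
        else:
            raise NotImplementedError(loss_type)

    def forward(self, preds, labels):
        # preds: (N, C, H, W) logits; labels: (N, H, W) class indices
        return self.criterion(preds, labels)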
Example #2
del data_config['dataset']

modelname = config_path.stem
output_dir = Path('../model') / modelname
output_dir.mkdir(exist_ok=True)
log_dir = Path('../logs') / modelname
log_dir.mkdir(exist_ok=True)

logger = debug_logger(log_dir)
logger.debug(config)
logger.info(f'Device: {device}')
logger.info(f'Max Epoch: {max_epoch}')

# Loss
print('Initializing loss function, optimizer and scheduler...')
loss_fn = MultiClassCriterion(**loss_config).to(device)
params = model.parameters()
optimizer, scheduler = create_optimizer(params, **opt_config)

# history
if resume:
    with open(log_dir.joinpath('history.pkl'), 'rb') as f:
        history_dict = pickle.load(f)
        best_metrics = history_dict['best_metrics']
        loss_history = history_dict['loss']
        iou_history = history_dict['iou']
        start_epoch = len(iou_history)
        # fast-forward the scheduler to the resumed epoch
        for _ in range(start_epoch):
            scheduler.step()
else:
    start_epoch = 0
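For reference, create_optimizer as used above returns an (optimizer, scheduler) pair, which the resume branch then fast-forwards by replaying scheduler.step(). A minimal sketch of such a factory, assuming Adam plus cosine annealing (the t_max key appears in Example #4's opt_config; the project's real defaults may differ):

import torch

def create_optimizer(params, optimizer='adam', lr=1e-3, t_max=10, **kwargs):
    # hypothetical factory: build the optimizer by name, then attach a scheduler
    if optimizer == 'adam':
        opt = torch.optim.Adam(params, lr=lr, **kwargs)
    else:
        opt = torch.optim.SGD(params, lr=lr, momentum=0.9, **kwargs)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=t_max)
    return opt, scheduler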
Example #3
def main(argv, configPath=None):
    # arguments
    args = getArgs_(argv, configPath)
    saveDir = savePath(args)
    logger = infoLogger(logdir=saveDir, name=args.model)

    logger.info(argv)
    logger.debug(cfgInfo(args))
    logger.info("CheckPoints path: {}".format(saveDir))
    logger.debug("Model Name: {}".format(args.model))

    train_dataset = BDD100K_Area_Seg(base_dir=args.dataPath,
                                     split='train',
                                     target_size=args.size)
    valid_dataset = BDD100K_Area_Seg(base_dir=args.dataPath,
                                     split='val',
                                     target_size=args.size)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_worker,
                              pin_memory=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_worker,
                              pin_memory=True)

    args.num_gpus, args.device = deviceSetting(logger=logger,
                                               device=args.device)
    # model
    model = Deeplabv3plus_Mobilenet(args.output_channels,
                                    output_stride=args.output_stride)

    optimizer, scheduler = create_optimizer_(model, args)
    loss_fn = MultiClassCriterion(loss_type=args.loss_type,
                                  ignore_index=args.ignore_index)
    model, trainData = modelDeploy(args, model, optimizer, scheduler, logger)

    tensorLogger = SummaryWriter(log_dir=os.path.join(saveDir, 'runs'),
                                 filename_suffix=args.model)
    logger.info("Tensorboard event log saved in {}".format(
        tensorLogger.log_dir))

    logger.info('Start training...')
    # global_step = 0
    start_epoch = trainData['epoch']

    num_classes = args.output_channels
    extra_info_ckpt = '{}_{}_{}'.format(args.model, args.size[0], args.size[1])
    for i_epoch in range(start_epoch, args.max_epoch):
        if i_epoch >= 29:
            # hard-coded LR floor after epoch 29
            optimizer.param_groups[0]['lr'] = 1e-5
        trainData['epoch'] = i_epoch
        lossList, miouList = train_seg(model,
                                       train_loader,
                                       i_epoch,
                                       optimizer,
                                       loss_fn,
                                       num_classes,
                                       logger,
                                       tensorLogger,
                                       args=args)
        scheduler.step()
        trainData['loss'].extend(lossList)
        trainData['miou'].extend(miouList)

        valLoss, valMiou = val_seg(model,
                                   valid_loader,
                                   i_epoch,
                                   loss_fn,
                                   num_classes,
                                   logger,
                                   tensorLogger,
                                   args=args)
        trainData['val'].append([valLoss, valMiou])

        best = valMiou > trainData['bestMiou']
        if best:
            trainData['bestMiou'] = valMiou

        # unwrap DataParallel before saving the weights
        weights_dict = (model.module.state_dict()
                        if args.device == 'cuda' else model.state_dict())

        save_checkpoint(
            {
                'trainData': trainData,
                'model': weights_dict,
                'optimizer': optimizer.state_dict(),
            },
            is_best=best,
            dir=saveDir,
            extra_info=extra_info_ckpt,
            miou_val=valMiou,
            logger=logger)

    tensorLogger.close()
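save_checkpoint receives the full training state plus bookkeeping arguments. A minimal sketch with the signature inferred from the call site above (hypothetical; the project's helper may name its files differently):

import shutil
from pathlib import Path

import torch

def save_checkpoint(state, is_best, dir, extra_info='', miou_val=0.0, logger=None):
    # hypothetical helper: always save the latest state, copy it aside when best
    ckpt_path = Path(dir) / f'checkpoint_{extra_info}.pth'
    torch.save(state, ckpt_path)
    if logger is not None:
        logger.info(f'Saved checkpoint to {ckpt_path} (val mIoU: {miou_val:.4f})')
    if is_best:
        shutil.copyfile(ckpt_path, Path(dir) / f'model_best_{extra_info}.pth')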
Example #4
def process(config_path):
    gc.collect()
    torch.cuda.empty_cache()
    with open(config_path) as f:
        config = yaml.safe_load(f)  # safe_load: yaml.load without a Loader is deprecated
    net_config = config['Net']
    data_config = config['Data']
    train_config = config['Train']
    loss_config = config['Loss']
    opt_config = config['Optimizer']
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    t_max = opt_config['t_max']

    # Collect training parameters
    max_epoch = train_config['max_epoch']
    batch_size = train_config['batch_size']
    fp16 = train_config['fp16']
    resume = train_config['resume']
    pretrained_path = train_config['pretrained_path']
    freeze_enabled = train_config['freeze']
    seed_enabled = train_config['seed']

    #########################################
    # Deterministic training
    if seed_enabled:
        seed = 100
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed=seed)
        import random
        random.seed(seed)
    #########################################

    # Network
    if 'unet' in net_config['dec_type']:
        net_type = 'unet'
        model = EncoderDecoderNet(**net_config)
    else:
        net_type = 'deeplab'
        net_config['output_channels'] = 19
        model = SPPNet(**net_config)

    dataset = data_config['dataset']
    if dataset == 'deepglobe-dynamic':
        from dataset.deepglobe_dynamic import DeepGlobeDatasetDynamic as Dataset
        net_config['output_channels'] = 7
        classes = np.arange(0, 7)
    else:
        raise NotImplementedError
    del data_config['dataset']

    modelname = config_path.stem
    run_stamp = datetime.now()  # compute the timestamp once and reuse it
    print("timestamp =", run_stamp)
    output_dir = Path(os.path.join(ROOT_DIR, f'model/{modelname}_{run_stamp}'))
    output_dir.mkdir(exist_ok=True)
    log_dir = Path(os.path.join(ROOT_DIR, f'logs/{modelname}_{run_stamp}'))
    log_dir.mkdir(exist_ok=True)
    dataset_dir = '/home/sfoucher/DEV/pytorch-segmentation/data/deepglobe_as_pascalvoc/VOCdevkit/VOC2012'
    logger = debug_logger(log_dir)
    logger.debug(config)
    logger.info(f'Device: {device}')
    logger.info(f'Max Epoch: {max_epoch}')

    # Loss
    loss_fn = MultiClassCriterion(**loss_config).to(device)
    params = model.parameters()
    optimizer, scheduler = create_optimizer(params, **opt_config)

    # history
    if resume:
        with open(log_dir.joinpath('history.pkl'), 'rb') as f:
            history_dict = pickle.load(f)
            best_metrics = history_dict['best_metrics']
            loss_history = history_dict['loss']
            iou_history = history_dict['iou']
            start_epoch = len(iou_history)
            for _ in range(start_epoch):
                scheduler.step()
    else:
        start_epoch = 0
        best_metrics = 0
        loss_history = []
        iou_history = []


    affine_augmenter = albu.Compose([albu.HorizontalFlip(p=.5),
                                     albu.VerticalFlip(p=.5),
                                     # albu.Rotate(5, p=.5)
                                     ])
    # image_augmenter = albu.Compose([albu.GaussNoise(p=.5),
    #                                 albu.RandomBrightnessContrast(p=.5)])
    image_augmenter = None

    # Dataset creation has been moved inside the epoch loop for dynamic training

    """
    # Dataset
    train_dataset = Dataset(affine_augmenter=affine_augmenter, image_augmenter=image_augmenter,
                            net_type=net_type, **data_config)
    valid_dataset = Dataset(split='valid', net_type=net_type, **data_config)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4,
                            pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False, num_workers=4, pin_memory=True)
    """

    

    # Pretrained model
    if pretrained_path:
        logger.info(f'Resume from {pretrained_path}')
        param = torch.load(pretrained_path)
        model.load_state_dict(param)
        # swap the classifier head to match the new number of output channels
        model.logits = torch.nn.Conv2d(256, net_config['output_channels'], 1)
        del param

    # To device
    model = model.to(device)

    #########################################
    if freeze_enabled:
        # Rémi's code: freeze the first half of the parameters
        for param_index in range(len(optimizer.param_groups[0]['params']) // 2):
            optimizer.param_groups[0]['params'][param_index].requires_grad = False
        # rebuild the optimizer/scheduler with only the still-trainable parameters
        print("Params to learn:")
        params_to_update = []
        for name, param in model.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)
        optimizer, scheduler = create_optimizer(params_to_update, **opt_config)
    #########################################

    # fp16
    if fp16:
        # I only took the necessary files because I don't need the C backend of apex,
        # which is broken and can't be installed
        # from apex import fp16_utils
        from utils.apex.apex.fp16_utils.fp16util import BN_convert_float
        from utils.apex.apex.fp16_utils.fp16_optimizer import FP16_Optimizer
        # model = fp16_utils.BN_convert_float(model.half())
        model = BN_convert_float(model.half())
        # optimizer = fp16_utils.FP16_Optimizer(optimizer, verbose=False, dynamic_loss_scale=True)
        optimizer = FP16_Optimizer(optimizer, verbose=False, dynamic_loss_scale=True)
        logger.info('Apply fp16')

    # Restore model
    if resume:
        model_path = output_dir.joinpath('model_tmp.pth')
        logger.info(f'Resume from {model_path}')
        param = torch.load(model_path)
        model.load_state_dict(param)
        del param
        opt_path = output_dir.joinpath('opt_tmp.pth')
        param = torch.load(opt_path)
        optimizer.load_state_dict(param)
        del param
    i_iter = 0
    ma_loss = 0.0  # exponential moving averages of loss/IoU for plotting
    ma_iou = 0.0
    # Train
    for i_epoch in range(start_epoch, max_epoch):
        logger.info(f'Epoch: {i_epoch}')
        logger.info(f'Learning rate: {optimizer.param_groups[0]["lr"]}')

        train_losses = []
        train_ious = []
        model.train()

        # Initialize randomized but balanced datasets
        train_dataset = Dataset(base_dir=dataset_dir,
                                affine_augmenter=affine_augmenter, image_augmenter=image_augmenter,
                                net_type=net_type, **data_config)
        valid_dataset = Dataset(base_dir=dataset_dir,
                                split='valid', net_type=net_type, **data_config)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4,
                                pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False, num_workers=4, pin_memory=True)

        with tqdm(train_loader) as _tqdm:
            for i, batched in enumerate(_tqdm):
                images, labels = batched
                if fp16:
                    images = images.half()
                images, labels = images.to(device), labels.to(device)
                optimizer.zero_grad()
                preds = model(images)
                if net_type == 'deeplab':
                    preds = F.interpolate(preds, size=labels.shape[1:], mode='bilinear', align_corners=True)
                if fp16:
                    loss = loss_fn(preds.float(), labels)
                else:
                    loss = loss_fn(preds, labels)

                preds_np = preds.detach().cpu().numpy()
                labels_np = labels.detach().cpu().numpy()
                iou = compute_iou_batch(np.argmax(preds_np, axis=1), labels_np, classes)

                _tqdm.set_postfix(OrderedDict(seg_loss=f'{loss.item():.5f}', iou=f'{iou:.3f}'))
                train_losses.append(loss.item())
                train_ious.append(iou)
                ma_loss = 0.01 * loss.item() + 0.99 * ma_loss
                ma_iou = 0.01 * iou + 0.99 * ma_iou
                plotter.plot('loss', 'train', 'iteration Loss', i_iter, loss.item())
                plotter.plot('iou', 'train', 'iteration iou', i_iter, iou)
                plotter.plot('loss', 'ma_loss', 'iteration Loss', i_iter, ma_loss)
                plotter.plot('iou', 'ma_iou', 'iteration iou', i_iter, ma_iou)
                if fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                optimizer.step()
                i_iter += 1
        scheduler.step()

        train_loss = np.mean(train_losses)
        train_iou = np.nanmean(train_ious)
        logger.info(f'train loss: {train_loss}')
        logger.info(f'train iou: {train_iou}')
        plotter.plot('loss-epoch', 'train', 'epoch loss', i_epoch, train_loss)
        plotter.plot('iou-epoch', 'train', 'epoch iou', i_epoch, train_iou)
        torch.save(model.state_dict(), output_dir.joinpath('model_tmp.pth'))
        torch.save(optimizer.state_dict(), output_dir.joinpath('opt_tmp.pth'))

        valid_losses = []
        valid_ious = []
        model.eval()
        with torch.no_grad():
            with tqdm(valid_loader) as _tqdm:
                for batched in _tqdm:
                    images, labels = batched
                    if fp16:
                        images = images.half()
                    images, labels = images.to(device), labels.to(device)
                    preds = model.tta(images, net_type=net_type)
                    if fp16:
                        loss = loss_fn(preds.float(), labels)
                    else:
                        loss = loss_fn(preds, labels)

                    preds_np = preds.detach().cpu().numpy()
                    labels_np = labels.detach().cpu().numpy()

                    # I changed a parameter in the compute_iou method to prevent it from yielding nans
                    iou = compute_iou_batch(np.argmax(preds_np, axis=1), labels_np, classes)

                    _tqdm.set_postfix(OrderedDict(seg_loss=f'{loss.item():.5f}', iou=f'{iou:.3f}'))
                    valid_losses.append(loss.item())
                    valid_ious.append(iou)

        valid_loss = np.mean(valid_losses)
        valid_iou = np.mean(valid_ious)
        logger.info(f'valid seg loss: {valid_loss}')
        logger.info(f'valid iou: {valid_iou}')
        plotter.plot('loss-epoch', 'valid', 'epoch loss', i_epoch, valid_loss)
        plotter.plot('iou-epoch', 'valid', 'epoch iou', i_epoch, valid_iou)
        if best_metrics < valid_iou:
            best_metrics = valid_iou
            logger.info('Best Model!')
            torch.save(model.state_dict(), output_dir.joinpath('model.pth'))
            torch.save(optimizer.state_dict(), output_dir.joinpath('opt.pth'))

        loss_history.append([train_loss, valid_loss])
        iou_history.append([train_iou, valid_iou])
        history_ploter(loss_history, log_dir.joinpath('loss.png'))
        history_ploter(iou_history, log_dir.joinpath('iou.png'))

        history_dict = {'loss': loss_history,
                        'iou': iou_history,
                        'best_metrics': best_metrics}
        with open(log_dir.joinpath('history.pkl'), 'wb') as f:
            pickle.dump(history_dict, f)
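compute_iou_batch, with the nan-avoiding tweak mentioned in the comments above, averages per-class IoU over a batch of argmax predictions. A minimal sketch, assuming the convention that classes absent from both prediction and label are skipped rather than producing nan:

import numpy as np

def compute_iou_batch(preds, labels, classes):
    # hypothetical sketch: mean IoU over the given class ids
    ious = []
    for c in classes:
        pred_c = preds == c
        label_c = labels == c
        union = np.logical_or(pred_c, label_c).sum()
        if union == 0:
            continue  # class absent everywhere: skip instead of yielding nan
        intersection = np.logical_and(pred_c, label_c).sum()
        ious.append(intersection / union)
    return float(np.mean(ious)) if ious else float('nan')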