Example #1
def main():
    # Get args from command line
    args = get_args_from_command_line()

    if args.gpu_id is not None:
        cfg.CONST.DEVICE = args.gpu_id
    if args.weights is not None:
        cfg.CONST.WEIGHTS = args.weights

    # Print config
    print('Use config:')
    pprint(cfg)

    # Set GPU to use
    os.environ["CUDA_VISIBLE_DEVICES"] = cfg.CONST.DEVICE

    # Start train/test process
    if not args.test and not args.inference:
        train_net(cfg)
    else:
        if 'WEIGHTS' not in cfg.CONST or not os.path.exists(cfg.CONST.WEIGHTS):
            logging.error('Please specify the file path of checkpoint.')
            sys.exit(2)

        if args.test:
            test_net(cfg)
        else:
            inference_net(cfg)
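Note: every main() in these examples relies on a get_args_from_command_line() helper that the excerpts do not show. A minimal sketch of such a helper, assuming argparse and only the options that Example #1 actually reads (gpu_id, weights, test, inference); the flag names and defaults below are assumptions, not the original projects' definitions:

# Hypothetical sketch of the argument parser assumed by Example #1;
# flag names and defaults are assumptions, not the project's definitions.
import argparse

def get_args_from_command_line():
    parser = argparse.ArgumentParser(description='Runner arguments')
    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use', default=None, type=str)
    parser.add_argument('--weights', dest='weights', help='Initialize the network from a weights file', default=None)
    parser.add_argument('--test', dest='test', help='Run the test pipeline', action='store_true')
    parser.add_argument('--inference', dest='inference', help='Run the inference pipeline', action='store_true')
    return parser.parse_args()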
Example #2
def main():
    # Get args from command line
    args = get_args_from_command_line()
    
    if args.gpu_id is not None:
        cfg.CONST.DEVICE = args.gpu_id
    if not args.randomize:
        np.random.seed(cfg.CONST.RNG_SEED)
    if args.batch_size is not None:
        cfg.CONST.BATCH_SIZE = args.batch_size
    if args.iter is not None:
        cfg.TRAIN.NUM_ITERATION = args.iter
    if args.out_path is not None:
        cfg.DIR.OUT_PATH = args.out_path
    if args.weights is not None:
        cfg.CONST.WEIGHTS = args.weights
        cfg.TRAIN.RESUME_TRAIN = True
        cfg.TRAIN.INITIAL_ITERATION = int(args.init_iter)

    # Print config
    print('Use config:')
    pprint(cfg)

    # Set GPU to use
    theano.gpuarray.use(cfg.CONST.DEVICE)

    # Start train/test process
    if not args.test:
        train_net(cfg)
    else:
        test_net(cfg)
Example #3
def main():
    # Get args from command line
    args = get_args_from_command_line()

    if args.gpu_id is not None:
        cfg.CONST.DEVICE = args.gpu_id
    if args.weights is not None:
        cfg.CONST.WEIGHTS = args.weights

    # Print config
    print('Use config:')
    pprint(cfg)
    # f_runner.write(str(cfg))

    # Set GPU to use
    os.environ["CUDA_VISIBLE_DEVICES"] = cfg.CONST.DEVICE

    # Start train/test process
    if not args.test and not args.inference:
        train_net(cfg)
    else:
        '''
        if 'WEIGHTS' not in cfg.CONST or not os.path.exists(cfg.CONST.WEIGHTS):
            logging.error('Please specify the file path of checkpoint.')
            sys.exit(2)
        '''

        if 'WEIGHTS' not in cfg.CONST:
            logging.error('Please specify the file path of checkpoint (cfg.CONST.WEIGHTS is not set).')
            sys.exit(2)
        if not os.path.exists(cfg.CONST.WEIGHTS):
            logging.error('The checkpoint file %s does not exist.' % cfg.CONST.WEIGHTS)
            sys.exit(2)

        if args.test:
            # test_net(cfg)
            path = '/raid/wuruihai/GRNet_FILES/tb_log'
            test_writer = SummaryWriter(path)
            test_net(cfg, test_writer=test_writer)

        elif args.test_KITTI:
            path = '/raid/wuruihai/GRNet_FILES/tb_log'
            test_writer = SummaryWriter(path)
            test_net_KITTI(cfg, test_writer=test_writer)
        else:
            inference_net(cfg)
Example #4
def main():
    # Get args from command line
    args = get_args_from_command_line()

    # Read the experimental config
    exec(compile(open(args.cfg_file, "rb").read(), args.cfg_file, 'exec'))
    cfg = locals()['__C']
    pprint(cfg)

    # Parse runtime arguments
    if args.gpu_id is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    if not args.randomize:
        random.seed(cfg.CONST.RNG_SEED)
        np.random.seed(cfg.CONST.RNG_SEED)
        torch.manual_seed(cfg.CONST.RNG_SEED)
        torch.cuda.manual_seed(cfg.CONST.RNG_SEED)
        torch.cuda.manual_seed_all(cfg.CONST.RNG_SEED)
        # References: https://pytorch.org/docs/stable/notes/randomness.html
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    if args.exp_name is not None:
        cfg.CONST.EXP_NAME = args.exp_name
    if args.weights is not None:
        cfg.CONST.WEIGHTS = args.weights

    # Start train/test process
    if not args.test and not args.inference:
        # Make sure cfg.TRAIN.NETWORK in ['RMNet', 'TinyFlowNet']
        if cfg.TRAIN.NETWORK not in ['RMNet', 'TinyFlowNet']:
            logging.error(
                'Please make sure cfg.TRAIN.NETWORK in ["RMNet", "TinyFlowNet"].'
            )
            sys.exit(1)

        train_net(cfg)
    else:
        if 'WEIGHTS' not in cfg.CONST or not os.path.exists(cfg.CONST.WEIGHTS):
            logging.error('Please specify the file path of checkpoint.')
            sys.exit(2)

        if args.test:
            test_net(cfg)
        else:
            inference_net(cfg)
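Note: Example #4 loads its experiment config by compiling and exec'ing the file passed as args.cfg_file and then pulling the module-level __C object out of locals(). A minimal sketch of a config file that this pattern can consume, assuming an EasyDict-style layout; the sections and fields below (other than __C itself) are illustrative assumptions:

# config.py -- hypothetical config file for the exec()/locals()['__C'] pattern in Example #4.
# Only the top-level name __C matters to that pattern; the fields are assumptions.
from easydict import EasyDict

__C = EasyDict()
__C.CONST = EasyDict()
__C.CONST.RNG_SEED = 0
__C.CONST.EXP_NAME = 'default'
__C.TRAIN = EasyDict()
__C.TRAIN.NETWORK = 'RMNet'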
Example #5
def main():
    # Get args from command line
    args = get_args_from_command_line()

    if args.gpu_id is not None:
        cfg.CONST.DEVICE = args.gpu_id
    if not args.randomize:
        np.random.seed(cfg.CONST.RNG_SEED)
    if args.batch_size is not None:
        cfg.CONST.BATCH_SIZE = args.batch_size
    if args.epoch is not None:
        cfg.TRAIN.NUM_EPOCHES = args.epoch
    if args.dataset is not None:
        cfg.DATASET.DATASET_NAME = args.dataset
        if cfg.DATASET.DATASET_NAME not in cfg.DATASETS:
            cfg.DATASET.CENTER_BIAS = cfg.DATASETS.TEST.CENTER_BIAS
        else:
            cfg.DATASET.CENTER_BIAS = cfg.DATASETS.SALICON.CENTER_BIAS
    if args.out_path is not None:
        cfg.DIR.OUT_PATH = args.out_path
    if args.weights is not None:
        cfg.CONST.WEIGHTS = args.weights
        if not args.test:
            cfg.TRAIN.RESUME_TRAIN = True

    # print config
    print('Use config:')
    pprint(cfg)

    # Set GPU to use
    if type(cfg.CONST.DEVICE) == str:
        os.environ["CUDA_VISIBLE_DEVICES"] = cfg.CONST.DEVICE

    # Start train/test process
    if not args.test:
        train_net(cfg)
    else:
        if 'WEIGHTS' in cfg.CONST and os.path.exists(cfg.CONST.WEIGHTS):
            test_net(cfg)
        else:
            print('[FATAL] %s Please specify the file path of checkpoint.' %
                  (dt.now()))
            sys.exit(2)
Example #6
def train_net(cfg):
    # Enable the inbuilt cudnn auto-tuner to find the best algorithm to use
    torch.backends.cudnn.benchmark = True

    # Set up data augmentation
    IMG_SIZE = cfg.CONST.IMG_H, cfg.CONST.IMG_W
    CROP_SIZE = cfg.CONST.CROP_IMG_H, cfg.CONST.CROP_IMG_W
    train_transforms = utils.data_transforms.Compose([
        utils.data_transforms.RandomCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(
            cfg.TRAIN.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.ColorJitter(cfg.TRAIN.BRIGHTNESS,
                                          cfg.TRAIN.CONTRAST,
                                          cfg.TRAIN.SATURATION),
        utils.data_transforms.RandomNoise(cfg.TRAIN.NOISE_STD),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN,
                                        std=cfg.DATASET.STD),
        utils.data_transforms.RandomFlip(),
        utils.data_transforms.RandomPermuteRGB(),
        utils.data_transforms.ToTensor(),
    ])
    val_transforms = utils.data_transforms.Compose([
        utils.data_transforms.CenterCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TEST.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN,
                                        std=cfg.DATASET.STD),
        utils.data_transforms.ToTensor(),
    ])

    # Set up data loader
    train_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[
        cfg.DATASET.TRAIN_DATASET](cfg)
    val_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[
        cfg.DATASET.TEST_DATASET](cfg)
    train_data_loader = torch.utils.data.DataLoader(
        dataset=train_dataset_loader.get_dataset(
            utils.data_loaders.DatasetType.TRAIN, cfg.CONST.N_VIEWS_RENDERING,
            train_transforms),
        batch_size=cfg.CONST.BATCH_SIZE,
        num_workers=cfg.TRAIN.NUM_WORKER,
        pin_memory=True,
        shuffle=True,
        drop_last=True)
    val_data_loader = torch.utils.data.DataLoader(
        dataset=val_dataset_loader.get_dataset(
            utils.data_loaders.DatasetType.VAL, cfg.CONST.N_VIEWS_RENDERING,
            val_transforms),
        batch_size=1,
        num_workers=1,
        pin_memory=True,
        shuffle=False)

    # Set up networks
    encoder = Encoder(cfg)
    decoder = Decoder(cfg)
    refiner = Refiner(cfg)
    merger = Merger(cfg)
    print('[DEBUG] %s Parameters in Encoder: %d.' %
          (dt.now(), utils.network_utils.count_parameters(encoder)))
    print('[DEBUG] %s Parameters in Decoder: %d.' %
          (dt.now(), utils.network_utils.count_parameters(decoder)))
    print('[DEBUG] %s Parameters in Refiner: %d.' %
          (dt.now(), utils.network_utils.count_parameters(refiner)))
    print('[DEBUG] %s Parameters in Merger: %d.' %
          (dt.now(), utils.network_utils.count_parameters(merger)))

    # Initialize weights of networks
    encoder.apply(utils.network_utils.init_weights)
    decoder.apply(utils.network_utils.init_weights)
    refiner.apply(utils.network_utils.init_weights)
    merger.apply(utils.network_utils.init_weights)

    # Set up solver
    if cfg.TRAIN.POLICY == 'adam':
        encoder_solver = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                 encoder.parameters()),
                                          lr=cfg.TRAIN.ENCODER_LEARNING_RATE,
                                          betas=cfg.TRAIN.BETAS)
        decoder_solver = torch.optim.Adam(decoder.parameters(),
                                          lr=cfg.TRAIN.DECODER_LEARNING_RATE,
                                          betas=cfg.TRAIN.BETAS)
        refiner_solver = torch.optim.Adam(refiner.parameters(),
                                          lr=cfg.TRAIN.REFINER_LEARNING_RATE,
                                          betas=cfg.TRAIN.BETAS)
        merger_solver = torch.optim.Adam(merger.parameters(),
                                         lr=cfg.TRAIN.MERGER_LEARNING_RATE,
                                         betas=cfg.TRAIN.BETAS)
    elif cfg.TRAIN.POLICY == 'sgd':
        encoder_solver = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                                encoder.parameters()),
                                         lr=cfg.TRAIN.ENCODER_LEARNING_RATE,
                                         momentum=cfg.TRAIN.MOMENTUM)
        decoder_solver = torch.optim.SGD(decoder.parameters(),
                                         lr=cfg.TRAIN.DECODER_LEARNING_RATE,
                                         momentum=cfg.TRAIN.MOMENTUM)
        refiner_solver = torch.optim.SGD(refiner.parameters(),
                                         lr=cfg.TRAIN.REFINER_LEARNING_RATE,
                                         momentum=cfg.TRAIN.MOMENTUM)
        merger_solver = torch.optim.SGD(merger.parameters(),
                                        lr=cfg.TRAIN.MERGER_LEARNING_RATE,
                                        momentum=cfg.TRAIN.MOMENTUM)
    else:
        raise Exception('[FATAL] %s Unknown optimizer %s.' %
                        (dt.now(), cfg.TRAIN.POLICY))

    # Set up learning rate scheduler to decay learning rates dynamically
    encoder_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        encoder_solver,
        milestones=cfg.TRAIN.ENCODER_LR_MILESTONES,
        gamma=cfg.TRAIN.GAMMA)
    decoder_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        decoder_solver,
        milestones=cfg.TRAIN.DECODER_LR_MILESTONES,
        gamma=cfg.TRAIN.GAMMA)
    refiner_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        refiner_solver,
        milestones=cfg.TRAIN.REFINER_LR_MILESTONES,
        gamma=cfg.TRAIN.GAMMA)
    merger_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        merger_solver,
        milestones=cfg.TRAIN.MERGER_LR_MILESTONES,
        gamma=cfg.TRAIN.GAMMA)

    if torch.cuda.is_available():
        encoder = torch.nn.DataParallel(encoder).cuda()
        decoder = torch.nn.DataParallel(decoder).cuda()
        refiner = torch.nn.DataParallel(refiner).cuda()
        merger = torch.nn.DataParallel(merger).cuda()

    # Set up loss functions
    bce_loss = torch.nn.BCELoss()

    # Load pretrained model if exists
    init_epoch = 0
    best_iou = -1
    best_epoch = -1
    if 'WEIGHTS' in cfg.CONST and cfg.TRAIN.RESUME_TRAIN:
        print('[INFO] %s Recovering from %s ...' %
              (dt.now(), cfg.CONST.WEIGHTS))
        checkpoint = torch.load(cfg.CONST.WEIGHTS)
        init_epoch = checkpoint['epoch_idx']
        best_iou = checkpoint['best_iou']
        best_epoch = checkpoint['best_epoch']

        encoder.load_state_dict(checkpoint['encoder_state_dict'])
        decoder.load_state_dict(checkpoint['decoder_state_dict'])
        if cfg.NETWORK.USE_REFINER:
            refiner.load_state_dict(checkpoint['refiner_state_dict'])
        if cfg.NETWORK.USE_MERGER:
            merger.load_state_dict(checkpoint['merger_state_dict'])

        print('[INFO] %s Recover complete. Current epoch #%d, Best IoU = %.4f at epoch #%d.' \
                 % (dt.now(), init_epoch, best_iou, best_epoch))

    # Summary writer for TensorBoard
    output_dir = os.path.join(cfg.DIR.OUT_PATH, '%s', dt.now().isoformat())
    log_dir = output_dir % 'logs'
    ckpt_dir = output_dir % 'checkpoints'
    train_writer = SummaryWriter(os.path.join(log_dir, 'train'))
    val_writer = SummaryWriter(os.path.join(log_dir, 'test'))

    # Training loop
    for epoch_idx in range(init_epoch, cfg.TRAIN.NUM_EPOCHES):
        # Tick / tock
        epoch_start_time = time()

        # Batch average metrics
        batch_time = utils.network_utils.AverageMeter()
        data_time = utils.network_utils.AverageMeter()
        encoder_losses = utils.network_utils.AverageMeter()
        refiner_losses = utils.network_utils.AverageMeter()

        # Adjust learning rate
        encoder_lr_scheduler.step()
        decoder_lr_scheduler.step()
        refiner_lr_scheduler.step()
        merger_lr_scheduler.step()

        # switch models to training mode
        encoder.train()
        decoder.train()
        merger.train()
        refiner.train()

        batch_end_time = time()
        n_batches = len(train_data_loader)
        for batch_idx, (taxonomy_names, sample_names, rendering_images,
                        ground_truth_volumes) in enumerate(train_data_loader):
            # Measure data time
            data_time.update(time() - batch_end_time)

            # Get data from data loader
            rendering_images = utils.network_utils.var_or_cuda(
                rendering_images)
            ground_truth_volumes = utils.network_utils.var_or_cuda(
                ground_truth_volumes)

            # Train the encoder, decoder, refiner, and merger
            image_features = encoder(rendering_images)
            raw_features, generated_volumes = decoder(image_features)

            if cfg.NETWORK.USE_MERGER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_MERGER:
                generated_volumes = merger(raw_features, generated_volumes)
            else:
                generated_volumes = torch.mean(generated_volumes, dim=1)
            encoder_loss = bce_loss(generated_volumes,
                                    ground_truth_volumes) * 10

            if cfg.NETWORK.USE_REFINER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_REFINER:
                generated_volumes = refiner(generated_volumes)
                refiner_loss = bce_loss(generated_volumes,
                                        ground_truth_volumes) * 10
            else:
                refiner_loss = encoder_loss

            # Gradient descent
            encoder.zero_grad()
            decoder.zero_grad()
            refiner.zero_grad()
            merger.zero_grad()

            if cfg.NETWORK.USE_REFINER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_REFINER:
                encoder_loss.backward(retain_graph=True)
                refiner_loss.backward()
            else:
                encoder_loss.backward()

            encoder_solver.step()
            decoder_solver.step()
            refiner_solver.step()
            merger_solver.step()

            # Append loss to average metrics
            encoder_losses.update(encoder_loss.item())
            refiner_losses.update(refiner_loss.item())
            # Append loss to TensorBoard
            n_itr = epoch_idx * n_batches + batch_idx
            train_writer.add_scalar('EncoderDecoder/BatchLoss',
                                    encoder_loss.item(), n_itr)
            train_writer.add_scalar('Refiner/BatchLoss', refiner_loss.item(),
                                    n_itr)

            # Tick / tock
            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            print('[INFO] %s [Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) EDLoss = %.4f RLoss = %.4f' % \
                (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, batch_idx + 1, n_batches, \
                    batch_time.val, data_time.val, encoder_loss.item(), refiner_loss.item()))

        # Append epoch loss to TensorBoard
        train_writer.add_scalar('EncoderDecoder/EpochLoss', encoder_losses.avg,
                                epoch_idx + 1)
        train_writer.add_scalar('Refiner/EpochLoss', refiner_losses.avg,
                                epoch_idx + 1)

        # Tick / tock
        epoch_end_time = time()
        print('[INFO] %s Epoch [%d/%d] EpochTime = %.3f (s) EDLoss = %.4f RLoss = %.4f' %
            (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, epoch_end_time - epoch_start_time, \
                encoder_losses.avg, refiner_losses.avg))

        # Update Rendering Views
        if cfg.TRAIN.UPDATE_N_VIEWS_RENDERING:
            n_views_rendering = random.randint(1, cfg.CONST.N_VIEWS_RENDERING)
            train_data_loader.dataset.set_n_views_rendering(n_views_rendering)
            print('[INFO] %s Epoch [%d/%d] Update #RenderingViews to %d' % \
                (dt.now(), epoch_idx + 2, cfg.TRAIN.NUM_EPOCHES, n_views_rendering))

        # Validate the training models
        iou = test_net(cfg, epoch_idx + 1, output_dir, val_data_loader,
                       val_writer, encoder, decoder, refiner, merger)

        # Save weights to file
        if (epoch_idx + 1) % cfg.TRAIN.SAVE_FREQ == 0:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)

            utils.network_utils.save_checkpoints(cfg, \
                    os.path.join(ckpt_dir, 'ckpt-epoch-%04d.pth' % (epoch_idx + 1)), \
                    epoch_idx + 1, encoder, encoder_solver, decoder, decoder_solver, \
                    refiner, refiner_solver, merger, merger_solver, best_iou, best_epoch)
        if iou > best_iou:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)

            best_iou = iou
            best_epoch = epoch_idx + 1
            utils.network_utils.save_checkpoints(cfg, \
                    os.path.join(ckpt_dir, 'best-ckpt.pth'), \
                    epoch_idx + 1, encoder, encoder_solver, decoder, decoder_solver, \
                    refiner, refiner_solver, merger, merger_solver, best_iou, best_epoch)

    # Close SummaryWriter for TensorBoard
    train_writer.close()
    val_writer.close()
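Note: in Example #6 the four *_lr_scheduler.step() calls sit at the top of each epoch, before any of the *_solver.step() calls. Since PyTorch 1.1 the documented order is the other way around (optimizer step first, scheduler step once at the end of the epoch, as the Paddle port in Example #8 does); stepping the scheduler first skips the first value of the schedule and raises a UserWarning on newer versions. A minimal sketch of the recommended ordering, with hypothetical single solver/scheduler names standing in for the four per-network pairs:

# Sketch only: optimizer.step() before lr_scheduler.step(), as recommended since PyTorch 1.1.
for epoch_idx in range(init_epoch, cfg.TRAIN.NUM_EPOCHES):
    for batch in train_data_loader:
        loss = compute_loss(batch)   # placeholder for the forward pass and losses above
        solver.zero_grad()
        loss.backward()
        solver.step()                # update parameters first
    scheduler.step()                 # then decay the learning rate once per epoch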
Example #7
    return args


if __name__ == '__main__':
    args = parse_args()

    print('Called with args:')
    print(args)

    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)
    if args.exp_dir is not None:
        cfg.EXP_DIR = args.exp_dir

    cfg.GPU_ID = args.gpu_id

    print('Using config:')
    pprint.pprint(cfg)

    caffe.set_mode_gpu()
    caffe.set_device(args.gpu_id)

    output_dir_name = 'test'
    if args.datasets:
        output_dir_name += '_' + '_'.join(args.datasets)
    output_dir_name += '_' + datetime.datetime.now().strftime("%d_%m_%Y_%H_%M")
    output_dir = get_output_dir(output_dir_name, None)
    test_net(args.caffemodel, output_dir, args.datasets)
Example #8
def train_net(cfg):
    # Set up data augmentation
    IMG_SIZE = cfg.CONST.IMG_H, cfg.CONST.IMG_W
    CROP_SIZE = cfg.CONST.CROP_IMG_H, cfg.CONST.CROP_IMG_W
    train_transforms = utils.data_transforms.Compose([
        utils.data_transforms.RandomCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(
            cfg.TRAIN.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.ColorJitter(cfg.TRAIN.BRIGHTNESS,
                                          cfg.TRAIN.CONTRAST,
                                          cfg.TRAIN.SATURATION),
        utils.data_transforms.RandomNoise(cfg.TRAIN.NOISE_STD),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN,
                                        std=cfg.DATASET.STD),
        utils.data_transforms.RandomFlip(),
        utils.data_transforms.RandomPermuteRGB(),
        utils.data_transforms.ToTensor(),
    ])
    val_transforms = utils.data_transforms.Compose([
        utils.data_transforms.CenterCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TEST.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN,
                                        std=cfg.DATASET.STD),
        utils.data_transforms.ToTensor(),
    ])

    # Set up data loader
    train_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[
        cfg.DATASET.TRAIN_DATASET](cfg)
    val_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[
        cfg.DATASET.TEST_DATASET](cfg)
    train_data_loader = paddle.io.DataLoader(
        dataset=train_dataset_loader.get_dataset(
            utils.data_loaders.DatasetType.TRAIN, cfg.CONST.N_VIEWS_RENDERING,
            train_transforms),
        batch_size=cfg.CONST.BATCH_SIZE,
        # num_workers=0,  # errors when cfg.TRAIN.NUM_WORKER > 0 because /dev/shm is too small  https://blog.csdn.net/ctypyb2002/article/details/107914643
        #pin_memory=True,
        use_shared_memory=False,
        shuffle=True,
        drop_last=True)
    val_data_loader = paddle.io.DataLoader(
        dataset=val_dataset_loader.get_dataset(
            utils.data_loaders.DatasetType.VAL, cfg.CONST.N_VIEWS_RENDERING,
            val_transforms),
        batch_size=1,
        #num_workers=1,
        #pin_memory=True,
        shuffle=False)

    # Set up networks # paddle.Model prepare fit save
    encoder = Encoder(cfg)
    decoder = Decoder(cfg)
    merger = Merger(cfg)
    refiner = Refiner(cfg)
    print('[DEBUG] %s Parameters in Encoder: %d.' %
          (dt.now(), utils.network_utils.count_parameters(encoder)))
    print('[DEBUG] %s Parameters in Decoder: %d.' %
          (dt.now(), utils.network_utils.count_parameters(decoder)))
    print('[DEBUG] %s Parameters in Merger: %d.' %
          (dt.now(), utils.network_utils.count_parameters(merger)))
    print('[DEBUG] %s Parameters in Refiner: %d.' %
          (dt.now(), utils.network_utils.count_parameters(refiner)))

    # # Initialize weights of networks  # Paddle handles parameter initialization differently; see the API docs
    # encoder.apply(utils.network_utils.init_weights)
    # decoder.apply(utils.network_utils.init_weights)
    # merger.apply(utils.network_utils.init_weights)

    # Set up learning rate scheduler to decay learning rates dynamically
    encoder_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(
        learning_rate=cfg.TRAIN.ENCODER_LEARNING_RATE,
        milestones=cfg.TRAIN.ENCODER_LR_MILESTONES,
        gamma=cfg.TRAIN.GAMMA,
        verbose=True)
    decoder_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(
        learning_rate=cfg.TRAIN.DECODER_LEARNING_RATE,
        milestones=cfg.TRAIN.DECODER_LR_MILESTONES,
        gamma=cfg.TRAIN.GAMMA,
        verbose=True)
    merger_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(
        learning_rate=cfg.TRAIN.MERGER_LEARNING_RATE,
        milestones=cfg.TRAIN.MERGER_LR_MILESTONES,
        gamma=cfg.TRAIN.GAMMA,
        verbose=True)
    refiner_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(
        learning_rate=cfg.TRAIN.REFINER_LEARNING_RATE,
        milestones=cfg.TRAIN.REFINER_LR_MILESTONES,
        gamma=cfg.TRAIN.GAMMA,
        verbose=True)
    # Set up solver
    # if cfg.TRAIN.POLICY == 'adam':
    encoder_solver = paddle.optimizer.Adam(learning_rate=encoder_lr_scheduler,
                                           parameters=encoder.parameters())
    decoder_solver = paddle.optimizer.Adam(learning_rate=decoder_lr_scheduler,
                                           parameters=decoder.parameters())
    merger_solver = paddle.optimizer.Adam(learning_rate=merger_lr_scheduler,
                                          parameters=merger.parameters())
    refiner_solver = paddle.optimizer.Adam(learning_rate=refiner_lr_scheduler,
                                           parameters=refiner.parameters())

    # if torch.cuda.is_available():
    #     encoder = torch.nn.DataParallel(encoder).cuda()
    #     decoder = torch.nn.DataParallel(decoder).cuda()
    #     merger = torch.nn.DataParallel(merger).cuda()

    # Set up loss functions
    bce_loss = paddle.nn.BCELoss()

    # Load pretrained model if exists
    init_epoch = 0
    best_iou = -1
    best_epoch = -1
    if 'WEIGHTS' in cfg.CONST and cfg.TRAIN.RESUME_TRAIN:
        print('[INFO] %s Recovering from %s ...' %
              (dt.now(), cfg.CONST.WEIGHTS))
        # load
        encoder_state_dict = paddle.load(
            os.path.join(cfg.CONST.WEIGHTS, "encoder.pdparams"))
        encoder_solver_state_dict = paddle.load(
            os.path.join(cfg.CONST.WEIGHTS, "encoder_solver.pdopt"))
        encoder.set_state_dict(encoder_state_dict)
        encoder_solver.set_state_dict(encoder_solver_state_dict)
        decoder_state_dict = paddle.load(
            os.path.join(cfg.CONST.WEIGHTS, "decoder.pdparams"))
        decoder_solver_state_dict = paddle.load(
            os.path.join(cfg.CONST.WEIGHTS, "decoder_solver.pdopt"))
        decoder.set_state_dict(decoder_state_dict)
        decoder_solver.set_state_dict(decoder_solver_state_dict)

        if cfg.NETWORK.USE_MERGER:
            merger_state_dict = paddle.load(
                os.path.join(cfg.CONST.WEIGHTS, "merger.pdparams"))
            merger_solver_state_dict = paddle.load(
                os.path.join(cfg.CONST.WEIGHTS, "merger_solver.pdopt"))
            merger.set_state_dict(merger_state_dict)
            merger_solver.set_state_dict(merger_solver_state_dict)

        if cfg.NETWORK.USE_REFINER:
            refiner_state_dict = paddle.load(
                os.path.join(cfg.CONST.WEIGHTS, "refiner.pdparams"))
            refiner_solver_state_dict = paddle.load(
                os.path.join(cfg.CONST.WEIGHTS, "refiner_solver.pdopt"))
            refiner.set_state_dict(refiner_state_dict)
            refiner_solver.set_state_dict(refiner_solver_state_dict)

        print(
            '[INFO] %s Recover complete. Current epoch #%d, Best IoU = %.4f at epoch #%d.'
            % (dt.now(), init_epoch, best_iou, best_epoch))

    # Summary writer for TensorBoard
    output_dir = os.path.join(cfg.DIR.OUT_PATH, '%s', dt.now().isoformat())
    log_dir = output_dir % 'logs'
    ckpt_dir = output_dir % 'checkpoints'
    # train_writer = SummaryWriter()
    # val_writer = SummaryWriter(os.path.join(log_dir, 'test'))
    train_writer = LogWriter(os.path.join(log_dir, 'train'))
    val_writer = LogWriter(os.path.join(log_dir, 'val'))

    # Training loop
    for epoch_idx in range(init_epoch, cfg.TRAIN.NUM_EPOCHES):
        # Tick / tock
        epoch_start_time = time()

        # Batch average metrics
        batch_time = utils.network_utils.AverageMeter()
        data_time = utils.network_utils.AverageMeter()
        encoder_losses = utils.network_utils.AverageMeter()
        refiner_losses = utils.network_utils.AverageMeter()

        # # switch models to training mode
        encoder.train()
        decoder.train()
        merger.train()
        refiner.train()

        batch_end_time = time()
        n_batches = len(train_data_loader)

        # print("****debug: length of train data loder",n_batches)
        for batch_idx, (rendering_images, ground_truth_volumes) in enumerate(
                train_data_loader()):
            # # debug
            # if batch_idx>1:
            #     break

            # Measure data time
            data_time.update(time() - batch_end_time)
            # print("****debug: batch_idx",batch_idx)
            # print(rendering_images.shape)
            # print(ground_truth_volumes.shape)
            # Get data from data loader
            rendering_images = utils.network_utils.var_or_cuda(
                rendering_images)
            ground_truth_volumes = utils.network_utils.var_or_cuda(
                ground_truth_volumes)

            # Train the encoder, decoder, and merger
            image_features = encoder(rendering_images)
            raw_features, generated_volumes = decoder(image_features)

            if cfg.NETWORK.USE_MERGER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_MERGER:
                generated_volumes = merger(raw_features, generated_volumes)
            # else:
            #     mergered_volumes = paddle.mean(generated_volumes, aixs=1)

            encoder_loss = bce_loss(generated_volumes,
                                    ground_truth_volumes) * 10

            if cfg.NETWORK.USE_REFINER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_REFINER:
                generated_volumes = refiner(generated_volumes)
                refiner_loss = bce_loss(generated_volumes,
                                        ground_truth_volumes) * 10
            # else:
            #     refiner_loss = encoder_loss

            # Gradient descent
            encoder_solver.clear_grad()
            decoder_solver.clear_grad()
            merger_solver.clear_grad()
            refiner_solver.clear_grad()

            if cfg.NETWORK.USE_REFINER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_REFINER:
                encoder_loss.backward(retain_graph=True)
                refiner_loss.backward()
            # else:
            #     encoder_loss.backward()

            encoder_solver.step()
            decoder_solver.step()
            merger_solver.step()
            refiner_solver.step()

            # Append loss to average metrics
            encoder_losses.update(encoder_loss.numpy())
            refiner_losses.update(refiner_loss.numpy())

            # Append loss to TensorBoard
            n_itr = epoch_idx * n_batches + batch_idx
            train_writer.add_scalar(tag='EncoderDecoder/BatchLoss',
                                    step=n_itr,
                                    value=encoder_loss.numpy())
            train_writer.add_scalar('Refiner/BatchLoss',
                                    value=refiner_loss.numpy(),
                                    step=n_itr)

            # Tick / tock
            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            if (batch_idx % int(cfg.CONST.INFO_BATCH)) == 0:
                print(
                    '[INFO] %s [Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) EDLoss = %.4f RLoss = %.4f'
                    % (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES,
                       batch_idx + 1, n_batches, batch_time.val, data_time.val,
                       encoder_loss.numpy(), refiner_loss.numpy()))

        # Append epoch loss to TensorBoard
        train_writer.add_scalar(tag='EncoderDecoder/EpochLoss',
                                step=epoch_idx + 1,
                                value=encoder_losses.avg)
        train_writer.add_scalar('Refiner/EpochLoss',
                                value=refiner_losses.avg,
                                step=epoch_idx + 1)

        # Update the learning rate schedulers once per epoch
        encoder_lr_scheduler.step()
        decoder_lr_scheduler.step()
        merger_lr_scheduler.step()
        refiner_lr_scheduler.step()

        # Tick / tock
        epoch_end_time = time()
        print(
            '[INFO] %s Epoch [%d/%d] EpochTime = %.3f (s) EDLoss = %.4f RLoss = %.4f'
            % (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, epoch_end_time -
               epoch_start_time, encoder_losses.avg, refiner_losses.avg))

        # Update Rendering Views
        if cfg.TRAIN.UPDATE_N_VIEWS_RENDERING:
            n_views_rendering = random.randint(1, cfg.CONST.N_VIEWS_RENDERING)
            train_data_loader.dataset.set_n_views_rendering(n_views_rendering)
            print('[INFO] %s Epoch [%d/%d] Update #RenderingViews to %d' %
                  (dt.now(), epoch_idx + 2, cfg.TRAIN.NUM_EPOCHES,
                   n_views_rendering))

        # Validate the training models
        iou = test_net(cfg, epoch_idx + 1, output_dir, val_data_loader,
                       val_writer, encoder, decoder, merger, refiner)

        # Save weights to file
        if (epoch_idx + 1) % cfg.TRAIN.SAVE_FREQ == 0:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)

            utils.network_utils.save_checkpoints(
                cfg, os.path.join(ckpt_dir,
                                  'ckpt-epoch-%04d' % (epoch_idx + 1)),
                epoch_idx + 1, encoder, encoder_solver, decoder,
                decoder_solver, merger, merger_solver, refiner, refiner_solver,
                best_iou, best_epoch)
        if iou > best_iou:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)

            best_iou = iou
            best_epoch = epoch_idx + 1
            utils.network_utils.save_checkpoints(
                cfg, os.path.join(ckpt_dir, 'best-ckpt'), epoch_idx + 1,
                encoder, encoder_solver, decoder, decoder_solver, merger,
                merger_solver, refiner, refiner_solver, best_iou, best_epoch)
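Note: Example #8 restores weights from per-network encoder.pdparams / encoder_solver.pdopt files under cfg.CONST.WEIGHTS, so its utils.network_utils.save_checkpoints presumably writes the mirror-image files. A minimal sketch of such a writer using only the public paddle.save API; the helper name, argument list, and file layout here are assumptions, not the project's actual implementation:

# Hypothetical sketch of a Paddle checkpoint writer matching the paddle.load() calls
# in Example #8; not the project's actual utils.network_utils.save_checkpoints.
import os
import paddle

def save_checkpoints_sketch(ckpt_dir, encoder, encoder_solver, decoder, decoder_solver):
    os.makedirs(ckpt_dir, exist_ok=True)
    paddle.save(encoder.state_dict(), os.path.join(ckpt_dir, 'encoder.pdparams'))
    paddle.save(encoder_solver.state_dict(), os.path.join(ckpt_dir, 'encoder_solver.pdopt'))
    paddle.save(decoder.state_dict(), os.path.join(ckpt_dir, 'decoder.pdparams'))
    paddle.save(decoder_solver.state_dict(), os.path.join(ckpt_dir, 'decoder_solver.pdopt'))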
Example #9
    args = parser.parse_args()
    return args

if __name__ == '__main__':
    args = parse_args()

    print('Called with args:')
    print(args)

    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)
    if args.exp_dir is not None:
        cfg.EXP_DIR = args.exp_dir

    cfg.GPU_ID = args.gpu_id

    print('Using config:')
    pprint.pprint(cfg)

    caffe.set_mode_gpu()
    caffe.set_device(args.gpu_id)

    output_dir_name = 'test'
    if args.datasets:
        output_dir_name += '_' + '_'.join(args.datasets)
    output_dir_name += '_' + datetime.datetime.now().strftime("%d_%m_%Y_%H_%M")
    output_dir = get_output_dir(output_dir_name, None)
    test_net(args.caffemodel, output_dir, args.datasets)
Example #10
File: train.py  Project: sxy7147/GRNet
def train_net(cfg):
    # Enable the inbuilt cudnn auto-tuner to find the best algorithm to use
    torch.backends.cudnn.benchmark = True

    # Set up data loader
    # choose ShapeNet
    train_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[
        cfg.DATASET.TRAIN_DATASET](cfg)
    test_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[
        cfg.DATASET.TEST_DATASET](cfg)
    # get_dataset() takes the dataset subset (0 = train, 1 = test, 2 = val)
    train_data_loader = torch.utils.data.DataLoader(
        dataset=train_dataset_loader.get_dataset(
            utils.data_loaders.DatasetSubset.TRAIN),  # train/test/val
        batch_size=cfg.TRAIN.BATCH_SIZE,
        num_workers=cfg.CONST.NUM_WORKERS,
        collate_fn=utils.data_loaders.collate_fn,
        pin_memory=True,
        shuffle=True,
        drop_last=True)
    val_data_loader = torch.utils.data.DataLoader(
        dataset=test_dataset_loader.get_dataset(
            utils.data_loaders.DatasetSubset.VAL),
        batch_size=1,
        num_workers=cfg.CONST.NUM_WORKERS,
        collate_fn=utils.data_loaders.collate_fn,
        pin_memory=True,
        shuffle=False)

    # Set up folders for logs and checkpoints
    output_dir = os.path.join(cfg.DIR.OUT_PATH, '%s',
                              datetime.now().isoformat())  # output_dir
    cfg.DIR.CHECKPOINTS = output_dir % 'checkpoints'
    cfg.DIR.LOGS = output_dir % 'logs'
    txt_dir = output_dir % 'txt'
    if not os.path.exists(txt_dir):
        os.makedirs(txt_dir)
    f_record = open(txt_dir + '/record.txt', 'w')
    if not os.path.exists(cfg.DIR.CHECKPOINTS):
        os.makedirs(cfg.DIR.CHECKPOINTS)

    # Create tensorboard writers
    train_writer = SummaryWriter(os.path.join(cfg.DIR.LOGS, 'train'))
    val_writer = SummaryWriter(os.path.join(cfg.DIR.LOGS, 'test'))

    # Create the networks
    grnet = GRNet(cfg)
    grnet.apply(utils.helpers.init_weights)
    logging.debug('Parameters in GRNet: %d.' %
                  utils.helpers.count_parameters(grnet))

    # Move the network to GPU if possible
    if torch.cuda.is_available():
        grnet = torch.nn.DataParallel(grnet).cuda()

    # Create the optimizers
    grnet_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                              grnet.parameters()),
                                       lr=cfg.TRAIN.LEARNING_RATE,
                                       weight_decay=cfg.TRAIN.WEIGHT_DECAY,
                                       betas=cfg.TRAIN.BETAS)
    grnet_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        grnet_optimizer,
        milestones=cfg.TRAIN.LR_MILESTONES,
        gamma=cfg.TRAIN.GAMMA)

    # Set up loss functions
    chamfer_dist = ChamferDistance()
    gridding_loss = GriddingLoss(  # lgtm [py/unused-local-variable]
        scales=cfg.NETWORK.GRIDDING_LOSS_SCALES,
        alphas=cfg.NETWORK.GRIDDING_LOSS_ALPHAS)

    # Load pretrained model if exists
    init_epoch = 0  # resume training from a checkpoint
    best_metrics = None
    if 'WEIGHTS' in cfg.CONST:
        logging.info('Recovering from %s ...' % (cfg.CONST.WEIGHTS))
        checkpoint = torch.load(cfg.CONST.WEIGHTS)
        best_metrics = Metrics(cfg.TEST.METRIC_NAME,
                               checkpoint['best_metrics'])
        grnet.load_state_dict(checkpoint['grnet'])
        logging.info(
            'Recover complete. Current epoch = #%d; best metrics = %s.' %
            (init_epoch, best_metrics))

    # Training/Testing the network
    first_epoch = True
    for epoch_idx in range(init_epoch + 1, cfg.TRAIN.N_EPOCHS + 1):
        epoch_start_time = time()

        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter(['SparseLoss', 'DenseLoss'])
        # losses = AverageMeter(['GridLoss', 'DenseLoss'])

        grnet.train()

        batch_end_time = time()
        n_batches = len(train_data_loader)
        for batch_idx, (taxonomy_ids, model_ids,
                        data) in enumerate(train_data_loader):
            # print('batch_size: ', data['partial_cloud'].shape)
            data_time.update(time() - batch_end_time)
            for k, v in data.items():
                data[k] = utils.helpers.var_or_cuda(v)
            sparse_ptcloud, dense_ptcloud = grnet(data)
            sparse_loss = chamfer_dist(sparse_ptcloud, data['gtcloud'])
            # grid_loss = gridding_loss(dense_ptcloud, data['gtcloud'])
            dense_loss = chamfer_dist(dense_ptcloud, data['gtcloud'])
            _loss = sparse_loss + dense_loss
            losses.update(
                [sparse_loss.item() * 1000,
                 dense_loss.item() * 1000])
            # _loss = grid_loss + dense_loss
            # losses.update([grid_loss.item() * 1000, dense_loss.item() * 1000])

            grnet.zero_grad()
            _loss.backward()
            grnet_optimizer.step()

            n_itr = (epoch_idx - 1) * n_batches + batch_idx
            train_writer.add_scalar('Loss/Batch/Sparse',
                                    sparse_loss.item() * 1000, n_itr)
            # train_writer.add_scalar('Loss/Batch/Grid', grid_loss.item() * 1000, n_itr)
            train_writer.add_scalar('Loss/Batch/Dense',
                                    dense_loss.item() * 1000, n_itr)

            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            ###

            f_record.write(
                '\n[Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) Losses = %s'
                % (epoch_idx, cfg.TRAIN.N_EPOCHS, batch_idx + 1, n_batches,
                   batch_time.val(), data_time.val(),
                   ['%.4f' % l for l in losses.val()]))
            logging.info(
                '[Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) Losses = %s'
                % (epoch_idx, cfg.TRAIN.N_EPOCHS, batch_idx + 1, n_batches,
                   batch_time.val(), data_time.val(),
                   ['%.4f' % l for l in losses.val()]))

        grnet_lr_scheduler.step()
        epoch_end_time = time()
        train_writer.add_scalar('Loss/Epoch/Sparse', losses.avg(0), epoch_idx)
        # train_writer.add_scalar('Loss/Epoch/Grid', losses.avg(0), epoch_idx)
        train_writer.add_scalar('Loss/Epoch/Dense', losses.avg(1), epoch_idx)
        f_record.write('\n[Epoch %d/%d] EpochTime = %.3f (s) Losses = %s' %
                       (epoch_idx, cfg.TRAIN.N_EPOCHS, epoch_end_time -
                        epoch_start_time, ['%.4f' % l for l in losses.avg()]))
        logging.info('[Epoch %d/%d] EpochTime = %.3f (s) Losses = %s' %
                     (epoch_idx, cfg.TRAIN.N_EPOCHS, epoch_end_time -
                      epoch_start_time, ['%.4f' % l for l in losses.avg()]))

        # Validate the current model
        # if epoch_idx % cfg.TRAIN.SAVE_FREQ == 0:
        # metrics = test_net(cfg, epoch_idx, val_data_loader, val_writer, grnet)

        # Save checkpoints
        # if epoch_idx % cfg.TRAIN.SAVE_FREQ == 0 or metrics.better_than(best_metrics):

        if first_epoch:
            metrics = test_net(cfg, epoch_idx, val_data_loader, val_writer,
                               grnet)
            best_metrics = metrics
            first_epoch = False

        if epoch_idx % cfg.TRAIN.SAVE_FREQ == 0:
            metrics = test_net(cfg, epoch_idx, val_data_loader, val_writer,
                               grnet)
            file_name = 'best-ckpt.pth' if metrics.better_than(
                best_metrics) else 'epoch-%03d.pth' % (epoch_idx + 1)
            output_path = os.path.join(cfg.DIR.CHECKPOINTS, file_name)
            torch.save({
                'epoch_index': epoch_idx,
                'best_metrics': metrics.state_dict(),
                'grnet': grnet.state_dict()
            }, output_path)  # yapf: disable

            logging.info('Saved checkpoint to %s ...' % output_path)
            if metrics.better_than(best_metrics):
                best_metrics = metrics

    train_writer.close()
    val_writer.close()
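Note: Example #10 reads best_metrics back from the checkpoint but never restores the saved epoch counter, so init_epoch stays 0 and a resumed run restarts its epoch numbering. A one-line sketch of restoring it inside the same recovery block, assuming the checkpoint was written by the torch.save call in this example (which stores the counter under 'epoch_index'):

# Sketch: also restore the epoch counter inside the 'WEIGHTS' recovery block of Example #10.
init_epoch = checkpoint['epoch_index']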
Example #11
# Written by Ting Pan
# --------------------------------------------------------

import dragon.vm.caffe as caffe
from datasets.factory import get_imdb
from core.test import test_net
from config import cfg
import time, os

cfg.DATA_DIR = '/home/workspace/datasets/UA-DETRAC'
imdb_name = 'detrac_2017_test'
gpu_id = 1
prototxt = 'models/detrac/AirNet/deploy.prototxt'
caffemodel = 'checkpoints/airnet_final.caffemodel'
vis = False

if __name__ == '__main__':

    while not os.path.exists(caffemodel):
        print('Waiting for {} to exist...'.format(caffemodel))
        time.sleep(10)

    caffe.set_mode_gpu()
    caffe.set_device(gpu_id)
    net = caffe.Net(prototxt, caffemodel, caffe.TEST)

    net.name = os.path.splitext(os.path.basename(caffemodel))[0]
    imdb = get_imdb(imdb_name)

    test_net(net, imdb, thresh=cfg.TEST.THRESH, vis=vis)
Example #12
    print(
        "[INFO] %s Epoch [%d/%d] EpochTime = %.3f (s) EDLoss = %.4f RLoss = %.4f"
        % (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, time.time() -
           epoch_start_time, encoder_losses.avg, refiner_losses.avg))

    # Update Rendering Views
    if cfg.TRAIN.UPDATE_N_VIEWS_RENDERING:
        # Use a random number of views to train the net
        n_views_rendering = random.randint(1, cfg.CONST.N_VIEWS_RENDERING)
        train_data_loader.dataset.set_n_views_rendering(n_views_rendering)
        print('[INFO] %s Epoch [%d/%d] Update #RenderingViews to %d' %
              (dt.now(), epoch_idx + 2, cfg.TRAIN.NUM_EPOCHES,
               n_views_rendering))

    # Validate the training models
    iou = test_net(cfg, epoch_idx + 1, None, val_data_loader, None, encoder,
                   decoder, refiner, merger)
    if epoch_idx == 0:
        best_iou = 0.0
        best_epoch = 1
    if iou > best_iou:
        best_iou = iou
        best_epoch = epoch_idx + 1
    if iou > best_iou * 0.85:
        volume, _ = forward(
            encoder, decoder, merger, refiner,
            val_dataset[0][1].expand_dims(axis=0).as_in_context(ctx))
        # If the current IoU is bigger than 85% of the best IoU, generate the 3D model and save it in the folder generated_models_with_refiner.
        utils.binvox_visualization.get_volume_views(
            volume,
            "/home/hzx/my pix2vox model3/generated_models_with_refiner",
            epoch_idx)
Example #13
    print('Called with args:')
    print(args)

    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    cfg.GPU_ID = args.gpu_id

    print('Using config:')
    pprint.pprint(cfg)

    while not os.path.exists(args.caffemodel) and args.wait:
        print('Waiting for {} to exist...'.format(args.caffemodel))
        time.sleep(10)

    caffe.set_mode_gpu()
    caffe.set_device(args.gpu_id)
    net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST)
    net.name = os.path.splitext(os.path.basename(args.caffemodel))[0]

    print(args.imdb_name)
    imdb = get_repo_imdb(args.imdb_name)
    imdb.competition_mode(args.comp_mode)
    if not cfg.TEST.OBJ_DET.HAS_RPN:
        imdb.set_proposal_method(cfg.TEST.PROPOSAL_METHOD)

    test_net(net, imdb, max_per_image=args.max_per_image, vis=args.vis)
Example #14
    print(args.imdb_name)
    imdb = get_repo_imdb(args.imdb_name)
    imdb.competition_mode(args.comp_mode)
    if not cfg.TEST.OBJ_DET.HAS_RPN and cfg.TASK == 'object_detection':
        imdb.set_proposal_method(cfg.TEST.PROPOSAL_METHOD)

    al_net = None
    if args.al_net is not None and args.al_def is not None:
        al_net = caffe.Net(args.al_def, caffe.TEST, weights=args.al_net)
        al_net.name = "al_" + os.path.splitext(os.path.basename(
            args.al_def))[0]

    test_net(net,
             imdb,
             max_per_image=args.max_per_image,
             vis=args.vis,
             al_net=al_net)
'''
argparse.ArgumentParser
Input: (description='Test an Object Detection network'), Output: parser

caffe.Net
input: (args.prototxt, args.caffemodel, caffe.TEST), Output: net

os.path.splitext
input: (os.path.basename(args.caffemodel)), output: net.name

get_repo_imdb
input: (args.imdb_name), output: imdb
'''
Example #15
File: train.py  Project: wx-b/RMNet
def train_net(cfg):
    # Set up data loader
    train_data_loader = torch.utils.data.DataLoader(
        dataset=utils.data_loaders.DatasetCollector.get_dataset(
            cfg, cfg.DATASET.TRAIN_DATASET,
            utils.data_loaders.DatasetSubset.TRAIN),
        batch_size=cfg.TRAIN.BATCH_SIZE,
        num_workers=cfg.CONST.N_WORKERS,
        pin_memory=True,
        shuffle=True,
        drop_last=True)
    val_data_loader = torch.utils.data.DataLoader(
        dataset=utils.data_loaders.DatasetCollector.get_dataset(
            cfg, cfg.DATASET.TEST_DATASET,
            utils.data_loaders.DatasetSubset.VAL),
        batch_size=1,
        num_workers=cfg.CONST.N_WORKERS,
        pin_memory=True,
        shuffle=False)

    # Set up networks
    tflownet = TinyFlowNet(cfg)
    rmnet = RMNet(cfg)
    tflownet.apply(utils.helpers.init_weights)
    rmnet.kv_memory.apply(utils.helpers.init_weights)
    rmnet.kv_query.apply(utils.helpers.init_weights)
    rmnet.decoder.apply(utils.helpers.init_weights)
    logging.info('Parameters in TinyFlowNet: %d.' %
                 (utils.helpers.count_parameters(tflownet)))
    logging.info('Parameters in RMNet: %d.' %
                 (utils.helpers.count_parameters(rmnet)))

    # Move the network to GPU if possible
    if torch.cuda.is_available():
        if torch.__version__ >= '1.2.0' and cfg.TRAIN.USE_BATCH_NORM:
            torch.distributed.init_process_group(
                'nccl',
                init_method='file:///tmp/rmnet-%s' % uuid.uuid4().hex,
                world_size=1,
                rank=0)
            tflownet = torch.nn.SyncBatchNorm.convert_sync_batchnorm(tflownet)
            rmnet = torch.nn.SyncBatchNorm.convert_sync_batchnorm(rmnet)

        tflownet = torch.nn.DataParallel(tflownet).cuda()
        rmnet = torch.nn.DataParallel(rmnet).cuda()

    # Create the optimizers
    network = rmnet if cfg.TRAIN.NETWORK == 'RMNet' else tflownet
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        network.parameters()),
                                 lr=cfg.TRAIN.LEARNING_RATE,
                                 weight_decay=cfg.TRAIN.WEIGHT_DECAY,
                                 betas=cfg.TRAIN.BETAS)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, cfg.TRAIN.N_EPOCHS)

    # Set up loss functions
    l1_loss = torch.nn.L1Loss()
    nll_loss = torch.nn.NLLLoss(ignore_index=cfg.CONST.IGNORE_IDX)
    lovasz_loss = LovaszLoss(ignore_index=cfg.CONST.IGNORE_IDX)

    # Load the pretrained model if exists
    init_epoch = 0
    best_metrics = None
    METRICS_THRESHOLD = Metrics(
        cfg.TEST.MAIN_METRIC_NAME,
        [cfg.TRAIN.CKPT_SAVE_THRESHOLD for i in range(len(Metrics.names()))])

    if 'WEIGHTS' in cfg.CONST:
        logging.info('Recovering from %s ...' % (cfg.CONST.WEIGHTS))
        checkpoint = torch.load(cfg.CONST.WEIGHTS)
        best_metrics = Metrics(cfg.TEST.MAIN_METRIC_NAME,
                               checkpoint['best_metrics'])
        tflownet.load_state_dict(checkpoint['tflownet'])
        rmnet.load_state_dict(checkpoint['rmnet'])
        logging.info(
            'Recover completed. Current epoch = #%d; best metrics = %s.' %
            (init_epoch, best_metrics))

    # Set up folders for logs, snapshot and checkpoints
    output_dir = os.path.join(cfg.DIR.OUTPUT_DIR, '%s', cfg.CONST.EXP_NAME)
    cfg.DIR.CHECKPOINTS = output_dir % 'checkpoints'
    cfg.DIR.LOGS = output_dir % 'logs'
    if not os.path.exists(cfg.DIR.CHECKPOINTS):
        os.makedirs(cfg.DIR.CHECKPOINTS)

    # Create tensorboard writers
    train_writer = SummaryWriter(cfg, 'train')
    val_writer = SummaryWriter(cfg, 'test')

    # Backup current code snapshot
    cfg.DIR.SNAPSHOTS = os.path.join(cfg.DIR.OUTPUT_DIR, 'snapshots')
    if not os.path.exists(cfg.DIR.SNAPSHOTS):
        os.makedirs(cfg.DIR.SNAPSHOTS)

    with zipfile.ZipFile(
            os.path.join(cfg.DIR.SNAPSHOTS, '%s.zip' % cfg.CONST.EXP_NAME),
            'w') as zf:
        root_dir = os.getcwd()
        for dirname, subdirs, files in os.walk(root_dir):
            if os.path.normpath(dirname).find(
                    os.path.normpath(cfg.DIR.OUTPUT_DIR)) != -1:
                continue

            _dirname = os.path.relpath(dirname, root_dir)
            zf.write(_dirname)
            for filename in files:
                zf.write(os.path.join(_dirname, filename))

    # Training/Testing the network
    n_batches = len(train_data_loader)
    last_epoch_idx_keep_frame_steps = -cfg.TRAIN.N_EPOCHS
    for epoch_idx in range(init_epoch + 1, cfg.TRAIN.N_EPOCHS + 1):
        epoch_start_time = time()

        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()

        if cfg.TRAIN.USE_BATCH_NORM:
            tflownet.train()
            rmnet.train()
        else:
            tflownet.eval()
            rmnet.eval()

        # Update frame step
        if cfg.TRAIN.USE_RANDOM_FRAME_STEPS:
            if epoch_idx >= cfg.TRAIN.EPOCH_INDEX_FIXING_FRAME_STEPS and \
               epoch_idx <= last_epoch_idx_keep_frame_steps + cfg.TRAIN.N_EPOCHS_KEEP_FRAME_STEPS:
                # Keep the frame step == 1 when JF Mean exceeds the threshold for several epochs
                max_frame_steps = 1
            else:
                max_frame_steps = random.randint(
                    1, min(cfg.TRAIN.MAX_FRAME_STEPS, epoch_idx // 5 + 2))

            train_data_loader.dataset.set_frame_step(
                random.randint(1, max_frame_steps))
            logging.info('[Epoch %d/%d] Set frame step to %d' %
                         (epoch_idx, cfg.TRAIN.N_EPOCHS,
                          train_data_loader.dataset.frame_step))

        batch_end_time = time()
        for batch_idx, (video_name, n_objects, frames, masks,
                        optical_flows) in enumerate(train_data_loader):
            n_itr = (epoch_idx - 1) * n_batches + batch_idx
            data_time.update(time() - batch_end_time)

            try:
                frames = utils.helpers.var_or_cuda(frames)
                masks = utils.helpers.var_or_cuda(masks)
                optical_flows = utils.helpers.var_or_cuda(optical_flows)

                est_flows = tflownet(frames)
                est_flows = utils.helpers.var_or_cuda(est_flows)
                est_probs = rmnet(frames, masks, optical_flows, n_objects,
                                  cfg.TRAIN.MEMORIZE_EVERY)
                est_probs = utils.helpers.var_or_cuda(
                    est_probs[:, 1:]).permute(0, 2, 1, 3, 4)
                masks = torch.argmax(masks[:, 1:], dim=2)

                if cfg.TRAIN.NETWORK == 'TinyFlowNet':
                    loss = l1_loss(est_flows, optical_flows)
                else:  # RMNet
                    loss = lovasz_loss(est_probs, masks) + nll_loss(
                        torch.log(est_probs), masks)

                losses.update(loss.item())
                tflownet.zero_grad()
                rmnet.zero_grad()
                loss.backward()
                optimizer.step()
            except Exception as ex:
                logging.exception(ex)
                continue

            train_writer.add_scalar('Loss/Batch', loss.item(), n_itr)
            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            logging.info(
                '[Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) Loss = %.4f'
                % (epoch_idx, cfg.TRAIN.N_EPOCHS, batch_idx + 1, n_batches,
                   batch_time.val(), data_time.val(), losses.val()))

        lr_scheduler.step()
        epoch_end_time = time()
        train_writer.add_scalar('Loss/Epoch', losses.avg(), epoch_idx)
        logging.info('[Epoch %d/%d] EpochTime = %.3f (s) Loss = %.4f' %
                     (epoch_idx, cfg.TRAIN.N_EPOCHS,
                      epoch_end_time - epoch_start_time, losses.avg()))

        # Evaluate the current model
        metrics = test_net(cfg, epoch_idx, val_data_loader, val_writer,
                           tflownet, rmnet)
        if metrics.state_dict(
        )[cfg.TEST.MAIN_METRIC_NAME] > cfg.TRAIN.KEEP_FRAME_STEPS_THRESHOLD:
            last_epoch_idx_keep_frame_steps = epoch_idx

        # Save checkpoints
        if epoch_idx % cfg.TRAIN.CKPT_SAVE_FREQ == 0 and metrics.better_than(
                METRICS_THRESHOLD):
            output_path = os.path.join(cfg.DIR.CHECKPOINTS,
                                       'ckpt-epoch-%03d.pth' % epoch_idx)
            torch.save({
                'epoch_index': epoch_idx,
                'best_metrics': metrics.state_dict(),
                'tflownet': tflownet.state_dict(),
                'rmnet': rmnet.state_dict()
            }, output_path)  # yapf: disable
            logging.info('Saved checkpoint to %s ...' % output_path)

        if metrics.better_than(best_metrics):
            output_path = os.path.join(cfg.DIR.CHECKPOINTS, 'ckpt-best.pth')
            best_metrics = metrics
            torch.save({
                'epoch_index': epoch_idx,
                'best_metrics': metrics.state_dict(),
                'tflownet': tflownet.state_dict(),
                'rmnet': rmnet.state_dict()
            }, output_path)  # yapf: disable
            logging.info('Saved checkpoint to %s ...' % output_path)

    train_writer.close()
    val_writer.close()
Example #16
def train_net(cfg):
    # Set up data augmentation
    IMG_SIZE = cfg.CONST.IMG_H, cfg.CONST.IMG_W
    CROP_SIZE = cfg.CONST.CROP_IMG_H, cfg.CONST.CROP_IMG_W
    train_transforms = utils.data_transforms.Compose([
        utils.data_transforms.RandomCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TRAIN.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.ColorJitter(cfg.TRAIN.BRIGHTNESS, cfg.TRAIN.CONTRAST, cfg.TRAIN.SATURATION),
        utils.data_transforms.RandomNoise(cfg.TRAIN.NOISE_STD),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD),
        utils.data_transforms.RandomFlip(),
        utils.data_transforms.RandomPermuteRGB(),
        utils.data_transforms.ToTensor(),
    ])
    val_transforms = utils.data_transforms.Compose([
        utils.data_transforms.CenterCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TEST.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD),
        utils.data_transforms.ToTensor(),
    ])
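    # The training pipeline applies stochastic augmentation (random crop/background, color
    # jitter, noise, flips, RGB channel permutation), while validation only center-crops,
    # samples a background color from the test range, and normalizes.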

    # Set up data loader
    train_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TRAIN_DATASET](cfg)
    val_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TEST_DATASET](cfg)
    train_data_loader = paddle.io.DataLoader(dataset=train_dataset_loader.get_dataset(
        utils.data_loaders.DatasetType.TRAIN, cfg.CONST.N_VIEWS_RENDERING, train_transforms),
                                             batch_size=cfg.CONST.BATCH_SIZE,
                                             # num_workers=0,  # values > 0 fail here because /dev/shm is too small:
                                             # https://blog.csdn.net/ctypyb2002/article/details/107914643
                                             # pin_memory=True,
                                             use_shared_memory=False,
                                             shuffle=True,
                                             drop_last=True)
    val_data_loader = paddle.io.DataLoader(dataset=val_dataset_loader.get_dataset(
        utils.data_loaders.DatasetType.VAL, cfg.CONST.N_VIEWS_RENDERING, val_transforms),
                                           batch_size=1,
                                           # num_workers=1,
                                           # pin_memory=True,
                                           shuffle=False)

    # Set up networks (the high-level paddle.Model prepare/fit/save API is not used here)
    res_gru_net = Res_Gru_Net(cfg)

    print('[DEBUG] %s Parameters in Res_Gru_Net: %d.' % (dt.now(), utils.network_utils.count_parameters(res_gru_net)))

    # Set up learning rate scheduler to decay learning rates dynamically
    res_gru_net_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=cfg.TRAIN.RES_GRU_NET_LEARNING_RATE,
                                                                  milestones=cfg.TRAIN.RES_GRU_NET_LR_MILESTONES,
                                                                  gamma=cfg.TRAIN.GAMMA,
                                                                  verbose=True)
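    # MultiStepDecay multiplies the learning rate by gamma each time a milestone epoch is
    # reached; passing the scheduler object as learning_rate lets the Adam optimizer below
    # read the decayed value (the scheduler's step() still has to be called once per epoch).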
    # Set up solver
    # if cfg.TRAIN.POLICY == 'adam':
    res_gru_net_solver = paddle.optimizer.Adam(learning_rate=res_gru_net_lr_scheduler, parameters=res_gru_net.parameters())

    # Set up loss functions
    bce_loss = paddle.nn.BCELoss()
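    # paddle.nn.BCELoss expects probabilities in [0, 1]; Res_Gru_Net is assumed to end with a
    # sigmoid so its voxel occupancy predictions can be compared against the 0/1 ground truth.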

    # Load pretrained model if exists
    init_epoch = 0
    best_iou = -1
    best_epoch = -1
    if 'WEIGHTS' in cfg.CONST and cfg.TRAIN.RESUME_TRAIN:
        print('[INFO] %s Recovering from %s ...' % (dt.now(), cfg.CONST.WEIGHTS))
        # load
        res_gru_net_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "res_gru_net.pdparams"))
        res_gru_net_solver_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "res_gru_net_solver.pdopt"))
        res_gru_net.set_state_dict(res_gru_net_state_dict)
        res_gru_net_solver.set_state_dict(res_gru_net_solver_state_dict)
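        # Only the network and optimizer weights are restored here; init_epoch, best_iou and
        # best_epoch keep their defaults, so the log line below reports those defaults.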

        print('[INFO] %s Recover complete. Current epoch #%d, Best IoU = %.4f at epoch #%d.' %
              (dt.now(), init_epoch, best_iou, best_epoch))

    # Summary writer for TensorBoard
    output_dir = os.path.join(cfg.DIR.OUT_PATH, '%s', dt.now().isoformat())
    log_dir = output_dir % 'logs'
    ckpt_dir = output_dir % 'checkpoints'
    # train_writer = SummaryWriter()
    # val_writer = SummaryWriter(os.path.join(log_dir, 'test'))
    train_writer = LogWriter(os.path.join(log_dir, 'train'))
    val_writer = LogWriter(os.path.join(log_dir, 'val'))
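    # LogWriter comes from VisualDL, PaddlePaddle's TensorBoard-style visualization tool;
    # scalars are written with the tag / step / value keyword arguments used in the loop below.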
  
    # Training loop
    for epoch_idx in range(init_epoch, cfg.TRAIN.NUM_EPOCHES):
        # Tick / tock
        epoch_start_time = time()

        # Batch average meterics
        batch_time = utils.network_utils.AverageMeter()
        data_time = utils.network_utils.AverageMeter()
        res_gru_net_losses = utils.network_utils.AverageMeter()

        # Switch models to training mode
        res_gru_net.train()
        batch_end_time = time()
        n_batches = len(train_data_loader)
        
        for batch_idx, (rendering_images, ground_truth_volumes) in enumerate(train_data_loader()):
            # Measure data time
            data_time.update(time() - batch_end_time)
            # Move the batch to the GPU; shapes are e.g. [64, 5, 3, 224, 224] for the rendering
            # images and [64, 32, 32, 32] for the ground-truth voxel volumes
            rendering_images = rendering_images.cuda()
            ground_truth_volumes = ground_truth_volumes.cuda()

            # Train the res_gru_net
            generated_volumes = res_gru_net(rendering_images)
            res_gru_net_loss = bce_loss(generated_volumes, ground_truth_volumes) * 10

            # Gradient descent
            res_gru_net_loss.backward()
            res_gru_net_solver.step()
            res_gru_net_solver.clear_grad()

            # Append loss to average metrics
            res_gru_net_losses.update(res_gru_net_loss.item())
            # Append loss to VisualDL
            n_itr = epoch_idx * n_batches + batch_idx
            train_writer.add_scalar(tag='Res_Gru_Net/BatchLoss', step=n_itr, value=res_gru_net_loss.item())

            # Tick / tock
            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            if batch_idx % int(cfg.CONST.INFO_BATCH) == 0:
                print('[INFO] %s [Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) EDLoss = %.4f' %
                      (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, batch_idx + 1, n_batches, batch_time.val,
                       data_time.val, res_gru_net_losses.val))

        # Append epoch loss to TensorBoard
        train_writer.add_scalar(tag='Res_Gru_Net/EpochLoss', step=epoch_idx + 1, value=res_gru_net_losses.avg)


        # Tick / tock
        epoch_end_time = time()
        print('[INFO] %s Epoch [%d/%d] EpochTime = %.3f (s) EDLoss = %.4f' %
              (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, epoch_end_time - epoch_start_time, res_gru_net_losses.avg))

        # Update Rendering Views
        if cfg.TRAIN.UPDATE_N_VIEWS_RENDERING:
            n_views_rendering = random.randint(1, cfg.CONST.N_VIEWS_RENDERING)
            train_data_loader.dataset.set_n_views_rendering(n_views_rendering)
            print('[INFO] %s Epoch [%d/%d] Update #RenderingViews to %d' %
                  (dt.now(), epoch_idx + 2, cfg.TRAIN.NUM_EPOCHES, n_views_rendering))
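        # Re-sampling the number of rendering views each epoch presumably trains the recurrent
        # fuser to handle a variable number of input images; the new count takes effect from the
        # next epoch, hence epoch_idx + 2 in the log line above.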

        # Validate the training models
        iou = test_net(cfg, epoch_idx + 1, output_dir, val_data_loader, val_writer, res_gru_net)

        # Save weights to file
        if (epoch_idx + 1) % cfg.TRAIN.SAVE_FREQ == 0:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)

            utils.network_utils.save_checkpoints(cfg, os.path.join(ckpt_dir, 'ckpt-epoch-%04d' % (epoch_idx + 1)),
                                                 epoch_idx + 1, res_gru_net, res_gru_net_solver, best_iou, best_epoch)
        if iou > best_iou:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)

            best_iou = iou
            best_epoch = epoch_idx + 1
            utils.network_utils.save_checkpoints(cfg, os.path.join(ckpt_dir, 'best-ckpt'), epoch_idx + 1,
                                                 res_gru_net, res_gru_net_solver, best_iou, best_epoch)
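            # utils.network_utils.save_checkpoints is assumed to persist the network and optimizer
            # state via paddle.save (the RESUME_TRAIN branch above reads res_gru_net.pdparams and
            # res_gru_net_solver.pdopt from such a checkpoint directory), together with the epoch
            # index and best IoU passed in here.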