Example #1
def main(cfg, config_name):
    """
    Main training function: after preparing the data loaders, model, optimizer, and trainer,
    start with the training process.

    Args:
        cfg (dict): current configuration parameters
        config_name (str): path to the config file
    """

    # Create the output dir if it does not exist
    if not os.path.exists(cfg['misc']['log_dir']):
        os.makedirs(cfg['misc']['log_dir'])

    # Initialize the model
    model = config.get_model(cfg)
    model = model.cuda()

    # Get data loader
    train_loader = make_data_loader(cfg, phase='train')
    val_loader = make_data_loader(cfg, phase='val')

    # Log directory
    dataset_name = cfg["data"]["dataset"]

    now = datetime.now().strftime("%y_%m_%d-%H_%M_%S_%f")
    now += "__Method_" + str(cfg['method']['backbone'])
    now += "__Pretrained_" if cfg['network']['use_pretrained'] and cfg[
        'network']['pretrained_path'] else ''
    if cfg['method']['flow']: now += "__Flow_"
    if cfg['method']['ego_motion']: now += "__Ego_"
    if cfg['method']['semantic']: now += "__Sem_"
    now += "__Rem_Ground_" if cfg['data']['remove_ground'] else ''
    now += "__VoxSize_" + str(cfg['misc']["voxel_size"])
    now += "__Pts_" + str(cfg['misc']["num_points"])
    path2log = os.path.join(cfg['misc']['log_dir'], "logs_" + dataset_name,
                            now)

    logger, checkpoint_dir = prepare_logger(cfg, path2log)
    tboard_logger = SummaryWriter(path2log)

    # Output number of model parameters
    logger.info("Parameter Count: {:d}".format(n_model_parameters(model)))

    # Output torch and cuda version
    logger.info('Torch version: {}'.format(torch.__version__))
    logger.info('CUDA version: {}'.format(torch.version.cuda))

    # Save config file that was used for this experiment
    with open(os.path.join(path2log,
                           config_name.split(os.sep)[-1]), 'w') as outfile:
        yaml.dump(cfg, outfile, default_flow_style=False, allow_unicode=True)

    # Get optimizer and trainer
    optimizer = config.get_optimizer(cfg, model)
    scheduler = config.get_scheduler(cfg, optimizer)

    # Parameters determining the logging, checkpointing, and validation intervals (positive values are in iterations, negative values in epochs)
    stat_interval = cfg['train']['stat_interval']
    stat_interval = stat_interval if stat_interval > 0 else abs(
        stat_interval * len(train_loader))

    chkpt_interval = cfg['train']['chkpt_interval']
    chkpt_interval = chkpt_interval if chkpt_interval > 0 else abs(
        chkpt_interval * len(train_loader))

    val_interval = cfg['train']['val_interval']
    val_interval = val_interval if val_interval > 0 else abs(val_interval *
                                                             len(train_loader))

    # If no pretrained model is loaded, the epoch and iteration counters start at -1
    metric_val_best = np.inf
    running_metrics = {}
    running_losses = {}
    epoch_it = -1
    total_it = -1

    # Load the pretrained weights
    if cfg['network']['use_pretrained'] and cfg['network']['pretrained_path']:
        model, optimizer, scheduler, epoch_it, total_it, metric_val_best = load_checkpoint(
            model,
            optimizer,
            scheduler,
            filename=cfg['network']['pretrained_path'])

        # Find previous tensorboard files and copy them
        tb_files = glob.glob(
            os.sep.join(cfg['network']['pretrained_path'].split(os.sep)[:-1]) +
            '/events.*')
        for tb_file in tb_files:
            shutil.copy(tb_file,
                        os.path.join(path2log,
                                     tb_file.split(os.sep)[-1]))

    # Initialize the trainer
    device = torch.device('cuda' if (
        torch.cuda.is_available() and cfg['misc']['use_gpu']) else 'cpu')
    trainer = config.get_trainer(cfg, model, device)
    acc_iter_size = cfg['train']['acc_iter_size']

    # Training loop
    while epoch_it < cfg['train']['max_epoch']:
        epoch_it += 1
        lr = scheduler.get_last_lr()
        logger.info('Training epoch: {}, LR: {} '.format(epoch_it, lr))
        gc.collect()

        train_loader_iter = iter(train_loader)
        start = time.time()
        tbar = tqdm(total=len(train_loader) // acc_iter_size, ncols=100)

        for it in range(len(train_loader) // acc_iter_size):
            optimizer.zero_grad()
            total_it += 1
            batch_metrics = {}
            batch_losses = {}

            for iter_idx in range(acc_iter_size):

                batch = next(train_loader_iter)

                dict_all_to_device(batch, device)
                losses, metrics, total_loss = trainer.train_step(batch)

                total_loss.backward()

                # Save the running metrics and losses
                if not batch_metrics:
                    batch_metrics = copy.deepcopy(metrics)
                else:
                    for key, value in metrics.items():
                        batch_metrics[key] += value

                if not batch_losses:
                    batch_losses = copy.deepcopy(losses)
                else:
                    for key, value in losses.items():
                        batch_losses[key] += value

            # Compute the mean value of the metrics and losses of the batch
            for key, value in batch_metrics.items():
                batch_metrics[key] = value / acc_iter_size

            for key, value in batch_losses.items():
                batch_losses[key] = value / acc_iter_size

            optimizer.step()
            torch.cuda.empty_cache()

            tbar.set_description('Loss: {:.3g}'.format(
                batch_losses['total_loss']))
            tbar.update(1)

            # Save the running metrics and losses
            if not running_metrics:
                running_metrics = copy.deepcopy(batch_metrics)
            else:
                for key, value in batch_metrics.items():
                    running_metrics[key] += value

            if not running_losses:
                running_losses = copy.deepcopy(batch_losses)
            else:
                for key, value in batch_losses.items():
                    running_losses[key] += value

            # Logs
            if total_it % stat_interval == stat_interval - 1:
                # Print / save logs
                logger.info("Epoch {0:d} - It. {1:d}: loss = {2:.3f}".format(
                    epoch_it, total_it,
                    running_losses['total_loss'] / stat_interval))

                for key, value in running_losses.items():
                    tboard_logger.add_scalar("Train/{}".format(key),
                                             value / stat_interval, total_it)
                    # Reinitialize the values
                    running_losses[key] = 0

                for key, value in running_metrics.items():
                    tboard_logger.add_scalar("Train/{}".format(key),
                                             value / stat_interval, total_it)
                    # Reinitialize the values
                    running_metrics[key] = 0

                start = time.time()

            # Run validation
            if total_it % val_interval == val_interval - 1:
                logger.info("Starting the validation")
                val_losses, val_metrics = trainer.validate(val_loader)

                for key, value in val_losses.items():
                    tboard_logger.add_scalar("Val/{}".format(key), value,
                                             total_it)

                for key, value in val_metrics.items():
                    tboard_logger.add_scalar("Val/{}".format(key), value,
                                             total_it)

                logger.info(
                    "VALIDATION -It. {0:d}: total loss: {1:.3f}.".format(
                        total_it, val_losses['total_loss']))

                if val_losses['total_loss'] < metric_val_best:
                    metric_val_best = val_losses['total_loss']
                    logger.info('New best model (loss: {:.4f})'.format(
                        metric_val_best))

                    save_checkpoint(os.path.join(path2log, 'model_best.pt'),
                                    epoch=epoch_it,
                                    it=total_it,
                                    model=model,
                                    optimizer=optimizer,
                                    scheduler=scheduler,
                                    config=cfg,
                                    best_val=metric_val_best)
                else:
                    save_checkpoint(os.path.join(
                        path2log, 'model_{}.pt'.format(total_it)),
                                    epoch=epoch_it,
                                    it=total_it,
                                    model=model,
                                    optimizer=optimizer,
                                    scheduler=scheduler,
                                    config=cfg,
                                    best_val=val_losses['total_loss'])

        # After the epoch is finished, update the scheduler
        scheduler.step()

    # Quit after the maximum number of epochs is reached
    logger.info(
        'Training completed after {} epochs ({} iterations) with best validation loss = {:.4f}'
        .format(epoch_it, total_it, metric_val_best))
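A minimal standalone sketch of the gradient-accumulation pattern used in the inner loop above; the model, loader, and optimizer here are hypothetical placeholders and not part of the example itself:

import torch

def train_one_epoch(model, loader, optimizer, acc_iter_size=4, device='cuda'):
    # Accumulate gradients over acc_iter_size micro-batches, then take one optimizer step.
    model.train()
    loader_iter = iter(loader)
    for _ in range(len(loader) // acc_iter_size):
        optimizer.zero_grad()
        for _ in range(acc_iter_size):
            inputs, targets = next(loader_iter)
            inputs, targets = inputs.to(device), targets.to(device)
            loss = torch.nn.functional.mse_loss(model(inputs), targets)
            # Scaling by 1/acc_iter_size keeps the accumulated gradient on the
            # scale of one large batch; the example above backpropagates the
            # unscaled loss and only averages the logged values.
            (loss / acc_iter_size).backward()
        optimizer.step()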
Example #2
def main(cfg, logger):
    """
    Main function of this software. After preparing the data loaders, model, optimizer, and trainer,
    start with the training and evaluation process.

    Args:
        cfg (dict): current configuration parameters
        logger (logging.Logger): logger instance used for the output
    """

    # Initialize parameters
    model_selection_metric = cfg['train']['model_selection_metric']

    if cfg['train']['model_selection_mode'] == 'maximize':
        model_selection_sign = 1
    elif cfg['train']['model_selection_mode'] == 'minimize':
        model_selection_sign = -1
    else:
        raise ValueError(
            'model_selection_mode must be either maximize or minimize.')

    # Get data loader
    train_loader = make_data_loader(cfg, phase='train')
    val_loader = make_data_loader(cfg, phase='val')

    # Set up tensorboard logger
    tboard_logger = SummaryWriter(os.path.join(cfg['misc']['log_dir'], 'logs'))

    # Get model
    model = config.get_model(cfg)

    # Get optimizer and trainer
    optimizer = getattr(optim, cfg['optimizer']['alg'])(
        model.parameters(),
        lr=cfg['optimizer']['learning_rate'],
        weight_decay=cfg['optimizer']['weight_decay'])

    trainer = config.get_trainer(cfg, model, optimizer, tboard_logger)

    # Load pre-trained model if existing
    kwargs = {
        'model': model,
        'optimizer': optimizer,
    }

    checkpoint_io = CheckpointIO(
        cfg['misc']['log_dir'],
        initialize_from=cfg['model']['init_from'],
        initialization_file_name=cfg['model']['init_file_name'],
        **kwargs)

    try:
        load_dict = checkpoint_io.load('model.pt')
    except FileExistsError:
        load_dict = dict()

    epoch_it = load_dict.get('epoch_it', -1)
    it = load_dict.get('it', -1)

    metric_val_best = load_dict.get('loss_val_best',
                                    -model_selection_sign * np.inf)

    if metric_val_best == np.inf or metric_val_best == -np.inf:
        metric_val_best = -model_selection_sign * np.inf

    logger.info('Current best validation metric ({}): {:.5f}'.format(
        model_selection_metric, metric_val_best))

    # Training parameters
    stat_interval = cfg['train']['stat_interval']
    stat_interval = stat_interval if stat_interval > 0 else abs(
        stat_interval * len(train_loader))

    chkpt_interval = cfg['train']['chkpt_interval']
    chkpt_interval = chkpt_interval if chkpt_interval > 0 else abs(
        chkpt_interval * len(train_loader))

    val_interval = cfg['train']['val_interval']
    val_interval = val_interval if val_interval > 0 else abs(val_interval *
                                                             len(train_loader))

    # Print model parameters and model graph
    nparameters = sum(p.numel() for p in model.parameters())
    #print(model)
    logger.info('Total number of parameters: {}'.format(nparameters))

    # Training loop
    while epoch_it < cfg['train']['max_epoch']:
        epoch_it += 1

        for batch in train_loader:
            it += 1
            loss = trainer.train_step(batch, it)
            tboard_logger.add_scalar('train/loss', loss, it)

            # Print output
            if stat_interval != 0 and (it % stat_interval) == 0 and it != 0:
                logger.info('[Epoch {}] it={}, loss={:.4f}'.format(
                    epoch_it, it, loss))

            # Save checkpoint
            if (chkpt_interval != 0 and
                (it % chkpt_interval) == 0) and it != 0:
                logger.info('Saving checkpoint')
                checkpoint_io.save('model.pt',
                                   epoch_it=epoch_it,
                                   it=it,
                                   loss_val_best=metric_val_best)

            # Run validation
            if val_interval != 0 and (it % val_interval) == 0 and it != 0:
                eval_dict = trainer.evaluate(val_loader, it)

                metric_val = eval_dict[model_selection_metric]
                logger.info('Validation metric ({}): {:.4f}'.format(
                    model_selection_metric, metric_val))

                for k, v in eval_dict.items():
                    tboard_logger.add_scalar('val/{}'.format(k), v, it)

                if model_selection_sign * (metric_val - metric_val_best) > 0:
                    metric_val_best = metric_val
                    logger.info(
                        'New best model (loss {:.4f})'.format(metric_val_best))
                    checkpoint_io.save('model_best.pt',
                                       epoch_it=epoch_it,
                                       it=it,
                                       loss_val_best=metric_val_best)

    # Quit after the maximum number of epochs is reached
    logger.info(
        'Training completed after {} Epochs ({} it) with best val metric ({})={}'
        .format(epoch_it, it, model_selection_metric, metric_val_best))
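Example #2 handles both 'maximize' and 'minimize' model selection with a single comparison by folding the direction into model_selection_sign; a minimal sketch of that trick with made-up values:

import numpy as np

def is_improvement(metric_val, metric_val_best, model_selection_sign):
    # +1: larger is better (maximize), -1: smaller is better (minimize)
    return model_selection_sign * (metric_val - metric_val_best) > 0

sign = 1                  # 'maximize'
best = -sign * np.inf     # start at -inf for maximize, +inf for minimize
for val in [0.62, 0.71, 0.68, 0.74]:
    if is_improvement(val, best, sign):
        best = val
print(best)  # 0.74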
Example #3
def main(cfg, logger):
    """
    Main function of this evaluation software. After preparing the data loaders and the model, the evaluation process is started.

    Args:
        cfg (dict): current configuration parameters
    """

    # Create the output dir if it does not exist 
    if not os.path.exists(cfg['test']['results_dir']):
        os.makedirs(cfg['test']['results_dir'])

    # Get model
    model = config.get_model(cfg)
    device = torch.device('cuda' if (torch.cuda.is_available() and cfg['misc']['use_gpu']) else 'cpu') 

    # Get data loader
    eval_loader = make_data_loader(cfg, phase='test')

    # Log directory
    dataset_name = cfg["data"]["dataset"]

    path2log = os.path.join(cfg['test']['results_dir'], dataset_name, '{}_{}'.format(cfg['method']['backbone'], cfg['misc']['num_points']))

    logger, checkpoint_dir = prepare_logger(cfg, path2log)

    # Output torch and cuda version 
    
    logger.info('Torch version: {}'.format(torch.__version__))
    logger.info('CUDA version: {}'.format(torch.version.cuda))
    logger.info('Starting evaluation of the method {} on {} dataset'.format(cfg['method']['backbone'], dataset_name))

    # Save config file that was used for this experiment
    with open(os.path.join(path2log, "config.yaml"),'w') as outfile:
        yaml.dump(cfg, outfile, default_flow_style=False, allow_unicode=True)


    logger.info("Parameter Count: {:d}".format(n_model_parameters(model)))
    
    # Load the pretrained weights
    if cfg['network']['use_pretrained'] and cfg['network']['pretrained_path']:
        model, optimizer, scheduler, epoch_it, total_it, metric_val_best = load_checkpoint(model, None, None, filename=cfg['network']['pretrained_path'])

    else:
        logger.warning('MODEL RUNS IN EVAL MODE, BUT NO PRETRAINED WEIGHTS WERE LOADED!!!!')


    # Initialize the trainer
    trainer = config.get_trainer(cfg, model,device)

    # Initialize the container for the evaluation metrics
    eval_metrics = defaultdict(list)    
    start = time.time()
    
    for it, batch in enumerate(tqdm(eval_loader)):
        # Put all the tensors to the designated device
        dict_all_to_device(batch, device)
        

        metrics = trainer.eval_step(batch)
        
        for key in metrics:
            eval_metrics[key].append(metrics[key])


    stop = time.time()

    # Compute mean values of the evaluation statistics
    result_string = ''

    for key, value in eval_metrics.items():
        if key not in ['true_p', 'true_n', 'false_p', 'false_n']:
            result_string += '{}: {:.3f}; '.format(key, np.mean(value))
    
    if 'true_p' in eval_metrics:
        result_string += '{}: {:.3f}; '.format('dataset_precision_f', (np.sum(eval_metrics['true_p']) / (np.sum(eval_metrics['true_p'])  + np.sum(eval_metrics['false_p'])) ))
        result_string += '{}: {:.3f}; '.format('dataset_recall_f', (np.sum(eval_metrics['true_p']) / (np.sum(eval_metrics['true_p'])  + np.sum(eval_metrics['false_n']))))

        result_string += '{}: {:.3f}; '.format('dataset_precision_b', (np.sum(eval_metrics['true_n']) / (np.sum(eval_metrics['true_n'])  + np.sum(eval_metrics['false_n']))))
        result_string += '{}: {:.3f}; '.format('dataset_recall_b', (np.sum(eval_metrics['true_n']) / (np.sum(eval_metrics['true_n'])  + np.sum(eval_metrics['false_p']))))


    logger.info('Outputting the evaluation metrics for: {} {} {} '.format('Flow, ' if cfg['metrics']['flow'] else '', 'Ego-Motion, ' if cfg['metrics']['ego_motion'] else '', 'Bckg. Segmentation' if cfg['metrics']['semantic'] else ''))
    logger.info(result_string)
    logger.info('Evaluation completed in {}s [{}s per scene]'.format((stop - start), (stop - start)/len(eval_loader)))     
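The dataset-level precision/recall above is computed from the confusion counts summed over all scenes (micro-averaging), not by averaging per-scene scores; a small sketch with made-up counts:

import numpy as np

# Hypothetical per-scene confusion counts for the foreground class
true_p = np.array([80, 95, 60])
false_p = np.array([10, 5, 20])
false_n = np.array([5, 10, 15])

precision_f = np.sum(true_p) / (np.sum(true_p) + np.sum(false_p))
recall_f = np.sum(true_p) / (np.sum(true_p) + np.sum(false_n))
print('dataset_precision_f: {:.3f}, dataset_recall_f: {:.3f}'.format(precision_f, recall_f))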
Example #4
    dataset_folder = cfg['data']['path']

    dataset = data.HumansDataset(dataset_folder,
                                 fields,
                                 'test',
                                 specific_model=specific_model)

    test_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=1,
        num_workers=1,
        worker_init_fn=data.worker_init_fn,
        shuffle=False)

    model = config.get_model(cfg, device=device)
    model_dir = os.path.join(out_dir, cfg['test']['model_file'])
    print('Loading checkpoint from %s' % model_dir)
    load_dict = torch.load(model_dir)
    model.load_state_dict(load_dict['model'])

    cfg['generation']['n_time_steps'] = args.seq_length
    generator = config.get_generator(model, cfg, device=device)

    times = np.array(
        [i / (args.seq_length - 1) for i in range(args.seq_length)],
        dtype=np.float32)
    if args.experiment == 'temporal':
        t_idx = np.random.choice(range(args.seq_length),
                                 size=args.seq_length // 2,
                                 replace=False)
Example #5
dataset = data.HumansDataset(dataset_folder=cfg['data']['path'],
                             fields=fields, mode='test', split='test')


# Choose the motion sequence and identity sequence
identity_seq = {'category': 'D-FAUST', 'model': '50002_light_hopping_loose', 'start_idx': 30}
motion_seq = {'category': 'D-FAUST', 'model': '50004_punching', 'start_idx': 60}

inp_id = dataset.get_data_dict(identity_seq)
inp_motion = dataset.get_data_dict(motion_seq)

# Model
model = config.get_model(cfg, device=device, dataset=dataset)

checkpoint_io = CheckpointIO(out_dir, model=model)
checkpoint_io.load(cfg['test']['model_file'])

# Generator
generator = config.get_generator(model, cfg, device=device)

model.eval()
meshes, _ = generator.generate_motion_transfer(inp_id, inp_motion)

# Save generated sequence
if not os.path.isdir(generation_dir):
    os.makedirs(generation_dir)
modelname = '%s_%d_to_%s_%d' % (motion_seq['model'],
                                motion_seq['start_idx'],
                                identity_seq['model'],
                                identity_seq['start_idx'])
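The fragment above ends before the generated sequence is written to disk; a hedged sketch of one way to do it, assuming generate_motion_transfer returns a list of trimesh.Trimesh meshes (the file layout is illustrative, not taken from the source):

import os

# Assumes `meshes`, `generation_dir`, and `modelname` are defined as above.
seq_dir = os.path.join(generation_dir, modelname)
os.makedirs(seq_dir, exist_ok=True)
for frame_idx, mesh in enumerate(meshes):
    # trimesh infers the output format from the file extension
    mesh.export(os.path.join(seq_dir, '%04d.off' % frame_idx))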
Example #6
def main(cfg, args, logger):
    """
    Main function of this software. After preparing the model, carry out the pairwise point cloud registration.

    Args:
        cfg (dict): current configuration parameters
        args (argparse.Namespace): command line arguments
        logger (logging.Logger): logger instance used for the output
    """

    # Get model
    model = config.get_model(cfg)
    model.eval()

    # Load pre-trained model if existing
    kwargs = {'model': model}

    checkpoint_io = CheckpointIO('',
                                 initialize_from='./pretrained/',
                                 initialization_file_name=args.model,
                                 **kwargs)

    try:
        load_dict = checkpoint_io.load()
    except FileExistsError:
        load_dict = dict()

    # Print model parameters and model graph
    nparameters = sum(p.numel() for p in model.parameters())
    logger.info('Total number of model parameters: {}'.format(nparameters))

    # Prepare the output file name
    target_base = './data/demo/pairwise/results'
    id_0 = args.source_pc.split(os.sep)[-1].split('_')[-1].split('.')[0]
    id_1 = args.target_pc.split(os.sep)[-1].split('_')[-1].split('.')[0]
    metadata = np.array([[id_0, id_1, 'True']])

    if not os.path.exists(target_base):
        os.makedirs(target_base)

    target_path = os.path.join(target_base, 'est_T.log')

    with torch.no_grad():
        # Load the point clouds and prepare the input
        start_time_pipeline = time.time()

        point_cloud_files = [args.source_pc, args.target_pc]
        data = prepare_data(point_cloud_files, cfg['misc']['voxel_size'])

        # Extract the descriptors, perform the NN search, and prepare the data for the registration blocks
        start_time_features = time.time()
        filtering_data, _, _ = model.compute_descriptors(data)
        end_time_features = time.time()

        if args.verbose:
            logger.info('Feature computation and sampling took {:.3f}s'.format(
                end_time_features - start_time_features))

        # Filter the putative correspondences and estimate the relative transformation parameters
        start_time_filtering = time.time()
        est_data = model.filter_correspondences(filtering_data)
        end_time_filtering = time.time()

        if args.verbose:
            logger.info(
                'Filtering the correspondences and estimation of parameters took {:.3f}s'
                .format(end_time_filtering - start_time_filtering))

        est_T = np.eye(4)
        est_T[0:3, 0:3] = est_data['rot_est'][-1].cpu().numpy()
        est_T[0:3, 3:4] = est_data['trans_est'][-1].cpu().numpy()

        end_time_pipeline = time.time()

        # Save the results
        write_trajectory(np.expand_dims(est_T, 0), metadata, target_path)

        if args.verbose:
            logger.info(
                'Estimation of the pairwise transformation parameters completed in {:.3f}s'
                .format(end_time_pipeline - start_time_pipeline))
            logger.info(
                'Estimated parameters were saved in {}.'.format(target_path))

        if args.visualize:
            pcd_1 = o3d.io.read_point_cloud(args.source_pc)
            pcd_2 = o3d.io.read_point_cloud(args.target_pc)

            # First plot both point clouds in their original reference frame
            draw_registration_result(pcd_1, pcd_2, np.identity(4))

            # Plot the point clouds after applying the estimated transformation parameters
            draw_registration_result(pcd_1, pcd_2, est_T)
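Example #6 reads its inputs from args; a hedged sketch of a matching argparse setup (the flag names are inferred from the attributes used above and are not confirmed by the source):

import argparse

parser = argparse.ArgumentParser(description='Pairwise point cloud registration demo')
parser.add_argument('--source_pc', required=True, help='path to the source point cloud file')
parser.add_argument('--target_pc', required=True, help='path to the target point cloud file')
parser.add_argument('--model', required=True, help='checkpoint file name under ./pretrained/')
parser.add_argument('--verbose', action='store_true', help='print timing information')
parser.add_argument('--visualize', action='store_true', help='show the point clouds before and after registration')
args = parser.parse_args()
# main(cfg, args, logger) would then be called with a loaded config and a logger instance.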