def main(cfg, config_name):
    """Main training function.

    Prepares the data loaders, model, optimizer and trainer, then runs the
    training loop with gradient accumulation, interval-based logging,
    validation and checkpointing.

    Args:
        cfg (dict): current configuration parameters
        config_name (str): path to the config file
    """
    # Create the output dir if it does not exist
    if not os.path.exists(cfg['misc']['log_dir']):
        os.makedirs(cfg['misc']['log_dir'])

    # Initialize the model
    model = config.get_model(cfg)
    model = model.cuda()

    # Get data loaders
    train_loader = make_data_loader(cfg, phase='train')
    val_loader = make_data_loader(cfg, phase='val')

    # Build a unique, self-describing log directory name from the timestamp
    # and the main configuration switches.
    dataset_name = cfg["data"]["dataset"]
    now = datetime.now().strftime("%y_%m_%d-%H_%M_%S_%f")
    now += "__Method_" + str(cfg['method']['backbone'])
    now += "__Pretrained_" if cfg['network']['use_pretrained'] and cfg[
        'network']['pretrained_path'] else ''
    if cfg['method']['flow']:
        now += "__Flow_"
    if cfg['method']['ego_motion']:
        now += "__Ego_"
    if cfg['method']['semantic']:
        now += "__Sem_"
    now += "__Rem_Ground_" if cfg['data']['remove_ground'] else ''
    now += "__VoxSize_" + str(cfg['misc']["voxel_size"])
    now += "__Pts_" + str(cfg['misc']["num_points"])
    path2log = os.path.join(cfg['misc']['log_dir'], "logs_" + dataset_name,
                            now)
    logger, checkpoint_dir = prepare_logger(cfg, path2log)
    tboard_logger = SummaryWriter(path2log)

    # Output number of model parameters
    logger.info("Parameter Count: {:d}".format(n_model_parameters(model)))

    # Output torch and cuda version
    logger.info('Torch version: {}'.format(torch.__version__))
    logger.info('CUDA version: {}'.format(torch.version.cuda))

    # Save config file that was used for this experiment
    with open(os.path.join(path2log, config_name.split(os.sep)[-1]),
              'w') as outfile:
        yaml.dump(cfg, outfile, default_flow_style=False, allow_unicode=True)

    # Get optimizer and trainer
    optimizer = config.get_optimizer(cfg, model)
    scheduler = config.get_scheduler(cfg, optimizer)

    # Saving/validation intervals: positive values denote iterations,
    # negative values denote epochs (converted to iterations here).
    stat_interval = cfg['train']['stat_interval']
    stat_interval = stat_interval if stat_interval > 0 else abs(
        stat_interval * len(train_loader))
    chkpt_interval = cfg['train']['chkpt_interval']
    chkpt_interval = chkpt_interval if chkpt_interval > 0 else abs(
        chkpt_interval * len(train_loader))
    val_interval = cfg['train']['val_interval']
    val_interval = val_interval if val_interval > 0 else abs(
        val_interval * len(train_loader))

    # If not resuming from a pretrained model, epoch and iteration start at -1
    metric_val_best = np.inf
    running_metrics = {}
    running_losses = {}
    epoch_it = -1
    total_it = -1

    # Load the pretrained weights and carry over the old tensorboard files
    if cfg['network']['use_pretrained'] and cfg['network']['pretrained_path']:
        model, optimizer, scheduler, epoch_it, total_it, metric_val_best = \
            load_checkpoint(model, optimizer, scheduler,
                            filename=cfg['network']['pretrained_path'])

        # Find previous tensorboard files and copy them
        tb_files = glob.glob(
            os.sep.join(cfg['network']['pretrained_path'].split(os.sep)[:-1])
            + '/events.*')
        for tb_file in tb_files:
            shutil.copy(tb_file,
                        os.path.join(path2log, tb_file.split(os.sep)[-1]))

    # Initialize the trainer
    device = torch.device('cuda' if (
        torch.cuda.is_available() and cfg['misc']['use_gpu']) else 'cpu')
    trainer = config.get_trainer(cfg, model, device)
    acc_iter_size = cfg['train']['acc_iter_size']

    # Training loop
    while epoch_it < cfg['train']['max_epoch']:
        epoch_it += 1
        lr = scheduler.get_last_lr()
        logger.info('Training epoch: {}, LR: {} '.format(epoch_it, lr))
        gc.collect()
        # FIX: use the builtin iterator protocol; the Python-2 style
        # loader.__iter__().next() raises AttributeError on Python 3.
        train_loader_iter = iter(train_loader)
        start = time.time()
        tbar = tqdm(total=len(train_loader) // acc_iter_size, ncols=100)
        for it in range(len(train_loader) // acc_iter_size):
            optimizer.zero_grad()
            total_it += 1
            batch_metrics = {}
            batch_losses = {}
            # Accumulate gradients over acc_iter_size sub-batches before
            # taking one optimizer step.
            for iter_idx in range(acc_iter_size):
                batch = next(train_loader_iter)
                dict_all_to_device(batch, device)
                losses, metrics, total_loss = trainer.train_step(batch)
                total_loss.backward()

                # Save the running metrics and losses
                if not batch_metrics:
                    batch_metrics = copy.deepcopy(metrics)
                else:
                    for key, value in metrics.items():
                        batch_metrics[key] += value
                if not batch_losses:
                    batch_losses = copy.deepcopy(losses)
                else:
                    for key, value in losses.items():
                        batch_losses[key] += value

            # Compute the mean value of the metrics and losses of the batch
            for key, value in batch_metrics.items():
                batch_metrics[key] = value / acc_iter_size
            for key, value in batch_losses.items():
                batch_losses[key] = value / acc_iter_size

            optimizer.step()
            torch.cuda.empty_cache()
            tbar.set_description('Loss: {:.3g}'.format(
                batch_losses['total_loss']))
            tbar.update(1)

            # Fold the batch statistics into the running statistics that are
            # flushed to tensorboard every stat_interval iterations.
            if not running_metrics:
                running_metrics = copy.deepcopy(batch_metrics)
            else:
                for key, value in batch_metrics.items():
                    running_metrics[key] += value
            if not running_losses:
                running_losses = copy.deepcopy(batch_losses)
            else:
                for key, value in batch_losses.items():
                    running_losses[key] += value

            # Logs
            if total_it % stat_interval == stat_interval - 1:
                logger.info("Epoch {0:d} - It. {1:d}: loss = {2:.3f}".format(
                    epoch_it, total_it,
                    running_losses['total_loss'] / stat_interval))
                for key, value in running_losses.items():
                    tboard_logger.add_scalar("Train/{}".format(key),
                                             value / stat_interval, total_it)
                    # Reinitialize the values
                    running_losses[key] = 0
                for key, value in running_metrics.items():
                    tboard_logger.add_scalar("Train/{}".format(key),
                                             value / stat_interval, total_it)
                    # Reinitialize the values
                    running_metrics[key] = 0
                start = time.time()

            # Run validation
            if total_it % val_interval == val_interval - 1:
                logger.info("Starting the validation")
                val_losses, val_metrics = trainer.validate(val_loader)
                for key, value in val_losses.items():
                    tboard_logger.add_scalar("Val/{}".format(key), value,
                                             total_it)
                for key, value in val_metrics.items():
                    tboard_logger.add_scalar("Val/{}".format(key), value,
                                             total_it)
                logger.info(
                    "VALIDATION -It. {0:d}: total loss: {1:.3f}.".format(
                        total_it, val_losses['total_loss']))
                if val_losses['total_loss'] < metric_val_best:
                    metric_val_best = val_losses['total_loss']
                    logger.info('New best model (loss: {:.4f})'.format(
                        metric_val_best))
                    save_checkpoint(os.path.join(path2log, 'model_best.pt'),
                                    epoch=epoch_it, it=total_it, model=model,
                                    optimizer=optimizer, scheduler=scheduler,
                                    config=cfg, best_val=metric_val_best)
                else:
                    save_checkpoint(
                        os.path.join(path2log,
                                     'model_{}.pt'.format(total_it)),
                        epoch=epoch_it, it=total_it, model=model,
                        optimizer=optimizer, scheduler=scheduler, config=cfg,
                        best_val=val_losses['total_loss'])

        # After the epoch is finished update the scheduler
        scheduler.step()

    # FIX: the original completion message referenced the undefined name
    # model_selection_metric (NameError) and the per-epoch counter `it`
    # (unbound if the loop body never ran); report the tracked totals.
    logger.info(
        'Training completed after {} Epochs ({} it) with best val loss = {}'
        .format(epoch_it, total_it, metric_val_best))
def main(cfg, logger):
    """Train a model and periodically validate it.

    Prepares the data loaders, tensorboard writer, model, optimizer and
    trainer, optionally resumes the training state from an existing
    checkpoint, and then runs the epoch loop with interval-based logging,
    checkpointing and validation.

    Args:
        cfg (dict): current configuration parameters
        logger: logger object used for console/file output
    """
    # Validation metric and whether a larger or smaller value is better.
    model_selection_metric = cfg['train']['model_selection_metric']
    selection_mode = cfg['train']['model_selection_mode']
    if selection_mode == 'maximize':
        model_selection_sign = 1
    elif selection_mode == 'minimize':
        model_selection_sign = -1
    else:
        raise ValueError(
            'model_selection_mode must be either maximize or minimize.')

    # Data loaders
    train_loader = make_data_loader(cfg, phase='train')
    val_loader = make_data_loader(cfg, phase='val')

    # Tensorboard writer
    tboard_logger = SummaryWriter(os.path.join(cfg['misc']['log_dir'],
                                               'logs'))

    # Model, optimizer and trainer
    model = config.get_model(cfg)
    optimizer_cls = getattr(optim, cfg['optimizer']['alg'])
    optimizer = optimizer_cls(model.parameters(),
                              lr=cfg['optimizer']['learning_rate'],
                              weight_decay=cfg['optimizer']['weight_decay'])
    trainer = config.get_trainer(cfg, model, optimizer, tboard_logger)

    # Resume from an existing checkpoint if one is available.
    checkpoint_io = CheckpointIO(
        cfg['misc']['log_dir'],
        initialize_from=cfg['model']['init_from'],
        initialization_file_name=cfg['model']['init_file_name'],
        model=model,
        optimizer=optimizer)
    try:
        load_dict = checkpoint_io.load('model.pt')
    except FileExistsError:
        load_dict = dict()
    epoch_it = load_dict.get('epoch_it', -1)
    it = load_dict.get('it', -1)
    metric_val_best = load_dict.get('loss_val_best',
                                    -model_selection_sign * np.inf)
    # Guard against a checkpoint that stored an uninitialized best metric.
    if metric_val_best == np.inf or metric_val_best == -np.inf:
        metric_val_best = -model_selection_sign * np.inf
    logger.info('Current best validation metric ({}): {:.5f}'.format(
        model_selection_metric, metric_val_best))

    # Intervals: positive values are iterations, negative values are epochs
    # and get converted to iterations here.
    batches_per_epoch = len(train_loader)
    stat_interval = cfg['train']['stat_interval']
    if stat_interval <= 0:
        stat_interval = abs(stat_interval * batches_per_epoch)
    chkpt_interval = cfg['train']['chkpt_interval']
    if chkpt_interval <= 0:
        chkpt_interval = abs(chkpt_interval * batches_per_epoch)
    val_interval = cfg['train']['val_interval']
    if val_interval <= 0:
        val_interval = abs(val_interval * batches_per_epoch)

    # Report the model size.
    n_params = sum(p.numel() for p in model.parameters())
    logger.info('Total number of parameters: {}'.format(n_params))

    # Training loop
    while epoch_it < cfg['train']['max_epoch']:
        epoch_it += 1
        for batch in train_loader:
            it += 1
            loss = trainer.train_step(batch, it)
            tboard_logger.add_scalar('train/loss', loss, it)

            # Print output
            if stat_interval != 0 and it != 0 and it % stat_interval == 0:
                logger.info('[Epoch {}] it={}, loss={:.4f}'.format(
                    epoch_it, it, loss))

            # Save checkpoint
            if chkpt_interval != 0 and it != 0 and it % chkpt_interval == 0:
                logger.info('Saving checkpoint')
                checkpoint_io.save('model.pt', epoch_it=epoch_it, it=it,
                                   loss_val_best=metric_val_best)

            # Run validation
            if val_interval != 0 and it != 0 and it % val_interval == 0:
                eval_dict = trainer.evaluate(val_loader, it)
                metric_val = eval_dict[model_selection_metric]
                logger.info('Validation metric ({}): {:.4f}'.format(
                    model_selection_metric, metric_val))
                for k, v in eval_dict.items():
                    tboard_logger.add_scalar('val/{}'.format(k), v, it)
                # A positive signed difference means an improvement.
                if model_selection_sign * (metric_val - metric_val_best) > 0:
                    metric_val_best = metric_val
                    logger.info('New best model (loss {:.4f})'.format(
                        metric_val_best))
                    checkpoint_io.save('model_best.pt', epoch_it=epoch_it,
                                       it=it, loss_val_best=metric_val_best)

    # Quit after the maximum number of epochs is reached
    logger.info(
        'Training completed after {} Epochs ({} it) with best val metric ({})={}'
        .format(epoch_it, it, model_selection_metric, metric_val_best))
def main(cfg, logger):
    """Evaluate a pretrained model on the test split.

    Prepares the test data loader and the model, restores the pretrained
    weights, runs the evaluation loop, and logs the aggregated metrics.

    Args:
        cfg (dict): current configuration parameters
        logger: logger object (replaced below by a run-specific one)
    """
    # Make sure the result directory exists.
    results_dir = cfg['test']['results_dir']
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    # Model and compute device
    model = config.get_model(cfg)
    use_cuda = torch.cuda.is_available() and cfg['misc']['use_gpu']
    device = torch.device('cuda' if use_cuda else 'cpu')

    # Test data loader
    eval_loader = make_data_loader(cfg, phase='test')

    # Run-specific log directory
    dataset_name = cfg["data"]["dataset"]
    run_name = '{}_{}'.format(cfg['method']['backbone'],
                              cfg['misc']['num_points'])
    path2log = os.path.join(results_dir, dataset_name, run_name)
    logger, checkpoint_dir = prepare_logger(cfg, path2log)

    # Environment info
    logger.info('Torch version: {}'.format(torch.__version__))
    logger.info('CUDA version: {}'.format(torch.version.cuda))
    logger.info('Starting evaluation of the method {} on {} dataset'.format(
        cfg['method']['backbone'], dataset_name))

    # Persist the configuration used for this run.
    with open(os.path.join(path2log, "config.yaml"), 'w') as outfile:
        yaml.dump(cfg, outfile, default_flow_style=False, allow_unicode=True)

    logger.info("Parameter Count: {:d}".format(n_model_parameters(model)))

    # Restore the pretrained weights (no optimizer/scheduler needed in eval).
    if cfg['network']['use_pretrained'] and cfg['network']['pretrained_path']:
        model, optimizer, scheduler, epoch_it, total_it, metric_val_best = \
            load_checkpoint(model, None, None,
                            filename=cfg['network']['pretrained_path'])
    else:
        logger.warning(
            'MODEL RUNS IN EVAL MODE, BUT NO PRETRAINED WEIGHTS WERE LOADED!!!!')

    # Initialize the trainer
    trainer = config.get_trainer(cfg, model, device)

    # Collect the per-batch metrics over the whole test set.
    eval_metrics = defaultdict(list)
    start = time.time()
    for it, batch in enumerate(tqdm(eval_loader)):
        # Put all the tensors on the designated device
        dict_all_to_device(batch, device)
        metrics = trainer.eval_step(batch)
        for key in metrics:
            eval_metrics[key].append(metrics[key])
    stop = time.time()

    # Aggregate: mean of the scalar metrics, plus dataset-level
    # precision/recall from the accumulated confusion-matrix counts.
    count_keys = ['true_p', 'true_n', 'false_p', 'false_n']
    result_string = ''
    for key, value in eval_metrics.items():
        if key not in count_keys:
            result_string += '{}: {:.3f}; '.format(key, np.mean(value))

    if 'true_p' in eval_metrics:
        tp = np.sum(eval_metrics['true_p'])
        tn = np.sum(eval_metrics['true_n'])
        fp = np.sum(eval_metrics['false_p'])
        fn = np.sum(eval_metrics['false_n'])
        result_string += '{}: {:.3f}; '.format('dataset_precision_f',
                                               tp / (tp + fp))
        result_string += '{}: {:.3f}; '.format('dataset_recall_f',
                                               tp / (tp + fn))
        result_string += '{}: {:.3f}; '.format('dataset_precision_b',
                                               tn / (tn + fn))
        result_string += '{}: {:.3f}; '.format('dataset_recall_b',
                                               tn / (tn + fp))

    logger.info('Outputing the evaluation metric for: {} {} {} '.format(
        'Flow, ' if cfg['metrics']['flow'] else '',
        'Ego-Motion, ' if cfg['metrics']['ego_motion'] else '',
        'Bckg. Segmentaion' if cfg['metrics']['semantic'] else ''))
    logger.info(result_string)
    logger.info('Evaluation completed in {}s [{}s per scene]'.format(
        (stop - start), (stop - start) / len(eval_loader)))
dataset_folder = cfg['data']['path'] ################ dataset = data.HumansDataset(dataset_folder, fields, 'test', specific_model=specific_model) test_loader = torch.utils.data.DataLoader( dataset, batch_size=1, num_workers=1, worker_init_fn=data.worker_init_fn, shuffle=False) model = config.get_model(cfg, device=device) model_dir = os.path.join(out_dir, cfg['test']['model_file']) print('Loading checkpoint from %s' % model_dir) load_dict = torch.load(model_dir) model.load_state_dict(load_dict['model']) cfg['generation']['n_time_steps'] = args.seq_length generator = config.get_generator(model, cfg, device=device) times = np.array( [i / (args.seq_length - 1) for i in range(args.seq_length)], dtype=np.float32) if args.experiment == 'temporal': t_idx = np.random.choice(range(args.seq_length), size=args.seq_length // 2, replace=False)
transform, seq_len=cfg['data']['length_sequence'], scale_type=cfg['data']['scale_type']) } dataset = data.HumansDataset(dataset_folder=cfg['data']['path'], fields=fields, mode='test', split='test') # Choose the motion sequence and identity sequence identity_seq = {'category': 'D-FAUST', 'model': '50002_light_hopping_loose', 'start_idx': 30} motion_seq = {'category': 'D-FAUST', 'model': '50004_punching', 'start_idx': 60} inp_id = dataset.get_data_dict(identity_seq) inp_motion = dataset.get_data_dict(motion_seq) # Model model = config.get_model(cfg, device=device, dataset=dataset) checkpoint_io = CheckpointIO(out_dir, model=model) checkpoint_io.load(cfg['test']['model_file']) # Generator generator = config.get_generator(model, cfg, device=device) model.eval() meshes, _ = generator.generate_motion_transfer(inp_id, inp_motion) # Save generated sequence if not os.path.isdir(generation_dir): os.makedirs(generation_dir) modelname = '%s_%d_to_%s_%d' % (motion_seq['model'], motion_seq['start_idx'],
def main(cfg, args, logger):
    """Register a pair of point clouds with a pretrained model.

    Restores the pretrained weights, computes descriptors for the two input
    point clouds, filters the putative correspondences, estimates the
    relative transformation, and writes it to a trajectory file. Optionally
    visualizes the clouds before and after alignment.

    Args:
        cfg (dict): current configuration parameters
        args: parsed command line arguments (point cloud paths and flags)
        logger: logger object used for console output
    """
    # Model in inference mode
    model = config.get_model(cfg)
    model.eval()

    # Restore the pretrained weights if the checkpoint file exists.
    checkpoint_io = CheckpointIO('', initialize_from='./pretrained/',
                                 initialization_file_name=args.model,
                                 model=model)
    try:
        load_dict = checkpoint_io.load()
    except FileExistsError:
        load_dict = dict()

    # Report the model size.
    n_params = sum(p.numel() for p in model.parameters())
    logger.info('Total number of model parameters: {}'.format(n_params))

    # Output file name: the pair ids are parsed from the input file names.
    target_base = './data/demo/pairwise/results'
    id_0 = args.source_pc.split(os.sep)[-1].split('_')[-1].split('.')[0]
    id_1 = args.target_pc.split(os.sep)[-1].split('_')[-1].split('.')[0]
    metadata = np.array([[id_0, id_1, 'True']])
    if not os.path.exists(target_base):
        os.makedirs(target_base)
    target_path = os.path.join(target_base, 'est_T.log')

    with torch.no_grad():
        t_pipeline_start = time.time()

        # Load the point clouds and prepare the model input.
        point_cloud_files = [args.source_pc, args.target_pc]
        data = prepare_data(point_cloud_files, cfg['misc']['voxel_size'])

        # Descriptor extraction and NN search for the registration blocks
        t_feat_start = time.time()
        filtering_data, _, _ = model.compute_descriptors(data)
        t_feat_end = time.time()
        if args.verbose:
            logger.info('Feature computation and sampling took {:.3f}s'.format(
                t_feat_end - t_feat_start))

        # Correspondence filtering and transformation estimation
        t_filter_start = time.time()
        est_data = model.filter_correspondences(filtering_data)
        t_filter_end = time.time()
        if args.verbose:
            logger.info(
                'Filtering the correspondences and estimation of paramaters took {:.3f}s'
                .format(t_filter_end - t_filter_start))

        # Assemble the 4x4 homogeneous transform from the last iteration's
        # rotation and translation estimates.
        est_T = np.eye(4)
        est_T[0:3, 0:3] = est_data['rot_est'][-1].cpu().numpy()
        est_T[0:3, 3:4] = est_data['trans_est'][-1].cpu().numpy()
        t_pipeline_end = time.time()

        # Save the results
        write_trajectory(np.expand_dims(est_T, 0), metadata, target_path)
        if args.verbose:
            logger.info(
                'Estimation of the pairwise transformation parameters completed in {:.3f}s'
                .format(t_pipeline_end - t_pipeline_start))
            logger.info(
                'Estimated parameters were saved in {}.'.format(target_path))

    if args.visualize:
        pcd_1 = o3d.io.read_point_cloud(args.source_pc)
        pcd_2 = o3d.io.read_point_cloud(args.target_pc)
        # First plot both point clouds in their original reference frame...
        draw_registration_result(pcd_1, pcd_2, np.identity(4))
        # ...then after applying the estimated transformation parameters.
        draw_registration_result(pcd_1, pcd_2, est_T)