def main():
    # Create main logger
    logger = get_logger('CASENetPredictor')

    parser = argparse.ArgumentParser(description='CASENet2D testing')
    parser.add_argument(
        '--config',
        type=str,
        help='Path to the YAML config file',
        default='/home/SENSETIME/shenrui/Dropbox/SenseTime/edgeDL/resources/test_config_backup.yaml')
    args = parser.parse_args()

    # Load and log experiment configuration
    config = load_config(args.config)
    logger.info(config)

    # Create the model
    model = get_model(config)

    # Load model state
    model_path = config['model_path']
    logger.info(f'Loading model from {model_path}...')
    load_checkpoint(model_path, model)

    logger.info(f"Sending the model to '{config['device']}'")
    model = model.to(config['device'])

    folderpath = config['save_path']
    logger.info(f'Destination of predictions is {folderpath}...')

    logger.info('Loading datasets...')
    eval_score_avg = 0
    count = 0
    for test_loader in get_test_loaders(config):
        logger.info(f"Processing '{test_loader.dataset.file_path}'...")

        output_file = _get_output_file(test_loader.dataset, folderpath=folderpath)

        # run the model prediction on the entire dataset and save to nifti image
        eval_score, num = predict(model, test_loader, output_file, config, logger)
        eval_score_avg = (eval_score_avg * count + eval_score * num) / (count + num)
        count += num
        logger.info(
            f'Testing finished. Average evaluation score: {eval_score}. Saving predictions to: {output_file}...')

    logger.info(f'Total average evaluation score: {eval_score_avg}')
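
# The snippets in this section all rely on a `load_checkpoint` helper whose
# implementation is not shown. A minimal sketch, assuming the checkpoint is a
# dict produced by `torch.save` with 'model_state_dict' and (optionally)
# 'optimizer_state_dict' keys; the actual helper in the source may differ:

import torch


def load_checkpoint(checkpoint_path, model, optimizer=None):
    """Load model (and optionally optimizer) state from a checkpoint file.

    The full checkpoint dict is returned so callers can read bookkeeping
    fields such as 'epoch' or 'best_eval_score' when they are present.
    """
    # map_location='cpu' keeps loading device-agnostic; the caller moves the model afterwards
    state = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state['model_state_dict'])
    if optimizer is not None and 'optimizer_state_dict' in state:
        optimizer.load_state_dict(state['optimizer_state_dict'])
    return state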
def main():
    parser = argparse.ArgumentParser(description='GCN testing')
    parser.add_argument('--config', type=str, help='Path to the YAML config file', default=CONFIG_PATH)
    parser.add_argument('--model', type=str, help='Path to the model parameters', default=MODEL_PATH)
    args = parser.parse_args()

    # Load experiment configuration
    config = load_config(args.config)
    print(config)

    manual_seed = config.get('manual_seed', None)
    if manual_seed is not None:
        torch.manual_seed(manual_seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Create the model
    module_path = "models.gcn.model"
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(_get_model(module_path, config))
    else:
        model = _get_model(module_path, config)
    load_checkpoint(args.model, model)

    # Put the model on GPUs
    model = model.to(config['device'])

    # Create loss criterion
    loss_criterion = get_loss_criterion(config)
    # Create evaluation metric
    eval_criterion = get_evaluation_metric(config)

    # Create data loaders
    loaders = get_data_loaders(config)

    # Start testing
    val_losses, val_scores = validate(model, loaders['test'], loss_criterion, eval_criterion, config['device'])
    print('testing loss is:', val_losses)
    print('evaluation score is:', val_scores)
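
# `_get_model` is not shown above. A plausible sketch, assuming it looks up the
# class named in config['model']['name'] inside the given module path
# (e.g. "models.gcn.model") and builds it from the remaining config entries:

import importlib


def _get_model(module_path, config):
    """Instantiate the model class named in the config from `module_path`.

    Assumption: config['model'] holds a 'name' key plus the constructor kwargs.
    """
    model_config = dict(config['model'])
    model_class = getattr(importlib.import_module(module_path), model_config.pop('name'))
    return model_class(**model_config)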
@classmethod
def from_pretrained(cls, pre_trained, model, optimizer, lr_scheduler, loss_criterion,
                    eval_criterion, device, loaders,
                    max_num_epochs=100, max_num_iterations=1e5,
                    validate_after_iters=100, log_after_iters=100,
                    validate_iters=None, num_iterations=1, num_epoch=0,
                    align_start_iters=None, align_after_iters=None,
                    eval_score_higher_is_better=True, best_eval_score=None,
                    level_set_config=None, logger=None):
    logger.info(f"Loading pre-trained model from '{pre_trained}'...")
    load_checkpoint(pre_trained, model, None)
    checkpoint_dir = os.path.split(pre_trained)[0]
    return cls(model, optimizer, lr_scheduler, loss_criterion, eval_criterion,
               device, loaders, checkpoint_dir,
               eval_score_higher_is_better=eval_score_higher_is_better,
               best_eval_score=best_eval_score,
               num_iterations=num_iterations,
               num_epoch=num_epoch,
               max_num_epochs=max_num_epochs,
               max_num_iterations=max_num_iterations,
               validate_after_iters=validate_after_iters,
               log_after_iters=log_after_iters,
               validate_iters=validate_iters,
               align_start_iters=align_start_iters,
               align_after_iters=align_after_iters,
               level_set_config=level_set_config,
               logger=logger)
def run(args):
    env = args['env']
    env = NormalizedActions(env)
    device = args['device']

    input_shape = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    actor = ddpg_actor(input_shape, action_dim).to(device)
    target_actor = ddpg_actor(input_shape, action_dim).to(device)
    actor_optimizer = Adam(actor.parameters(), lr=args['actor_lr'])

    critic = ddpg_critic(input_shape, 1, action_dim).to(device)
    target_critic = ddpg_critic(input_shape, 1, action_dim).to(device)
    critic_optimizer = Adam(critic.parameters(), lr=args['critic_lr'])

    target_actor.hard_update(actor)
    target_critic.hard_update(critic)

    total_steps = 0
    total_episodes = 0

    if args['resume']:
        params = {
            'params': [
                'total_steps', 'total_episodes', 'actor', 'target_actor',
                'critic', 'target_critic', 'actor_optimizer',
                'critic_optimizer', 'state_info', 'reward_info'
            ]
        }
        (total_steps, total_episodes, actor_weights, target_actor_weights,
         critic_weights, target_critic_weights, actor_optimizer_params,
         critic_optimizer_params, state_params,
         reward_params) = load_checkpoint(args['resume'], **params)

        actor.set_weights(actor_weights)
        target_actor.set_weights(target_actor_weights)
        critic.set_weights(critic_weights)
        target_critic.set_weights(target_critic_weights)
        actor_optimizer.load_state_dict(actor_optimizer_params)
        critic_optimizer.load_state_dict(critic_optimizer_params)

        try:
            args['state_info'] = state_params
            args['reward_info'] = reward_params
        except KeyError:
            print(args.keys())

    args['total_steps'] = total_steps
    args['total_episodes'] = total_episodes

    if not args['test_only']:
        trained_vars = train(actor, target_actor, critic, target_critic,
                             actor_optimizer, critic_optimizer, args)
        actor = trained_vars['actor']

    test(actor, args['n_test_episodes'], args)
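
# `hard_update` copies the online network's parameters into the target network
# so both start identical. The actor/critic classes above expose it as a
# method; the copy itself is just this minimal sketch:

def hard_update(target, source):
    """Copy every parameter of `source` into `target` (target <- source)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)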
@classmethod
def from_checkpoint(cls, checkpoint_path, model, optimizer, lr_scheduler, loss_criterion,
                    eval_criterion, device, loaders,
                    align_start_iters=None, align_after_iters=None,
                    level_set_config=None, logger=None):
    logger.info(f"Loading checkpoint '{checkpoint_path}'...")
    state = load_checkpoint(checkpoint_path, model, optimizer)
    logger.info(
        f"Checkpoint loaded. Epoch: {state['epoch']}. Best val score: {state['best_eval_score']}. "
        f"Num_iterations: {state['num_iterations']}")
    checkpoint_dir = os.path.split(checkpoint_path)[0]
    return cls(model, optimizer, lr_scheduler, loss_criterion, eval_criterion,
               device, loaders, checkpoint_dir,
               eval_score_higher_is_better=state['eval_score_higher_is_better'],
               best_eval_score=state['best_eval_score'],
               num_iterations=state['num_iterations'],
               num_epoch=state['epoch'],
               max_num_epochs=state['max_num_epochs'],
               max_num_iterations=state['max_num_iterations'],
               validate_after_iters=state['validate_after_iters'],
               log_after_iters=state['log_after_iters'],
               validate_iters=state['validate_iters'],
               align_start_iters=align_start_iters,
               align_after_iters=align_after_iters,
               level_set_config=level_set_config,
               logger=logger)
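
# `from_checkpoint` expects the checkpoint dict to carry trainer bookkeeping in
# addition to the weights. A hypothetical `save_checkpoint` that would produce
# a state compatible with the keys read above (a sketch, not the trainer's
# actual save routine):

import torch


def save_checkpoint(checkpoint_path, model, optimizer, trainer_state):
    """Persist model/optimizer weights plus trainer bookkeeping fields.

    `trainer_state` is assumed to contain keys such as 'epoch', 'num_iterations',
    'best_eval_score', 'eval_score_higher_is_better', 'max_num_epochs',
    'max_num_iterations', 'validate_after_iters', 'log_after_iters' and
    'validate_iters', mirroring what `from_checkpoint` reads back.
    """
    state = dict(trainer_state)
    state['model_state_dict'] = model.state_dict()
    state['optimizer_state_dict'] = optimizer.state_dict()
    torch.save(state, checkpoint_path)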
def main():
    args = parser.parse_args()

    setup_default_logging(log_path=os.path.join(args.output, 'log.txt'))

    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
        _logger.info('Training in distributed mode with multiple processes, 1 GPU per process. '
                     'Process %d, total %d.' % (args.rank, args.world_size))
    else:
        _logger.info('Training with a single process on 1 GPU.')
    assert args.rank >= 0

    # resolve AMP arguments based on PyTorch / Apex availability
    use_amp = None
    if args.amp and has_native_amp:
        use_amp = 'native'
        _logger.info(f'Using PyTorch {torch.__version__} amp...')

    torch.manual_seed(args.seed + args.rank)

    model = getModel(model_name=args.model, args=args)

    if args.local_rank == 0:
        _logger.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel() for m in model.parameters()])))

    # move model to GPU
    model.cuda()

    if args.distributed and args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        if args.local_rank == 0:
            _logger.info(
                'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using '
                'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.')

    # optimizer
    optimizer = create_optimizer(args, model)

    amp_autocast = suppress  # do nothing
    loss_scaler = None
    if use_amp == 'native':
        amp_autocast = torch.cuda.amp.autocast
        loss_scaler = NativeScaler()
        if args.local_rank == 0:
            _logger.info('Using native Torch AMP. Training in mixed precision.')
    else:
        if args.local_rank == 0:
            _logger.info('AMP not enabled. Training in float32.')

    resume_epoch = None
    if args.resume:
        resume_epoch = resume_checkpoint(
            model,
            args.resume,
            optimizer=None if args.no_resume_opt else optimizer,
            loss_scaler=None if args.no_resume_opt else loss_scaler,
            log_info=args.local_rank == 0)

    # setup exponential moving average of model weights, SWA could be used here too
    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEmaV2(
            model,
            decay=args.model_ema_decay,
            device='cpu' if args.model_ema_force_cpu else None)
        if args.resume:
            load_checkpoint(model_ema.module, args.resume, use_ema=True)

    # set up distributed training
    if args.distributed:
        if args.local_rank == 0:
            _logger.info("Using native Torch DistributedDataParallel.")
        model = NativeDDP(model, device_ids=[args.local_rank])  # can use device str in Torch >= 1.1

    # lr schedule
    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        _logger.info('Scheduled epochs: {}'.format(num_epochs))

    # create the train and eval datasets
    dataset_train = getDataset(dataset_name=args.dataset, mode=args.train_split, args=args)
    dataset_eval = getDataset(dataset_name=args.dataset, mode=args.val_split, args=args)

    # create loaders
    loader_train = getDataLoader(dataset_train, is_training=True, args=args)
    loader_eval = getDataLoader(dataset_eval, is_training=False, args=args)

    # set up loss functions
    train_loss_fn = nn.CrossEntropyLoss().cuda()
    validate_loss_fn = nn.CrossEntropyLoss().cuda()

    # set up checkpoint saver and eval metric tracking
    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"),
            args.dataset,
            args.model,
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(model=model,
                                optimizer=optimizer,
                                args=args,
                                model_ema=model_ema,
                                amp_scaler=loss_scaler,
                                checkpoint_dir=output_dir,
                                recovery_dir=output_dir,
                                decreasing=decreasing,
                                max_history=args.checkpoint_hist)
        # with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
        #     f.write(args_text)

    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed and hasattr(loader_train.sampler, 'set_epoch'):
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_one_epoch(epoch,
                                            model,
                                            loader_train,
                                            optimizer,
                                            train_loss_fn,
                                            args,
                                            lr_scheduler=lr_scheduler,
                                            saver=saver,
                                            output_dir=output_dir,
                                            amp_autocast=amp_autocast,
                                            loss_scaler=loss_scaler,
                                            model_ema=model_ema)

            eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast)

            if model_ema is not None and not args.model_ema_force_cpu:
                if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                    distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce')
                ema_eval_metrics = validate(model_ema.module,
                                            loader_eval,
                                            validate_loss_fn,
                                            args,
                                            amp_autocast=amp_autocast,
                                            log_suffix=' (EMA)')
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric)

    except KeyboardInterrupt:
        pass

    if best_metric is not None:
        _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
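
# `ModelEmaV2` above keeps an exponential moving average of the training
# weights. Conceptually, after every optimizer step the EMA copy is updated as
# ema = decay * ema + (1 - decay) * online. A minimal sketch of that update
# rule (not timm's implementation):

import torch


@torch.no_grad()
def ema_update(ema_model, model, decay=0.9998):
    """Blend the online model's parameters and buffers into the EMA copy in place."""
    ema_state = ema_model.state_dict()
    for name, value in model.state_dict().items():
        ema_value = ema_state[name]
        if ema_value.dtype.is_floating_point:
            ema_value.mul_(decay).add_(value.to(ema_value.device), alpha=1.0 - decay)
        else:
            ema_value.copy_(value)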