def main(args):
    root_dir = dirname(abspath(__file__))
    # Load the parameters from the json file
    imagenet_dir = args.data_dir
    exp_dir = join(root_dir, 'training', 'experiments', args.exp_name)
    json_path = join(exp_dir, 'parameters.json')
    assert isfile(json_path), (
        "No json configuration file found at {}".format(json_path))
    params = train_utils.Params(json_path)
    # Add the timer option to the parameters
    params.update_with_dict({'timer': args.timer})
    params.update_with_dict({'num_workers': args.num_workers})
    train_utils.set_logger(join(exp_dir, '{}.log'.format(args.mode)))
    logging.info("----Starting train script in mode: {}----"
                 .format(args.mode))
    setup_timer = Timer(convert=True)
    setup_timer.reset()
    logging.info("Loading datasets...")
    # Get the correct model
    if params.model == 'BaselineEmbeddingNet':
        model = mdl.SiameseNet(mdl.BaselineEmbeddingNet(),
                               upscale=params.upscale,
                               corr_map_size=33, stride=4)
    elif params.model == 'VGG11EmbeddingNet_5c':
        model = mdl.SiameseNet(mdl.VGG11EmbeddingNet_5c(),
                               upscale=params.upscale,
                               corr_map_size=33, stride=4)
    elif params.model == 'VGG16EmbeddingNet_8c':
        model = mdl.SiameseNet(mdl.VGG16EmbeddingNet_8c(),
                               upscale=params.upscale,
                               corr_map_size=33, stride=4)
    else:
        # Fail early with a clear message instead of a NameError later on.
        raise ValueError("Unknown model: {}".format(params.model))
    # Freeze all the indicated parameters
    for i, (name, parameter) in enumerate(model.named_parameters()):
        if i in params.parameter_freeze:
            logging.info("Freezing parameter {}".format(name))
            parameter.requires_grad = False
    model = model.to(device)
    # Set the tensorboard summary maker
    summ_maker = SummaryMaker(join(exp_dir, 'tensorboard'), params,
                              model.upscale_factor)

    label_function = create_BCELogit_loss_label
    img_read_fcn = imutils.get_decode_jpeg_fcn(flag=args.imutils_flag)
    img_resize_fcn = imutils.get_resize_fcn(flag=args.imutils_flag)

    logging.info("Validation dataset...")
    metadata_val_file = join(exp_dir, "metadata.val")
    val_set = ImageNetVID_val(imagenet_dir,
                              label_fcn=label_function,
                              pos_thr=params.pos_thr,
                              neg_thr=params.neg_thr,
                              upscale_factor=model.upscale_factor,
                              cxt_margin=params.context_margin,
                              reference_size=params.reference_sz,
                              search_size=params.search_sz,
                              img_read_fcn=img_read_fcn,
                              resize_fcn=img_resize_fcn,
                              metadata_file=metadata_val_file,
                              save_metadata=metadata_val_file,
                              max_frame_sep=params.max_frame_sep)
    val_loader = DataLoader(val_set, batch_size=params.batch_size,
                            shuffle=False, num_workers=params.num_workers,
                            pin_memory=True)
    if params.eval_epoch_size > len(val_loader):
        logging.info('The user set eval_epoch_size ({}) is bigger than the '
                     'size of the eval set ({}). \n Setting '
                     'eval_epoch_size to the eval set size.'
                     .format(params.eval_epoch_size, len(val_loader)))
        params.eval_epoch_size = len(val_loader)

    # Fetch the loss function and metrics
    loss_fn = losses.BCELogit_Loss
    metrics = met.METRICS
    # Set the optional keyword arguments for the functions that need it
    metrics['center_error']['kwargs']['upscale_factor'] = model.upscale_factor

    try:
        if args.mode == 'train':
            logging.info("Training dataset...")
            metadata_train_file = join(exp_dir, "metadata.train")
            train_set = ImageNetVID(imagenet_dir,
                                    label_fcn=label_function,
                                    pos_thr=params.pos_thr,
                                    neg_thr=params.neg_thr,
                                    upscale_factor=model.upscale_factor,
                                    cxt_margin=params.context_margin,
                                    reference_size=params.reference_sz,
                                    search_size=params.search_sz,
                                    img_read_fcn=img_read_fcn,
                                    resize_fcn=img_resize_fcn,
                                    metadata_file=metadata_train_file,
                                    save_metadata=metadata_train_file,
                                    max_frame_sep=params.max_frame_sep)
            train_loader = DataLoader(train_set, batch_size=params.batch_size,
                                      shuffle=True,
                                      num_workers=params.num_workers,
                                      pin_memory=True)
            # Though I'm not a big fan of changing the value of a parameter
            # variable after it has been read, at least I let the user know
            # I'm changing it.
            if params.train_epoch_size > len(train_loader):
                logging.info('The user set train_epoch_size ({}) is bigger '
                             'than the size of the train set ({}). \n Setting '
                             'train_epoch_size to the train set size.'
                             .format(params.train_epoch_size,
                                     len(train_loader)))
                params.train_epoch_size = len(train_loader)

            logging.info("Done")
            logging.info("Setup time: {}".format(setup_timer.elapsed))
            parameters = filter(lambda p: p.requires_grad, model.parameters())
            optimizer = optimz.OPTIMIZERS[params.optim](parameters,
                                                        **params.optim_kwargs)
            # Set the scheduler, which updates the learning rate with an
            # exponential decay. If you don't want lr decay, set it to 1.
            logging.info("Using Exponential Learning Rate Decay of {}"
                         .format(params.lr_decay))
            scheduler = torch.optim.lr_scheduler.ExponentialLR(
                optimizer, params.lr_decay)

            logging.info("Epoch sizes: {} in train and {} in eval"
                         .format(params.train_epoch_size,
                                 params.eval_epoch_size))
            logging.info("Starting training for {} epoch(s)"
                         .format(params.num_epochs))
            with Timer(convert=True) as t:
                train_and_evaluate(model, train_loader, val_loader, optimizer,
                                   scheduler, loss_fn, metrics, params,
                                   exp_dir, args, summ_maker=summ_maker)
            if params.timer:
                logging.info("[profiling] Total time to train {} epochs, "
                             "with {} elements on training dataset and {} "
                             "on val dataset: {}"
                             .format(params.num_epochs, len(train_loader),
                                     len(val_loader), t.elapsed))

        elif args.mode == 'eval':
            logging.info("Done")
            with Timer(convert=True) as total:
                logging.info("Starting evaluation")
                # TODO write a decent Exception
                if args.restore_file is None:
                    raise IncompleteArgument("In eval mode you have to "
                                             "specify a model checkpoint to "
                                             "be loaded and evaluated. "
                                             "E.g: --restore_file best")
                checkpoint_path = join(exp_dir,
                                       args.restore_file + '.pth.tar')
                train_utils.load_checkpoint(checkpoint_path, model)
                # Evaluate
                summ_maker.epoch = 0
                test_metrics = evaluate(model, loss_fn, val_loader, metrics,
                                        params, args, summ_maker=summ_maker)
                save_path = join(exp_dir, "metrics_test_{}.json"
                                 .format(args.restore_file))
                train_utils.save_dict_to_json(test_metrics, save_path)
            if params.timer:
                logging.info("[profiling] Total evaluation time: {}"
                             .format(total.elapsed))
    except KeyboardInterrupt:
        logging.info("=== User interrupted execution ===")
        raise
    except Exception:
        logging.exception("Fatal error in main loop")
        logging.info("=== Execution Terminated with error ===")
    else:
        logging.info("=== Execution exited normally ===")
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       scheduler, loss_fn, metrics, params, exp_dir, args,
                       summ_maker=None):
    """Train the model and evaluate it every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object
            that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object
            that fetches validation data
        optimizer: (torch.optim) optimizer for the parameters of the model
        scheduler: (torch.optim.lr_scheduler.ExponentialLR) the exponential
            learning rate scheduler
        loss_fn: a function that takes batch_output and batch_labels and
            computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using
            the output and labels of each batch
        params: (Params) hyperparameters
        exp_dir: (string) full path of the directory containing the
            parameters, weights and logs for the current experiment
        args: the parser object containing the user-informed arguments
        summ_maker: the SummaryMaker object that writes the training
            information to a tensorboard-readable file
    """
    # Reload weights from restore_file if specified
    # TODO load and set best validation error
    if args.restore_file is not None:
        restore_path = join(exp_dir, (args.restore_file + '.pth.tar'))
        logging.info("Restoring parameters from {}".format(restore_path))
        train_utils.load_checkpoint(restore_path, model)

    # best_val_c_error = float("inf")
    best_val_auc = 0
    # Before starting the first epoch, do the eval
    logging.info('Pretraining evaluation...')
    # Epoch 0 is the validation epoch before the learning starts.
    summ_maker.epoch = 0
    val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params,
                           args, summ_maker=summ_maker)

    for epoch in range(params.num_epochs):
        # The first epoch after training is 1, not 0
        summ_maker.epoch = epoch + 1
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # Train for one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params,
              summ_maker=summ_maker)

        # Update the learning rate
        scheduler.step()

        # Evaluate for one epoch on the validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics,
                               params, args, summ_maker=summ_maker)

        val_auc = val_metrics['AUC']
        is_best = val_auc >= best_val_auc

        # Save weights
        train_utils.save_checkpoint(
            {'epoch': epoch + 1,
             'state_dict': model.state_dict(),
             'optim_dict': optimizer.state_dict()},
            is_best=is_best,
            checkpoint=exp_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best AUC")
            best_val_auc = val_auc
            # Save best val metrics in a json file in the model directory
            best_json_path = join(exp_dir, "metrics_val_best_weights.json")
            train_utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = join(exp_dir, "metrics_val_last_weights.json")
        train_utils.save_dict_to_json(val_metrics, last_json_path)
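# ---------------------------------------------------------------------------
# Hypothetical post-training sketch (not in the original file): shows how the
# artifacts written by train_and_evaluate() could be consumed afterwards. It
# relies only on calls already used above (join, logging,
# train_utils.load_checkpoint) plus the standard json module. The filename
# 'best.pth.tar' follows the restore_file + '.pth.tar' convention used in
# main()'s eval branch and is assumed to be what save_checkpoint(is_best=True)
# produces; the 'AUC' key matches the metric tracked above.
def _example_load_best(exp_dir, model):
    import json
    best_json_path = join(exp_dir, "metrics_val_best_weights.json")
    with open(best_json_path) as f:
        best_metrics = json.load(f)
    logging.info("Best validation AUC so far: {}"
                 .format(best_metrics.get('AUC')))
    # Restore the weights that achieved that AUC.
    train_utils.load_checkpoint(join(exp_dir, 'best.pth.tar'), model)
    return best_metrics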