def main():
    # Init random seeds.
    random.seed(config.RANDOM_SEED)
    np.random.seed(config.RANDOM_SEED)
    torch.manual_seed(config.RANDOM_SEED)
    torch.cuda.manual_seed(config.RANDOM_SEED)

    # Setup Tensorboard.
    print('\nsaving run in .. {}'.format(config.TRAINED_MODELS_DIR))
    if not os.path.exists(config.TRAINED_MODELS_DIR):
        os.makedirs(config.TRAINED_MODELS_DIR)
    writer = SummaryWriter(f'{config.TRAINED_MODELS_DIR}')

    # Load the model.
    print()
    model = affnet.ResNetAffNet(pretrained=config.IS_PRETRAINED, num_classes=config.NUM_CLASSES)
    model.to(config.DEVICE)

    # Load the dataset.
    train_loader, val_loader, test_loader = umd_dataset_loaders.load_umd_train_datasets()

    # Construct an optimizer.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY,
                                momentum=config.MOMENTUM)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        milestones=config.MILESTONES,
                                                        gamma=config.GAMMA)

    # Main training loop.
    num_epochs = config.NUM_EPOCHS
    best_Fwb = -np.inf

    for epoch in range(0, num_epochs):
        print()
        # Subsample the dataset until the full-dataset epoch is reached.
        is_subsample = epoch < config.EPOCH_TO_TRAIN_FULL_DATASET

        # Train & val for one epoch.
        model, optimizer = train_utils.train_one_epoch(model, optimizer, train_loader,
                                                       config.DEVICE, epoch, writer,
                                                       is_subsample=is_subsample)
        model, optimizer = train_utils.val_one_epoch(model, optimizer, val_loader,
                                                     config.DEVICE, epoch, writer,
                                                     is_subsample=is_subsample)

        # Update learning rate.
        lr_scheduler.step()

        # # eval Fwb
        # model, Fwb = eval_utils.affnet_eval_umd(model, test_loader)
        # writer.add_scalar('eval/Fwb', Fwb, int(epoch))

        # # save best model.
        # if Fwb > best_Fwb:
        #     best_Fwb = Fwb
        #     writer.add_scalar('eval/Best_Fwb', best_Fwb, int(epoch))
        #     checkpoint_path = config.BEST_MODEL_SAVE_PATH
        #     train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path)
        #     print("Saving best model .. best Fwb={:.5f} ..".format(best_Fwb))

        # Save a checkpoint every epoch (np.str is deprecated; use the built-in str).
        checkpoint_path = config.MODEL_SAVE_PATH + 'affnet_epoch_' + str(epoch) + '.pth'
        train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path)
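
# These AffNet/MaskRCNN scripts call a four-argument helper,
# train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path), whose implementation is
# not included in this excerpt. The sketch below is an assumption inferred from the call sites and
# from the keys ("model", "optimizer") that the fine-tuning script later loads back; the name
# save_checkpoint_sketch is hypothetical.
import torch


def save_checkpoint_sketch(model, optimizer, epoch, checkpoint_path):
    # Persist everything needed to resume or fine-tune: weights, optimizer state, and epoch index.
    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "epoch": epoch,
        },
        checkpoint_path,
    )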
def main_worker(gpu_idx, configs):
    configs.gpu_idx = gpu_idx

    if configs.gpu_idx is not None:
        print("Use GPU: {} for training".format(configs.gpu_idx))
        configs.device = torch.device('cuda:{}'.format(configs.gpu_idx))

    if configs.distributed:
        if configs.dist_url == "env://" and configs.rank == -1:
            configs.rank = int(os.environ["RANK"])
        if configs.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes.
            configs.rank = configs.rank * configs.ngpus_per_node + gpu_idx
        dist.init_process_group(backend=configs.dist_backend,
                                init_method=configs.dist_url,
                                world_size=configs.world_size,
                                rank=configs.rank)

    configs.is_master_node = (not configs.distributed) or (
        configs.distributed and (configs.rank % configs.ngpus_per_node == 0))

    if configs.is_master_node:
        logger = Logger(configs.logs_dir, configs.saved_fn)
        logger.info('>>> Created a new logger')
        logger.info('>>> configs: {}'.format(configs))
        tb_writer = SummaryWriter(log_dir=os.path.join(configs.logs_dir, 'tensorboard'))
    else:
        logger = None
        tb_writer = None

    # Model.
    model = get_model(configs)

    # Data parallel.
    model = make_data_parallel(model, configs)

    # Freeze model.
    model = freeze_model(model, configs.freeze_modules_list)

    if configs.is_master_node:
        num_parameters = get_num_parameters(model)
        logger.info('number of trained parameters of the model: {}'.format(num_parameters))

    optimizer = get_optimizer(configs, model, is_warm_up=False)
    lr_scheduler = get_lr_scheduler(optimizer, configs)
    best_val_loss = np.inf
    earlystop_count = 0

    # Optionally load weights from a checkpoint.
    if configs.pretrained_path is not None:
        model = load_pretrained_model(model, configs.pretrained_path, gpu_idx,
                                      configs.overwrite_global_2_local)
        if logger is not None:
            logger.info('loaded pretrained model at {}'.format(configs.pretrained_path))

    # Optionally resume from a checkpoint.
    if configs.resume_path is not None:
        checkpoint = resume_model(configs.resume_path, configs.arch, configs.gpu_idx)
        if hasattr(model, 'module'):
            model.module.load_state_dict(checkpoint['state_dict'])
        else:
            model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        best_val_loss = checkpoint['best_val_loss']
        earlystop_count = checkpoint['earlystop_count']
        configs.start_epoch = checkpoint['epoch'] + 1

    if logger is not None:
        logger.info(">>> Loading dataset & getting dataloader...")

    # Create dataloaders.
    train_loader, val_loader, train_sampler = create_train_val_dataloader(configs)
    if logger is not None:
        logger.info('number of batches in train set: {}'.format(len(train_loader)))
        if val_loader is not None:
            logger.info('number of batches in val set: {}'.format(len(val_loader)))

    if configs.evaluate:
        assert val_loader is not None, "The validation dataloader should not be None"
        val_loss = validate_one_epoch(val_loader, model, configs.start_epoch - 1, configs, logger)
        print('Evaluate, val_loss: {}'.format(val_loss))
        return

    for epoch in range(configs.start_epoch, configs.num_epochs + 1):
        # Get the current learning rate.
        for param_group in optimizer.param_groups:
            lr = param_group['lr']
        if logger is not None:
            logger.info('{}'.format('*-' * 40))
            logger.info('{} {}/{} {}'.format('=' * 35, epoch, configs.num_epochs, '=' * 35))
            logger.info('{}'.format('*-' * 40))
            logger.info('>>> Epoch: [{}/{}] learning rate: {:.2e}'.format(epoch, configs.num_epochs, lr))

        if configs.distributed:
            train_sampler.set_epoch(epoch)

        # Train for one epoch.
        train_loss = train_one_epoch(train_loader, model, optimizer, epoch, configs, logger)

        # Evaluate on the validation set.
        if not configs.no_val:
            val_loss = validate_one_epoch(val_loader, model, epoch, configs, logger)

        # Adjust the learning rate.
        if configs.lr_type == 'step_lr':
            lr_scheduler.step()
        elif configs.lr_type == 'plateau':
            # Plateau scheduling needs val_loss, so a validation set is required.
            assert not configs.no_val, "Only use plateau when a validation set is available"
            lr_scheduler.step(val_loss)

        if not configs.no_val:
            is_best = val_loss <= best_val_loss
            best_val_loss = min(val_loss, best_val_loss)
            print_string = '\t--- train_loss: {:.4f}, val_loss: {:.4f}, best_val_loss: {:.4f}\t'.format(
                train_loss, val_loss, best_val_loss)

            if tb_writer is not None:
                tb_writer.add_scalars('Loss', {'train': train_loss, 'val': val_loss}, epoch)

            if configs.is_master_node and (is_best or ((epoch % configs.checkpoint_freq) == 0)):
                saved_state = get_saved_state(model, optimizer, lr_scheduler, epoch, configs,
                                              best_val_loss, earlystop_count)
                save_checkpoint(configs.checkpoints_dir, configs.saved_fn, saved_state, is_best, epoch)

            if configs.earlystop_patience:
                earlystop_count = 0 if is_best else (earlystop_count + 1)
                print_string += ' |||\t earlystop_count: {}'.format(earlystop_count)
                if configs.earlystop_patience <= earlystop_count:
                    print_string += '\n\t--- Early stopping!!!'
                    break
                else:
                    print_string += '\n\t--- Continue training..., earlystop_count: {}'.format(earlystop_count)

            if logger is not None:
                logger.info(print_string)
        else:
            if tb_writer is not None:
                tb_writer.add_scalars('Loss', {'train': train_loss}, epoch)
            if configs.is_master_node and ((epoch % configs.checkpoint_freq) == 0):
                saved_state = get_saved_state(model, optimizer, lr_scheduler, epoch, configs,
                                              best_val_loss, earlystop_count)
                save_checkpoint(configs.checkpoints_dir, configs.saved_fn, saved_state, False, epoch)

    if tb_writer is not None:
        tb_writer.close()

    cleanup()
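
# main_worker(gpu_idx, configs) above is written to run once per GPU, but the launcher is not part
# of this excerpt. Below is a minimal sketch using torch.multiprocessing.spawn, assuming configs
# already carries ngpus_per_node, gpu_idx, multiprocessing_distributed and the dist_* fields used
# above; the function name spawn_workers_sketch is hypothetical.
import torch.multiprocessing as mp


def spawn_workers_sketch(configs):
    if configs.multiprocessing_distributed:
        # spawn() calls main_worker(gpu_idx, configs) with gpu_idx in [0, ngpus_per_node).
        mp.spawn(main_worker, nprocs=configs.ngpus_per_node, args=(configs,))
    else:
        # Single-process fallback: run the worker directly on the configured GPU index.
        main_worker(configs.gpu_idx, configs)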
def main():
    # Init random seeds.
    random.seed(config.RANDOM_SEED)
    np.random.seed(config.RANDOM_SEED)
    torch.manual_seed(config.RANDOM_SEED)
    torch.cuda.manual_seed(config.RANDOM_SEED)

    # Setup Tensorboard.
    print('\nsaving run in .. {}'.format(config.TRAINED_MODELS_DIR))
    if not os.path.exists(config.TRAINED_MODELS_DIR):
        os.makedirs(config.TRAINED_MODELS_DIR)
    writer = SummaryWriter(f'{config.TRAINED_MODELS_DIR}')

    # Load the model.
    print()
    # Compare Pytorch-Simple-MaskRCNN with Torchvision MaskRCNN.
    model = model_utils.get_model_instance_segmentation(pretrained=config.IS_PRETRAINED,
                                                        num_classes=config.NUM_CLASSES)
    model.to(config.DEVICE)

    # Load the dataset.
    train_loader, val_loader, test_loader = arl_affpose_dataset_loaders.load_arl_affpose_train_datasets()

    # Construct an optimizer.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY,
                                momentum=config.MOMENTUM)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        milestones=config.MILESTONES,
                                                        gamma=config.GAMMA)

    # Main training loop.
    num_epochs = config.NUM_EPOCHS
    best_Fwb, best_mAP = -np.inf, -np.inf

    for epoch in range(0, num_epochs):
        print()

        # Train & val for one epoch.
        model, optimizer = train_utils.train_one_epoch(model, optimizer, train_loader,
                                                       config.DEVICE, epoch, writer,
                                                       is_subsample=True)
        model, optimizer = train_utils.val_one_epoch(model, optimizer, val_loader,
                                                     config.DEVICE, epoch, writer,
                                                     is_subsample=True)

        # Update learning rate.
        lr_scheduler.step()

        # Save a checkpoint every epoch (np.str is deprecated; use the built-in str).
        checkpoint_path = config.MODEL_SAVE_PATH + 'maskrcnn_epoch_' + str(epoch) + '.pth'
        train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path)
def main():
    # Init random seeds.
    random.seed(config.RANDOM_SEED)
    np.random.seed(config.RANDOM_SEED)
    torch.manual_seed(config.RANDOM_SEED)
    torch.cuda.manual_seed(config.RANDOM_SEED)

    # Setup Tensorboard.
    print('\nsaving run in .. {}'.format(config.TRAINED_MODELS_DIR))
    if not os.path.exists(config.TRAINED_MODELS_DIR):
        os.makedirs(config.TRAINED_MODELS_DIR)
    writer = SummaryWriter(f'{config.TRAINED_MODELS_DIR}')

    # Load the model.
    print()
    model = maskrcnn.ResNetMaskRCNN(pretrained=config.IS_PRETRAINED,
                                    num_classes=config.COCO_NUM_CLASSES)
    model.to(config.DEVICE)

    # Load the dataset.
    train_loader, val_loader = coco_dataset_loaders.load_coco_train_datasets()

    # Construct an optimizer.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY,
                                momentum=config.MOMENTUM)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        milestones=config.MILESTONES,
                                                        gamma=config.GAMMA)

    # Main training loop.
    num_epochs = config.NUM_EPOCHS

    for epoch in range(0, num_epochs):
        print()
        # Subsample the dataset until the full-dataset epoch is reached.
        is_subsample = epoch < config.EPOCH_TO_TRAIN_FULL_DATASET

        # Train & val for one epoch.
        model, optimizer = train_utils.train_one_epoch(model, optimizer, train_loader,
                                                       config.DEVICE, epoch, writer,
                                                       is_subsample=is_subsample)
        model, optimizer = train_utils.val_one_epoch(model, optimizer, val_loader,
                                                     config.DEVICE, epoch, writer,
                                                     is_subsample=is_subsample)

        # Update learning rate.
        lr_scheduler.step()

        # Save a checkpoint every epoch (np.str is deprecated; use the built-in str).
        checkpoint_path = config.MODEL_SAVE_PATH + 'maskrcnn_epoch_' + str(epoch) + '.pth'
        train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path)
        print(f'saved model to {checkpoint_path} ..')
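
# All of the training scripts above schedule the learning rate with MultiStepLR driven by
# config.MILESTONES and config.GAMMA. As a quick, self-contained illustration of how those two
# values interact (the numbers below are made up, not the project's config), the learning rate is
# multiplied by gamma each time a milestone epoch is passed.
import torch

_params = [torch.nn.Parameter(torch.zeros(1))]
_optimizer = torch.optim.SGD(_params, lr=0.1)
_scheduler = torch.optim.lr_scheduler.MultiStepLR(_optimizer, milestones=[3, 6], gamma=0.1)

for _epoch in range(8):
    # lr is 0.1 for epochs 0-2, 0.01 for epochs 3-5, and 0.001 from epoch 6 onwards.
    print(_epoch, _optimizer.param_groups[0]['lr'])
    _optimizer.step()
    _scheduler.step()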
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, scheduler,
                       loss_fn, metrics, params, exp_dir, args, summ_maker=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches
            training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches
            validation data
        optimizer: (torch.optim) optimizer for parameters of model
        scheduler: (torch.optim.lr_scheduler.ExponentialLR) the exponential learning rate
            scheduler
        loss_fn: a function that takes batch_output and batch_labels and computes the loss
            for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and
            labels of each batch
        params: (Params) hyperparameters
        exp_dir: (string) full path of the directory containing the parameters, weights and
            logs for the current experiment
        args: the parser object containing the user-informed arguments
        summ_maker: the SummaryMaker object that writes the training information to a
            tensorboard-readable file
    """
    # Reload weights from restore_file if specified.
    # TODO: load and set best validation error.
    if args.restore_file is not None:
        restore_path = join(exp_dir, (args.restore_file + '.pth.tar'))
        logging.info("Restoring parameters from {}".format(restore_path))
        train_utils.load_checkpoint(restore_path, model)

    # best_val_c_error = float("inf")
    best_val_auc = 0

    # Before starting the first epoch do the eval.
    logging.info('Pretraining evaluation...')
    # Epoch 0 is the validation epoch before the learning starts.
    summ_maker.epoch = 0
    val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params, args,
                           summ_maker=summ_maker)

    for epoch in range(params.num_epochs):
        # The first epoch after training is 1, not 0.
        summ_maker.epoch = epoch + 1

        # Run one epoch.
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # Compute number of batches in one epoch (one full pass over the training set).
        train(model, optimizer, loss_fn, train_dataloader, metrics, params,
              summ_maker=summ_maker)

        # Update the learning rate.
        scheduler.step()

        # Evaluate for one epoch on the validation set.
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params, args,
                               summ_maker=summ_maker)
        val_auc = val_metrics['AUC']
        is_best = val_auc >= best_val_auc

        # Save weights.
        train_utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=exp_dir)

        # If best_eval, update best_save_path.
        if is_best:
            logging.info("- Found new best auc")
            best_val_auc = val_auc

            # Save best val metrics in a json file in the model directory.
            best_json_path = join(exp_dir, "metrics_val_best_weights.json")
            train_utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory.
        last_json_path = join(exp_dir, "metrics_val_last_weights.json")
        train_utils.save_dict_to_json(val_metrics, last_json_path)
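
# train_and_evaluate() expects `metrics` to be a dict of callables and reads val_metrics['AUC'] to
# pick the best checkpoint. The project's actual metric functions are not shown here; the sketch
# below is only an assumed shape for such a dict, assuming outputs and labels arrive as numpy
# arrays and using scikit-learn's roc_auc_score for the AUC entry.
import numpy as np
from sklearn.metrics import roc_auc_score

metrics_sketch = {
    # roc_auc_score takes (y_true, y_score); outputs are treated as scores in [0, 1].
    'AUC': lambda outputs, labels: roc_auc_score(labels, outputs),
    'accuracy': lambda outputs, labels: float(np.mean((outputs > 0.5) == labels)),
}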
def main():
    # Init random seeds.
    random.seed(config.RANDOM_SEED)
    np.random.seed(config.RANDOM_SEED)
    torch.manual_seed(config.RANDOM_SEED)
    torch.cuda.manual_seed(config.RANDOM_SEED)

    # Setup Tensorboard.
    print('\nsaving run in .. {}'.format(config.TRAINED_MODELS_DIR))
    if not os.path.exists(config.TRAINED_MODELS_DIR):
        os.makedirs(config.TRAINED_MODELS_DIR)
    writer = SummaryWriter(f'{config.TRAINED_MODELS_DIR}')

    # Load the model.
    print()
    model = affnet.ResNetAffNet(pretrained=config.IS_PRETRAINED, num_classes=config.NUM_CLASSES)
    model.to(config.DEVICE)
    torch.cuda.empty_cache()

    # # TODO: Freeze the backbone.
    # model = model_utils.freeze_backbone(model, verbose=True)

    # Load pre-trained AffNet weights.
    print(f"\nrestoring pre-trained AffNet weights: {config.RESTORE_SYN_ARL_AFFNET_WEIGHTS} .. ")
    checkpoint = torch.load(config.RESTORE_SYN_ARL_AFFNET_WEIGHTS, map_location=config.DEVICE)
    model.load_state_dict(checkpoint["model"])
    model.to(config.DEVICE)

    # Load the dataset.
    train_loader, val_loader, test_loader = arl_affpose_dataset_loaders.load_arl_affpose_train_datasets()

    # Construct an optimizer.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY,
                                momentum=config.MOMENTUM)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        milestones=config.MILESTONES,
                                                        gamma=config.GAMMA)

    # # TODO: Load saved optimizer state.
    # optimizer.load_state_dict(checkpoint["optimizer"])

    # Main training loop.
    num_epochs = config.NUM_EPOCHS
    best_Fwb, best_mAP = -np.inf, -np.inf

    for epoch in range(0, num_epochs):
        print()
        # Subsample the dataset until the full-dataset epoch is reached.
        is_subsample = epoch < config.EPOCH_TO_TRAIN_FULL_DATASET

        # Train & val for one epoch.
        model, optimizer = train_utils.train_one_epoch(model, optimizer, train_loader,
                                                       config.DEVICE, epoch, writer,
                                                       is_subsample=is_subsample)
        model, optimizer = train_utils.val_one_epoch(model, optimizer, val_loader,
                                                     config.DEVICE, epoch, writer,
                                                     is_subsample=is_subsample)

        # Update learning rate.
        lr_scheduler.step()

        # Evaluate on the test set.
        model, mAP, Fwb = eval_utils.affnet_eval_arl_affpose(model, test_loader)

        # eval Fwb
        writer.add_scalar('eval/Fwb', Fwb, int(epoch))
        if Fwb > best_Fwb:
            best_Fwb = Fwb
            writer.add_scalar('eval/Best_Fwb', best_Fwb, int(epoch))
            checkpoint_path = config.BEST_MODEL_SAVE_PATH
            train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path)
            print("Saving best model .. best Fwb={:.5f} ..".format(best_Fwb))

        # eval mAP
        writer.add_scalar('eval/mAP', mAP, int(epoch))
        if mAP > best_mAP:
            best_mAP = mAP
            writer.add_scalar('eval/Best_mAP', best_mAP, int(epoch))

        # Save a checkpoint every epoch (np.str is deprecated; use the built-in str).
        checkpoint_path = config.MODEL_SAVE_PATH + 'affnet_epoch_' + str(epoch) + '.pth'
        train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path)
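
# The fine-tuning script above keeps "freeze the backbone" as a commented-out TODO that relies on a
# model_utils.freeze_backbone helper not shown in this excerpt. The sketch below shows the usual
# approach, assuming the model exposes its feature extractor under a `backbone` attribute; both the
# attribute name and the function name freeze_backbone_sketch are assumptions.
def freeze_backbone_sketch(model, verbose=False):
    for name, param in model.named_parameters():
        if name.startswith('backbone'):
            # Frozen parameters are excluded by the `p.requires_grad` filter used to build the optimizer.
            param.requires_grad = False
            if verbose:
                print('froze {}'.format(name))
    return model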
def main_worker(gpu, ngpus_per_node, args):
    """
    :param gpu: current gpu id
    :param ngpus_per_node: number of gpus in one node
    :param args: config parameters
    :return: init the training setup and train iteratively
    """
    params = vars(args)
    args.gpu = gpu

    # Suppress printing if not the master process.
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    print("=> creating model '{}'".format(args.arch))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes.
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # Init model.
    model = CLSA(models.__dict__[args.arch], args, args.moco_dim, args.moco_k,
                 args.moco_m, args.moco_t, args.mlp)
    print(model)

    if args.distributed:
        # For multiprocessing distributed, the DistributedDataParallel constructor
        # should always set the single device scope; otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per DistributedDataParallel,
            # we need to divide the batch size ourselves based on the total number of GPUs.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set.
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # Comment out the following line for debugging.
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # The AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # Define the loss function (criterion) and optimizer.
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Optionally resume from a checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map the model to be loaded to the specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            exit()

    cudnn.benchmark = True

    # Config the data loader.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    fix_transform = Multi_Fixtransform(args.size_crops, args.nmb_crops,
                                       args.min_scale_crops, args.max_scale_crops,
                                       normalize, args.aug_times)
    traindir = os.path.join(args.data, 'train')
    train_dataset = datasets.ImageFolder(traindir, fix_transform)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)

    save_path = init_log_path(args)  # config model save path and log path
    log_path = os.path.join(save_path, "train.log")

    best_Acc = 0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        acc1 = train(train_loader, model, criterion, optimizer, epoch, args, log_path)
        # A new best is reached when the current accuracy exceeds the best seen so far.
        is_best = acc1 > best_Acc
        best_Acc = max(best_Acc, acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_dict = {
                'epoch': epoch + 1,
                'arch': args.arch,
                'best_acc': best_Acc,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            if epoch % 10 == 9:
                tmp_save_path = os.path.join(save_path, 'checkpoint_{:04d}.pth.tar'.format(epoch))
                save_checkpoint(save_dict, is_best=False, filename=tmp_save_path)
            tmp_save_path = os.path.join(save_path, 'checkpoint_best.pth.tar')
            save_checkpoint(save_dict, is_best=is_best, filename=tmp_save_path)
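
# main_worker(gpu, ngpus_per_node, args) above follows the MoCo-style per-GPU worker pattern, but
# the launcher is not included in this excerpt. Below is a minimal sketch, assuming args provides
# gpu, world_size, multiprocessing_distributed and the dist_* fields used above; the function name
# launch_sketch is hypothetical.
import torch
import torch.multiprocessing as mp


def launch_sketch(args):
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # One process per GPU; world_size becomes the total number of processes across nodes.
        args.world_size = ngpus_per_node * args.world_size
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Single-process path (the worker above raises unless DistributedDataParallel is used).
        main_worker(args.gpu, ngpus_per_node, args)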