def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        model = custom_models.__dict__[args.arch]([2, 2, 2, 2],
                                                  pooling_type='max',
                                                  in_chns=1,
                                                  num_classes=12,
                                                  inplanes=64)
        # print("=> creating model '{}'".format(args.arch))
        # model = models.__dict__[args.arch]()

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to
            # all available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    args.mean = [0.5, 0.5, 0.5]
    args.std = [0.5, 0.5, 0.5]
    trans_funcs = []

    val_loader = torch.utils.data.DataLoader(
        get_val_dataset(args.data_dir + '/img/', args.data_dir + '/gt/',
                        args.data_dir + '/all_imgs.txt', args.test_inds,
                        trans_funcs, args.mean, args.std, args.target_size),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    train_dataset = get_train_dataset(args.data_dir + '/img/',
                                      args.data_dir + '/gt/',
                                      args.data_dir + '/all_imgs.txt',
                                      args.test_inds, trans_funcs, args.mean,
                                      args.std, args.target_size)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers,
        pin_memory=True,
        sampler=train_sampler)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
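

# NOTE: `adjust_learning_rate` and `save_checkpoint` are called in the loop
# above but are not defined in this file. Below is a minimal sketch, assuming
# they follow the stock PyTorch ImageNet example (10x step decay every 30
# epochs, checkpoint copied to 'model_best.pth.tar'); the project's actual
# helpers may differ.
def adjust_learning_rate(optimizer, epoch, args):
    """Decay the learning rate by a factor of 10 every 30 epochs (assumed)."""
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Save the latest state and keep a copy of the best one (assumed layout)."""
    import shutil  # local import so this sketch stays self-contained
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')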
def main(opts):
    """Main function for the training pipeline

    :opts: command-line arguments
    :returns: None
    """
    ##########################################################################
    #                             Basic settings                             #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(os.path.join(model_dir, opts.run_name), exist_ok=True)
    os.makedirs(os.path.join(log_dir, opts.run_name))

    pprint(vars(opts))
    with open(os.path.join(log_dir, opts.run_name, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    torch.manual_seed(opts.seed)
    np.random.seed(opts.seed)
    random.seed(opts.seed)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation  #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name), flush_secs=5)

    if opts.train_mode == 'combined':
        train_dataset = get_train_dataset(opts.data_root, opts, opts.folder1,
                                          opts.folder2, opts.folder3)
    elif opts.train_mode == 'oversampling':
        train_dataset = get_train_dataset_by_oversampling(
            opts.data_root, opts, opts.folder1, opts.folder2, opts.folder3)
    elif opts.train_mode == 'pretrain_and_finetune':
        train_dataset, finetune_dataset = get_pretrain_and_finetune_datast(
            opts.data_root, opts, opts.folder1, opts.folder2, opts.folder3)
        finetune_loader = torch.utils.data.DataLoader(
            finetune_dataset,
            batch_size=opts.batch_size,
            num_workers=opts.num_workers,
            drop_last=False,
            shuffle=True)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=opts.num_workers,
                                               drop_last=False,
                                               shuffle=True)

    val_dataset = get_val_dataset(os.path.join('data', 'val'), opts)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.eval_batch_size,
                                             shuffle=False,
                                             num_workers=opts.num_workers,
                                             drop_last=False)

    test_dataset = get_test_dataset(os.path.join('data', 'test'), opts)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=opts.eval_batch_size,
                                              shuffle=False,
                                              num_workers=opts.num_workers,
                                              drop_last=False)

    assert (train_dataset.class_to_idx == val_dataset.class_to_idx ==
            test_dataset.class_to_idx), "Mapping not correct"

    model = get_model(opts)
    opts.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if torch.cuda.device_count() > 1 and not opts.no_data_parallel:
        model = nn.DataParallel(model)
    model = model.to(opts.device)

    optimizer = optim.RMSprop(model.parameters(),
                              lr=opts.lr,
                              alpha=0.9,
                              weight_decay=1e-5,
                              momentum=0.9)
    scheduler = get_lr_scheduler(optimizer, opts)

    best_val_loss = float('inf')
    best_val_accu = float(0)
    best_val_rec = float(0)
    best_val_prec = float(0)
    best_val_f1 = float(0)
    best_val_auc = float(0)

    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################
    for epoch in range(opts.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, optimizer,
                                               opts)
        if (epoch == opts.finetune_epoch
                and opts.train_mode == 'pretrain_and_finetune'):
            train_loader = finetune_loader
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=opts.lr,
                                      alpha=0.9,
                                      weight_decay=1e-5,
                                      momentum=0.9)
            scheduler = torch.optim.lr_scheduler.StepLR(
                optimizer,
                step_size=opts.step_size_finetuning,
                gamma=opts.gamma)

        # Run the validation set
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, opts)

        ##############################
        #   Write to summary writer  #
        ##############################
        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        train_rec, val_rec = train_metric['recalls'], val_metric['recalls']
        train_prec, val_prec = (train_metric['precisions'],
                                val_metric['precisions'])
        train_f1, val_f1 = train_metric['f1'], val_metric['f1']
        train_auc, val_auc = train_metric['auc'], val_metric['auc']

        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_acc, epoch)
        writer.add_scalar('Precision/Train', train_prec, epoch)
        writer.add_scalar('Recall/Train', train_rec, epoch)
        writer.add_scalar('F1/Train', train_f1, epoch)
        writer.add_scalar('AUC/Train', train_auc, epoch)

        writer.add_scalar('Loss/Val', val_loss, epoch)
        writer.add_scalar('Accuracy/Val', val_acc, epoch)
        writer.add_scalar('Precision/Val', val_prec, epoch)
        writer.add_scalar('Recall/Val', val_rec, epoch)
        writer.add_scalar('F1/Val', val_f1, epoch)
        writer.add_scalar('AUC/Val', val_auc, epoch)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler in ['step', 'cosine']:
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print_epoch_progress(epoch, opts.epochs, train_loss, val_loss, delta,
                             train_metric, val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if bool(opts.save_model):
                torch.save(
                    model.state_dict(),
                    os.path.join(model_dir, opts.run_name,
                                 'best_state_dict.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if val_rec > best_val_rec:
            best_val_rec = val_rec

        if val_prec > best_val_prec:
            best_val_prec = val_prec

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            print(f'The best validation F1-score is now {best_val_f1}')
            print(f'The validation accuracy and AUC are now {val_acc} '
                  f'and {val_auc}')

        if val_auc > best_val_auc:
            best_val_auc = val_auc

        if iteration_change_loss == opts.patience and opts.early_stopping:
            print(
                ('Early stopping after {0} iterations without the decrease ' +
                 'of the val loss').format(iteration_change_loss))
            break

    t_end_training = time.time()
    print(f'training took {t_end_training - t_start_training}s')
    print(f'Best validation accuracy: {best_val_accu}')
    print(f'Best validation loss: {best_val_loss}')
    print(f'Best validation precision: {best_val_prec}')
    print(f'Best validation recall: {best_val_rec}')
    print(f'Best validation f1: {best_val_f1}')
    print(f'Best validation AUC: {best_val_auc}')

    with torch.no_grad():
        if opts.train_mode in ['combined', 'oversampling']:
            model.load_state_dict(
                torch.load(
                    os.path.join(model_dir, opts.run_name,
                                 'best_state_dict.pth')))
        test_loss, test_metric = evaluate_model(model, test_loader, opts)
        print(f'The best test F1: {test_metric["f1"]}')
        print(f'The best test auc: {test_metric["auc"]}')
        print(f'The best test accuracy: {test_metric["accuracy"]}')
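

# NOTE: `get_lr` is read at the top of every epoch above but is not defined in
# this file. A minimal sketch, assuming it simply reports the current learning
# rate from the optimizer's first parameter group:
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group."""
    return optimizer.param_groups[0]['lr']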
def main(args):
    parser = argparse.ArgumentParser(description='Variational AutoEncoders')
    parser.add_argument('data_dir', help='path to training data')
    parser.add_argument('--test_inds', type=int, nargs='+',
                        help='indices of test participants')
    parser.add_argument('--test_file', type=str,
                        help='path to a file containing test inds')
    parser.add_argument('--target-size', default=260, type=int)
    parser.add_argument('-j', '--workers', default=4, type=int,
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--pred', type=str, default=None,
                        help='Only prediction')

    model_parser = parser.add_argument_group('Model Parameters')
    model_parser.add_argument(
        '--model', default='vqvae', choices=['vae', 'vqvae', 'resnet'],
        help='autoencoder variant to use: vae | vqvae | resnet')
    model_parser.add_argument(
        '--batch-size', type=int, default=4, metavar='N',
        help='input batch size for training (default: 4)')
    model_parser.add_argument('--hidden', type=int, metavar='N',
                              help='number of hidden channels')
    model_parser.add_argument('-k', '--dict-size', type=int, dest='k',
                              metavar='K',
                              help='number of atoms in dictionary')
    model_parser.add_argument('-kl', '--kl', type=int, dest='kl', default=None,
                              help='length of vectors in embedded space')
    model_parser.add_argument('--lr', type=float, default=None,
                              help='learning rate')
    model_parser.add_argument('--vq_coef', type=float, default=None,
                              help='vq coefficient in loss')
    model_parser.add_argument('--commit_coef', type=float, default=None,
                              help='commitment coefficient in loss')
    model_parser.add_argument('--kl_coef', type=float, default=None,
                              help='kl-divergence coefficient in loss')
    model_parser.add_argument('--gabor_layer', action='store_true',
                              default=False, help='using Gabor-like layer')
    parser.add_argument('--resume', type=str, default=None,
                        help='The path to resume.')

    training_parser = parser.add_argument_group('Training Parameters')
    training_parser.add_argument(
        '--dataset', default='custom',
        choices=['mnist', 'cifar10', 'imagenet', 'coco', 'custom'],
        help='dataset to use: mnist | cifar10 | imagenet | coco | custom')
    training_parser.add_argument(
        '--dataset_dir_name', default='',
        help='name of the dir containing the dataset if dataset == custom')
    training_parser.add_argument('--data-dir', default='/media/ssd/Datasets',
                                 help='directory containing the dataset')
    training_parser.add_argument(
        '--epochs', type=int, default=20, metavar='N',
        help='number of epochs to train (default: 20)')
    training_parser.add_argument('--max-epoch-samples', type=int,
                                 default=50000,
                                 help='max num of samples per epoch')
    training_parser.add_argument('--no-cuda', action='store_true',
                                 default=False,
                                 help='disables CUDA training')
    training_parser.add_argument('--seed', type=int, default=1, metavar='S',
                                 help='random seed (default: 1)')
    training_parser.add_argument('--gpus', default='0',
                                 help='gpus used for training - e.g. 0,1,3')

    logging_parser = parser.add_argument_group('Logging Parameters')
    logging_parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    logging_parser.add_argument('--results-dir', metavar='RESULTS_DIR',
                                default='./results', help='results dir')
    logging_parser.add_argument('--save-name', default='',
                                help='saved folder')
    logging_parser.add_argument('--data-format', default='json',
                                help='in which format to save the data')
    model_parser.add_argument('--backbone', type=str, default=None, nargs='+',
                              help='details of backbone')
    args = parser.parse_args(args)

    args.cuda = not args.no_cuda and torch.cuda.is_available()

    lr = args.lr or default_hyperparams[args.dataset]['lr']
    k = args.k or default_hyperparams[args.dataset]['k']
    hidden = args.hidden or default_hyperparams[args.dataset]['hidden']
    num_channels = dataset_n_channels[args.dataset]

    save_path = ex_util.setup_logging_from_args(args)
    writer = SummaryWriter(save_path)

    # if test file is specified use it for selecting test train sets
    if args.test_file is not None:
        args.test_inds = args.test_file
    args.inv_func = None

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
        args.gpus = [int(i) for i in args.gpus.split(',')]
        torch.cuda.set_device(args.gpus[0])
        cudnn.benchmark = True
        torch.cuda.manual_seed(args.seed)

    if args.model == 'resnet':
        backbone = {
            'arch_name': args.backbone[0],
            'layer_name': args.backbone[1]
        }
        if len(args.backbone) > 2:
            backbone['weights_path'] = args.backbone[2]
        model = models[args.dataset][args.model](hidden, k=k, kl=args.kl,
                                                 num_channels=num_channels,
                                                 gabor_layer=args.gabor_layer,
                                                 backbone=backbone)
    else:
        model = models[args.dataset][args.model](hidden, k=k, kl=args.kl,
                                                 num_channels=num_channels,
                                                 gabor_layer=args.gabor_layer)
    if args.resume is not None:
        weights = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(weights)
    if args.cuda:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, int(args.epochs / 3), 0.5)

    # NOTE: right now there's no additional transformation function
    trans_funcs = []
    # normalisation
    args.mean = [0.5, 0.5, 0.5]
    args.std = [0.5, 0.5, 0.5]

    in_chns = 1
    if args.model == 'resnet':
        in_chns = 3
    val_dataset = get_val_dataset(args.data_dir + '/img/',
                                  args.data_dir + '/gt/',
                                  args.data_dir + '/all_imgs.txt',
                                  args.test_inds, trans_funcs, args.mean,
                                  args.std, args.target_size, chns=in_chns)
    # NOTE: shuffle is False
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=None)

    if args.pred is not None:
        weights = torch.load(args.pred, map_location='cpu')
        model.load_state_dict(weights)
        model.cuda()
        predict_net(model, val_loader, save_path, args)
        return

    train_dataset = get_train_dataset(args.data_dir + '/img/',
                                      args.data_dir + '/gt/',
                                      args.data_dir + '/all_imgs.txt',
                                      args.test_inds, trans_funcs, args.mean,
                                      args.std, args.target_size,
                                      chns=in_chns)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=None)

    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger('').addHandler(console)

    for epoch in range(1, args.epochs + 1):
        train_losses = train(epoch, model, train_loader, optimizer, args.cuda,
                             args.log_interval, save_path, args, writer)
        test_losses = test_net(epoch, model, val_loader, args.cuda, save_path,
                               args, writer)
        ex_util.save_checkpoint(model, epoch, save_path)

        for k in train_losses.keys():
            name = k.replace('_train', '')
            train_name = k
            test_name = k.replace('train', 'test')
            writer.add_scalars(name, {
                'train': train_losses[train_name],
                'test': test_losses[test_name],
            })
        scheduler.step()
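

# NOTE: no entry point is shown in this file. A minimal sketch follows,
# assuming the script is meant to be run directly with the CLI defined above;
# the script name and flag values in the example invocation are illustrative
# only.
if __name__ == '__main__':
    import sys
    # e.g. python train_vae.py /path/to/data --model vqvae --epochs 20
    main(sys.argv[1:])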
def main(opts):
    """Main function for the training pipeline

    :opts: command-line arguments
    :returns: None
    """
    ##########################################################################
    #                             Basic settings                             #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(os.path.join(model_dir, opts.run_name), exist_ok=True)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation  #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name), flush_secs=5)

    train_dataset = get_train_dataset(root=os.path.join('data', 'train'))
    weights = make_weights_for_balanced_classes(train_dataset.imgs,
                                                len(train_dataset.classes))
    weights = torch.DoubleTensor(weights)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weights, len(weights))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=6,
                                               drop_last=False,
                                               sampler=sampler)

    val_dataset = get_val_dataset(root=os.path.join('data', 'val'))
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.batch_size,
                                             shuffle=False,
                                             num_workers=6,
                                             drop_last=False)

    assert train_dataset.class_to_idx == val_dataset.class_to_idx, \
        "Mapping not correct"

    model = load_baseline(n_classes=2)
    if torch.cuda.is_available():
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=opts.lr, weight_decay=0.1)

    if opts.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               patience=3,
                                                               factor=.3,
                                                               threshold=1e-4,
                                                               verbose=True)
    elif opts.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=opts.gamma)

    best_val_loss = float('inf')
    best_val_accu = float(0)

    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################
    for epoch in range(opts.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, epoch,
                                               opts.epochs, optimizer, writer,
                                               current_lr, opts.log_every)
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, epoch,
                                                  opts.epochs, writer,
                                                  current_lr)

        ##############################
        #   Write to summary writer  #
        ##############################
        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_metric['accuracy'], epoch)
        writer.add_scalar('Precision/Train', train_metric['precisions'], epoch)
        writer.add_scalar('Recall/Train', train_metric['recalls'], epoch)
        writer.add_scalar('F1/Train', train_metric['f1'], epoch)

        writer.add_scalar('Loss/Val', val_loss, epoch)
        writer.add_scalar('Accuracy/Val', val_metric['accuracy'], epoch)
        writer.add_scalar('Precision/Val', val_metric['precisions'], epoch)
        writer.add_scalar('Recall/Val', val_metric['recalls'], epoch)
        writer.add_scalar('F1/Val', val_metric['f1'], epoch)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print_epoch_progress(train_loss, val_loss, delta, train_metric,
                             val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        # file_name = ('val_acc_{}_train_acc_{}_epoch_{}.pth'.
        #              format(train_acc, val_acc, epoch))
        # torch.save(model, os.path.join(model_dir, opts.run_name, file_name))

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if bool(opts.save_model):
                torch.save(model,
                           os.path.join(model_dir, opts.run_name, 'best.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if iteration_change_loss == opts.patience and opts.early_stopping:
            print(
                ('Early stopping after {0} iterations without the decrease ' +
                 'of the val loss').format(iteration_change_loss))
            break

    t_end_training = time.time()
    print('training took {}s'.format(t_end_training - t_start_training))
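

# NOTE: `make_weights_for_balanced_classes` is used above to build the
# WeightedRandomSampler but is not defined in this file. A minimal sketch,
# assuming the common recipe for torchvision ImageFolder-style datasets where
# each sample's weight is inversely proportional to the size of its class:
def make_weights_for_balanced_classes(images, nclasses):
    """Return one sampling weight per image: N_total / count(class of image)."""
    count = [0] * nclasses
    for _, label in images:
        count[label] += 1
    total = float(sum(count))
    weight_per_class = [total / count[i] for i in range(nclasses)]
    return [weight_per_class[label] for _, label in images]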