def demo_from_best_model(resnet_layer, pretrained, num_classes, path):
    # Rebuild the network, load the best checkpoint, and report validation accuracy.
    assert resnet_layer in (18, 50)
    net_best = ResNet(layer_num=resnet_layer, pretrained=pretrained,
                      num_classes=num_classes)
    net_best = net_best.to(device)
    # map_location ensures a GPU-saved checkpoint also loads on a CPU-only machine
    net_best.load_state_dict(torch.load(path, map_location=device))
    net_best.eval()
    best_acc = save_confusion_matrix(net_best, val_loader, 'backup_demo/cm_best.png')
    print('test_best_accuracy = %.2f' % best_acc)
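# A minimal sketch of the save_confusion_matrix helper called above; it is not
# defined in this snippet, so its signature and behavior here are assumptions.
# The idea: run the model over a loader, plot the confusion matrix, save it to
# out_path, and return overall accuracy in percent.
def save_confusion_matrix(net, loader, out_path):
    import matplotlib.pyplot as plt
    from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

    all_preds, all_targets = [], []
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            preds = net(inputs).argmax(dim=1).cpu()
            all_preds.extend(preds.tolist())
            all_targets.extend(targets.tolist())
    cm = confusion_matrix(all_targets, all_preds)
    ConfusionMatrixDisplay(cm).plot()
    plt.savefig(out_path)
    plt.close()
    correct = sum(p == t for p, t in zip(all_preds, all_targets))
    return 100.0 * correct / len(all_targets)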
log_columns = [
    'epoch', 'bce', 'lwlrap', 'bce_noisy', 'lwlrap_noisy',
    'val_bce', 'val_lwlrap', 'time'
]
# no spaces: CUDA expects a plain comma-separated device list
os.environ["CUDA_VISIBLE_DEVICES"] = '0,1,2,3'

for fold, (ids_train_split, ids_valid_split) in enumerate(folds):
    if fold + 1 not in FOLD_LIST:
        continue
    print("fold: {}".format(fold + 1))
    train_log = pd.DataFrame(columns=log_columns)

    # build model
    model = ResNet(NUM_CLASS)
    model.to(device)

    # prepare data loaders
    df_train_fold = df_train.iloc[ids_train_split].reset_index(drop=True)
    dataset_train = MelDataset(
        df_train_fold['path'],
        df_train_fold[labels].values,
        crop=CROP_LENGTH,
        crop_mode='random',
        mixup=True,
        freqmask=True,
        gain=True,
    )
    train_loader = DataLoader(
        dataset_train,
        batch_size=BATCH_SIZE,
    )
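# A minimal sketch of how the `folds` iterable consumed above could be built;
# it is not defined in this snippet. Plain KFold keeps the sketch
# self-contained, though a stratified splitter is common for multi-label audio
# data; the split count and seed are assumptions.
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = list(kf.split(df_train))  # each element is (train_indices, valid_indices)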
def main():
    global args, best_result, output_directory, train_csv, test_csv, device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    args = parser.parse_args()
    if args.modality == 'rgb' and args.num_samples != 0:
        print("number of samples is forced to be 0 when input modality is rgb")
        args.num_samples = 0
    if args.modality == 'rgb' and args.max_depth != 0.0:
        print("max depth is forced to be 0.0 when input modality is rgb/rgbd")
        args.max_depth = 0.0

    sparsifier = None
    max_depth = args.max_depth if args.max_depth >= 0.0 else np.inf
    if args.sparsifier == UniformSampling.name:
        sparsifier = UniformSampling(num_samples=args.num_samples, max_depth=max_depth)
    elif args.sparsifier == SimulatedStereo.name:
        sparsifier = SimulatedStereo(num_samples=args.num_samples, max_depth=max_depth)

    # create results folder, if it does not already exist
    output_directory = os.path.join(
        'results',
        '{}.sparsifier={}.modality={}.arch={}.decoder={}.criterion={}.lr={}.bs={}'
        .format(args.data, sparsifier, args.modality, args.arch, args.decoder,
                args.criterion, args.lr, args.batch_size))
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    train_csv = os.path.join(output_directory, 'train.csv')
    test_csv = os.path.join(output_directory, 'test.csv')
    best_txt = os.path.join(output_directory, 'best.txt')

    # define loss function (criterion); .to(device) instead of .cuda() so the
    # script also runs on a CPU-only machine
    if args.criterion == 'l2':
        criterion = criteria.MaskedMSELoss().to(device)
    elif args.criterion == 'l1':
        criterion = criteria.MaskedL1Loss().to(device)
    out_channels = 1

    # Data loading code
    print("=> creating data loaders ...")
    traindir = os.path.join('data', args.data, 'train')
    valdir = os.path.join('data', args.data, 'val')
    train_dataset = NYUDataset(traindir, type='train',
                               modality=args.modality, sparsifier=sparsifier)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True, sampler=None)
    # set batch size to 1 for validation
    val_dataset = NYUDataset(valdir, type='val',
                             modality=args.modality, sparsifier=sparsifier)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=1, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    print("=> data loaders created.")

    # evaluation mode
    if args.evaluate:
        best_model_filename = os.path.join(output_directory, 'model_best.pth.tar')
        if not os.path.isfile(best_model_filename):
            print("=> no best model found at '{}'".format(best_model_filename))
            return  # bail out: the original fell through to an undefined checkpoint
        print("=> loading best model '{}'".format(best_model_filename))
        checkpoint = torch.load(best_model_filename)
        args.start_epoch = checkpoint['epoch']
        best_result = checkpoint['best_result']
        model = checkpoint['model']
        print("=> loaded best model (epoch {})".format(checkpoint['epoch']))
        validate(val_loader, model, checkpoint['epoch'], write_to_file=False)
        return
    # optionally resume from a checkpoint
    elif args.resume:
        if not os.path.isfile(args.resume):
            print("=> no checkpoint found at '{}'".format(args.resume))
            return
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']
        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))
    # create a new model
    else:
        print("=> creating Model ({}-{}) ...".format(args.arch, args.decoder))
        in_channels = len(args.modality)
        if args.arch == 'resnet50':
            model = ResNet(layers=50, decoder=args.decoder, in_channels=in_channels,
                           out_channels=out_channels, pretrained=args.pretrained)
        elif args.arch == 'resnet18':
            model = ResNet(layers=18, decoder=args.decoder, in_channels=in_channels,
                           out_channels=out_channels, pretrained=args.pretrained)
        print("=> model created.")
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

        # create new csv files with only the header
        with open(train_csv, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
        with open(test_csv, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

    # model = torch.nn.DataParallel(model).cuda()  # for multi-GPU training
    model = model.to(device)
    print(model)
    print("=> model transferred to GPU.")

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        result, img_merge = validate(val_loader, model, epoch)

        # remember best rmse and save checkpoint
        is_best = result.rmse < best_result.rmse
        if is_best:
            best_result = result
            with open(best_txt, 'w') as txtfile:
                txtfile.write(
                    "epoch={}\nmse={:.3f}\nrmse={:.3f}\nabsrel={:.3f}\n"
                    "lg10={:.3f}\nmae={:.3f}\ndelta1={:.3f}\nt_gpu={:.4f}\n"
                    .format(epoch, result.mse, result.rmse, result.absrel,
                            result.lg10, result.mae, result.delta1,
                            result.gpu_time))
            if img_merge is not None:
                img_filename = os.path.join(output_directory, 'comparison_best.png')
                utils.save_image(img_merge, img_filename)

        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.arch,
                'model': model,
                'best_result': best_result,
                'optimizer': optimizer,
            }, is_best, epoch)
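# A minimal sketch of the adjust_learning_rate helper called in the epoch loop
# above; it is not defined in this snippet. The step-decay schedule (divide the
# base LR by 10 every 5 epochs) is an assumption modeled on common depth-
# estimation training recipes, not a confirmed detail of this codebase.
def adjust_learning_rate(optimizer, epoch):
    lr = args.lr * (0.1 ** (epoch // 5))  # assumed decay period of 5 epochs
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr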
def train(k, epochs):
    model = ResNet(k=k)
    opt = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()
    if use_gpu:
        model.to('cuda')
    if use_horovod:
        # broadcast parameters and optimizer state from the root device to the others
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(opt, root_rank=0)
        # wrap the optimizer for multi-GPU operation
        opt = hvd.DistributedOptimizer(opt,
                                       named_parameters=model.named_parameters(),
                                       op=hvd.Adasum)

    loss_dict = {'epoch': [], 'train': [], 'val': []}
    for epoch in range(epochs):
        train_loss = 0
        val_loss = 0

        # train block
        model.train()  # enable dropout / batch-norm updates
        for img_batch, labels_batch in train_loader:
            if use_gpu:
                img_batch = img_batch.to('cuda')
                labels_batch = labels_batch.to('cuda')
            pred = model(img_batch)
            opt.zero_grad()
            loss = criterion(pred, labels_batch)
            loss.backward()
            opt.step()
            train_loss += loss.item()

        # val block
        model.eval()  # freeze dropout / batch-norm statistics for validation
        with torch.no_grad():
            for img_batch, labels_batch in val_loader:
                if use_gpu:
                    img_batch = img_batch.to('cuda')
                    labels_batch = labels_batch.to('cuda')
                pred = model(img_batch)
                loss = criterion(pred, labels_batch)
                val_loss += loss.item()

        if use_horovod:
            train_loss = average_loss(train_loss, 'avg_train_loss')
            val_loss = average_loss(val_loss, 'avg_val_loss')

        loss_dict['epoch'].append(epoch + 1)
        loss_dict['train'].append(train_loss)
        loss_dict['val'].append(val_loss)
        print(",".join([
            "{}:{:.2f}".format(key, val[epoch])
            for key, val in loss_dict.items()
        ]))

    torch.save(model.state_dict(),
               "models/modelsdata/ResNet18_Cifar10_d{}.ckpt".format(k))
    save_obj(loss_dict, "models/modelsdata/losses/ResNet18_Cifar10_d{}".format(k))
    return loss_dict
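# A minimal sketch of the average_loss helper used when use_horovod is set; it
# is not defined in this snippet. It averages a scalar loss across all Horovod
# workers via allreduce (which averages by default); the tensor round-trip is
# an assumption about how the helper is implemented.
def average_loss(value, name):
    tensor = torch.tensor(value)
    avg = hvd.allreduce(tensor, name=name)
    return avg.item()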
def main():
    global args, best_result, output_directory, train_csv, test_csv, eval_csv, pnp
    pnp = args.pnp

    start_epoch = 0
    # evaluation mode
    if args.evaluate:
        args_new = args
        assert os.path.isfile(args.evaluate), \
            "=> no best model found at '{}'".format(args.evaluate)
        print("=> loading best model '{}'".format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        output_directory = os.path.dirname(args.evaluate)
        eval_csv = os.path.join(output_directory, 'eval.csv')
        with open(eval_csv, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=eval_fieldnames)
            writer.writeheader()
        args = checkpoint['args']
        args.pnp = args_new.pnp
        start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        model = checkpoint['model']
        print("=> loaded best model (epoch {})".format(checkpoint['epoch']))
        args.evaluate = True
        # sweep the number of sparse samples on a log scale: 10, ~31, 100, ..., 10000
        for num_samples in range(2, 9):
            args.num_samples = int(10 ** (num_samples / 2))
            _, val_loader = create_data_loaders(args)
            validate(val_loader, model, checkpoint['epoch'], write_to_file=True)
        return
    # optionally resume from a checkpoint
    elif args.resume:
        chkpt_path = args.resume
        args_new = args
        assert os.path.isfile(chkpt_path), \
            "=> no checkpoint found at '{}'".format(chkpt_path)
        print("=> loading checkpoint '{}'".format(chkpt_path))
        checkpoint = torch.load(chkpt_path)
        args = checkpoint['args']
        args.pnp = args_new.pnp
        start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']
        output_directory = os.path.dirname(os.path.abspath(chkpt_path))
        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))
        train_loader, val_loader = create_data_loaders(args)
        args.resume = True
    # create a new model
    else:
        train_loader, val_loader = create_data_loaders(args)
        print("=> creating Model ({}-{}) ...".format(args.arch, args.decoder))
        in_channels = len(args.modality)
        if args.arch == 'resnet50':
            model = ResNet(layers=50, decoder=args.decoder,
                           output_size=train_loader.dataset.output_size,
                           in_channels=in_channels, pretrained=args.pretrained)
        elif args.arch == 'resnet18':
            model = ResNet(layers=18, decoder=args.decoder,
                           output_size=train_loader.dataset.output_size,
                           in_channels=in_channels, pretrained=args.pretrained)
        elif args.arch == 'vgg16':
            model = VGGNet(layers=16, decoder=args.decoder,
                           output_size=train_loader.dataset.output_size,
                           in_channels=in_channels, pretrained=args.pretrained)
        elif args.arch == 'vgg19':
            model = VGGNet(layers=19, decoder=args.decoder,
                           output_size=train_loader.dataset.output_size,
                           in_channels=in_channels, pretrained=args.pretrained)
        print("=> model created.")
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    # model = torch.nn.DataParallel(model).cuda()  # for multi-gpu training
    model = model.to(device)

    # define loss function (criterion)
    if args.criterion == 'l2':
        criterion = criteria.MaskedMSELoss().to(device)
    elif args.criterion == 'l1':
        criterion = criteria.MaskedL1Loss().to(device)

    # create results folder, if it does not already exist
    output_directory = utils.get_output_directory(args)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    train_csv = os.path.join(output_directory, 'train.csv')
    test_csv = os.path.join(output_directory, 'test.csv')
    best_txt = os.path.join(output_directory, 'best.txt')

    # create new csv files with only the header
    if not args.resume:
        with open(train_csv, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
        with open(test_csv, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

    for epoch in range(start_epoch, args.epochs):
        utils.adjust_learning_rate(optimizer, epoch, args.lr)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        result, img_merge = validate(val_loader, model, epoch)

        # remember best rmse and save checkpoint
        is_best = result.rmse < best_result.rmse
        if is_best:
            best_result = result
            with open(best_txt, 'w') as txtfile:
                txtfile.write(
                    "epoch={}\nmse={:.3f}\nrmse={:.3f}\nabsrel={:.3f}\n"
                    "lg10={:.3f}\nmae={:.3f}\ndelta1={:.3f}\nt_gpu={:.4f}\n"
                    .format(epoch, result.mse, result.rmse, result.absrel,
                            result.lg10, result.mae, result.delta1,
                            result.gpu_time))
            if img_merge is not None:
                img_filename = os.path.join(output_directory, 'comparison_best.png')
                utils.save_image(img_merge, img_filename)

        utils.save_checkpoint(
            {
                'args': args,
                'epoch': epoch,
                'arch': args.arch,
                'model': model,
                'best_result': best_result,
                'optimizer': optimizer,
            }, is_best, epoch, output_directory)
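# A minimal sketch of the create_data_loaders helper used in the main() above;
# it is not defined in this snippet, so the dataset class and directory layout
# are assumptions modeled on the NYUDataset usage in the first main() above.
def create_data_loaders(args):
    traindir = os.path.join('data', args.data, 'train')
    valdir = os.path.join('data', args.data, 'val')
    train_dataset = NYUDataset(traindir, type='train', modality=args.modality)
    val_dataset = NYUDataset(valdir, type='val', modality=args.modality)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)
    # batch size 1 for validation, matching the first main() above
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=1, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    return train_loader, val_loader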