def DUN_batch_time(net, savefile, dset, data_dir, batch_size=256, cuda=True,
                   gpu=None, MC_samples=0, workers=4, big_data=False):
    _, _, val_loader, _, _, _ = \
        get_image_loader(dset, batch_size, cuda=cuda, workers=workers,
                         distributed=False, data_dir=data_dir)

    net.load(savefile)

    times = []
    for i, (images, _) in enumerate(val_loader):
        if big_data and i > 5:
            break

        data_time = time.time()
        _ = net.fast_predict(images).data
        batch_time = time.time() - data_time

        if not (big_data and i == 0):
            times.append(batch_time)

    return np.mean(times)
def ensemble_batch_time(model, savefile_list, dset, data_dir, batch_size=256, cuda=True,
                        gpu=None, MC_samples=0, workers=4, big_data=False):
    _, _, val_loader, _, _, _ = \
        get_image_loader(dset, batch_size, cuda=cuda, workers=workers,
                         distributed=False, data_dir=data_dir)

    times = []
    for i, (images, target) in enumerate(val_loader):
        if big_data and i > 5:
            break

        data_time = time.time()
        if cuda:
            images = images.cuda(None, non_blocking=True)
            target = target.cuda(None, non_blocking=True)

        _ = ensemble_time_preds(model, savefile_list, images, gpu)
        batch_time = time.time() - data_time

        if not (big_data and i == 0):
            times.append(batch_time)

    return np.mean(times)
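# Example usage (a hedged sketch, not part of the original evaluation scripts): comparing the
# per-batch inference time of a single DUN checkpoint against that of a deep ensemble.
# `dun_net`, `ensemble_model`, the checkpoint paths, dataset name and data directory are
# placeholders that the caller must supply.
def compare_batch_times(dun_net, dun_savefile, ensemble_model, ensemble_savefiles,
                        dset, data_dir):
    dun_time = DUN_batch_time(dun_net, dun_savefile, dset, data_dir, batch_size=256)
    ens_time = ensemble_batch_time(ensemble_model, ensemble_savefiles, dset, data_dir,
                                   batch_size=256)
    print("mean batch time -- DUN: %.4fs, ensemble: %.4fs" % (dun_time, ens_time))
    return dun_time, ens_time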
def DUN_test_stats(net, savefile, dset, data_dir, corruption=None, rotation=None,
                   batch_size=256, cuda=True, gpu=None, MC_samples=0, workers=4,
                   d_posterior=None):
    assert not (corruption is not None and rotation is not None)

    if corruption is None and rotation is None:
        _, _, val_loader, _, _, _ = \
            get_image_loader(dset, batch_size, cuda=cuda, workers=workers,
                             distributed=False, data_dir=data_dir)
    elif corruption is not None:
        val_loader = load_corrupted_dataset(dset, severity=corruption, data_dir=data_dir,
                                            batch_size=batch_size, cuda=cuda, workers=workers)
    elif rotation is not None:
        val_loader = rotate_load_dataset(dset, rotation, data_dir=data_dir,
                                         batch_size=batch_size, cuda=cuda, workers=workers)

    net.load(savefile)
    if d_posterior is not None:
        net.prob_model.current_posterior = d_posterior

    prob_vec, target_vec = get_preds_targets_DUN(net, val_loader)

    brier = class_brier(y=target_vec, probs=prob_vec, log_probs=None)
    err = class_err(y=target_vec, model_out=prob_vec)
    ll = class_ll(y=target_vec, probs=prob_vec, log_probs=None, eps=1e-40)
    ece = class_ECE(y=target_vec, probs=prob_vec, log_probs=None, nbins=10)

    return err, ll, brier, ece
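# Example usage (a hedged sketch): sweeping DUN_test_stats over corruption severities for a
# single checkpoint. Severity 0 here denotes the clean validation set; 1-5 are the standard
# corrupted-image severities. `net`, the checkpoint path, dataset name and data directory are
# placeholders supplied by the caller.
def DUN_corruption_sweep(net, savefile, dset, data_dir, severities=(1, 2, 3, 4, 5)):
    results = {0: DUN_test_stats(net, savefile, dset, data_dir)}
    for severity in severities:
        results[severity] = DUN_test_stats(net, savefile, dset, data_dir, corruption=severity)
    return results  # maps severity -> (err, ll, brier, ece)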
def train_loop(net, dname, data_dir, epochs=90, workers=4, resume='', savedir='./',
               save_all_epochs=False, q_nograd_its=0, batch_size=256):
    mkdir(savedir)
    global best_err1

    # Load data here:
    _, train_loader, val_loader, _, _, Ntrain = \
        get_image_loader(dname, batch_size, cuda=True, workers=workers,
                         distributed=False, data_dir=data_dir)

    net.N_train = Ntrain

    start_epoch = 0

    marginal_loglike = np.zeros(epochs)
    train_loss = np.zeros(epochs)
    dev_loss = np.zeros(epochs)

    err_train = np.zeros(epochs)
    err_dev = np.zeros(epochs)

    # optionally resume from a checkpoint
    if resume:
        if os.path.isfile(resume):
            print("=> loading checkpoint '{}'".format(resume))
            start_epoch, best_err1 = net.load(resume)
            print("=> loaded checkpoint '{}' (epoch {})".format(resume, start_epoch))
        else:
            print("=> no checkpoint found at '{}'".format(resume))

        candidate_progress_file = resume.split('/')
        candidate_progress_file = '/'.join(candidate_progress_file[:-1]) + '/stats_array.pkl'

        if os.path.isfile(candidate_progress_file):
            print("=> found progress file at '{}'".format(candidate_progress_file))
            try:
                marginal_loglike, err_train, train_loss, err_dev, dev_loss = \
                    load_object(candidate_progress_file)
                print("=> Loaded progress file at '{}'".format(candidate_progress_file))
            except Exception:
                print("=> Unable to load progress file at '{}'".format(candidate_progress_file))
        else:
            print("=> NOT found progress file at '{}'".format(candidate_progress_file))

    if q_nograd_its > 0:
        net.prob_model.q_logits.requires_grad = False

    for epoch in range(start_epoch, epochs):
        if q_nograd_its > 0 and epoch == q_nograd_its:
            net.prob_model.q_logits.requires_grad = True

        tic = time.time()
        nb_samples = 0
        for x, y in train_loader:
            marg_loglike_estimate, minus_loglike, err = net.fit(x, y)

            marginal_loglike[epoch] += marg_loglike_estimate * x.shape[0]
            err_train[epoch] += err * x.shape[0]
            train_loss[epoch] += minus_loglike * x.shape[0]
            nb_samples += len(x)

        marginal_loglike[epoch] /= nb_samples
        train_loss[epoch] /= nb_samples
        err_train[epoch] /= nb_samples
        toc = time.time()

        # ---- print
        print('\n depth approx posterior', net.prob_model.current_posterior.data.cpu().numpy())
        print("it %d/%d, ELBO/evidence %.4f, pred minus loglike = %f, err = %f" %
              (epoch, epochs, marginal_loglike[epoch], train_loss[epoch], err_train[epoch]),
              end="")
        cprint('r', ' time: %f seconds\n' % (toc - tic))
        net.update_lr()

        # ---- dev
        tic = time.time()
        nb_samples = 0
        for x, y in val_loader:
            minus_loglike, err = net.eval(x, y)

            dev_loss[epoch] += minus_loglike * x.shape[0]
            err_dev[epoch] += err * x.shape[0]
            nb_samples += len(x)

        dev_loss[epoch] /= nb_samples
        err_dev[epoch] /= nb_samples
        toc = time.time()

        cprint('g', ' pred minus loglike = %f, err = %f\n' %
               (dev_loss[epoch], err_dev[epoch]), end="")
        cprint('g', ' time: %f seconds\n' % (toc - tic))

        filename = 'checkpoint.pth.tar'
        if save_all_epochs:
            filename = str(epoch) + '_' + filename
        net.save(os.path.join(savedir, filename), best_err1)
        if err_dev[epoch] < best_err1:
            best_err1 = err_dev[epoch]
            cprint('b', 'best top1 dev err: %f' % err_dev[epoch])
            shutil.copyfile(os.path.join(savedir, filename),
                            os.path.join(savedir, 'model_best.pth.tar'))

        all_results = [marginal_loglike, err_train, train_loss, err_dev, dev_loss]
        save_object(all_results, os.path.join(savedir, 'stats_array.pkl'))
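# Example usage (a hedged sketch): a typical DUN training run in which the variational depth
# posterior logits are frozen for the first few epochs before being trained jointly with the
# network weights. `dun_net`, the dataset name and the directories are illustrative
# placeholders; the DUN wrapper object itself is constructed elsewhere.
def run_dun_training(dun_net, dname, data_dir, savedir, epochs=90, warmup_epochs=5):
    # q_nograd_its freezes net.prob_model.q_logits for the first `warmup_epochs` epochs
    train_loop(dun_net, dname, data_dir, epochs=epochs, workers=4, savedir=savedir,
               save_all_epochs=False, q_nograd_its=warmup_epochs, batch_size=256)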
row_to_add_proto["number"] = num_repeat

model_path = ("checkpoint.pth.tar" if best_or_last == "last" else "model_best.pth.tar")

if method != "ensemble":
    savefile = folder / model_path
else:
    model_indices = range(num_repeat * n_samples, (num_repeat + 1) * n_samples)
    savefile = [folder.parent /
                "_".join(folder_split[:-1] + [str(filt_ensembles_df["number"][int(model_idx)])]) /
                model_path
                for model_idx in model_indices]

row_to_add_proto["n_samples"] = n_samples

if method == "DUN" and savefile not in no_train_posteriors.keys() and use_no_train_post:
    _, train_loader, _, _, _, Ntrain = \
        get_image_loader(dataset, batch_size=args.batch_size, cuda=True, workers=workers,
                         data_dir=data_dir, distributed=False)
    model_obj.load(savefile)
    model_obj.N_train = Ntrain
    notrain_post, _ = model_obj.get_exact_d_posterior(train_loader, train_bn=False,
                                                      logposterior=False)
    no_train_posteriors[savefile] = notrain_post.data

kwargs = {}
if method == "DUN" and use_no_train_post:
    kwargs = {"d_posterior": no_train_posteriors[savefile]}

# all measurements of err, ll, ece, brier
for rotation, corruption in [(0, 0)] + corruptions[dataset] + rotations[dataset]:
    row_to_add = row_to_add_proto.copy()
    row_to_add.update({"rotation": rotation, "corruption": corruption})
def ensemble_test_stats(model, savefile_list, dset, data_dir, corruption=None, rotation=None,
                        batch_size=256, cuda=True, gpu=None, MC_samples=0, workers=4,
                        iterate=False):
    assert not (corruption is not None and rotation is not None)

    if corruption is None and rotation is None:
        _, _, val_loader, _, _, _ = \
            get_image_loader(dset, batch_size, cuda=cuda, workers=workers,
                             distributed=False, data_dir=data_dir)
    elif corruption is not None:
        val_loader = load_corrupted_dataset(dset, severity=corruption, data_dir=data_dir,
                                            batch_size=batch_size, cuda=cuda, workers=workers)
    elif rotation is not None:
        val_loader = rotate_load_dataset(dset, rotation, data_dir=data_dir,
                                         batch_size=batch_size, cuda=cuda, workers=workers)

    logprob_vec, target_vec = ensemble_get_preds_targets(model, savefile_list, val_loader,
                                                         cuda=cuda, gpu=gpu,
                                                         return_vector=iterate)

    if iterate:
        brier_vec = []
        err_vec = []
        ll_vec = []
        ece_vec = []
        for n_samples in range(1, logprob_vec.shape[1] + 1):
            comb_logprobs = torch.logsumexp(logprob_vec[:, :n_samples, :], dim=1,
                                            keepdim=False) - np.log(n_samples)

            brier_vec.append(class_brier(y=target_vec, log_probs=comb_logprobs, probs=None))
            err_vec.append(class_err(y=target_vec, model_out=comb_logprobs))
            ll_vec.append(class_ll(y=target_vec, log_probs=comb_logprobs, probs=None,
                                   eps=1e-40))
            ece_vec.append(class_ECE(y=target_vec, log_probs=comb_logprobs, probs=None,
                                     nbins=10))

        return err_vec, ll_vec, brier_vec, ece_vec

    brier = class_brier(y=target_vec, log_probs=logprob_vec, probs=None)
    err = class_err(y=target_vec, model_out=logprob_vec)
    ll = class_ll(y=target_vec, log_probs=logprob_vec, probs=None, eps=1e-40)
    ece = class_ECE(y=target_vec, log_probs=logprob_vec, probs=None, nbins=10)

    return err, ll, brier, ece
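# Example usage (a hedged sketch): tracing test metrics as a function of ensemble size by
# passing iterate=True, which averages the predictive distributions of the first k members
# for k = 1, ..., M. `ensemble_model`, the checkpoint list, dataset name and data directory
# are placeholders supplied by the caller.
def ensemble_size_curve(ensemble_model, savefile_list, dset, data_dir):
    err_vec, ll_vec, brier_vec, ece_vec = ensemble_test_stats(
        ensemble_model, savefile_list, dset, data_dir, iterate=True)
    # element k - 1 of each list corresponds to an ensemble of the first k checkpoints
    return {"err": err_vec, "ll": ll_vec, "brier": brier_vec, "ece": ece_vec}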
def train_loop(model, dname, data_dir, epochs=90, workers=4, gpu=None, resume='',
               weight_decay=1e-4, savedir='./', milestones=None, MC_samples=1, batch_size=256):
    mkdir(savedir)
    global best_acc1

    if gpu is not None:
        print("Use GPU: {} for training".format(gpu))

    if gpu is not None:  # Check for single GPU
        torch.cuda.set_device(gpu)
        model = model.cuda(gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss(reduction='mean').cuda(gpu)

    # lr and momentum are not parameters of this function; they are expected to be defined at
    # module scope (e.g. set by the calling script) before train_loop is invoked.
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum,
                                weight_decay=weight_decay)

    if milestones is None:
        # if milestones are not specified, set to impossible value so LR is never decayed.
        milestones = [epochs + 1]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones,
                                                     gamma=0.1)

    tr_acc1_vec = []
    tr_acc5_vec = []
    tr_loss_vec = []

    acc1_vec = []
    acc5_vec = []
    loss_vec = []

    start_epoch = 0
    # optionally resume from a checkpoint
    if resume:
        if os.path.isfile(resume):
            print("=> loading checkpoint '{}'".format(resume))
            if gpu is None:
                checkpoint = torch.load(resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(gpu)
                checkpoint = torch.load(resume, map_location=loc)
            start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            print("=> loaded checkpoint '{}' (epoch {})".format(resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(resume))

        candidate_progress_file = resume.split('/')
        candidate_progress_file = '/'.join(candidate_progress_file[:-1]) + '/stats_array.pkl'

        if os.path.isfile(candidate_progress_file):
            print("=> found progress file at '{}'".format(candidate_progress_file))
            try:
                tr_acc1_vec, tr_acc5_vec, tr_loss_vec, acc1_vec, acc5_vec, loss_vec = \
                    load_object(candidate_progress_file)
                print("=> Loaded progress file at '{}'".format(candidate_progress_file))
            except Exception:
                print("=> Unable to load progress file at '{}'".format(candidate_progress_file))
        else:
            print("=> NOT found progress file at '{}'".format(candidate_progress_file))

    cudnn.benchmark = True

    _, train_loader, val_loader, _, _, _ = \
        get_image_loader(dname, batch_size, cuda=True, workers=workers,
                         distributed=False, data_dir=data_dir)

    for epoch in range(start_epoch, epochs):
        # train for one epoch and update lr scheduler setting
        tr_acc1, tr_acc5, tr_loss = train(train_loader, model, criterion, optimizer, epoch, gpu)
        print('used lr: %f' % optimizer.param_groups[0]["lr"])
        scheduler.step()

        tr_acc1_vec.append(tr_acc1)
        tr_acc5_vec.append(tr_acc5)
        tr_loss_vec.append(tr_loss)

        # evaluate on validation set
        acc1, acc5, loss = validate(val_loader, model, criterion, gpu, MC_samples=MC_samples)
        acc1_vec.append(acc1)
        acc5_vec.append(acc5)
        loss_vec.append(loss)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }, is_best, savedir=savedir)

        all_results = [tr_acc1_vec, tr_acc5_vec, tr_loss_vec, acc1_vec, acc5_vec, loss_vec]
        save_object(all_results, os.path.join(savedir, 'stats_array.pkl'))
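# Example usage (a hedged sketch): a standard SGD baseline run with step decay at two
# illustrative milestones. `resnet`, the dataset name and the directories are placeholders;
# as noted above, lr and momentum are read from module scope, so the calling script must set
# them before invoking train_loop.
def run_sgd_training(resnet, dname, data_dir, savedir, epochs=90):
    train_loop(resnet, dname, data_dir, epochs=epochs, workers=4, gpu=None, resume='',
               weight_decay=1e-4, savedir=savedir,
               milestones=[epochs // 2, int(epochs * 0.75)],
               MC_samples=1, batch_size=256)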