def train_epoch(model, loaders, criterion, optimizer, epoch, end_epoch,
                eval_freq=1, save_freq=10, output_dir='./', lr_init=0.01,
                column_names=None):
    """Run one training epoch and print one row of the log table.

    Adjusts the learning rate per the (non-SWA) schedule, trains for one
    epoch, evaluates on the test set at the configured frequency, and
    checkpoints every ``save_freq`` epochs.

    Args:
        model: network being trained.
        loaders: dict with ``"train"`` and ``"test"`` data loaders.
        criterion: loss function passed through to the training utilities.
        optimizer: optimizer whose learning rate is rescheduled each epoch.
        epoch: current 0-based epoch index.
        end_epoch: total number of epochs (drives the LR schedule).
        eval_freq: evaluate the test set every ``eval_freq`` epochs.
        save_freq: write a checkpoint every ``save_freq`` epochs.
        output_dir: directory checkpoints are written to.
        lr_init: initial learning rate fed to the schedule.
        column_names: optional header names for the printed table.  Defaults
            to the module-level ``columns`` that the original code read
            implicitly (kept for backward compatibility).
    """
    time_ep = time.time()

    lr = training_utils.schedule(epoch, lr_init, end_epoch, swa=False)
    training_utils.adjust_learning_rate(optimizer, lr)

    train_res = training_utils.train_epoch(loaders["train"], model, criterion, optimizer)

    # Evaluate on the first epoch, every eval_freq-th epoch, and the last one.
    if (epoch == 0 or epoch % eval_freq == eval_freq - 1 or epoch == end_epoch - 1):
        test_res = training_utils.eval(loaders["test"], model, criterion)
    else:
        test_res = {"loss": None, "accuracy": None}

    if (epoch + 1) % save_freq == 0:
        training_utils.save_checkpoint(
            output_dir,
            epoch + 1,
            state_dict=model.state_dict(),
            optimizer=optimizer.state_dict(),
        )

    time_ep = time.time() - time_ep
    values = [
        epoch + 1,
        lr,
        train_res["loss"],
        train_res["accuracy"],
        test_res["loss"],
        test_res["accuracy"],
        time_ep,
    ]

    if column_names is None:
        # Fall back to the module-level `columns` the original implicitly used.
        column_names = columns
    table = tabulate.tabulate([values], column_names, tablefmt="simple", floatfmt="8.4f")
    # Re-print the header line every 40 epochs so long logs stay readable.
    if epoch % 40 == 0:
        table = table.split("\n")
        table = "\n".join([table[1]] + table)
    else:
        table = table.split("\n")[2]
    print(table)
# Tail of a regression-model/trainer construction call whose head is outside
# this chunk: training hyperparameters, SWAG subspace configuration, and
# model kwargs.  NOTE(review): callee not visible here — confirm signature.
epochs = args.epochs, criterion = criterion, batch_size=args.batch_size,
    subspace_type=args.subspace,
    subspace_kwargs={'max_rank':args.max_num_models},
    momentum = args.momentum, wd=args.wd, lr_init=args.lr_init,
    swag_lr = args.swag_lr, swag_freq = 1, swag_start = args.swag_start,
    use_cuda = torch.cuda.is_available(), use_swag = args.swag,
    scale=args.scale, num_samples=args.num_samples,
    const_lr=args.no_schedule, double_bias_lr=False,
    model_variance=args.model_variance,
    **extra_args,
    input_dim=dataset.D, output_dim=output_dim, apply_var=args.noise_var,
    **model_cfg.kwargs
)

# Name used to key the benchmark results; SWAG runs append the subspace and
# inference method so different configurations don't collide.
mname = args.model
if args.swag:
    mname = mname + args.subspace + args.inference

bb_args = argparse.Namespace(model=mname, dataset=args.dataset,
                             split=args.split, seed=args.seed,
                             database_path=args.database_path)
# Run the benchmark; an empty database path marks this as a test run.
bb_result = run(bb_args, data=dataset, model=regression_model,
                is_test=args.database_path=='')
print(bb_result)

# Persist final model weights, optimizer state, and the benchmark result.
utils.save_checkpoint(
    args.dir,
    args.epochs,
    model_state_dict=regression_model.model.state_dict(),
    optimizer=regression_model.optimizer.state_dict(),
    result=bb_result
)
# printf=print
# Header for the per-epoch training log.  FIX: the original header listed
# only 4 names for the 5 values appended per row; tabulate aligns short
# header lists to the *last* columns, so every label was shifted by one.
columns = ['ep', 'acc', 'loss', 'prior', 'nll']
for epoch in range(args.epochs):
    train_res = utils.train_epoch(loaders['train'], proj_model, criterion, optimizer)
    values = [
        '%d/%d' % (epoch + 1, args.epochs),
        train_res['accuracy'],
        train_res['loss'],
        train_res['stats']['prior'],
        train_res['stats']['nll']
    ]
    # Print the full table (with header) on the first epoch, bare rows after.
    if epoch == 0:
        printf(
            tabulate.tabulate([values], columns, tablefmt='simple',
                              floatfmt='8.4f'))
    else:
        printf(
            tabulate.tabulate([values], columns, tablefmt='plain',
                              floatfmt='8.4f').split('\n')[1])

# Final test-set evaluation and checkpoint of the projected model.
print(utils.eval(loaders['test'], proj_model, criterion))
utils.save_checkpoint(args.dir, epoch, name='projected',
                      state_dict=proj_model.state_dict())
# Tail of the SWAG model construction call (head is outside this chunk).
*model_cfg.args, num_classes=num_classes, **model_cfg.kwargs)
swag_model.to(args.device)
swag_model.load_state_dict(checkpoint["state_dict"])

# Per-epoch log table header; GPU memory usage is the last column.
columns = [
    "ep", "lr", "tr_loss", "tr_acc", "te_loss", "te_acc", "time", "mem_usage"
]
if args.swa:
    # Insert SWA test metrics before the trailing time/memory columns.
    columns = columns[:-2] + ["swa_te_loss", "swa_te_acc"] + columns[-2:]
    swag_res = {"loss": None, "accuracy": None}

# Checkpoint the starting state before training begins.
utils.save_checkpoint(
    args.dir,
    start_epoch,
    state_dict=model.state_dict(),
    optimizer=optimizer.state_dict(),
)

# Running SGD-ensemble prediction accumulators.
sgd_ens_preds = None
sgd_targets = None
n_ensembled = 0.0

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()

    # Either follow the LR schedule or keep the constant initial LR.
    if not args.no_schedule:
        lr = schedule(epoch)
        utils.adjust_learning_rate(optimizer, lr)
    else:
        lr = args.lr_init
]
# Full table with header on the first epoch; bare data rows afterwards.
if epoch == 0:
    printf(
        tabulate.tabulate([values], columns, tablefmt='simple',
                          floatfmt='8.4f'))
else:
    printf(
        tabulate.tabulate([values], columns, tablefmt='plain',
                          floatfmt='8.4f').split('\n')[1])

utils.save_checkpoint(args.dir, epoch, name='vi_rnvp',
                      state_dict=vi_model.state_dict())

if args.eval_ensemble:
    # Ensemble evaluation: average predictions over posterior samples.
    num_samples = 30
    predictions = np.zeros((len(loaders['test'].dataset), num_classes))
    targets = np.zeros(len(loaders['test'].dataset))

    printf, logfile = utils.get_logging_print(
        os.path.join(args.dir, args.log_fname + '-%s.txt'))
    print('Saving logs to: %s' % logfile)
    columns = ['iter ens', 'acc', 'nll']

    # Body of this loop continues beyond this chunk.
    for i in range(num_samples):
# Restore SWA running-average weights and sample count from the checkpoint
# when present (they may be absent in a non-SWA checkpoint).
swa_state_dict = checkpoint["swa_state_dict"]
if swa_state_dict is not None:
    swa_model.load_state_dict(swa_state_dict)
swa_n_ckpt = checkpoint["swa_n"]
if swa_n_ckpt is not None:
    swa_n = swa_n_ckpt

# Per-epoch log table header.
columns = ["ep", "lr", "tr_loss", "tr_acc", "te_loss", "te_acc", "time"]
if args.swa:
    # Insert SWA test metrics before the trailing time column.
    columns = columns[:-1] + ["swa_te_loss", "swa_te_acc"] + columns[-1:]
    swa_res = {"loss": None, "accuracy": None}

# Checkpoint the starting state (incl. SWA state when enabled) up front.
utils.save_checkpoint(
    args.dir,
    start_epoch,
    state_dict=model.state_dict(),
    swa_state_dict=swa_model.state_dict() if args.swa else None,
    swa_n=swa_n if args.swa else None,
    optimizer=optimizer.state_dict(),
)

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()

    lr = schedule(epoch)
    utils.adjust_learning_rate(optimizer, lr)

    train_res = utils.train_epoch(loaders["train"], model, criterion, optimizer)
    # Evaluate on the first epoch, every eval_freq-th epoch, and the last.
    if (epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1
            or epoch == args.epochs - 1):
        test_res = utils.eval(loaders["test"], model, criterion)
    # else-branch continues beyond this chunk.
    else:
# Tail of a call begun above this chunk.
targets=sgld_targets)

# Collect a SWAG iterate every swag_c_epochs epochs once past ens_start.
if args.swag and (epoch + 1) >= args.ens_start and (
        epoch + 1 - args.ens_start) % args.swag_c_epochs == 0:
    swag_model.collect_model(model)
    # Evaluate the SWA mean periodically; BN statistics must be refreshed
    # on the training set after swapping in the averaged weights.
    if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
        swag_model.set_swa()
        utils.bn_update(loaders['train'], swag_model)
        swag_res = utils.eval(loaders['test'], swag_model, criterion)
    else:
        swag_res = {'loss': None, 'accuracy': None}

if (epoch + 1) % args.save_freq == 0:
    utils.save_checkpoint(
        args.dir,
        epoch + 1,
        state_dict=model.state_dict(),
    )
    # Also checkpoint the SWAG statistics once collection has started.
    if args.swag and epoch + 1 >= args.ens_start:
        utils.save_checkpoint(
            args.dir,
            epoch + 1,
            name='swag',
            state_dict=swag_model.state_dict(),
        )

time_ep = time.time() - time_ep
# Allocated GPU memory in GiB for the log table.
memory_usage = torch.cuda.memory_allocated() / (1024.0**3)

# Row of per-epoch metrics; list continues beyond this chunk.
values = [
    epoch + 1,
    lr,
    train_res['loss'],
    train_res['accuracy'],
# Collect the current weights into the SWA/SWAG running average.
print('Saving SWA model at epoch: ', epoch)
swag_model.collect_model(model)
# FIX: the original used `epoch % args.eval_freq is 0` — identity comparison
# with an int literal is implementation-dependent (and a SyntaxWarning since
# Python 3.8); value equality is what is meant.
if epoch % args.eval_freq == 0:
    # Evaluate the SWA mean; refresh BN statistics after swapping weights.
    swag_model.sample(0.0)
    bn_update(train_loader, swag_model)
    val_loss, val_err, val_iou = train_utils.test(
        swag_model, loaders['val'], criterion)
    print('SWA Val - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}'.format(
        val_loss, 1 - val_err, val_iou))

### Checkpoint ###
# Same `is 0` -> `== 0` fix as above.
if epoch % args.save_freq == 0:
    print('Saving model at Epoch: ', epoch)
    save_checkpoint(dir=args.dir,
                    epoch=epoch,
                    state_dict=model.state_dict(),
                    optimizer=optimizer.state_dict())
if args.swa and (epoch + 1) > args.swa_start:
    save_checkpoint(
        dir=args.dir,
        epoch=epoch,
        name='swag',
        state_dict=swag_model.state_dict(),
    )

if args.optimizer == 'RMSProp':
    ### Adjust Lr ###
    # Step the scheduler normally until fine-tuning starts, then reset.
    if epoch < args.ft_start:
        scheduler.step(epoch=epoch)
    else:
        scheduler.step(epoch=-1)  # reset to args.lr_init
# Resume model and optimizer state from a checkpoint loaded above.
start_epoch = checkpoint['epoch']
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])

# Optionally resume the SWAG model from its own checkpoint file.
if args.swa and args.swa_resume is not None:
    checkpoint = torch.load(args.swa_resume)
    swag_model.load_state_dict(checkpoint['state_dict'])

# Per-epoch log table header; GPU memory usage is the last column.
columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time', 'mem_usage']
if args.swa:
    # Insert SWA test metrics before the trailing time/memory columns.
    columns = columns[:-2] + ['swa_te_loss', 'swa_te_acc'] + columns[-2:]
    swag_res = {'loss': None, 'accuracy': None}

# Checkpoint the starting state before training begins.
utils.save_checkpoint(
    args.dir,
    start_epoch,
    state_dict=model.state_dict(),
    optimizer=optimizer.state_dict()
)

num_iterates = 0

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()

    # Either follow the LR schedule or keep the constant initial LR.
    if not args.no_schedule:
        lr = schedule(epoch)
        utils.adjust_learning_rate(optimizer, lr)
    else:
        lr = args.lr_init
    print('EPOCH %d. TRAIN' % (epoch + 1))
# Route log output to both stdout and a per-run log file.
printf, logfile = utils.get_logging_print(os.path.join(args.dir, args.log_fname + '-%s.txt'))
print('Saving logs to: %s' % logfile)

# NUTS sampling over the pyro model's parameters.
# NOTE(review): step_size=10. and warmup_steps=10 look like debug settings —
# confirm before relying on these samples.
nuts_kernel = NUTS(pyro_model.model, step_size=10.)
num_samples = 30
# x_, y_ = loaders["train"].dataset.tensors
mcmc_run = MCMC(nuts_kernel, num_samples=num_samples, warmup_steps=10).run(inpts, trgts)
#mcmc_run = MCMC(nuts_kernel, num_samples=num_samples, warmup_steps=100).run(islice(loaders["train"], 1000))

# Concatenate the marginal samples for site "t" into a single tensor.
samples = torch.cat(list(mcmc_run.marginal(sites="t").support(flatten=True).values()), dim=-1)
print(samples)

utils.save_checkpoint(
    args.dir,
    0,
    name='nuts',
    state_dict=pyro_model.state_dict()
)

# Ensemble evaluation accumulators over the test set.
predictions = np.zeros((len(loaders['test'].dataset), num_classes))
targets = np.zeros(len(loaders['test'].dataset))

printf, logfile = utils.get_logging_print(os.path.join(args.dir, args.log_fname + '-%s.txt'))
print('Saving logs to: %s' % logfile)
columns = ['iter ens', 'acc', 'nll']
# Body of this loop continues beyond this chunk.
for i in range(num_samples):
    # utils.bn_update(loaders['train'], model, subset=args.bn_subset)
    pyro_model.eval()
    k = 0