        x[mask] = k

    _replace_nan_with_k_inplace(x_is, -1)
    with torch.no_grad():
        issf, _, _, acts_fake = inception_score(x_is, cuda=True, batch_size=32,
                                                resize=True, splits=10, return_preds=True)
    # Filter out the samples with extremely large activations.
    idxs_ = np.argsort(np.abs(acts_fake).sum(-1))[:1800]
    acts_fake = acts_fake[idxs_]

    m1, s1 = calculate_activation_statistics(acts_real)
    m2, s2 = calculate_activation_statistics(acts_fake)
    try:
        fid_value = calculate_frechet_distance(m1, s1, m2, s2)
    except ValueError:
        # This mostly happens when there are "a few really bad samples", which
        # result in unusually large activations (e.g. 1e30). These activation
        # outliers corrupt the statistics and raise a ValueError.
        fid_value = 2000
    print(idx, issf, fid_value)

    stats_dict = {
        'global_iteration': idx,
        'fid': fid_value,
    }
    iteration_logger.writerow(stats_dict)
    try:
        plot_csv(iteration_logger.filename)
    except Exception:
        pass
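# The fragment above ends with the tail of `_replace_nan_with_k_inplace` and then
# calls it on the generated batch before computing the Inception Score / FID.
# A minimal sketch of what such a helper presumably looks like (the full body is
# not shown in this snippet, so treat this as an assumption):
def _replace_nan_with_k_inplace(x, k):
    # Assumed behavior: overwrite NaN entries of `x` with the constant `k`, in place.
    mask = torch.isnan(x)
    x[mask] = k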
def main(args):
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    set_random_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    if args.dataset == 'mnist':
        train_data = get_dataset('mnist-train', args.dataroot)
        test_data = get_dataset('mnist-test', args.dataroot)
        train_tr = test_tr = get_transform('mnist_normalize')
    if args.dataset == 'cifar10':
        train_tr_name = 'cifar_augment_normalize' if args.data_augmentation else 'cifar_normalize'
        train_data = get_dataset('cifar10-train', args.dataroot)
        test_data = get_dataset('cifar10-test', args.dataroot)
        train_tr = get_transform(train_tr_name)
        test_tr = get_transform('cifar_normalize')
    if args.dataset == 'cifar-fs-train':
        train_tr_name = 'cifar_augment_normalize' if args.data_augmentation else 'cifar_normalize'
        train_data = get_dataset('cifar-fs-train-train', args.dataroot)
        test_data = get_dataset('cifar-fs-train-test', args.dataroot)
        train_tr = get_transform(train_tr_name)
        test_tr = get_transform('cifar_normalize')
    if args.dataset == 'miniimagenet':
        train_data = get_dataset('miniimagenet-train-train', args.dataroot)
        test_data = get_dataset('miniimagenet-train-test', args.dataroot)
        train_tr = get_transform('cifar_augment_normalize_84' if args.data_augmentation else 'cifar_normalize')
        test_tr = get_transform('cifar_normalize')

    model = ResNetClassifier(train_data['n_classes'], train_data['im_size']).to(device)

    if args.ckpt_path != '':
        loaded = torch.load(args.ckpt_path)
        model.load_state_dict(loaded)
        ipdb.set_trace()

    if args.eval:
        # Use the dataset/transform pair, matching the `test` calls in the training loop below.
        acc = test(args, model, device, test_data, test_tr, args.n_eval_batches)
        print("Eval Acc: ", acc)
        sys.exit()

    # Trace logging
    mkdir(args.output_dir)
    eval_fieldnames = ['global_iteration', 'val_acc', 'train_acc']
    eval_logger = CSVLogger(every=1, fieldnames=eval_fieldnames, resume=args.resume,
                            filename=os.path.join(args.output_dir, 'eval_log.csv'))
    wandb.run.name = os.path.basename(args.output_dir)
    wandb.run.save()
    wandb.watch(model)

    if args.optim == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    elif args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9,
                              nesterov=True, weight_decay=5e-4)

    if args.dataset == 'mnist':
        scheduler = StepLR(optimizer, step_size=1, gamma=.7)
    else:
        scheduler = MultiStepLR(optimizer, milestones=[60, 120, 160], gamma=0.2)

    start_epoch = 1
    if args.resume:
        last_ckpt_path = os.path.join(args.output_dir, 'last_ckpt.pt')
        if os.path.exists(last_ckpt_path):
            loaded = torch.load(last_ckpt_path)
            model.load_state_dict(loaded['model_sd'])
            optimizer.load_state_dict(loaded['optimizer_sd'])
            scheduler.load_state_dict(loaded['scheduler_sd'])
            start_epoch = loaded['epoch']

    # It's important to set the seed again before training, because the
    # dataloading code might have reset the seed.
    set_random_seed(args.seed)

    best_val = 0
    if args.db:
        scheduler = MultiStepLR(optimizer, milestones=[1, 2, 3, 4], gamma=0.1)
        args.epochs = 5

    for epoch in range(start_epoch, args.epochs + 1):
        if epoch % args.ckpt_every == 0:
            torch.save(model.state_dict(), os.path.join(args.output_dir, f"ckpt_{epoch}.pt"))

        stats_dict = {'global_iteration': epoch}
        val = stats_dict['val_acc'] = test(args, model, device, test_data, test_tr, args.n_eval_batches)
        stats_dict['train_acc'] = test(args, model, device, train_data, test_tr, args.n_eval_batches)

        grid = make_grid(torch.stack([train_tr(x) for x in train_data['x'][:30]]),
                         nrow=6).permute(1, 2, 0).numpy()
        img_dict = {"examples": [wandb.Image(grid, caption="Data batch")]}
        wandb.log(stats_dict)
        wandb.log(img_dict)
        eval_logger.writerow(stats_dict)
        plot_csv(eval_logger.filename, os.path.join(args.output_dir, 'iteration_plots.png'))

        train(args, model, device, train_data, train_tr, optimizer, epoch)
        scheduler.step(epoch)

        if val > best_val:
            best_val = val
            torch.save(model.state_dict(), os.path.join(args.output_dir, "ckpt_best.pt"))

        # For `resume`
        model.cpu()
        torch.save({
            'model_sd': model.state_dict(),
            'optimizer_sd': optimizer.state_dict(),
            'scheduler_sd': scheduler.state_dict(),
            'epoch': epoch + 1,
        }, os.path.join(args.output_dir, "last_ckpt.pt"))
        model.to(device)
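# `set_random_seed` is called once before building the model and again right before
# training, since data-loading code can advance the global RNG state. A minimal
# sketch of such a helper, assuming it seeds Python, NumPy, and PyTorch (the actual
# implementation lives elsewhere in the repo):
import random

def set_random_seed(seed):
    # Seed every RNG the training code might touch.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)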
                'w_mean': flat_weights.mean().item(),
                'w_std': flat_weights.std().item(),
                'g_norm': gradient_norm.item(),
                'g_min': flat_gradients.min().item(),
                'g_max': flat_gradients.max().item(),
                'g_mean': flat_gradients.mean().item(),
                'g_std': flat_gradients.std().item(),
                'z_norm': z_norm.item(),
                'z_min': z.view(z.size(0), -1).min().item(),
                'z_max': z.view(z.size(0), -1).max().item(),
                'z_mean': z.view(z.size(0), -1).mean().item(),
                'z_std': z.view(z.size(0), -1).std().item(),
                'train_acc': (num_correct / num_total).item()
            }
            iteration_logger.writerow(stats_dict)
            progress_bar.set_postfix(stats_dict)

            if (i + 1) % args.log_every == 0:
                # Save the lists of weight and gradient norms per layer
                with open(
                        os.path.join(save_dir, 'norms',
                                     'w_norm_{}_{}.pkl'.format(epoch, global_iteration)),
                        'wb') as f:
                    pkl.dump(weight_norm_list, f)
                with open(
                        os.path.join(
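# The quantities logged above (`flat_weights`, `flat_gradients`, `gradient_norm`,
# `z_norm`) are computed earlier in the training step, which is not shown in this
# fragment. A plausible sketch of how such flattened statistics are typically
# gathered; the variable `model` and the exact expressions are assumptions:
flat_weights = torch.cat([p.detach().view(-1) for p in model.parameters()])
flat_gradients = torch.cat([p.grad.detach().view(-1)
                            for p in model.parameters() if p.grad is not None])
gradient_norm = flat_gradients.norm(2)
z_norm = z.view(z.size(0), -1).norm(dim=1).mean()  # per-sample latent norm, averaged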
def cnn_val_loss(config={}, reporter=None, callback=None, return_all=False):
    print("Starting cnn_val_loss...")

    ###########################################################################
    # Arguments
    ###########################################################################
    dataset_options = ['cifar10', 'cifar100', 'fashion']

    # Tuning parameters: all of the dropouts
    parser = argparse.ArgumentParser(description='CNN')
    parser.add_argument('--dataset', default='cifar10', choices=dataset_options,
                        help='Choose a dataset (cifar10, cifar100, fashion)')
    parser.add_argument('--model', default='resnet32',
                        choices=['resnet32', 'wideresnet', 'simpleconvnet'],
                        help='Choose a model (resnet32, wideresnet, simpleconvnet)')

    # Optimization hyperparameters
    parser.add_argument('--batch_size', type=int, default=128,
                        help='Input batch size for training (default: 128)')
    parser.add_argument('--epochs', type=int, default=int(config['epochs']),
                        help='Number of epochs to train (default: 200)')
    parser.add_argument('--lr', type=float, default=float(config['lr']),
                        help='Learning rate')
    parser.add_argument('--momentum', type=float, default=float(config['momentum']),
                        help='Nesterov momentum')
    parser.add_argument('--lr_decay', type=float, default=float(config['lr_decay']),
                        help='Factor by which to multiply the learning rate.')
    # parser.add_argument('--weight_decay', type=float, default=float(config['weight_decay']),
    #                     help='Amount of weight decay to use.')
    # parser.add_argument('--dropout', type=float, default=config['dropout'] if 'dropout' in config else 0.0,
    #                     help='Amount of dropout for wideresnet')
    # parser.add_argument('--dropout1', type=float, default=config['dropout1'] if 'dropout1' in config else -1,
    #                     help='Amount of dropout for wideresnet')
    # parser.add_argument('--dropout2', type=float, default=config['dropout2'] if 'dropout2' in config else -1,
    #                     help='Amount of dropout for wideresnet')
    # parser.add_argument('--dropout3', type=float, default=config['dropout3'] if 'dropout3' in config else -1,
    #                     help='Amount of dropout for wideresnet')
    parser.add_argument('--dropout_type', type=str, default=config['dropout_type'],
                        help='Type of dropout (bernoulli or gaussian)')

    # Data augmentation hyperparameters
    parser.add_argument('--inscale', type=float,
                        default=0 if 'inscale' not in config else config['inscale'],
                        help='Input scaling factor')
    parser.add_argument('--hue', type=float,
                        default=0. if 'hue' not in config else config['hue'],
                        help='Hue jitter rate')
    parser.add_argument('--brightness', type=float,
                        default=0. if 'brightness' not in config else config['brightness'],
                        help='Brightness jitter rate')
    parser.add_argument('--saturation', type=float,
                        default=0. if 'saturation' not in config else config['saturation'],
                        help='Saturation jitter rate')
    parser.add_argument('--contrast', type=float,
                        default=0. if 'contrast' not in config else config['contrast'],
                        help='Contrast jitter rate')

    # Per-layer weight decay and dropout hyperparameters
    parser.add_argument('--weight_decays', type=str, default='0.0',
                        help='Amount of weight decay to use for each layer, '
                             'represented as a comma-separated string of floats.')
    parser.add_argument('--dropouts', type=str, default='0.0',
                        help='Dropout rates for each layer, represented as a '
                             'comma-separated string of floats.')
    parser.add_argument('--nonmono', '-nonm', type=int, default=60,
                        help='How many previous epochs to consider for the nonmonotonic criterion')
    parser.add_argument('--patience', type=int, default=75,
                        help='How long to wait for the val loss to improve before early stopping.')
    parser.add_argument('--data_augmentation', action='store_true',
                        default=config['data_augmentation'],
                        help='Augment data by cropping and horizontal flipping')
    parser.add_argument('--log_interval', type=int, default=10,
                        help='How many steps before logging stats from the training set')
    parser.add_argument('--valid_log_interval', type=int, default=50,
                        help='How many steps before logging stats from the validation set')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='Disables CUDA training')
    parser.add_argument('--save', action='store_true', default=False,
                        help='Whether to save the current run')
    parser.add_argument('--seed', type=int, default=11,
                        help='Random seed (default: 11)')
    # --overwrite is referenced when checking for an existing result file below,
    # but was missing from the original parser.
    parser.add_argument('--overwrite', action='store_true', default=False,
                        help='Overwrite an existing result file')
    parser.add_argument('--save_dir', default=config['save_dir'],
                        help='Subdirectory of logdir/savedir to save in (default changes to date/time)')
    args, unknown = parser.parse_known_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")

    cudnn.benchmark = True  # Should make training go faster for large models

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    print(args)
    sys.stdout.flush()

    # args.dropout1 = args.dropout1 if args.dropout1 != -1 else args.dropout
    # args.dropout2 = args.dropout2 if args.dropout2 != -1 else args.dropout
    # args.dropout3 = args.dropout3 if args.dropout3 != -1 else args.dropout

    ###########################################################################
    # Saving
    ###########################################################################
    timestamp = '{:%Y-%m-%d}'.format(datetime.datetime.now())
    random_hash = random.getrandbits(16)
    exp_name = '{}-dset:{}-model:{}-seed:{}-hash:{}'.format(
        timestamp, args.dataset, args.model,
        args.seed if args.seed else 'None', random_hash)

    dropout_rates = [float(value) for value in args.dropouts.split(',')]
    weight_decays = [float(value) for value in args.weight_decays.split(',')]

    # Create log folder
    BASE_SAVE_DIR = 'experiments'
    save_dir = os.path.join(BASE_SAVE_DIR, args.save_dir, exp_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Check whether the result.csv file exists already
    if os.path.exists(os.path.join(save_dir, 'result.csv')):
        if not args.overwrite:
            print('The result file {} exists! Run with --overwrite to overwrite this experiment.'
                  .format(os.path.join(save_dir, 'result.csv')))
            sys.exit(0)

    # Save command-line arguments
    with open(os.path.join(save_dir, 'args.yaml'), 'w') as f:
        yaml.dump(vars(args), f)

    epoch_csv_logger = CSVLogger(
        fieldnames=['epoch', 'train_loss', 'train_acc', 'val_loss', 'val_acc'],
        filename=os.path.join(save_dir, 'epoch_log.csv'))

    ###########################################################################
    # Data Loading/Model/Optimizer
    ###########################################################################
    if args.dataset == 'cifar10':
        train_loader, valid_loader, test_loader = data_loaders.load_cifar10(
            args, args.batch_size, val_split=True, augmentation=args.data_augmentation)
        num_classes = 10
    elif args.dataset == 'cifar100':
        train_loader, valid_loader, test_loader = data_loaders.load_cifar100(
            args, args.batch_size, val_split=True, augmentation=args.data_augmentation)
        num_classes = 100
    elif args.dataset == 'fashion':
        train_loader, valid_loader, test_loader = data_loaders.load_fashion_mnist(
            args.batch_size, val_split=True)
        num_classes = 10

    if args.model == 'resnet32':
        cnn = resnet_cifar.resnet32(dropRates=dropout_rates)
    elif args.model == 'wideresnet':
        cnn = wide_resnet.WideResNet(depth=16, num_classes=num_classes, widen_factor=8,
                                     dropRates=dropout_rates, dropType=args.dropout_type)
        # cnn = wide_resnet.WideResNet(depth=28, num_classes=num_classes, widen_factor=10, dropRate=args.dropout)
    elif args.model == 'simpleconvnet':
        # Note: relies on the --dropout1/2/3 arguments that are commented out above.
        cnn = models.SimpleConvNet(dropType=args.dropout_type,
                                   conv_drop1=args.dropout1,
                                   conv_drop2=args.dropout2,
                                   fc_drop=args.dropout3)

    def optim_parameters(model):
        module_list = [m for m in model.modules()
                       if type(m) == nn.Linear or type(m) == nn.Conv2d]
        # Note: this overrides the per-layer weight decays parsed from --weight_decays.
        weight_decays = [1e-4] * len(module_list)
        return [{'params': layer.parameters(), 'weight_decay': wdecay}
                for (layer, wdecay) in zip(module_list, weight_decays)]

    cnn = cnn.to(device)
    criterion = nn.CrossEntropyLoss()
    # cnn_optimizer = torch.optim.SGD(cnn.parameters(),
    #                                 lr=args.lr,
    #                                 momentum=args.momentum,
    #                                 nesterov=True,
    #                                 weight_decay=args.weight_decay)
    cnn_optimizer = torch.optim.SGD(optim_parameters(cnn),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    nesterov=True)

    ###########################################################################
    # Training/Evaluation
    ###########################################################################
    def evaluate(loader):
        """Returns the loss and accuracy on the entire validation/test set."""
        cnn.eval()
        correct = total = loss = 0.
        with torch.no_grad():
            for images, labels in loader:
                images, labels = images.to(device), labels.to(device)
                pred = cnn(images)
                loss += F.cross_entropy(pred, labels, reduction='sum').item()
                hard_pred = torch.max(pred, 1)[1]
                total += labels.size(0)
                correct += (hard_pred == labels).sum().item()
        accuracy = correct / total
        mean_loss = loss / total
        cnn.train()
        return mean_loss, accuracy

    epoch = 1
    global_step = 0
    patience_elapsed = 0
    stored_loss = 1e8
    best_val_loss = []
    start_time = time.time()

    # This is based on the schedule used for WideResNets. The gamma (decay factor)
    # can also be 0.2 (= 5x decay). The scheduler is currently unused because we
    # use nonmonotonic lr decay (based on validation performance).
    # scheduler = MultiStepLR(cnn_optimizer, milestones=[60, 120, 160], gamma=args.lr_decay)

    while epoch < args.epochs + 1 and patience_elapsed < args.patience:
        running_xentropy = correct = total = 0.

        progress_bar = tqdm(train_loader)
        for i, (images, labels) in enumerate(progress_bar):
            progress_bar.set_description('Epoch ' + str(epoch))
            images, labels = images.to(device), labels.to(device)

            if args.inscale > 0:
                noise = torch.rand(images.size(0), device=device)
                scaled_noise = ((1 + args.inscale) - (1 / (1 + args.inscale))) * noise \
                               + (1 / (1 + args.inscale))
                images = images * scaled_noise[:, None, None, None]
            # images = F.dropout(images, p=args.indropout, training=True)  # TODO: Incorporate input dropout

            cnn.zero_grad()
            pred = cnn(images)
            xentropy_loss = criterion(pred, labels)
            xentropy_loss.backward()
            cnn_optimizer.step()

            running_xentropy += xentropy_loss.item()

            # Calculate running average of accuracy
            _, hard_pred = torch.max(pred, 1)
            total += labels.size(0)
            correct += (hard_pred == labels).sum().item()
            accuracy = correct / float(total)
            global_step += 1
            progress_bar.set_postfix(
                xentropy='%.3f' % (running_xentropy / (i + 1)),
                acc='%.3f' % accuracy,
                lr='%.3e' % cnn_optimizer.param_groups[0]['lr'])

        val_loss, val_acc = evaluate(valid_loader)
        print('Val loss: {:6.4f} | Val acc: {:6.4f}'.format(val_loss, val_acc))
        sys.stdout.flush()
        stats = {
            'global_step': global_step,
            'time': time.time() - start_time,
            'loss': val_loss,
            'acc': val_acc
        }
        # logger.write('valid', stats)

        if (len(best_val_loss) > args.nonmono
                and val_loss > min(best_val_loss[:-args.nonmono])):
            cnn_optimizer.param_groups[0]['lr'] *= args.lr_decay
            print('Decaying the learning rate to {}'.format(
                cnn_optimizer.param_groups[0]['lr']))
            sys.stdout.flush()

        if val_loss < stored_loss:
            with open(os.path.join(save_dir, 'best_checkpoint.pt'), 'wb') as f:
                torch.save(cnn.state_dict(), f)
            print('Saving model (new best validation)')
            sys.stdout.flush()
            stored_loss = val_loss
            patience_elapsed = 0
        else:
            patience_elapsed += 1

        best_val_loss.append(val_loss)
        # scheduler.step(epoch)

        avg_xentropy = running_xentropy / (i + 1)
        train_acc = correct / float(total)

        if callback is not None:
            callback(epoch, avg_xentropy, train_acc, val_loss, val_acc, config)
        if reporter is not None:
            reporter(timesteps_total=epoch, mean_loss=val_loss)

        if cnn_optimizer.param_groups[0]['lr'] < 1e-7:
            # Another stopping criterion, based on the lr having decayed too far.
            break

        epoch += 1

        epoch_row = {
            'epoch': str(epoch),
            'train_loss': avg_xentropy,
            'train_acc': str(train_acc),
            'val_loss': str(val_loss),
            'val_acc': str(val_acc)
        }
        epoch_csv_logger.writerow(epoch_row)

    # Load the best model and evaluate it
    with open(os.path.join(save_dir, 'best_checkpoint.pt'), 'rb') as f:
        cnn.load_state_dict(torch.load(f))

    train_loss = avg_xentropy
    train_acc = correct / float(total)

    # Run on val and test data.
    val_loss, val_acc = evaluate(valid_loader)
    test_loss, test_acc = evaluate(test_loader)
    print('=' * 89)
    print('| End of training | trn loss: {:8.5f} | trn acc {:8.5f} | '
          'val loss {:8.5f} | val acc {:8.5f} | test loss {:8.5f} | test acc {:8.5f}'
          .format(train_loss, train_acc, val_loss, val_acc, test_loss, test_acc))
    print('=' * 89)
    sys.stdout.flush()

    # Save the final val and test performance to a results CSV file
    with open(os.path.join(save_dir, 'result_{}.csv'.format(time.time())), 'w') as result_file:
        result_writer = csv.DictWriter(
            result_file,
            fieldnames=['train_loss', 'train_acc', 'val_loss', 'val_acc', 'test_loss', 'test_acc'])
        result_writer.writeheader()
        result_writer.writerow({
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'test_loss': test_loss,
            'test_acc': test_acc
        })
        result_file.flush()

    if return_all:
        print("RETURNING ", train_loss, train_acc, val_loss, val_acc, test_loss, test_acc)
        sys.stdout.flush()
        return train_loss, train_acc, val_loss, val_acc, test_loss, test_acc
    else:
        print("RETURNING ", stored_loss)
        sys.stdout.flush()
        return stored_loss
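# `cnn_val_loss` is designed to be driven by a hyperparameter-search framework:
# the tunable values arrive through `config` and the best validation loss is
# returned (or all six metrics with `return_all=True`). A minimal usage sketch,
# assuming only the config keys that the argument parsing above requires; the
# specific values are illustrative:
config = {
    'epochs': 200,
    'lr': 0.1,
    'momentum': 0.9,
    'lr_decay': 0.2,
    'dropout_type': 'bernoulli',
    'data_augmentation': True,
    'save_dir': 'tune_run_0',
}
best_val = cnn_val_loss(config=config)
# Or retrieve every final metric:
# tr_loss, tr_acc, v_loss, v_acc, te_loss, te_acc = cnn_val_loss(config=config, return_all=True)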
def experiment():
    parser = argparse.ArgumentParser(description='CNN Hyperparameter Fine-tuning')
    parser.add_argument('--dataset', default='cifar10', choices=['cifar10', 'cifar100'],
                        help='Choose a dataset')
    parser.add_argument('--model', default='resnet18', choices=['resnet18', 'wideresnet'],
                        help='Choose a model')
    parser.add_argument('--num_finetune_epochs', type=int, default=200,
                        help='Number of fine-tuning epochs')
    parser.add_argument('--lr', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--optimizer', type=str, default='sgdm', help='Choose an optimizer')
    parser.add_argument('--batch_size', type=int, default=128, help='Mini-batch size')
    parser.add_argument('--data_augmentation', action='store_true', default=True,
                        help='Whether to use data augmentation')
    parser.add_argument('--wdecay', type=float, default=5e-4, help='Amount of weight decay')
    parser.add_argument('--load_checkpoint', type=str,
                        help='Path to pre-trained checkpoint to load and finetune')
    parser.add_argument('--save_dir', type=str, default='finetuned_checkpoints',
                        help='Save directory for the fine-tuned checkpoint')
    args = parser.parse_args()
    # NOTE: this hard-coded checkpoint path overrides the command-line flag.
    args.load_checkpoint = '/h/lorraine/PycharmProjects/CG_IFT_test/baseline_checkpoints/cifar10_resnet18_sgdm_lr0.1_wd0.0005_aug0.pt'

    if args.dataset == 'cifar10':
        num_classes = 10
        train_loader, val_loader, test_loader = data_loaders.load_cifar10(
            args.batch_size, val_split=True, augmentation=args.data_augmentation)
    elif args.dataset == 'cifar100':
        num_classes = 100
        train_loader, val_loader, test_loader = data_loaders.load_cifar100(
            args.batch_size, val_split=True, augmentation=args.data_augmentation)

    if args.model == 'resnet18':
        cnn = ResNet18(num_classes=num_classes)
    elif args.model == 'wideresnet':
        cnn = WideResNet(depth=28, num_classes=num_classes, widen_factor=10, dropRate=0.3)

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    test_id = '{}_{}_{}_lr{}_wd{}_aug{}'.format(args.dataset, args.model, args.optimizer,
                                                args.lr, args.wdecay, int(args.data_augmentation))
    filename = os.path.join(args.save_dir, test_id + '.csv')
    csv_logger = CSVLogger(
        fieldnames=['epoch', 'train_loss', 'train_acc', 'val_loss', 'val_acc', 'test_loss', 'test_acc'],
        filename=filename)

    checkpoint = torch.load(args.load_checkpoint)
    init_epoch = checkpoint['epoch']
    cnn.load_state_dict(checkpoint['model_state_dict'])
    model = cnn.cuda()
    model.train()

    args.hyper_train = 'augment'  # 'all_weight' # 'weight'

    def init_hyper_train(model):
        init_hyper = None
        if args.hyper_train == 'weight':
            init_hyper = np.sqrt(args.wdecay)
            model.weight_decay = Variable(torch.FloatTensor([init_hyper]).cuda(), requires_grad=True)
            model.weight_decay = model.weight_decay.cuda()
        elif args.hyper_train == 'all_weight':
            num_p = sum(p.numel() for p in model.parameters())
            weights = np.ones(num_p) * np.sqrt(args.wdecay)
            model.weight_decay = Variable(torch.FloatTensor(weights).cuda(), requires_grad=True)
            model.weight_decay = model.weight_decay.cuda()
        model = model.cuda()
        return init_hyper

    if args.hyper_train == 'augment':
        # Don't construct this inside the prior function, else the scope is wrong.
        augment_net = UNet(in_channels=3, n_classes=3, depth=5, wf=6, padding=True,
                           batch_norm=False, up_mode='upconv')  # TODO(PV): Initialize UNet properly
        augment_net = augment_net.cuda()

    def get_hyper_train():
        if args.hyper_train == 'weight' or args.hyper_train == 'all_weight':
            return [model.weight_decay]
        if args.hyper_train == 'augment':
            return augment_net.parameters()

    def get_hyper_train_flat():
        return torch.cat([p.view(-1) for p in get_hyper_train()])  # TODO: Check this size

    init_hyper_train(model)

    if args.hyper_train == 'all_weight':
        wdecay = 0.0
    else:
        wdecay = args.wdecay
    optimizer = optim.SGD(model.parameters(), lr=args.lr * 0.2 * 0.2, momentum=0.9,
                          nesterov=True, weight_decay=wdecay)  # args.wdecay
    # print(checkpoint['optimizer_state_dict'])
    # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler = MultiStepLR(optimizer, milestones=[60, 120], gamma=0.2)  # [60, 120, 160]
    hyper_optimizer = torch.optim.Adam(get_hyper_train(), lr=1e-3)  # try 0.1 as lr

    # Set random regularization hyperparameters
    # data_augmentation_hparams = {}  # Random values for hue, saturation, brightness, contrast, rotation, etc.
    if args.dataset == 'cifar10':
        num_classes = 10
        train_loader, val_loader, test_loader = data_loaders.load_cifar10(
            args.batch_size, val_split=True, augmentation=args.data_augmentation)
    elif args.dataset == 'cifar100':
        num_classes = 100
        train_loader, val_loader, test_loader = data_loaders.load_cifar100(
            args.batch_size, val_split=True, augmentation=args.data_augmentation)

    def test(loader):
        model.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
        correct = 0.
        total = 0.
        losses = []
        for images, labels in loader:
            images = images.cuda()
            labels = labels.cuda()
            with torch.no_grad():
                pred = model(images)
            xentropy_loss = F.cross_entropy(pred, labels)
            losses.append(xentropy_loss.item())
            pred = torch.max(pred.data, 1)[1]
            total += labels.size(0)
            correct += (pred == labels).sum().item()
        avg_loss = float(np.mean(losses))
        acc = correct / total
        model.train()
        return avg_loss, acc

    def prepare_data(x, y):
        x, y = x.cuda(), y.cuda()
        # x, y = Variable(x), Variable(y)
        return x, y

    def train_loss_func(x, y):
        x, y = prepare_data(x, y)
        reg_loss = 0.0
        if args.hyper_train == 'weight':
            pred = model(x)
            xentropy_loss = F.cross_entropy(pred, y)
            for p in model.parameters():
                reg_loss = reg_loss + .5 * (model.weight_decay ** 2) * torch.sum(p ** 2)
        elif args.hyper_train == 'all_weight':
            pred = model(x)
            xentropy_loss = F.cross_entropy(pred, y)
            count = 0
            for p in model.parameters():
                reg_loss = reg_loss + .5 * torch.sum(
                    (model.weight_decay[count: count + p.numel()] ** 2) * torch.flatten(p ** 2))
                count += p.numel()
        elif args.hyper_train == 'augment':
            augmented_x = augment_net(x)
            pred = model(augmented_x)
            xentropy_loss = F.cross_entropy(pred, y)
        return xentropy_loss + reg_loss, pred

    def val_loss_func(x, y):
        x, y = prepare_data(x, y)
        pred = model(x)
        xentropy_loss = F.cross_entropy(pred, y)
        return xentropy_loss

    for epoch in range(init_epoch, init_epoch + args.num_finetune_epochs):
        xentropy_loss_avg = 0.
        total_val_loss = 0.
        correct = 0.
        total = 0.

        progress_bar = tqdm(train_loader)
        for i, (images, labels) in enumerate(progress_bar):
            progress_bar.set_description('Finetune Epoch ' + str(epoch))

            # Take a hyperparameter step
            optimizer.zero_grad(), hyper_optimizer.zero_grad()
            val_loss, weight_norm, grad_norm = hyper_step(
                1, 1, get_hyper_train, get_hyper_train_flat, model, val_loss_func,
                val_loader, train_loss_func, train_loader, hyper_optimizer)
            # del val_loss
            # print(f"hyper: {get_hyper_train()}")

            images, labels = images.cuda(), labels.cuda()
            # pred = model(images)
            # xentropy_loss = F.cross_entropy(pred, labels)
            xentropy_loss, pred = train_loss_func(images, labels)
            optimizer.zero_grad(), hyper_optimizer.zero_grad()
            xentropy_loss.backward()
            optimizer.step()

            xentropy_loss_avg += xentropy_loss.item()

            # Calculate running average of accuracy
            pred = torch.max(pred.data, 1)[1]
            total += labels.size(0)
            correct += (pred == labels.data).sum().item()
            accuracy = correct / total

            progress_bar.set_postfix(
                train='%.5f' % (xentropy_loss_avg / (i + 1)),
                val='%.4f' % (total_val_loss / (i + 1)),
                acc='%.4f' % accuracy,
                weight='%.2f' % weight_norm,
                update='%.3f' % grad_norm)

        val_loss, val_acc = test(val_loader)
        test_loss, test_acc = test(test_loader)
        tqdm.write('val loss: {:6.4f} | val acc: {:6.4f} | test loss: {:6.4f} | test_acc: {:6.4f}'.format(
            val_loss, val_acc, test_loss, test_acc))

        scheduler.step(epoch)

        row = {'epoch': str(epoch),
               'train_loss': str(xentropy_loss_avg / (i + 1)),
               'train_acc': str(accuracy),
               'val_loss': str(val_loss),
               'val_acc': str(val_acc),
               'test_loss': str(test_loss),
               'test_acc': str(test_acc)}
        csv_logger.writerow(row)
def main(opt):
    # Logging
    trace_file = os.path.join(opt['output_dir'], '{}_trace.txt'.format(opt['exp_name']))

    # Load data
    if opt['dataset'] == 'cifar-fs':
        train_data = get_dataset('cifar-fs-train-train', opt['dataroot'])
        val_data = get_dataset('cifar-fs-val', opt['dataroot'])
        test_data = get_dataset('cifar-fs-test', opt['dataroot'])
        tr = get_transform('cifar_resize_normalize')
        normalize = cifar_normalize
    elif opt['dataset'] == 'miniimagenet':
        train_data = get_dataset('miniimagenet-train-train', opt['dataroot'])
        val_data = get_dataset('miniimagenet-val', opt['dataroot'])
        test_data = get_dataset('miniimagenet-test', opt['dataroot'])
        tr = get_transform('cifar_resize_normalize_84')
        normalize = cifar_normalize

    if opt['input_regularization'] == 'oe':
        reg_data = load_ood_data({
            'name': 'tinyimages',
            'ood_scale': 1,
            'n_anom': 50000,
        })

    if not opt['ooe_only']:
        if opt['db']:
            ood_distributions = ['ooe', 'gaussian']
        else:
            ood_distributions = ['ooe', 'gaussian', 'rademacher', 'texture3',
                                 'svhn', 'tinyimagenet', 'lsun']
        if opt['input_regularization'] == 'oe':
            ood_distributions.append('tinyimages')
        ood_tensors = [('ooe', None)] + [(out_name, load_ood_data({
            'name': out_name,
            'ood_scale': 1,
            'n_anom': 10000,
        })) for out_name in ood_distributions[1:]]

    # Load trained model
    loaded = torch.load(opt['model.model_path'])
    if not isinstance(loaded, OrderedDict):
        fs_model = loaded
    else:
        classifier = ResNetClassifier(64, train_data['im_size']).to(device)
        classifier.load_state_dict(loaded)
        fs_model = Protonet(classifier.encoder)
    fs_model.eval()
    fs_model = fs_model.to(device)

    # Init confidence methods
    if opt['confidence_method'] == 'oec':
        init_sample = load_episode(train_data, tr, opt['data.test_way'],
                                   opt['data.test_shot'], opt['data.test_query'], device)
        conf_model = OECConfidence(None, fs_model, init_sample, opt)
    elif opt['confidence_method'] == 'deep-oec':
        init_sample = load_episode(train_data, tr, opt['data.test_way'],
                                   opt['data.test_shot'], opt['data.test_query'], device)
        conf_model = DeepOECConfidence(None, fs_model, init_sample, opt)
    elif opt['confidence_method'] == 'dm-iso':
        encoder = fs_model.encoder
        deep_mahala_obj = DeepMahala(None, None, None, encoder, device,
                                     num_feats=encoder.depth,
                                     num_classes=train_data['n_classes'],
                                     pretrained_path="", fit=False, normalize=None)
        conf_model = DMConfidence(deep_mahala_obj,
                                  {'ls': range(encoder.depth), 'reduction': 'max', 'g_magnitude': .1},
                                  True, 'iso')

    if opt['pretrained_oec_path']:
        conf_model.load_state_dict(torch.load(opt['pretrained_oec_path']))
    conf_model.to(device)
    print(conf_model)

    optimizer = optim.Adam(conf_model.confidence_parameters(), lr=opt['lr'], weight_decay=opt['wd'])
    scheduler = StepLR(optimizer, step_size=opt['lrsche_step_size'], gamma=opt['lrsche_gamma'])

    num_param = sum(p.numel() for p in conf_model.confidence_parameters())
    print(f"Learning Confidence, Number of Parameters -- {num_param}")

    if conf_model.pretrain_parameters() is not None:
        pretrain_optimizer = optim.Adam(conf_model.pretrain_parameters(), lr=10)
        pretrain_iter = 100

    start_idx = 0
    if opt['resume']:
        last_ckpt_path = os.path.join(opt['output_dir'], 'last_ckpt.pt')
        if os.path.exists(last_ckpt_path):
            try:
                last_ckpt = torch.load(last_ckpt_path)
                if 'conf_model' in last_ckpt:
                    conf_model = last_ckpt['conf_model']
                else:
                    sd = last_ckpt['conf_model_sd']
                    conf_model.load_state_dict(sd)
                optimizer = last_ckpt['optimizer']
                pretrain_optimizer = last_ckpt['pretrain_optimizer']
                scheduler = last_ckpt['scheduler']
                start_idx = last_ckpt['outer_idx']
                conf_model.to(device)
            except EOFError:
                print("\n\nResuming but got EOF error, starting from init..\n\n")

    wandb.run.name = opt['exp_name']
    wandb.run.save()
    # try:
    wandb.watch(conf_model)
    # except:  # resuming a run
    #     pass

    # Eval and logging
    confs = {
        opt['confidence_method']: conf_model,
    }
    if opt['confidence_method'] == 'oec':
        confs['ed'] = FSCConfidence(fs_model, 'ed')
    elif opt['confidence_method'] == 'deep-oec':
        encoder = fs_model.encoder
        deep_mahala_obj = DeepMahala(None, None, None, encoder, device,
                                     num_feats=encoder.depth,
                                     num_classes=train_data['n_classes'],
                                     pretrained_path="", fit=False, normalize=None)
        confs['dm'] = DMConfidence(deep_mahala_obj,
                                   {'ls': range(encoder.depth), 'reduction': 'max', 'g_magnitude': 0},
                                   True, 'iso').to(device)

    # Temporal ensemble for evaluation
    if opt['n_ensemble'] > 1:
        nets = [deepcopy(conf_model) for _ in range(opt['n_ensemble'])]
        confs['mixture-' + opt['confidence_method']] = Ensemble(nets, 'mixture')
        confs['poe-' + opt['confidence_method']] = Ensemble(nets, 'poe')
        ensemble_update_interval = opt['eval_every_outer'] // opt['n_ensemble']

    iteration_fieldnames = ['global_iteration']
    for c in confs:
        iteration_fieldnames += [f'{c}_train_ooe', f'{c}_val_ooe', f'{c}_test_ooe', f'{c}_ood']
    iteration_logger = CSVLogger(every=0,
                                 fieldnames=iteration_fieldnames,
                                 filename=os.path.join(opt['output_dir'], 'iteration_log.csv'))

    best_val_ooe = 0
    PATIENCE = 5  # Number of evaluations to wait
    waited = 0

    progress_bar = tqdm(range(start_idx, opt['train_iter']))
    for outer_idx in progress_bar:
        sample = load_episode(train_data, tr, opt['data.test_way'],
                              opt['data.test_shot'], opt['data.test_query'], device)

        conf_model.train()
        if opt['full_supervision']:  # sanity check
            conf_model.support(sample['xs'])
            in_score = conf_model.score(sample['xq'], detach=False).squeeze()
            out_score = conf_model.score(sample['ooc_xq'], detach=False).squeeze()

            out_scores = [out_score]
            for curr_ood, ood_tensor in ood_tensors:
                if curr_ood == 'ooe':
                    continue
                start = outer_idx % (len(ood_tensor) // 2)
                stop = min(start + sample['xq'].shape[0] * sample['xq'].shape[0],
                           len(ood_tensor) // 2)
                oxq = torch.stack([tr(x) for x in ood_tensor[start:stop]]).to(device)
                o = conf_model.score(oxq, detach=False).squeeze()
                out_scores.append(o)
            # out_score = torch.cat(out_scores)
            in_score = in_score.repeat(len(ood_tensors))
            loss, acc = compute_loss_bce(in_score, out_score, mean_center=False)
        else:
            conf_model.support(sample['xs'])

            if opt['interpolate']:
                half_n_way = sample['xq'].shape[0] // 2
                interp = .5 * (sample['xq'][:half_n_way] + sample['xq'][half_n_way:2 * half_n_way])
                sample['ooc_xq'][:half_n_way] = interp

            if opt['input_regularization'] == 'oe':
                # Reshape ooc_xq
                nw, nq, c, h, w = sample['ooc_xq'].shape
                sample['ooc_xq'] = sample['ooc_xq'].view(1, nw * nq, c, h, w)
                oe_bs = int(nw * nq * opt['input_regularization_percent'])
                start = (outer_idx * oe_bs) % len(reg_data)
                end = np.min([start + oe_bs, len(reg_data)])
                oe_batch = torch.stack([tr(x) for x in reg_data[start:end]]).to(device)
                oe_batch = oe_batch.unsqueeze(0)
                sample['ooc_xq'][:, :oe_batch.shape[1]] = oe_batch

            if opt['in_out_1_batch']:
                inps = torch.cat([sample['xq'], sample['ooc_xq']], 1)
                scores = conf_model.score(inps, detach=False).squeeze()
                in_score, out_score = scores[:sample['xq'].shape[1]], scores[sample['xq'].shape[1]:]
            else:
                in_score = conf_model.score(sample['xq'], detach=False).squeeze()
                out_score = conf_model.score(sample['ooc_xq'], detach=False).squeeze()
            loss, acc = compute_loss_bce(in_score, out_score, mean_center=False)

        if conf_model.pretrain_parameters() is not None and outer_idx < pretrain_iter:
            pretrain_optimizer.zero_grad()
            loss.backward()
            pretrain_optimizer.step()
        else:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
        progress_bar.set_postfix(loss='{:.3e}'.format(loss), acc='{:.3e}'.format(acc))

        # Update ensemble
        if opt['n_ensemble'] > 1 and outer_idx % ensemble_update_interval == 0:
            update_ind = (outer_idx // ensemble_update_interval) % opt['n_ensemble']
            if opt['db']:
                print(f"===> Updating Ensemble: {update_ind}")
            confs['mixture-' + opt['confidence_method']].nets[update_ind] = deepcopy(conf_model)
            confs['poe-' + opt['confidence_method']].nets[update_ind] = deepcopy(conf_model)

        # AUROC eval
        if outer_idx % opt['eval_every_outer'] == 0:
            if not opt['eval_in_train']:
                conf_model.eval()

            stats_dict = {'global_iteration': outer_idx}
            for conf_name, conf in confs.items():
                conf.eval()

                # OOE eval
                ooe_aurocs = {}
                for split, in_data in [('train', train_data), ('val', val_data), ('test', test_data)]:
                    auroc = np.mean(
                        eval_ood_aurocs(
                            None, in_data, tr,
                            opt['data.test_way'], opt['data.test_shot'],
                            opt['data.test_query'], opt['data.test_episodes'],
                            device, conf,
                            no_grad=False if opt['confidence_method'].startswith('dm') else True
                        )['aurocs'])
                    ooe_aurocs[split] = auroc
                    print_str = '{}, iter: {} ({}), auroc: {:.3e}'.format(
                        conf_name, outer_idx, split, ooe_aurocs[split])
                    _print_and_log(print_str, trace_file)
                stats_dict[f'{conf_name}_train_ooe'] = ooe_aurocs['train']
                stats_dict[f'{conf_name}_val_ooe'] = ooe_aurocs['val']
                stats_dict[f'{conf_name}_test_ooe'] = ooe_aurocs['test']

                # OOD eval
                if not opt['ooe_only']:
                    aurocs = []
                    for curr_ood, ood_tensor in ood_tensors:
                        auroc = np.mean(
                            eval_ood_aurocs(
                                ood_tensor, test_data, tr,
                                opt['data.test_way'], opt['data.test_shot'],
                                opt['data.test_query'], opt['data.test_episodes'],
                                device, conf,
                                no_grad=False if opt['confidence_method'].startswith('dm') else True
                            )['aurocs'])
                        aurocs.append(auroc)
                        print_str = '{}, iter: {} ({}), auroc: {:.3e}'.format(
                            conf_name, outer_idx, curr_ood, auroc)
                        _print_and_log(print_str, trace_file)
                    mean_ood_auroc = np.mean(aurocs)
                    print_str = '{}, iter: {} (OOD_mean), auroc: {:.3e}'.format(
                        conf_name, outer_idx, mean_ood_auroc)
                    _print_and_log(print_str, trace_file)
                    stats_dict[f'{conf_name}_ood'] = mean_ood_auroc

            iteration_logger.writerow(stats_dict)
            plot_csv(iteration_logger.filename, iteration_logger.filename)
            wandb.log(stats_dict)

            if stats_dict[f'{opt["confidence_method"]}_val_ooe'] > best_val_ooe:
                # Track the best validation OOE AUROC seen so far.
                best_val_ooe = stats_dict[f'{opt["confidence_method"]}_val_ooe']
                conf_model.cpu()
                torch.save(conf_model.state_dict(),
                           os.path.join(opt['output_dir'], opt['exp_name'] + '_conf_best.pt'))
                conf_model.to(device)
                # Ckpt ensemble
                if opt['n_ensemble'] > 1:
                    ensemble = confs['mixture-' + opt['confidence_method']]
                    ensemble.cpu()
                    torch.save(ensemble.state_dict(),
                               os.path.join(opt['output_dir'], opt['exp_name'] + '_ensemble_best.pt'))
                    ensemble.to(device)
                waited = 0
            else:
                waited += 1
                if waited >= PATIENCE:
                    print("PATIENCE exceeded...exiting")
                    sys.exit()

            # For `resume`
            conf_model.cpu()
            torch.save({
                'conf_model_sd': conf_model.state_dict(),
                'optimizer': optimizer,
                'pretrain_optimizer': pretrain_optimizer if conf_model.pretrain_parameters() is not None else None,
                'scheduler': scheduler,
                'outer_idx': outer_idx,
            }, os.path.join(opt['output_dir'], 'last_ckpt.pt'))
            conf_model.to(device)
            conf_model.train()

    sys.exit()
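# `compute_loss_bce` treats the in-episode query scores as the positive class and
# the out-of-episode/OOD scores as the negative class. A minimal sketch of such a
# binary cross-entropy objective over the two score tensors; the repo's actual
# implementation (including how `mean_center` is used) may differ:
def compute_loss_bce(in_score, out_score, mean_center=False):
    scores = torch.cat([in_score, out_score])
    if mean_center:
        scores = scores - scores.mean()
    targets = torch.cat([torch.ones_like(in_score), torch.zeros_like(out_score)])
    loss = F.binary_cross_entropy_with_logits(scores, targets)
    # Accuracy of thresholding the raw scores at 0 (i.e. probability 0.5).
    acc = ((scores > 0).float() == targets).float().mean()
    return loss, acc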
            xentropy_loss_avg += xentropy_loss.item()

            # Calculate running average of accuracy
            pred = torch.max(pred.data, 1)[1]
            total += labels.size(0)
            correct += (pred == labels.data).sum().item()
            accuracy = correct / total

            progress_bar.set_postfix(xentropy='%.3f' % (xentropy_loss_avg / (i + 1)),
                                     acc='%.3f' % accuracy)

        val_loss, val_acc = test(val_loader)
        test_loss, test_acc = test(test_loader)
        tqdm.write('val loss: {:6.4f} | val acc: {:6.4f} | test loss: {:6.4f} | test_acc: {:6.4f}'
                   .format(val_loss, val_acc, test_loss, test_acc))
        # scheduler.step(epoch)

        row = {
            'epoch': str(epoch),
            'train_loss': str(xentropy_loss_avg / (i + 1)),
            'train_acc': str(accuracy),
            'val_loss': str(val_loss),
            'val_acc': str(val_acc),
            'test_loss': str(test_loss),
            'test_acc': str(test_acc)
        }
        csv_logger.writerow(row)