def ce_gradient_norm_histogram(model, data_loader, tensorboard, epoch, name, max_num_examples=5000):
    """Log a histogram of squared cross-entropy gradient norms w.r.t. the logits.

    For the first ``min(len(dataset), max_num_examples)`` examples the vector
    ``softmax(pred) - one_hot(label)`` is formed, its squared entries are summed
    per example, and the result is passed to ``tensorboard.add_histogram``.

    Args:
        model: Model to evaluate; must expose ``num_classes``. Put in eval mode.
        data_loader: Loader whose ``dataset`` yields ``(input, label, ...)`` items.
        tensorboard: Writer object providing ``add_histogram``.
        epoch: Value used as the histogram's ``global_step``.
        name: Tag of the histogram.
        max_num_examples: Upper bound on the number of examples used.
    """
    model.eval()
    dataset = data_loader.dataset
    outputs = utils.apply_on_dataset(model=model,
                                     dataset=dataset,
                                     output_keys_regexp='pred',
                                     description='grad-histogram:pred',
                                     max_num_examples=max_num_examples)
    logits = outputs['pred']

    # NOTE(review): assumes apply_on_dataset processes the same leading examples,
    # in dataset order, as the labels collected here -- confirm against utils.
    num_used = min(len(dataset), max_num_examples)
    targets = torch.tensor([dataset[i][1] for i in range(num_used)], dtype=torch.long)
    one_hot_targets = utils.to_cpu(F.one_hot(targets, num_classes=model.num_classes).float())

    residual = torch.softmax(logits, dim=-1) - one_hot_targets
    squared_norms = utils.to_numpy(torch.sum(residual**2, dim=-1))

    try:
        tensorboard.add_histogram(tag=name, values=squared_norms, global_step=epoch)
    except ValueError as err:
        # Best effort: a failing histogram must not interrupt the caller.
        print("Tensorboard histogram error: {}".format(err))
def ce_gradient_pair_scatter(model, data_loader, d1=0, d2=1, max_num_examples=2000, plt=None):
    """Scatter-plot two coordinates of ``softmax(pred) - one_hot(label)``.

    Args:
        model: Model to evaluate; must expose ``num_classes``. Put in eval mode.
        data_loader: Loader whose ``dataset`` yields ``(input, label, ...)`` items.
        d1: Index of the coordinate drawn on the x axis.
        d2: Index of the coordinate drawn on the y axis.
        max_num_examples: Upper bound on the number of examples used.
        plt: Plotting module to draw with; ``matplotlib.pyplot`` when ``None``.

    Returns:
        The tuple ``(fig, plt)``: the created figure and the plotting module used.
    """
    plt = matplotlib.pyplot if plt is None else plt
    model.eval()

    dataset = data_loader.dataset
    logits = utils.apply_on_dataset(model=model,
                                    dataset=dataset,
                                    output_keys_regexp='pred',
                                    max_num_examples=max_num_examples,
                                    description='grad-pair-scatter:pred')['pred']

    num_used = min(len(dataset), max_num_examples)
    targets = torch.tensor([dataset[i][1] for i in range(num_used)], dtype=torch.long)
    one_hot_targets = utils.to_cpu(F.one_hot(targets, num_classes=model.num_classes).float())

    residual = utils.to_numpy(torch.softmax(logits, dim=-1) - one_hot_targets)

    fig, ax = plt.subplots(1, figsize=(5, 5))
    # Drawn through the module-level API, i.e. onto the axes just created.
    plt.scatter(residual[:, d1], residual[:, d2])
    ax.set_xlabel(str(d1))
    ax.set_ylabel(str(d2))
    # Axis limits are left at their defaults (no percentile clipping).
    ax.set_title('Two coordinates of grad wrt to logits')
    return fig, plt
def pred_gradient_pair_scatter(model, data_loader, d1=0, d2=1, max_num_examples=2000, plt=None):
    """Scatter-plot two coordinates of the model's ``grad_pred`` output.

    Args:
        model: Model to evaluate; put in eval mode. Its outputs must contain a
            ``grad_pred`` entry.
        data_loader: Loader whose ``dataset`` is evaluated.
        d1: Index of the coordinate drawn on the x axis.
        d2: Index of the coordinate drawn on the y axis.
        max_num_examples: Upper bound on the number of examples used.
        plt: Plotting module to draw with; ``matplotlib.pyplot`` when ``None``.

    Returns:
        The tuple ``(fig, plt)``: the created figure and the plotting module used.
    """
    plt = matplotlib.pyplot if plt is None else plt
    model.eval()

    outputs = utils.apply_on_dataset(model=model,
                                     dataset=data_loader.dataset,
                                     output_keys_regexp='grad_pred',
                                     max_num_examples=max_num_examples,
                                     description='grad-pair-scatter:grad_pred')
    predicted_grads = utils.to_numpy(outputs['grad_pred'])

    fig, ax = plt.subplots(1, figsize=(5, 5))
    # Drawn through the module-level API, i.e. onto the axes just created.
    plt.scatter(predicted_grads[:, d1], predicted_grads[:, d2])
    ax.set_xlabel(str(d1))
    ax.set_ylabel(str(d2))
    # Axis limits are left at their defaults (no percentile clipping).
    ax.set_title('Two coordinates of grad wrt to logits')
    return fig, plt
def __init__(self, model, train_data, val_data=None, l2_reg_coef=0.0, **kwargs):
    """Snapshot parameters, Jacobians and predictions of ``model`` at initialization.

    Args:
        model: The network to linearize.
        train_data: Training dataset; Jacobians and predictions are computed on it.
        val_data: Optional validation dataset, processed the same way when given.
        l2_reg_coef: L2 regularization coefficient, stored on the instance.
        **kwargs: Forwarded to the parent class constructor.
    """
    super(LinearizedModelV2, self).__init__(**kwargs)
    self.model = model
    self.train_data = train_data
    self.val_data = val_data
    self.l2_reg_coef = l2_reg_coef

    # Frozen copy of the parameters at initialization.
    self.init_params = copy.deepcopy(dict(model.named_parameters()))
    for param in self.init_params.values():
        param.detach_()
        param.requires_grad = False  # the copies must not be trained

    # Partitions to process, in order: train first, then val when provided.
    partitions = [('train', train_data)]
    if val_data is not None:
        partitions.append(('val', val_data))

    # Jacobians of the model on every partition.
    self.jacobians = dict()
    jacobian_estimator = JacobianEstimator(projection='none')
    for part_name, part_data in partitions:
        self.jacobians[part_name] = jacobian_estimator.compute_jacobian(
            model=model, dataset=part_data, cpu=False)
    for part_jacobians in self.jacobians.values():
        for jac in part_jacobians.values():
            jac.detach_()  # in case some computation graph was built

    # Predictions of the model at initialization on every partition.
    self.init_preds = dict()
    for part_name, part_data in partitions:
        self.init_preds[part_name] = utils.apply_on_dataset(
            model=model, dataset=part_data, output_keys_regexp='pred',
            cpu=False)['pred']
    for part_preds in self.init_preds.values():
        part_preds.detach_()  # in case some computation graph was built
def main():
    """Evaluate a saved model on the test split and optionally store the results.

    Parses the command line, loads the data and the model given by
    ``--load_from``, prints the test accuracy and, when ``--output_dir`` is
    set, writes ``test_predictions.pkl`` and ``test_accuracy.txt`` there.
    """
    dataset_choices = [
        'mnist', 'uniform-noise-mnist', 'cifar10', 'uniform-noise-cifar10',
        'pair-noise-cifar10', 'cifar100', 'uniform-noise-cifar100',
        'clothing1m', 'imagenet'
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument('--device', '-d', default='cuda')
    parser.add_argument('--batch_size', '-b', type=int, default=256)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--dataset', '-D', type=str, default='mnist',
                        choices=dataset_choices)
    parser.add_argument('--data_augmentation', '-A',
                        dest='data_augmentation', action='store_true')
    parser.set_defaults(data_augmentation=False)
    parser.add_argument('--num_train_examples', type=int, default=None)
    parser.add_argument('--error_prob', '-n', type=float, default=0.0)
    parser.add_argument('--clean_validation',
                        dest='clean_validation', action='store_true')
    parser.set_defaults(clean_validation=False)
    parser.add_argument('--load_from', type=str, default=None, required=True)
    parser.add_argument('--output_dir', '-o', type=str, default=None)
    args = parser.parse_args()
    print(args)

    # Only the test loader is needed here.
    _train_loader, _val_loader, test_loader, _extra = load_data_from_arguments(args)

    print(f"Testing the model saved at {args.load_from}")
    model = utils.load(args.load_from, methods=methods, device=args.device)
    outputs = utils.apply_on_dataset(model,
                                     test_loader.dataset,
                                     batch_size=args.batch_size,
                                     output_keys_regexp='pred|label',
                                     description='Testing')
    pred, labels = outputs['pred'], outputs['label']

    save_results = args.output_dir is not None
    if save_results:
        predictions_path = os.path.join(args.output_dir, 'test_predictions.pkl')
        with open(predictions_path, 'wb') as f:
            pickle.dump({'pred': pred, 'labels': labels}, f)

    is_correct = (pred.argmax(dim=1) == labels).float()
    accuracy = torch.mean(is_correct)
    print(accuracy)

    if save_results:
        accuracy_path = os.path.join(args.output_dir, 'test_accuracy.txt')
        with open(accuracy_path, 'w') as f:
            f.write("{}\n".format(accuracy))
def estimate_transition(load_from, data_loader, device='cpu', batch_size=256, filter_outlier=True):
    """Estimate the label-noise transition matrix from a trained model.

    The code is adapted from the original implementation.
    Source: https://github.com/giorgiop/loss-correction/.

    For every class ``i`` a "perfect example" is picked: the example with the
    highest softmax probability for class ``i`` (optionally ignoring examples
    at or above the 97th percentile of that probability). Row ``i`` of the
    matrix is the softmax output of that example.

    Unlike the reference code, the outlier filter works on a copy of the
    probability column. The reference zeroes the column in place, which
    silently alters the probabilities later copied into other rows.

    Args:
        load_from: Path of the saved model; must not be ``None``.
        data_loader: Loader whose ``dataset`` is used for the estimation.
        device: Device on which the model is loaded and the result is placed.
        batch_size: Batch size used when computing predictions.
        filter_outlier: If true (default), skip examples whose class
            probability is at or above the 97th percentile for that class.

    Returns:
        A float ``torch.Tensor`` of shape ``(num_classes, num_classes)`` on
        ``device`` whose rows are normalized to sum to one.
    """
    assert load_from is not None
    model = utils.load(load_from, methods=methods, device=device)
    pred = utils.apply_on_dataset(model=model,
                                  dataset=data_loader.dataset,
                                  batch_size=batch_size,
                                  cpu=True,
                                  description="Estimating transition matrix",
                                  output_keys_regexp='pred')['pred']
    pred = utils.to_numpy(torch.softmax(pred, dim=1))

    c = model.num_classes
    T = np.zeros((c, c))

    # find a 'perfect example' for each class
    for i in range(c):
        class_probs = pred[:, i]
        if filter_outlier:
            try:
                thresh = np.percentile(class_probs, 97, method='higher')
            except TypeError:
                # NumPy < 1.22 only knows the older keyword name.
                thresh = np.percentile(class_probs, 97, interpolation='higher')
            # np.where builds a new array, so `pred` itself stays untouched.
            class_probs = np.where(class_probs >= thresh, 0.0, class_probs)
        idx_best = np.argmax(class_probs)
        T[i, :] = pred[idx_best, :]

    # row normalize
    T /= T.sum(axis=1, keepdims=True)

    T = torch.tensor(T, dtype=torch.float).to(device)
    print(T)
    return T
def pred_gradient_norm_histogram(model, data_loader, tensorboard, epoch, name, max_num_examples=5000):
    """Log a histogram of the squared norms of the model's ``grad_pred`` output.

    Args:
        model: Model to evaluate; put in eval mode. Its outputs must contain a
            ``grad_pred`` entry.
        data_loader: Loader whose ``dataset`` is evaluated.
        tensorboard: Writer object providing ``add_histogram``.
        epoch: Value used as the histogram's ``global_step``.
        name: Tag of the histogram.
        max_num_examples: Upper bound on the number of examples used.
    """
    model.eval()
    outputs = utils.apply_on_dataset(model=model,
                                     dataset=data_loader.dataset,
                                     output_keys_regexp='grad_pred',
                                     description='grad-histogram:grad_pred',
                                     max_num_examples=max_num_examples)
    predicted_grads = outputs['grad_pred']

    # Sum of squared entries along the last dimension, one value per example.
    squared_norms = utils.to_numpy(torch.sum(predicted_grads**2, dim=-1))

    try:
        tensorboard.add_histogram(tag=name, values=squared_norms, global_step=epoch)
    except ValueError as err:
        # Best effort: a failing histogram must not interrupt the caller.
        print("Tensorboard histogram error: {}".format(err))
def main():
    """Render per-step weight plots on synthetic data and assemble them into a video.

    Builds a model from the JSON config, computes its Jacobians, initial
    predictions and NTK on the training half of the synthetic data, evaluates
    ``weight_stability`` for t = 0, 20, ..., 1000, saves one PNG per step and
    finally invokes ``ffmpeg`` in the plot directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--config', '-c', type=str,
        default='sample_info/configs/1hidden-mlp-n1024-binary-mnist.json')
    parser.add_argument('--device', '-d', default='cuda',
                        help='specifies the main device')
    parser.add_argument('--seed', type=int, default=42)

    # hyper-parameters
    parser.add_argument('--model_class', '-m', type=str, default='ClassifierL2')
    parser.add_argument('--lr', type=float, default=1e-2, help='Learning rate')
    args = parser.parse_args()
    print(args)

    # Build data: first half for training, second half for validation.
    data_X, data_Y = get_synthetic_data(args.seed)
    half = len(data_X) // 2

    def as_dataset(xs, ys):
        """Wrap raw inputs/labels as float inputs and (n, 1) long labels."""
        return TensorDataset(torch.tensor(xs).float(),
                             torch.tensor(ys).long().reshape((-1, 1)))

    train_data = as_dataset(data_X[:half], data_Y[:half])
    val_data = as_dataset(data_X[half:], data_Y[half:])  # built but not used below

    with open(args.config, 'r') as f:
        architecture_args = json.load(f)

    model_class = getattr(methods, args.model_class)
    model = model_class(input_shape=train_data[0][0].shape,
                        architecture_args=architecture_args,
                        device=args.device)

    # Quantities at initialization, computed on the training split only.
    jacobian_estimator = JacobianEstimator(projection='none')
    jacobians = jacobian_estimator.compute_jacobian(model=model,
                                                    dataset=train_data,
                                                    output_key='pred',
                                                    cpu=False)
    init_preds = utils.apply_on_dataset(model=model, dataset=train_data,
                                        cpu=False)['pred']
    init_params = dict(model.named_parameters())
    ntk = compute_ntk(jacobians=jacobians)

    Y = torch.stack([torch.tensor([y]) for (x, y) in train_data])
    Y = Y.float().to(ntk.device)

    num_train = len(train_data)
    step_size = args.lr / num_train
    ts = range(0, 1001, 20)
    for idx, t in tqdm(enumerate(ts), desc='main loop', total=len(ts)):
        _, q = weight_stability(t=t,
                                n=num_train,
                                eta=step_size,
                                init_params=init_params,
                                jacobians=jacobians,
                                ntk=ntk,
                                init_preds=init_preds,
                                Y=Y,
                                continuous=False,
                                return_change_vectors=False,
                                scale_by_hessian=False)

        fig, ax = plot(q, data_X=data_X, data_Y=data_Y, half=half, t=t)
        file_path = f'sample_info/plots/synthetic-data/weight-{idx:04d}.png'
        utils.make_path(os.path.dirname(file_path))
        fig.savefig(file_path)
        plt.close()

    # save video: run ffmpeg inside the plot directory, then return to where we were
    cur_dir = os.path.abspath(os.curdir)
    os.chdir('sample_info/plots/synthetic-data')
    os.system("ffmpeg -r 2 -i weight-%04d.png movie.webm")
    os.chdir(cur_dir)
def prepare_needed_items(model, train_data, test_data=None, projection='none', cpu=False, batch_size=256, **kwargs):
    """Compute the quantities at initialization needed by the NTK-based analyses.

    Args:
        model: The network whose Jacobians, predictions and parameters are used.
        train_data: Training dataset.
        test_data: Optional test dataset; test entries of the result are
            ``None`` when it is not given.
        projection: Projection mode forwarded to ``JacobianEstimator``.
        cpu: Forwarded to the Jacobian / prediction helpers; when true the
            parameters are also moved to the CPU.
        batch_size: Batch size used when computing predictions.
        **kwargs: Extra keyword arguments forwarded to ``JacobianEstimator``.

    Returns:
        A dict with keys ``jacobian_estimator``, ``train_jacobians``,
        ``test_jacobians``, ``train_init_preds``, ``test_init_preds``,
        ``init_params``, ``ntk``, ``test_train_ntk``, ``train_Y`` and ``test_Y``.
    """
    has_test = test_data is not None

    jacobian_estimator = JacobianEstimator(projection=projection, **kwargs)
    train_jacobians = jacobian_estimator.compute_jacobian(model=model,
                                                          dataset=train_data,
                                                          output_key='pred',
                                                          cpu=cpu)
    test_jacobians = None
    if has_test:
        test_jacobians = jacobian_estimator.compute_jacobian(model=model,
                                                             dataset=test_data,
                                                             output_key='pred',
                                                             cpu=cpu)

    train_init_preds = utils.apply_on_dataset(model=model,
                                              dataset=train_data,
                                              cpu=cpu,
                                              batch_size=batch_size)['pred']
    test_init_preds = None
    if has_test:
        test_init_preds = utils.apply_on_dataset(model=model,
                                                 dataset=test_data,
                                                 cpu=cpu,
                                                 batch_size=batch_size)['pred']

    init_params = dict(model.named_parameters())
    if cpu:
        for k, v in init_params.items():
            init_params[k] = v.to('cpu')

    ntk = compute_ntk(jacobians=train_jacobians)

    # Real parts of the NTK eigenvalues. `torch.eig` no longer exists in
    # current PyTorch, so use `torch.linalg.eigvals` when available and keep
    # the legacy call only for old versions that lack it.
    if hasattr(torch, 'linalg') and hasattr(torch.linalg, 'eigvals'):
        lamb = torch.linalg.eigvals(ntk).real
    else:
        lamb, _ = torch.eig(ntk)
        lamb = lamb[:, 0]

    min_eig = torch.min(lamb).item()
    max_eig = torch.max(lamb).item()
    logging.info(f'Min eigenvalue of NTK: {min_eig:.3f}\t'
                 f'Max eigenvalue of NTK: {max_eig:.3f}')
    if min_eig < 0:
        logging.warning(
            'The lowest eigenvalue of NTK is negative, consider adding at least small weight decay.'
        )

    test_train_ntk = None
    if has_test:
        test_train_ntk = compute_test_train_ntk(
            train_jacobians=train_jacobians, test_jacobians=test_jacobians)

    def extract_labels(data):
        """Stack the flattened labels of ``data`` as floats on the NTK's device."""
        ys = [
            utils.to_tensor(y, device=ntk.device).view((-1, ))
            for x, y in data
        ]
        return torch.stack(ys).float()

    train_Y = extract_labels(train_data)
    test_Y = extract_labels(test_data) if has_test else None

    return {
        'jacobian_estimator': jacobian_estimator,
        'train_jacobians': train_jacobians,
        'test_jacobians': test_jacobians,
        'train_init_preds': train_init_preds,
        'test_init_preds': test_init_preds,
        'init_params': init_params,
        'ntk': ntk,
        'test_train_ntk': test_train_ntk,
        'train_Y': train_Y,
        'test_Y': test_Y
    }
def main():
    """Train a model after removing a fraction of the training examples.

    The examples to drop come from a ranking: a seeded random permutation by
    default, or the ordering stored in ``--sample_ranking_file``. After
    training, validation predictions and accuracy are pickled under
    ``<output_dir>/<baseline_name>/<exp_name>/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', '-c', type=str, required=True)
    parser.add_argument('--device', '-d', default='cuda',
                        help='specifies the main device')
    parser.add_argument(
        '--all_device_ids', nargs='+', type=str, default=None,
        help="If not None, this list specifies devices for multiple GPU training. "
             "The first device should match with the main device (args.device).")
    parser.add_argument('--batch_size', '-b', type=int, default=256)
    parser.add_argument('--epochs', '-e', type=int, default=400)
    parser.add_argument('--stopping_param', type=int, default=2**30)
    parser.add_argument('--save_iter', '-s', type=int, default=2**30)
    parser.add_argument('--vis_iter', '-v', type=int, default=2**30)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument(
        '--num_accumulation_steps', default=1, type=int,
        help='Number of training steps to accumulate before updating weights')

    # data parameters
    parser.add_argument('--dataset', '-D', type=str, default='mnist')
    parser.add_argument('--data_augmentation', '-A',
                        dest='data_augmentation', action='store_true')
    parser.set_defaults(data_augmentation=False)
    parser.add_argument('--error_prob', '-n', type=float, default=0.0)
    parser.add_argument('--num_train_examples', type=int, default=None)
    parser.add_argument('--clean_validation', action='store_true', default=False)
    parser.add_argument('--resize_to_imagenet',
                        dest='resize_to_imagenet', action='store_true')
    parser.set_defaults(resize_to_imagenet=False)
    parser.add_argument('--cache_dataset',
                        dest='cache_dataset', action='store_true')
    parser.set_defaults(cache_dataset=False)
    parser.add_argument(
        '--sample_ranking_file', type=str, default=None,
        help='Points to a pickle file that stores an ordering of examples from least to '
             'most important. The most important args.exclude_ratio number of samples '
             'will be excluded from training.')
    parser.add_argument('--exclude_ratio', type=float, default=0.0,
                        help='Fraction of examples to exclude.')
    parser.add_argument('--exclude_side', type=str, default='top',
                        choices=['top', 'bottom'],
                        help='from which side of the order to remove')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers in data loaders')

    # hyper-parameters
    parser.add_argument('--model_class', '-m', type=str, default='ClassifierL2')
    parser.add_argument('--l2_reg_coef', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=1e-3, help='Learning rate')
    parser.add_argument('--optimizer', type=str, default='adam',
                        choices=['adam', 'sgd'])
    parser.add_argument('--random_baseline_seed', type=int, default=42)

    parser.add_argument('--output_dir', '-o', type=str,
                        default='sample_info/results/data-summarization/')
    parser.add_argument('--baseline_name', '-B', type=str, required=True)
    parser.add_argument('--exp_name', '-E', type=str, required=True)
    args = parser.parse_args()
    print(args)

    # set tensorboard log directory
    args.log_dir = os.path.join(args.output_dir, args.baseline_name,
                                args.exp_name, 'logs')
    utils.make_path(args.log_dir)

    # Load data
    train_data, val_data, test_data, _ = load_data_from_arguments(
        args, build_loaders=False)
    num_train = len(train_data)

    # Default ranking: a seeded random permutation (always drawn, so the global
    # NumPy RNG state is the same whether or not a ranking file is given).
    np.random.seed(args.random_baseline_seed)
    order = np.random.permutation(num_train)

    # A ranking file, when given, overrides the random order.
    # NOTE: pickle.load executes arbitrary code; only pass trusted files.
    if args.sample_ranking_file is not None:
        with open(args.sample_ranking_file, 'rb') as f:
            order = pickle.load(f)

    # 'top' removes from the end of the ranking, 'bottom' from its start.
    exclude_count = int(args.exclude_ratio * num_train)
    if exclude_count == 0:
        exclude_indices = []
    elif args.exclude_side == 'top':
        exclude_indices = order[-exclude_count:]
    else:
        exclude_indices = order[:exclude_count]

    train_data = SubsetDataWrapper(dataset=train_data,
                                   exclude_indices=exclude_indices)

    if args.cache_dataset:
        train_data = CacheDatasetWrapper(train_data)
        val_data = CacheDatasetWrapper(val_data)
        test_data = CacheDatasetWrapper(test_data)

    # Shuffle only when one update does not already cover the whole training set.
    examples_per_update = args.batch_size * args.num_accumulation_steps
    shuffle_train = (examples_per_update < len(train_data))
    train_loader, val_loader, test_loader = get_loaders_from_datasets(
        train_data, val_data, test_data,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle_train=shuffle_train)

    # Options
    optimization_args = {
        'optimizer': {
            'name': args.optimizer,
            'lr': args.lr,
        }
    }

    with open(args.config, 'r') as f:
        architecture_args = json.load(f)

    model_class = getattr(methods, args.model_class)
    model = model_class(input_shape=train_loader.dataset[0][0].shape,
                        architecture_args=architecture_args,
                        l2_reg_coef=args.l2_reg_coef,
                        device=args.device,
                        seed=args.seed)

    # put the model in always eval mode. This makes sure that in case the network has pretrained BatchNorm
    # layers, their running average is fixed.
    utils.put_always_eval_mode(model)

    # Labels with at least one dimension are treated as one-hot by the metric.
    labels_are_one_hot = (train_data[0][1].ndim > 0)
    accuracy_metric = metrics.Accuracy(output_key='pred',
                                       one_hot=labels_are_one_hot)
    metrics_list = [accuracy_metric]
    if args.dataset == 'imagenet':
        metrics_list.append(metrics.TopKAccuracy(k=5, output_key='pred'))

    stopper = callbacks.EarlyStoppingWithMetric(
        metric=accuracy_metric,
        stopping_param=args.stopping_param,
        partition='val',
        direction='max')

    training.train(model=model,
                   train_loader=train_loader,
                   val_loader=val_loader,
                   epochs=args.epochs,
                   save_iter=args.save_iter,
                   vis_iter=args.vis_iter,
                   optimization_args=optimization_args,
                   log_dir=args.log_dir,
                   args_to_log=args,
                   stopper=stopper,
                   metrics=metrics_list,
                   device_ids=args.all_device_ids,
                   num_accumulation_steps=args.num_accumulation_steps)

    val_preds = utils.apply_on_dataset(model=model,
                                       dataset=val_data,
                                       cpu=True,
                                       partition='val',
                                       batch_size=args.batch_size)['pred']
    val_acc = accuracy_metric.value(epoch=args.epochs - 1, partition='val')

    # The random baseline also encodes its seed in the file name.
    file_name = f'results-{args.exclude_ratio:.4f}'
    if args.baseline_name == 'random':
        file_name += f'-{args.random_baseline_seed}'
    file_name += '.pkl'
    file_path = os.path.join(args.output_dir, args.baseline_name,
                             args.exp_name, file_name)
    utils.make_path(os.path.dirname(file_path))

    results = {'val_preds': val_preds, 'val_acc': val_acc, 'args': args}
    with open(file_path, 'wb') as f:
        pickle.dump(results, f)
def main():
    """Train on the full data or with one example removed and save the outcome.

    With ``--exclude_index`` unset the results are written to
    ``full-data-training.pkl``; otherwise to ``<exclude_index>.pkl``. Both live
    in ``<output_dir>/<exp_name>/`` and contain the final weights (on CPU),
    the validation predictions, the validation accuracy and the parsed
    arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', '-c', type=str, required=True)
    parser.add_argument('--device', '-d', default='cuda',
                        help='specifies the main device')
    parser.add_argument(
        '--all_device_ids', nargs='+', type=str, default=None,
        help="If not None, this list specifies devices for multiple GPU training. "
             "The first device should match with the main device (args.device).")
    parser.add_argument('--batch_size', '-b', type=int, default=2**20)
    parser.add_argument('--epochs', '-e', type=int, default=2000)
    parser.add_argument('--stopping_param', type=int, default=2**20)
    parser.add_argument('--save_iter', '-s', type=int, default=2**20)
    parser.add_argument('--vis_iter', '-v', type=int, default=2**20)
    parser.add_argument('--log_dir', '-l', type=str,
                        default='sample_info/logs/junk')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument(
        '--num_accumulation_steps', default=1, type=int,
        help='Number of training steps to accumulate before updating weights')

    # data parameters
    parser.add_argument(
        '--dataset', '-D', type=str, default='mnist4vs9',
        choices=['mnist4vs9', 'synthetic', 'cifar10-cat-vs-dog', 'cats-and-dogs'],
        help='Which dataset to use. One can add more choices if needed.')
    parser.add_argument('--data_augmentation', '-A',
                        dest='data_augmentation', action='store_true')
    parser.set_defaults(data_augmentation=False)
    parser.add_argument('--error_prob', '-n', type=float, default=0.0)
    parser.add_argument('--num_train_examples', type=int, default=None)
    parser.add_argument('--clean_validation', action='store_true', default=False)
    parser.add_argument('--resize_to_imagenet',
                        dest='resize_to_imagenet', action='store_true')
    parser.set_defaults(resize_to_imagenet=False)
    parser.add_argument('--cache_dataset',
                        dest='cache_dataset', action='store_true')
    parser.set_defaults(cache_dataset=False)
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers in data loaders')
    parser.add_argument('--exclude_index', type=int, default=None,
                        help='Index of an example to remove.')

    # hyper-parameters
    parser.add_argument('--model_class', '-m', type=str, default='ClassifierL2')
    parser.add_argument('--linearized', action='store_true', dest='linearized')
    parser.set_defaults(linearized=False)
    parser.add_argument('--l2_reg_coef', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=1e-3, help='Learning rate')
    parser.add_argument('--optimizer', type=str, default='sgd',
                        choices=['adam', 'sgd'])

    parser.add_argument(
        '--output_dir', '-o', type=str,
        default='sample_info/results/ground-truth/ground-truth/')
    parser.add_argument('--exp_name', '-E', type=str, required=True)
    args = parser.parse_args()
    print(args)

    # Build data
    train_data, val_data, test_data, _ = load_data_from_arguments(
        args, build_loaders=False)

    # Leave-one-out: drop the requested example, if any.
    leave_one_out = args.exclude_index is not None
    if leave_one_out:
        train_data = SubsetDataWrapper(dataset=train_data,
                                       exclude_indices=[args.exclude_index])

    if args.cache_dataset:
        train_data = CacheDatasetWrapper(train_data)
        val_data = CacheDatasetWrapper(val_data)
        test_data = CacheDatasetWrapper(test_data)

    # Shuffle only when one update does not already cover the whole training set.
    examples_per_update = args.batch_size * args.num_accumulation_steps
    shuffle_train = (examples_per_update < len(train_data))
    train_loader, val_loader, test_loader = get_loaders_from_datasets(
        train_data, val_data, test_data,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle_train=shuffle_train)

    # Options
    optimization_args = {
        'optimizer': {
            'name': args.optimizer,
            'lr': args.lr,
        }
    }

    with open(args.config, 'r') as f:
        architecture_args = json.load(f)

    model_class = getattr(methods, args.model_class)
    model = model_class(input_shape=train_loader.dataset[0][0].shape,
                        architecture_args=architecture_args,
                        l2_reg_coef=args.l2_reg_coef,
                        seed=args.seed,
                        device=args.device)

    # put the model in always eval mode. This makes sure that in case the network has pretrained BatchNorm
    # layers, their running average is fixed.
    utils.put_always_eval_mode(model)

    if args.linearized:
        print("Using a linearized model")
        model = LinearizedModelV2(model=model,
                                  train_data=train_data,
                                  val_data=val_data,
                                  l2_reg_coef=args.l2_reg_coef)

    if args.dataset == 'synthetic':
        # no visualization is needed
        model.visualize = (lambda *_args, **_kwargs: {})

    accuracy_metric = metrics.Accuracy(output_key='pred')
    metrics_list = [accuracy_metric]

    training.train(model=model,
                   train_loader=train_loader,
                   val_loader=val_loader,
                   epochs=args.epochs + 1,
                   save_iter=args.save_iter,
                   vis_iter=args.vis_iter,
                   optimization_args=optimization_args,
                   log_dir=args.log_dir,
                   args_to_log=args,
                   metrics=metrics_list,
                   device_ids=args.all_device_ids,
                   num_accumulation_steps=args.num_accumulation_steps)

    # Final weights, moved to the CPU so they can be pickled device-independently.
    params = {
        param_name: utils.to_cpu(param)
        for param_name, param in model.named_parameters()
    }

    val_preds = utils.apply_on_dataset(model=model,
                                       dataset=val_data,
                                       cpu=True,
                                       partition='val',
                                       batch_size=args.batch_size)['pred']
    val_acc = accuracy_metric.value(epoch=args.epochs, partition='val')

    # The full-data run and each leave-one-out run get their own file.
    exp_dir = os.path.join(args.output_dir, args.exp_name)
    file_name = f'{args.exclude_index}.pkl' if leave_one_out else 'full-data-training.pkl'
    file_path = os.path.join(exp_dir, file_name)
    utils.make_path(os.path.dirname(file_path))

    results = {
        'weights': params,
        'val_preds': val_preds,
        'val_acc': val_acc,
        'args': args
    }
    with open(file_path, 'wb') as f:
        pickle.dump(results, f)
def main():
    """Train a classifier, then test the best-validation checkpoint.

    After training, the checkpoint ``checkpoints/best_val.mdl`` inside
    ``args.log_dir`` is loaded and evaluated on the test set; predictions and
    accuracy are written to ``test_predictions.pkl`` and ``test_accuracy.txt``
    in the same directory.
    """
    dataset_choices = [
        'mnist', 'uniform-noise-mnist', 'cifar10', 'uniform-noise-cifar10',
        'pair-noise-cifar10', 'cifar100', 'uniform-noise-cifar100',
        'clothing1m', 'imagenet'
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', '-c', type=str, required=True)
    parser.add_argument('--device', '-d', default='cuda')
    parser.add_argument(
        '--all_device_ids', nargs='+', type=str, default=None,
        help="If not None, this list specifies devices for multiple GPU training. "
             "The first device should match with the main device (args.device).")
    parser.add_argument('--batch_size', '-b', type=int, default=256)
    parser.add_argument('--epochs', '-e', type=int, default=400)
    parser.add_argument('--stopping_param', type=int, default=50)
    parser.add_argument('--save_iter', '-s', type=int, default=10)
    parser.add_argument('--vis_iter', '-v', type=int, default=10)
    # NOTE(review): the testing code below joins paths onto args.log_dir, so the
    # default of None only works if training.train fills it in -- confirm.
    parser.add_argument('--log_dir', '-l', type=str, default=None)
    parser.add_argument('--seed', type=int, default=42)

    parser.add_argument('--dataset', '-D', type=str, default='mnist',
                        choices=dataset_choices)
    parser.add_argument('--data_augmentation', '-A',
                        dest='data_augmentation', action='store_true')
    parser.set_defaults(data_augmentation=False)
    parser.add_argument('--num_train_examples', type=int, default=None)
    parser.add_argument('--error_prob', '-n', type=float, default=0.0)
    parser.add_argument('--clean_validation',
                        dest='clean_validation', action='store_true')
    parser.set_defaults(clean_validation=False)

    parser.add_argument('--model_class', '-m', type=str,
                        default='StandardClassifier')
    parser.add_argument(
        '--loss_function', type=str, default='ce',
        choices=['ce', 'mse', 'mae', 'gce', 'dmi', 'fw', 'none'])
    parser.add_argument('--loss_function_param', type=float, default=1.0)
    parser.add_argument('--load_from', type=str, default=None)
    parser.add_argument('--grad_weight_decay', '-L', type=float, default=0.0)
    parser.add_argument('--grad_l1_penalty', '-S', type=float, default=0.0)
    parser.add_argument('--lamb', type=float, default=1.0)
    parser.add_argument('--pretrained_arg', '-r', type=str, default=None)
    parser.add_argument('--sample_from_q',
                        dest='sample_from_q', action='store_true')
    parser.set_defaults(sample_from_q=False)
    parser.add_argument('--q_dist', type=str, default='Gaussian',
                        choices=['Gaussian', 'Laplace', 'dot', 'ce'])
    parser.add_argument('--no-detach', dest='detach', action='store_false')
    parser.set_defaults(detach=True)
    parser.add_argument('--warm_up', type=int, default=0,
                        help='Number of epochs to skip before '
                             'starting to train using predicted gradients')
    parser.add_argument('--weight_decay', type=float, default=0.0)

    parser.add_argument(
        '--add_noise', dest='add_noise', action='store_true',
        help='add noise to the gradients of a standard classifier.')
    parser.set_defaults(add_noise=False)
    parser.add_argument('--noise_type', type=str, default='Gaussian',
                        choices=['Gaussian', 'Laplace'])
    parser.add_argument('--noise_std', type=float, default=0.0)

    parser.add_argument('--lr', type=float, default=1e-3, help='Learning rate')
    args = parser.parse_args()
    print(args)

    # Load data
    train_loader, val_loader, test_loader, _ = load_data_from_arguments(args)

    # Options: Adam with the requested learning rate and weight decay.
    optimization_args = {
        'optimizer': {
            'name': 'adam',
            'lr': args.lr,
            'weight_decay': args.weight_decay
        }
    }

    with open(args.config, 'r') as f:
        architecture_args = json.load(f)

    model_kwargs = dict(
        input_shape=train_loader.dataset[0][0].shape,
        architecture_args=architecture_args,
        pretrained_arg=args.pretrained_arg,
        device=args.device,
        grad_weight_decay=args.grad_weight_decay,
        grad_l1_penalty=args.grad_l1_penalty,
        lamb=args.lamb,
        sample_from_q=args.sample_from_q,
        q_dist=args.q_dist,
        load_from=args.load_from,
        loss_function=args.loss_function,
        loss_function_param=args.loss_function_param,
        add_noise=args.add_noise,
        noise_type=args.noise_type,
        noise_std=args.noise_std,
        detach=args.detach,
        warm_up=args.warm_up,
    )
    model_class = getattr(methods, args.model_class)
    model = model_class(**model_kwargs)

    accuracy_metric = metrics.Accuracy(output_key='pred')
    metrics_list = [accuracy_metric]
    if args.dataset == 'imagenet':
        metrics_list.append(metrics.TopKAccuracy(k=5, output_key='pred'))

    # Both callbacks track validation accuracy (higher is better).
    callbacks_list = [
        callbacks.SaveBestWithMetric(metric=accuracy_metric,
                                     partition='val',
                                     direction='max')
    ]
    stopper = callbacks.EarlyStoppingWithMetric(
        metric=accuracy_metric,
        stopping_param=args.stopping_param,
        partition='val',
        direction='max')

    training.train(model=model,
                   train_loader=train_loader,
                   val_loader=val_loader,
                   epochs=args.epochs,
                   save_iter=args.save_iter,
                   vis_iter=args.vis_iter,
                   optimization_args=optimization_args,
                   log_dir=args.log_dir,
                   args_to_log=args,
                   stopper=stopper,
                   metrics=metrics_list,
                   callbacks=callbacks_list,
                   device_ids=args.all_device_ids)

    # if training finishes successfully, compute the test score
    print("Testing the best validation model...")
    checkpoint_path = os.path.join(args.log_dir, 'checkpoints', 'best_val.mdl')
    model = utils.load(checkpoint_path, methods=methods, device=args.device)
    pred = utils.apply_on_dataset(model,
                                  test_loader.dataset,
                                  batch_size=args.batch_size,
                                  output_keys_regexp='pred',
                                  description='Testing')['pred']

    # The label is the second element of every dataset item.
    labels = torch.tensor([item[1] for item in test_loader.dataset],
                          dtype=torch.long)
    labels = utils.to_cpu(labels)

    with open(os.path.join(args.log_dir, 'test_predictions.pkl'), 'wb') as f:
        pickle.dump({'pred': pred, 'labels': labels}, f)

    is_correct = (pred.argmax(dim=1) == labels).float()
    accuracy = torch.mean(is_correct)
    with open(os.path.join(args.log_dir, 'test_accuracy.txt'), 'w') as f:
        f.write("{}\n".format(accuracy))
def main():
    """Train a ResNet18-k style model and test its best and final checkpoints.

    The width parameter ``--k`` is written into the ``classifier`` and
    ``q-network`` sections of the architecture config whenever the respective
    section's ``net`` name contains ``'double-descent'``. After training, the
    checkpoints ``best_val_accuracy.mdl`` and ``final.mdl`` under
    ``<log_dir>/checkpoints`` are evaluated on the test set, and predictions
    and accuracy are stored per checkpoint in ``log_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', '-c', type=str, required=True)
    parser.add_argument('--device', '-d', default='cuda')
    parser.add_argument(
        '--all_device_ids', nargs='+', type=str, default=None,
        help="If not None, this list specifies devices for multiple GPU training. "
             "The first device should match with the main device (args.device).")
    parser.add_argument('--batch_size', '-b', type=int, default=128)
    parser.add_argument('--epochs', '-e', type=int, default=4000)
    parser.add_argument('--stopping_param', type=int, default=2**30)
    parser.add_argument('--save_iter', '-s', type=int, default=100)
    parser.add_argument('--vis_iter', '-v', type=int, default=10)
    # NOTE(review): the testing code below joins paths onto args.log_dir, so the
    # default of None only works if training.train fills it in -- confirm.
    parser.add_argument('--log_dir', '-l', type=str, default=None)
    parser.add_argument('--seed', type=int, default=42)

    parser.add_argument('--dataset', '-D', type=str,
                        default='uniform-noise-cifar10',
                        choices=['uniform-noise-cifar10'])
    parser.add_argument('--data_augmentation', '-A', action='store_true',
                        dest='data_augmentation')
    parser.set_defaults(data_augmentation=False)
    parser.add_argument('--num_train_examples', type=int, default=None)
    parser.add_argument('--error_prob', '-n', type=float, default=0.0)
    parser.add_argument('--clean_validation', dest='clean_validation',
                        action='store_true')
    parser.set_defaults(clean_validation=False)

    parser.add_argument('--model_class', '-m', type=str,
                        default='StandardClassifier')
    parser.add_argument('--load_from', type=str, default=None)
    parser.add_argument('--grad_weight_decay', '-L', type=float, default=0.0)
    parser.add_argument('--lamb', type=float, default=1.0)
    parser.add_argument('--pretrained_arg', '-r', type=str, default=None)
    parser.add_argument('--sample_from_q', action='store_true',
                        dest='sample_from_q')
    parser.set_defaults(sample_from_q=False)
    parser.add_argument('--q_dist', type=str, default='Gaussian',
                        choices=['Gaussian', 'Laplace', 'dot'])
    parser.add_argument('--weight_decay', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate')

    parser.add_argument('--k', '-k', type=int, required=False, default=10,
                        help='width parameter of ResNet18-k')
    parser.add_argument('--exclude_percent', type=float,
                        default=0.0)  # TODO: make this argument work
    args = parser.parse_args()
    print(args)

    # Load data
    train_loader, val_loader, test_loader, _ = load_data_from_arguments(args)

    # Options
    optimization_args = {
        'optimizer': {
            'name': 'adam',
            'lr': args.lr,
            'weight_decay': args.weight_decay
        }
    }

    with open(args.config, 'r') as f:
        architecture_args = json.load(f)

    # Set the width parameter k. Each section is checked against its OWN 'net'
    # entry; previously the q-network branch inspected the classifier section,
    # which tested the wrong net and raised KeyError for configs without a
    # 'classifier' section.
    for section in ('classifier', 'q-network'):
        if (section in architecture_args and
                'double-descent' in architecture_args[section].get('net', '')):
            architecture_args[section]['k'] = args.k

    model_class = getattr(methods, args.model_class)
    model = model_class(input_shape=train_loader.dataset[0][0].shape,
                        architecture_args=architecture_args,
                        pretrained_arg=args.pretrained_arg,
                        device=args.device,
                        grad_weight_decay=args.grad_weight_decay,
                        lamb=args.lamb,
                        sample_from_q=args.sample_from_q,
                        q_dist=args.q_dist,
                        load_from=args.load_from,
                        loss_function='ce')

    metrics_list = [metrics.Accuracy(output_key='pred')]
    # Only reachable if 'imagenet' is added to the --dataset choices above.
    if args.dataset == 'imagenet':
        metrics_list.append(metrics.TopKAccuracy(k=5, output_key='pred'))

    callbacks_list = [
        callbacks.SaveBestWithMetric(metric=metrics_list[0],
                                     partition='val',
                                     direction='max')
    ]

    stopper = callbacks.EarlyStoppingWithMetric(
        metric=metrics_list[0],
        stopping_param=args.stopping_param,
        partition='val',
        direction='max')

    training.train(model=model,
                   train_loader=train_loader,
                   val_loader=val_loader,
                   epochs=args.epochs,
                   save_iter=args.save_iter,
                   vis_iter=args.vis_iter,
                   optimization_args=optimization_args,
                   log_dir=args.log_dir,
                   args_to_log=args,
                   stopper=stopper,
                   metrics=metrics_list,
                   callbacks=callbacks_list,
                   device_ids=args.all_device_ids)

    # test the last model and best model
    models_to_test = [{
        'name': 'best',
        'file': 'best_val_accuracy.mdl'
    }, {
        'name': 'final',
        'file': 'final.mdl'
    }]

    # The test labels do not depend on the checkpoint, so collect them once
    # instead of iterating over the whole dataset for every tested model.
    labels = torch.tensor([item[1] for item in test_loader.dataset],
                          dtype=torch.long)
    labels = utils.to_cpu(labels)

    for spec in models_to_test:
        print("Testing the {} model...".format(spec['name']))
        model = utils.load(os.path.join(args.log_dir, 'checkpoints',
                                        spec['file']),
                           methods=methods,
                           device=args.device)
        pred = utils.apply_on_dataset(model,
                                      test_loader.dataset,
                                      batch_size=args.batch_size,
                                      output_keys_regexp='pred',
                                      description='Testing')['pred']

        predictions_path = os.path.join(
            args.log_dir, '{}_test_predictions.pkl'.format(spec['name']))
        with open(predictions_path, 'wb') as f:
            pickle.dump({'pred': pred, 'labels': labels}, f)

        accuracy = torch.mean((pred.argmax(dim=1) == labels).float())
        accuracy_path = os.path.join(
            args.log_dir, '{}_test_accuracy.txt'.format(spec['name']))
        with open(accuracy_path, 'w') as f:
            f.write("{}\n".format(accuracy))