def create_toy_model():
    """
    Create model from https://www.wolframcloud.com/obj/yaroslavvb/newton/linear-jacobians-and-hessians.nb

    PyTorch works on the transposed representation, hence to obtain Y from the notebook, do model(A.T).T
    """
    model: u.SimpleMLP = u.SimpleMLP([2, 2, 2], bias=False)
    autograd_lib.register(model)

    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    model.layers[0].weight.data.copy_(X)
    model.layers[1].weight.data.copy_(B.t())
    return A, model
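
# Sanity-check sketch (hypothetical helper, not part of the original tests): with
# purely linear layers and the weights set above, the transposition rule in the
# docstring amounts to model(A.T).T == B.T @ X @ A. This assumes SimpleMLP builds
# plain linear layers (no nonlinearity) when nonlin is not requested.
def _check_toy_model_transposition():
    A, model = create_toy_model()
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]])
    # PyTorch linear layers compute x @ W.T, so feeding columns of A as rows
    # and transposing back recovers the column-major (notebook) product.
    with torch.no_grad():
        Y = model(A.t()).t()
    u.check_close(Y, B.t() @ X @ A)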

def test_hessian_diag_sqr():
    """Like above, but using LeastSquares loss"""

    data_width = 3
    batch_size = 2
    d = [data_width**2, 6, 10]
    train_steps = 2

    model: u.SimpleMLP = u.SimpleMLP(d, nonlin=True, bias=True)
    autograd_lib.register(model)

    dataset = u.TinyMNIST(dataset_size=batch_size * train_steps, data_width=data_width)
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
    train_iter = iter(trainloader)

    loss_fn = u.least_squares   # LeastSquares loss instead of CrossEntropy

    for train_step in range(train_steps):
        data, targets = next(train_iter)

        hess = defaultdict(float)
        hess_diag = defaultdict(float)
        activations = {}

        def save_activations(layer, a, _):
            activations[layer] = a

        with autograd_lib.module_hook(save_activations):
            output = model(data)
            loss = loss_fn(output, torch.zeros_like(output))

        def compute_hess(layer, _, B):
            A = activations[layer]
            BA = torch.einsum("nl,ni->nli", B, A)
            hess[layer] += torch.einsum('nli,nkj->likj', BA, BA)
            hess_diag[layer] += torch.einsum("ni,nj->ij", B * B, A * A)

        with autograd_lib.module_hook(compute_hess):
            autograd_lib.backward_hessian(output, loss='LeastSquares', retain_graph=True)

        # compute Hessian through autograd and compare
        for layer in model.layers:
            H_autograd = u.hessian(loss, layer.weight)
            u.check_close(hess[layer] / batch_size, H_autograd)
            diag_autograd = torch.einsum('lili->li', H_autograd)
            u.check_close(diag_autograd, hess_diag[layer] / batch_size)
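
# Sketch (hypothetical helper, illustration only): the diagonal accumulator above,
# einsum("ni,nj->ij", B*B, A*A), is exactly the 'lili' diagonal of the full factored
# Hessian einsum('nli,nkj->likj', BA, BA). A quick check on random tensors:
def _check_hess_diag_formula(n=3, l=4, i=5):
    B = torch.randn(n, l)
    A = torch.randn(n, i)
    BA = torch.einsum("nl,ni->nli", B, A)
    full = torch.einsum('nli,nkj->likj', BA, BA)      # (l, i, l, i) full block
    diag = torch.einsum("ni,nj->ij", B * B, A * A)    # (l, i) diagonal shortcut
    u.check_close(torch.einsum('lili->li', full), diag)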

def test_hessian_kfac():
    model: u.SimpleMLP = u.SimpleMLP([2, 2], nonlin=True, bias=True)
    model.layers[0].weight.data.copy_(torch.eye(2))
    autograd_lib.register(model)
    loss_fn = torch.nn.CrossEntropyLoss()

    data = u.to_logits(torch.tensor([[0.7, 0.3]]))
    targets = torch.tensor([0])
    data = data.repeat([3, 1])
    targets = targets.repeat([3])
    n = len(data)

    activations = {}
    hessians = defaultdict(lambda: AttrDefault(float))

    for i in range(n):
        def save_activations(layer, A, _):
            activations[layer] = A
            hessians[layer].AA += torch.einsum("ni,nj->ij", A, A)

        with autograd_lib.module_hook(save_activations):
            data_batch = data[i: i + 1]
            targets_batch = targets[i: i + 1]
            Y = model(data_batch)
            loss = loss_fn(Y, targets_batch)

        def compute_hess(layer, _, B):
            hessians[layer].BB += torch.einsum("ni,nj->ij", B, B)

        with autograd_lib.module_hook(compute_hess):
            autograd_lib.backward_hessian(Y, loss='CrossEntropy', retain_graph=True)

    # check diagonal entries against autograd
    hess_autograd = u.hessian(loss, model.layers[0].weight)
    hess0_factored = hessians[model.layers[0]]

    diag_autograd = torch.einsum('lili->li', hess_autograd)
    diag_kfac = torch.einsum('ll,ii->li', hess0_factored.BB / n, hess0_factored.AA / n)
    u.check_close(diag_autograd, diag_kfac)

    # check all entries against autograd
    hess0 = torch.einsum('kl,ij->kilj', hess0_factored.BB / n, hess0_factored.AA / n)
    u.check_close(hess_autograd, hess0)
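
# Sketch of why the KFAC check above is exact (hypothetical helper, illustration
# only): the test repeats one example n times, and for identical samples
# kron(E[B B^T], E[A A^T]) equals E[kron(B B^T, A A^T)]; with distinct samples the
# Kronecker factorization is only an approximation.
def _kfac_exact_for_repeated_sample(n=5, d=3):
    a = torch.randn(d)    # one activation row, repeated n times
    b = torch.randn(d)    # one backprop row, repeated n times
    A = a.repeat(n, 1)
    B = b.repeat(n, 1)
    AA = torch.einsum("ni,nj->ij", A, A)
    BB = torch.einsum("ni,nj->ij", B, B)
    kfac = torch.kron(BB / n, AA / n)
    exact = sum(torch.kron(torch.outer(B[i], B[i]), torch.outer(A[i], A[i]))
                for i in range(n)) / n
    u.check_close(kfac, exact)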

def test_hessian_full():
    data_width = 3
    batch_size = 2
    d = [data_width**2, 10]
    o = d[-1]
    n = batch_size
    train_steps = 1

    model = u.SimpleMLP(d, nonlin=False, bias=True)
    autograd_lib.register(model)

    dataset = u.TinyMNIST(dataset_size=batch_size, data_width=data_width)
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
    train_iter = iter(trainloader)
    loss_fn = torch.nn.CrossEntropyLoss()

    hess = defaultdict(float)
    for train_step in range(train_steps):
        data, targets = next(train_iter)

        activations = {}

        def save_activations(layer, a, _):
            activations[layer] = a

        with autograd_lib.module_hook(save_activations):
            output = model(data)
            loss = loss_fn(output, targets)

        def compute_hess(layer, _, B):
            A = activations[layer]
            BA = torch.einsum("nl,ni->nli", B, A)
            hess[layer] += torch.einsum('nli,nkj->likj', BA, BA)

        with autograd_lib.module_hook(compute_hess):
            autograd_lib.backward_hessian(output, loss='CrossEntropy', retain_graph=True)

    # compute Hessian through autograd
    H_autograd = u.hessian(loss, model.layers[0].weight)
    u.check_close(hess[model.layers[0]] / n, H_autograd)
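
# Sketch (hypothetical helper): the 4-index einsum 'nli,nkj->likj' used above is a
# per-sample outer product of the rank-1 blocks BA_n with themselves, summed over
# the batch; flattening each BA_n and taking the Gram matrix gives the same tensor.
def _check_full_hessian_einsum(n=2, l=3, i=4):
    B = torch.randn(n, l)
    A = torch.randn(n, i)
    BA = torch.einsum("nl,ni->nli", B, A)
    H1 = torch.einsum('nli,nkj->likj', BA, BA)
    flat = BA.reshape(n, l * i)
    H2 = torch.einsum('np,nq->pq', flat, flat).reshape(l, i, l, i)
    u.check_close(H1, H2)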