def line_search(directionv, start, end, steps=10):
    """Evaluate the loss at evenly spaced offsets along a search direction.

    For each of the ``steps + 1`` offsets ``start, start + delta, ..., end``
    (``delta = (end - start) / steps``) the parameter is set to
    ``param0 + offset * directionv`` and the loss on the saved stats batch is
    recorded.  The original parameter value is restored before returning,
    even when an evaluation raises.

    Args:
        directionv: search direction; it is added to the transposed
            ``u.vec`` form of the parameter, so it must broadcast against it.
        start: offset of the first evaluation along ``directionv``.
        end: offset of the last evaluation along ``directionv``.
        steps: number of intervals between ``start`` and ``end``.  With
            ``steps=0`` only the ``start`` offset is evaluated.

    Returns:
        List of ``steps + 1`` values returned by ``compute_loss``, ordered
        from ``start`` to ``end``.
    """
    param0 = param.data.clone()
    param0v = u.vec(param0).t()
    # Guard against division by zero when a single evaluation is requested.
    delta = (end - start) / steps if steps else 0
    losses = []
    try:
        for i in range(steps + 1):
            offset = start + i * delta
            param1v = param0v + offset * directionv
            param1 = u.unvec(param1v.t(), param.data.shape[0])
            param.data.copy_(param1)
            # Evaluate AFTER moving, so that entry i corresponds to offset i.
            output = model(stats_data)  # use last saved data batch
            losses.append(compute_loss(output, stats_targets))
    finally:
        # Always put the original weights back, even on failure.
        param.data.copy_(param0)
    return losses
def relu_gradient_test():
    """Compare hand-written ReLU-network gradient descent to saved losses.

    Builds the forward pass, the backprop matrices and the weight update as
    explicit TensorFlow graph ops, runs 10 update steps and checks the loss
    observed before each step against
    ``data/rotations_relu_gradient_losses.csv``.
    """
    tf.reset_default_graph()

    # Fixtures: inputs, targets, flattened initial weights and layer sizes.
    X0 = np.genfromtxt('data/rotations_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/rotations_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/rotations_W0f.csv', delimiter=","))
    assert W0f.shape == (8, 1)
    sizes = np.genfromtxt('data/rotations_relu_fs.csv',
                          delimiter=",").astype(np.int32)
    depth = len(sizes) - 2  # number of layers
    u.check_equal(sizes, [4, 2, 2, 2])

    # sizes[0] is the size of the data dimension (columns of X0).
    dsize = X0.shape[1]
    assert sizes[0] == dsize

    # NumPy copy of the per-layer matrices, used only for shape checks.
    # The flattened weights do not include the first "layer" (data matrix).
    numpy_layers = u.unflatten_np(W0f, sizes[1:])
    numpy_layers.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # W[0] is the data matrix, W[1:] are slices of the flat variable.
    W = u.unflatten(Wf, sizes[1:])
    X = tf.constant(X0, name="X0")
    Y = tf.constant(Y0, name="Y0")
    W.insert(0, X)
    for np_layer, tf_layer in zip(numpy_layers, W):
        u.check_equal(np_layer.shape, u.fix_shape(tf_layer.shape))

    # Activations: A[0] is identity, A[1] == X, later ones go through relu.
    A = [u.Identity(dsize), X]
    for i in range(1, depth + 1):
        pre_activation = tf.matmul(W[i], A[i], name="A" + str(i + 1))
        A.append(tf.nn.relu(pre_activation))

    assert W[0].get_shape() == X0.shape
    assert A[depth + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[depth + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype)

    # Backprop matrices, filled from the last layer down to the first.
    B = [None] * (depth + 1)
    scaled_err = -err / dsize
    B[depth] = scaled_err * u.relu_mask(A[depth + 1])
    for i in reversed(range(depth)):
        backprop = t(W[i + 1]) @ B[i + 1]
        if i > 0:  # there's no relu on first matrix
            backprop = backprop * u.relu_mask(A[i + 1])
        B[i] = backprop

    # Per-layer gradients; the entry for W[0] (the data) is discarded.
    all_grads = [
        tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
        for i in range(depth + 1)
    ]
    layer_grads = all_grads[1:]
    dWf = tf.concat([u.vec(grad) for grad in layer_grads], axis=0)

    # Two-phase update: compute into the copy, then copy back into Wf.
    Wf_new = Wf - lr * dWf
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    expected_losses = np.loadtxt("data/rotations_relu_gradient_losses.csv",
                                 delimiter=",")
    # From accompanying notebook
    #  {0.407751, 0.0683822, 0.0138657, 0.0039221, 0.00203637, 0.00164892,
    #   0.00156137, 0.00153857, 0.00153051, 0.00152593}
    observed_losses = []
    for _ in range(10):
        observed_losses.append(sess.run([loss])[0])
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
def simple_gradient_test():
    """Compare hand-written linear-network gradient descent to saved losses.

    Builds the forward pass, the backprop matrices and the weight update as
    explicit TensorFlow graph ops, runs 20 update steps and checks the loss
    observed before each step against
    ``data/rotations_simple_gradient_losses.csv``.
    """
    tf.reset_default_graph()

    # Fixtures: inputs, targets, flattened initial weights and layer sizes.
    X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                               delimiter=","))
    assert W0f.shape == (8, 1)
    sizes = np.genfromtxt('data/rotations_simple_fs.csv',
                          delimiter=",").astype(np.int32)
    depth = len(sizes) - 2  # number of layers
    u.check_equal(sizes, [10, 2, 2, 2])

    # sizes[0] is the size of the data dimension (columns of X0).
    dsize = X0.shape[1]
    assert sizes[0] == dsize

    # NumPy copy of the per-layer matrices, used only for shape checks.
    # The flattened weights do not include the first "layer" (data matrix).
    numpy_layers = u.unflatten_np(W0f, sizes[1:])
    numpy_layers.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # W[0] is the data matrix, W[1:] are slices of the flat variable.
    W = u.unflatten(Wf, sizes[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for np_layer, tf_layer in zip(numpy_layers, W):
        u.check_equal(np_layer.shape, u.fix_shape(tf_layer.shape))

    # Activations: A[0] is identity, so A[1] == X.
    A = [u.Identity(dsize)]
    for i in range(depth + 1):
        A.append(tf.matmul(W[i], A[i], name="A" + str(i + 1)))

    assert W[0].get_shape() == X0.shape
    assert A[depth + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[depth + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(1.0, dtype=dtype)

    # Backprop matrices, filled from the last layer down to the first.
    B = [None] * (depth + 1)
    B[depth] = -err / dsize
    for i in reversed(range(depth)):
        B[i] = t(W[i + 1]) @ B[i + 1]

    # Per-layer gradients; the entry for W[0] (the data) is discarded.
    all_grads = [
        tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
        for i in range(depth + 1)
    ]
    layer_grads = all_grads[1:]
    dWf = tf.concat([u.vec(grad) for grad in layer_grads], axis=0)

    # Two-phase update: compute into the copy, then copy back into Wf.
    Wf_new = Wf - lr * dWf
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    expected_losses = np.loadtxt("data/rotations_simple_gradient_losses.csv",
                                 delimiter=",")
    # from accompanying notebook
    # {0.0111498, 0.00694816, 0.00429464, 0.00248228, 0.00159361,
    #  0.000957424, 0.000651653, 0.000423802, 0.000306749, 0.00021772,
    observed_losses = []
    for _ in range(20):
        observed_losses.append(sess.run([loss])[0])
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
# NOTE(review): these statements are the tail of a function whose header is
# not part of this chunk; Y, A, W, n, dsize, dtype and newton are defined there.

# Squared-error loss between targets and the last activation.
err = Y - A[n+1]
loss = tf.reduce_sum(tf.square(err))/(2*dsize)

# Create B's
# B[n] starts from the scaled error; earlier entries are obtained by
# multiplying with the transposed next-layer matrix.
B = [0]*(n+1)
B[n] = -err/dsize
for i in range(n-1, -1, -1):
  B[i] = t(W[i+1]) @ B[i+1]

# create dW's
dW = [0]*(n+1)
for i in range(n+1):
  dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
del dW[0]  # get rid of W[0] update

# Flattened gradient: per-layer gradients vectorized and stacked along axis 0.
dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)

# Learning rate variable initialized from a scalar placeholder.
lr_holder = tf.placeholder(dtype=dtype, shape=())
lr = tf.Variable(lr_holder, dtype=dtype)

# run tests
do_run_iters = 5
result = newton(1.0)
expected_result = [8.9023744225439743e-05, 0.060120791316053412, 0.0059295249954177918, 1.9856240803246437e-05, 2.7125563957575423e-10]
u.check_equal(result, expected_result)

# 720 ms per step
# result = newton(1.0)
# np.savetxt("data/newton.csv", result, delimiter=',')
# sys.exit()
# natural_empirical(0.000000002)
def rotations2_natural_sampled_kfac(num_samples=1):
    """Run 20 preconditioned gradient steps using a sampled K-FAC inverse.

    The preconditioner is block diagonal.  Each diagonal block is the
    Kronecker product of the pseudo-inverses of an activation covariance and
    a backprop covariance, where the backprops come from random normal
    output gradients (``num_samples`` replicas of the dataset).  The loss
    before every step is printed; timing and graph summaries are printed at
    the end.

    Args:
        num_samples: number of times the dataset is replicated for the
            sampled ("2"-suffixed) forward and backward passes.
    """
    tf.reset_default_graph()
    np.random.seed(0)
    tf.set_random_seed(0)

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv',
                               delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    # NOTE(review): this variable is rebound further down by a zero-initialized
    # one with the same requested name; kept so the graph is unchanged.
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A2 = [0] * (n + 2)  # augmented forward props for natural gradient
    A[0] = u.Identity(dsize)
    A2[0] = u.Identity(dsize * num_samples)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))
        if i == 0:
            # replicate dataset multiple times corresponding to number of
            # samples
            A2[i + 1] = tf.concat([W[0]] * num_samples, axis=1)
        else:
            A2[i + 1] = tf.matmul(W[i], A2[i], name="A2" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match.  The tuple must be parenthesized: without the
    # parentheses everything after the comma is the assert *message* and the
    # comparison is never evaluated.
    assert (W[-1].get_shape().as_list()[0],
            W[0].get_shape().as_list()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)

    # lower learning rate by 10x
    lr = tf.Variable(0.01, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B2 = [0] * (n + 1)
    B[n] = -err / dsize
    B2[n] = tf.random_normal((f(n), dsize * num_samples), 0, 1, seed=0,
                             dtype=dtype)
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1],
                         name="B" + str(i))
        B2[i] = tf.matmul(tf.transpose(W[i + 1]), B2[i + 1],
                          name="B2" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race
    dW = [0] * (n + 1)
    dW2 = [0] * (n + 1)
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape,
                           name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)

        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
        dW2[i] = tf.matmul(B2[i], tf.transpose(A2[i]),
                           name="dW2" + str(i))

    del dW[0]  # get rid of W[0] update
    del dW2[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # todo: divide both activations and backprops by size for cov calc

    # Kronecker factored covariance blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            if i == j:
                acov = A2[i] @ t(A2[j]) / (dsize * num_samples)
                bcov = B2[i] @ t(B2[j]) / (dsize * num_samples)
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                # off-diagonal blocks are zero
                term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)),
                                dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ifisher = u.concat_blocks(iblocks)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype, shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for _ in range(20):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
def rotations2_natural_empirical():
    """Run 10 gradient steps preconditioned by the inverse empirical Fisher.

    The Fisher matrix is formed from the Khatri-Rao products of the
    activations and backprops of each layer, divided by ``dsize``, and
    pseudo-inverted.  The loss before every step is printed; timing and
    graph summaries are printed at the end.
    """
    tf.reset_default_graph()

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv',
                               delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    # NOTE(review): this variable is rebound further down by a zero-initialized
    # one with the same requested name; kept so the graph is unchanged.
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match.  The tuple must be parenthesized: without the
    # parentheses everything after the comma is the assert *message* and the
    # comparison is never evaluated.
    assert (W[-1].get_shape().as_list()[0],
            W[0].get_shape().as_list()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.000001, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1],
                         name="B" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race
    dW = [0] * (n + 1)
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape,
                           name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)

        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))

    del dW[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # inverse fisher preconditioner
    grads = tf.concat([u.khatri_rao(A[i], B[i]) for i in range(1, n + 1)],
                      axis=0)
    fisher = grads @ tf.transpose(grads) / dsize
    ifisher = u.pseudo_inverse(fisher)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype, shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for _ in range(10):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
def rotations2_newton_kfac():
    """Run 10 Newton-like steps using a Kronecker-factored inverse Hessian.

    The preconditioner is block diagonal.  Each diagonal block is the
    Kronecker product of the pseudo-inverses of an activation covariance and
    a covariance of the "Newton-modified" backprops, which start from an
    identity matrix instead of the error.  The loss before every step is
    printed; timing and graph summaries are printed at the end.
    """
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv',
                               delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # reuse Hess tensor calculation in order to get off-diag block
            # sizes
            dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                acov = A[i] @ t(A[j])
                bcov = (Bn[i] @ t(Bn[j])) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ihess = u.concat_blocks(iblocks)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # Two-phase update: compute into the copy, then copy back into Wf.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for _ in range(10):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
def rotations1_gradient_test():
    """Compare hand-written gradient descent on large_rotations1 to saved losses.

    Builds the forward pass, the backprop matrices and the weight update as
    explicit TensorFlow graph ops, runs 10 update steps with the learning
    rate stored in ``data/large_rotations1_gradient_lr.csv`` and checks the
    loss observed before each step against
    ``data/large_rotations1_gradient_losses.csv``.
    """
    # https://www.wolframcloud.com/objects/ff6ecaf0-fccd-44e3-b26f-970d8fc2a57c
    tf.reset_default_graph()

    # Fixtures: inputs, targets, flattened initial weights and layer sizes.
    X0 = np.genfromtxt('data/large_rotations1_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations1_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations1_W0f.csv',
                               delimiter=","))
    sizes = np.genfromtxt('data/large_rotations1_fs.csv',
                          delimiter=",").astype(np.int32)
    depth = len(sizes) - 2  # number of layers

    # sizes[0] is the size of the data dimension (columns of X0).
    dsize = X0.shape[1]
    assert sizes[0] == dsize

    # NumPy copy of the per-layer matrices, used only for shape checks.
    # The flattened weights do not include the first "layer" (data matrix).
    numpy_layers = u.unflatten_np(W0f, sizes[1:])
    numpy_layers.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # W[0] is the data matrix, W[1:] are slices of the flat variable.
    W = u.unflatten(Wf, sizes[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for np_layer, tf_layer in zip(numpy_layers, W):
        u.check_equal(np_layer.shape, u.fix_shape(tf_layer.shape))

    # Activations: A[0] is identity, so A[1] == X.
    A = [u.Identity(dsize)]
    for i in range(depth + 1):
        A.append(tf.matmul(W[i], A[i], name="A" + str(i + 1)))

    assert W[0].get_shape() == X0.shape
    assert A[depth + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[depth + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr0 = np.genfromtxt('data/large_rotations1_gradient_lr.csv')
    lr = tf.Variable(lr0, dtype=dtype)

    # Backprop matrices, filled from the last layer down to the first.
    B = [None] * (depth + 1)
    B[depth] = -err / dsize
    for i in reversed(range(depth)):
        B[i] = t(W[i + 1]) @ B[i + 1]

    # Per-layer gradients; the entry for W[0] (the data) is discarded.
    all_grads = [
        tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
        for i in range(depth + 1)
    ]
    layer_grads = all_grads[1:]
    dWf = tf.concat([u.vec(grad) for grad in layer_grads], axis=0)

    # Two-phase update: compute into the copy, then copy back into Wf.
    Wf_new = Wf - lr * dWf
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    expected_losses = np.loadtxt("data/large_rotations1_gradient_losses.csv",
                                 delimiter=",")
    # from accompanying notebook
    # {0.102522, 0.028124, 0.00907214, 0.00418929, 0.00293379,
    observed_losses = []
    for _ in range(10):
        observed_losses.append(sess.run([loss])[0])
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
def test_explicit_hessian():
    """Check computation of hessian of loss(B'WA).

    Derivation:
    https://github.com/yaroslavvb/kfac_pytorch/blob/master/derivation.pdf

    The Hessian and Newton step of ``loss = sum(Y * Y) / 2`` with
    ``Y = B' X A`` are computed three ways -- PyTorch autograd, an explicit
    Kronecker product, and autograd_lib's factored representation -- and
    checked against hard-coded values and against each other.
    """
    torch.set_default_dtype(torch.float64)
    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    Y = B.t() @ X @ A
    u.check_equal(Y, [[-52, 64], [-81, -108]])
    loss = torch.sum(Y * Y) / 2
    hess0 = u.hessian(loss, X).reshape([4, 4])
    hess1 = u.Kron(A @ A.t(), B @ B.t())

    u.check_equal(loss, 12512.5)

    # PyTorch autograd computes Hessian with respect to row-vectorized
    # parameters, whereas autograd_lib uses math convention and does
    # column-vectorized.
    # Commuting order of Kronecker product switches between two
    # representations
    u.check_equal(hess1.commute(), hess0)

    # Do a test using Linear layers instead of matrix multiplies
    model: u.SimpleFullyConnected2 = u.SimpleFullyConnected2([2, 2, 2],
                                                             bias=False)
    model.layers[0].weight.data.copy_(X)

    # Transpose to match previous results, layers treat dim0 as batch
    # dimension
    u.check_equal(model.layers[0](A.t()).t(),
                  [[5, -20], [-16, -8]])  # XA = (A'X0)'

    model.layers[1].weight.data.copy_(B.t())
    u.check_equal(model(A.t()).t(), Y)

    Y = model(A.t()).t()  # transpose to data-dimension=columns
    loss = torch.sum(Y * Y) / 2
    loss.backward()

    u.check_equal(model.layers[0].weight.grad,
                  [[-2285, -105], [-1490, -1770]])
    G = B @ Y @ A.t()
    u.check_equal(model.layers[0].weight.grad, G)

    u.check_equal(hess0, u.Kron(B @ B.t(), A @ A.t()))

    # compute newton step
    # In column-vectorized form the Hessian is Kron(A A', B B') (the value
    # checked for hess2 below), so applying its pseudo-inverse to vec(G)
    # gives back vec(X).
    u.check_equal(u.Kron(A @ A.t(), B @ B.t()).pinv() @ u.vec(G),
                  u.v2c([-5, -2, 0, -6]))

    # compute Newton step using factored representation
    autograd_lib.add_hooks(model)

    Y = model(A.t())
    loss = torch.sum(Y * Y) / 2
    autograd_lib.backprop_hess(Y, hess_type='LeastSquares')
    autograd_lib.compute_hess(model, method='kron', attr_name='hess_kron',
                              vecr_order=False, loss_aggregation='sum')
    param = model.layers[0].weight

    hess2 = param.hess_kron
    print(hess2)

    u.check_equal(hess2, [[425, 170, -75, -30],
                          [170, 680, -30, -120],
                          [-75, -30, 225, 90],
                          [-30, -120, 90, 360]])

    # Gradient test
    model.zero_grad()
    loss.backward()
    u.check_close(u.vec(G).flatten(), u.Vec(param.grad))

    # Newton step test
    # Method 0: PyTorch native autograd
    newton_step0 = param.grad.flatten() @ torch.pinverse(hess0)
    newton_step0 = newton_step0.reshape(param.shape)
    u.check_equal(newton_step0, [[-5, 0], [-2, -6]])

    # Method 1: column major order
    ihess2 = hess2.pinv()
    u.check_equal(ihess2.LL, [[1/16, 1/48], [1/48, 17/144]])
    u.check_equal(ihess2.RR, [[2/45, -(1/90)], [-(1/90), 1/36]])
    u.check_equal(torch.flatten(hess2.pinv() @ u.vec(G)), [-5, -2, 0, -6])
    newton_step1 = (ihess2 @ u.Vec(param.grad)).matrix_form()

    # Method 2: row major order
    ihess2_rowmajor = ihess2.commute()
    newton_step2 = ihess2_rowmajor @ u.Vecr(param.grad)
    newton_step2 = newton_step2.matrix_form()

    u.check_equal(newton_step0, newton_step1)
    u.check_equal(newton_step0, newton_step2)
def compute_layer_stats(layer):
    """Compute gradient and curvature statistics for one layer.

    Runs one backward pass of the loss and one backward pass of the raw
    output on the saved stats batch (``stats_data`` / ``stats_targets``) and
    derives the statistics from the layer's captured ``data_input``,
    ``data_output`` and ``grad_output``.

    If the layer is frozen it is unfrozen for the duration of the call and
    frozen again afterwards, even when a check inside the function raises.

    Args:
        layer: a member of ``model.layers``.

    Returns:
        An ``AttrDefault`` with the fields ``diversity``, ``grad_fro``,
        ``param_fro``, ``a_sparsity``, ``a_magnitude``, ``hessian_l2``,
        ``jacobian_l2``, ``J1_l2``, ``regret_newton``, ``grad_curvature``,
        ``step_openai`` and ``regret_gradient``.
    """
    refreeze = False
    if hasattr(layer, 'frozen') and layer.frozen:
        u.unfreeze(layer)
        refreeze = True

    try:
        s = AttrDefault(str, {})
        n = args.stats_batch_size
        param = u.get_param(layer)
        layer_idx = model.layers.index(layer)
        # TODO: get layer type, include it in name
        assert layer_idx >= 0
        assert stats_data.shape[0] == n

        def backprop_loss():
            """Backprop the loss on the stats batch; return (loss, output)."""
            model.zero_grad()
            output = model(stats_data)  # use last saved data batch
            loss = compute_loss(output, stats_targets)
            loss.backward()
            return loss, output

        def backprop_output():
            """Backprop a gradient of ones through the raw model output."""
            model.zero_grad()
            output = model(stats_data)
            output.backward(gradient=torch.ones_like(output))
            return output

        # per-example gradients, n, d
        _loss, _output = backprop_loss()
        At = layer.data_input
        Bt = layer.grad_output * n
        G = u.khatri_rao_t(At, Bt)
        g = G.sum(dim=0, keepdim=True) / n
        # Sanity check against the gradient autograd stored on the parameter.
        u.check_close(g, u.vec(param.grad).t())

        s.diversity = torch.norm(G, "fro")**2 / g.flatten().norm()**2
        s.grad_fro = g.flatten().norm()
        s.param_fro = param.data.flatten().norm()

        pos_activations = torch.sum(layer.data_output > 0)
        neg_activations = torch.sum(layer.data_output <= 0)
        # 1 sparsity means all 0's
        s.a_sparsity = neg_activations.float() / (
            pos_activations + neg_activations)
        activation_size = len(layer.data_output.flatten())
        s.a_magnitude = torch.sum(layer.data_output) / activation_size

        _output = backprop_output()
        B2t = layer.grad_output
        J = u.khatri_rao_t(At, B2t)  # batch output Jacobian
        H = J.t() @ J / n

        s.hessian_l2 = u.l2_norm(H)
        s.jacobian_l2 = u.l2_norm(J)
        J1 = J.sum(dim=0) / n  # single output Jacobian
        s.J1_l2 = J1.norm()

        # newton decrement
        def loss_direction(direction, eps):
            """loss improvement if we take step eps in direction dir"""
            return u.to_python_scalar(
                eps * (direction @ g.t())
                - 0.5 * eps**2 * direction @ H @ direction.t())

        s.regret_newton = u.to_python_scalar(g @ u.pinv(H) @ g.t() / 2)

        # TODO: gradient diversity is stuck at 1
        # TODO: newton/gradient angle
        # TODO: newton step magnitude
        s.grad_curvature = u.to_python_scalar(
            g @ H @ g.t())  # curvature in direction of g
        # Falls back to 999 when the curvature is zero to avoid dividing by 0.
        s.step_openai = u.to_python_scalar(
            s.grad_fro**2 / s.grad_curvature) if s.grad_curvature else 999
        s.regret_gradient = loss_direction(g, s.step_openai)

        return s
    finally:
        # Restore the frozen state no matter how we leave the function.
        if refreeze:
            u.freeze(layer)
def _test_explicit_hessian_refactored():
    """Check computation of hessian of loss(B'WA) using the refactored autograd_lib calls.

    Derivation:
    https://github.com/yaroslavvb/kfac_pytorch/blob/master/derivation.pdf

    NOTE(review): the leading underscore presumably keeps this out of test
    collection; confirm whether it is expected to pass as written.
    """
    torch.set_default_dtype(torch.float64)
    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    # Reference computation with plain matrix multiplies.
    Y = B.t() @ X @ A
    u.check_equal(Y, [[-52, 64], [-81, -108]])
    loss = torch.sum(Y * Y) / 2
    hess0 = u.hessian(loss, X).reshape([4, 4])
    hess1 = u.Kron(A @ A.t(), B @ B.t())  # not used below

    u.check_equal(loss, 12512.5)

    # Do a test using Linear layers instead of matrix multiplies
    model: u.SimpleFullyConnected2 = u.SimpleFullyConnected2([2, 2, 2], bias=False)
    model.layers[0].weight.data.copy_(X)

    # Transpose to match previous results, layers treat dim0 as batch dimension
    u.check_equal(model.layers[0](A.t()).t(), [[5, -20], [-16, -8]])  # XA = (A'X0)'

    model.layers[1].weight.data.copy_(B.t())
    u.check_equal(model(A.t()).t(), Y)

    Y = model(A.t()).t()  # transpose to data-dimension=columns
    loss = torch.sum(Y * Y) / 2
    loss.backward()

    # Gradient of the first layer, against hard-coded values and B Y A'.
    u.check_equal(model.layers[0].weight.grad, [[-2285, -105], [-1490, -1770]])
    G = B @ Y @ A.t()
    u.check_equal(model.layers[0].weight.grad, G)

    # Capture activations of a fresh forward pass into activations_dict.
    autograd_lib.register(model)
    activations_dict = autograd_lib.ModuleDict()  # todo(y): make save_activations ctx manager automatically create A
    with autograd_lib.save_activations(activations_dict):
        Y = model(A.t())

    # Accumulate per-layer activation statistics.
    Acov = autograd_lib.ModuleDict(autograd_lib.SecondOrderCov)
    for layer, activations in activations_dict.items():
        print(layer, activations)
        Acov[layer].accumulate(activations, activations)
    autograd_lib.set_default_activations(activations_dict)
    autograd_lib.set_default_Acov(Acov)

    # NOTE: from here on `B` is the ModuleDict, no longer the tensor above.
    B = autograd_lib.ModuleDict(autograd_lib.SymmetricFourthOrderCov)
    autograd_lib.backward_accum(Y, "identity", B, retain_graph=False)

    print(B[model.layers[0]])

    autograd_lib.backprop_hess(Y, hess_type='LeastSquares')
    autograd_lib.compute_hess(model, method='kron', attr_name='hess_kron', vecr_order=False, loss_aggregation='sum')
    param = model.layers[0].weight

    hess2 = param.hess_kron
    print(hess2)

    u.check_equal(hess2, [[425, 170, -75, -30], [170, 680, -30, -120], [-75, -30, 225, 90], [-30, -120, 90, 360]])

    # Gradient test
    # NOTE(review): `loss` was already backpropagated above and is not
    # recomputed from the latest `Y` -- confirm this second backward is valid.
    model.zero_grad()
    loss.backward()
    u.check_close(u.vec(G).flatten(), u.Vec(param.grad))

    # Newton step test
    # Method 0: PyTorch native autograd
    newton_step0 = param.grad.flatten() @ torch.pinverse(hess0)
    newton_step0 = newton_step0.reshape(param.shape)
    u.check_equal(newton_step0, [[-5, 0], [-2, -6]])

    # Method 1: column major order
    ihess2 = hess2.pinv()
    u.check_equal(ihess2.LL, [[1/16, 1/48], [1/48, 17/144]])
    u.check_equal(ihess2.RR, [[2/45, -(1/90)], [-(1/90), 1/36]])
    u.check_equal(torch.flatten(hess2.pinv() @ u.vec(G)), [-5, -2, 0, -6])
    newton_step1 = (ihess2 @ u.Vec(param.grad)).matrix_form()

    # Method 2: row major order
    ihess2_rowmajor = ihess2.commute()
    newton_step2 = ihess2_rowmajor @ u.Vecr(param.grad)
    newton_step2 = newton_step2.matrix_form()

    # All three methods must agree.
    u.check_equal(newton_step0, newton_step1)
    u.check_equal(newton_step0, newton_step2)
def test_factored_hessian():
    """Simple test to ensure Hessian computation is working.

    In a linear neural network with squared loss, Newton step will converge
    in one step.  The Hessian of the first layer is computed with PyTorch
    autograd, with autograd_lib's direct method and with its Kronecker
    factored method; the three are compared, and a Newton-style update is
    applied at the end with the resulting loss printed.
    """
    u.seed_random(1)
    loss_type = 'LeastSquares'

    data_width = 2
    n = 5
    layer_dims = [data_width ** 2, 10]

    model = u.SimpleFullyConnected2(layer_dims, bias=False,
                                    nonlin=False).to(gl.device)
    print(model)

    # One batch containing the whole (tiny) dataset.
    dataset = u.TinyMNIST(data_width=data_width, dataset_size=n,
                          loss_type=loss_type)
    stats_loader = torch.utils.data.DataLoader(dataset, batch_size=n,
                                               shuffle=False)
    data, targets = next(u.infinite_iter(stats_loader))

    # The else branch covers loss_type == 'CrossEntropy'.
    loss_fn = (u.least_squares if loss_type == 'LeastSquares'
               else nn.CrossEntropyLoss())

    autograd_lib.add_hooks(model)
    gl.reset_global_step()
    last_outer = 0

    # Capture Hessian and gradient stats
    autograd_lib.enable_hooks()
    autograd_lib.clear_backprops(model)

    output = model(data)
    loss = loss_fn(output, targets)
    print(loss)
    loss.backward(retain_graph=True)
    layer = model.layers[0]
    weight = layer.weight

    autograd_lib.clear_hess_backprops(model)
    autograd_lib.backprop_hess(output, hess_type=loss_type)
    autograd_lib.disable_hooks()

    # compute Hessian using direct method, compare against PyTorch autograd
    hess_autograd = u.hessian(loss, weight)
    autograd_lib.compute_hess(model)
    hess_direct = weight.hess
    print(hess_direct)
    u.check_close(hess_autograd.reshape(hess_direct.shape), hess_direct,
                  atol=1e-9, rtol=1e-6)

    # compute Hessian using factored method
    autograd_lib.compute_hess(model, method='kron', attr_name='hess2',
                              vecr_order=True)
    # TODO(y): figure out why needed transposes in
    #   s.regret_newton = vecG.t() @ pinvH.commute() @ vecG.t() / 2
    hess_factored = weight.hess2
    u.check_close(hess_direct, hess_factored, atol=1e-9, rtol=1e-6)

    # Hessian-gradient products in regular and factored notation
    grad_flat = weight.grad.flatten()
    grad_vecr = u.Vecr(weight.grad)
    product_direct = hess_direct @ grad_flat
    product_factored = grad_vecr @ hess_factored
    u.check_close(product_direct, product_factored, atol=1e-9, rtol=1e-6)

    # compute regret in factored notation, compare against regular notation
    regret_direct = grad_flat @ hess_direct.pinverse() @ grad_flat / 2
    regret_factored = grad_vecr @ hess_factored.pinv() @ grad_vecr / 2
    u.check_close(regret_direct, regret_factored)

    weight_snapshot = weight.detach().clone()

    # Apply the pseudo-inverse-Hessian step to the weights and print the loss.
    # NOTE(review): gradients are not zeroed before this backward pass, so
    # weight.grad may accumulate on top of earlier values -- confirm intended.
    output = model(data)
    loss = loss_fn(output, targets)
    loss.backward()
    step = u.unvec(hess_direct.pinverse() @ u.vec(weight.grad),
                   weight.shape[0])
    weight.data.sub_(step)

    output = model(data)
    loss = loss_fn(output, targets)
    print("result 4", loss)

    print(loss)
def test_kron():
    """Exercise the Kron / Vec / Vecr helpers against known identities.

    Covers trace, Frobenius norm, inverse and pseudo-inverse of a Kronecker
    product (against a hand-written expected inverse), products of Kronecker
    factors, and the vec/vecr forms of ``A @ X @ B``.

    Note: switches the global torch default dtype to float64.
    """
    torch.set_default_dtype(torch.float64)

    # --- trace of a 2x2 (x) 2x2 product -----------------------------------
    small_a = torch.tensor([1, 2, 3, 4]).reshape(2, 2)
    small_b = torch.tensor([5, 6, 7, 8]).reshape(2, 2)
    u.check_close(u.Kron(small_a, small_b).trace(), 65)

    # --- norms of a 3x3 (x) 3x3 product -----------------------------------
    a = torch.tensor([[2., 7, 9], [1, 9, 8], [2, 7, 5]])
    b = torch.tensor([[6., 6, 1], [10, 7, 7], [7, 10, 10]])
    Ck = u.Kron(a, b)
    u.check_close(a.flatten().norm() * b.flatten().norm(), Ck.frobenius_norm())
    u.check_close(Ck.frobenius_norm(), 4 * math.sqrt(11635.))

    # --- inverse / pseudo-inverse against a hand-written reference --------
    expected_inverse_rows = [
        [0, 5 / 102, -(7 / 204), 0, -(70 / 561), 49 / 561, 0, 125 / 1122, -(175 / 2244)],
        [1 / 20, -(53 / 1020), 8 / 255, -(7 / 55), 371 / 2805, -(224 / 2805), 5 / 44,
         -(265 / 2244), 40 / 561],
        [-(1 / 20), 3 / 170, 3 / 170, 7 / 55, -(42 / 935), -(42 / 935), -(5 / 44),
         15 / 374, 15 / 374],
        [0, -(5 / 102), 7 / 204, 0, 20 / 561, -(14 / 561), 0, 35 / 1122, -(49 / 2244)],
        [-(1 / 20), 53 / 1020, -(8 / 255), 2 / 55, -(106 / 2805), 64 / 2805, 7 / 220,
         -(371 / 11220), 56 / 2805],
        [1 / 20, -(3 / 170), -(3 / 170), -(2 / 55), 12 / 935, 12 / 935, -(7 / 220),
         21 / 1870, 21 / 1870],
        [0, 5 / 102, -(7 / 204), 0, 0, 0, 0, -(5 / 102), 7 / 204],
        [1 / 20, -(53 / 1020), 8 / 255, 0, 0, 0, -(1 / 20), 53 / 1020, -(8 / 255)],
        [-(1 / 20), 3 / 170, 3 / 170, 0, 0, 0, 1 / 20, -(3 / 170), -(3 / 170)],
    ]
    C = Ck.expand()
    C0 = u.to_numpy(C)
    Ci = torch.tensor(expected_inverse_rows)

    # Ci must act as a generalized inverse of C.
    u.check_close(C @ Ci @ C, C)

    # Factored inverses agree with torch on the expanded matrix.
    u.check_close(Ck.inv().expand(), torch.inverse(Ck.expand()))
    u.check_close(Ck.inv().expand_vec(), torch.inverse(Ck.expand_vec()))
    u.check_close(Ck.pinv().expand(), torch.pinverse(Ck.expand()))

    # Every route to the inverse agrees with the reference values.
    u.check_close(linalg.pinv(C0), Ci, rtol=1e-5, atol=1e-6)
    u.check_close(torch.pinverse(C), Ci, rtol=1e-5, atol=1e-6)
    u.check_close(Ck.inv().expand(), Ci, rtol=1e-5, atol=1e-6)
    u.check_close(Ck.pinv().expand(), Ci, rtol=1e-5, atol=1e-6)

    # --- product of two Kronecker objects ---------------------------------
    Ck2 = u.Kron(b, 2 * a)
    u.check_close((Ck @ Ck2).expand(), Ck.expand() @ Ck2.expand())
    u.check_close((Ck @ Ck2).expand_vec(), Ck.expand_vec() @ Ck2.expand_vec())

    # Random gradient / covariance fixtures (not checked below, but they are
    # still constructed so the random stream advances as before).
    d2 = 3
    d1 = 2
    G = torch.randn(d2, d1)
    g = u.vec(G)
    H = u.Kron(u.random_cov(d1), u.random_cov(d2))

    Gt = G.t()
    gt = g.reshape(1, -1)

    # --- Vec @ Kron and Kron @ Vec on small integer inputs -----------------
    vecX = u.Vec([1, 2, 3, 4], shape=(2, 2))
    K = u.Kron([[5, 6], [7, 8]], [[9, 10], [11, 12]])

    u.check_equal(vecX @ K, [644, 706, 748, 820])
    u.check_equal(K @ vecX, [543, 655, 737, 889])
    u.check_equal(u.matmul(vecX @ K, vecX), 7538)
    u.check_equal(vecX @ (vecX @ K), 7538)
    u.check_equal(vecX @ vecX, 30)

    vecX = u.Vec([1, 2], shape=(1, 2))
    K = u.Kron([[5]], [[9, 10], [11, 12]])
    u.check_equal(vecX.norm()**2, 5)

    # --- check kronecker rules ---------------------------------------------
    X = torch.tensor([[1., 2], [3, 4]])
    A = torch.tensor([[5., 6], [7, 8]])
    B = torch.tensor([[9., 10], [11, 12]])
    x = u.Vec(X)

    # kron/vec/vecr identities for A @ X @ B
    AXB = A @ X @ B
    u.check_equal(u.Vec(AXB), x @ u.Kron(B, A.t()))
    u.check_equal(u.Vec(AXB), u.Kron(B.t(), A) @ x)
    u.check_equal(u.Vecr(AXB), u.Kron(A, B.t()) @ u.Vecr(X))
    u.check_equal(u.Vecr(AXB), u.Vecr(X) @ u.Kron(A.t(), B))

    def extra_checks(A, X, B):
        """Repeat the identities above, plus their ``normal_form()`` variants."""
        x = u.Vec(X)
        AXB = A @ X @ B

        # Same four identities as in the enclosing function.
        u.check_equal(u.Vec(AXB), x @ u.Kron(B, A.t()))
        u.check_equal(u.Vec(AXB), u.Kron(B.t(), A) @ x)
        u.check_equal(u.Vecr(AXB), u.Kron(A, B.t()) @ u.Vecr(X))
        u.check_equal(u.Vecr(AXB), u.Vecr(X) @ u.Kron(A.t(), B))

        # Kron operand converted with normal_form().
        u.check_equal(u.Vecr(AXB), u.Vecr(X) @ u.Kron(A.t(), B).normal_form())
        u.check_equal(u.Vecr(AXB), u.matmul(u.Kron(A, B.t()).normal_form(), u.Vecr(X)))
        u.check_equal(u.Vec(AXB), u.matmul(u.Kron(B.t(), A).normal_form(), x))
        u.check_equal(u.Vec(AXB), x @ u.Kron(B, A.t()).normal_form())

        # Both operands converted with normal_form().
        u.check_equal(u.Vec(AXB), x.normal_form() @ u.Kron(B, A.t()).normal_form())
        u.check_equal(u.Vec(AXB), u.Kron(B.t(), A).normal_form() @ x.normal_form())
        u.check_equal(u.Vecr(AXB), u.Kron(A, B.t()).normal_form() @ u.Vecr(X).normal_form())
        u.check_equal(u.Vecr(AXB), u.Vecr(X).normal_form() @ u.Kron(A.t(), B).normal_form())

    # shape checks (non-square X)
    d1, d2 = 3, 4
    extra_checks(torch.ones((d1, d1)), torch.ones((d1, d2)), torch.ones((d2, d2)))

    A = torch.rand(d1, d1)
    B = torch.rand(d2, d2)
    #x = torch.rand((d1*d2))
    #X = x.t().reshape(d1, d2)
    # X = torch.rand((d1, d2))
    # x = u.vec(X)
    x = torch.rand((d1 * d2))
def simple_newton_bd_test():
    """Check block-diagonal Newton steps on the simple rotations problem.

    Reads data, targets, initial flattened weights and layer sizes from
    ``data/rotations_simple_*.csv``, builds a TensorFlow graph for a network
    of plain matrix products, assembles the per-layer Hessian blocks by hand,
    passes them through ``u.block_diagonal_inverse`` and runs 10 update steps.
    The loss observed before each step must equal the values stored in
    ``data/rotations_simple_newtonbd_losses.csv``.
    """
    tf.reset_default_graph()

    # Fixture data.
    X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter=","))
    assert W0f.shape == (8, 1)

    fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers
    u.check_equal(fs, [10, 2, 2, 2])

    def f(i):
        # W[i] has shape f(i) x f(i-1)
        return fs[i + 1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    # Flattened weights live in one variable; Wf_copy is the staging buffer
    # used by the two-phase update at the bottom.
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's; W[0] is the data matrix.
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's: A[0] is the identity, so A[1] == X and A[i+1] = W[i] @ A[i].
    A = [u.Identity(dsize)]
    for i in range(n + 1):
        A.append(tf.matmul(W[i], A[i], name="A" + str(i + 1)))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")

    # Create B's (backprop) and Bn's (Newton-modified backprop), filled from
    # the top layer down.
    B = [None] * (n + 1)
    Bn = [None] * (n + 1)
    B[n] = -err / dsize
    Bn[n] = u.Identity(f(n))
    for i in reversed(range(n)):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's: U[bottom][top] is the running product of transposed W's
    # from `bottom` through `top`, or an identity when bottom > top.
    U = [[None] * (n + 1) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    def hessian_block(i, j):
        # Block i, j gives hessian block between layer i and layer j
        term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
        if i == j:
            term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)), dtype=dtype)
        elif i < j:
            term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
        else:
            term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))
        return term1 + term2 @ Kmat(f(j), f(j - 1))

    # Only layers 1..n get blocks; index 0 is W[0], which is the input.
    blocks = [[hessian_block(i, j) for j in range(1, n + 1)]
              for i in range(1, n + 1)]

    # Block-diagonal inverse (the commented-out alternative in the original
    # was a pseudo-inverse of the full concatenated Hessian).
    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's, then drop the entry for W[0] (no update for the input)
    dW = [tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
          for i in range(n + 1)]
    dW = dW[1:]

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # Two-phase update: stage the new value in Wf_copy, then copy it into Wf.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    expected_losses = np.loadtxt("data/rotations_simple_newtonbd_losses.csv",
                                 delimiter=",")
    observed_losses = []

    # from accompanying notebook
    # 0.0111498, 0.0000171591, 4.11445*10^-11, 2.33652*10^-22,
    # 1.21455*10^-32,
    for _ in range(10):
        (loss_value,) = sess.run([loss])
        observed_losses.append(loss_value)
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
def rotations2_newton_bd():
    """Time block-diagonal Newton steps on the large rotations2 problem.

    Same graph construction as the small block-diagonal Newton test, but on
    ``data/large_rotations2_*.csv`` and with a ``kr`` that skips shape
    inference. Runs 20 steps, printing the loss before each one and recording
    the time of each step via ``u.record_time``; finishes by printing the
    timing and graph summaries. The losses are collected but not asserted.
    """

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()

    # Fixture data.
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        # W[i] has shape f(i) x f(i-1)
        return fs[i + 1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    # Flattened weights plus a staging copy for the two-phase update.
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's; W[0] is the data matrix.
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's: A[0] is the identity, so A[1] == X and A[i+1] = W[i] @ A[i].
    A = [u.Identity(dsize)]
    for i in range(n + 1):
        A.append(tf.matmul(W[i], A[i], name="A" + str(i + 1)))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's (backprop) and Bn's (Newton-modified backprop), top-down.
    B = [None] * (n + 1)
    Bn = [None] * (n + 1)
    B[n] = -err / dsize
    Bn[n] = u.Identity(f(n))
    for i in reversed(range(n)):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's: U[bottom][top] is the running product of transposed W's
    # from `bottom` through `top`, or an identity when bottom > top.
    U = [[None] * (n + 1) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    def hessian_block(i, j):
        # Block i, j gives hessian block between layer i and layer j
        term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
        if i == j:
            term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)), dtype=dtype)
        elif i < j:
            term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
        else:
            term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))
        return term1 + term2 @ Kmat(f(j), f(j - 1))

    # Only layers 1..n get blocks; index 0 is W[0], which is the input.
    blocks = [[hessian_block(i, j) for j in range(1, n + 1)]
              for i in range(1, n + 1)]

    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's, then drop the entry for W[0] (no update for the input)
    dW = [tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
          for i in range(n + 1)]
    dW = dW[1:]

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # Two-phase update: stage the new value in Wf_copy, then copy it into Wf.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for _ in range(20):
        (loss0,) = sess.run([loss])
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
def simple_newton_kfac_test():
    """Check Kronecker-factored Newton steps on the simple rotations problem.

    Reads the ``data/rotations_simple_*.csv`` fixtures, builds a TensorFlow
    graph for a network of plain matrix products and forms an inverse-Hessian
    approximation whose diagonal blocks are Kronecker products of the
    pseudo-inverted activation and backprop covariances (off-diagonal blocks
    are zero). Runs 10 update steps; the loss observed before each step must
    equal ``data/rotations_simple_newtonkfac_losses.csv``.
    """
    tf.reset_default_graph()

    # Fixture data.
    X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter=","))
    assert W0f.shape == (8, 1)

    fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers
    u.check_equal(fs, [10, 2, 2, 2])

    def f(i):
        # W[i] has shape f(i) x f(i-1)
        return fs[i + 1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    # Flattened weights plus a staging copy for the two-phase update.
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's; W[0] is the data matrix.
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's: A[0] is the identity, so A[1] == X and A[i+1] = W[i] @ A[i].
    A = [u.Identity(dsize)]
    for i in range(n + 1):
        A.append(tf.matmul(W[i], A[i], name="A" + str(i + 1)))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")

    # Create B's (backprop) and Bn's (Newton-modified backprop), top-down.
    B = [None] * (n + 1)
    Bn = [None] * (n + 1)
    B[n] = -err / dsize
    Bn[n] = u.Identity(f(n))
    for i in reversed(range(n)):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # reuse Hess tensor calculation in order to get off-diag block sizes
            dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i != j:
                term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
            else:
                acov = A[i] @ t(A[j])
                bcov = Bn[i] @ t(Bn[j]) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ihess = u.concat_blocks(iblocks)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's, then drop the entry for W[0] (no update for the input)
    dW = [tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
          for i in range(n + 1)]
    dW = dW[1:]

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # Two-phase update: stage the new value in Wf_copy, then copy it into Wf.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    expected_losses = np.loadtxt("data/rotations_simple_newtonkfac_losses.csv",
                                 delimiter=",")
    observed_losses = []

    # from accompanying notebook
    # {0.0111498, 0.0000171591, 4.11445*10^-11, 2.33653*10^-22,
    # 6.88354*10^-33,
    for _ in range(10):
        (loss_value,) = sess.run([loss])
        observed_losses.append(loss_value)
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
def compute_layer_stats(layer):
    """Compute gradient/Hessian statistics for one layer and sanity-check them.

    Relies on names from the enclosing scope: ``model``, ``stats_data``,
    ``stats_targets``, ``stats_batch_size`` and ``compute_loss``. It also reads
    ``layer.data_input``, ``layer.grad_output`` and ``layer.data_output`` after
    forward/backward passes (presumably filled in by hooks registered
    elsewhere -- confirm against the hook setup).

    Args:
        layer: a member of ``model.layers`` whose parameter is obtained with
            ``u.get_param(layer)``.

    Returns:
        An ``AttrDefault`` with the fields ``diversity``, ``gradient_norm``,
        ``parameter_norm``, ``sparsity``, ``hessian_norm``, ``jacobian_norm``,
        ``jacobian_sensitivity`` and ``loss_newton``.

    Raises:
        AssertionError: if the batch size does not match ``stats_batch_size``
            or the line-search losses are not lowest at the Newton step.
            The ``u.check_close`` / ``u.check_equal`` calls act as further
            consistency checks.
    """
    stats = AttrDefault(str, {})
    n = stats_batch_size
    param = u.get_param(layer)
    d = len(param.flatten())
    layer_idx = model.layers.index(layer)
    assert layer_idx >= 0
    assert stats_data.shape[0] == n

    def backprop_loss():
        """Zero grads, run forward on the stats batch, backprop the loss."""
        model.zero_grad()
        output = model(stats_data)  # use last saved data batch for backprop
        loss = compute_loss(output, stats_targets)
        loss.backward()
        return loss, output

    def backprop_output():
        """Zero grads, run forward, backprop all-ones from the output."""
        model.zero_grad()
        output = model(stats_data)
        output.backward(gradient=torch.ones_like(output))
        return output

    # per-example gradients, n, d
    backprop_loss()
    At = layer.data_input
    Bt = layer.grad_output * n
    G = u.khatri_rao_t(At, Bt)
    g = G.sum(dim=0, keepdim=True) / n
    # The mean of the per-example gradients must match autograd's gradient.
    u.check_close(g, u.vec(param.grad).t())

    stats.diversity = torch.norm(G, "fro")**2 / g.flatten().norm()**2
    stats.gradient_norm = g.flatten().norm()
    stats.parameter_norm = param.data.flatten().norm()

    # Fraction of strictly positive entries in the layer output.
    pos_activations = torch.sum(layer.data_output > 0)
    neg_activations = torch.sum(layer.data_output <= 0)
    stats.sparsity = pos_activations.float() / (pos_activations + neg_activations)

    # Jacobian-based Hessian estimate from backprop of ones.
    backprop_output()
    At2 = layer.data_input
    u.check_close(At, At2)  # input activations must not depend on the backward pass
    B2t = layer.grad_output
    J = u.khatri_rao_t(At, B2t)
    H = J.t() @ J / n

    # Reference Hessian from u.hessian, rearranged into a d x d matrix.
    model.zero_grad()
    output = model(stats_data)  # use last saved data batch for backprop
    loss = compute_loss(output, stats_targets)
    hess = u.hessian(loss, param)
    hess = hess.transpose(2, 3).transpose(0, 1).reshape(d, d)
    u.check_close(hess, H)

    stats.hessian_norm = u.l2_norm(H)
    stats.jacobian_norm = u.l2_norm(J)
    Joutput = J.sum(dim=0) / n
    stats.jacobian_sensitivity = Joutput.norm()

    # newton decrement
    stats.loss_newton = u.to_python_scalar(g @ u.pinv(H) @ g.t() / 2)
    u.check_close(stats.loss_newton, loss)

    # do line-search to find optimal step
    def line_search(directionv, start, end, steps=10):
        """Evaluate the loss along ``directionv`` and restore the parameter.

        Returns ``steps + 2`` losses: first the loss at the parameter values
        on entry, then the loss at each of the ``steps + 1`` evenly spaced
        offsets ``start, ..., end`` along ``directionv``. ``param.data`` is
        reset to its entry value before returning.

        ``directionv`` is added to ``u.vec(param).t()`` -- presumably a row
        vector of length d; confirm against ``u.vec``.
        """
        param0 = param.data.clone()
        param0v = u.vec(param0).t()
        losses = []
        for i in range(steps + 1):
            # Loss at the parameters set by the previous iteration (or the
            # untouched parameters on the first pass).
            output = model(stats_data)  # use last saved data batch for backprop
            loss = compute_loss(output, stats_targets)
            losses.append(loss)

            offset = start + i * ((end - start) / steps)
            param1v = param0v + offset * directionv
            param1 = u.unvec(param1v.t(), param.data.shape[0])
            param.data.copy_(param1)

        # Loss at the final offset (== end).
        output = model(stats_data)  # use last saved data batch for backprop
        loss = compute_loss(output, stats_targets)
        losses.append(loss)

        param.data.copy_(param0)
        return losses

    # try to take a newton step
    line_losses = line_search(-g @ u.pinv(H), 0, 2, steps=10)
    # Entry 0 is the loss at the unmodified parameters.
    u.check_equal(line_losses[0], loss)
    # Entry 6 is offset 5 * (2 / 10) == 1.0, i.e. the full Newton step, which
    # is expected to reach zero loss and be a minimum along the line.
    u.check_equal(line_losses[6], 0)
    assert line_losses[5] > line_losses[6]
    assert line_losses[7] > line_losses[6]
    return stats