def __init__(self, target, name):
  """Wrap `target` (a square matrix tensor) with a cached, updatable SVD.

  Creates variables holding an SVD (s, u, v) of `target`, initialized to the
  identity decomposition (s = ones, u = v = I), plus ops to refresh the cache
  either from TensorFlow's own tf.svd or from externally-fed values.

  Args:
    target: square matrix tensor whose SVD is cached.
    name: prefix used for all created nodes/variables.

  NOTE(review): relies on module-level `dtype`, `u` (util module) and
  `SvdTuple`; assumes SvdTuple accepts the (s, u, v) tuple returned by
  tf.svd as well as three separate arguments — confirm against its
  definition.
  """
  self.name = name
  self.target = target
  # Symbolic SVD of the target; evaluated only when update_tf_op runs.
  self.tf_svd = SvdTuple(tf.svd(target))
  # Identity decomposition used as the initial cached value.
  self.init = SvdTuple(
      u.ones(target.shape[0], name=name+"_s_init"),
      u.Identity(target.shape[0], name=name+"_u_init"),
      u.Identity(target.shape[0], name=name+"_v_init")
  )
  # Sanity-check that the identity init matches tf.svd's output shapes.
  assert self.tf_svd.s.shape == self.init.s.shape
  assert self.tf_svd.u.shape == self.init.u.shape
  assert self.tf_svd.v.shape == self.init.v.shape
  # Variables that hold the cached decomposition between updates.
  self.cached = SvdTuple(
      tf.Variable(self.init.s, name=name+"_s"),
      tf.Variable(self.init.u, name=name+"_u"),
      tf.Variable(self.init.v, name=name+"_v")
  )
  self.s = self.cached.s
  self.u = self.cached.u
  self.v = self.cached.v
  # Placeholders for feeding an externally computed SVD (update_external_op).
  self.holder = SvdTuple(
      tf.placeholder(dtype, shape=self.cached.s.shape, name=name+"_s_holder"),
      tf.placeholder(dtype, shape=self.cached.u.shape, name=name+"_u_holder"),
      tf.placeholder(dtype, shape=self.cached.v.shape, name=name+"_v_holder")
  )
  # Refresh cache from TensorFlow's SVD of the current target.
  self.update_tf_op = tf.group(
      self.cached.s.assign(self.tf_svd.s),
      self.cached.u.assign(self.tf_svd.u),
      self.cached.v.assign(self.tf_svd.v)
  )
  # Refresh cache from values fed through self.holder.
  self.update_external_op = tf.group(
      self.cached.s.assign(self.holder.s),
      self.cached.u.assign(self.holder.u),
      self.cached.v.assign(self.holder.v)
  )
  # Initializers for the cached variables (resets to the identity SVD).
  self.init_ops = (self.s.initializer, self.u.initializer, self.v.initializer)
def simple_newton_kfac_test():
  """Newton step with KFAC (block-diagonal Kronecker-factored) inverse Hessian
  on the small rotations problem; losses are checked against reference values
  exported from the accompanying Mathematica notebook.
  """
  tf.reset_default_graph()

  # Load data matrix, targets, flattened initial weights and layer sizes.
  X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter= ","))
  assert W0f.shape == (8, 1)
  fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter= ",").astype(np.int32)
  n = len(fs)-2   # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize

  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Create W's; W[0] is the input data matrix X.
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's (forward activations)
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")

  # Create B's (backprops); Bn uses identity at the top for Newton.
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)  # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]

  # inverse Hessian blocks: KFAC keeps only diagonal blocks, inverted as a
  # Kronecker product of the two factor (pseudo-)inverses.
  iblocks = u.empty_grid(n+1, n+1)
  for i in range(1, n+1):
    for j in range(1, n+1):
      # reuse Hess tensor calculation in order to get off-diag block sizes
      dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize;
      if i == j:
        acov = A[i] @ t(A[j])
        bcov = Bn[i] @ t(Bn[j]) / dsize;
        term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
      else:
        # off-diagonal blocks are zeroed (block-diagonal approximation)
        term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
      iblocks[i][j]=term

  # remove leftmost blocks (those are with respect to W[0] which is input)
  del iblocks[0]
  for row in iblocks:
    del row[0]

  ihess = u.concat_blocks(iblocks)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  # create dW's (per-layer gradients)
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update

  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * ihess @ dWf

  # Two-phase update (write to copy, then copy back) avoids a data race on Wf.
  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  expected_losses = np.loadtxt("data/rotations_simple_newtonkfac_losses.csv", delimiter= ",")
  observed_losses = []

  # from accompanying notebook
  # {0.0111498, 0.0000171591, 4.11445*10^-11, 2.33653*10^-22,
  # 6.88354*10^-33,
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
def simple_newton_bd_test():
  """Newton step with an exact block-diagonal inverse Hessian on the small
  rotations problem; losses checked against notebook reference values.
  """
  tf.reset_default_graph()

  # Load data matrix, targets, flattened initial weights and layer sizes.
  X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter= ","))
  assert W0f.shape == (8, 1)
  fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter= ",").astype(np.int32)
  n = len(fs)-2   # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize

  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Create W's; W[0] is the input data matrix X.
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's (forward activations)
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")

  # Create B's (backprops); Bn uses identity at the top for Newton.
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)  # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]

  # Create U's: U[bottom][top] is the transposed weight-chain product
  # t(W[bottom]) ... t(W[top]); identity when the range is empty.
  U = [list(range(n+1)) for _ in range(n+1)]
  for bottom in range(n+1):
    for top in range(n+1):
      if bottom > top:
        prod = u.Identity(f(top))
      else:
        prod = u.Identity(f(bottom-1))
        for i in range(bottom, top+1):
          prod = prod@t(W[i])
      U[bottom][top] = prod

  # Block i, j gives hessian block between layer i and layer j
  blocks = [list(range(n+1)) for _ in range(n+1)]
  for i in range(1, n+1):
    for j in range(1, n+1):
      term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize;
      if i == j:
        term2 = tf.zeros((f(i)*f(i-1), f(i)*f(i-1)), dtype=dtype)
      elif i < j:
        term2 = kr(A[i] @ t(B[j]), U[i+1][j-1])
      else:
        term2 = kr(t(U[j+1][i-1]), B[i] @ t(A[j]))
      # Kmat is the commutation matrix for the vec/transpose swap.
      blocks[i][j]=term1 + term2 @ Kmat(f(j), f(j-1))

  # remove leftmost blocks (those are with respect to W[0] which is input)
  del blocks[0]
  for row in blocks:
    del row[0]

  #hess = u.concat_blocks(blocks)
  # block-diagonal approximation: invert only the diagonal blocks
  ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))
  #  ihess = u.pseudo_inverse(hess)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  # create dW's (per-layer gradients)
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update

  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * ihess @ dWf

  # Two-phase update avoids a data race on Wf.
  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  expected_losses = np.loadtxt("data/rotations_simple_newtonbd_losses.csv", delimiter= ",")
  observed_losses = []

  # from accompanying notebook
  # 0.0111498, 0.0000171591, 4.11445*10^-11, 2.33652*10^-22,
  # 1.21455*10^-32,
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
def relu_gradient_test():
  """Gradient descent with hand-derived ReLU backprop on the rotations
  dataset; losses checked against notebook reference values.
  """
  tf.reset_default_graph()

  # Load data matrix, targets, flattened initial weights and layer sizes.
  X0 = np.genfromtxt('data/rotations_X0.csv', delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_Y0.csv', delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_W0f.csv', delimiter= ","))
  assert W0f.shape == (8, 1)
  fs = np.genfromtxt('data/rotations_relu_fs.csv', delimiter= ",").astype(np.int32)
  n = len(fs)-2   # number of layers
  u.check_equal(fs, [4,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize

  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Create W's; W[0] is the input data matrix X.
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0, name="X0")
  Y = tf.constant(Y0, name="Y0")
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's (forward activations, ReLU after every matmul except input)
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    if i == 0:
      A[i+1] = X
    else:
      A[i+1] = tf.nn.relu(tf.matmul(W[i], A[i], name="A"+str(i+1)))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.1, dtype=dtype)

  # Create B's (backprops), masking by the ReLU activation pattern.
  B = [0]*(n+1)
  B[n] = (-err/dsize)*u.relu_mask(A[n+1])
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    if i > 0:  # there's no relu on first matrix
      B[i] = B[i]*u.relu_mask(A[i+1])

  # create dW's (per-layer gradients)
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update

  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * dWf

  # Two-phase update avoids a data race on Wf.
  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  expected_losses = np.loadtxt("data/rotations_relu_gradient_losses.csv", delimiter= ",")
  observed_losses = []
  # From accompanying notebook
  # {0.407751, 0.0683822, 0.0138657, 0.0039221, 0.00203637, 0.00164892,
  # 0.00156137, 0.00153857, 0.00153051, 0.00152593}
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
def simple_gradient_test():
  """Vanilla gradient-descent sanity check on the small rotations dataset.

  Builds the n-layer linear network by hand, takes 20 plain gradient steps
  with lr=1.0, and compares the per-step losses against reference values
  exported from the accompanying notebook.
  """
  tf.reset_default_graph()

  # Load data matrix, targets, flattened initial weights and layer sizes.
  X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter= ","))
  assert W0f.shape == (8, 1)
  fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter= ",").astype(np.int32)
  n = len(fs)-2  # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize

  # Unflatten W0f purely for the shape checks below; first entry is the
  # data matrix, which Wf itself does not contain.
  W0s = u.unflatten_np(W0f, fs[1:])
  W0s.insert(0, X0)

  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Per-layer weight matrices; W[0] is the input X.
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for ref_mat, tf_mat in zip(W0s, W):
    u.check_equal(ref_mat.shape, u.fix_shape(tf_mat.shape))

  # Forward activations; A[1] == X by construction.
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for layer in range(n+1):
    A[layer+1] = tf.matmul(W[layer], A[layer], name="A"+str(layer+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(1.0, dtype=dtype)

  # Backward pass.
  B = [0]*(n+1)
  B[n] = -err/dsize
  for layer in range(n-1, -1, -1):
    B[layer] = t(W[layer+1]) @ B[layer+1]

  # Per-layer gradients; the W[0] (input) entry is dropped.
  dW = [tf.matmul(B[layer], tf.transpose(A[layer]), name="dW"+str(layer))
        for layer in range(n+1)][1:]
  dWf = tf.concat([u.vec(g) for g in dW], axis=0)
  Wf_new = Wf - lr * dWf

  # Two-phase update (write into a copy, then copy back) avoids a
  # read/write race on Wf within one run call.
  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  expected_losses = np.loadtxt("data/rotations_simple_gradient_losses.csv", delimiter= ",")

  # from accompanying notebook
  # {0.0111498, 0.00694816, 0.00429464, 0.00248228, 0.00159361,
  # 0.000957424, 0.000651653, 0.000423802, 0.000306749, 0.00021772,
  observed_losses = []
  for _ in range(20):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
def d_sigmoid(y): return y * (1 - y) def kl(x, y): return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y)) def d_kl(x, y): return (1 - x) / (1 - y) - x / y # A[i] = activations needed to compute gradient of W[i] # A[n+1] = network output A = [None] * (n + 2) # A[0] is just for shape checks, assert fail on run with tf.control_dependencies([tf.assert_equal(1, 0, message="too huge")]): A[0] = u.Identity(dsize, dtype=dtype) A[1] = W[0] for i in range(1, n + 1): A[i + 1] = sigmoid(W[i] @ A[i]) # reconstruction error and sparsity error err = (A[3] - A[1]) rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize # B[i] = backprops needed to compute gradient of W[i] # B2[i] = synthetic backprops for natural gradient B = [None] * (n + 1) B2 = [None] * (n + 1) B[n] = err * d_sigmoid(A[n + 1]) B2[n] = tf.random_normal((f(n), f(-1)), dtype=dtype) * d_sigmoid(A[n + 1]) for i in range(n - 1, -1, -1):
def main():
  """Train the MNIST sparse autoencoder with KFAC-style natural gradient.

  Builds the whole graph by hand (forward pass, true and synthetic-label
  backprops, per-layer covariances with cached SVDs), runs `num_steps`
  training steps with optional adaptive step-size, and checks the loss
  trajectory against recorded reference values.

  NOTE(review): depends on many module-level flags not visible here:
  purely_linear, drop_sparsity, drop_l2, use_tikhonov, Lambda,
  whitening_mode, whiten_every_n_steps, num_steps, report_frequency,
  do_line_search, local_quadratics, adaptive_step, adaptive_step_frequency,
  adaptive_step_burn_in — confirm they are defined at module scope.
  """
  np.random.seed(0)
  tf.set_random_seed(0)

  dtype = np.float32
  # 64-bit doesn't help much, search for 64-bit in
  # https://www.wolframcloud.com/objects/5f297f41-30f7-4b1b-972c-cac8d1f8d8e4
  u.default_dtype = dtype
  machine_epsilon = np.finfo(dtype).eps  # 1e-7 or 1e-16
  train_images = load_MNIST.load_MNIST_images('data/train-images-idx3-ubyte')
  dsize = 10000
  patches = train_images[:, :dsize]
  fs = [dsize, 28 * 28, 196, 28 * 28]

  # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
  X0 = patches
  lambda_ = 3e-3
  rho = tf.constant(0.1, dtype=dtype)
  beta = 3
  W0f = W_uniform(fs[2], fs[3])

  def f(i):
    return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

  dsize = f(-1)
  n = len(fs) - 2

  # helper to create variables with numpy or TF initial value
  init_dict = {}  # {var_placeholder: init_value}
  vard = {}  # {var: util.VarInfo}

  def init_var(val, name, trainable=False, noinit=False):
    # Tensor-valued init: variable re-derives its value each time its
    # initializer runs; numpy-valued init: fed once through a placeholder.
    if isinstance(val, tf.Tensor):
      collections = [] if noinit else None
      var = tf.Variable(val, name=name, collections=collections)
    else:
      val = np.array(val)
      # NOTE(review): `assert u.is_numeric` tests the function object
      # (always truthy), not u.is_numeric(val) — likely a bug; flagged only.
      assert u.is_numeric, "Unknown type"
      holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder")
      var = tf.Variable(holder, name=name, trainable=trainable)
      init_dict[holder] = val
    # Setter op + placeholder so the variable can be poked from Python.
    var_p = tf.placeholder(var.dtype, var.shape)
    var_setter = var.assign(var_p)
    vard[var] = u.VarInfo(var_setter, var_p)
    return var

  lr = init_var(0.2, "lr")
  if purely_linear:  # need lower LR without sigmoids
    # NOTE(review): creates a second "lr" variable; the first stays in the
    # graph unused (TF uniquifies the name).
    lr = init_var(.02, "lr")

  Wf = init_var(W0f, "Wf", True)
  Wf_copy = init_var(W0f, "Wf_copy", True)
  W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
  X = init_var(X0, "X")
  W.insert(0, X)

  def sigmoid(x):
    if not purely_linear:
      return tf.sigmoid(x)
    else:
      return tf.identity(x)

  def d_sigmoid(y):
    if not purely_linear:
      return y * (1 - y)
    else:
      return 1

  def kl(x, y):
    return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

  def d_kl(x, y):
    return (1 - x) / (1 - y) - x / y

  # A[i] = activations needed to compute gradient of W[i]
  # A[n+1] = network output
  A = [None] * (n + 2)

  # A[0] is just for shape checks, assert fail on run
  # tf.assert always fails because of static assert
  # fail_node = tf.assert_equal(1, 0, message="too huge")
  fail_node = tf.Print(0, [0], "fail, this must never run")
  with tf.control_dependencies([fail_node]):
    A[0] = u.Identity(dsize, dtype=dtype)
  A[1] = W[0]
  for i in range(1, n + 1):
    A[i + 1] = sigmoid(W[i] @ A[i])

  # reconstruction error and sparsity error
  err = (A[3] - A[1])
  rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

  # B[i] = backprops needed to compute gradient of W[i]
  # B2[i] = backprops from sampled labels needed for natural gradient
  B = [None] * (n + 1)
  B2 = [None] * (n + 1)
  B[n] = err * d_sigmoid(A[n + 1])
  sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
  sampled_labels = init_var(sampled_labels_live, "sampled_labels", noinit=True)
  B2[n] = sampled_labels * d_sigmoid(A[n + 1])
  for i in range(n - 1, -1, -1):
    backprop = t(W[i + 1]) @ B[i + 1]
    backprop2 = t(W[i + 1]) @ B2[i + 1]
    if i == 1 and not drop_sparsity:
      backprop += beta * d_kl(rho, rho_hat)
      backprop2 += beta * d_kl(rho, rho_hat)
    B[i] = backprop * d_sigmoid(A[i + 1])
    B2[i] = backprop2 * d_sigmoid(A[i + 1])

  # dW[i] = gradient of W[i]
  dW = [None] * (n + 1)
  pre_dW = [None] * (n + 1)  # preconditioned dW
  pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW

  cov_A = [None] * (n + 1)  # covariance of activations[i]
  cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
  vars_svd_A = [None] * (n + 1)
  vars_svd_B2 = [None] * (n + 1)
  for i in range(1, n + 1):
    cov_A[i] = init_var(A[i] @ t(A[i]) / dsize, "cov_A%d" % (i, ))
    cov_B2[i] = init_var(B2[i] @ t(B2[i]) / dsize, "cov_B2%d" % (i, ))
    vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ))
    vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ))
    if use_tikhonov:
      whitened_A = u.regularized_inverse2(vars_svd_A[i], L=Lambda) @ A[i]
    else:
      whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
    # NOTE(review): whitens B[i] with the covariance of B2[i] — confirm this
    # mixing of true backprop with synthetic-backprop statistics is intended.
    if use_tikhonov:
      whitened_B2 = u.regularized_inverse2(vars_svd_B2[i], L=Lambda) @ B[i]
    else:
      whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]
    whitened_A_stable = u.pseudo_inverse_sqrt2(vars_svd_A[i]) @ A[i]
    whitened_B2_stable = u.pseudo_inverse_sqrt2(vars_svd_B2[i]) @ B[i]
    pre_dW[i] = (whitened_B2 @ t(whitened_A)) / dsize
    pre_dW_stable[i] = (whitened_B2_stable @ t(whitened_A_stable)) / dsize
    dW[i] = (B[i] @ t(A[i])) / dsize

  # Loss function
  reconstruction = u.L2(err) / (2 * dsize)
  sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
  # NOTE(review): W[1] appears twice — probably meant u.L2(W[1]) + u.L2(W[2]).
  L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[1]))
  loss = reconstruction
  if not drop_l2:
    loss = loss + L2
  if not drop_sparsity:
    loss = loss + sparsity

  grad_live = u.flatten(dW[1:])
  pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
  pre_grad_stable_live = u.flatten(
      pre_dW_stable[1:])  # sqrt fisher preconditioned grad
  grad = init_var(grad_live, "grad")
  pre_grad = init_var(pre_grad_live, "pre_grad")
  pre_grad_stable = init_var(pre_grad_stable_live, "pre_grad_stable")

  update_params_op = Wf.assign(Wf - lr * pre_grad).op
  update_params_stable_op = Wf.assign(Wf - lr * pre_grad_stable).op
  save_params_op = Wf_copy.assign(Wf).op
  pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
  # NOTE(review): computed from pre_grad, not pre_grad_stable — almost
  # certainly should be tf.reduce_sum(pre_grad_stable * grad).
  pre_grad_stable_dot_grad = tf.reduce_sum(pre_grad * grad)
  grad_norm = tf.reduce_sum(grad * grad)
  pre_grad_norm = u.L2(pre_grad)
  pre_grad_stable_norm = u.L2(pre_grad_stable)

  def dump_svd_info(step):
    """Dump singular values and gradient values in those coordinates."""
    for i in range(1, n + 1):
      svd = vars_svd_A[i]
      s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
      util.dump(s0, "A_%d_%d" % (i, step))
      A0 = A[i].eval()
      At0 = v0.T @ A0
      util.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
      util.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
      util.dump(s0, "As_%d_%d" % (i, step))

    for i in range(1, n + 1):
      svd = vars_svd_B2[i]
      s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
      util.dump(s0, "B2_%d_%d" % (i, step))
      B0 = B[i].eval()
      Bt0 = v0.T @ B0
      util.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
      util.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
      util.dump(s0, "Bs_%d_%d" % (i, step))

  def advance_batch():
    sess.run(sampled_labels.initializer)  # new labels for next call

  def update_covariances():
    # re-running the initializers recomputes the tensor-valued covariances
    ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
    ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
    sess.run(ops_A + ops_B2)

  def update_svds():
    if whitening_mode > 1:
      vars_svd_A[2].update()
    if whitening_mode > 2:
      vars_svd_B2[2].update()
    if whitening_mode > 3:
      vars_svd_B2[1].update()

  def init_svds():
    """Initialize our SVD to identity matrices."""
    ops = []
    for i in range(1, n + 1):
      ops.extend(vars_svd_A[i].init_ops)
      ops.extend(vars_svd_B2[i].init_ops)
    sess = tf.get_default_session()
    sess.run(ops)

  init_op = tf.global_variables_initializer()
  #  tf.get_default_graph().finalize()

  # turn off graph rewriting so the hand-built graph runs exactly as written
  from tensorflow.core.protobuf import rewriter_config_pb2
  rewrite_options = rewriter_config_pb2.RewriterConfig(
      disable_model_pruning=True,
      constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
      memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
  optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
  graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                  rewrite_options=rewrite_options)
  config = tf.ConfigProto(graph_options=graph_options)
  #sess = tf.Session(config=config)
  sess = tf.InteractiveSession(config=config)
  sess.run(Wf.initializer, feed_dict=init_dict)
  sess.run(X.initializer, feed_dict=init_dict)
  advance_batch()
  update_covariances()
  init_svds()
  sess.run(init_op, feed_dict=init_dict)  # initialize everything else

  print("Running training.")
  u.reset_time()

  step_lengths = []  # keep track of learning rates
  losses = []
  ratios = []  # actual loss decrease / expected decrease
  grad_norms = []
  pre_grad_norms = []  # preconditioned grad norm squared
  pre_grad_stable_norms = []  # sqrt preconditioned grad norms squared
  target_delta_list = []  # predicted decrease linear approximation
  target_delta2_list = []  # predicted decrease quadratic approximation
  actual_delta_list = []  # actual decrease

  # adaptive line search parameters
  alpha = 0.3  # acceptable fraction of predicted decrease
  beta = 0.8  # how much to shrink when violation
  growth_rate = 1.05  # how much to grow when too conservative

  def update_cov_A(i):
    sess.run(cov_A[i].initializer)

  def update_cov_B2(i):
    sess.run(cov_B2[i].initializer)

  # only update whitening matrix of input activations in the beginning
  if whitening_mode > 0:
    vars_svd_A[1].update()

  # compute t(delta).H.delta/2
  def hessian_quadratic(delta):
    #    update_covariances()
    W = u.unflatten(delta, fs[1:])
    W.insert(0, None)
    total = 0
    for l in range(1, n + 1):
      decrement = tf.trace(t(W[l]) @ cov_B2[l] @ W[l] @ cov_A[l])
      total += decrement
    return (total / 2).eval()

  # compute t(delta).H^-1.delta/2
  def hessian_quadratic_inv(delta):
    #    update_covariances()
    W = u.unflatten(delta, fs[1:])
    W.insert(0, None)
    total = 0
    for l in range(1, n + 1):
      invB2 = u.pseudo_inverse2(vars_svd_B2[l])
      invA = u.pseudo_inverse2(vars_svd_A[l])
      decrement = tf.trace(t(W[l]) @ invB2 @ W[l] @ invA)
      total += decrement
    return (total / 2).eval()

  # do line search, dump values as csv
  def line_search(initial_value, direction, step, num_steps):
    saved_val = tf.Variable(Wf)
    sess.run(saved_val.initializer)
    pl = tf.placeholder(dtype, shape=(), name="linesearch_p")
    assign_op = Wf.assign(initial_value - direction * step * pl)
    vals = []
    for i in range(num_steps):
      sess.run(assign_op, feed_dict={pl: i})
      vals.append(loss.eval())
    sess.run(Wf.assign(saved_val))  # restore original value
    return vals

  for step in range(num_steps):
    update_covariances()
    if step % whiten_every_n_steps == 0:
      update_svds()
    # NOTE(review): pre_grad_stable.initializer is never re-run here, so the
    # stabilized update below uses a stale pre_grad_stable — confirm.
    sess.run(grad.initializer)
    sess.run(pre_grad.initializer)

    lr0, loss0 = sess.run([lr, loss])
    save_params_op.run()

    # regular inverse becomes unstable when grad norm exceeds 1
    stabilized_mode = grad_norm.eval() < 1

    if stabilized_mode and not use_tikhonov:
      update_params_stable_op.run()
    else:
      update_params_op.run()

    loss1 = loss.eval()
    advance_batch()

    # line search stuff
    # NOTE(review): branches look swapped — stabilized mode runs the *stable*
    # update but reads pre_grad_dot_grad here; confirm intent.
    target_slope = (-pre_grad_dot_grad.eval() if stabilized_mode else
                    -pre_grad_stable_dot_grad.eval())
    target_delta = lr0 * target_slope
    target_delta_list.append(target_delta)

    # second order prediction of target delta
    # TODO: the sign is wrong, debug this
    # https://www.wolframcloud.com/objects/8f287f2f-ceb7-42f7-a599-1c03fda18f28
    if local_quadratics:
      x0 = Wf_copy.eval()
      x_opt = x0 - pre_grad.eval()
      # computes t(x)@H^-1 @(x)/2
      y_opt = loss0 - hessian_quadratic_inv(grad)
      # computes t(x)@H @(x)/2
      y_expected = hessian_quadratic(Wf - x_opt) + y_opt
      target_delta2 = y_expected - loss0
      target_delta2_list.append(target_delta2)

    actual_delta = loss1 - loss0
    actual_slope = actual_delta / lr0
    slope_ratio = actual_slope / target_slope  # between 0 and 1.01
    actual_delta_list.append(actual_delta)

    if do_line_search:
      vals1 = line_search(Wf_copy, pre_grad, lr / 100, 40)
      vals2 = line_search(Wf_copy, grad, lr / 100, 40)
      # NOTE(review): `i` is not defined in this scope (loop var is `step`)
      # — NameError if do_line_search is ever enabled.
      u.dump(vals1, "line1-%d" % (i, ))
      u.dump(vals2, "line2-%d" % (i, ))

    losses.append(loss0)
    step_lengths.append(lr0)
    ratios.append(slope_ratio)
    grad_norms.append(grad_norm.eval())
    pre_grad_norms.append(pre_grad_norm.eval())
    pre_grad_stable_norms.append(pre_grad_stable_norm.eval())

    if step % report_frequency == 0:
      print(
          "Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f"
          % (step, loss0, target_delta, actual_delta, slope_ratio,
             grad_norm.eval(), pre_grad_norm.eval()))

    if adaptive_step_frequency and adaptive_step and step > adaptive_step_burn_in:
      # shrink if wrong prediction, don't shrink if prediction is tiny
      if slope_ratio < alpha and abs(
          target_delta) > 1e-6 and adaptive_step:
        print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
        print(
            "Slope optimality %.2f, shrinking learning rate to %.2f" % (
                slope_ratio,
                lr0 * beta, ))
        sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * beta})

      # grow learning rate, slope_ratio .99 worked best for gradient
      # NOTE(review): `i` is undefined here as well (loop var is `step`)
      # — NameError when this branch is reached.
      elif step > 0 and i % 50 == 0 and slope_ratio > 0.90 and adaptive_step:
        print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
        print("Growing learning rate to %.2f" % (lr0 * growth_rate))
        sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * growth_rate})

    u.record_time()

  # check against expected loss
  if 'Apple' in sys.version:
    pass
    #    u.dump(losses, "kfac_small_final_mac.csv")
    targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",")
  else:
    pass
    #    u.dump(losses, "kfac_small_final_linux.csv")
    targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",")

  u.check_equal(targets, losses[:len(targets)], rtol=1e-1)
  u.summarize_time()
  print("Test passed")
def cost_and_grad(W0f=None, fs=None, lambda_=3e-3, rho=0.1, beta=3, X0=None,
                  lr=0.1):
  """Construct sparse autoencoder loss and gradient.

  Args:
    W0f: initial value of weights (flattened representation); defaults to
      W_uniform over the hidden/visible sizes taken from fs
    fs: list of sizes [dsize, visible, hidden, visible]; defaults to the
      28x28/196 autoencoder shape with dsize taken from X0's columns
    lambda_: weight of the L2 penalty
    rho: global feature sparsity target
    beta: weight on sparsity penalty
    X0: value of X (aka W[0]); required when fs is not given
    lr: learning rate baked into the returned train op

  Returns:
    cost, train_step

  Raises:
    ValueError: if neither fs nor X0 is given (sizes cannot be determined).
  """
  np.random.seed(0)
  tf.set_random_seed(0)

  dtype = np.float32
  # FIX: the old `if not fs:` / `if not W0f:` checks raise
  # "truth value of an array is ambiguous" for numpy inputs, and the old
  # default fs referenced `dsize` before assignment (UnboundLocalError,
  # since dsize is only assigned below). Compare against None and derive
  # dsize from X0's column count instead.
  if fs is None:
    if X0 is None:
      raise ValueError("need either fs or X0 to determine dsize")
    fs = [X0.shape[1], 28 * 28, 196, 28 * 28]
  if W0f is None:
    W0f = W_uniform(fs[2], fs[3])
  rho = tf.constant(rho, dtype=dtype)

  def f(i):
    return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

  dsize = f(-1)
  n = len(fs) - 2

  # helper to create variables fed from numpy initial values
  init_dict = {}

  def init_var(val, name, trainable=True):
    holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder")
    var = tf.Variable(holder, name=name + "_var", trainable=trainable)
    init_dict[holder] = val
    return var

  Wf = init_var(W0f, "Wf")
  Wf_copy = init_var(W0f, "Wf_copy")
  W = u.unflatten(Wf, fs[1:])
  X = init_var(X0, "X", False)
  W.insert(0, X)

  def sigmoid(x):
    return tf.sigmoid(x)

  def d_sigmoid(y):
    return y * (1 - y)

  def kl(x, y):
    return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

  def d_kl(x, y):
    return (1 - x) / (1 - y) - x / y

  # A[i] = activations needed to compute gradient of W[i]
  # A[n+1] = network output
  A = [None] * (n + 2)
  A[0] = u.Identity(dsize, dtype=dtype)
  A[1] = W[0]
  for i in range(1, n + 1):
    A[i + 1] = sigmoid(W[i] @ A[i])

  # reconstruction error and sparsity error
  err = (A[3] - A[1])
  rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

  # B[i] = backprops needed to compute gradient of W[i]
  B = [None] * (n + 1)
  B[n] = err * d_sigmoid(A[n + 1])
  for i in range(n - 1, -1, -1):
    backprop = t(W[i + 1]) @ B[i + 1]
    if i == 1:
      backprop += beta * d_kl(rho, rho_hat)
    B[i] = backprop * d_sigmoid(A[i + 1])

  # dW[i] = gradient of W[i]
  dW = [None] * (n + 1)
  for i in range(n + 1):
    dW[i] = (B[i] @ t(A[i])) / dsize

  # Cost function
  reconstruction = u.L2(err) / (2 * dsize)
  sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
  # NOTE(review): W[1] appears twice — possibly meant W[1] and W[2];
  # kept as-is to preserve numbers.
  L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[1]))
  cost = reconstruction + sparsity + L2

  grad = u.flatten(dW[1:])
  # two-phase update through Wf_copy avoids a read/write race on Wf
  copy_op = Wf_copy.assign(Wf - lr * grad)
  with tf.control_dependencies([copy_op]):
    train_op = Wf.assign(Wf_copy)

  sess = tf.InteractiveSession()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
  return cost, train_op
def rotations2_natural_empirical():
  """Natural-gradient descent with the empirical Fisher on the large
  rotations dataset; prints and times per-step losses.

  Fixes over previous version:
  - the output-shape check was `assert expr, msg` misuse: the actual shape
    comparison sat in the (never-evaluated) message position, so the assert
    checked nothing. Rewritten as a real tuple comparison.
  - removed a dead `Wf_copy` variable that was immediately shadowed by the
    zero-initialized one created below.
  """
  tf.reset_default_graph()

  # override kr with no-shape-inferring version
  def kr(A, B):
    return u.kronecker(A, B, do_shape_inference=False)

  X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
  Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
  W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
  fs = np.genfromtxt('data/large_rotations2_fs.csv',
                     delimiter=",").astype(np.int32)
  n = len(fs) - 2  # number of layers

  def f(i):
    return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

  dsize = X0.shape[1]
  assert f(-1) == dsize

  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  init_dict = {Wf_holder: W0f}

  # initialize data + layers
  # W[0] is input matrix (X), W[n] is last matrix
  # A[1] has activations for W[1], equal to W[0]=X
  # A[n+1] has predictions
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)

  A = [0] * (n + 2)
  A[0] = u.Identity(dsize)
  for i in range(n + 1):
    # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
    A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

  # input dimensions match
  assert W[0].get_shape() == X0.shape
  # output dimensions match (FIX: was `assert expr, msg` misuse — the
  # comparison was in the message position and never evaluated)
  assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
  assert A[n + 1].get_shape() == Y0.shape

  err = Y - A[n + 1]
  loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
  lr = tf.Variable(0.000001, dtype=dtype)

  # create backprop matrices
  # B[i] has backprop for matrix i
  B = [0] * (n + 1)
  B[n] = -err / dsize
  for i in range(n - 1, -1, -1):
    B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))

  # Create gradient update. Make copy of variables and split update into
  # two run calls. Using single set of variables will gives updates that
  # occasionally produce wrong results/NaN's because of data race
  dW = [0] * (n + 1)
  updates1 = [0] * (n + 1)  # compute updated value into Wcopy
  updates2 = [0] * (n + 1)  # copy value back into W
  Wcopy = [0] * (n + 1)
  for i in range(n + 1):
    Wi_name = "Wcopy" + str(i)
    Wi_shape = (fs[i + 1], fs[i])
    Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
    Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))

  del dW[0]  # get rid of W[0] update

  # construct flattened gradient update vector
  dWf = tf.concat([vec(grad) for grad in dW], axis=0)

  # inverse fisher preconditioner (empirical Fisher from per-example grads)
  grads = tf.concat([u.khatri_rao(A[i], B[i]) for i in range(1, n + 1)],
                    axis=0)
  fisher = grads @ tf.transpose(grads) / dsize
  ifisher = u.pseudo_inverse(fisher)

  Wf_copy = tf.Variable(
      tf.zeros(dtype=dtype, shape=Wf.shape, name="Wf_copy_init"),
      name="Wf_copy")
  new_val_matrix = Wf - lr * (ifisher @ dWf)
  train_op1 = Wf_copy.assign(new_val_matrix)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  observed_losses = []
  u.reset_time()
  for i in range(10):
    loss0 = sess.run(loss)
    print(loss0)
    observed_losses.append(loss0)
    sess.run(train_op1)
    sess.run(train_op2)
    u.record_time()

  u.summarize_time()
  u.summarize_graph()
def rotations2_newton_kfac():
    """Train large_rotations2 with a KFAC (Kronecker-factored) approximation
    of the inverse Hessian.

    Diagonal blocks of the inverse Hessian are approximated as
    kron(pinv(A A'), pinv(B B' / dsize)); off-diagonal blocks are zero.
    Runs 10 update steps, printing the loss at each step.

    Fix over the previous version: the `def f(i)` helper together with the
    `dsize` computation and its assert were duplicated verbatim; one copy
    suffices. Also dropped the unused `elapsed_times` list.
    """
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's; A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's (standard backprop) and Bn's (Newton-modified backprop)
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # reuse Hess tensor calculation in order to get off-diag block sizes
            dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                acov = A[i] @ t(A[j])
                bcov = (Bn[i] @ t(Bn[j])) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]
    ihess = u.concat_blocks(iblocks)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(10):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()
    u.summarize_time()
    u.summarize_graph()
def rotations2_newton_bd():
    """Train large_rotations2 with a block-diagonal Newton step.

    Builds the full per-layer Hessian blocks (including the commutation-matrix
    `Kmat` correction terms), inverts only the diagonal blocks via
    u.block_diagonal_inverse, and runs 20 update steps, printing the loss.
    """
    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's; W[0] is the input matrix X
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's (forward activations)
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's (standard backprop) and Bn's (Newton-modified backprop)
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's: U[bottom][top] is the transposed weight-chain product
    # t(W[bottom]) @ ... @ t(W[top]); an empty range leaves it as identity.
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
            for i in range(bottom, top + 1):
                prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # Gauss-Newton-like term, identical for all block positions
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)),
                                 dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))
            # Kmat is the commutation matrix pairing the vec orderings
            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]

    # invert only the diagonal blocks -> block-diagonal Newton preconditioner
    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # two-phase update (compute into copy, then copy back) to avoid data races
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()
    u.summarize_time()
    u.summarize_graph()
def rotations1_gradient_test():
    """Plain gradient descent on large_rotations1; the observed losses must
    match the reference values exported from the accompanying notebook."""
    # https://www.wolframcloud.com/objects/ff6ecaf0-fccd-44e3-b26f-970d8fc2a57c
    tf.reset_default_graph()

    X0 = np.genfromtxt('data/large_rotations1_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations1_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations1_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations1_fs.csv',
                       delimiter=",").astype(np.int32)
    num_layers = len(fs) - 2

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # sanity-check the flattened initial weights (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't hold the data matrix
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # per-layer weight matrices; W[0] is the input matrix X
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # forward activations; A[1] == X, A[num_layers+1] holds predictions
    A = [0] * (num_layers + 2)
    A[0] = u.Identity(dsize)
    for layer in range(num_layers + 1):
        A[layer + 1] = tf.matmul(W[layer], A[layer], name="A" + str(layer + 1))

    assert W[0].get_shape() == X0.shape
    assert A[num_layers + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[num_layers + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr0 = np.genfromtxt('data/large_rotations1_gradient_lr.csv')
    lr = tf.Variable(lr0, dtype=dtype)

    # backprop matrices
    B = [0] * (num_layers + 1)
    B[num_layers] = -err / dsize
    for layer in range(num_layers - 1, -1, -1):
        B[layer] = t(W[layer + 1]) @ B[layer + 1]

    # per-layer gradients and their flattened concatenation
    dW = [0] * (num_layers + 1)
    for layer in range(num_layers + 1):
        dW[layer] = tf.matmul(B[layer], tf.transpose(A[layer]),
                              name="dW" + str(layer))
    del dW[0]  # W[0] is the input matrix, it gets no update
    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)

    # two-phase update: compute into the copy, then write back
    Wf_new = Wf - lr * dWf
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    expected_losses = np.loadtxt("data/large_rotations1_gradient_losses.csv",
                                 delimiter=",")
    observed_losses = []
    # from accompanying notebook
    # {0.102522, 0.028124, 0.00907214, 0.00418929, 0.00293379,
    for _ in range(10):
        observed_losses.append(sess.run([loss])[0])
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
def main():
    """Train an MNIST sparse-autoencoder-style network with a KFAC-whitened
    natural-gradient update, then assert the final loss and step timing fall
    in the expected ranges.

    Builds the graph by hand (forward activations A, backprops B, synthetic
    backprops B2 for the Fisher factors), keeps per-layer covariance and SVD
    variables, and runs 40 training steps with manual session management.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    dtype = np.float32
    train_images = u.get_mnist_images()

    dsize = 10000
    patches = train_images[:, :dsize].astype(dtype)
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
    X0 = patches
    lambda_ = 3e-3
    rho = tf.constant(0.1, dtype=dtype)
    beta = 3
    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = f(-1)
    n = len(fs) - 2

    # helper to create variables with numpy or TF initial value
    init_dict = {}  # {var_placeholder: init_value}
    vard = {}  # {var: u.VarInfo}

    def init_var(val, name, trainable=False, noinit=False):
        # Tensor-valued inits become variables whose .initializer re-evaluates
        # the tensor; numpy inits go through a placeholder + init_dict.
        if isinstance(val, tf.Tensor):
            collections = [] if noinit else None
            var = tf.Variable(val, name=name, collections=collections)
        else:
            val = np.array(val)
            # NOTE(review): this asserts the function object itself, which is
            # always truthy — likely meant `u.is_numeric(val)`; left unchanged.
            assert u.is_numeric, "Unknown type"
            holder = tf.placeholder(dtype, shape=val.shape,
                                    name=name + "_holder")
            var = tf.Variable(holder, name=name, trainable=trainable)
            init_dict[holder] = val
        var_p = tf.placeholder(var.dtype, var.shape)
        var_setter = var.assign(var_p)
        vard[var] = u.VarInfo(var_setter, var_p)
        return var

    lr = init_var(0.2, "lr")
    Wf = init_var(W0f, "Wf", True)
    Wf_copy = init_var(W0f, "Wf_copy", True)
    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    X = init_var(X0, "X")
    W.insert(0, X)

    def sigmoid(x):
        return tf.sigmoid(x)

    def d_sigmoid(y):
        # derivative of sigmoid expressed through its output value
        return y * (1 - y)

    def kl(x, y):
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

    def d_kl(x, y):
        return (1 - x) / (1 - y) - x / y

    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)

    # A[0] must never be evaluated; the control dependency on fail_node
    # makes any accidental evaluation print loudly.
    fail_node = tf.Print(0, [0], "fail, this must never run")
    with tf.control_dependencies([fail_node]):
        A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    # keep_dims is the pre-TF-1.5 spelling of keepdims
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = backprops from sampled labels needed for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    sampled_labels = init_var(sampled_labels_live, "sampled_labels",
                              noinit=True)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    for i in range(1, n + 1):
        # Tikhonov-regularized covariances; .initializer recomputes them
        cov_op = A[i] @ t(A[i]) / dsize + lambda_ * u.Identity(A[i].shape[0])
        cov_A[i] = init_var(cov_op, "cov_A%d" % (i, ))
        cov_op = B2[i] @ t(B2[i]) / dsize + lambda_ * u.Identity(
            B2[i].shape[0])
        cov_B2[i] = init_var(cov_op, "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i],
                                     "svd_A_%d" % (i, ),
                                     do_inverses=True)
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i],
                                      "svd_B2_%d" % (i, ),
                                      do_inverses=True)
        # whiten activations and backprops with the cached inverse factors
        whitened_A = vars_svd_A[i].inv @ A[i]
        whitened_B = vars_svd_B2[i].inv @ B[i]
        pre_dW[i] = (whitened_B @ t(whitened_A)) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Loss function
    reconstruction = u.L2(err) / (2 * dsize)
    loss = reconstruction

    grad_live = u.flatten(dW[1:])
    pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
    grad = init_var(grad_live, "grad")
    pre_grad = init_var(pre_grad_live, "pre_grad")

    update_params_op = Wf.assign(Wf - lr * pre_grad).op
    save_params_op = Wf_copy.assign(Wf).op
    pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
    grad_norm = tf.reduce_sum(grad * grad)
    pre_grad_norm = u.L2(pre_grad)

    def dump_svd_info(step):
        """Dump singular values and gradient values in those coordinates."""
        for i in range(1, n + 1):
            svd = vars_svd_A[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "A_%d_%d" % (i, step))
            A0 = A[i].eval()
            At0 = v0.T @ A0
            u.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
            u.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
            u.dump(s0, "As_%d_%d" % (i, step))

        for i in range(1, n + 1):
            svd = vars_svd_B2[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "B2_%d_%d" % (i, step))
            B0 = B[i].eval()
            Bt0 = v0.T @ B0
            u.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
            u.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
            u.dump(s0, "Bs_%d_%d" % (i, step))

    def advance_batch():
        sess.run(sampled_labels.initializer)  # new labels for next call

    def update_covariances():
        ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
        ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
        sess.run(ops_A + ops_B2)

    def update_svds():
        # only a subset of factors is refreshed each step
        vars_svd_A[2].update()
        vars_svd_B2[2].update()
        vars_svd_B2[1].update()

    def init_svds():
        """Initialize our SVD to identity matrices."""
        ops = []
        for i in range(1, n + 1):
            ops.extend(vars_svd_A[i].init_ops)
            ops.extend(vars_svd_B2[i].init_ops)
        sess = tf.get_default_session()
        sess.run(ops)

    init_op = tf.global_variables_initializer()

    # disable graph rewriting so manually ordered initializers stay as written
    from tensorflow.core.protobuf import rewriter_config_pb2
    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    config = tf.ConfigProto(graph_options=graph_options)
    sess = tf.InteractiveSession(config=config)
    sess.run(Wf.initializer, feed_dict=init_dict)
    sess.run(X.initializer, feed_dict=init_dict)
    advance_batch()
    update_covariances()
    init_svds()
    sess.run(init_op, feed_dict=init_dict)  # initialize everything else

    print("Running training.")
    u.reset_time()

    step_lengths = []  # keep track of learning rates
    losses = []

    # adaptive line search parameters
    alpha = 0.3  # acceptable fraction of predicted decrease
    beta = 0.8  # how much to shrink when violation
    growth_rate = 1.05  # how much to grow when too conservative

    def update_cov_A(i):
        sess.run(cov_A[i].initializer)

    def update_cov_B2(i):
        sess.run(cov_B2[i].initializer)

    # only update whitening matrix of input activations in the beginning
    vars_svd_A[1].update()

    for step in range(40):
        update_covariances()
        update_svds()
        sess.run(grad.initializer)
        sess.run(pre_grad.initializer)
        lr0, loss0 = sess.run([lr, loss])
        update_params_op.run()
        advance_batch()

        losses.append(loss0)
        step_lengths.append(lr0)

        print("Step %d loss %.2f" % (step, loss0))
        u.record_time()

    assert losses[-1] < 0.59
    assert losses[-1] > 0.57
    # NOTE(review): hardware-specific timing bound (calibrated on a GTX 1080)
    assert 20e-3 < min(
        u.global_time_list) < 50e-3, "Time should be 40ms on 1080"
    u.summarize_time()
    print("Test passed")
def model_creator(batch_size, name='defaultmodel', dtype=np.float32):
    """Create MNIST autoencoder model. Dataset is part of model.

    Builds a deep (9-matrix) autoencoder with per-layer KFAC factors
    (activation/backprop covariances + SVD wrappers) and attaches loss,
    validation loss, batch-advance and initialization closures to the
    returned Model object. Relies on module-level flags: purely_relu,
    purely_linear, use_fixed_labels, regularized_svd, use_tikhonov, LAMBDA.
    """
    model = Model(name)

    init_dict = {}
    global_vars = []
    local_vars = []

    # TODO: factor out to reuse between scripts
    # TODO: change feed_dict logic to reuse value provided to VarStruct
    # current situation makes reinitialization of global variable change
    # it's value, counterinituitive
    def init_var(val, name, is_global=False):
        """Helper to create variables with numpy or TF initial values."""
        if isinstance(val, tf.Tensor):
            var = u.get_variable(name=name, initializer=val, reuse=is_global)
        else:
            val = np.array(val)
            assert u.is_numeric(val), "Non-numeric type."
            var_struct = u.get_var(name=name, initializer=val,
                                   reuse=is_global)
            holder = var_struct.val_
            init_dict[holder] = val
            var = var_struct.var
        if is_global:
            global_vars.append(var)
        else:
            local_vars.append(var)
        return var

    # TODO: get rid of purely_relu
    def nonlin(x):
        if purely_relu:
            return tf.nn.relu(x)
        elif purely_linear:
            return tf.identity(x)
        else:
            return tf.sigmoid(x)

    # TODO: rename into "nonlin_d"
    def d_nonlin(y):
        # derivative of nonlin expressed through its output value
        if purely_relu:
            return u.relu_mask(y)
        elif purely_linear:
            return 1
        else:
            return y * (1 - y)

    train_images = load_MNIST.load_MNIST_images(
        'data/train-images-idx3-ubyte').astype(dtype)
    patches = train_images[:, :batch_size]
    test_patches = train_images[:, -batch_size:]

    # NOTE(review): `dsize` is not defined in this function — this reads a
    # module-level global (or raises NameError); possibly meant `batch_size`.
    assert dsize < 25000
    fs = [
        batch_size, 28 * 28, 1024, 1024, 1024, 196, 1024, 1024, 1024, 28 * 28
    ]

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    n = len(fs) - 2

    # W[0] is the data matrix X; A[i+1] are activations after layer i
    X = init_var(patches, "X", is_global=False)
    W = [None] * n
    W.insert(0, X)
    A = [None] * (n + 2)
    A[1] = W[0]
    for i in range(1, n + 1):
        init_val = ng_init(f(i), f(i - 1)).astype(dtype)
        W[i] = init_var(init_val, "W_%d" % (i, ), is_global=True)
        A[i + 1] = nonlin(kfac_lib.matmul(W[i], A[i]))
    err = A[n + 1] - A[1]

    # create test error eval
    layer = init_var(test_patches, "X_test", is_global=False)
    for i in range(1, n + 1):
        layer = nonlin(W[i] @ layer)
    verr = (layer - test_patches)
    model.vloss = u.L2(verr) / (2 * batch_size)

    # manually compute backprop to use for sanity checking
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_nonlin(A[n + 1])
    _sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype,
                                            seed=0)
    if use_fixed_labels:
        _sampled_labels_live = tf.ones(shape=(f(n), f(-1)), dtype=dtype)
    _sampled_labels = init_var(_sampled_labels_live, "to_be_deleted",
                               is_global=False)
    B2[n] = _sampled_labels * d_nonlin(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        B[i] = backprop * d_nonlin(A[i + 1])
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B2[i] = backprop2 * d_nonlin(A[i + 1])

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    dW = [None] * (n + 1)
    dW2 = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    for i in range(1, n + 1):
        if regularized_svd:
            cov_A[i] = init_var(
                A[i] @ t(A[i]) / batch_size + LAMBDA * u.Identity(f(i - 1)),
                "cov_A%d" % (i, ))
            cov_B2[i] = init_var(
                B2[i] @ t(B2[i]) / batch_size + LAMBDA * u.Identity(f(i)),
                "cov_B2%d" % (i, ))
        else:
            cov_A[i] = init_var(A[i] @ t(A[i]) / batch_size,
                                "cov_A%d" % (i, ))
            cov_B2[i] = init_var(B2[i] @ t(B2[i]) / batch_size,
                                 "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ))
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ))
        if use_tikhonov:
            whitened_A = u.regularized_inverse3(vars_svd_A[i],
                                                L=LAMBDA) @ A[i]
            whitened_B2 = u.regularized_inverse3(vars_svd_B2[i],
                                                 L=LAMBDA) @ B[i]
        else:
            whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
            whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]
        dW[i] = (B[i] @ t(A[i])) / batch_size
        dW2[i] = B[i] @ t(A[i])
        pre_dW[i] = (whitened_B2 @ t(whitened_A)) / batch_size

    #  model.extra['A'] = A
    #  model.extra['B'] = B
    #  model.extra['B2'] = B2
    #  model.extra['cov_A'] = cov_A
    #  model.extra['cov_B2'] = cov_B2
    #  model.extra['vars_svd_A'] = vars_svd_A
    #  model.extra['vars_svd_B2'] = vars_svd_B2
    #  model.extra['W'] = W
    #  model.extra['dW'] = dW
    #  model.extra['dW2'] = dW2
    #  model.extra['pre_dW'] = pre_dW

    model.loss = u.L2(err) / (2 * batch_size)
    sampled_labels_live = A[n + 1] + tf.random_normal(
        (f(n), f(-1)), dtype=dtype, seed=0)
    if use_fixed_labels:
        sampled_labels_live = A[n + 1] + tf.ones(shape=(f(n), f(-1)),
                                                 dtype=dtype)
    sampled_labels = init_var(sampled_labels_live, "sampled_labels",
                              is_global=False)
    err2 = A[n + 1] - sampled_labels
    model.loss2 = u.L2(err2) / (2 * batch_size)

    model.global_vars = global_vars
    model.local_vars = local_vars
    model.trainable_vars = W[1:]

    def advance_batch():
        sess = tf.get_default_session()
        # TODO: get rid of _sampled_labels
        sess.run([sampled_labels.initializer, _sampled_labels.initializer])

    model.advance_batch = advance_batch

    # TODO: refactor this to take initial values out of Var struct
    #global_init_op = tf.group(*[v.initializer for v in global_vars])
    global_init_ops = [v.initializer for v in global_vars]
    global_init_op = tf.group(*[v.initializer for v in global_vars])
    global_init_query_op = [
        tf.logical_not(tf.is_variable_initialized(v)) for v in global_vars
    ]

    def initialize_global_vars(verbose=False, reinitialize=False):
        """If reinitialize is false, will not reinitialize variables already
        initialized."""
        sess = tf.get_default_session()
        if not reinitialize:
            uninited = sess.run(global_init_query_op)
            # use numpy boolean indexing to select list of initializers to run
            to_initialize = list(np.asarray(global_init_ops)[uninited])
        else:
            to_initialize = global_init_ops
        if verbose:
            print("Initializing following:")
            for v in to_initialize:
                print("   " + v.name)
        sess.run(to_initialize, feed_dict=init_dict)

    model.initialize_global_vars = initialize_global_vars

    local_init_op = tf.group(*[v.initializer for v in local_vars])

    def initialize_local_vars():
        sess = tf.get_default_session()
        sess.run(X.initializer, feed_dict=init_dict)  # A's depend on X
        sess.run(_sampled_labels.initializer, feed_dict=init_dict)
        sess.run(local_init_op, feed_dict=init_dict)

    model.initialize_local_vars = initialize_local_vars

    return model
def kl(x, y): return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y)) def d_kl(x, y): return (1 - x) / (1 - y) - x / y # A[i] = activations needed to compute gradient of W[i] # A[n+1] = network output A = [None] * (n + 2) # A[0] is just for shape checks, assert fail on run # Have to disable the assert test, tensorflow 1.1 tries to run it # using Python static assertion testing with tf.control_dependencies([tf.assert_equal(0, 0, message="too huge")]): A[0] = u.Identity(dsize, dtype=dtype) A[1] = W[0] for i in range(1, n + 1): A[i + 1] = sigmoid(W[i] @ A[i]) # reconstruction error and sparsity error err = (A[3] - A[1]) rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize # B[i] = backprops needed to compute gradient of W[i] # B2[i] = backprops from sampled labels needed for natural gradient B = [None] * (n + 1) B2 = [None] * (n + 1) B[n] = err * d_sigmoid(A[n + 1]) sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0) if use_fixed_labels:
def model_creator(batch_size, name='defaultmodel', dtype=np.float32):
    """Create MNIST autoencoder model. Dataset is part of model.

    Shallow (3-matrix) variant of the autoencoder builder: same KFAC factor
    bookkeeping as the deep version, but also publishes the feed dict via
    the `hack_global_init_dict` module global. Relies on module-level flags:
    purely_relu, purely_linear, use_fixed_labels, regularized_svd,
    use_tikhonov, LAMBDA.
    """
    global hack_global_init_dict

    model = Model(name)

    # TODO: actually use batch_size
    init_dict = {}  # todo: rename to feed_dict?
    global_vars = []
    local_vars = []

    # TODO: rename to make_var
    def init_var(val, name, is_global=False):
        """Helper to create variables with numpy or TF initial values."""
        if isinstance(val, tf.Tensor):
            var = u.get_variable(name=name, initializer=val, reuse=is_global)
        else:
            val = np.array(val)
            assert u.is_numeric(val), "Non-numeric type."
            var_struct = u.get_var(name=name, initializer=val,
                                   reuse=is_global)
            holder = var_struct.val_
            init_dict[holder] = val
            var = var_struct.var
        if is_global:
            global_vars.append(var)
        else:
            local_vars.append(var)
        return var

    # TODO: get rid of purely_relu
    def nonlin(x):
        if purely_relu:
            return tf.nn.relu(x)
        elif purely_linear:
            return tf.identity(x)
        else:
            return tf.sigmoid(x)

    # TODO: rename into "nonlin_d"
    def d_nonlin(y):
        # derivative of nonlin expressed through its output value
        if purely_relu:
            return u.relu_mask(y)
        elif purely_linear:
            return 1
        else:
            return y * (1 - y)

    train_images = load_MNIST.load_MNIST_images(
        'data/train-images-idx3-ubyte').astype(dtype)
    patches = train_images[:, :batch_size]
    fs = [batch_size, 28 * 28, 196, 28 * 28]

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    n = len(fs) - 2

    # W[0] is the data matrix X; A[i+1] are activations after layer i
    X = init_var(patches, "X", is_global=False)
    W = [None] * n
    W.insert(0, X)
    A = [None] * (n + 2)
    A[1] = W[0]
    W0f_old = W_uniform(fs[2],
                        fs[3]).astype(dtype)  # to match previous generation
    W0s_old = u.unflatten(W0f_old, fs[1:])  # perftodo: this creates transposes
    for i in range(1, n + 1):
        #    temp = init_var(ng_init(f(i), f(i-1)), "W_%d"%(i,), is_global=True)
        #    init_val1 = W0s_old[i-1]
        init_val = ng_init(f(i), f(i - 1)).astype(dtype)
        W[i] = init_var(init_val, "W_%d" % (i, ), is_global=True)
        A[i + 1] = nonlin(kfac_lib.matmul(W[i], A[i]))
    err = A[n + 1] - A[1]

    # manually compute backprop to use for sanity checking
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_nonlin(A[n + 1])
    _sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype,
                                            seed=0)
    if use_fixed_labels:
        _sampled_labels_live = tf.ones(shape=(f(n), f(-1)), dtype=dtype)
    _sampled_labels = init_var(_sampled_labels_live, "to_be_deleted",
                               is_global=False)
    B2[n] = _sampled_labels * d_nonlin(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        B[i] = backprop * d_nonlin(A[i + 1])
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B2[i] = backprop2 * d_nonlin(A[i + 1])

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    dW = [None] * (n + 1)
    dW2 = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    for i in range(1, n + 1):
        if regularized_svd:
            cov_A[i] = init_var(
                A[i] @ t(A[i]) / batch_size + LAMBDA * u.Identity(f(i - 1)),
                "cov_A%d" % (i, ))
            cov_B2[i] = init_var(
                B2[i] @ t(B2[i]) / batch_size + LAMBDA * u.Identity(f(i)),
                "cov_B2%d" % (i, ))
        else:
            cov_A[i] = init_var(A[i] @ t(A[i]) / batch_size,
                                "cov_A%d" % (i, ))
            cov_B2[i] = init_var(B2[i] @ t(B2[i]) / batch_size,
                                 "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ))
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ))
        if use_tikhonov:
            whitened_A = u.regularized_inverse3(vars_svd_A[i],
                                                L=LAMBDA) @ A[i]
            whitened_B2 = u.regularized_inverse3(vars_svd_B2[i],
                                                 L=LAMBDA) @ B[i]
        else:
            whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
            whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]
        dW[i] = (B[i] @ t(A[i])) / batch_size
        dW2[i] = B[i] @ t(A[i])
        pre_dW[i] = (whitened_B2 @ t(whitened_A)) / batch_size

    #  model.extra['A'] = A
    #  model.extra['B'] = B
    #  model.extra['B2'] = B2
    #  model.extra['cov_A'] = cov_A
    #  model.extra['cov_B2'] = cov_B2
    #  model.extra['vars_svd_A'] = vars_svd_A
    #  model.extra['vars_svd_B2'] = vars_svd_B2
    #  model.extra['W'] = W
    #  model.extra['dW'] = dW
    #  model.extra['dW2'] = dW2
    #  model.extra['pre_dW'] = pre_dW

    model.loss = u.L2(err) / (2 * batch_size)
    sampled_labels_live = A[n + 1] + tf.random_normal(
        (f(n), f(-1)), dtype=dtype, seed=0)
    if use_fixed_labels:
        sampled_labels_live = A[n + 1] + tf.ones(shape=(f(n), f(-1)),
                                                 dtype=dtype)
    sampled_labels = init_var(sampled_labels_live, "sampled_labels",
                              is_global=False)
    err2 = A[n + 1] - sampled_labels
    model.loss2 = u.L2(err2) / (2 * batch_size)

    model.global_vars = global_vars
    model.local_vars = local_vars
    model.trainable_vars = W[1:]

    def advance_batch():
        sess = tf.get_default_session()
        # TODO: get rid of _sampled_labels
        sess.run([sampled_labels.initializer, _sampled_labels.initializer])

    model.advance_batch = advance_batch

    global_init_op = tf.group(*[v.initializer for v in global_vars])

    def initialize_global_vars():
        sess = tf.get_default_session()
        sess.run(global_init_op, feed_dict=init_dict)

    model.initialize_global_vars = initialize_global_vars

    local_init_op = tf.group(*[v.initializer for v in local_vars])

    def initialize_local_vars():
        sess = tf.get_default_session()
        sess.run(X.initializer, feed_dict=init_dict)  # A's depend on X
        sess.run(_sampled_labels.initializer, feed_dict=init_dict)
        sess.run(local_init_op, feed_dict=init_dict)

    model.initialize_local_vars = initialize_local_vars

    hack_global_init_dict = init_dict

    return model
return 1 def kl(x, y): return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y)) def d_kl(x, y): return (1-x)/(1-y) - x/y # A[i] = activations needed to compute gradient of W[i] # A[n+1] = network output A = [None]*(n+2) # A[0] is just for shape checks, assert fail on run # Have to disable the assert test, tensorflow 1.1 tries to run it # using Python static assertion testing with tf.control_dependencies([tf.assert_equal(0, 0, message="too huge")]): A[0] = u.Identity(dsize, dtype=dtype) A[1] = W[0] for i in range(1, n+1): A[i+1] = sigmoid(W[i] @ A[i]) # reconstruction error and sparsity error err = (A[3] - A[1]) rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True)/dsize # B[i] = backprops needed to compute gradient of W[i] # B2[i] = backprops from sampled labels needed for natural gradient B = [None]*(n+1) B2 = [None]*(n+1) B[n] = err*d_sigmoid(A[n+1]) sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0) if use_fixed_labels:
def rotations2_natural_sampled_kfac(num_samples=1):
  """Train the large_rotations2 network with sampled-KFAC natural gradient.

  Builds a TF1 static graph: forward activations A and backprops B for the
  true loss, plus augmented matrices A2/B2 driven by `num_samples` synthetic
  (random-normal) labels.  From A2/B2 it assembles a block-diagonal
  Kronecker-factored inverse Fisher and uses it to precondition the
  flattened gradient.  Runs 20 update steps, printing the loss each step.

  Args:
    num_samples: number of synthetic samples used to estimate the Kronecker
      factors (the dataset is replicated this many times along axis 1).

  Side effects: resets the default graph, seeds numpy/TF RNGs, reads
  data/large_rotations2_*.csv from disk, creates a tf.Session and runs it.
  """
  tf.reset_default_graph()
  np.random.seed(0)
  tf.set_random_seed(0)

  # override kr with no-shape-inferring version
  def kr(A, B):
    return u.kronecker(A, B, do_shape_inference=False)

  X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
  Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
  W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
  fs = np.genfromtxt('data/large_rotations2_fs.csv',
                     delimiter=",").astype(np.int32)
  n = len(fs) - 2   # number of layers

  def f(i):
    return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

  dsize = X0.shape[1]
  assert f(-1) == dsize

  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  # NOTE(review): this "Wf_copy" is shadowed by the zero-initialized
  # Wf_copy variable created further down; kept to preserve graph structure.
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # initialize data + layers
  # W[0] is input matrix (X), W[n] is last matrix
  # A[1] has activations for W[1], equal to W[0]=X
  # A[n+1] has predictions
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)

  A = [0] * (n + 2)
  A2 = [0] * (n + 2)  # augmented forward props for natural gradient
  A[0] = u.Identity(dsize)
  A2[0] = u.Identity(dsize * num_samples)
  for i in range(n + 1):
    # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
    A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))
    if i == 0:
      # replicate dataset multiple times corresponding to number of samples
      A2[i + 1] = tf.concat([W[0]] * num_samples, axis=1)
    else:
      A2[i + 1] = tf.matmul(W[i], A2[i], name="A2" + str(i + 1))

  # input dimensions match
  assert W[0].get_shape() == X0.shape
  # output dimensions match
  # BUG FIX: the original `assert a, b == Y0.shape` parsed the comparison
  # as the assert *message*, so the shape check never actually ran.
  # Assert on the tuple so both dimensions are compared against Y0.shape.
  assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
  assert A[n + 1].get_shape() == Y0.shape

  err = Y - A[n + 1]
  loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
  # lower learning rate by 10x
  lr = tf.Variable(0.01, dtype=dtype)

  # create backprop matrices
  # B[i] has backprop for matrix i; B2[i] is driven by random-normal
  # synthetic labels for the Fisher estimate
  B = [0] * (n + 1)
  B2 = [0] * (n + 1)
  B[n] = -err / dsize
  B2[n] = tf.random_normal((f(n), dsize * num_samples), 0, 1, seed=0,
                           dtype=dtype)
  for i in range(n - 1, -1, -1):
    B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))
    B2[i] = tf.matmul(tf.transpose(W[i + 1]), B2[i + 1], name="B2" + str(i))

  # Create gradient update. Make copy of variables and split update into
  # two run calls. Using single set of variables will gives updates that
  # occasionally produce wrong results/NaN's because of data race
  dW = [0] * (n + 1)
  dW2 = [0] * (n + 1)
  Wcopy = [0] * (n + 1)   # NOTE(review): created but never read below
  for i in range(n + 1):
    Wi_name = "Wcopy" + str(i)
    Wi_shape = (fs[i + 1], fs[i])
    Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
    Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    dW2[i] = tf.matmul(B2[i], tf.transpose(A2[i]), name="dW2" + str(i))

  del dW[0]   # get rid of W[0] update
  del dW2[0]  # get rid of W[0] update

  # construct flattened gradient update vector
  dWf = tf.concat([vec(grad) for grad in dW], axis=0)

  # todo: divide both activations and backprops by size for cov calc

  # Kronecker factored covariance blocks: inverse Fisher is block-diagonal,
  # diagonal blocks are kron(pinv(activation cov), pinv(backprop cov))
  iblocks = u.empty_grid(n + 1, n + 1)
  for i in range(1, n + 1):
    for j in range(1, n + 1):
      if i == j:
        acov = A2[i] @ t(A2[j]) / (dsize * num_samples)
        bcov = B2[i] @ t(B2[j]) / (dsize * num_samples)
        term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
      else:
        # off-diagonal blocks are zero
        term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)),
                        dtype=dtype)
      iblocks[i][j] = term

  # remove leftmost blocks (those are with respect to W[0] which is input)
  del iblocks[0]
  for row in iblocks:
    del row[0]

  ifisher = u.concat_blocks(iblocks)

  Wf_copy = tf.Variable(tf.zeros(dtype=dtype, shape=Wf.shape,
                                 name="Wf_copy_init"),
                        name="Wf_copy")
  new_val_matrix = Wf - lr * (ifisher @ dWf)
  train_op1 = Wf_copy.assign(new_val_matrix)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  observed_losses = []
  u.reset_time()
  for _ in range(20):
    loss0 = sess.run(loss)
    print(loss0)
    observed_losses.append(loss0)
    # two-phase update: compute into copy, then write back (see note above)
    sess.run(train_op1)
    sess.run(train_op2)
    u.record_time()

  u.summarize_time()
  u.summarize_graph()
# TODO: rename into "nonlin_d" def d_sigmoid(y): if purely_relu: return u.relu_mask(y) elif purely_linear: return 1 else: return y*(1-y) # A[i] = activations needed to compute gradient of W[i] # A[n+1] = network output A = [None]*(n+2) # A[0] is just for shape checks, assert fail on run with tf.control_dependencies([tf.assert_equal(1, 0, message="too huge")]): A[0] = u.Identity(dsize, dtype=dtype) A[1] = W[0] for i in range(1, n+1): A[i+1] = sigmoid(W[i] @ A[i]) # reconstruction error and sparsity error err = (A[3] - A[1]) rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True)/dsize # B[i] = backprops needed to compute gradient of W[i] # B2[i] = backprops from sampled labels needed for natural gradient B = [None]*(n+1) B2 = [None]*(n+1) B[n] = err*d_sigmoid(A[n+1]) sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0) sampled_labels = init_var(sampled_labels_live, "sampled_labels", noinit=True)
def model_creator(batch_size, name="default", dtype=np.float32):
  """Create MNIST autoencoder model. Dataset is part of model.

  Builds the forward graph, a manual backprop (B and synthetic-label B2),
  KFAC covariance/SVD preconditioning ops, and attaches losses plus
  initialization/batch-advance helpers onto the returned Model object.

  NOTE(review): most of the body uses module-level `args.batch_size`
  rather than the `batch_size` parameter — confirm which is authoritative.
  Relies on module globals: args, train_images, test_images, sessrun,
  purely_relu, purely_linear, regularized_svd, use_tikhonov, kfac_lib, u.
  """
  model = Model(name)

  def get_batch_size(data):
    # Batch dimension is axis 1; IndexedGrad wraps a list of live tensors.
    if isinstance(data, IndexedGrad):
      return int(data.live[0].shape[1])
    else:
      return int(data.shape[1])

  init_dict = {}     # placeholder -> numpy value, fed at initialization time
  global_vars = []   # variables shared across models (weights, full dataset)
  local_vars = []    # per-model variables (batch, sampled labels, step)

  # TODO: factor out to reuse between scripts
  # TODO: change feed_dict logic to reuse value provided to VarStruct
  # current situation makes reinitialization of global variable change
  # it's value, counterinituitive
  def init_var(val, name, is_global=False):
    """Helper to create variables with numpy or TF initial values."""
    # Tensor initial values go through u.get_variable directly; numpy
    # values go through a placeholder recorded in init_dict.
    if isinstance(val, tf.Tensor):
      var = u.get_variable(name=name, initializer=val, reuse=is_global)
    else:
      val = np.array(val)
      assert u.is_numeric(val), "Non-numeric type."
      var_struct = u.get_var(name=name, initializer=val, reuse=is_global)
      holder = var_struct.val_
      init_dict[holder] = val
      var = var_struct.var
    if is_global:
      global_vars.append(var)
    else:
      local_vars.append(var)
    return var

  # TODO: get rid of purely_relu
  def nonlin(x):
    # Activation function selected by module-level flags.
    if purely_relu:
      return tf.nn.relu(x)
    elif purely_linear:
      return tf.identity(x)
    else:
      return tf.sigmoid(x)

  # TODO: rename into "nonlin_d"
  def d_nonlin(y):
    # Derivative of nonlin, expressed in terms of the *activation* y
    # (for sigmoid: s' = s*(1-s)).
    if purely_relu:
      return u.relu_mask(y)
    elif purely_linear:
      return 1
    else:
      return y*(1-y)

  # Slice the leading batch out of the (global) train/test image matrices.
  patches = train_images[:,:args.batch_size];
  test_patches = test_images[:,:args.batch_size];

  if args.dataset == 'cifar':
    input_dim = 3*32*32
  elif args.dataset == 'mnist':
    input_dim = 28*28
  else:
    assert False
  # Layer sizes: input -> 3x1024 -> 196 bottleneck -> 3x1024 -> input.
  fs = [args.batch_size, input_dim, 1024, 1024, 1024, 196, 1024, 1024,
        1024, input_dim]

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  n = len(fs) - 2

  # Full dataset from which new batches are sampled
  X_full = init_var(train_images, "X_full", is_global=True)

  X = init_var(patches, "X", is_global=False)  # stores local batch per model
  W = [None]*n
  W.insert(0, X)          # W[0] doubles as the input batch
  A = [None]*(n+2)
  A[1] = W[0]
  for i in range(1, n+1):
    init_val = ng_init(f(i), f(i-1)).astype(dtype)
    W[i] = init_var(init_val, "W_%d"%(i,), is_global=True)
    A[i+1] = nonlin(kfac_lib.matmul(W[i], A[i]))
  # Autoencoder reconstruction error against the input batch.
  err = A[n+1] - A[1]
  model.loss = u.L2(err) / (2 * get_batch_size(err))

  # create test error eval
  layer0 = init_var(test_patches, "X_test", is_global=True)
  layer = layer0
  for i in range(1, n+1):
    layer = nonlin(W[i] @ layer)
  verr = (layer - layer0)
  model.vloss = u.L2(verr) / (2 * get_batch_size(verr))

  # manually compute backprop to use for sanity checking
  B = [None]*(n+1)
  B2 = [None]*(n+1)
  B[n] = err*d_nonlin(A[n+1])
  # B2 is driven by synthetic labels (random normal, or ones when fixed).
  _sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
  if args.fixed_labels:
    _sampled_labels_live = tf.ones(shape=(f(n), f(-1)), dtype=dtype)
  _sampled_labels = init_var(_sampled_labels_live, "to_be_deleted",
                             is_global=False)
  B2[n] = _sampled_labels*d_nonlin(A[n+1])
  for i in range(n-1, -1, -1):
    backprop = t(W[i+1]) @ B[i+1]
    B[i] = backprop*d_nonlin(A[i+1])
    backprop2 = t(W[i+1]) @ B2[i+1]
    B2[i] = backprop2*d_nonlin(A[i+1])

  cov_A = [None]*(n+1)    # covariance of activations[i]
  cov_B2 = [None]*(n+1)   # covariance of synthetic backprops[i]
  vars_svd_A = [None]*(n+1)
  vars_svd_B2 = [None]*(n+1)
  dW = [None]*(n+1)
  dW2 = [None]*(n+1)
  pre_dW = [None]*(n+1)   # preconditioned dW
  # todo: decouple initial value from covariance update
  # maybe need start with identity and do running average
  for i in range(1,n+1):
    if regularized_svd:
      # Tikhonov-style damping added directly into the covariance.
      cov_A[i] = init_var(A[i]@t(A[i])/args.batch_size+args.Lambda*u.Identity(f(i-1)), "cov_A%d"%(i,))
      cov_B2[i] = init_var(B2[i]@t(B2[i])/args.batch_size+args.Lambda*u.Identity(f(i)), "cov_B2%d"%(i,))
    else:
      cov_A[i] = init_var(A[i]@t(A[i])/args.batch_size, "cov_A%d"%(i,))
      cov_B2[i] = init_var(B2[i]@t(B2[i])/args.batch_size, "cov_B2%d"%(i,))
    vars_svd_A[i] = u.SvdWrapper(cov_A[i],"svd_A_%d"%(i,))
    vars_svd_B2[i] = u.SvdWrapper(cov_B2[i],"svd_B2_%d"%(i,))
    if use_tikhonov:
      whitened_A = u.regularized_inverse3(vars_svd_A[i],L=args.Lambda) @ A[i]
      # NOTE(review): whitens B[i] with the B2 covariance — presumably
      # intentional (true backprop, synthetic-label statistics); confirm.
      whitened_B2 = u.regularized_inverse3(vars_svd_B2[i],L=args.Lambda) @ B[i]
    else:
      whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
      whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]
    dW[i] = (B[i] @ t(A[i]))/args.batch_size
    # NOTE(review): dW2 is the unscaled version of dW (same factors,
    # no 1/batch_size) — verify this is intended rather than B2-based.
    dW2[i] = B[i] @ t(A[i])
    pre_dW[i] = (whitened_B2 @ t(whitened_A))/args.batch_size

  # Sampled-label loss (used for natural-gradient/Fisher estimation).
  sampled_labels_live = A[n+1] + tf.random_normal((f(n), f(-1)),
                                                  dtype=dtype, seed=0)
  if args.fixed_labels:
    sampled_labels_live = A[n+1]+tf.ones(shape=(f(n), f(-1)), dtype=dtype)
  sampled_labels = init_var(sampled_labels_live, "sampled_labels",
                            is_global=False)
  err2 = A[n+1] - sampled_labels
  model.loss2 = u.L2(err2) / (2 * args.batch_size)

  model.global_vars = global_vars
  model.local_vars = local_vars
  model.trainable_vars = W[1:]

  # todo, we have 3 places where model step is tracked, reduce
  model.step = init_var(u.as_int32(0), "step", is_global=False)
  advance_step_op = model.step.assign_add(1)
  assert get_batch_size(X_full) % args.batch_size == 0
  batches_per_dataset = (get_batch_size(X_full) // args.batch_size)
  # Cycle through the full dataset in contiguous batch_size-wide slices.
  batch_idx = tf.mod(model.step, batches_per_dataset)
  start_idx = batch_idx * args.batch_size
  advance_batch_op = X.assign(X_full[:,start_idx:start_idx + args.batch_size])

  def advance_batch():
    # Re-draws sampled labels and (optionally) moves X to the next slice.
    print("Step for model(%s) is %s"%(model.name, u.eval(model.step)))
    sess = u.get_default_session()  # NOTE(review): assigned but unused
    # TODO: get rid of _sampled_labels
    sessrun([sampled_labels.initializer, _sampled_labels.initializer])
    if args.advance_batch:
      with u.timeit("advance_batch"):
        sessrun(advance_batch_op)
    sessrun(advance_step_op)

  model.advance_batch = advance_batch

  # TODO: refactor this to take initial values out of Var struct
  #global_init_op = tf.group(*[v.initializer for v in global_vars])
  global_init_ops = [v.initializer for v in global_vars]
  global_init_op = tf.group(*[v.initializer for v in global_vars])
  # Ops that report True for each global var that is NOT yet initialized.
  global_init_query_ops = [tf.logical_not(tf.is_variable_initialized(v))
                           for v in global_vars]

  def initialize_global_vars(verbose=False, reinitialize=False):
    """If reinitialize is false, will not reinitialize variables already
    initialized."""
    sess = u.get_default_session()  # NOTE(review): assigned but unused
    if not reinitialize:
      uninited = sessrun(global_init_query_ops)
      # use numpy boolean indexing to select list of initializers to run
      to_initialize = list(np.asarray(global_init_ops)[uninited])
    else:
      to_initialize = global_init_ops
    if verbose:
      print("Initializing following:")
      for v in to_initialize:
        print("   " + v.name)
    sessrun(to_initialize, feed_dict=init_dict)

  model.initialize_global_vars = initialize_global_vars

  # didn't quite work (can't initialize var in same run call as deps likely)
  # enforce that batch is initialized before everything
  # except fake labels opa
  # for v in local_vars:
  #   if v != X and v != sampled_labels and v != _sampled_labels:
  #     print("Adding dep %s on %s"%(v.initializer.name, X.initializer.name))
  #     u.add_dep(v.initializer, on_op=X.initializer)

  local_init_op = tf.group(*[v.initializer for v in local_vars],
                           name="%s_localinit"%(model.name))
  print("Local vars:")
  for v in local_vars:
    print(v.name)

  def initialize_local_vars():
    # Order matters: labels and X first, then everything depending on them.
    sess = u.get_default_session()  # NOTE(review): assigned but unused
    sessrun(_sampled_labels.initializer, feed_dict=init_dict)
    sessrun(X.initializer, feed_dict=init_dict)
    sessrun(local_init_op, feed_dict=init_dict)

  model.initialize_local_vars = initialize_local_vars

  return model