def opt_step():
    with tf.GradientTape() as g2nd:                     # tape for the second order derivative
        with tf.GradientTape() as g1st:                 # tape for the first order derivative
            cost = f()
        grads = g1st.gradient(cost, xyz)                # gradient
        vs = [tf.random.normal(w.shape) for w in xyz]   # a random vector
    hess_vs = g2nd.gradient(grads, xyz, vs)             # Hessian-vector products
    Q.assign(psgd.update_precond_dense(Q, vs, hess_vs, step=0.1))  # update Q
    pre_grads = psgd.precond_grad_dense(Q, grads)       # the preconditioned gradient
    [w.assign_sub(0.1 * g) for (w, g) in zip(xyz, pre_grads)]      # update parameters
    return cost
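This opt_step closure assumes that the loss function f, the list of trainable variables xyz, and the preconditioner variable Q are already defined in the enclosing scope. A minimal, hypothetical driver (the toy quadratic loss and the dimensions below are illustrative only, not part of the demo) could look like:

import tensorflow as tf
import preconditioned_stochastic_gradient_descent as psgd

xyz = [tf.Variable(tf.random.normal([5])) for _ in range(3)]    # 15 parameters in total (illustrative)
Q = tf.Variable(0.1 * tf.eye(15), trainable=False)              # dense preconditioner; P = Q^T*Q
f = lambda: tf.reduce_sum([tf.reduce_sum(w**2) for w in xyz])   # stand-in loss closure

for _ in range(100):
    cost = opt_step()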
""" import torch from torch.autograd import grad import matplotlib.pyplot as plt import preconditioned_stochastic_gradient_descent as psgd def Rosenbrock(x): return 100.0 * (x[1] - x[0]**2)**2 + (1.0 - x[0])**2 x = [ torch.tensor(-1.0, requires_grad=True), torch.tensor(1.0, requires_grad=True) ] Q = 0.1 * torch.eye(2) # initialize Q with small values; otherwise, diverge Cost = [] for i in range(300): cost = Rosenbrock(x) Cost.append(cost.item()) g = grad(cost, x, create_graph=True) v = [torch.randn([]), torch.randn([])] gv = g[0] * v[0] + g[1] * v[1] hv = grad(gv, x) with torch.no_grad(): Q = psgd.update_precond_dense(Q, v, hv, 0.2) pre_g = psgd.precond_grad_dense(Q, g) x[0] -= 0.5 * pre_g[0] x[1] -= 0.5 * pre_g[1] plt.semilogy(Cost)
""" import tensorflow as tf import matplotlib.pyplot as plt import preconditioned_stochastic_gradient_descent as psgd with tf.Session() as sess: x1 = tf.Variable(-1.0) x2 = tf.Variable(1.0) Q = tf.Variable(0.1 * tf.eye(2), trainable=False) # P=Q^T*Q is the preconditioner f = 100.0 * (x2 - x1**2)**2 + (1.0 - x1)**2 # the function to be minimized xs = [x1, x2] # put all x in xs grads = tf.gradients(f, xs) # gradients precond_grads = psgd.precond_grad_dense(Q, grads) # preconditioned gradients new_xs = [x - 0.5 * g for (x, g) in zip(xs, precond_grads) ] # new x; no need to use line search! update_xs = [tf.assign(old, new) for (old, new) in zip(xs, new_xs)] # update x delta_xs = [tf.random_normal(x.shape) for x in xs] # a random vector grad_deltaw = tf.reduce_sum([ tf.reduce_sum(g * v) for (g, v) in zip(grads, delta_xs) ]) # gradient-vector product hess_deltaw = tf.gradients(grad_deltaw, xs) # Hessian-vector product new_Q = psgd.update_precond_dense(Q, delta_xs, hess_deltaw, 0.2) # new Q update_Q = tf.assign(Q, new_Q) # update Q # begin to excute the graph sess.run(tf.global_variables_initializer())
x, y = get_batches()

# calculate loss and gradient
loss = train_criterion(Ws, x, y)
grads = grad(loss, Ws, create_graph=True)
Loss.append(loss.data.numpy()[0])

# update preconditioners
Q_update_gap = max(int(np.floor(np.log10(num_iter + 1.0))), 1)
if num_iter % Q_update_gap == 0:    # let us update Q less frequently
    delta = [Variable(torch.randn(W.size())) for W in Ws]
    grad_delta = sum([torch.sum(g * d) for (g, d) in zip(grads, delta)])
    hess_delta = grad(grad_delta, Ws)
    Q = psgd.update_precond_dense(Q, [d.data.numpy() for d in delta],
                                  [h.data.numpy() for h in hess_delta])

# update Ws
pre_grads = psgd.precond_grad_dense(Q, [g.data.numpy() for g in grads])
grad_norm = np.sqrt(sum([np.sum(g * g) for g in pre_grads]))
if grad_norm > grad_norm_clip_thr:
    step_adjust = grad_norm_clip_thr / grad_norm
else:
    step_adjust = 1.0
for i in range(len(Ws)):
    Ws[i].data = Ws[i].data - step_adjust * step_size * torch.FloatTensor(pre_grads[i])

if num_iter % 100 == 0:
    print('training loss: {}'.format(Loss[-1]))

plt.semilogy(Loss)
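This fragment runs inside a training loop over num_iter and relies on several names not defined in the excerpt: get_batches, train_criterion, the weight list Ws, the dense preconditioner Q, step_size, and grad_norm_clip_thr. A hypothetical setup consistent with the old-style PyTorch code above (shapes and constants are placeholders, not values from the demo) might be:

import numpy as np
import torch
from torch.autograd import Variable, grad
import preconditioned_stochastic_gradient_descent as psgd

Ws = [Variable(0.1 * torch.randn(20, 10), requires_grad=True),   # placeholder model weights
      Variable(0.1 * torch.randn(1, 20), requires_grad=True)]
Q = 0.1 * np.eye(sum(W.numel() for W in Ws))   # dense preconditioner over all parameters; P = Q^T*Q
step_size = 0.01
grad_norm_clip_thr = 1.0
Loss = []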
     torch.tensor(-1.0, requires_grad=True),
     torch.tensor(1.0, requires_grad=True)]

def Rosenbrock(xs):
    x1, x2 = xs
    return 100.0 * (x2 - x1**2)**2 + (1.0 - x1)**2

Q = 0.1 * torch.eye(2)  # the preconditioner is Q^T*Q

f_values = []
for i in range(500):
    y = Rosenbrock(xs)
    f_values.append(y.item())
    grads = grad(y, xs, create_graph=True)          # gradient
    vs = [torch.randn([]), torch.randn([])]         # a random vector
    grad_vs = grads[0] * vs[0] + grads[1] * vs[1]   # gradient-vector inner product
    hess_vs = grad(grad_vs, xs)                     # Hessian-vector product
    with torch.no_grad():
        Q = psgd.update_precond_dense(Q, vs, hess_vs, 0.2)  # update the preconditioner
        pre_grads = psgd.precond_grad_dense(Q, grads)       # calculate the preconditioned gradient
        [x.subtract_(0.5 * g) for (x, g) in zip(xs, pre_grads)]  # update the variables

plt.semilogy(f_values)
plt.xlabel('Iterations')
plt.ylabel('Function values')
x, y, z = np.random.randn(R, I), np.random.randn(R, J), np.random.randn(R, K)
Q = 0.1 * np.eye(R * (I + J + K))
sqrt_eps = np.sqrt(np.finfo(np.float64).eps)

dense_Loss = []
dense_Times = []
dense_Iter = np.linspace(1, 5000, 5000).tolist()
t0 = time.time()
for num_iter in range(5000):
    loss, grads = trd_cost_grad(T, [x, y, z])
    dense_Loss.append(loss)
    t1 = time.time()
    dense_Times.append(t1 - t0)
    dx, dy, dz = (sqrt_eps * np.random.randn(R, I),
                  sqrt_eps * np.random.randn(R, J),
                  sqrt_eps * np.random.randn(R, K))
    _, perturbed_grads = trd_cost_grad(T, [x + dx, y + dy, z + dz])
    Q = psgd.update_precond_dense(Q, [dx, dy, dz],
                                  [perturbed_grads[0] - grads[0],
                                   perturbed_grads[1] - grads[1],
                                   perturbed_grads[2] - grads[2]])
    pre_grads = psgd.precond_grad_dense(Q, grads)
    x -= pre_grads[0]
    y -= pre_grads[1]
    z -= pre_grads[2]

#plt.subplot(121)
#plt.loglog(dense_Loss)
#plt.subplot(122)
#plt.loglog(dense_Times, dense_Loss)
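The helper trd_cost_grad is assumed to return the squared reconstruction error of the rank-R decomposition of the target tensor T, together with the gradients with respect to the three factor matrices; the loop above then approximates Hessian-vector products by the finite difference between the gradients at the perturbed and unperturbed factors. The helper itself is not part of this excerpt; one plausible implementation, given only to make the snippet self-contained, is:

import numpy as np

def trd_cost_grad(T, xyz):
    # factors x, y, z have shapes (R, I), (R, J), (R, K); the model is
    # T_hat[i, j, k] = sum_r x[r, i] * y[r, j] * z[r, k]
    x, y, z = xyz
    T_hat = np.einsum('ri,rj,rk->ijk', x, y, z)        # reconstructed tensor
    E = T_hat - T                                      # reconstruction error
    cost = np.sum(E * E)                               # squared error loss
    grad_x = 2.0 * np.einsum('ijk,rj,rk->ri', E, y, z)
    grad_y = 2.0 * np.einsum('ijk,ri,rk->rj', E, x, z)
    grad_z = 2.0 * np.einsum('ijk,ri,rj->rk', E, x, y)
    return cost, [grad_x, grad_y, grad_z]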