def opt_step():
    with tf.GradientTape() as g2nd:  # outer tape: second-order derivatives
        with tf.GradientTape() as g1st:  # inner tape: first-order derivatives
            cost = f()
        grads = g1st.gradient(cost, xyz)  # gradients
        vs = [tf.random.normal(w.shape) for w in xyz]  # a random vector
    hess_vs = g2nd.gradient(grads, xyz, vs)  # Hessian-vector products
    Q.assign(psgd.update_precond_dense(Q, vs, hess_vs, step=0.1))  # update Q
    pre_grads = psgd.precond_grad_dense(Q, grads)  # preconditioned gradients
    [w.assign_sub(0.1 * g) for (w, g) in zip(xyz, pre_grads)]  # update parameters
    return cost
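The `opt_step` function above looks up `f`, `xyz`, `Q`, and the `psgd` module from the enclosing scope; none of them appear in the snippet. A minimal sketch of that setup, assuming the Rosenbrock cost used in the later examples and a made-up iteration count, could look like this:

import tensorflow as tf
import preconditioned_stochastic_gradient_descent as psgd

xyz = [tf.Variable(-1.0), tf.Variable(1.0)]        # parameters to optimize (assumed)
Q = tf.Variable(0.1 * tf.eye(2), trainable=False)  # preconditioner P = Q^T * Q

def f():  # Rosenbrock function, borrowed from the examples below (an assumption)
    x1, x2 = xyz
    return 100.0 * (x2 - x1**2)**2 + (1.0 - x1)**2

for _ in range(300):
    opt_step()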
Example #2
import torch
from torch.autograd import grad
import matplotlib.pyplot as plt
import preconditioned_stochastic_gradient_descent as psgd


def Rosenbrock(x):
    return 100.0 * (x[1] - x[0]**2)**2 + (1.0 - x[0])**2


x = [
    torch.tensor(-1.0, requires_grad=True),
    torch.tensor(1.0, requires_grad=True)
]
Q = 0.1 * torch.eye(2)  # initialize Q with small values; otherwise it may diverge
Cost = []
for i in range(300):
    cost = Rosenbrock(x)
    Cost.append(cost.item())
    g = grad(cost, x, create_graph=True)
    v = [torch.randn([]), torch.randn([])]
    gv = g[0] * v[0] + g[1] * v[1]
    hv = grad(gv, x)
    with torch.no_grad():
        Q = psgd.update_precond_dense(Q, v, hv, 0.2)
        pre_g = psgd.precond_grad_dense(Q, g)
        x[0] -= 0.5 * pre_g[0]
        x[1] -= 0.5 * pre_g[1]

plt.semilogy(Cost)
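The two `grad` calls above implement a Hessian-vector product without ever forming the Hessian: for a fixed random vector $v$, differentiating the scalar $g^\top v = \nabla f(x)^\top v$ once more gives

\nabla_x\!\left(\nabla f(x)^\top v\right) = \nabla^2 f(x)\, v = Hv,

which is why the first `grad` call needs `create_graph=True`, so that the gradient itself remains differentiable.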
Example #3
import tensorflow as tf
import matplotlib.pyplot as plt

import preconditioned_stochastic_gradient_descent as psgd

with tf.Session() as sess:
    x1 = tf.Variable(-1.0)
    x2 = tf.Variable(1.0)
    Q = tf.Variable(0.1 * tf.eye(2),
                    trainable=False)  # P=Q^T*Q is the preconditioner
    f = 100.0 * (x2 - x1**2)**2 + (1.0 - x1)**2  # the function to be minimized

    xs = [x1, x2]  # put all x in xs
    grads = tf.gradients(f, xs)  # gradients
    precond_grads = psgd.precond_grad_dense(Q, grads)  # preconditioned gradients
    new_xs = [x - 0.5 * g for (x, g) in zip(xs, precond_grads)]  # new x; no need for a line search
    update_xs = [tf.assign(old, new) for (old, new) in zip(xs, new_xs)]  # update x

    delta_xs = [tf.random_normal(x.shape) for x in xs]  # a random vector
    grad_deltaw = tf.reduce_sum([
        tf.reduce_sum(g * v) for (g, v) in zip(grads, delta_xs)
    ])  # gradient-vector product
    hess_deltaw = tf.gradients(grad_deltaw, xs)  # Hessian-vector product
    new_Q = psgd.update_precond_dense(Q, delta_xs, hess_deltaw, 0.2)  # new Q
    update_Q = tf.assign(Q, new_Q)  # update Q

    # begin to execute the graph
    sess.run(tf.global_variables_initializer())
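    The snippet ends right after initialization; the training loop itself is not shown. A minimal sketch of how it could continue inside the same session, with an assumed iteration count, is:

    f_values = []
    for _ in range(500):  # hypothetical iteration count
        f_values.append(sess.run(f))
        sess.run(update_Q)   # refresh the preconditioner
        sess.run(update_xs)  # take the preconditioned step

    plt.semilogy(f_values)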
Example #4
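This example is a loop body taken from a larger training script; the model, data, and hyper-parameters it relies on are not shown. The sketch below reconstructs a plausible setup so the loop body that follows can run; every name and value in it (the linear model, batch shapes, step size, clipping threshold, iteration count) is an assumption, not part of the original code.

import numpy as np
import torch
from torch.autograd import Variable, grad
import matplotlib.pyplot as plt
import preconditioned_stochastic_gradient_descent as psgd


def get_batches():  # hypothetical data source: random regression batches
    return Variable(torch.randn(32, 10)), Variable(torch.randn(32, 1))


def train_criterion(Ws, x, y):  # hypothetical model: linear layer with MSE loss
    W, b = Ws
    return torch.mean((x.mm(W) + b - y)**2)


Ws = [Variable(torch.randn(10, 1), requires_grad=True),
      Variable(torch.zeros(1, 1), requires_grad=True)]
Q = 0.1 * np.eye(sum(W.numel() for W in Ws))  # dense preconditioner, P = Q^T * Q
step_size = 0.1               # hypothetical step size
grad_norm_clip_thr = 1.0      # hypothetical gradient-norm clipping threshold
Loss = []
for num_iter in range(1000):  # hypothetical iteration count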
    x, y = get_batches()

    # calculate loss and gradient
    loss = train_criterion(Ws, x, y)
    grads = grad(loss, Ws, create_graph=True)
    Loss.append(loss.item())

    # update preconditioners
    Q_update_gap = max(int(np.floor(np.log10(num_iter + 1.0))), 1)
    if num_iter % Q_update_gap == 0:  # let us update Q less frequently
        delta = [Variable(torch.randn(W.size())) for W in Ws]
        grad_delta = sum([torch.sum(g * d) for (g, d) in zip(grads, delta)])
        hess_delta = grad(grad_delta, Ws)
        Q = psgd.update_precond_dense(Q, [d.data.numpy() for d in delta],
                                      [h.data.numpy() for h in hess_delta])

    # update Ws
    pre_grads = psgd.precond_grad_dense(Q, [g.data.numpy() for g in grads])
    grad_norm = np.sqrt(sum([np.sum(g * g) for g in pre_grads]))
    if grad_norm > grad_norm_clip_thr:
        step_adjust = grad_norm_clip_thr / grad_norm
    else:
        step_adjust = 1.0
    for i in range(len(Ws)):
        Ws[i].data = Ws[i].data - step_adjust * step_size * torch.FloatTensor(
            pre_grads[i])

    if num_iter % 100 == 0:
        print('training loss: {}'.format(Loss[-1]))

plt.semilogy(Loss)
Example #5
import torch
from torch.autograd import grad
import matplotlib.pyplot as plt
import preconditioned_stochastic_gradient_descent as psgd

xs = [
    torch.tensor(-1.0, requires_grad=True),
    torch.tensor(1.0, requires_grad=True)
]


def Rosenbrock(xs):
    x1, x2 = xs
    return 100.0 * (x2 - x1**2)**2 + (1.0 - x1)**2


Q = 0.1 * torch.eye(2)  # the preconditioner is Q^T*Q
f_values = []
for i in range(500):
    y = Rosenbrock(xs)
    f_values.append(y.item())
    grads = grad(y, xs, create_graph=True)  # gradient
    vs = [torch.randn([]), torch.randn([])]  # a random vector
    grad_vs = grads[0] * vs[0] + grads[1] * vs[1]  # gradient-vector inner product
    hess_vs = grad(grad_vs, xs)  # Hessian-vector product
    with torch.no_grad():
        Q = psgd.update_precond_dense(Q, vs, hess_vs, 0.2)  # update the preconditioner
        pre_grads = psgd.precond_grad_dense(Q, grads)  # the preconditioned gradient
        [x.subtract_(0.5 * g) for (x, g) in zip(xs, pre_grads)]  # update the variables

plt.semilogy(f_values)
plt.xlabel('Iterations')
plt.ylabel('Function values')
Example #6
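The snippet uses a helper `trd_cost_grad` and a target tensor `T` that are defined elsewhere in the demo. A possible reconstruction is sketched below; the tensor sizes, the random target, and the CP-style (tensor rank decomposition) cost are all assumptions.

import time
import numpy as np
import matplotlib.pyplot as plt
import preconditioned_stochastic_gradient_descent as psgd

I, J, K, R = 10, 10, 10, 5    # hypothetical tensor dimensions and rank
T = np.random.randn(I, J, K)  # hypothetical target 3-way tensor


def trd_cost_grad(T, xyz):
    # Possible cost/gradient for a rank-R decomposition: fit T with the model
    # M[i,j,k] = sum_r x[r,i]*y[r,j]*z[r,k] and return 0.5*||T - M||^2 plus gradients.
    x, y, z = xyz
    E = T - np.einsum('ri,rj,rk->ijk', x, y, z)  # fitting residual
    cost = 0.5 * np.sum(E * E)
    grad_x = -np.einsum('ijk,rj,rk->ri', E, y, z)
    grad_y = -np.einsum('ijk,ri,rk->rj', E, x, z)
    grad_z = -np.einsum('ijk,ri,rj->rk', E, x, y)
    return cost, [grad_x, grad_y, grad_z]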
x, y, z = np.random.randn(R, I), np.random.randn(R, J), np.random.randn(R, K)

Q = 0.1 * np.eye(R * (I + J + K))
sqrt_eps = np.sqrt(np.finfo(np.float64).eps)
dense_Loss = []
dense_Times = []
dense_Iter = np.linspace(1, 5000, 5000).tolist()  # iteration indices
t0 = time.time()
for num_iter in range(5000):
    loss, grads = trd_cost_grad(T, [x, y, z])
    dense_Loss.append(loss)
    t1 = time.time()
    dense_Times.append(t1 - t0)
    # small random perturbations for finite-difference Hessian-vector products
    dx = sqrt_eps * np.random.randn(R, I)
    dy = sqrt_eps * np.random.randn(R, J)
    dz = sqrt_eps * np.random.randn(R, K)
    _, perturbed_grads = trd_cost_grad(T, [x + dx, y + dy, z + dz])
    Q = psgd.update_precond_dense(Q, [dx, dy, dz], [
        perturbed_grads[0] - grads[0], perturbed_grads[1] - grads[1],
        perturbed_grads[2] - grads[2]
    ])
    pre_grads = psgd.precond_grad_dense(Q, grads)
    x -= pre_grads[0]
    y -= pre_grads[1]
    z -= pre_grads[2]

#plt.subplot(121)
#plt.loglog(dense_Loss)
#plt.subplot(122)
#plt.loglog(dense_Times, dense_Loss)