예제 #1
0
# Initial scan preconditioner for z: qz1 stacks a row of ones on top of a row
# of zeros (each of length R); qz2 is a 1-by-K row filled with 0.1.
qz1, qz2 = np.stack([np.ones(R), np.zeros(R)]), 0.1 * np.ones((1, K))
# Square root of the float64 machine epsilon; scales the random perturbations
# used to probe the gradient in every iteration below.
sqrt_eps = np.sqrt(np.finfo(np.float64).eps)

# Single source of truth for the run length and the step size, so the
# iteration axis, the loop bound and the three parameter updates stay in sync.
scan_num_iter = 5000
scan_step_size = 0.5

scan_Loss = []   # loss returned by trd_cost_grad at the start of each iteration
scan_Times = []  # seconds elapsed since t0 when that loss became available
# Iteration axis 1, 2, ..., scan_num_iter, kept as an ndarray.  The original
# followed this with a bare `scan_Iter.tolist()`; ndarray.tolist() returns a
# new list and leaves the array untouched, so that statement had no effect and
# has been dropped.
scan_Iter = np.linspace(1, scan_num_iter, scan_num_iter)

t0 = time.time()
for num_iter in range(scan_num_iter):
    loss, grads = trd_cost_grad(T, [x, y, z])
    scan_Loss.append(loss)
    t1 = time.time()
    scan_Times.append(t1 - t0)

    # Tiny random perturbations of x, y and z (drawn in that order), followed
    # by a second gradient evaluation at the perturbed point.
    dx = sqrt_eps * np.random.randn(R, I)
    dy = sqrt_eps * np.random.randn(R, J)
    dz = sqrt_eps * np.random.randn(R, K)
    _, perturbed_grads = trd_cost_grad(T, [x + dx, y + dy, z + dz])

    # Each preconditioner pair is refreshed from its perturbation and the
    # gradient change that perturbation produced.
    qx1, qx2 = psgd.update_precond_scan(qx1, qx2, dx,
                                        perturbed_grads[0] - grads[0])
    qy1, qy2 = psgd.update_precond_scan(qy1, qy2, dy,
                                        perturbed_grads[1] - grads[1])
    qz1, qz2 = psgd.update_precond_scan(qz1, qz2, dz,
                                        perturbed_grads[2] - grads[2])

    # In-place descent step along the preconditioned (unperturbed) gradients.
    x -= scan_step_size * psgd.precond_grad_scan(qx1, qx2, grads[0])
    y -= scan_step_size * psgd.precond_grad_scan(qy1, qy2, grads[1])
    z -= scan_step_size * psgd.precond_grad_scan(qz1, qz2, grads[2])

#plt.subplot(121)
#plt.loglog(Loss)
#plt.subplot(122)
#plt.loglog(Times,Loss)
예제 #2
0
# Every preconditioner pair starts out as identity: the first factor is a row
# of ones stacked on a row of zeros (W.shape[0] columns each), the second is a
# single row of ones with W.shape[1] columns.
Qs = [
    [
        torch.cat([torch.ones((1, W.shape[0])), torch.zeros((1, W.shape[0]))]),
        torch.ones((1, W.shape[1])),
    ]
    for W in Ws
]

# Training loop settings.
step_size = 0.02
grad_norm_clip_thr = 1.0
Loss = []  # per-iteration training loss, plotted once the loop finishes
for num_iter in range(10000):
    x, y = get_batches( )

    # Loss and gradient; create_graph=True keeps the gradient differentiable
    # so it can be differentiated a second time below.
    loss = train_criterion(Ws, x, y)
    grads = grad(loss, Ws, create_graph=True)
    Loss.append(loss.item())

    # Hessian-vector product: differentiate <grads, delta> with respect to Ws
    # for a random direction delta.
    delta = [torch.randn(W.shape) for W in Ws]
    grad_delta = sum(torch.sum(g * d) for g, d in zip(grads, delta))
    hess_delta = grad(grad_delta, Ws)

    with torch.no_grad():
        # Refresh the preconditioners, then apply them to the gradients.
        Qs = [
            psgd.update_precond_scan(q_pair[0], q_pair[1], dw, dg)
            for q_pair, dw, dg in zip(Qs, delta, hess_delta)
        ]
        pre_grads = [
            psgd.precond_grad_scan(q_pair[0], q_pair[1], g)
            for q_pair, g in zip(Qs, grads)
        ]

        # Clip by the global norm of the preconditioned gradients; the tiny
        # offset keeps the division finite when that norm is zero.
        grad_norm = torch.sqrt(sum(torch.sum(g * g) for g in pre_grads))
        step_adjust = min(grad_norm_clip_thr / (grad_norm + 1.2e-38), 1.0)

        # In-place parameter update with the (possibly shrunken) step.
        scaled_step = step_adjust * step_size
        for idx, pre_grad in enumerate(pre_grads):
            Ws[idx] -= scaled_step * pre_grad

        if num_iter % 100 == 0:
            print('training loss: {}'.format(Loss[-1]))

plt.semilogy(Loss)
예제 #3
0
        tf.reduce_sum([tf.reduce_sum(g * g) for g in precond_grads]))
    # Shrink the step when grad_norm exceeds grad_norm_clip_thr; otherwise the
    # factor is capped at 1.0.  The 1.2e-38 offset keeps the division finite
    # for a zero norm.
    # NOTE(review): grad_norm is defined by the statement that ends just above
    # this excerpt; presumably the norm of precond_grads - confirm.
    step_size_adjust = tf.minimum(1.0,
                                  grad_norm_clip_thr / (grad_norm + 1.2e-38))
    # Candidate weights after one step along the preconditioned gradients.
    new_Ws = [
        W - (step_size_adjust * step_size) * g
        for (W, g) in zip(Ws, precond_grads)
    ]
    # Assignment ops that write the candidate weights back into Ws.
    update_Ws = [tf.assign(W, new_W) for (W, new_W) in zip(Ws, new_Ws)]

    # Random direction per weight, then the gradient of <grads, delta_Ws> with
    # respect to Ws (a Hessian-vector product, assuming grads is the gradient
    # of the training loss with respect to Ws - defined outside this excerpt).
    delta_Ws = [tf.random_normal(W.shape, dtype=dtype) for W in Ws]
    grad_deltaw = tf.reduce_sum(
        [tf.reduce_sum(g * v) for (g, v) in zip(grads, delta_Ws)])
    hess_deltaw = tf.gradients(grad_deltaw, Ws)

    # New left/right preconditioner factors computed from each
    # (direction, Hessian-vector product) pair.
    new_qs = [
        psgd.update_precond_scan(ql, qr, dw, dg)
        for (ql, qr, dw, dg) in zip(qs_left, qs_right, delta_Ws, hess_deltaw)
    ]
    # Paired assignment ops: element 0 of each result goes to the left factor,
    # element 1 to the right factor.
    update_qs = [[tf.assign(old_ql, new_q[0]),
                  tf.assign(old_qr, new_q[1])]
                 for (old_ql, old_qr, new_q) in zip(qs_left, qs_right, new_qs)]

    # Test-loss node evaluated on the current weights.
    test_loss = test_criterion(Ws)

    # Initialize all variables, then set up the bookkeeping for the training
    # loop that follows.
    sess.run(tf.global_variables_initializer())
    avg_train_loss = 0.0
    TrainLoss = list()
    TestLoss = list()
    Time = list()
    for num_iter in range(20000):
        _train_inputs, _train_outputs = get_batches()
예제 #4
0
# Training loop: each iteration draws a batch, optionally refreshes the
# preconditioners, and computes the clipped step factor used by the weight
# update at the end of the loop body.
for num_iter in range(10000):
    x, y = get_batches()

    # calculate loss and gradient
    loss = train_criterion(Ws, x, y)
    # create_graph=True keeps the gradient differentiable so that it can be
    # differentiated again for the preconditioner update below.
    grads = grad(loss, Ws, create_graph=True)
    # NOTE(review): indexing with [0] needs the loss array to have at least
    # one dimension - confirm against the PyTorch version in use.
    Loss.append(loss.data.numpy()[0])

    # update preconditioners
    # The gap is max(floor(log10(num_iter + 1)), 1): 1 while num_iter + 1 is
    # below 100, then 2 from 100 and 3 from 1000.
    Q_update_gap = max(int(np.floor(np.log10(num_iter + 1.0))), 1)
    if num_iter % Q_update_gap == 0:  # let us update Q less frequently
        # Random direction per weight, then the gradient of <grads, delta>
        # with respect to Ws (a Hessian-vector product).
        delta = [Variable(torch.randn(W.size())) for W in Ws]
        grad_delta = sum([torch.sum(g * d) for (g, d) in zip(grads, delta)])
        hess_delta = grad(grad_delta, Ws)
        # The preconditioner update is fed NumPy arrays, not torch Variables.
        Qs = [
            psgd.update_precond_scan(q[0], q[1], dw.data.numpy(),
                                     dg.data.numpy())
            for (q, dw, dg) in zip(Qs, delta, hess_delta)
        ]

    # update Ws
    # Preconditioned gradients, computed from NumPy views of the gradients.
    pre_grads = [
        psgd.precond_grad_scan(q[0], q[1], g.data.numpy())
        for (q, g) in zip(Qs, grads)
    ]
    # Clip by the global norm of the preconditioned gradients: the step is
    # scaled down only when that norm exceeds grad_norm_clip_thr.
    grad_norm = np.sqrt(sum([np.sum(g * g) for g in pre_grads]))
    if grad_norm > grad_norm_clip_thr:
        step_adjust = grad_norm_clip_thr / grad_norm
    else:
        step_adjust = 1.0
    for i in range(len(Ws)):
        Ws[i].data = Ws[i].data - step_adjust * step_size * torch.FloatTensor(