Example #1
def test_sgd():
    # Create simple N-d Gaussian functions to optimize. These functions are
    # (perfectly) well-conditioned, so a single gradient step of size 1/L,
    # where L is the largest eigenvalue of the Hessian, is enough to converge.
    max_epoch = 2
    for N in range(1, 5):
        center = np.arange(1, N+1)[None, :].astype(floatX)
        param = sharedX(np.zeros((1, N)))
        cost = T.sum(0.5*T.dot(T.dot((param-center), T.eye(N)), (param-center).T))
        loss = DummyLossWithGradient(cost, param)

        trainer = Trainer(SGD(loss), DummyBatchScheduler())

        # Monitor the gradient of `loss` w.r.t. `param`.
        tracker = tasks.Tracker(loss.gradients[param])
        trainer.append_task(tracker)

        trainer.append_task(stopping_criteria.MaxEpochStopping(max_epoch))
        trainer.train()

        # Since the problem is well-conditioned and we use the optimal gradient step 1/L,
        # two epochs should be enough for `param` to be around `center` and the gradients near 0.
        assert_array_almost_equal(param.get_value(), center)
        assert_array_almost_equal(tracker[0], 0.)

    # Create an N-d Gaussian function to optimize. This function is not
    # well-conditioned, so no single gradient step size can converge in
    # only one iteration.
    # cost = T.sum(N*0.5*T.dot(T.dot((param-center), np.diag(1./np.arange(1, N+1))), ((param-center).T)))
    max_epoch = 80
    N = 4
    center = 5*np.ones((1, N)).astype(floatX)
    param = sharedX(np.zeros((1, N)))
    cost = T.sum(0.5*T.dot(T.dot((param-center), np.diag(1./np.arange(1, N+1))), (param-center).T))
    loss = DummyLossWithGradient(cost, param)

    trainer = Trainer(SGD(loss), DummyBatchScheduler())
    trainer.append_task(stopping_criteria.MaxEpochStopping(max_epoch))

    # Monitor the gradient of `loss` w.r.t. `param`.
    tracker = tasks.Tracker(loss.gradients[param])
    trainer.append_task(tracker)
    trainer.train()

    # The problem is ill-conditioned, so convergence is slower: after 80 epochs,
    # `param` should be close to `center` and the gradients near 0.
    assert_array_almost_equal(param.get_value(), center, decimal=6)
    assert_array_almost_equal(tracker[0], 0.)
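The assertions above rest on a standard fact about quadratic costs: for cost = 0.5 * (param - center) A (param - center).T the gradient is A (param - center).T, and a gradient step of size 1/L (L being the largest eigenvalue of A) reaches the minimum in a single step only when A is a multiple of the identity. Below is a minimal, self-contained NumPy sketch of that reasoning; it does not use Theano or the library's Trainer, and the helper name sgd_steps is purely illustrative.

import numpy as np

def sgd_steps(A, center, lr, n_steps):
    """Plain gradient descent on cost = 0.5 * (x - center) @ A @ (x - center).T."""
    x = np.zeros_like(center)
    for _ in range(n_steps):
        grad = (x - center) @ A          # gradient of the quadratic cost (A symmetric)
        x = x - lr * grad
    return x

N = 4
center = np.arange(1, N + 1, dtype=float)

# Well-conditioned case: A = I, so L = 1 and a single step of size 1/L is exact.
print(sgd_steps(np.eye(N), center, lr=1.0, n_steps=1))                  # == center

# Ill-conditioned case: A = diag(1, 1/2, 1/3, 1/4). L is still 1, but the smallest
# eigenvalue is 1/4, so the error in the last coordinate only shrinks by a factor
# of 3/4 per step; that is why the test above needs max_epoch = 80.
print(sgd_steps(np.diag(1.0 / np.arange(1, N + 1)), center, lr=1.0, n_steps=80))  # ~= center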
Example #2
def test_adagrad():
    max_epoch = 15

    # Create N-d Gaussian functions to optimize. These functions are not
    # well-conditioned, so no single gradient step size can converge in
    # only one iteration.
    for N in range(1, 5):
        center = 5*np.ones((1, N)).astype(floatX)
        param = sharedX(np.zeros((1, N)))
        cost = T.sum(0.5*T.dot(T.dot((param-center), np.diag(1./np.arange(1, N+1))), ((param-center).T)))
        loss = DummyLossWithGradient(cost, param)

        # Even with a very large learning rate, AdaGrad can still converge.
        # In fact, it converges faster here than SGD with the optimal step size.
        optimizer = AdaGrad(loss, lr=100, eps=1e-1)
        trainer = Trainer(optimizer, DummyBatchScheduler())
        trainer.append_task(stopping_criteria.MaxEpochStopping(max_epoch))

        # Monitor the gradient of `loss` w.r.t. `param`.
        tracker = tasks.Tracker(loss.gradients[param])
        trainer.append_task(tracker)
        trainer.train()

        # After 15 epochs, `param` should be around `center` and the gradients near 0.
        assert_array_almost_equal(param.get_value(), center)
        assert_array_almost_equal(tracker[0], 0.)
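Why does AdaGrad tolerate lr=100? Its per-parameter effective step is the base rate divided by the square root of the accumulated squared gradients, so a huge base rate is quickly scaled down coordinate by coordinate. Here is a standalone NumPy sketch of the standard AdaGrad rule (the library's implementation may place eps slightly differently; adagrad_steps is an illustrative name, not part of the library).

import numpy as np

def adagrad_steps(A, center, lr, eps, n_steps):
    """Standard AdaGrad on cost = 0.5 * (x - center) @ A @ (x - center).T."""
    x = np.zeros_like(center)
    acc = np.zeros_like(center)                    # accumulated squared gradients
    for _ in range(n_steps):
        grad = (x - center) @ A                    # gradient of the quadratic cost
        acc += grad ** 2
        # Effective per-parameter step lr / (sqrt(acc) + eps) shrinks as gradients
        # accumulate, which is what makes a base rate of 100 behave sensibly.
        x = x - lr * grad / (np.sqrt(acc) + eps)
    return x

N = 4
center = 5.0 * np.ones(N)
A = np.diag(1.0 / np.arange(1, N + 1))             # same ill-conditioned Hessian as the test
print(adagrad_steps(A, center, lr=100, eps=1e-1, n_steps=15))   # close to center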