Example #1
    def adadelta(allparams,
                 nat_stepsize,
                 num_epochs,
                 seq_len,
                 num_seqs=None,
                 rho=0.95,
                 epsilon=1e-6,
                 num_samples=1,
                 permute=True):  # note: permute is not used in this snippet
        # `data`, `num_datapoints`, `val_and_grad`, `callback`, and the container
        # helpers (zeros_like, add, scale, mul, div, square, sqrt, add_scalar,
        # concat, split_into_batches) are assumed to come from the enclosing scope.
        natparams, params = allparams[:1], allparams[1:]
        sum_gsq = zeros_like(params)  # accumulated sq. grads
        sum_usq = zeros_like(params)  # accumulated sq. updates
        accumulate = lambda a, b: add(scale(rho, a), scale(1 - rho, b))  # EMA with decay rho

        for epoch in range(num_epochs):
            vals = []
            batches, num_batches = split_into_batches(data, seq_len, num_seqs)
            for y in batches:
                val, grad = scale(
                    1. / num_datapoints,
                    val_and_grad(y, num_batches, num_samples, *allparams))
                natgrad, grad = grad[:1], grad[1:]
                sum_gsq = accumulate(sum_gsq, square(grad))
                # AdaDelta scaling: RMS of past updates over RMS of past gradients
                diag_scaling = div(sqrt(add_scalar(epsilon, sum_usq)),
                                   sqrt(add_scalar(epsilon, sum_gsq)))
                update = mul(diag_scaling, grad)
                sum_usq = accumulate(sum_usq, square(update))

                natparams = add(natparams, scale(nat_stepsize, natgrad))
                params = add(params, update)
                allparams = concat(natparams, params)
                vals.append(val)

                if callback: callback(epoch, vals, natgrad, allparams)
        return allparams
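
The adadelta above (and the adam variants in examples #4 and #5) treat the parameter set as a nested tuple of arrays and manipulate it only through a small set of container helpers. Below is a minimal sketch of what those helpers might look like, assuming NumPy arrays and tuple containers; it illustrates the interface the examples rely on and is not the original helper module.

import numpy as np

def _map(f, *containers):
    # Recursively apply f elementwise over matching nested tuples/lists,
    # bottoming out at scalars and arrays.
    if isinstance(containers[0], (tuple, list)):
        return tuple(_map(f, *elts) for elts in zip(*containers))
    return f(*containers)

zeros_like = lambda a: _map(np.zeros_like, a)
add        = lambda a, b: _map(np.add, a, b)
sub        = lambda a, b: _map(np.subtract, a, b)
mul        = lambda a, b: _map(np.multiply, a, b)
div        = lambda a, b: _map(np.divide, a, b)
square     = lambda a: _map(np.square, a)
sqrt       = lambda a: _map(np.sqrt, a)
scale      = lambda c, a: _map(lambda x: c * x, a)
add_scalar = lambda c, a: _map(lambda x: c + x, a)
concat     = lambda a, b: tuple(a) + tuple(b)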
Example #2
def __init__(self, params, lr=0.0001, usage=0.7, decay=0.99, pullback=0.0):
    # NOTE: the default value for `pullback` is cut off in the source; 0.0 here
    # is only a placeholder. `zeros_like` is assumed to come from the
    # surrounding codebase (see the sketch after example #3).
    self.param_list = params
    self.lr = lr
    self.usage = usage
    self.decay = decay
    self.pullback = pullback
    self.lagged = zeros_like(params)
Example #3
def step(self):
    # `DirectSum`, `ddot`, and `dnorm` are assumed helpers from the surrounding
    # codebase (see the sketch below): a direct sum of per-parameter blocks with
    # an inner product and a norm defined over it.
    grad = DirectSum([x.grad for x in self.param_list])
    lag_comp = self.decay * ddot(grad, self.lagged)
    step = grad * (self.lr + lag_comp / pow(dnorm(grad), 1.5))
    step -= self.lagged * self.pullback
    self.lagged -= self.lagged * lag_comp / dnorm(grad)
    self.lagged *= self.decay
    for p, s in zip(self.param_list, step):
        p -= s  # update each parameter block in place
    self.lagged += grad
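
Examples #2 and #3 read like the __init__ and step methods of a single custom optimizer class, built on DirectSum, ddot, dnorm, and zeros_like from its surrounding codebase. The sketch below is only a guess at how such helpers could behave, treating a direct sum as a list of per-parameter blocks with elementwise arithmetic; every name and detail here is an assumption for illustration, not the original implementation.

import numpy as np

class DirectSum:
    # A direct sum of per-parameter blocks: elementwise arithmetic with another
    # DirectSum or a scalar, iteration over blocks, and (via ddot/dnorm below)
    # an inner product and a norm.
    def __init__(self, blocks):
        self.blocks = [np.asarray(b, dtype=float) for b in blocks]

    def _combine(self, other, op):
        if isinstance(other, DirectSum):
            return DirectSum([op(a, b) for a, b in zip(self.blocks, other.blocks)])
        return DirectSum([op(a, other) for a in self.blocks])  # scalar case

    def __add__(self, other):     return self._combine(other, np.add)
    def __sub__(self, other):     return self._combine(other, np.subtract)
    def __mul__(self, other):     return self._combine(other, np.multiply)
    def __truediv__(self, other): return self._combine(other, np.divide)
    __rmul__ = __mul__
    __iadd__, __isub__, __imul__ = __add__, __sub__, __mul__
    def __iter__(self):           return iter(self.blocks)

def ddot(a, b):
    # inner product summed over all blocks
    return float(sum(np.vdot(x, y) for x, y in zip(a.blocks, b.blocks)))

def dnorm(a):
    # Euclidean norm over all blocks
    return float(np.sqrt(ddot(a, a)))

def zeros_like(params):
    # zero direct sum matching the parameter shapes (assumes array-like params)
    return DirectSum([np.zeros_like(np.asarray(p, dtype=float)) for p in params])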
Example #4
    def adam(allparams,
             nat_stepsize,
             stepsize,
             num_epochs,
             seq_len,
             num_seqs=None,
             b1=0.9,
             b2=0.999,
             eps=1e-8,
             num_samples=1):
        natparams, params = allparams[:1], allparams[1:]
        # As in example #1, `data`, `num_datapoints`, `val_and_grad`, `callback`,
        # and the container helpers are assumed to come from the enclosing scope.
        m = zeros_like(params)  # first-moment (mean) accumulator
        v = zeros_like(params)  # second-moment (uncentered variance) accumulator
        i = 0                   # step counter for the bias correction
        accumulate = lambda rho, a, b: add(scale(1 - rho, a), scale(rho, b))

        for epoch in range(num_epochs):
            vals = []
            batches, num_batches = split_into_batches(data, seq_len, num_seqs)
            for y in batches:
                val, grad = scale(
                    1. / num_datapoints,
                    val_and_grad(y, num_batches, num_samples, *allparams))
                natgrad, grad = grad[:1], grad[1:]

                m = accumulate(b1, grad, m)  # first moment estimate
                v = accumulate(b2, square(grad), v)  # second moment estimate
                mhat = scale(1. / (1 - b1**(i + 1)), m)  # bias correction
                vhat = scale(1. / (1 - b2**(i + 1)), v)
                update = scale(stepsize, div(mhat, add_scalar(eps,
                                                              sqrt(vhat))))

                natparams = add(natparams, scale(nat_stepsize, natgrad))
                params = add(params, update)
                allparams = concat(natparams, params)
                vals.append(val)
                i += 1

                if callback: callback(epoch, vals, natgrad, allparams)

        return allparams
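
The mhat and vhat lines above undo the bias that comes from initializing m and v at zero. A quick scalar check with made-up numbers shows the effect on the very first step:

b1 = 0.9
g = 2.0                  # pretend this is the first gradient observed
m = (1 - b1) * g         # = 0.2: the zero-initialized m drags the estimate down
mhat = m / (1 - b1**1)   # = 2.0: the correction restores the gradient's scale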
Example #5
def adam(gradfun, allparams, num_iters, step_size, b1=0.9, b2=0.999, eps=1e-8):
    natparams, params = allparams[:1], allparams[1:]
    # The container helpers (zeros_like, add, sub, scale, square, sqrt, div,
    # add_scalar, concat) are assumed to be imported from a helper module.
    m = zeros_like(params)
    v = zeros_like(params)
    accumulate = lambda rho, a, b: add(scale(1-rho, a), scale(rho, b))
 
    for i in range(num_iters):
        grad = gradfun(allparams, i)
        natgrad, grad = grad[:1], grad[1:]
 
        m = accumulate(b1, grad, m)          # first moment estimate
        v = accumulate(b2, square(grad), v)  # second moment estimate
        mhat = scale(1./(1 - b1**(i+1)), m)  # bias correction
        vhat = scale(1./(1 - b2**(i+1)), v)
        update = scale(step_size, div(mhat, add_scalar(eps, sqrt(vhat))))
 
        natparams = sub(natparams, scale(step_size, natgrad))
        params = sub(params, update)
        allparams = concat(natparams, params)
 
    return allparams
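
A toy usage sketch for this last adam, assuming the container helpers sketched after example #1 are in scope; target, gradfun, and init are made-up names for illustration, and the objective is a simple quadratic.

import numpy as np

target = (np.array([1.0, -2.0]), np.array([0.5]), np.array([3.0]))

def gradfun(allparams, i):
    # gradient of 0.5 * ||allparams - target||^2, block by block
    return sub(allparams, target)

init = (np.zeros(2), np.zeros(1), np.zeros(1))
result = adam(gradfun, init, num_iters=2000, step_size=0.01)
# The first block plays the role of the natural parameters and the rest are the
# ordinary parameters; with this gradfun, both should end up close to `target`.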