예제 #1
0
def _test_optimizer(optimizer):
    """Smoke-test ``optimizer`` by compiling and running a single update step.

    Two weight vectors (``W0`` and ``W1``) are initialised from a normal
    distribution, a scalar loss ``sum(dot(inputs, W0**2 + W1**2))`` is built
    on top of them, and the updates returned by ``optimizer.get_updates`` are
    attached to a backend function that is then called once on a small batch.

    Args:
        optimizer: object exposing ``get_updates(params, loss)``.

    Raises:
        AssertionError: if the compiled function does not return exactly one
            output, or that output does not hold exactly one element.
    """
    batch_size = 10

    # NOTE(review): ``random.Random`` is used here as a dataset object that
    # exposes ``.train.data`` -- presumably a project-local ``random`` module
    # rather than the standard library one; confirm against the file imports.
    source = random.Random('probability')
    batch = B.eval(source.train.data[0:batch_size])
    n_features = batch.shape[1]

    # One weight vector per name, created in the order W0, W1.
    weights = [
        B.variable(np.random.normal(size=(n_features, )),
                   dtype=B.floatx(),
                   name=label)
        for label in ('W0', 'W1')
    ]
    first, second = weights

    batch_input = B.placeholder((batch_size, n_features), dtype=B.floatx())
    squared_weights = B.square(first) + B.square(second)
    objective = B.sum(B.dot(batch_input, squared_weights))

    step = B.function([batch_input],
                      [objective],
                      updates=optimizer.get_updates(weights, objective))

    result = step(batch)
    assert len(result) == 1
    assert result[0].size == 1
예제 #2
0
    def get_updates(self, params, loss):
        """Build the ordered update map for one optimisation step.

        For every parameter two zero-initialised state variables are created
        (named ``<param>_ms`` and ``<param>_vs``) that hold exponential moving
        averages of the gradient and of the squared gradient. The parameter
        is then moved against the first average, scaled by the square root of
        the second one (this appears to follow the Adam update rule).

        Side effects: ``self.updates`` is replaced by a fresh ``OrderedDict``
        and ``self.weights`` is set to the iteration counter followed by all
        first-moment and then all second-moment state variables.

        Args:
            params: sequence of backend variables to optimise; each must
                expose a ``name`` attribute.
            loss: backend scalar to differentiate.

        Returns:
            The ``OrderedDict`` stored in ``self.updates``, mapping each
            variable to its new value. Insertion order is: iteration counter,
            then (first moment, second moment, parameter) per parameter.
        """
        gradients = self.get_gradients(loss, params)
        self.updates = OrderedDict()
        self.updates[self.iterations] = self.iterations + 1

        # Optional decay of the base learning rate with the iteration count.
        step_size = self.lr
        if self.initial_decay > 0:
            step_size *= (1. / (1. + self.decay * self.iterations))

        # Fold the corrections for the zero-initialised averages into the
        # effective learning rate.
        step = self.iterations + 1
        second_correction = B.sqrt(1. - B.pow(self.beta_2, step))
        first_correction = 1. - B.pow(self.beta_1, step)
        lr_t = step_size * (second_correction / first_correction)

        # State variables are created pairwise, parameter by parameter.
        first_moments = []
        second_moments = []
        for param in params:
            dims = B.get_variable_shape(param)
            first_moments.append(B.zeros(dims, name=param.name + '_ms'))
            second_moments.append(B.zeros(dims, name=param.name + '_vs'))
        self.weights = [self.iterations] + first_moments + second_moments

        slots = zip(params, gradients, first_moments, second_moments)
        for param, grad, m_prev, v_prev in slots:
            m_next = (self.beta_1 * m_prev) + (1. - self.beta_1) * grad
            v_next = ((self.beta_2 * v_prev) +
                      (1. - self.beta_2) * B.square(grad))
            param_next = param - lr_t * m_next / (B.sqrt(v_next) +
                                                  self.epsilon)

            self.updates[m_prev] = m_next
            self.updates[v_prev] = v_next
            self.updates[param] = param_next
        return self.updates
예제 #3
0
 def __call__(self, x):
     """Return the regularisation penalty for ``x``.

     The penalty is ``sum(l1 * |x|)`` when ``self.l1`` is truthy plus
     ``sum(l2 * x**2)`` when ``self.l2`` is truthy. When neither coefficient
     is set the plain float ``0.`` is returned.

     Args:
         x: backend tensor to penalise.

     Returns:
         The accumulated penalty (a backend scalar, or ``0.``).
     """
     penalty = 0.
     # L1 term first, then L2, each skipped when its coefficient is falsy.
     for coefficient, transform in ((self.l1, B.abs), (self.l2, B.square)):
         if coefficient:
             penalty += B.sum(coefficient * transform(x))
     return penalty
예제 #4
0
 def get_gradients(self, loss, params):
     """Return the gradients of ``loss`` w.r.t. ``params``, optionally clipped.

     Two independent clipping stages are applied, each only when the
     corresponding attribute exists on ``self`` and is greater than zero:

     * ``clipnorm``: every gradient is passed to the ``clip_norm`` helper
       together with the threshold and the square root of the summed
       squared entries of *all* gradients.
     * ``clipvalue``: every gradient is clipped element-wise to the range
       ``[-clipvalue, clipvalue]``.

     Args:
         loss: backend scalar to differentiate.
         params: sequence of backend variables.

     Returns:
         List of gradient tensors, one per entry of ``params``.
     """
     gradients = B.gradients(loss, params)

     # A missing attribute behaves like a disabled (zero) threshold.
     max_norm = getattr(self, 'clipnorm', 0)
     if max_norm > 0:
         total_norm = B.sqrt(sum(B.sum(B.square(g)) for g in gradients))
         gradients = [clip_norm(g, max_norm, total_norm) for g in gradients]

     max_abs = getattr(self, 'clipvalue', 0)
     if max_abs > 0:
         gradients = [B.clip(g, -max_abs, max_abs) for g in gradients]

     return gradients
예제 #5
0
    def get_updates(self, params, loss):
        """Build the ordered update map for one optimisation step.

        Keeps exponential moving averages of the gradient and of the squared
        gradient per parameter (state variables named ``<param>_ms`` and
        ``<param>_vs``) and combines the current gradient with the averaged
        one using a momentum coefficient that varies with the iteration
        count (this appears to follow the Nadam update rule).

        Side effects: ``self.updates`` is replaced by a fresh ``OrderedDict``
        and ``self.weights`` is set to the iteration counter followed by all
        first-moment and then all second-moment state variables.

        Args:
            params: sequence of backend variables to optimise; each must
                expose a ``name`` attribute.
            loss: backend scalar to differentiate.

        Returns:
            The ``OrderedDict`` stored in ``self.updates``. Insertion order
            is: iteration counter, momentum schedule, then (first moment,
            second moment, parameter) per parameter.
        """
        gradients = self.get_gradients(loss, params)
        self.updates = OrderedDict()
        self.updates[self.iterations] = self.iterations + 1

        step = self.iterations + 1

        # Warming momentum schedule, per the recommendations in [2]
        # (the citation list is not visible in this excerpt).
        mu_now = self.beta_1 * (
            1. - 0.5 * (B.pow(0.96, step * self.schedule_decay)))
        mu_ahead = self.beta_1 * (
            1. - 0.5 * (B.pow(0.96, (step + 1) * self.schedule_decay)))

        # Running product of the momentum coefficients, for this step and
        # for the step after it.
        schedule_now = self.m_schedule * mu_now
        schedule_ahead = self.m_schedule * mu_now * mu_ahead
        self.updates[self.m_schedule] = schedule_now

        # State variables are created pairwise, parameter by parameter.
        first_moments = []
        second_moments = []
        for param in params:
            dims = B.get_variable_shape(param)
            first_moments.append(B.zeros(dims, name=param.name + '_ms'))
            second_moments.append(B.zeros(dims, name=param.name + '_vs'))

        self.weights = [self.iterations] + first_moments + second_moments

        slots = zip(params, gradients, first_moments, second_moments)
        for param, grad, m_prev, v_prev in slots:
            # Equations as given in [1] (citation not visible here).
            grad_hat = grad / (1. - schedule_now)
            m_next = self.beta_1 * m_prev + (1. - self.beta_1) * grad
            m_hat = m_next / (1. - schedule_ahead)
            v_next = self.beta_2 * v_prev + (1. - self.beta_2) * B.square(grad)
            v_hat = v_next / (1. - B.pow(self.beta_2, step))
            # Blend the rescaled raw gradient with the rescaled average.
            m_bar = (1. - mu_now) * grad_hat + mu_ahead * m_hat

            self.updates[m_prev] = m_next
            self.updates[v_prev] = v_next
            self.updates[param] = (
                param - self.lr * m_bar / (B.sqrt(v_hat) + self.epsilon))
        return self.updates