from collections import OrderedDict

import numpy as np


def _test_optimizer(optimizer):
    mbs = 10
    # NOTE: `random.Random('probability')` is assumed to be this repo's
    # dataset loader (an object exposing `.train.data`), not the stdlib
    # random.Random, which has no `train` attribute.
    dataset = random.Random('probability')
    data = B.eval(dataset.train.data[0:mbs])
    pixels = data.shape[1]

    # Two parameter vectors to optimize.
    W0 = B.variable(np.random.normal(size=(pixels,)), dtype=B.floatx(),
                    name='W0')
    W1 = B.variable(np.random.normal(size=(pixels,)), dtype=B.floatx(),
                    name='W1')
    params = [W0, W1]

    # A simple scalar loss: project the minibatch onto the elementwise
    # squares of the parameters and sum.
    inputs = B.placeholder((mbs, pixels), dtype=B.floatx())
    loss = B.sum(B.dot(inputs, B.square(W0) + B.square(W1)))

    # Compile one training step and check that it returns a scalar loss.
    updates = optimizer.get_updates(params, loss)
    f = B.function([inputs], [loss], updates=updates)
    output = f(data)
    assert len(output) == 1
    assert output[0].size == 1
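
# Usage sketch for the helper above. The optimizer classes and constructor
# arguments are illustrative assumptions (Keras-style defaults), not
# necessarily the names defined in this repo:
#
#     _test_optimizer(Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8))
#     _test_optimizer(Nadam(lr=0.002, schedule_decay=0.004))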

def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    self.updates = OrderedDict()
    self.updates[self.iterations] = self.iterations + 1

    # Optional learning-rate decay.
    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))

    # Bias-corrected step size for timestep t (Adam).
    t = self.iterations + 1
    lr_t = lr * (B.sqrt(1. - B.pow(self.beta_2, t)) /
                 (1. - B.pow(self.beta_1, t)))

    # First- and second-moment accumulators, one pair per parameter.
    ms = []
    vs = []
    for p in params:
        shape = B.get_variable_shape(p)
        ms.append(B.zeros(shape, name=p.name + '_ms'))
        vs.append(B.zeros(shape, name=p.name + '_vs'))
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # Exponential moving averages of the gradient and its square.
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * B.square(g)
        p_t = p - lr_t * m_t / (B.sqrt(v_t) + self.epsilon)

        self.updates[m] = m_t
        self.updates[v] = v_t
        self.updates[p] = p_t
    return self.updates
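
# For intuition, a self-contained NumPy sketch of a single Adam step,
# mirroring the symbolic updates built above. `adam_step` is illustrative
# only, not part of this codebase:
def adam_step(p, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999,
              epsilon=1e-8):
    m = beta_1 * m + (1. - beta_1) * g       # first-moment EMA
    v = beta_2 * v + (1. - beta_2) * g ** 2  # second-moment EMA
    # Fold the bias corrections for m and v into the step size.
    lr_t = lr * np.sqrt(1. - beta_2 ** t) / (1. - beta_1 ** t)
    return p - lr_t * m / (np.sqrt(v) + epsilon), m, v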

def __call__(self, x):
    regularization = 0.
    if self.l1:
        regularization += B.sum(self.l1 * B.abs(x))
    if self.l2:
        regularization += B.sum(self.l2 * B.square(x))
    return regularization
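
# Usage sketch: with a regularizer class exposing `l1`/`l2` attributes and
# the __call__ above (the class name `L1L2` is an assumption), the penalty
# is a symbolic scalar added onto the training loss:
#
#     reg = L1L2(l1=0.01, l2=0.01)
#     loss = data_loss + reg(W0)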

def get_gradients(self, loss, params):
    grads = B.gradients(loss, params)
    # Clip by the global L2 norm across all gradients, if requested.
    if hasattr(self, 'clipnorm') and self.clipnorm > 0:
        norm = B.sqrt(sum([B.sum(B.square(g)) for g in grads]))
        grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
    # Clip each gradient elementwise, if requested.
    if hasattr(self, 'clipvalue') and self.clipvalue > 0:
        grads = [B.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
    return grads
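
# The `clip_norm` helper used above is not shown in this section. A minimal
# sketch consistent with the call site, assuming the backend provides a
# Keras-style B.switch(condition, then_expr, else_expr):
def clip_norm(g, c, n):
    # Rescale g so the joint gradient norm n does not exceed c.
    if c <= 0:
        return g
    return B.switch(n >= c, g * c / n, g)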

def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    self.updates = OrderedDict()
    self.updates[self.iterations] = self.iterations + 1
    t = self.iterations + 1

    # Warming momentum schedule, per the recommendations in [2].
    momentum_cache_t = self.beta_1 * (
        1. - 0.5 * (B.pow(0.96, t * self.schedule_decay)))
    momentum_cache_t_1 = self.beta_1 * (
        1. - 0.5 * (B.pow(0.96, (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    self.updates[self.m_schedule] = m_schedule_new

    # First- and second-moment accumulators, one pair per parameter.
    ms = []
    vs = []
    for p in params:
        shape = B.get_variable_shape(p)
        ms.append(B.zeros(shape, name=p.name + '_ms'))
        vs.append(B.zeros(shape, name=p.name + '_vs'))
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # The following equations are given in [1].
        g_prime = g / (1. - m_schedule_new)
        m_t = self.beta_1 * m + (1. - self.beta_1) * g
        m_t_prime = m_t / (1. - m_schedule_next)
        v_t = self.beta_2 * v + (1. - self.beta_2) * B.square(g)
        v_t_prime = v_t / (1. - B.pow(self.beta_2, t))
        m_t_bar = ((1. - momentum_cache_t) * g_prime +
                   momentum_cache_t_1 * m_t_prime)

        self.updates[m] = m_t
        self.updates[v] = v_t
        self.updates[p] = p - self.lr * m_t_bar / (B.sqrt(v_t_prime) +
                                                   self.epsilon)
    return self.updates
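
# For reference, the update implemented above (Nesterov-accelerated Adam,
# following [1]), with mu_t the warming momentum schedule; the products below
# correspond to m_schedule_new and m_schedule_next:
#
#     g'_t    = g_t / (1 - prod_{i<=t} mu_i)
#     m_t     = beta_1 * m_{t-1} + (1 - beta_1) * g_t
#     m'_t    = m_t / (1 - prod_{i<=t+1} mu_i)
#     v_t     = beta_2 * v_{t-1} + (1 - beta_2) * g_t^2
#     v'_t    = v_t / (1 - beta_2^t)
#     m_bar_t = (1 - mu_t) * g'_t + mu_{t+1} * m'_t
#     p_t     = p_{t-1} - lr * m_bar_t / (sqrt(v'_t) + epsilon)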