def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """Adadelta updates

    The gradient is scaled by the ratio of the RMS of accumulated step
    sizes to the RMS of accumulated squared gradients, so the effective
    step size adapts per parameter.

    Math:

    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``update = (grad * cgt.sqrt(delta_accu + epsilon) /
      cgt.sqrt(accu_new + epsilon))``
    * ``param = param - learning_rate * update``
    * ``delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of the gradient moving average.
    epsilon : float
        Small constant that avoids division by zero while scaling.

    Returns
    -------
    list of tuples of the form [(param, param_new),
    (accumulated_grads, accumulated_grads_new),
    (step_accum, step_accum_new)]

    References
    ----------
    .. [1] Zeiler, M. D. (2012):
           ADADELTA: An Adaptive Learning Rate Method.
           arXiv Preprint arXiv:1212.5701.
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        # Decaying average of squared gradients.
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        # Decaying average of squared parameter updates.
        delta_accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))

        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) /
                  cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - learning_rate * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
        updates.append((delta_accu, delta_accu_new))
    return updates
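# Illustrative sketch (not part of the library): how the (variable, new_value)
# pairs returned by adadelta are consumed by cgt.function. The toy least-squares
# problem, shapes, and the helper name below are made up for demonstration.
def _adadelta_usage_sketch():
    import numpy as np
    import cgt

    X = cgt.matrix("X", dtype=cgt.floatX)
    y = cgt.vector("y", dtype=cgt.floatX)
    w = cgt.shared(np.zeros(5, dtype=cgt.floatX))
    cost = cgt.sum(cgt.square(cgt.dot(X, w) - y))

    # Each call applies the accumulated-statistics and parameter updates in place.
    train = cgt.function([X, y], cost, updates=adadelta(cost, [w]))

    Xval = np.random.randn(100, 5).astype(cgt.floatX)
    yval = np.random.randn(100).astype(cgt.floatX)
    for _ in range(20):
        train(Xval, yval)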
def __init__(self, x, n_in, n_hid, n_out, nlayers=1, y=None, eps=None):
    super(GaussianMLP, self).__init__(x, n_in, n_hid, nlayers=nlayers,
                                      prefix="GaussianMLP_hidden")
    self.mu_layer = HiddenLayer(
        input=self.hidden_layers[-1].output,
        n_in=self.hidden_layers[-1].n_out,
        n_out=n_out,
        activation=None,
        prefix="GaussianMLP_mu"
    )
    # log(sigma^2)
    self.logvar_layer = HiddenLayer(
        input=self.hidden_layers[-1].output,
        n_in=self.hidden_layers[-1].n_out,
        n_out=n_out,
        activation=None,
        prefix="GaussianMLP_logvar"
    )
    self.mu = self.mu_layer.output
    self.var = cgt.exp(self.logvar_layer.output)
    self.sigma = cgt.sqrt(self.var)
    self.params = (self.params + self.mu_layer.params +
                   self.logvar_layer.params)
    # for use as encoder: reparameterization trick, z = mu + sigma * eps
    if eps is not None:
        assert y is None
        self.out = self.mu + self.sigma * eps
    # for use as decoder: squash the mean and score y under a diagonal Gaussian
    if y is not None:
        assert eps is None
        self.out = cgt.sigmoid(self.mu)
        self.cost = -cgt.sum(log_diag_mvn(self.out, self.var)(y))
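# Illustrative sketch (not part of the library): building GaussianMLP as the
# encoder and decoder halves of a VAE. The layer sizes (784/500/20) are made-up,
# MNIST-like values; x and eps are the symbolic inputs the class expects.
def _gaussian_mlp_usage_sketch():
    import cgt

    x = cgt.matrix("x", dtype=cgt.floatX)      # minibatch of inputs
    eps = cgt.matrix("eps", dtype=cgt.floatX)  # standard-normal noise, shaped like z

    # Encoder: eps is given, so out = mu + sigma * eps (reparameterization trick).
    enc = GaussianMLP(x, 784, 500, 20, nlayers=1, eps=eps)
    # Decoder: y is given, so out = sigmoid(mu) and cost scores y under a diagonal Gaussian.
    dec = GaussianMLP(enc.out, 20, 500, 784, nlayers=1, y=x)
    return enc, dec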
def adagrad_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    # NOTE: despite the name, this follows the Adadelta rule: it keeps decaying
    # averages of both squared gradients (accu) and squared updates (delta_accu).
    grads = cgt.grad(cost, params)
    updates = []
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        delta_accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))

        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) /
                  cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - stepsize * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
        updates.append((delta_accu, delta_accu_new))
    return updates
def rmsprop_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        acc = cgt.shared(p.op.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * cgt.square(g)
        gradient_scaling = cgt.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - stepsize * g))
    return updates
def __init__(self, xdim, args, dec="bernoulli"): self.xdim = xdim self.hdim = args.hdim self.zdim = args.zdim self.lmbda = args.lmbda # weight decay coefficient * 2 self.x = cgt.matrix("x", dtype=cgt.floatX) self.eps = cgt.matrix("eps", dtype=cgt.floatX) self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps) if dec == "bernoulli": # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y) self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x) elif dec == "gaussian": self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x) else: raise RuntimeError("unrecognized decoder %" % dec) self.cost = (-cgt.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var)) + self.dec_mlp.cost) / args.batch_size self.params = self.enc_mlp.params + self.dec_mlp.params # L2 regularization self.gparams = [cgt.grad(self.cost, [p])[0] + self.lmbda * p for p in self.params] self.gaccums = [cgt.shared(np.zeros(p.op.get_value().shape, dtype=cgt.floatX)) for p in self.params] # XXX replace w/ adagrad update from nn ADAGRAD_EPS = 1e-10 # for stability self.updates = [ (param, param - args.lr * gparam / cgt.sqrt(gaccum + cgt.square(gparam) + ADAGRAD_EPS)) for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums) ] self.updates += [ (gaccum, gaccum + cgt.square(gparam)) for gaccum, gparam in zip(self.gaccums, self.gparams) ] self.train = cgt.function( [self.x, self.eps], self.cost, updates=self.updates ) self.test = cgt.function( [self.x, self.eps], self.cost, updates=None ) # can be used for semi-supervised learning for example self.encode = cgt.function( [self.x, self.eps], self.enc_mlp.out )
def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates

    The step size is scaled by dividing by the root of a decaying moving
    average of squared gradients. See [1].

    Math:

    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad /
      cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of the gradient moving average.
    epsilon : float
        Small constant that avoids division by zero while scaling.

    Returns
    -------
    list of tuples of the form [(param, param_new),
    (accumulated_RMS_grads, accumulated_RMS_grads_new)]

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex
           optimization.
           arXiv:1502.04390 http://arxiv.org/abs/1502.04390
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad /
                                        cgt.sqrt(accu_new + epsilon))))
    return updates
def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates

    The step size is scaled by dividing by the root of a decaying moving
    average of squared gradients. See [1].

    Math:

    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad /
      cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of the gradient moving average.
    epsilon : float
        Small constant that avoids division by zero while scaling.

    Returns
    -------
    list of tuples of the form [(param, param_new),
    (accumulated_RMS_grads, accumulated_RMS_grads_new)]

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex
           optimization.
           arXiv:1502.04390 http://arxiv.org/abs/1502.04390
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad /
                                        cgt.sqrt(accu_new + epsilon))))
    return updates
def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates

    The learning rate is scaled by dividing it by the square root of the
    sum of accumulated squared gradients.

    Math:

    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) /
      cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon : float
        Small constant that avoids division by zero while scaling.

    Returns
    -------
    list of tuples of the form [(param, param_new),
    (accumulated_grads, accumulated_grads_new)]

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic
           optimization. JMLR, 12:2121-2159.
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = accu + grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad) /
                        cgt.sqrt(accu_new + epsilon)))
    return updates
def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates

    The learning rate is scaled by dividing it by the square root of the
    sum of accumulated squared gradients.

    Math:

    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) /
      cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables.
        We generate update expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon : float
        Small constant that avoids division by zero while scaling.

    Returns
    -------
    list of tuples of the form [(param, param_new),
    (accumulated_grads, accumulated_grads_new)]

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic
           optimization. JMLR, 12:2121-2159.
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = accu + grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad) /
                        cgt.sqrt(accu_new + epsilon)))
    return updates
def sqrt(x):
    return cgt.sqrt(x)