from collections import OrderedDict

import theano.tensor as T


def _get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=9,
                          word_vec_name='W_emb'):
    print "Generating adadelta updates (implementation from dnn)"
    # compute the gradients with respect to the model parameters
    gparams = T.grad(cost, params)
    # accumulators for the running averages of squared gradients / squared updates
    accugrads, accudeltas = [], []
    for param in params:
        accugrads.append(build_shared_zeros(param.shape.eval(), 'accugrad'))
        accudeltas.append(build_shared_zeros(param.shape.eval(), 'accudelta'))
    # compute list of weights updates
    updates = OrderedDict()
    for accugrad, accudelta, param, gparam in zip(accugrads, accudeltas,
                                                  params, gparams):
        # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
        agrad = rho * accugrad + (1 - rho) * gparam * gparam
        dx = -T.sqrt((accudelta + eps) / (agrad + eps)) * gparam
        updates[accudelta] = rho * accudelta + (1 - rho) * dx * dx
        if (max_norm > 0) and param.ndim == 2 and param.name != word_vec_name:
            # rescale columns of 2D weight matrices (except the word embeddings)
            # whose L2 norm exceeds sqrt(max_norm)
            W = param + dx
            col_norms = W.norm(2, axis=0)
            desired_norms = T.clip(col_norms, 0, T.sqrt(max_norm))
            updates[param] = W * (desired_norms / (1e-7 + col_norms))
        else:
            updates[param] = param + dx
        updates[accugrad] = agrad
    return updates
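

# NOTE: the functions in this file call a `build_shared_zeros` helper that is
# defined elsewhere in the repository. The definition below is only a minimal
# sketch of what that helper is assumed to do (a zero-initialized Theano
# shared variable with the given shape and name), included so this snippet is
# self-contained.
import numpy as np
import theano


def build_shared_zeros(shape, name):
    """ Build a Theano shared variable filled with zeros (assumed helper). """
    return theano.shared(value=np.zeros(shape, dtype=theano.config.floatX),
                         name=name, borrow=True)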


def get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=9,
                         word_vec_name='W_emb'):
    """ Adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)
    """
    print "Generating adadelta updates"
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = []
    for param in params:
        # running averages of the squared gradients and of the squared updates
        exp_sqr_grads[param] = build_shared_zeros(param.shape.eval(),
                                                  name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = build_shared_zeros(param.shape.eval(),
                                                name="exp_ups_%s" % param.name)
        gparams.append(gp)
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step = -(T.sqrt(exp_su + eps) / T.sqrt(up_exp_sg + eps)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        # if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
        if max_norm and param.name != word_vec_name:
            # rescale columns whose L2 norm exceeds sqrt(max_norm)
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(max_norm))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates


def get_adagrad_updates(mean_cost, params, learning_rate=0.1, max_norm=9, _eps=1e-6):
    """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate. """
    print "Generating adagrad updates"
    # compute the gradients with respect to the model parameters
    gparams = T.grad(mean_cost, params)
    # accumulators for the sums of squared gradients
    accugrads = []
    for param in params:
        accugrads.append(build_shared_zeros(param.shape.eval(), 'accugrad'))
    # compute list of weights updates
    updates = OrderedDict()
    for accugrad, param, gparam in zip(accugrads, params, gparams):
        # accumulate the sum of squared gradients (Adagrad)
        agrad = accugrad + gparam * gparam
        dx = -(learning_rate / T.sqrt(agrad + _eps)) * gparam
        update = param + dx
        if max_norm:
            # rescale columns whose L2 norm exceeds max_norm
            W = param + dx
            col_norms = W.norm(2, axis=0)
            desired_norms = T.clip(col_norms, 0, max_norm)
            update = W * (desired_norms / (1e-6 + col_norms))
        updates[param] = update
        updates[accugrad] = agrad
    return updates
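

# Usage sketch (illustrative, not part of the original file): compile a Theano
# training step from one of the update dictionaries above. The tiny softmax
# model, data shapes, and batch below are hypothetical.
if __name__ == '__main__':
    import numpy as np
    import theano

    x = T.matrix('x')
    y = T.ivector('y')
    W = theano.shared(np.zeros((5, 3), dtype=theano.config.floatX), name='W')
    b = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='b')
    params = [W, b]

    p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)
    cost = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])

    # any of the three trainers returns an OrderedDict of updates that can be
    # handed directly to theano.function
    updates = get_adagrad_updates(cost, params, learning_rate=0.1, max_norm=9)
    train_fn = theano.function(inputs=[x, y], outputs=cost, updates=updates)

    x_batch = np.random.randn(4, 5).astype(theano.config.floatX)
    y_batch = np.asarray([0, 1, 2, 1], dtype='int32')
    print train_fn(x_batch, y_batch)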