def sgd_updates_adagrad(self, cost, learning_rate):
    """
    Return the dictionary of parameter-specific learning rate updates
    using the adagrad algorithm.
    """
    # Initialize the containers
    accumulators = {}
    e0s = {}
    learn_rates = []
    ups = {}

    # Initialize the squared-gradient accumulator and the base rate epsilon_0
    # for each parameter.
    for param in self.params:
        accumulators[param] = theano.shared(value=as_floatX(0.),
                                            name="acc_%s" % param.name)
        e0s[param] = as_floatX(learning_rate)

    self.grads = [T.grad(cost, p) for p in self.params]

    # Compute the per-parameter learning rates: accumulate the squared
    # gradient norm and divide the base rate by its square root.
    for param, gp in zip(self.params, self.grads):
        acc = accumulators[param]
        ups[acc] = acc + (gp ** 2).sum()
        learn_rates.append(e0s[param] / T.sqrt(ups[acc]))

    # Build the parameter updates from the per-parameter learning rates.
    updates = [(p, p - step * gp)
               for (step, p, gp) in zip(learn_rates, self.params, self.grads)]
    p_up = dict(updates)
    safe_update(ups, p_up)
    return ups
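# A minimal usage sketch (assumed, not part of the original file): the
# dictionary returned by sgd_updates_adagrad is intended to be passed
# directly as the `updates` argument of theano.function. The names `model`,
# `x`, `y`, `model.cost`, and `batches` below are hypothetical placeholders.
#
#     cost = model.cost(x, y)
#     updates = model.sgd_updates_adagrad(cost, learning_rate=0.01)
#     train_fn = theano.function([x, y], outputs=cost, updates=updates)
#     for xb, yb in batches:
#         batch_cost = train_fn(xb, yb)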
def __init__(self, inputs, cost, layers, max_col_norm=None,
             loss_based_pooling=False, pooling_loss=None, learning_rate=0.01,
             momentum=None, rmsprop=True, adadelta=False, center_grads=False,
             rho=0.96, epsilon=1e-8, use_nesterov=True, seed=None, rng=None,
             constants=None, **kw):

    self.loss_based_pooling = loss_based_pooling
    self.rng = rng

    params = [layer.W for layer in layers] + [layer.b for layer in layers]
    self.learning_rate = theano.shared(
        numpy.asarray(learning_rate, dtype=theano.config.floatX))
    self.layers = layers
    self.max_col_norm = max_col_norm

    # Initialize the shared accumulators for rmsprop and adadelta:
    accumulators = OrderedDict({})
    accumulators_mgrad = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    e0s = OrderedDict({})
    learn_rates = []

    from utils import as_floatX

    gparams = []
    for param in params:
        eps_p = numpy.zeros_like(param.get_value())
        accumulators[param] = theano.shared(value=as_floatX(eps_p),
                                            name="acc_%s" % param.name)
        accumulators_mgrad[param] = theano.shared(value=as_floatX(eps_p),
                                                  name="acc_mgrad_%s" % param.name)
        exp_sqr_grads[param] = theano.shared(value=as_floatX(eps_p),
                                             name="exp_grad_%s" % param.name)
        exp_sqr_ups[param] = theano.shared(value=as_floatX(eps_p),
                                           name="exp_ups_%s" % param.name)
        e0s[param] = as_floatX(learning_rate)
        gparam = T.grad(cost, param, consider_constant=constants)
        gparams.append(gparam)

    updates = OrderedDict({})

    i = 0
    for param, gparam in zip(params, gparams):
        if rmsprop:
            # Running average of the squared gradient.
            acc = accumulators[param]
            rms_grad = rho * acc + (1 - rho) * T.sqr(gparam)
            updates[acc] = rms_grad
            val = T.maximum(T.sqrt(T.sum(rms_grad, axis=0)), epsilon)
            learn_rates.append(e0s[param] / val)

            if center_grads:
                # Running average of the gradient, used to center it.
                acc_mg = accumulators_mgrad[param]
                mean_grad = rho * acc_mg + (1 - rho) * gparam
                gparam = gparam - mean_grad
                updates[acc_mg] = mean_grad

            if momentum and not use_nesterov:
                memory = theano.shared(param.get_value() * 0.)
                updates[param] = param - memory
                updates[memory] = momentum * memory + learn_rates[i] * gparam
            elif use_nesterov:
                memory = theano.shared(param.get_value() * 0.)
                new_memo = momentum * memory - e0s[param] * gparam
                # new_memo = momentum * memory - learn_rates[i] * gparam
                updates[memory] = new_memo
                updates[param] = param + (momentum * new_memo -
                                          e0s[param] * gparam) / val
            else:
                updates[param] = param - learn_rates[i] * gparam
            i += 1
        elif adadelta:
            # Adadelta: running averages of the squared gradients and of the
            # squared updates determine the step size.
            exp_sg = exp_sqr_grads[param]
            exp_su = exp_sqr_ups[param]
            up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gparam)
            updates[exp_sg] = up_exp_sg
            step = -(T.sqrt(exp_su + epsilon) /
                     T.sqrt(up_exp_sg + epsilon)) * gparam
            updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
            updates[param] = param + step
        else:
            # Plain SGD, optionally with (Nesterov) momentum.
            if momentum and not use_nesterov:
                memory = theano.shared(param.get_value() * 0.)
                updates[param] = param - memory
                updates[memory] = momentum * memory + learning_rate * gparam
            elif use_nesterov:
                memory = theano.shared(param.get_value() * 0.)
                new_memo = momentum * memory - learning_rate * gparam
                updates[memory] = new_memo
                updates[param] = (param + momentum * new_memo -
                                  learning_rate * gparam)
            else:
                updates[param] = param - learning_rate * gparam

    if max_col_norm is not None:
        updates = self.constrain_weights(layers, updates, max_col_norm)

    self.updates = updates
    self._train = theano.function(inputs, outputs=cost, updates=updates)
    if pooling_loss is not None:
        self._constrain_inputs = theano.function(
            inputs, outputs=T.argsort(pooling_loss, axis=0))
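# A hedged usage sketch (assumed, not from the original file): the constructor
# compiles self._train from the given symbolic inputs, cost, and layers, so a
# caller would look roughly like the following. The class name `Trainer` and
# the names `x`, `y`, `hidden_layers`, and `batches` are hypothetical
# placeholders; momentum is passed explicitly because the Nesterov branch
# needs a numeric momentum value.
#
#     trainer = Trainer(inputs=[x, y], cost=cost, layers=hidden_layers,
#                       learning_rate=0.01, momentum=0.9,
#                       rmsprop=True, use_nesterov=True, max_col_norm=1.9365)
#     for xb, yb in batches:
#         batch_cost = trainer._train(xb, yb)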