def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Provides the updates for learning with gradient descent + momentum.

    Parameters
    ----------
    learning_rate : theano scalar
        Learning rate coefficient (symbolic input of ``f_update``).
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Symbolic inputs required to compute ``cost`` and ``errors``.
    cost : theano expression
        Cost reported by ``f_grad_shared``.
    errors : theano expression
        Error measure reported by ``f_grad_shared``.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier.

    Returns
    -------
    tuple
        ``(f_grad_shared, f_update)`` theano functions.
    """
    if lr_scalers is None:
        # BUG FIX: lr_scalers.get(...) was called unconditionally below,
        # crashing with the default lr_scalers=None.
        lr_scalers = {}

    # BUG FIX: build the OrderedDict from an ordered pair sequence.
    # OrderedDict(dict-comprehension) has arbitrary key order, so
    # zip(gshared.values(), grads.values()) could pair a shared gradient
    # buffer with the wrong gradient expression.
    gshared = OrderedDict((p, sharedX(p.get_value() * 0.,
                                      name='%s_grad' % p.name))
                          for p, g in grads.iteritems())
    gsup = [(gs, g) for gs, g in zip(gshared.values(), grads.values())]

    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    updates = OrderedDict()
    # BUG FIX: iterate (param, grad) pairs; the original unpacked
    # gshared.keys(), which yields single parameters and raises.
    for param, grad in gshared.iteritems():
        vel = sharedX(param.get_value() * 0.)
        assert param.dtype == vel.dtype
        assert grad.dtype == param.dtype
        if param.name is not None:
            vel.name = 'vel_' + param.name

        scaled_lr = learning_rate * lr_scalers.get(param, 1.)
        updates[vel] = self.momentum * vel - scaled_lr * grad
        inc = updates[vel]
        if self.nesterov_momentum:
            # Nesterov look-ahead: apply momentum to the velocity update
            # itself before stepping.
            inc = self.momentum * inc - scaled_lr * grad

        assert inc.dtype == vel.dtype
        updates[param] = param + inc

    f_update = theano.function([learning_rate],
                               [],
                               updates=updates,
                               on_unused_input='ignore')
    return f_grad_shared, f_update
def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Compute the AdaDelta updates.

    Parameters
    ----------
    learning_rate : float
        Learning rate coefficient; used here as the AdaDelta epsilon,
        scaled per-parameter by ``lr_scalers``.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier.

    Returns
    -------
    tuple
        ``(updates, tot_norm_up, tot_param_norm)`` — the update
        OrderedDict plus symbolic total update and parameter norms.
    """
    if lr_scalers is None:
        # BUG FIX: lr_scalers.get(...) was called unconditionally below,
        # crashing with the default lr_scalers=None.
        lr_scalers = {}

    updates = OrderedDict()
    tot_norm_up = 0
    tot_param_norm = 0

    for param in grads.keys():
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(param.get_value() * 0.)
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0.)

        if param.name is not None:
            mean_square_grad.name = 'mean_square_grad_' + param.name
            mean_square_dx.name = 'mean_square_dx_' + param.name

        # Accumulate gradient second moment.
        new_mean_squared_grad = (self.decay * mean_square_grad +
                                 (1 - self.decay) * T.sqr(grads[param]))

        # Compute update: ratio of the RMS of past updates to the RMS of
        # past gradients (Zeiler, 2012).
        epsilon = lr_scalers.get(param, 1.) * learning_rate
        rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
        rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
        delta_x_t = -rms_dx_tm1 / rms_grad_t * grads[param]

        # Accumulate the update second moment.
        new_mean_square_dx = (self.decay * mean_square_dx +
                              (1 - self.decay) * T.sqr(delta_x_t))

        # Apply update.
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[param] = param + delta_x_t

        tot_norm_up += delta_x_t.norm(2)
        tot_param_norm += param.norm(2)

    return updates, tot_norm_up, tot_param_norm
def adam(lr, tparams, grads, inp, cost, errors):
    """
    Adam optimizer (Kingma & Ba, 2014).

    Parameters
    ----------
    lr : theano scalar
        Learning rate input of the returned ``f_update`` function.
    tparams : OrderedDict
        Maps parameter names to theano shared variables.
    grads : dict
        Maps parameter shared variables to gradient expressions.
    inp : list
        Symbolic inputs required to compute ``cost`` and ``errors``.
    cost, errors : theano expressions
        Reported by ``f_grad_shared``.

    Returns
    -------
    tuple
        ``(f_grad_shared, f_update)`` theano functions.
    """
    gshared = OrderedDict((p, sharedX(p.get_value() * 0.,
                                      name='%s_grad' % p.name))
                          for p, g in grads.iteritems())
    gsup = [(gshared[p], g) for p, g in grads.iteritems()]

    gnorm = get_norms(grads.values())
    pnorm = get_norms(tparams.values())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup,
                                    profile=profile)

    lr0 = lr
    b1 = 0.1    # 1 - beta1
    b2 = 0.001  # 1 - beta2
    e = 1e-8

    updates = []
    i = sharedX(numpy.float32(0.))  # timestep counter t
    i_t = i + 1.

    # Bias-correction factors for the moment estimates.
    fix1 = 1.0 - (1 - b1)**(i_t)
    fix2 = 1.0 - (1 - b2)**(i_t)
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    up_list = []
    for p in tparams.values():
        g = gshared[p]
        m = sharedX(p.get_value() * 0.)  # first moment
        v = sharedX(p.get_value() * 0.)  # second moment
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        up_list.append(lr_t * g_t)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))

    # BUG FIX: register the timestep update exactly once, after the
    # parameter loop. Appending (i, i_t) per parameter creates duplicate
    # update targets for the same shared variable, which
    # theano.function rejects.
    updates.append((i, i_t))

    upnorm = get_norms(up_list)
    f_update = theano.function([lr], [upnorm],
                               updates=updates,
                               on_unused_input='ignore',
                               profile=profile)
    return f_grad_shared, f_update
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Provides the updates for learning with gradient descent + momentum.

    Parameters
    ----------
    learning_rate : theano scalar
        Learning rate coefficient (symbolic input of ``f_update``).
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Symbolic inputs required to compute ``cost`` and ``errors``.
    cost : theano expression
        Cost reported by ``f_grad_shared``.
    errors : theano expression
        Error measure reported by ``f_grad_shared``.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier.

    Returns
    -------
    tuple
        ``(f_grad_shared, f_update)`` theano functions.
    """
    if lr_scalers is None:
        # BUG FIX: lr_scalers.get(...) was called unconditionally below,
        # crashing with the default lr_scalers=None.
        lr_scalers = {}

    # BUG FIX: build the OrderedDict from an ordered pair sequence.
    # OrderedDict(dict-comprehension) has arbitrary key order, so
    # zip(gshared.values(), grads.values()) could pair a shared gradient
    # buffer with the wrong gradient expression.
    gshared = OrderedDict((p, sharedX(p.get_value() * 0.,
                                      name='%s_grad' % p.name))
                          for p, g in grads.iteritems())
    gsup = [(gs, g) for gs, g in zip(gshared.values(), grads.values())]

    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    updates = OrderedDict()
    # BUG FIX: iterate (param, grad) pairs; the original unpacked
    # gshared.keys(), which yields single parameters and raises.
    for param, grad in gshared.iteritems():
        vel = sharedX(param.get_value() * 0.)
        assert param.dtype == vel.dtype
        assert grad.dtype == param.dtype
        if param.name is not None:
            vel.name = 'vel_' + param.name

        scaled_lr = learning_rate * lr_scalers.get(param, 1.)
        updates[vel] = self.momentum * vel - scaled_lr * grad
        inc = updates[vel]
        if self.nesterov_momentum:
            # Nesterov look-ahead: apply momentum to the velocity update
            # itself before stepping.
            inc = self.momentum * inc - scaled_lr * grad

        assert inc.dtype == vel.dtype
        updates[param] = param + inc

    f_update = theano.function([learning_rate],
                               [],
                               updates=updates,
                               on_unused_input='ignore')
    return f_grad_shared, f_update
def adam(lr, tparams, grads, inp, cost, errors):
    """
    Adam optimizer (Kingma & Ba, 2014).

    Parameters
    ----------
    lr : theano scalar
        Learning rate input of the returned ``f_update`` function.
    tparams : OrderedDict
        Maps parameter names to theano shared variables.
    grads : dict
        Maps parameter shared variables to gradient expressions.
    inp : list
        Symbolic inputs required to compute ``cost`` and ``errors``.
    cost, errors : theano expressions
        Reported by ``f_grad_shared``.

    Returns
    -------
    tuple
        ``(f_grad_shared, f_update)`` theano functions.
    """
    gshared = OrderedDict((p, sharedX(p.get_value() * 0.,
                                      name='%s_grad' % p.name))
                          for p, g in grads.iteritems())
    gsup = [(gshared[p], g) for p, g in grads.iteritems()]

    gnorm = get_norms(grads.values())
    pnorm = get_norms(tparams.values())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup,
                                    profile=profile)

    lr0 = lr
    b1 = 0.1    # 1 - beta1
    b2 = 0.001  # 1 - beta2
    e = 1e-8

    updates = []
    i = sharedX(numpy.float32(0.))  # timestep counter t
    i_t = i + 1.

    # Bias-correction factors for the moment estimates.
    fix1 = 1.0 - (1 - b1)**(i_t)
    fix2 = 1.0 - (1 - b2)**(i_t)
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    up_list = []
    for p in tparams.values():
        g = gshared[p]
        m = sharedX(p.get_value() * 0.)  # first moment
        v = sharedX(p.get_value() * 0.)  # second moment
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        up_list.append(lr_t * g_t)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))

    # BUG FIX: register the timestep update exactly once, after the
    # parameter loop. Appending (i, i_t) per parameter creates duplicate
    # update targets for the same shared variable, which
    # theano.function rejects.
    updates.append((i, i_t))

    upnorm = get_norms(up_list)
    f_update = theano.function([lr], [upnorm],
                               updates=updates,
                               on_unused_input='ignore',
                               profile=profile)
    return f_grad_shared, f_update
def __init__(self, decay=0.95, gamma_clip=0.0, grad_clip=None,
             start_var_reduction=0, delta_clip=None, use_adagrad=False,
             skip_nan_inf=False, use_corrected_grad=True):
    """Store the optimizer hyper-parameters on the instance."""
    assert 0. <= decay < 1.

    # Decay lives in a shared variable so it can be tuned at run time.
    self.decay = sharedX(decay, "decay")
    self.use_adagrad = use_adagrad
    self.use_corrected_grad = use_corrected_grad
    self.skip_nan_inf = skip_nan_inf

    # Clipping / variance-reduction knobs.
    self.start_var_reduction = start_var_reduction
    self.delta_clip = delta_clip
    self.gamma_clip = gamma_clip
    self.grad_clip = grad_clip

    # Small constant guarding against zero denominators.
    self.damping = 1e-7

    # Bound tau so it cannot grow arbitrarily large; otherwise, once tau
    # becomes very large it keeps increasing indefinitely, which causes
    # numerical instabilities for very deep networks.
    self.upper_bound_tau = 1e7
    self.lower_bound_tau = 1.5
def construct_updates(self, grads):
    """
    Accumulate gradients across ``self.nbatches`` consecutive minibatches.

    For each parameter's gradient, a running (rate-scaled) sum is kept in
    the matching shared variable from ``self.gs``; every ``self.nbatches``
    steps the accumulated gradient is emitted. Returns an OrderedDict
    mapping each parameter to its (possibly accumulated) gradient.

    Parameters
    ----------
    grads : dict
        Maps parameters to their gradient expressions.
    """
    if not self.updates:
        self.updates = OrderedDict({})

    ngrads = OrderedDict({})
    # Minibatch step counter; incremented on every call of the compiled
    # update.
    mb_step = sharedX(0, name="mb_step")
    self.updates[mb_step] = mb_step + 1

    # cond == 1 exactly when a full accumulation window has elapsed.
    cond = TT.eq((mb_step) % self.nbatches, 0)
    rate = 1.0 / self.nbatches

    for op, og in grads.iteritems():
        # Locate the accumulator in self.gs whose name matches this
        # parameter (matched by substring of the accumulator's name).
        for i, g in enumerate(self.gs):
            if op.name in g.name:
                break
        else:
            raise ValueError("Gradient for %s was not found." % op.name)

        if rate < 1.0:
            # Rate-scaled running sum of the incoming gradients.
            new_grad = (og + self.gs[i]) * as_floatX(rate)
            # At a window boundary keep the accumulated value, otherwise
            # restart the accumulator from the fresh (scaled) gradient.
            self.updates[self.gs[i]] = cond * new_grad + (1 - cond) * og * \
                as_floatX(rate)
            ngrads[op] = new_grad
        else:
            # nbatches == 1: no accumulation, pass gradients through.
            ngrads[op] = og

    return ngrads
def adadelta(lr, tparams, grads, inp, cost, errors):
    """
    AdaDelta optimizer (Zeiler, 2012).

    Returns ``(f_grad_shared, f_update)``: the first computes
    cost/errors/norms and stores gradients in shared buffers, the second
    applies the AdaDelta step.
    """
    gnorm = get_norms(grads)
    pnorm = get_norms(tparams.values())

    def _param_buffers(suffix):
        # One zero-initialized shared buffer per parameter.
        return [sharedX(p.get_value() * numpy.float32(0.),
                        name='%s_%s' % (k, suffix))
                for k, p in tparams.iteritems()]

    zipped_grads = _param_buffers('grad')      # latest gradients
    running_up2 = _param_buffers('rup2')       # E[(delta x)^2]
    running_grads2 = _param_buffers('rgrad2')  # E[g^2]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                    updates=zgup + rg2up)

    # AdaDelta step: RMS of past updates over RMS of past gradients.
    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    upnorm = get_norms(updir)
    f_update = theano.function([lr], [upnorm],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               profile=profile)
    return f_grad_shared, f_update
def adadelta(lr, tparams, grads, inp, cost, errors):
    """
    AdaDelta optimizer (Zeiler, 2012).

    Returns ``(f_grad_shared, f_update)``: the first computes
    cost/errors/norms and stores gradients in shared buffers, the second
    applies the AdaDelta step.
    """
    gnorm = get_norms(grads)
    pnorm = get_norms(tparams.values())

    def _buffers(suffix):
        # One zero-initialized shared buffer per parameter.
        return [sharedX(p.get_value() * numpy.float32(0.),
                        name='%s_%s' % (k, suffix))
                for k, p in tparams.iteritems()]

    zipped_grads = _buffers('grad')      # latest gradients
    running_up2 = _buffers('rup2')       # E[(delta x)^2]
    running_grads2 = _buffers('rgrad2')  # E[g^2]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                    updates=zgup + rg2up)

    # AdaDelta step: RMS of past updates over RMS of past gradients.
    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    upnorm = get_norms(updir)
    f_update = theano.function([lr], [upnorm],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               profile=profile)
    return f_grad_shared, f_update
def rmsprop(lr, tparams, grads, inp, cost, errors):
    """
    RMSProp with momentum (Graves, 2013 variant).

    Keeps running averages of the gradient and squared gradient, divides
    each gradient by the centred RMS, and applies a momentum step.
    Returns ``(f_grad_shared, f_update)``.
    """
    zipped_grads = [sharedX(p.get_value() * numpy.float32(0.),
                            name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [sharedX(p.get_value() * numpy.float32(0.),
                             name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [sharedX(p.get_value() * numpy.float32(0.),
                              name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g)
            for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)
    f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                    updates=zgup + rgup + rg2up,
                                    profile=profile)

    updir = [sharedX(p.get_value() * numpy.float32(0.), name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    # BUG FIX: the original read
    #   tensor.maximum(tensor.sqrt(rg2 - rg ** 2 + 1e-8)), 1e-8)
    # — a misplaced parenthesis that called tensor.maximum with a single
    # argument and made each updir_new element a 3-tuple, which
    # theano.function rejects as an update pair. The 1e-8 floor belongs
    # inside tensor.maximum.
    updir_new = [(ud, 0.9 * ud - lr * zg /
                  tensor.maximum(tensor.sqrt(rg2 - rg ** 2 + 1e-8), 1e-8))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(itemlist(tparams), updir_new)]

    # BUG FIX: take the norm of the update expressions, not of the
    # (shared, expression) pairs, which cannot be squared.
    upnorm = get_norms([udn[1] for udn in updir_new])
    f_update = theano.function([lr], [upnorm],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               profile=profile)
    return f_grad_shared, f_update
def rmsprop(lr, tparams, grads, inp, cost, errors):
    """
    RMSProp with momentum (Graves, 2013 variant).

    Keeps running averages of the gradient and squared gradient, divides
    each gradient by the centred RMS, and applies a momentum step.
    Returns ``(f_grad_shared, f_update)``.
    """
    zipped_grads = [sharedX(p.get_value() * numpy.float32(0.),
                            name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [sharedX(p.get_value() * numpy.float32(0.),
                             name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [sharedX(p.get_value() * numpy.float32(0.),
                              name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g)
            for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)
    f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                    updates=zgup + rgup + rg2up,
                                    profile=profile)

    updir = [sharedX(p.get_value() * numpy.float32(0.), name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    # BUG FIX: the original read
    #   tensor.maximum(tensor.sqrt(rg2 - rg ** 2 + 1e-8)), 1e-8)
    # — a misplaced parenthesis that called tensor.maximum with a single
    # argument and made each updir_new element a 3-tuple, which
    # theano.function rejects as an update pair. The 1e-8 floor belongs
    # inside tensor.maximum.
    updir_new = [(ud, 0.9 * ud - lr * zg /
                  tensor.maximum(tensor.sqrt(rg2 - rg ** 2 + 1e-8), 1e-8))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(itemlist(tparams), updir_new)]

    # BUG FIX: take the norm of the update expressions, not of the
    # (shared, expression) pairs, which cannot be squared.
    upnorm = get_norms([udn[1] for udn in updir_new])
    f_update = theano.function([lr], [upnorm],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               profile=profile)
    return f_grad_shared, f_update
def sgd(lr, tparams, grads, x, mask, y, cost, errors):
    """
    Plain stochastic gradient descent.

    ``f_grad_shared`` evaluates cost/errors/norms and caches gradients in
    shared buffers; ``f_update`` then takes one SGD step with the given
    learning rate.
    """
    # Zero-initialized shared buffer per parameter to hold the gradient.
    gshared = [sharedX(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = list(zip(gshared, grads))

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)
    f_grad_shared = theano.function([x, mask, y],
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup,
                                    profile=profile)

    # p <- p - lr * g for every parameter.
    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    upnorm = lr * gnorm
    f_update = theano.function([lr], [upnorm], updates=pup,
                               profile=profile)
    return f_grad_shared, f_update
def sgd(lr, tparams, grads, x, mask, y, cost, errors):
    """
    Plain stochastic gradient descent.

    ``f_grad_shared`` evaluates cost/errors/norms and caches gradients in
    shared buffers; ``f_update`` then takes one SGD step with the given
    learning rate.
    """
    # Zero-initialized shared buffer per parameter to hold the gradient.
    gshared = [sharedX(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = list(zip(gshared, grads))

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)
    f_grad_shared = theano.function([x, mask, y],
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup,
                                    profile=profile)

    # p <- p - lr * g for every parameter.
    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    upnorm = lr * gnorm
    f_update = theano.function([lr], [upnorm], updates=pup,
                               profile=profile)
    return f_grad_shared, f_update
def __init__(self, init_momentum=0.9, averaging_coeff=0.99,
             stabilizer=1e-4, update_param_norm_ratio=0.003,
             gradient_clipping=None):
    """Validate and store the optimizer hyper-parameters."""
    init_momentum = float(init_momentum)
    assert 0. <= init_momentum <= 1.
    averaging_coeff = float(averaging_coeff)
    assert 0. <= averaging_coeff <= 1.
    stabilizer = float(stabilizer)
    assert stabilizer >= 0.

    # Copy every constructor argument onto the instance in one shot.
    self.__dict__.update(locals())
    del self.self

    # Momentum lives in a shared variable so it can be adjusted later.
    self.momentum = sharedX(self.init_momentum)
    self.update_param_norm_ratio = update_param_norm_ratio

    self.gradient_clipping = gradient_clipping
    if gradient_clipping is not None:
        # Keep the clip threshold in theano's float dtype.
        self.gradient_clipping = np.cast[config.floatX](gradient_clipping)
def __init__(self, init_momentum, averaging_coeff=0.95, stabilizer=1e-2,
             use_first_order=False, bound_inc=False,
             momentum_clipping=None):
    """Validate and store the optimizer hyper-parameters."""
    init_momentum = float(init_momentum)
    assert 0. <= init_momentum <= 1.
    averaging_coeff = float(averaging_coeff)
    assert 0. <= averaging_coeff <= 1.
    stabilizer = float(stabilizer)
    assert stabilizer >= 0.

    # Copy every constructor argument onto the instance in one shot.
    self.__dict__.update(locals())
    del self.self

    # Momentum lives in a shared variable so it can be adjusted later.
    self.momentum = sharedX(self.init_momentum)

    self.momentum_clipping = momentum_clipping
    if momentum_clipping is not None:
        # Keep the clip threshold in theano's float dtype.
        self.momentum_clipping = np.cast[config.floatX](momentum_clipping)
def __init__(self, decay=0.95, gamma_clip=0.0, grad_clip=None,
             start_var_reduction=0, delta_clip=None, gamma_reg=1e-6,
             slow_decay=0.995, learning_rate=1.0, use_adagrad=False,
             perform_update=True, skip_nan_inf=False,
             use_corrected_grad=True):
    """Store the optimizer hyper-parameters on the instance."""
    assert 0. <= decay < 1.

    # Decay lives in a shared variable so it can be tuned at run time.
    self.decay = sharedX(decay, "decay")
    self.slow_decay = slow_decay
    self.learning_rate = learning_rate
    self.perform_update = perform_update

    # Clipping / variance-reduction knobs.
    self.start_var_reduction = start_var_reduction
    self.delta_clip = delta_clip
    self.gamma_clip = gamma_clip
    self.grad_clip = grad_clip
    self.gamma_reg = gamma_reg

    self.use_corrected_grad = use_corrected_grad
    self.use_adagrad = use_adagrad
    self.skip_nan_inf = skip_nan_inf

    # Small constant guarding against zero denominators.
    self.damping = 1e-7

    # Bound tau so it cannot grow arbitrarily large; otherwise, once tau
    # becomes very large it keeps increasing indefinitely, which causes
    # numerical instabilities for very deep networks.
    self.upper_bound_tau = 1e7
    self.lower_bound_tau = 1.5
def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Provides the updates for learning with gradient descent + momentum.

    Parameters
    ----------
    learning_rate : float
        Learning rate coefficient.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier.

    Returns
    -------
    OrderedDict
        Maps each velocity/parameter shared variable to its update
        expression.
    """
    if lr_scalers is None:
        # BUG FIX: lr_scalers.get(...) was called unconditionally below,
        # crashing with the default lr_scalers=None.
        lr_scalers = {}

    updates = OrderedDict()

    for (param, grad) in six.iteritems(grads):
        vel = sharedX(param.get_value() * 0.)
        assert param.dtype == vel.dtype
        assert grad.dtype == param.dtype
        if param.name is not None:
            vel.name = 'vel_' + param.name

        scaled_lr = learning_rate * lr_scalers.get(param, 1.)
        updates[vel] = self.momentum * vel - scaled_lr * grad
        inc = updates[vel]
        if self.nesterov_momentum:
            # Nesterov look-ahead: apply momentum to the velocity update
            # itself before stepping.
            inc = self.momentum * inc - scaled_lr * grad

        assert inc.dtype == vel.dtype
        updates[param] = param + inc

    return updates
def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    RMSProp-with-momentum updates.

    Keeps a per-parameter running average of the squared gradient
    (optionally also of the gradient itself when ``self.use_first_order``
    is set, giving a centred RMS), normalizes each gradient by its RMS,
    and applies a momentum (velocity) step. Optionally rescales the
    joint velocity so its global norm does not exceed
    ``self.momentum_clipping``.

    Parameters
    ----------
    learning_rate : float or theano scalar
        Learning rate coefficient.
    grads : dict
        Maps each model parameter to its gradient expression.
    lr_scalers : dict, optional
        Unused by this implementation.

    Returns
    -------
    tuple
        ``(updates, tot_norm_up, tot_param_norm)``.
    """
    updates = OrderedDict()
    velocity = OrderedDict()

    tot_norm_up = 0
    tot_param_norm = 0

    for param in grads.keys():
        avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
        velocity[param] = sharedX(np.zeros_like(param.get_value()))

        if param.name is not None:
            avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

        # E[g^2] running average.
        new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr +\
            (1 - self.averaging_coeff) * T.sqr(grads[param])

        if self.use_first_order:
            # Also track E[g] so the centred second moment
            # E[g^2] - E[g]^2 is used as the normalizer.
            avg_grad = sharedX(np.zeros_like(param.get_value()))
            if param.name is not None:
                avg_grad.name = 'avg_grad_' + param.name
            new_avg_grad = self.averaging_coeff * avg_grad +\
                (1 - self.averaging_coeff) * grads[param]
            rms_grad_t = T.sqrt(new_avg_grad_sqr - new_avg_grad**2)
            updates[avg_grad] = new_avg_grad
        else:
            rms_grad_t = T.sqrt(new_avg_grad_sqr)

        # Floor the RMS to avoid division blow-ups early in training.
        rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
        normalized_grad = grads[param] / (rms_grad_t)
        new_velocity = self.momentum * velocity[param] -\
            learning_rate * normalized_grad

        tot_norm_up += new_velocity.norm(2)
        tot_param_norm += param.norm(2)

        updates[avg_grad_sqr] = new_avg_grad_sqr
        updates[velocity[param]] = new_velocity
        updates[param] = param + new_velocity

    if self.momentum_clipping is not None:
        # Global-norm clipping of the concatenated velocity vector.
        tot_norm_up = 0

        new_mom_norm = sum(
            map(lambda X: T.sqr(X).sum(),
                [updates[velocity[param]] for param in grads.keys()]))
        new_mom_norm = T.sqrt(new_mom_norm)
        scaling_den = T.maximum(self.momentum_clipping, new_mom_norm)
        scaling_num = self.momentum_clipping

        for param in grads.keys():
            if self.bound_inc:
                # Shrink the stored velocity itself, then re-derive the
                # parameter update from it.
                # NOTE(review): tot_norm_up is never re-accumulated in
                # this branch, so 0 is returned as the update norm —
                # confirm whether that is intended.
                updates[velocity[param]] *= (scaling_num / scaling_den)
                updates[param] = param + updates[velocity[param]]
            else:
                # Clip only the applied step; the stored velocity keeps
                # its unclipped value.
                update_step = updates[velocity[param]] * (scaling_num /
                                                          scaling_den)
                tot_norm_up += update_step.norm(2)
                updates[param] = param + update_step

    return updates, tot_norm_up, tot_param_norm
def train(dim_word_desc=400,  # word vector dimensionality
          dim_word_q=400,
          dim_word_ans=600,
          dim_proj=300,
          dim=400,  # the number of LSTM units
          encoder_desc='lstm',
          encoder_desc_word='lstm',
          encoder_desc_sent='lstm',
          use_dq_sims=False,
          eyem=None,
          learn_h0=False,
          use_desc_skip_c_g=False,
          debug=False,
          encoder_q='lstm',
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,
          alpha_c=0.,
          clip_c=-1.,
          lrate=0.01,
          n_words_q=49145,
          n_words_desc=115425,
          n_words_ans=409,
          pkl_train_files=None,
          pkl_valid_files=None,
          maxlen=2000,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=2,
          vocab=None,
          valid_batch_size=16,
          use_elu_g=False,
          saveto='model.npz',
          model_dir=None,
          ms_nlayers=3,
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          datasets=[None],  # NOTE(review): mutable default argument
          truncate=400,
          momentum=0.9,
          use_bidir=False,
          cost_mask=None,
          valid_datasets=['/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
                          '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'],
          dropout_rate=0.5,
          use_dropout=True,
          reload_=True,
          **opt_ds):
    """
    Train a reading-comprehension model end to end: build the model
    graph, compile the chosen optimizer, run the epoch/minibatch loop
    with periodic display, checkpointing and validation, then do a final
    evaluation with the best parameters.

    Returns ``(valid_err, valid_cost)`` from the last in-loop validation.
    NOTE(review): if validFreq is never hit, valid_err/valid_cost are
    unbound at the return — confirm expected usage.
    """
    # Paths for the current / best / last / stats checkpoints.
    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options: snapshot of all hyper-parameters for pickling.
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    # These two are not picklable / not real options.
    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            # NOTE(review): loaded options are bound to the (unused)
            # name `models_options`; `model_options` from above is what
            # init_params actually receives.
            with open(os.path.join(mpath_best, '%s.pkl' % mpath_best),
                      'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open(os.path.join(mpath, '%s.pkl' % mpath), 'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "Couldn't reload the models initializing from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'

    # Data can come either from h5 files or from pre-pickled batches.
    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab,
                                           eyem=eyem,
                                           **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
        opt_ret, \
        cost, errors, ent_errors, ent_derrors, probs = \
            build_model(tparams,
                        model_options,
                        prepare_data if not opt_ds['use_sent_reps'] \
                            else prepare_data_sents,
                        valid,
                        cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    # Symbolic inputs; the sentence-representation variant also needs
    # per-sentence lengths.
    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'],
                inps_d['slen'], inps_d['qlen'],\
                inps_d['ent_mask']
                ]
    else:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'], \
                inps_d['qlen'], \
                inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]

    if ent_derrors:
        outs += [ent_derrors]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.

        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv ** 2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping: rescale gradients whose squared global norm
    # exceeds clip_c**2.
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c**2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c,
                                     g)

    # Drop the trailing input (ent_mask) before compiling the optimizer
    # functions.
    inps.pop()

    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0,
                                  use_adagrad=True,
                                  grad_clip=0.25,
                                  gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # NOTE(review): this unconditionally discards the learning_rule
    # selected above and forces the eval(optimizer) fallback below —
    # looks like leftover debug code; confirm.
    learning_rule = None

    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        # Dispatch to the module-level optimizer function by name.
        f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps,
                                                  cost, errors)

    print 'Done'
    print 'Optimization'
    history_errs = []
    # reload history
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
        train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = \
                    prepare_data_sents(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask,
                                                           q, q_mask,
                                                           a, dlen, slen,
                                                           qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask,
                                                           q, q_mask,
                                                           a, dlen, qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave, cost)
            train_err_ave = running_ave(train_err_ave, errors)
            train_gnorm_ave = running_ave(train_gnorm_ave, gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb; ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                    ' Cost ', cost, ' UD ', ud, \
                    ' UpNorm ', upnorm[0].tolist(), \
                    ' GNorm ', gnorm, \
                    ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                # Save the best parameters if a best model was found,
                # otherwise snapshot the current ones.
                if best_p is not None and best_found:
                    numpy.savez(mpath_best, history_errs=history_errs,
                                **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)

                numpy.savez(mpath, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))

                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                        valid_alphas, error_ent, error_dent = \
                        eval_model(f_log_probs,
                                   prepare_data if not opt_ds['use_sent_reps'] \
                                       else prepare_data_sents,
                                   model_options,
                                   valid,
                                   use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate([va.argmax(0)
                                                   for va in valid_alphas.tolist()],
                                                  axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)
                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)
                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)
                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= \
                        numpy.array(history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    # NOTE(review): `bad_counter` here vs `bad_count`
                    # initialized above — inconsistent names; the early
                    # stopping counter is never actually used.
                    bad_counter = 0
                    best_found = True
                else:
                    # NOTE(review): `bst_found` looks like a typo for
                    # `best_found`; this assignment has no effect.
                    bst_found = False

                if numpy.isnan(valid_err):
                    import ipdb; ipdb.set_trace()

                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                        ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                        ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                        ' Valid alphas std: ', std_valid_alphas, \
                        ' Valid alpha negent: ', valid_alpha_ent, \
                        ' Valid error ent: ', error_ent, \
                        ' Valid error desc ent: ', error_dent
                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                        ' Train cost: ', train_cost_ave, \
                        ' Train grad norm: ', train_gnorm_ave
                print "============================"

                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    # Final evaluation with noise (dropout) disabled.
    use_noise.set_value(0.)
    valid.reset()
    # NOTE(review): this unpacks 5 values while the in-loop call to
    # eval_model returns 6, and the kwarg key 'use_sent_rep' differs
    # from 'use_sent_reps' used everywhere else — both look like bugs.
    valid_cost, valid_error, valid_probs, \
            valid_alphas, error_ent = eval_model(f_log_probs,
                                                 prepare_data if not opt_ds['use_sent_reps'] \
                                                     else prepare_data_sents,
                                                 model_options, valid,
                                                 use_sent_rep=opt_ds['use_sent_rep'])

    print " Final eval resuts: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
            ' Valid pred std: ', valid_probs.std(), \
            ' Valid error ent: ', error_ent

    params = copy.copy(best_p)
    numpy.savez(mpath_last,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err, valid_cost
def __call__(self, probs, samples, updates, cost=None, mask=None,
             deterministic=False, child_probs=None,
             dimshuffle_probs=False, child_samples=None):
    """
    Build REINFORCE-style known gradients for ``probs``.

    Maintains (in ``updates``) a moving-average reward baseline and a
    moving-average cost variance, centers and rescales the reward, and
    returns the hand-derived gradient for ``probs`` along with the
    augmented updates.

    Returns
    -------
    tuple
        ``(updates, known_grads, rbaseline, cost_std, policy,
        lambda2_reg)``.
    """
    # NOTE(review): `input` here is the Python builtin, so this check is
    # always False and dead; `__class__.__name__` would also raise a
    # NameError at module scope in Python 2 — both look like bugs.
    if input is None:
        raise ValueError("input for the %s should "
                         " not be empty." % __class__.__name__)

    key_baseline = get_key_byname_from_dict(updates, "baseline")
    step = 0

    if key_baseline:
        # A baseline is already being tracked in `updates`; reuse it.
        rbaseline = updates[key_baseline]
        key_step = get_key_byname_from_dict(updates, "step")
        if key_step:
            step = updates[key_step]
        else:
            step = sharedX(0., name="step")
    else:
        # First call: create the baseline state. Per-timestep baseline
        # when predictions are generative, scalar otherwise.
        if self.generative_pred:
            baseline = sharedX(np.zeros((self.maxlen,)) + 1.0 + self.eps,
                               name="baseline")
        else:
            baseline = sharedX(0. + 1.0 + self.eps, name="new_baseline")

        key_step = get_key_byname_from_dict(updates, "step")
        # Bias-correction factor for the exponential moving average.
        fix_decay = self.decay**(step + as_floatX(1))

        if key_step:
            step = updates[key_step]
        else:
            step = sharedX(0., name="step")
            updates[step] = step + as_floatX(1)

        if self.use_rms_baseline:
            # RMS baseline: moving average of the squared cost,
            # bias-corrected then square-rooted.
            if self.generative_pred:
                new_baseline = as_floatX(self.decay) * baseline + \
                    as_floatX(1 - self.decay) * cost.mean(-1)**2
            else:
                new_baseline = as_floatX(self.decay) * baseline + \
                    as_floatX(1 - self.decay) * cost.mean()**2
            updates[baseline] = new_baseline
            rbaseline = new_baseline / (1 - fix_decay)
            rbaseline = TT.sqrt(rbaseline)
        else:
            # Plain moving-average baseline of the cost.
            if self.generative_pred:
                new_baseline = as_floatX(self.decay) * baseline + \
                    as_floatX(1 - self.decay) * cost.mean(-1)
            else:
                new_baseline = as_floatX(self.decay) * baseline + \
                    as_floatX(1 - self.decay) * cost.mean()
            updates[baseline] = new_baseline
            rbaseline = new_baseline

    # Moving-average estimate of the cost variance, used to rescale the
    # centered reward.
    key_cvar = get_key_byname_from_dict(updates, "cost_var")
    if key_cvar:
        cost_var = updates[key_cvar]
        new_cost_var = cost_var
    else:
        if self.generative_pred:
            cost_var = sharedX(np.zeros((self.maxlen,)) + as_floatX(1.2),
                               name="cost_var")
            cost_var_ave = (cost.mean(-1) - new_baseline)**2
        else:
            cost_var = sharedX(as_floatX(1.2), name="cost_var")
            cost_var_ave = (cost.mean() - new_baseline)**2
        new_cost_var = as_floatX(self.decay) * cost_var + \
            as_floatX(1 - self.decay) * cost_var_ave
        updates[cost_var] = new_cost_var

    lambda2_reg = self.lambda2_reg
    """
    if not self.schedule_h_opts:
        start = self.schedule_h_opts["lambda2_reg_start"]
        nbatches = self.schedule_h_opts["end_nbatches"]
        end = self.lambda2_reg
        assert start > end
        lambda2_reg = TT.minimum(((start - end) * step / nbatches) + start,
                                 end)
    """

    if dimshuffle_probs:
        probsd = probs.dimshuffle(0, 2, 1)
    else:
        probsd = probs

    # Broadcast the (scalar or per-sample) reward / baseline / variance
    # against the probability tensor, depending on its rank and layout.
    if probs.ndim == 3 and cost.ndim == 1:
        if dimshuffle_probs:
            reward = cost.dimshuffle('x', 'x', 0)
            if self.generative_pred:
                rbaseline = rbaseline.dimshuffle('x', 'x', 0)
                cost_std = new_cost_var.dimshuffle('x', 'x', 0)
        else:
            reward = cost.dimshuffle('x', 0, 'x')
            if self.generative_pred:
                rbaseline = rbaseline.dimshuffle('x', 0, 'x')
                cost_std = new_cost_var.dimshuffle('x', 0, 'x')
    elif probs.ndim == 3 and cost.ndim == 2:
        if dimshuffle_probs:
            reward = cost.dimshuffle(0, 'x', 1)
            if self.generative_pred:
                rbaseline = rbaseline.dimshuffle(0, 'x', 'x')
                new_cost_var = new_cost_var.dimshuffle(0, 'x', 'x')
        else:
            reward = cost.dimshuffle('x', 0, 1)
            if self.generative_pred:
                rbaseline = rbaseline.dimshuffle('x', 0, 'x')
                new_cost_var = new_cost_var.dimshuffle('x', 0, 'x')
    # NOTE(review): `self.cost.ndim` below is probably meant to be
    # `cost.ndim` (as in the branches above) — confirm.
    elif probs.ndim == 4 and self.cost.ndim == 1:
        reward = cost.dimshuffle('x', 'x', 0, 'x')
    elif probs.ndim == 4:
        reward = cost.dimshuffle(0, 'x', 1, 'x')

    centered_cost = reward - rbaseline
    N = probsd.shape[-1]

    if self.use_cost_std:
        # Floor the std estimate so small variances don't blow up the
        # gradient.
        cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1.0)
    else:
        cost_std = 1

    # Likelihood-ratio term; optionally combined with a child
    # distribution's ratio.
    if child_probs is not None and child_samples is not None:
        cprobs1 = child_samples / (child_probs + 1e-8) + \
            samples / (probsd + 1e-8)
    else:
        cprobs1 = samples / (probsd + 1e-8)

    # REINFORCE gradient plus an entropy-style regularization term.
    gradp = self.lambda1_reg * (centered_cost / cost_std) * \
        (cprobs1) + (lambda2_reg) * (TT.log(probsd + 1e-8) + as_floatX(1))

    if dimshuffle_probs:
        gradp = gradp.dimshuffle(0, 2, 1)

    if mask is not None:
        if dimshuffle_probs:
            gradp = mask.dimshuffle(0, 1, 'x') * gradp
        else:
            gradp = mask.dimshuffle(0, 1, 'x') * gradp / N

    known_grads = {probs: gradp}
    # Mean log-likelihood of the taken samples (reported for monitoring).
    policy = -(TT.log(probsd + 1e-8) * samples).mean((1, 2)).sum()

    return updates, known_grads, rbaseline, cost_std, policy, lambda2_reg
def __call__(self, probs, samples, baseline, updates,
             cost = None, cost_mean=None, mask=None, seq_len=20,
             batch_size=140, deterministic=False, dimshuffle_probs=True):
    """
    Build the REINFORCE gradient expression for ``probs`` using an
    input-dependent ``baseline``, plus running estimates of the reward
    center and variance kept as shared variables inside ``updates``.

    Parameters
    ----------
    probs : TensorVariable
        Probabilities produced by the model; the returned ``known_grads``
        maps this variable to its REINFORCE surrogate gradient.
    samples : TensorVariable
        Samples drawn from ``probs`` (same layout as ``probs``).
    baseline : TensorVariable
        Input-based baseline subtracted from the reward.
    updates : OrderedDict-like
        Theano updates dict; running "step", "center" and "cost_var"
        shared variables are looked up in / added to it.
    cost, cost_mean, mask : TensorVariable, optional
        Per-sample cost, its mean, and an optional sequence mask.
    seq_len, batch_size, deterministic : unused here — kept for the
        caller's interface.
    dimshuffle_probs : bool
        If True, ``probs`` is transposed to (0, 2, 1) before use.

    Returns
    -------
    (updates, known_grads, new_center, cost_std, policy, lambda2_reg)
    """
    print("Using the input based baseline")
    # NOTE(review): `input` here is the Python builtin, which is never
    # None, so this check is dead; it presumably meant to validate one of
    # the real arguments. Also `__class__` is not defined at function
    # scope in Python 2, so the raise itself would NameError if reached.
    if input is None:
        raise ValueError("input for the %s should"
                         " not be empty." % __class__.__name__)

    if cost_mean is None:
        cost_mean = cost.mean()

    # `step` counts updates; reuse an existing shared "step" if the
    # caller's updates dict already carries one.
    step = 0
    key_step = get_key_byname_from_dict(updates, "step")
    if key_step:
        step = updates[key_step]
    else:
        step = sharedX(0., name="step")
        updates[step] = step + as_floatX(1)

    # Exponential moving average of the cost ("center"), shared across
    # calls via the updates dict.
    key_center = get_key_byname_from_dict(updates, "center")
    if key_center:
        center = updates[key_center]
        new_center = center
    else:
        if self.generative_pred:
            # Per-timestep center for the generative-prediction mode.
            center = sharedX(np.zeros((self.maxlen,)) + 0.15 + self.eps,
                             name="center")
            new_center = as_floatX(self.decay) * center + \
                as_floatX(1 - self.decay) * cost.mean(-1)
        else:
            center = sharedX(0.15 + self.eps, name="center")
            assert cost_mean is not None, "Cost mean should not be empty!"
            # NOTE(review): both branches below compute the identical
            # expression — the ndim test is currently redundant.
            if cost.ndim > 2 and cost.broadcastable[0] is False:
                new_center = as_floatX(self.decay) * center + as_floatX(1 - self.decay) * cost_mean
            else:
                new_center = as_floatX(self.decay) * center + as_floatX(1 - self.decay) * cost_mean
        updates[center] = new_center

    # Exponential moving average of the cost variance.
    key_cvar = get_key_byname_from_dict(updates, "cost_var")
    if key_cvar:
        cost_var = updates[key_cvar]
        new_cost_var = cost_var
    else:
        if self.generative_pred:
            cost_var_tot = (cost_mean - new_center)**2
            cost_var = sharedX(numpy.zeros((self.maxlen,)) + as_floatX(1.0),
                               name="cost_var")
        else:
            # NOTE(review): identical expressions in both branches here
            # as well — the ndim test has no effect.
            if cost.ndim > 2 and cost.broadcastable[0] is False:
                cost_var_tot = (cost_mean - new_center)**2
            else:
                cost_var_tot = (cost_mean - new_center)**2
            cost_var = sharedX(1.0, name="cost_var")
        new_cost_var = as_floatX(self.decay) * cost_var + as_floatX(1 - self.decay) * \
            cost_var_tot
        updates[cost_var] = new_cost_var

    lambda2_reg = self.lambda2_reg
    """
    if not self.schedule_h_opts:
        start = self.schedule_h_opts["lambda2_reg_start"]
        nbatches = self.schedule_h_opts["end_nbatches"]
        end = self.lambda2_reg
        assert start > end
        lambda2_reg = TT.minimum(((start - end) * step / nbatches) + start,
                                 end)
    """

    if dimshuffle_probs:
        probsd = probs.dimshuffle(0, 2, 1)
    else:
        probsd = probs

    # Broadcast the reward/baseline/statistics against the sample tensor.
    # Layout assumptions below are inferred from the dimshuffle patterns —
    # TODO confirm against the callers.
    if samples.ndim == 4:
        reward = cost.dimshuffle(0, 'x', 1, 'x')
        policy = -(TT.log(probsd + 1e-8) * samples).mean((2, 3)).sum()
    else:
        if cost.ndim == 2:
            if dimshuffle_probs:
                reward = cost.dimshuffle(0, 'x', 1)
                if self.generative_pred:
                    new_center = new_center.dimshuffle(0, 'x', 'x')
                    new_cost_var = new_cost_var.dimshuffle(0, 'x', 'x')
                baseline = baseline.dimshuffle(0, 2, 1)
            else:
                reward = cost.dimshuffle(0, 1, 'x')
            policy = -(TT.log(probsd + 1e-8) * samples).mean((1, 2)).sum()
        elif cost.ndim == 1:
            # NOTE(review): `policy` is never assigned on this path, so
            # the final `return` would raise NameError — verify whether
            # this branch is actually reachable.
            reward = cost.dimshuffle('x', 0, 'x')
            if dimshuffle_probs:
                baseline = baseline.dimshuffle(0, 2, 1)
            else:
                baseline = baseline.dimshuffle(1, 0, 2)

    # Standardize the centered reward; the max with 1.0 caps the scaling.
    cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1.0)
    centered_reward = (reward - baseline - new_center) / cost_std
    if cost.ndim == 2:
        centered_reward = TT.addbroadcast(centered_reward, 1)

    N = probs.shape[-1]
    # REINFORCE surrogate gradient: scaled score-function term plus an
    # entropy-style regularizer weighted by lambda2_reg.
    gradp = self.lambda1_reg * (centered_reward) * \
        (samples / (probsd + 1e-8)) + lambda2_reg * (TT.log(probsd + 1e-6) + as_floatX(1))

    if dimshuffle_probs:
        gradp = gradp.dimshuffle(0, 2, 1)

    if mask is not None:
        if self.generative_pred:
            gradp = mask.dimshuffle(0, 1, 'x') * gradp / N
        else:
            gradp = mask.dimshuffle(0, 1, 'x') * gradp

    known_grads = {probs: gradp}
    return updates, known_grads, new_center, cost_std, policy, lambda2_reg
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Compile the Theano training functions for an Adam-like rule:
    momentum on the gradient, an RMS normalizer, and bias-correction
    factors driven by a shared update counter.

    Parameters
    ----------
    learning_rate : theano scalar
        Symbolic learning rate; it becomes the single input of
        ``f_update``.
    grads : dict
        Maps each model parameter to its symbolic gradient.
    inp : list
        Inputs of the cost function; become the inputs of
        ``f_grad_shared``.
    cost, errors : TensorVariable
        Outputs reported by ``f_grad_shared``.
    lr_scalers : dict, optional
        Unused by this implementation.

    Returns
    -------
    (f_grad_shared, f_update) : tuple of theano functions
        ``f_grad_shared(*inp)`` returns ``[cost, errors, gnorm, pnorm]``
        and stores the gradients in shared variables; ``f_update(lr)``
        applies the parameter updates and returns ``[tot_norm_up]``.
    """
    # Optional global gradient-norm clipping before anything else.
    if self.gradient_clipping is not None:
        grads_norm = sum(
            map(lambda X: T.sqr(X).sum(),
                [grads[param] for param in grads.keys()]))
        grads_norm = T.sqrt(grads_norm)
        scaling_den = T.maximum(self.gradient_clipping, grads_norm)
        scaling_num = self.gradient_clipping
        for param in grads.keys():
            grads[param] = scaling_num * grads[param] / scaling_den

    updates = OrderedDict()
    velocity = OrderedDict()
    normalized_velocities = OrderedDict()

    counter = sharedX(0, 'counter')
    tot_norm_up = 0
    # One shared buffer per parameter to hold the current gradient, so
    # gradient computation and the update step can be separate functions.
    gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                           name='%s_grad' % p.name)
                           for p, g in grads.iteritems()})

    gsup = [(gshared[p], g) for p, g in grads.iteritems()]
    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    for param in gshared.keys():
        avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
        velocity[param] = sharedX(np.zeros_like(param.get_value()))

        next_counter = counter + 1.

        # Adam-style bias corrections for the first and second moments.
        fix_first_moment = 1. - self.momentum**next_counter
        fix_second_moment = 1. - self.averaging_coeff**next_counter

        if param.name is not None:
            avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

        new_avg_grad_sqr = self.averaging_coeff*avg_grad_sqr \
            + (1 - self.averaging_coeff)*T.sqr(gshared[param])

        rms_grad_t = T.sqrt(new_avg_grad_sqr)
        rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
        new_velocity = self.momentum * velocity[param] \
            - (1 - self.momentum) * gshared[param]
        normalized_velocity = (new_velocity * T.sqrt(fix_second_moment)) \
            / (rms_grad_t * fix_first_moment)

        # NOTE(review): `learning_rate` enters the reported update norm
        # below but NOT the parameter update itself — the step applied to
        # `param` is the raw normalized velocity. Looks like a dropped
        # `learning_rate *` in `updates[param]`; confirm against the
        # sibling optimizers before relying on the lr input.
        tot_norm_up += learning_rate * normalized_velocity.norm(2)
        normalized_velocities[param] = normalized_velocity

        updates[avg_grad_sqr] = new_avg_grad_sqr
        updates[velocity[param]] = new_velocity
        updates[param] = param + normalized_velocities[param]

    updates[counter] = counter + 1
    f_update = theano.function([learning_rate], [tot_norm_up],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Compile the Theano training functions for AdaDelta.

    Parameters
    ----------
    learning_rate : theano scalar
        Learning rate coefficient. In AdaDelta this plays the role of the
        conditioning constant (epsilon) inside the RMS terms.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Inputs of the cost; become the inputs of ``f_grad_shared``.
    cost, errors : TensorVariable
        Outputs reported by ``f_grad_shared``.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier.

    Returns
    -------
    (f_grad_shared, f_update) : tuple of theano functions
        ``f_grad_shared(*inp)`` returns ``[cost, errors, gnorm, pnorm]``
        and caches the gradients in shared buffers; ``f_update(lr)``
        applies the AdaDelta step and returns ``[tot_norm_up]``.
    """
    if lr_scalers is None:
        lr_scalers = {}
    updates = OrderedDict()
    tot_norm_up = 0

    # Shared gradient buffers so the forward/backward pass and the
    # parameter update can run as two separate compiled functions.
    gshared = OrderedDict({
        p: sharedX(p.get_value() * 0., name='%s_grad' % p.name)
        for p, g in grads.iteritems()
    })
    gsup = [(gshared[p], g) for p, g in grads.iteritems()]
    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    for param in gshared.keys():
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(param.get_value() * 0.)
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0.)
        if param.name is not None:
            mean_square_grad.name = 'mean_square_grad_' + param.name
            mean_square_dx.name = 'mean_square_dx_' + param.name

        # Accumulate gradient
        new_mean_squared_grad = (self.decay * mean_square_grad +
                                 (1 - self.decay) * T.sqr(gshared[param]))

        # Compute update. Fixed: honor per-parameter lr_scalers, matching
        # the get_updates() implementation of this rule (previously the
        # scaler was silently ignored).
        epsilon = lr_scalers.get(param, 1.) * learning_rate
        rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
        rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
        delta_x_t = -rms_dx_tm1 / rms_grad_t * gshared[param]

        # Accumulate updates
        new_mean_square_dx = (self.decay * mean_square_dx +
                              (1 - self.decay) * T.sqr(delta_x_t))

        # Apply update
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[param] = param + delta_x_t
        tot_norm_up += delta_x_t.norm(2)

    f_update = theano.function([learning_rate], [tot_norm_up],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Build the Adasecant update expressions (adaptive per-parameter step
    sizes with variance reduction, outlier detection on the running
    time-scales tau, and optional AdaGrad scaling).

    Parameters
    ----------
    learning_rate : float
        Learning rate coefficient. Learning rate is not being used but,
        pylearn2 requires a learning rate to be defined.
    grads : dict
        A dictionary mapping from the model's parameters
        to their gradients.
    lr_scalers : dict
        A dictionary mapping from the model's parameters
        to a learning rate multiplier.

    Returns
    -------
    (updates, tot_norm_up, tot_param_norm)
        Theano updates dict plus the summed L2 norms of the update steps
        and of the parameters (for monitoring).
    """
    updates = OrderedDict({})
    eps = self.damping
    step = sharedX(0., name="step")

    if self.skip_nan_inf:
        #If norm of the gradients of a parameter is inf or nan don't update that parameter
        #That might be useful for RNNs.
        grads = OrderedDict({p: T.switch(T.or_(T.isinf(grads[p]),
                                               T.isnan(grads[p])), 0, grads[p])
                             for p in grads.keys()})

    #Block-normalize gradients:
    nparams = len(grads.keys())

    #Apply the gradient clipping, this is only sometimes
    #necessary for RNNs and sometimes for very deep networks
    if self.grad_clip:
        assert self.grad_clip > 0.
        assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
        gnorm = sum([g.norm(2) for g in grads.values()])
        notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))
        for p, g in grads.iteritems():
            tmpg = T.switch(gnorm / nparams > self.grad_clip,
                            g * self.grad_clip * nparams / gnorm, g)
            # On non-finite total norm, shrink the parameter instead of
            # applying the (unusable) gradient.
            grads[p] = T.switch(notfinite, as_floatX(0.1) * p, tmpg)

    tot_norm_up = 0
    tot_param_norm = 0

    fix_decay = self.slow_decay**(step + 1)

    for param in grads.keys():
        grads[param].name = "grad_%s" % param.name
        mean_grad = sharedX(param.get_value() * 0. + eps,
                            name="mean_grad_%s" % param.name)
        # NOTE(review): mean_corrected_grad is allocated but never read or
        # updated below — dead state, presumably a leftover.
        mean_corrected_grad = sharedX(param.get_value() * 0 + eps,
                                      name="mean_corrected_grad_%s" % param.name)
        gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)

        prod_taus = sharedX((np.ones_like(param.get_value()) - 2 * eps),
                            name="prod_taus_x_t_" + param.name)
        slow_constant = 2.1

        if self.use_adagrad:
            # sum_square_grad := \sum_i g_i^2
            sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                      name="sum_square_grad_%s" % param.name)
        """
           Initialization of accumulators
        """
        taus_x_t = sharedX((np.ones_like(param.get_value()) + eps) * slow_constant,
                           name="taus_x_t_" + param.name)
        self.taus_x_t = taus_x_t

        #Variance reduction parameters
        #Numerator of the gamma:
        gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_nume_sqr_" + param.name)
        #Denominator of the gamma:
        gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_deno_sqr_" + param.name)
        #For the covariance parameter := E[\gamma \alpha]_{t-1}
        cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                            name="cov_num_t_" + param.name)
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                   name="msg_" + param.name)
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)

        if self.use_corrected_grad:
            old_grad = sharedX(param.get_value() * 0. + eps)

        #The uncorrected gradient of previous of the previous update:
        old_plain_grad = sharedX(param.get_value() * 0. + eps)
        mean_curvature = sharedX(param.get_value() * 0. + eps)
        mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

        # Initialize the E[\Delta]_{t-1}
        mean_dx = sharedX(param.get_value() * 0.)

        # Block-wise normalize the gradient:
        norm_grad = grads[param]

        #For the first time-step, assume that delta_x_t := norm_grad
        gnorm = T.sqr(norm_grad).sum()

        cond = T.eq(step, 0)
        gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
        gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

        norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
        msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
        mdx = cond * norm_grad + (1 - cond) * mean_dx

        new_prod_taus = (prod_taus * (1 - 1 / taus_x_t))

        """
            Compute the new updated values.
        """
        # E[g_i^2]_t
        new_mean_squared_grad = (mean_square_grad * (1 - 1 / taus_x_t) +
                                 T.sqr(norm_grad) / (taus_x_t))
        new_mean_squared_grad.name = "msg_" + param.name
        # E[g_i]_t
        new_mean_grad = (mean_grad * (1 - 1 / taus_x_t) +
                         norm_grad / taus_x_t)
        new_mean_grad.name = "nmg_" + param.name

        # Bias-corrected first/second gradient moments.
        mg = new_mean_grad / (1 - new_prod_taus)
        mgsq = new_mean_squared_grad / (1 - new_prod_taus)

        new_gnorm_sqr = (gnorm_sqr_o * self.slow_decay +
                         T.sqr(norm_grad).sum() * (1 - self.slow_decay))

        # Keep the rms for numerator and denominator of gamma.
        new_gamma_nume_sqr = (gamma_nume_sqr * (1 - 1 / taus_x_t) +
                              T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

        new_gamma_deno_sqr = (gamma_deno_sqr * (1 - 1 / taus_x_t) +
                              T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

        # NOTE(review): gamma is computed from the PREVIOUS accumulators
        # (gamma_nume_sqr / gamma_deno_sqr), not the new_* values built
        # just above — confirm whether that one-step lag is intentional.
        gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) +
                                          self.gamma_reg)
        gamma.name = "gamma_" + param.name

        if self.gamma_clip and self.gamma_clip > -1:
            gamma = T.minimum(gamma, self.gamma_clip)

        momentum_step = gamma * mg
        corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

        #For starting the variance reduction.
        if self.start_var_reduction > -1:
            cond = T.le(self.start_var_reduction, step)
            corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
        else:
            corrected_grad = norm_grad

        if self.use_adagrad:
            g = corrected_grad
            # Accumulate gradient
            new_sum_squared_grad = (sum_square_grad + T.sqr(g))
            rms_g_t = T.sqrt(new_sum_squared_grad)
            rms_g_t = T.maximum(rms_g_t, 1.0)

        #Use the gradients from the previous update
        #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
        cur_curvature = norm_grad - old_plain_grad
        #cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
        cur_curvature_sqr = T.sqr(cur_curvature)

        new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) +
                             (cur_curvature / taus_x_t))
        new_curvature_ave.name = "ncurve_ave_" + param.name

        #Average average curvature
        nc_ave = new_curvature_ave / (1 - new_prod_taus)

        new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) +
                                 (cur_curvature_sqr / taus_x_t))
        new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

        #Unbiased average squared curvature
        nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

        epsilon = 1e-7
        #lr_scalers.get(param, 1.) * learning_rate
        scaled_lr = sharedX(1.0)
        rms_dx_tm1 = T.sqrt(msdx + epsilon)
        rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

        #This is where the update step is being defined
        delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t -
                                  cov_num_t / (new_curvature_sqr_ave + epsilon))
        delta_x_t.name = "delta_x_t_" + param.name

        # This part seems to be necessary for only RNNs
        # For feedforward networks this does not seem to be important.
        if self.delta_clip:
            logger.info("Clipping will be applied on the adaptive step size.")
            delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad is disabled.")
                delta_x_t = delta_x_t * corrected_grad
        else:
            logger.info("Clipping will not be applied on the adaptive step size.")
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad will not be used.")
                delta_x_t = delta_x_t * corrected_grad

        new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(1 + eps, "stabilized")

        #To compute the E[\Delta^2]_t
        new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) +
                              (T.sqr(delta_x_t) / taus_x_t))

        #To compute the E[\Delta]_t
        new_mean_dx = (mdx * (1 - 1 / taus_x_t) +
                       (delta_x_t / (taus_x_t)))

        #Perform the outlier detection:
        #This outlier detection is slightly different:
        new_taus_t = T.switch(T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                                    abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                              T.switch(new_taus_t > 2.5,
                                       sharedX(2.5),
                                       new_taus_t + sharedX(1.0) + eps),
                              new_taus_t)

        #Apply the bound constraints on tau:
        new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
        new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

        new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) +
                         (delta_x_t * cur_curvature) * (1 / taus_x_t))

        update_step = delta_x_t

        tot_norm_up += update_step.norm(2)
        tot_param_norm += param.norm(2)

        # Apply updates
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[mean_dx] = new_mean_dx
        updates[gnorm_sqr] = new_gnorm_sqr
        updates[gamma_nume_sqr] = new_gamma_nume_sqr
        updates[gamma_deno_sqr] = new_gamma_deno_sqr
        updates[taus_x_t] = new_taus_t
        updates[cov_num_t] = new_cov_num_t
        updates[mean_grad] = new_mean_grad
        updates[old_plain_grad] = norm_grad
        updates[mean_curvature] = new_curvature_ave
        updates[mean_curvature_sqr] = new_curvature_sqr_ave

        if self.perform_update:
            updates[param] = param + update_step

        updates[step] = step + 1
        updates[prod_taus] = new_prod_taus

        if self.use_adagrad:
            updates[sum_square_grad] = new_sum_squared_grad

        if self.use_corrected_grad:
            updates[old_grad] = corrected_grad

    return updates, tot_norm_up, tot_param_norm
def train( dim_word_desc=400, # word vector dimensionality dim_word_q=400, dim_word_ans=600, dim_proj=300, dim=400, # the number of LSTM units encoder_desc='lstm', encoder_desc_word='lstm', encoder_desc_sent='lstm', use_dq_sims=False, eyem=None, learn_h0=False, use_desc_skip_c_g=False, debug=False, encoder_q='lstm', patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., clip_c=-1., lrate=0.01, n_words_q=49145, n_words_desc=115425, n_words_ans=409, pkl_train_files=None, pkl_valid_files=None, maxlen=2000, # maximum length of the description optimizer='rmsprop', batch_size=2, vocab=None, valid_batch_size=16, use_elu_g=False, saveto='model.npz', model_dir=None, ms_nlayers=3, validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates datasets=[None], truncate=400, momentum=0.9, use_bidir=False, cost_mask=None, valid_datasets=[ '/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5', '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5' ], dropout_rate=0.5, use_dropout=True, reload_=True, **opt_ds): ensure_dir_exists(model_dir) mpath = os.path.join(model_dir, saveto) mpath_best = os.path.join(model_dir, prfx("best", saveto)) mpath_last = os.path.join(model_dir, prfx("last", saveto)) mpath_stats = os.path.join(model_dir, prfx("stats", saveto)) # Model options model_options = locals().copy() model_options['use_sent_reps'] = opt_ds['use_sent_reps'] stats = defaultdict(list) del model_options['eyem'] del model_options['cost_mask'] if cost_mask is not None: cost_mask = sharedX(cost_mask) # reload options and parameters if reload_: print "Reloading the model." if os.path.exists(mpath_best): print "Reloading the best model from %s." % mpath_best with open(os.path.join(mpath_best, '%s.pkl' % mpath_best), 'rb') as f: models_options = pkl.load(f) params = init_params(model_options) params = load_params(mpath_best, params) elif os.path.exists(mpath): print "Reloading the model from %s." 
% mpath with open(os.path.join(mpath, '%s.pkl' % mpath), 'rb') as f: models_options = pkl.load(f) params = init_params(model_options) params = load_params(mpath, params) else: raise IOError("Couldn't open the file.") else: print "Couldn't reload the models initializing from scratch." params = init_params(model_options) if datasets[0]: print "Short dataset", datasets[0] print 'Loading data' print 'Building model' if pkl_train_files is None or pkl_valid_files is None: train, valid, test = load_data(path=datasets[0], valid_path=valid_datasets[0], test_path=valid_datasets[1], batch_size=batch_size, **opt_ds) else: train, valid, test = load_pkl_data(train_file_paths=pkl_train_files, valid_file_paths=pkl_valid_files, batch_size=batch_size, vocab=vocab, eyem=eyem, **opt_ds) tparams = init_tparams(params) trng, use_noise, inps_d, \ opt_ret, \ cost, errors, ent_errors, ent_derrors, probs = \ build_model(tparams, model_options, prepare_data if not opt_ds['use_sent_reps'] \ else prepare_data_sents, valid, cost_mask=cost_mask) alphas = opt_ret['dec_alphas'] if opt_ds['use_sent_reps']: inps = [inps_d["desc"], \ inps_d["word_mask"], \ inps_d["q"], \ inps_d['q_mask'], \ inps_d['ans'], \ inps_d['wlen'], inps_d['slen'], inps_d['qlen'],\ inps_d['ent_mask'] ] else: inps = [inps_d["desc"], \ inps_d["word_mask"], \ inps_d["q"], \ inps_d['q_mask'], \ inps_d['ans'], \ inps_d['wlen'], \ inps_d['qlen'], \ inps_d['ent_mask']] outs = [cost, errors, probs, alphas] if ent_errors: outs += [ent_errors] if ent_derrors: outs += [ent_derrors] # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, outs, profile=profile) print 'Done' # Apply weight decay on the feed-forward connections if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. 
for kk, vv in tparams.iteritems(): if "logit" in kk or "ff" in kk: weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Computing gradient...', grads = safe_grad(cost, itemlist(tparams)) print 'Done' # Gradient clipping: if clip_c > 0.: g2 = get_norms(grads) for p, g in grads.iteritems(): grads[p] = tensor.switch(g2 > (clip_c**2), (g / tensor.sqrt(g2 + 1e-8)) * clip_c, g) inps.pop() if optimizer.lower() == "adasecant": learning_rule = Adasecant(delta_clip=25.0, use_adagrad=True, grad_clip=0.25, gamma_clip=0.) elif optimizer.lower() == "rmsprop": learning_rule = RMSPropMomentum(init_momentum=momentum) elif optimizer.lower() == "adam": learning_rule = Adam() elif optimizer.lower() == "adadelta": learning_rule = AdaDelta() lr = tensor.scalar(name='lr') print 'Building optimizers...', learning_rule = None if learning_rule: f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr, grads=grads, inp=inps, cost=cost, errors=errors) else: f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, errors) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(mpath): history_errs = list(numpy.load(mpath)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size best_found = False uidx = 0 estop = False train_cost_ave, train_err_ave, \ train_gnorm_ave = reset_train_vals() for eidx in xrange(max_epochs): n_samples = 0 if train.done: train.reset() for d_, q_, a, em in train: n_samples += len(a) uidx += 1 use_noise.set_value(1.) if opt_ds['use_sent_reps']: # To mask the description and the question. 
d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents( d_, q_) if d is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() cost, errors, gnorm, pnorm = f_grad_shared( d, d_mask, q, q_mask, a, dlen, slen, qlen) else: d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_) if d is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() cost, errors, gnorm, pnorm = f_grad_shared( d, d_mask, q, q_mask, a, dlen, qlen) upnorm = f_update(lrate) ud = time.time() - ud_start # Collect the running ave train stats. train_cost_ave = running_ave(train_cost_ave, cost) train_err_ave = running_ave(train_err_ave, errors) train_gnorm_ave = running_ave(train_gnorm_ave, gnorm) if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' import ipdb ipdb.set_trace() if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, ' Update ', uidx, \ ' Cost ', cost, ' UD ', ud, \ ' UpNorm ', upnorm[0].tolist(), \ ' GNorm ', gnorm, \ ' Pnorm ', pnorm, 'Terrors ', errors if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None and best_found: numpy.savez(mpath_best, history_errs=history_errs, **best_p) pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb')) else: params = unzip(tparams) numpy.savez(mpath, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % mpath, 'wb')) pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb')) print 'Done' print_param_norms(tparams) if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) 
if valid.done: valid.reset() valid_costs, valid_errs, valid_probs, \ valid_alphas, error_ent, error_dent = eval_model(f_log_probs, prepare_data if not opt_ds['use_sent_reps'] \ else prepare_data_sents, model_options, valid, use_sent_rep=opt_ds['use_sent_reps']) valid_alphas_ = numpy.concatenate( [va.argmax(0) for va in valid_alphas.tolist()], axis=0) valid_err = valid_errs.mean() valid_cost = valid_costs.mean() valid_alpha_ent = -negentropy(valid_alphas) mean_valid_alphas = valid_alphas_.mean() std_valid_alphas = valid_alphas_.std() mean_valid_probs = valid_probs.argmax(1).mean() std_valid_probs = valid_probs.argmax(1).std() history_errs.append([valid_cost, valid_err]) stats['train_err_ave'].append(train_err_ave) stats['train_cost_ave'].append(train_cost_ave) stats['train_gnorm_ave'].append(train_gnorm_ave) stats['valid_errs'].append(valid_err) stats['valid_costs'].append(valid_cost) stats['valid_err_ent'].append(error_ent) stats['valid_err_desc_ent'].append(error_dent) stats['valid_alphas_mean'].append(mean_valid_alphas) stats['valid_alphas_std'].append(std_valid_alphas) stats['valid_alphas_ent'].append(valid_alpha_ent) stats['valid_probs_mean'].append(mean_valid_probs) stats['valid_probs_std'].append(std_valid_probs) if uidx == 0 or valid_err <= numpy.array( history_errs)[:, 1].min(): best_p = unzip(tparams) bad_counter = 0 best_found = True else: bst_found = False if numpy.isnan(valid_err): import ipdb ipdb.set_trace() print "============================" print '\t>>>Valid error: ', valid_err, \ ' Valid cost: ', valid_cost print '\t>>>Valid pred mean: ', mean_valid_probs, \ ' Valid pred std: ', std_valid_probs print '\t>>>Valid alphas mean: ', mean_valid_alphas, \ ' Valid alphas std: ', std_valid_alphas, \ ' Valid alpha negent: ', valid_alpha_ent, \ ' Valid error ent: ', error_ent, \ ' Valid error desc ent: ', error_dent print "============================" print "Running average train stats " print '\t>>>Train error: ', train_err_ave, \ ' Train cost: ', 
train_cost_ave, \ ' Train grad norm: ', train_gnorm_ave print "============================" train_cost_ave, train_err_ave, \ train_gnorm_ave = reset_train_vals() print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid.reset() valid_cost, valid_error, valid_probs, \ valid_alphas, error_ent = eval_model(f_log_probs, prepare_data if not opt_ds['use_sent_reps'] \ else prepare_data_sents, model_options, valid, use_sent_rep=opt_ds['use_sent_rep']) print " Final eval resuts: " print 'Valid error: ', valid_error.mean() print 'Valid cost: ', valid_cost.mean() print '\t>>>Valid pred mean: ', valid_probs.mean(), \ ' Valid pred std: ', valid_probs.std(), \ ' Valid error ent: ', error_ent params = copy.copy(best_p) numpy.savez(mpath_last, zipped_params=best_p, history_errs=history_errs, **params) return valid_err, valid_cost
def __init__(self,
             n_in,
             n_hids,
             n_out,
             mem_size,
             mem_nel,
             deep_out_size,
             bow_size=40,
             inps=None,
             dropout=None,
             predict_bow_out=False,
             seq_len=None,
             n_read_heads=1,
             n_layers=1,
             n_write_heads=1,
             train_profile=False,
             erase_activ=None,
             content_activ=None,
             l1_pen=None,
             l2_pen=None,
             use_reinforce=False,
             use_reinforce_baseline=False,
             n_reading_steps=2,
             use_gru_inp_rep=False,
             use_simple_rnn_inp_rep=False,
             use_nogru_mem2q=False,
             sub_mb_size=40,
             lambda1_rein=2e-4,
             lambda2_rein=2e-5,
             baseline_reg=1e-2,
             anticorrelation=None,
             use_layer_norm=False,
             recurrent_dropout_prob=-1,
             correlation_ws=None,
             hybrid_att=True,
             max_fact_len=7,
             use_dice_val=False,
             use_qmask=False,
             renormalization_scale=4.8,
             w2v_embed_scale=0.42,
             emb_scale=0.32,
             use_soft_att=False,
             use_hard_att_eval=False,
             use_batch_norm=False,
             learning_rule=None,
             use_loc_based_addressing=True,
             smoothed_diff_weights=False,
             use_multiscale_shifts=True,
             use_ff_controller=False,
             use_gate_quad_interactions=False,
             permute_order=False,
             wpenalty=None,
             noise=None,
             w2v_embed_path=None,
             glove_embed_path=None,
             learn_embeds=True,
             use_last_hidden_state=False,
             use_adv_indexing=False,
             use_bow_input=True,
             use_out_mem=True,
             use_deepout=True,
             use_q_mask=False,
             use_inp_content=True,
             rnd_indxs=None,
             address_size=0,
             learn_h0=False,
             use_context=False,
             debug=False,
             controller_activ=None,
             mem_gater_activ=None,
             weight_initializer=None,
             bias_initializer=None,
             use_cost_mask=True,
             use_bow_cost_mask=True,
             theano_function_mode=None,
             batch_size=32,
             use_noise=False,
             reinforce_decay=0.9,
             softmax=False,
             use_mask=False,
             name="ntm_model",
             **kwargs):
    """
    Configure a memory-network / NTM-style model.

    Mostly validates the configuration and stores it on ``self``; the
    heavy lifting (parameter creation, graph building) happens elsewhere
    via ``self.reset()`` and later build calls. Parameters fall into a
    few groups: sizes (``n_in``/``n_hids``/``n_out``/``mem_size``/
    ``mem_nel``/...), input representation flags (``use_bow_input``,
    ``use_gru_inp_rep``, ...), attention/addressing options
    (``n_read_heads``, ``use_loc_based_addressing``, ...), REINFORCE
    options (``use_reinforce*``, ``lambda*_rein``, ``baseline_reg``),
    regularizers (``l1_pen``/``l2_pen``/``wpenalty``/``noise``), and
    bookkeeping (``name``, ``batch_size``, ``debug``, ...).
    """
    assert deep_out_size is not None, ("Size of the deep output "
                                       " should not be None.")

    if sub_mb_size is None:
        sub_mb_size = batch_size

    assert sub_mb_size <= batch_size, "batch_size should be greater than sub_mb_size"
    self.hybrid_att = hybrid_att

    # Snapshot of the full configuration (includes every argument).
    self.state = locals()
    self.use_context = use_context
    self.eps = 1e-8
    self.use_mask = use_mask
    self.l1_pen = l1_pen
    self.l2_pen = l2_pen
    self.l2_penalizer = None
    self.emb_scale = emb_scale
    self.w2v_embed_path = w2v_embed_path
    self.glove_embed_path = glove_embed_path
    self.learn_embeds = learn_embeds
    self.exclude_params = {}

    self.use_gate_quad_interactions = use_gate_quad_interactions
    self.reinforce_decay = reinforce_decay
    self.max_fact_len = max_fact_len
    self.lambda1_reinf = lambda1_rein
    self.lambda2_reinf = lambda2_rein
    self.use_reinforce_baseline = use_reinforce_baseline
    self.use_reinforce = use_reinforce
    self.use_gru_inp_rep = use_gru_inp_rep
    self.use_simple_rnn_inp_rep = use_simple_rnn_inp_rep
    self.use_q_mask = use_q_mask
    self.use_inp_content = use_inp_content
    self.rnd_indxs = rnd_indxs

    self.use_layer_norm = use_layer_norm
    self.recurrent_dropout_prob = recurrent_dropout_prob

    self.n_reading_steps = n_reading_steps
    self.sub_mb_size = sub_mb_size
    self.predict_bow_out = predict_bow_out
    self.correlation_ws = correlation_ws
    self.smoothed_diff_weights = smoothed_diff_weights
    self.use_soft_att = use_soft_att
    self.use_hard_att_eval = use_hard_att_eval

    if anticorrelation and n_read_heads < 2:
        raise ValueError("Anti-correlation of the attention weight"
                         " do not support the multiple read heads.")

    self.anticorrelation = anticorrelation

    if self.predict_bow_out:
        if len(inps) <= 4:
            raise ValueError("The number of inputs should be greater than 4.")

    if l2_pen:
        self.l2_penalizer = L2Penalty(self.l2_pen)

    #assert use_bow_input ^ use_gru_inp_rep ^ self.use_simple_rnn_inp_rep, \
    #        "You should either use GRU or BOW input."

    self.renormalization_scale = renormalization_scale
    self.w2v_embed_scale = w2v_embed_scale
    self.baseline_reg = baseline_reg
    self.inps = inps
    self.erase_activ = erase_activ
    self.use_ff_controller = use_ff_controller
    self.content_activ = content_activ
    self.use_bow_cost_mask = use_bow_cost_mask
    self.ntm_outs = None
    self.theano_function_mode = theano_function_mode
    self.n_in = n_in
    self.dropout = dropout
    self.wpenalty = wpenalty
    self.noise = noise
    self.bow_size = bow_size
    self.use_last_hidden_state = use_last_hidden_state
    self.use_loc_based_addressing = use_loc_based_addressing
    self.train_profile = train_profile
    self.use_nogru_mem2q = use_nogru_mem2q
    self.use_qmask = use_qmask
    self.permute_order = permute_order
    self.use_batch_norm = use_batch_norm

    # Use this if you have a ff-controller because otherwise this is not effective:
    self.n_layers = n_layers

    # Pick the REINFORCE variant; the "BaselineExt" class is used when no
    # learned baseline is requested.
    if self.use_reinforce:
        reinforceCls = REINFORCE
        if not self.use_reinforce_baseline:
            reinforceCls = REINFORCEBaselineExt

        self.Reinforce = reinforceCls(lambda1_reg=self.lambda1_reinf,
                                      lambda2_reg=self.lambda2_reinf,
                                      decay=self.reinforce_decay)

        self.ReaderReinforce = \
            ReinforcePenalty(reinf_level=self.lambda1_reinf,
                             maxent_level=self.lambda2_reinf,
                             use_reinforce_baseline=self.use_reinforce_baseline)

    self.dice_val = None
    if use_dice_val:
        self.dice_val = sharedX(1.)

    self.use_dice_val = use_dice_val

    if bow_size is None:
        raise ValueError("bow_size should be specified.")

    if name is None:
        raise ValueError("name should not be empty.")

    self.n_hids = n_hids
    self.mem_size = mem_size
    self.use_deepout = use_deepout
    self.mem_nel = mem_nel
    self.n_out = n_out
    self.use_out_mem = use_out_mem
    self.use_multiscale_shifts = use_multiscale_shifts
    self.address_size = address_size
    self.n_read_heads = n_read_heads
    self.n_write_heads = n_write_heads
    self.learn_h0 = learn_h0
    self.use_adv_indexing = use_adv_indexing
    self.softmax = softmax
    self.use_bow_input = use_bow_input
    self.use_cost_mask = use_cost_mask
    self.deep_out_size = deep_out_size
    self.controller_activ = controller_activ
    self.mem_gater_activ = mem_gater_activ
    self.weight_initializer = weight_initializer
    self.bias_initializer = bias_initializer

    # Fall back to a symbolic batch size taken from the first input when
    # no explicit batch size is given.
    if batch_size:
        self.batch_size = batch_size
    else:
        self.batch_size = inps[0].shape[1]

    #assert self.batch_size >= self.sub_mb_size, ("Minibatch size should be "
    #                                             " greater than the sub minibatch size")
    self.comp_grad_fn = None
    self.name = name
    self.use_noise = use_noise

    self.train_timer = Timer("Training function")
    self.gradfn_timer = Timer("Gradient function")
    self.grads_timer = Timer("Computing the grads")

    self.reset()
    self.seq_len = TT.iscalar('seq_len')
    self.__convert_inps_to_list()

    if debug:
        # Wire up test values for Theano's compute_test_value debugging.
        if self.use_gru_inp_rep or self.use_bow_input:
            self.seq_len.tag.test_value = self.inps[0].tag.test_value.shape[1]
        else:
            self.seq_len.tag.test_value = self.inps[0].tag.test_value.shape[0]

    self.learning_rule = learning_rule

    if self.predict_bow_out:
        self.bow_out_w = TT.fscalar("bow_out_w")
        if debug:
            self.bow_out_w.tag.test_value = np.float32(1.0)
    else:
        self.bow_out_w = 0
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Compile the AdaDelta training functions.

    Builds two theano functions: ``f_grad_shared`` evaluates the cost and
    errors and stores the gradients into shared variables, and ``f_update``
    applies the AdaDelta parameter updates computed from those stored
    gradients.

    Parameters
    ----------
    learning_rate : theano scalar
        Used as the AdaDelta conditioning constant (epsilon).
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Symbolic inputs of the cost/error graph.
    cost : theano expression
        Training cost.
    errors : theano expression
        Error measure reported alongside the cost.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier.

    Returns
    -------
    tuple
        ``(f_grad_shared, f_update)`` theano functions.
    """
    updates = OrderedDict()
    tot_norm_up = 0

    # Shared gradient storage so that gradient evaluation (f_grad_shared)
    # and the parameter update (f_update) can run as two separate calls.
    gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                                      name='%s_grad' % p.name)
                           for p, g in grads.iteritems()})
    gsup = [(gshared[p], g) for p, g in grads.iteritems()]

    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    for param in gshared.keys():
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(param.get_value() * 0.)
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0.)

        if param.name is not None:
            mean_square_grad.name = 'mean_square_grad_' + param.name
            mean_square_dx.name = 'mean_square_dx_' + param.name

        # Accumulate gradient second moment.
        new_mean_squared_grad = (self.decay * mean_square_grad +
                                 (1 - self.decay) * T.sqr(gshared[param]))

        # Compute update. Fix: honor the per-parameter lr_scalers, as the
        # get_updates variant of this optimizer already does; fall back to
        # the plain learning_rate when no scalers are supplied.
        if lr_scalers is not None:
            epsilon = lr_scalers.get(param, 1.) * learning_rate
        else:
            epsilon = learning_rate
        rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
        rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
        delta_x_t = - rms_dx_tm1 / rms_grad_t * gshared[param]

        # Accumulate step second moment.
        new_mean_square_dx = (self.decay * mean_square_dx +
                              (1 - self.decay) * T.sqr(delta_x_t))

        # Apply update.
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[param] = param + delta_x_t
        tot_norm_up += delta_x_t.norm(2)

    f_update = theano.function([learning_rate], [tot_norm_up],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Compile RMSProp-with-momentum training functions.

    ``f_grad_shared`` evaluates the cost/errors and stores gradients into
    shared variables; ``f_update`` applies an RMS-normalized momentum step,
    optionally clipping the total momentum norm.

    Parameters
    ----------
    learning_rate : theano scalar
        Learning rate coefficient.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Symbolic inputs of the cost/error graph.
    cost : theano expression
        Training cost.
    errors : theano expression
        Error measure reported alongside the cost.
    lr_scalers : dict, optional
        Per-parameter learning rate multipliers (currently unused here).

    Returns
    -------
    tuple
        ``(f_grad_shared, f_update)`` theano functions.
    """
    updates = OrderedDict()
    velocity = OrderedDict()
    tot_norm_up = 0

    # Shared gradient storage decouples gradient evaluation from updates.
    gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                                      name='%s_grad' % p.name)
                           for p, g in grads.iteritems()})
    gsup = [(gshared[p], g) for p, g in grads.iteritems()]

    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    for param in gshared.keys():
        avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
        velocity[param] = sharedX(np.zeros_like(param.get_value()))

        if param.name is not None:
            avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

        # Running average of the squared gradient, E[g^2].
        new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr +\
            (1 - self.averaging_coeff) * T.sqr(gshared[param])
        if self.use_first_order:
            # Also track E[g] and use the centered variance for the RMS.
            avg_grad = sharedX(np.zeros_like(param.get_value()))
            if param.name is not None:
                avg_grad.name = 'avg_grad_' + param.name
            new_avg_grad = self.averaging_coeff * avg_grad +\
                (1 - self.averaging_coeff) * gshared[param]
            rms_grad_t = T.sqrt(new_avg_grad_sqr - new_avg_grad**2)
            updates[avg_grad] = new_avg_grad
        else:
            rms_grad_t = T.sqrt(new_avg_grad_sqr)

        rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
        normalized_grad = gshared[param] / (rms_grad_t)
        new_velocity = self.momentum * velocity[param] -\
            learning_rate * normalized_grad

        tot_norm_up += new_velocity.norm(2)

        updates[avg_grad_sqr] = new_avg_grad_sqr
        updates[velocity[param]] = new_velocity
        updates[param] = param + new_velocity

    if self.momentum_clipping is not None:
        # Rescale all momenta jointly so their total norm stays bounded,
        # and recompute the reported update norm from the clipped steps.
        tot_norm_up = 0

        new_mom_norm = sum(
            map(lambda X: T.sqr(X).sum(),
                [updates[velocity[param]] for param in grads.keys()]))
        new_mom_norm = T.sqrt(new_mom_norm)
        scaling_den = T.maximum(self.momentum_clipping, new_mom_norm)
        scaling_num = self.momentum_clipping

        for param in grads.keys():
            if self.bound_inc:
                updates[velocity[param]] *= (scaling_num / scaling_den)
                # Fix: this branch previously left tot_norm_up at 0, so
                # f_update always reported a zero update norm.
                tot_norm_up += updates[velocity[param]].norm(2)
                updates[param] = param + updates[velocity[param]]
            else:
                update_step = updates[velocity[param]] * \
                    (scaling_num / scaling_den)
                tot_norm_up += update_step.norm(2)
                updates[param] = param + update_step

    f_update = theano.function([learning_rate], [tot_norm_up],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Compile Adam-style training functions with bias correction.

    ``f_grad_shared`` evaluates the cost/errors and stores gradients into
    shared variables; ``f_update`` applies a bias-corrected, RMS-normalized
    momentum step. An optional global gradient-norm clip is applied before
    the statistics are accumulated.

    Parameters
    ----------
    learning_rate : theano scalar
        Learning rate coefficient.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Symbolic inputs of the cost/error graph.
    cost : theano expression
        Training cost.
    errors : theano expression
        Error measure reported alongside the cost.
    lr_scalers : dict, optional
        Per-parameter learning rate multipliers (currently unused here).

    Returns
    -------
    tuple
        ``(f_grad_shared, f_update)`` theano functions.
    """
    if self.gradient_clipping is not None:
        # Rescale the whole gradient so its global L2 norm is bounded.
        grads_norm = sum(
            map(lambda X: T.sqr(X).sum(),
                [grads[param] for param in grads.keys()]))
        grads_norm = T.sqrt(grads_norm)
        scaling_den = T.maximum(self.gradient_clipping, grads_norm)
        scaling_num = self.gradient_clipping
        for param in grads.keys():
            grads[param] = scaling_num * grads[param] / scaling_den

    updates = OrderedDict()
    velocity = OrderedDict()
    normalized_velocities = OrderedDict()

    counter = sharedX(0, 'counter')
    tot_norm_up = 0

    gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                                      name='%s_grad' % p.name)
                           for p, g in grads.iteritems()})
    gsup = [(gshared[p], g) for p, g in grads.iteritems()]

    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    for param in gshared.keys():
        avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
        velocity[param] = sharedX(np.zeros_like(param.get_value()))

        next_counter = counter + 1.

        # Adam bias-correction terms for the first and second moments.
        fix_first_moment = 1. - self.momentum**next_counter
        fix_second_moment = 1. - self.averaging_coeff**next_counter

        if param.name is not None:
            avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

        new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr \
            + (1 - self.averaging_coeff) * T.sqr(gshared[param])

        rms_grad_t = T.sqrt(new_avg_grad_sqr)
        rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
        new_velocity = self.momentum * velocity[param] \
            - (1 - self.momentum) * gshared[param]
        normalized_velocity = (new_velocity * T.sqrt(fix_second_moment)) \
            / (rms_grad_t * fix_first_moment)

        tot_norm_up += learning_rate * normalized_velocity.norm(2)

        normalized_velocities[param] = normalized_velocity
        updates[avg_grad_sqr] = new_avg_grad_sqr
        updates[velocity[param]] = new_velocity
        # Fix: scale the step by learning_rate. The original applied the
        # raw normalized velocity, silently ignoring the learning_rate
        # input of f_update (masked by on_unused_input='ignore'), while
        # tot_norm_up and the sibling get_updates implementation both
        # scale the step by the learning rate.
        updates[param] = param + learning_rate * normalized_velocities[param]

    updates[counter] = counter + 1
    f_update = theano.function([learning_rate], [tot_norm_up],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def __call__(self, probs, samples, baseline, updates,
             cost=None, mask=None, seq_len=20, batch_size=140,
             deterministic=False):
    """
    Build the REINFORCE gradient expression for ``probs``.

    Maintains running (exponentially decayed) estimates of the cost
    mean ("center") and variance inside ``updates``, centers and
    standardizes the reward with them, and returns the known gradient
    for ``probs`` together with monitoring quantities.

    Parameters
    ----------
    probs : theano tensor
        Action probabilities produced by the policy.
    samples : theano tensor
        Sampled actions (same shape family as ``probs``).
    baseline : theano tensor
        Learned baseline subtracted from the reward.
    updates : OrderedDict
        Accumulated theano updates; step/center/cost_var shared
        variables are created here if not already present.
    cost : theano expression
        Per-sample cost used as the (negative) reward signal.
    mask : theano tensor, optional
        Sequence mask applied to the gradient.
    seq_len, batch_size, deterministic
        Unused here; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(updates, known_grads, new_center, cost_std, policy,
        lambda2_reg)``.
    """
    # Fix: the original tested the builtin `input` (never None) and used
    # a bare `__class__`, which raises NameError on Python 2. Validate
    # the actual argument instead.
    if probs is None:
        raise ValueError("input for the %s should"
                         " not be empty." % self.__class__.__name__)

    step = 0
    key_step = get_key_byname_from_dict(updates, "step")
    if key_step:
        step = updates[key_step]
    else:
        step = sharedX(0., name="step")
        updates[step] = step + as_floatX(1)

    # Running estimate of the mean cost (the "center").
    key_center = get_key_byname_from_dict(updates, "center")
    if key_center:
        center = updates[key_center]
        new_center = center
    else:
        center = sharedX(0.08 + self.eps, name="center")
        new_center = as_floatX(self.decay) * center + \
            as_floatX(1 - self.decay) * cost.sum(0).mean()
        updates[center] = new_center

    # Running estimate of the cost variance.
    key_cvar = get_key_byname_from_dict(updates, "cost_var")
    if key_cvar:
        cost_var = updates[key_cvar]
        new_cost_var = cost_var
    else:
        cost_var_tot = (cost.sum(0).mean() - new_center)**2
        cost_var = sharedX(as_floatX(0.5), name="cost_var")
        new_cost_var = as_floatX(self.decay) * cost_var + \
            as_floatX(1 - self.decay) * cost_var_tot
        updates[cost_var] = new_cost_var

    lambda2_reg = self.lambda2_reg
    # Fix: condition was inverted — the original indexed
    # self.schedule_h_opts inside `if not self.schedule_h_opts:`, which
    # crashes when it is empty/None and never runs when it is set.
    if self.schedule_h_opts:
        start = self.schedule_h_opts["lambda2_reg_start"]
        nbatches = self.schedule_h_opts["end_nbatches"]
        end = self.lambda2_reg
        assert start > end
        lambda2_reg = TT.minimum(((start - end) * step / nbatches) + start,
                                 end)

    if samples.ndim == 4:
        reward = cost.dimshuffle(0, 'x', 1, 'x')
        policy = (TT.log(probs + 1e-8) * samples).mean((2, 3)).sum()
    else:
        if cost.ndim == 2:
            reward = cost.dimshuffle(0, 1, 'x')
        elif cost.ndim == 1:
            reward = cost.dimshuffle('x', 0, 'x')
        baseline = baseline.dimshuffle(1, 0, 2)
        policy = (TT.log(probs + 1e-8) * samples).mean((1, 2)).sum()

    cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1e-6)
    centered_reward = (reward - baseline - new_center) / cost_std
    N = probs.shape[-1]
    # REINFORCE gradient plus a (negative-)entropy regularization term.
    gradp = self.lambda1_reg * (centered_reward) * \
        (samples / (probs + 1e-8)) + \
        lambda2_reg * (TT.log(probs + 1e-6) + as_floatX(1))

    if mask is not None:
        gradp = mask.dimshuffle(0, 1, 'x') * gradp / N

    known_grads = {probs: gradp}

    return updates, known_grads, new_center, cost_std, policy, lambda2_reg
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Compile RMSProp-with-momentum training functions.

    ``f_grad_shared`` evaluates the cost/errors and stores gradients into
    shared variables; ``f_update`` applies an RMS-normalized momentum step,
    optionally clipping the total momentum norm.

    Parameters
    ----------
    learning_rate : theano scalar
        Learning rate coefficient.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Symbolic inputs of the cost/error graph.
    cost : theano expression
        Training cost.
    errors : theano expression
        Error measure reported alongside the cost.
    lr_scalers : dict, optional
        Per-parameter learning rate multipliers (currently unused here).

    Returns
    -------
    tuple
        ``(f_grad_shared, f_update)`` theano functions.
    """
    updates = OrderedDict()
    velocity = OrderedDict()
    tot_norm_up = 0

    # Shared gradient storage decouples gradient evaluation from updates.
    gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                                      name='%s_grad' % p.name)
                           for p, g in grads.iteritems()})
    gsup = [(gshared[p], g) for p, g in grads.iteritems()]

    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    for param in gshared.keys():
        avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
        velocity[param] = sharedX(np.zeros_like(param.get_value()))

        if param.name is not None:
            avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

        # Running average of the squared gradient, E[g^2].
        new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr +\
            (1 - self.averaging_coeff) * T.sqr(gshared[param])
        if self.use_first_order:
            # Also track E[g] and use the centered variance for the RMS.
            avg_grad = sharedX(np.zeros_like(param.get_value()))
            if param.name is not None:
                avg_grad.name = 'avg_grad_' + param.name
            new_avg_grad = self.averaging_coeff * avg_grad +\
                (1 - self.averaging_coeff) * gshared[param]
            rms_grad_t = T.sqrt(new_avg_grad_sqr - new_avg_grad**2)
            updates[avg_grad] = new_avg_grad
        else:
            rms_grad_t = T.sqrt(new_avg_grad_sqr)

        rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
        normalized_grad = gshared[param] / (rms_grad_t)
        new_velocity = self.momentum * velocity[param] -\
            learning_rate * normalized_grad

        tot_norm_up += new_velocity.norm(2)

        updates[avg_grad_sqr] = new_avg_grad_sqr
        updates[velocity[param]] = new_velocity
        updates[param] = param + new_velocity

    if self.momentum_clipping is not None:
        # Rescale all momenta jointly so their total norm stays bounded,
        # and recompute the reported update norm from the clipped steps.
        tot_norm_up = 0

        new_mom_norm = sum(
            map(lambda X: T.sqr(X).sum(),
                [updates[velocity[param]] for param in grads.keys()]))
        new_mom_norm = T.sqrt(new_mom_norm)
        scaling_den = T.maximum(self.momentum_clipping, new_mom_norm)
        scaling_num = self.momentum_clipping

        for param in grads.keys():
            if self.bound_inc:
                updates[velocity[param]] *= (scaling_num / scaling_den)
                # Fix: this branch previously left tot_norm_up at 0, so
                # f_update always reported a zero update norm.
                tot_norm_up += updates[velocity[param]].norm(2)
                updates[param] = param + updates[velocity[param]]
            else:
                update_step = updates[velocity[param]] * \
                    (scaling_num / scaling_den)
                tot_norm_up += update_step.norm(2)
                updates[param] = param + update_step

    f_update = theano.function([learning_rate], [tot_norm_up],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Compute Adam-style updates with bias correction and an adaptively
    bounded effective learning rate.

    The effective learning rate is reduced when the ratio of the total
    update norm to the total parameter norm exceeds
    ``self.update_param_norm_ratio``; this bounding only kicks in after
    6000 steps of the internal counter.

    Parameters
    ----------
    learning_rate : float or theano scalar
        Learning rate coefficient.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier (not used by this rule).

    Returns
    -------
    tuple
        ``(updates, tot_norm_up, tot_param_norm)`` — the update
        dictionary plus monitoring norms.
    """
    updates = OrderedDict()
    velocity = OrderedDict()
    normalized_velocities = OrderedDict()

    counter = sharedX(0, 'counter')
    tot_norm_up = 0
    tot_param_norm = 0

    if self.gradient_clipping is not None:
        # Rescale the whole gradient so its global L2 norm is bounded.
        grads_norm = sum(
            map(lambda X: T.sqr(X).sum(),
                [grads[param] for param in grads.keys()]))
        grads_norm = T.sqrt(grads_norm)
        scaling_den = T.maximum(self.gradient_clipping, grads_norm)
        scaling_num = self.gradient_clipping
        for param in grads.keys():
            grads[param] = scaling_num * grads[param] / scaling_den

    for param in grads.keys():
        avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
        velocity[param] = sharedX(np.zeros_like(param.get_value()))

        next_counter = counter + 1.

        # Adam bias-correction factors for the first/second moments.
        fix_first_moment = 1. - self.momentum**next_counter
        fix_second_moment = 1. - self.averaging_coeff**next_counter

        if param.name is not None:
            avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

        # Running average of the squared gradient, E[g^2].
        new_avg_grad_sqr = self.averaging_coeff*avg_grad_sqr \
            + (1 - self.averaging_coeff)*T.sqr(grads[param])

        rms_grad_t = T.sqrt(new_avg_grad_sqr)
        rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
        new_velocity = self.momentum * velocity[param] \
            - (1 - self.momentum) * grads[param]
        # Bias-corrected, RMS-normalized step direction.
        normalized_velocity = (new_velocity * T.sqrt(fix_second_moment)) \
            / (rms_grad_t * fix_first_moment)

        tot_param_norm += param.norm(2)
        tot_norm_up += learning_rate * normalized_velocity.norm(2)

        normalized_velocities[param] = normalized_velocity
        updates[avg_grad_sqr] = new_avg_grad_sqr
        updates[velocity[param]] = new_velocity

    update_param_norm_ratio = tot_norm_up / (tot_param_norm + 1e-7)

    # Shrink the lr when the update/parameter norm ratio is too large.
    new_lr = ifelse.ifelse(
        T.ge(update_param_norm_ratio, self.update_param_norm_ratio),
        as_floatX(learning_rate * self.update_param_norm_ratio) / update_param_norm_ratio,
        as_floatX(learning_rate))

    # Only apply the bounding after a warm-up of 6000 steps.
    new_lr = ifelse.ifelse(T.ge(counter, 6000),
                           new_lr,
                           as_floatX(learning_rate))

    for param in grads.keys():
        updates[param] = param + new_lr * normalized_velocities[param]

    updates[counter] = counter + 1

    return updates, tot_norm_up, tot_param_norm
def lstm_tied_layer(tparams, state_below, options, prefix='lstm_tied',
                    mask=None, one_step=False, init_state=None,
                    init_memory=None, nsteps=None, **kwargs):
    """
    LSTM layer with tied gates: the input gate is tied to the forget
    gate (i = 1 - f), so the pre-activation carries only three slices
    (forget, output, candidate cell).

    Parameters
    ----------
    tparams : dict
        Shared parameters, keyed by ``prfx(prefix, name)``.
    state_below : theano tensor
        Layer input; 3-d (steps, samples, features) or 2-d for one step.
    options : dict
        Model options; reads 'learn_h0', 'dim' and 'batch_size'.
    prefix : str
        Parameter-name prefix.
    mask : theano tensor, optional
        Step mask; defaults to all-ones.
    one_step : bool
        If True, apply a single transition instead of a scan.
    init_state, init_memory : theano tensor, optional
        Initial hidden state / cell; default to zeros (or a learned h0).
    nsteps : theano scalar, optional
        Number of scan steps; defaults to ``state_below.shape[0]``.

    Returns
    -------
    list
        ``[h, c]`` sequences (or single step values).
    """
    if nsteps is None:
        nsteps = state_below.shape[0]

    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    param = lambda name: tparams[prfx(prefix, name)]
    dim = param('U').shape[0]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # initial/previous state
    if init_state is None:
        if not options['learn_h0']:
            init_state = tensor.alloc(0., n_samples, dim)
        else:
            # Learned initial state, replicated across the minibatch.
            init_state0 = sharedX(numpy.zeros((options['dim'])),
                                  name=prfx(prefix, "h0"))
            init_state = tensor.concatenate([[init_state0] \
                                             for i in xrange(options['batch_size'])],
                                            axis=0)
            tparams[prfx(prefix, 'h0')] = init_state0

    # initial/previous memory
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    def _slice(_x, n, dim):
        # Pick the n-th gate slice out of the concatenated pre-activation.
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    def _step(mask, sbelow, sbefore, cell_before):
        # One LSTM transition; sbefore/cell_before are h_{t-1}/c_{t-1}.
        preact = dot(sbefore, param('U'))
        preact += sbelow
        # NOTE(review): `sbelow` already includes param('b') (added when
        # projecting state_below below), so the bias appears to be added
        # twice here — confirm whether this is intentional.
        preact += tparams[prfx(prefix, 'b')]

        f = Sigmoid(_slice(preact, 0, dim))
        o = Sigmoid(_slice(preact, 1, dim))
        c = Tanh(_slice(preact, 2, dim))

        # Tied input gate: i = 1 - f.
        c = f * cell_before + (1 - f) * c
        c = mask * c + (1. - mask) * cell_before
        h = o * tensor.tanh(c)
        h = mask * h + (1. - mask) * sbefore
        return h, c

    state_below = dot(state_below, param('W')) + param('b')

    if one_step:
        mask = mask.dimshuffle(0, 'x')
        h, c = _step(mask, state_below, init_state, init_memory)
        rval = [h, c]
    else:
        if mask.ndim == 3 and mask.ndim == state_below.ndim:
            mask = mask.reshape((mask.shape[0],
                                 mask.shape[1]*mask.shape[2])).dimshuffle(0, 1, 'x')
        elif mask.ndim == 2:
            mask = mask.dimshuffle(0, 1, 'x')

        rval, updates = theano.scan(_step,
                                    sequences=[mask, state_below],
                                    outputs_info=[init_state, init_memory],
                                    name=prfx(prefix, '_layers'),
                                    n_steps=nsteps)
    return rval
def __init__(self, init_momentum, nesterov_momentum=False):
    """
    Momentum-based learning rule.

    Parameters
    ----------
    init_momentum : float
        Initial momentum coefficient; must lie in ``[0, 1)``.
    nesterov_momentum : bool
        If True, use Nesterov-style momentum in the update rule.
    """
    # Momentum must lie in the half-open interval [0, 1).
    assert 0. <= init_momentum < 1.
    self.nesterov_momentum = nesterov_momentum
    # Stored as a shared variable so it can be adjusted during training.
    self.momentum = sharedX(init_momentum, 'momentum')
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Compile Adasecant-style training functions with variance reduction.

    ``f_grad_shared`` evaluates the cost/errors and stores gradients into
    shared variables; ``f_update`` applies an adaptive, per-parameter
    step using running estimates of gradient moments, curvature and
    per-element time constants (taus), with optional variance-reduced
    ("corrected") gradients and an optional AdaGrad-style scaling.

    Parameters
    ----------
    learning_rate : theano scalar
        Learning rate is not actually used by the update (the internal
        step is self-scaled), but the interface requires it.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Symbolic inputs of the cost/error graph.
    cost : theano expression
        Training cost.
    errors : theano expression
        Error measure reported alongside the cost.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier (not used by this rule).

    Returns
    -------
    tuple
        ``(f_grad_shared, f_update)`` theano functions.
    """
    updates = OrderedDict({})
    eps = self.damping
    step = sharedX(0., name="step")

    if self.skip_nan_inf:
        # If the gradient of a parameter is inf or nan, zero it out
        # instead of updating that parameter. Useful for RNNs.
        grads = OrderedDict({p: T.switch(T.or_(T.isinf(grads[p]),
            T.isnan(grads[p])), 0, grads[p]) for
            p in grads.keys()})

    # Block-normalize gradients:
    nparams = len(grads.keys())

    # Apply the gradient clipping, this is only sometimes
    # necessary for RNNs and sometimes for very deep networks
    if self.grad_clip:
        assert self.grad_clip > 0.
        assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
        gnorm = sum([g.norm(2) for g in grads.values()])
        notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))
        for p, g in grads.iteritems():
            # Scale down when the average per-layer norm exceeds the clip;
            # if the total norm is not finite, pull params toward zero.
            tmpg = T.switch(gnorm / nparams > self.grad_clip,
                            g * self.grad_clip * nparams / gnorm, g)
            grads[p] = T.switch(notfinite, as_floatX(0.1)*p, tmpg)

    tot_norm_up = 0
    # Shared gradient storage decouples gradient evaluation from updates.
    gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                           name='%s_grad' % p.name)
                           for p, g in grads.iteritems()})
    gsup = [(gshared[p], g) for p, g in grads.iteritems()]
    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    fix_decay = self.slow_decay**(step + 1)

    for param in gshared.keys():
        gshared[param].name = "grad_%s" % param.name
        mean_grad = sharedX(param.get_value() * 0. + eps,
                            name="mean_grad_%s" % param.name)
        gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)
        prod_taus = sharedX((np.ones_like(param.get_value()) - 2*eps),
                            name="prod_taus_x_t_" + param.name)
        slow_constant = 2.1

        if self.use_adagrad:
            # sum_square_grad := \sum_i g_i^2
            sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                      name="sum_square_grad_%s" % param.name)

        """
        Initialization of accumulators
        """
        # Per-element time constants of the running averages.
        taus_x_t = sharedX((np.ones_like(param.get_value()) + eps) * slow_constant,
                           name="taus_x_t_" + param.name)
        self.taus_x_t = taus_x_t

        # Variance reduction parameters
        # Numerator of the gamma:
        gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_nume_sqr_" + param.name)
        # Denominator of the gamma:
        gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_deno_sqr_" + param.name)
        # For the covariance parameter := E[\gamma \alpha]_{t-1}
        cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                            name="cov_num_t_" + param.name)
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                   name="msg_" + param.name)
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)
        if self.use_corrected_grad:
            old_grad = sharedX(param.get_value() * 0. + eps)

        # The uncorrected gradient of previous of the previous update:
        old_plain_grad = sharedX(param.get_value() * 0. + eps)
        mean_curvature = sharedX(param.get_value() * 0. + eps)
        mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)
        # Initialize the E[\Delta]_{t-1}
        mean_dx = sharedX(param.get_value() * 0.)

        # Block-wise normalize the gradient:
        norm_grad = gshared[param]

        # For the first time-step, assume that delta_x_t := norm_grad
        gnorm = T.sqr(norm_grad).sum()

        cond = T.eq(step, 0)
        gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
        gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

        norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
        msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
        mdx = cond * norm_grad + (1 - cond) * mean_dx

        new_prod_taus = (
            prod_taus * (1 - 1 / taus_x_t)
        )

        """
        Compute the new updated values.
        """
        # E[g_i^2]_t
        new_mean_squared_grad = (
            mean_square_grad * (1 - 1 / taus_x_t) +
            T.sqr(norm_grad) / (taus_x_t)
        )
        new_mean_squared_grad.name = "msg_" + param.name
        # E[g_i]_t
        new_mean_grad = (
            mean_grad * (1 - 1 / taus_x_t) +
            norm_grad / taus_x_t
        )
        new_mean_grad.name = "nmg_" + param.name
        # Bias-corrected mean and mean-square of the gradient.
        mg = new_mean_grad / (1 - new_prod_taus)
        mgsq = new_mean_squared_grad / (1 - new_prod_taus)

        new_gnorm_sqr = (
            gnorm_sqr_o * self.slow_decay +
            T.sqr(norm_grad).sum() * (1 - self.slow_decay)
        )

        # Keep the rms for numerator and denominator of gamma.
        new_gamma_nume_sqr = (
            gamma_nume_sqr * (1 - 1 / taus_x_t) +
            T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t
        )
        new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

        new_gamma_deno_sqr = (
            gamma_deno_sqr * (1 - 1 / taus_x_t) +
            T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t
        )
        new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

        # gamma controls the amount of variance reduction applied.
        gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) + \
                                          self.gamma_reg)
        gamma.name = "gamma_" + param.name

        if self.gamma_clip and self.gamma_clip > -1:
            gamma = T.minimum(gamma, self.gamma_clip)

        momentum_step = gamma * mg
        corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

        # For starting the variance reduction.
        if self.start_var_reduction > -1:
            cond = T.le(self.start_var_reduction, step)
            corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
        else:
            corrected_grad = norm_grad

        if self.use_adagrad:
            g = corrected_grad
            # Accumulate gradient
            new_sum_squared_grad = (
                sum_square_grad + T.sqr(g)
            )
            rms_g_t = T.sqrt(new_sum_squared_grad)
            rms_g_t = T.maximum(rms_g_t, 1.0)

        # Use the gradients from the previous update
        # to compute the \nabla f(x_t) - \nabla f(x_{t-1})
        cur_curvature = norm_grad - old_plain_grad
        #cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
        cur_curvature_sqr = T.sqr(cur_curvature)

        new_curvature_ave = (
            mean_curvature * (1 - 1 / taus_x_t) +
            (cur_curvature / taus_x_t)
        )
        new_curvature_ave.name = "ncurve_ave_" + param.name

        # Average average curvature
        nc_ave = new_curvature_ave / (1 - new_prod_taus)

        new_curvature_sqr_ave = (
            mean_curvature_sqr * (1 - 1 / taus_x_t) +
            (cur_curvature_sqr / taus_x_t)
        )
        new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

        # Unbiased average squared curvature
        nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

        epsilon = 1e-7
        #lr_scalers.get(param, 1.) * learning_rate
        scaled_lr = sharedX(1.0)
        rms_dx_tm1 = T.sqrt(msdx + epsilon)
        rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

        # This is where the update step is being defined
        delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t -
                                  cov_num_t / (new_curvature_sqr_ave + epsilon))
        delta_x_t.name = "delta_x_t_" + param.name

        # This part seems to be necessary for only RNNs
        # For feedforward networks this does not seem to be important.
        if self.delta_clip:
            logger.info("Clipping will be applied on the adaptive step size.")
            delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad is disabled.")
                delta_x_t = delta_x_t * corrected_grad
        else:
            logger.info("Clipping will not be applied on the adaptive step size.")
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad will not be used.")
                delta_x_t = delta_x_t * corrected_grad

        new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(1 + eps, "stabilized")

        # To compute the E[\Delta^2]_t
        new_mean_square_dx = (
            msdx * (1 - 1 / taus_x_t) +
            (T.sqr(delta_x_t) / taus_x_t)
        )

        # To compute the E[\Delta]_t
        new_mean_dx = (
            mdx * (1 - 1 / taus_x_t) +
            (delta_x_t / (taus_x_t))
        )

        # Perform the outlier detection:
        # This outlier detection is slightly different:
        new_taus_t = T.switch(T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                                    abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                              T.switch(new_taus_t > 2.5,
                                       sharedX(2.5),
                                       new_taus_t + sharedX(1.0) + eps),
                              new_taus_t)

        # Apply the bound constraints on tau:
        new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
        new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

        new_cov_num_t = (
            cov_num_t * (1 - 1 / taus_x_t) +
            (delta_x_t * cur_curvature) * (1 / taus_x_t)
        )

        update_step = delta_x_t
        tot_norm_up += update_step.norm(2)

        # Apply updates
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[mean_dx] = new_mean_dx
        updates[gnorm_sqr] = new_gnorm_sqr
        updates[gamma_nume_sqr] = new_gamma_nume_sqr
        updates[gamma_deno_sqr] = new_gamma_deno_sqr
        updates[taus_x_t] = new_taus_t
        updates[cov_num_t] = new_cov_num_t
        updates[mean_grad] = new_mean_grad
        updates[old_plain_grad] = norm_grad
        updates[mean_curvature] = new_curvature_ave
        updates[mean_curvature_sqr] = new_curvature_sqr_ave

        if self.perform_update:
            updates[param] = param + update_step

        updates[step] = step + 1
        updates[prod_taus] = new_prod_taus

        if self.use_adagrad:
            updates[sum_square_grad] = new_sum_squared_grad

        if self.use_corrected_grad:
            updates[old_grad] = corrected_grad

    f_update = theano.function([learning_rate], [tot_norm_up],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def gru_layer(tparams, state_below, options, prefix='gru', mask=None,
              nsteps=None, truncate=None, init_state=None, **kwargs):
    """
    Standard GRU recurrent layer applied with theano.scan.

    Parameters
    ----------
    tparams : dict
        Shared parameters, keyed by ``prfx(prefix, name)``; uses
        'W', 'b', 'Wx', 'bx', 'U' and 'Ux' (and optionally 'h0').
    state_below : theano tensor
        Layer input; 3-d (steps, samples, features) or 2-d.
    options : dict
        Model options; reads 'learn_h0', 'dim' and 'batch_size'.
    prefix : str
        Parameter-name prefix.
    mask : theano tensor, optional
        Step mask; defaults to all-ones.
    nsteps : theano scalar, optional
        Number of scan steps; defaults to ``state_below.shape[0]``.
    truncate : int, optional
        Passed as ``truncate_gradient`` to scan (BPTT truncation).
    init_state : theano tensor, optional
        Initial hidden state; defaults to zeros (or a learned h0).

    Returns
    -------
    list
        A one-element list holding the hidden state sequence.
    """
    if nsteps is None:
        nsteps = state_below.shape[0]

    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    param = lambda name: tparams[prfx(prefix, name)]
    dim = param('Ux').shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # Bring the mask to (steps, samples, 1) so it broadcasts over units.
    if mask.ndim == 3 and mask.ndim == state_below.ndim:
        mask = mask.reshape((mask.shape[0], \
            mask.shape[1] * mask.shape[2])).dimshuffle(0, 1, 'x')
    elif mask.ndim == 2:
        mask = mask.dimshuffle(0, 1, 'x')

    def _slice(_x, n, dim):
        # Pick the n-th gate slice out of the concatenated pre-activation.
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    # Precompute the input projections for the gates (W) and candidate (Wx).
    state_below_ = dot(state_below, param('W')) + param('b')
    state_belowx = dot(state_below, param('Wx')) + param('bx')

    # initial/previous state
    if init_state is None:
        if not options['learn_h0']:
            init_state = tensor.alloc(0., n_samples, dim)
        else:
            # Learned initial state, replicated across the minibatch.
            init_state0 = sharedX(numpy.zeros((options['dim'])),
                                  name=prfx(prefix, "h0"))
            init_state = tensor.concatenate([[init_state0] \
                                             for i in xrange(options['batch_size'])],
                                            axis=0)
            tparams[prfx(prefix, 'h0')] = init_state0

    U = tparams[prfx(prefix, 'U')]
    Ux = tparams[prfx(prefix, 'Ux')]

    def _step_slice(mask, sbelow, sbelowx, sbefore, U, Ux):
        # One GRU transition; sbefore is h_{t-1}.
        preact = dot(sbefore, U)
        preact += sbelow

        r = Sigmoid(_slice(preact, 0, dim))  # reset gate
        u = Sigmoid(_slice(preact, 1, dim))  # update gate

        preactx = dot(r * sbefore, Ux)
        preactx = preactx + sbelowx

        h = Tanh(preactx)

        # Interpolate between the previous state and the candidate,
        # then carry the previous state through where the mask is 0.
        h = u * sbefore + (1. - u) * h
        h = mask[:, None] * h + (1. - mask)[:, None] * sbefore

        return h

    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[init_state],
                                non_sequences=[U, Ux],
                                name=prfx(prefix, '_layers'),
                                n_steps=nsteps,
                                truncate_gradient=truncate,
                                profile=profile,
                                strict=True)
    rval = [rval]
    return rval