def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    .. todo::

        WRITEME

    Parameters
    ----------
    learning_rate : float
        Learning rate coefficient. The learning rate is not used by this
        rule, but pylearn2 requires a learning rate to be defined.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    lr_scalers : dict
        A dictionary mapping from the model's parameters to a learning
        rate multiplier.
    """
    updates = OrderedDict({})
    eps = self.damping
    step = sharedX(0., name="step")

    if self.skip_nan_inf:
        # If the gradient of a parameter contains inf or nan values, do not
        # update that parameter. This can be useful for RNNs.
        grads = OrderedDict({p: T.switch(T.or_(T.isinf(grads[p]),
                                               T.isnan(grads[p])),
                                         0, grads[p])
                             for p in grads.keys()})

    # Block-normalize gradients:
    nparams = len(grads.keys())

    # Apply gradient clipping; this is only sometimes necessary for RNNs
    # and sometimes for very deep networks.
    if self.grad_clip:
        assert self.grad_clip > 0.
        assert self.grad_clip <= 1., ("Norm of the gradients per layer "
                                      "can not be larger than 1.")

        gnorm = sum([g.norm(2) for g in grads.values()])
        notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

        for p, g in grads.iteritems():
            tmpg = T.switch(gnorm / nparams > self.grad_clip,
                            g * self.grad_clip * nparams / gnorm,
                            g)
            grads[p] = T.switch(notfinite, as_floatX(0.1) * p, tmpg)

    tot_norm_up = 0
    tot_param_norm = 0

    fix_decay = self.slow_decay**(step + 1)

    for param in grads.keys():
        grads[param].name = "grad_%s" % param.name
        mean_grad = sharedX(param.get_value() * 0. + eps,
                            name="mean_grad_%s" % param.name)
        mean_corrected_grad = sharedX(param.get_value() * 0 + eps,
                                      name="mean_corrected_grad_%s" % param.name)
        gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)
        prod_taus = sharedX((np.ones_like(param.get_value()) - 2 * eps),
                            name="prod_taus_x_t_" + param.name)
        slow_constant = 2.1

        if self.use_adagrad:
            # sum_square_grad := \sum_i g_i^2
            sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                      name="sum_square_grad_%s" % param.name)

        """
        Initialization of accumulators
        """
        taus_x_t = sharedX((np.ones_like(param.get_value()) + eps) * slow_constant,
                           name="taus_x_t_" + param.name)
        self.taus_x_t = taus_x_t

        # Variance reduction parameters
        # Numerator of the gamma:
        gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_nume_sqr_" + param.name)

        # Denominator of the gamma:
        gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_deno_sqr_" + param.name)

        # For the covariance parameter := E[\gamma \alpha]_{t-1}
        cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                            name="cov_num_t_" + param.name)

        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                   name="msg_" + param.name)

        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)

        if self.use_corrected_grad:
            old_grad = sharedX(param.get_value() * 0. + eps)

        # The uncorrected gradient of the previous update:
        old_plain_grad = sharedX(param.get_value() * 0. + eps)
        mean_curvature = sharedX(param.get_value() * 0. + eps)
        mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

        # Initialize the E[\Delta]_{t-1}
        mean_dx = sharedX(param.get_value() * 0.)

        # Block-wise normalize the gradient:
        norm_grad = grads[param]

        # For the first time-step, assume that delta_x_t := norm_grad
        gnorm = T.sqr(norm_grad).sum()

        cond = T.eq(step, 0)
        gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
        gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

        norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
        msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
        mdx = cond * norm_grad + (1 - cond) * mean_dx

        new_prod_taus = (prod_taus * (1 - 1 / taus_x_t))

        """
        Compute the new updated values.
        """
        # E[g_i^2]_t
        new_mean_squared_grad = (mean_square_grad * (1 - 1 / taus_x_t) +
                                 T.sqr(norm_grad) / taus_x_t)
        new_mean_squared_grad.name = "msg_" + param.name

        # E[g_i]_t
        new_mean_grad = (mean_grad * (1 - 1 / taus_x_t) +
                         norm_grad / taus_x_t)
        new_mean_grad.name = "nmg_" + param.name

        mg = new_mean_grad / (1 - new_prod_taus)
        mgsq = new_mean_squared_grad / (1 - new_prod_taus)

        new_gnorm_sqr = (gnorm_sqr_o * self.slow_decay +
                         T.sqr(norm_grad).sum() * (1 - self.slow_decay))

        # Keep the rms for numerator and denominator of gamma.
        new_gamma_nume_sqr = (
            gamma_nume_sqr * (1 - 1 / taus_x_t) +
            T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

        new_gamma_deno_sqr = (
            gamma_deno_sqr * (1 - 1 / taus_x_t) +
            T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

        gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) +
                                          self.gamma_reg)
        gamma.name = "gamma_" + param.name

        if self.gamma_clip and self.gamma_clip > -1:
            gamma = T.minimum(gamma, self.gamma_clip)

        momentum_step = gamma * mg
        corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

        # For starting the variance reduction.
        if self.start_var_reduction > -1:
            cond = T.le(self.start_var_reduction, step)
            corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
        else:
            corrected_grad = norm_grad

        if self.use_adagrad:
            g = corrected_grad
            # Accumulate gradient
            new_sum_squared_grad = (sum_square_grad + T.sqr(g))
            rms_g_t = T.sqrt(new_sum_squared_grad)
            rms_g_t = T.maximum(rms_g_t, 1.0)

        # Use the gradients from the previous update
        # to compute \nabla f(x_t) - \nabla f(x_{t-1}).
        cur_curvature = norm_grad - old_plain_grad
        # cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
        cur_curvature_sqr = T.sqr(cur_curvature)

        new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) +
                             (cur_curvature / taus_x_t))
        new_curvature_ave.name = "ncurve_ave_" + param.name

        # Unbiased average curvature
        nc_ave = new_curvature_ave / (1 - new_prod_taus)

        new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) +
                                 (cur_curvature_sqr / taus_x_t))
        new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

        # Unbiased average squared curvature
        nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

        epsilon = 1e-7
        # lr_scalers.get(param, 1.) * learning_rate
        scaled_lr = sharedX(1.0)
        rms_dx_tm1 = T.sqrt(msdx + epsilon)

        rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

        # This is where the update step is being defined
        delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t /
                                  (new_curvature_sqr_ave + epsilon))
        delta_x_t.name = "delta_x_t_" + param.name

        # This part seems to be necessary only for RNNs.
        # For feedforward networks it does not seem to be important.
        if self.delta_clip:
            logger.info("Clipping will be applied on the adaptive step size.")
            delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad is disabled.")
                delta_x_t = delta_x_t * corrected_grad
        else:
            logger.info("Clipping will not be applied on the adaptive step size.")
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad will not be used.")
                delta_x_t = delta_x_t * corrected_grad

        new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + \
            sharedX(1 + eps, "stabilized")

        # To compute E[\Delta^2]_t
        new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) +
                              (T.sqr(delta_x_t) / taus_x_t))

        # To compute E[\Delta]_t
        new_mean_dx = (mdx * (1 - 1 / taus_x_t) +
                       (delta_x_t / taus_x_t))

        # Perform the outlier detection:
        # This outlier detection is slightly different:
        new_taus_t = T.switch(
            T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                  abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
            T.switch(new_taus_t > 2.5,
                     sharedX(2.5),
                     new_taus_t + sharedX(1.0) + eps),
            new_taus_t)

        # Apply the bound constraints on tau:
        new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
        new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

        new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) +
                         (delta_x_t * cur_curvature) * (1 / taus_x_t))

        update_step = delta_x_t

        tot_norm_up += update_step.norm(2)
        tot_param_norm += param.norm(2)

        # Apply updates
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[mean_dx] = new_mean_dx
        updates[gnorm_sqr] = new_gnorm_sqr
        updates[gamma_nume_sqr] = new_gamma_nume_sqr
        updates[gamma_deno_sqr] = new_gamma_deno_sqr
        updates[taus_x_t] = new_taus_t
        updates[cov_num_t] = new_cov_num_t
        updates[mean_grad] = new_mean_grad
        updates[old_plain_grad] = norm_grad
        updates[mean_curvature] = new_curvature_ave
        updates[mean_curvature_sqr] = new_curvature_sqr_ave

        if self.perform_update:
            updates[param] = param + update_step

        updates[step] = step + 1
        updates[prod_taus] = new_prod_taus

        if self.use_adagrad:
            updates[sum_square_grad] = new_sum_squared_grad

        if self.use_corrected_grad:
            updates[old_grad] = corrected_grad

    return updates, tot_norm_up, tot_param_norm
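
# Usage sketch (not part of the original code): how the updates dictionary
# returned by get_updates would typically be compiled into a Theano training
# function. The model/optimizer objects and input variables below are
# hypothetical; only the call signature of get_updates above is assumed.
#
#     params = model.get_params()                      # hypothetical model API
#     grads = OrderedDict(zip(params, T.grad(cost, params)))
#     updates, tot_norm_up, tot_param_norm = optimizer.get_updates(
#         learning_rate=1.0, grads=grads)
#     train_fn = theano.function(inputs, [cost, tot_norm_up, tot_param_norm],
#                                updates=updates)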
                  shared_inner_outputs)
    if condition is not None:
        inner_outs.append(condition)

    # Cuda is imported here, instead of being imported at the top of the
    # file, because it forces on the user some dependencies that we might
    # not want. Currently we are working on removing the dependencies on
    # sandbox code completely.
    from theano.sandbox import cuda
    if cuda.cuda_available:
        # Very often we end up in this situation when we want to replace
        # w with w_copy, where w is a CudaNdarray and w_copy is a
        # TensorType. This is caused because shared variables are put on
        # the GPU right away >:| ,
        new_givens = OrderedDict()

        for w, w_copy in givens.iteritems():
            if (isinstance(w.type, cuda.CudaNdarrayType) and
                    isinstance(w_copy.type, tensor.TensorType)):
                for o in inner_outs:
                    new_givens = traverse(o, w, w_copy, new_givens)
            else:
                new_givens[w] = w_copy
    else:
        new_givens = givens

    new_outs = scan_utils.clone(inner_outs, replace=new_givens)

    ##
    # Step 7. Create the Scan Op
    ##
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    .. todo::

        WRITEME

    Parameters
    ----------
    learning_rate : float
        Learning rate coefficient. The learning rate is not used by this
        rule, but pylearn2 requires a learning rate to be defined.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Symbolic input variables of the cost/error graph, used to compile
        ``f_grad_shared``.
    cost : tensor_like
        Symbolic expression for the training cost.
    errors : tensor_like
        Symbolic expression for the monitored error.
    lr_scalers : dict
        A dictionary mapping from the model's parameters to a learning
        rate multiplier.
    """
    updates = OrderedDict({})
    eps = self.damping
    step = sharedX(0., name="step")

    if self.skip_nan_inf:
        # If the gradient of a parameter contains inf or nan values, do not
        # update that parameter. This can be useful for RNNs.
        grads = OrderedDict({p: T.switch(T.or_(T.isinf(grads[p]),
                                               T.isnan(grads[p])),
                                         0, grads[p])
                             for p in grads.keys()})

    # Block-normalize gradients:
    nparams = len(grads.keys())

    # Apply gradient clipping; this is only sometimes necessary for RNNs
    # and sometimes for very deep networks.
    if self.grad_clip:
        assert self.grad_clip > 0.
        assert self.grad_clip <= 1., ("Norm of the gradients per layer "
                                      "can not be larger than 1.")

        gnorm = sum([g.norm(2) for g in grads.values()])
        notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

        for p, g in grads.iteritems():
            tmpg = T.switch(gnorm / nparams > self.grad_clip,
                            g * self.grad_clip * nparams / gnorm,
                            g)
            grads[p] = T.switch(notfinite, as_floatX(0.1) * p, tmpg)

    tot_norm_up = 0

    gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                                      name='%s_grad' % p.name)
                           for p, g in grads.iteritems()})

    gsup = [(gshared[p], g) for p, g in grads.iteritems()]
    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    fix_decay = self.slow_decay**(step + 1)

    for param in gshared.keys():
        gshared[param].name = "grad_%s" % param.name
        mean_grad = sharedX(param.get_value() * 0. + eps,
                            name="mean_grad_%s" % param.name)
        gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)
        prod_taus = sharedX((np.ones_like(param.get_value()) - 2 * eps),
                            name="prod_taus_x_t_" + param.name)
        slow_constant = 2.1

        if self.use_adagrad:
            # sum_square_grad := \sum_i g_i^2
            sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                      name="sum_square_grad_%s" % param.name)

        """
        Initialization of accumulators
        """
        taus_x_t = sharedX((np.ones_like(param.get_value()) + eps) * slow_constant,
                           name="taus_x_t_" + param.name)
        self.taus_x_t = taus_x_t

        # Variance reduction parameters
        # Numerator of the gamma:
        gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_nume_sqr_" + param.name)

        # Denominator of the gamma:
        gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_deno_sqr_" + param.name)

        # For the covariance parameter := E[\gamma \alpha]_{t-1}
        cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                            name="cov_num_t_" + param.name)

        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                   name="msg_" + param.name)

        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)

        if self.use_corrected_grad:
            old_grad = sharedX(param.get_value() * 0. + eps)

        # The uncorrected gradient of the previous update:
        old_plain_grad = sharedX(param.get_value() * 0. + eps)
        mean_curvature = sharedX(param.get_value() * 0. + eps)
        mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

        # Initialize the E[\Delta]_{t-1}
        mean_dx = sharedX(param.get_value() * 0.)

        # Block-wise normalize the gradient:
        norm_grad = gshared[param]

        # For the first time-step, assume that delta_x_t := norm_grad
        gnorm = T.sqr(norm_grad).sum()

        cond = T.eq(step, 0)
        gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
        gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

        norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
        msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
        mdx = cond * norm_grad + (1 - cond) * mean_dx

        new_prod_taus = (prod_taus * (1 - 1 / taus_x_t))

        """
        Compute the new updated values.
        """
        # E[g_i^2]_t
        new_mean_squared_grad = (mean_square_grad * (1 - 1 / taus_x_t) +
                                 T.sqr(norm_grad) / taus_x_t)
        new_mean_squared_grad.name = "msg_" + param.name

        # E[g_i]_t
        new_mean_grad = (mean_grad * (1 - 1 / taus_x_t) +
                         norm_grad / taus_x_t)
        new_mean_grad.name = "nmg_" + param.name

        mg = new_mean_grad / (1 - new_prod_taus)
        mgsq = new_mean_squared_grad / (1 - new_prod_taus)

        new_gnorm_sqr = (gnorm_sqr_o * self.slow_decay +
                         T.sqr(norm_grad).sum() * (1 - self.slow_decay))

        # Keep the rms for numerator and denominator of gamma.
        new_gamma_nume_sqr = (
            gamma_nume_sqr * (1 - 1 / taus_x_t) +
            T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

        new_gamma_deno_sqr = (
            gamma_deno_sqr * (1 - 1 / taus_x_t) +
            T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

        gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) +
                                          self.gamma_reg)
        gamma.name = "gamma_" + param.name

        if self.gamma_clip and self.gamma_clip > -1:
            gamma = T.minimum(gamma, self.gamma_clip)

        momentum_step = gamma * mg
        corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

        # For starting the variance reduction.
        if self.start_var_reduction > -1:
            cond = T.le(self.start_var_reduction, step)
            corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
        else:
            corrected_grad = norm_grad

        if self.use_adagrad:
            g = corrected_grad
            # Accumulate gradient
            new_sum_squared_grad = (sum_square_grad + T.sqr(g))
            rms_g_t = T.sqrt(new_sum_squared_grad)
            rms_g_t = T.maximum(rms_g_t, 1.0)

        # Use the gradients from the previous update
        # to compute \nabla f(x_t) - \nabla f(x_{t-1}).
        cur_curvature = norm_grad - old_plain_grad
        # cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
        cur_curvature_sqr = T.sqr(cur_curvature)

        new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) +
                             (cur_curvature / taus_x_t))
        new_curvature_ave.name = "ncurve_ave_" + param.name

        # Unbiased average curvature
        nc_ave = new_curvature_ave / (1 - new_prod_taus)

        new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) +
                                 (cur_curvature_sqr / taus_x_t))
        new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

        # Unbiased average squared curvature
        nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

        epsilon = 1e-7
        # lr_scalers.get(param, 1.) * learning_rate
        scaled_lr = sharedX(1.0)
        rms_dx_tm1 = T.sqrt(msdx + epsilon)

        rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

        # This is where the update step is being defined
        delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t /
                                  (new_curvature_sqr_ave + epsilon))
        delta_x_t.name = "delta_x_t_" + param.name

        # This part seems to be necessary only for RNNs.
        # For feedforward networks it does not seem to be important.
        if self.delta_clip:
            logger.info("Clipping will be applied on the adaptive step size.")
            delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad is disabled.")
                delta_x_t = delta_x_t * corrected_grad
        else:
            logger.info("Clipping will not be applied on the adaptive step size.")
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad will not be used.")
                delta_x_t = delta_x_t * corrected_grad

        new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + \
            sharedX(1 + eps, "stabilized")

        # To compute E[\Delta^2]_t
        new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) +
                              (T.sqr(delta_x_t) / taus_x_t))

        # To compute E[\Delta]_t
        new_mean_dx = (mdx * (1 - 1 / taus_x_t) +
                       (delta_x_t / taus_x_t))

        # Perform the outlier detection:
        # This outlier detection is slightly different:
        new_taus_t = T.switch(
            T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                  abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
            T.switch(new_taus_t > 2.5,
                     sharedX(2.5),
                     new_taus_t + sharedX(1.0) + eps),
            new_taus_t)

        # Apply the bound constraints on tau:
        new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
        new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

        new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) +
                         (delta_x_t * cur_curvature) * (1 / taus_x_t))

        update_step = delta_x_t

        tot_norm_up += update_step.norm(2)

        # Apply updates
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[mean_dx] = new_mean_dx
        updates[gnorm_sqr] = new_gnorm_sqr
        updates[gamma_nume_sqr] = new_gamma_nume_sqr
        updates[gamma_deno_sqr] = new_gamma_deno_sqr
        updates[taus_x_t] = new_taus_t
        updates[cov_num_t] = new_cov_num_t
        updates[mean_grad] = new_mean_grad
        updates[old_plain_grad] = norm_grad
        updates[mean_curvature] = new_curvature_ave
        updates[mean_curvature_sqr] = new_curvature_sqr_ave

        if self.perform_update:
            updates[param] = param + update_step

        updates[step] = step + 1
        updates[prod_taus] = new_prod_taus

        if self.use_adagrad:
            updates[sum_square_grad] = new_sum_squared_grad

        if self.use_corrected_grad:
            updates[old_grad] = corrected_grad

    f_update = theano.function([learning_rate], [tot_norm_up],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
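
# Usage sketch (not part of the original code): the two compiled functions
# returned by get_funcs are meant to be called once per minibatch, first to
# push the gradients into the shared containers and then to apply the
# parameter update. The data iterator and symbolic inputs below are
# hypothetical.
#
#     f_grad_shared, f_update = optimizer.get_funcs(lr, grads, inp=[x, y],
#                                                   cost=cost, errors=errors)
#     for xb, yb in train_iterator:
#         cost_val, err_val, gnorm, pnorm = f_grad_shared(xb, yb)
#         f_update(1.0)  # the learning rate argument is accepted but unused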