def construct_updates(self, grads):
    """Build mini-batch averaged gradients and register their buffer updates.

    A shared counter ``mb_step`` is created and an update incrementing it by
    one is stored in ``self.updates`` (which is created when it is empty or
    unset-falsy).  For every ``(parameter, gradient)`` pair the buffer in
    ``self.gs`` whose name contains the parameter's name is looked up.

    When ``self.nbatches`` is larger than one, the gradient handed back is
    ``(gradient + buffer) * (1 / nbatches)`` and the buffer's update is that
    value when ``mb_step % nbatches == 0`` and ``gradient * (1 / nbatches)``
    otherwise.  With ``nbatches == 1`` gradients are returned untouched and
    no buffer update is registered.

    Parameters
    ----------
    grads : dict
        Mapping from parameters to their gradient expressions.

    Returns
    -------
    OrderedDict
        Mapping from each parameter to the gradient expression to use.

    Raises
    ------
    ValueError
        If no buffer in ``self.gs`` matches a parameter's name.
    """
    if not self.updates:
        self.updates = OrderedDict({})

    averaged = OrderedDict({})
    counter = sharedX(0, name="mb_step")
    self.updates[counter] = counter + 1
    at_boundary = TT.eq((counter) % self.nbatches, 0)
    rate = 1.0 / self.nbatches
    needs_averaging = rate < 1.0

    for param, grad in grads.iteritems():
        # The match is a substring test on the names, first hit wins.
        buf = next((cand for cand in self.gs if param.name in cand.name),
                   None)
        if buf is None:
            raise ValueError("Gradient for %s was not found." % param.name)

        if not needs_averaging:
            averaged[param] = grad
            continue

        mixed = (grad + buf) * as_floatX(rate)
        self.updates[buf] = (at_boundary * mixed +
                             (1 - at_boundary) * grad * as_floatX(rate))
        averaged[param] = mixed

    return averaged
def __init_vals(self):
    """Allocate zero-valued shared buffers, one per model parameter.

    Two lists are stored on the instance: ``self.gs`` (buffers named
    ``grad_<param>``) and ``self.gs_mon`` (buffers named
    ``grad_<param>_mon``).  Each buffer has the shape of the parameter's
    current value, multiplied by zero and cast with ``as_floatX``.
    """
    def zero_buffers(name_format):
        # Iterates the parameter container's internal 'params' mapping.
        buffers = []
        for pname, pvar in self.params.__dict__['params'].iteritems():
            zeros = as_floatX(pvar.get_value(borrow=True) * 0.0)
            buffers.append(theano.shared(zeros, name=name_format % pname))
        return buffers

    self.gs = zero_buffers("grad_%s")
    self.gs_mon = zero_buffers("grad_%s_mon")
def __init__(self,
             n_hids=None,
             mem_size=None,
             mem_nel=None,
             address_size=None,
             mem_gater_activ=None,
             n_mid_key_size=None,
             scale_size=None,
             use_scale_layer=True,
             smoothed_diff_weights=False,
             use_local_att=False,
             mem_weight_decay=0.96,
             read_head=False,
             use_loc_based_addressing=True,
             shift_width=3,
             scale_bias_coef=1.0,
             use_adv_indexing=False,
             use_multiscale_shifts=True,
             use_geom_sig_dot=False,
             use_reinforce=False,
             weight_initializer=None,
             bias_initializer=None,
             name="nmt_addresser"):
    """Store the addressing configuration and build the layer parameters.

    Every argument except ``mem_gater_activ`` and ``use_geom_sig_dot`` is
    stored on the instance under its own name and later read by
    ``init_params`` and ``fprop``.

    Parameters
    ----------
    mem_gater_activ : callable or None
        Stored as ``self.mem_gater_activ``; ``Sigmoid`` is used when the
        argument is falsy.
    use_geom_sig_dot : bool
        Selects ``GeomEuclideanSigmoidDot()`` instead of
        ``MemorySimilarity()`` for ``self.mem_similarity``.
    use_local_att : bool
        When true, ``self.time_idxs`` is a constant holding
        ``0 .. mem_nel - 1``; otherwise it is ``None``.
    use_adv_indexing : bool
        Only changes which message is printed here; the flag itself is
        consulted in ``fprop``.
    """
    super(Addresser, self).__init__()

    self.n_hids = n_hids
    self.n_mid_key_size = n_mid_key_size
    self.mem_size = mem_size
    self.mem_nel = mem_nel
    self.use_reinforce = use_reinforce
    self.read_head = read_head
    self.scale_size = scale_size
    self.scale_bias_coef = scale_bias_coef
    self.address_size = address_size
    self.use_scale_layer = use_scale_layer
    self.use_adv_indexing = use_adv_indexing
    self.weight_initializer = weight_initializer
    self.bias_initializer = bias_initializer
    self.name = name
    self.use_loc_based_addressing = use_loc_based_addressing
    self.use_multiscale_shifts = use_multiscale_shifts
    self.shift_width = shift_width
    self.smoothed_diff_weights = smoothed_diff_weights
    self.mem_weight_decay = mem_weight_decay
    self.use_local_att = use_local_att

    # Always bind time_idxs: fprop falls back to this attribute when the
    # caller passes no time indices, also when local attention is off.
    self.time_idxs = None
    if self.use_local_att:
        self.time_idxs = const(as_floatX(np.arange(self.mem_nel)))
        self.time_idxs.name = "time_idxs"

    # Parenthesised form prints the same text under Python 2 and 3.
    if self.use_adv_indexing:
        print("Using the advanced indexing.")
    else:
        print("Not using the advanced indexing.")

    self.mem_gater_activ = mem_gater_activ if mem_gater_activ else Sigmoid

    if use_geom_sig_dot:
        self.mem_similarity = GeomEuclideanSigmoidDot()
    else:
        self.mem_similarity = MemorySimilarity()

    self.init_params()
def fprop(self,
          state_below,
          memory,
          w_t_before,
          w_t_pre_before=None,
          time_idxs=None):
    """Compute the addressing weights over the memory.

    The fork layer is applied to ``state_below`` and its named outputs are
    consumed in the order in which ``init_params`` registered them:
    optional beta, key, optional difference gate, then (for location based
    addressing) gate, shift and optional sharpening outputs.

    NOTE(review): the block nesting of this method was reconstructed from a
    whitespace-collapsed source; the placement of the max-subtraction on
    the shift logits and of the final re-normalisation should be confirmed
    against version control.

    Parameters
    ----------
    state_below : tensor
        Controller state fed to the fork, scale and local-attention layers.
        The ndim checks below expect it to be 2-D or 3-D.
    memory : tensor
        Passed to ``self.mem_similarity`` together with the key.
    w_t_before : tensor
        Previous weights, interpolated with the content weights by the gate.
    w_t_pre_before : tensor, optional
        Previous pre-weights; only read when ``smoothed_diff_weights`` is on.
    time_idxs : tensor, optional
        Memory positions used by the local attention window.  Falls back to
        ``self.time_idxs``.

    Returns
    -------
    tuple of two one-element lists
        ``[w_t], [new_pre_weights]``; ``new_pre_weights`` is ``None`` unless
        ``smoothed_diff_weights`` is on.

    Raises
    ------
    ValueError
        If beta or the gate do not have the expected number of dimensions.
    """
    if time_idxs is None:
        logger.info("Time indices are empty!")
        # The attribute is only needed for local attention; do not fail
        # when the instance never defined it.
        time_idxs = getattr(self, "time_idxs", None)

    fork_outs = self.state_fork_layer.fprop(state_below)
    idx = 0

    # First things first, content based addressing:
    if not self.use_local_att:
        beta_pre = fork_outs[self.names[0]]
        beta = TT.nnet.softplus(beta_pre).reshape((beta_pre.shape[0],))

        if (state_below.ndim != beta.ndim and beta.ndim == 2
                and state_below.ndim == 3):
            beta = beta.reshape((state_below.shape[0],
                                 state_below.shape[1]))
        elif (state_below.ndim != beta.ndim and beta.ndim == 1
              and state_below.ndim == 2):
            beta = beta.reshape((state_below.shape[0],))
        else:
            raise ValueError("Unknown shape for beta!")

        beta = TT.shape_padright(beta)
        idx = 1

    key_pre = fork_outs[self.names[idx]]
    idx += 1
    key_t = key_pre

    sim_vals = self.mem_similarity(key_t, memory)
    weights = sim_vals
    new_pre_weights = None

    if self.smoothed_diff_weights:
        dw_scaler = fork_outs[self.names[idx]]
        dw_scaler = TT.addbroadcast(dw_scaler, 1)
        weights = sim_vals - Sigmoid(dw_scaler) * w_t_pre_before
        new_pre_weights = (self.mem_weight_decay * sim_vals +
                           (1 - self.mem_weight_decay) * w_t_pre_before)
        idx += 1

    # Divisor constant of the local-attention window below.
    std = 5

    if self.use_local_att:
        w_tc = (softmax3(weights) if weights.ndim == 3
                else TT.nnet.softmax(weights))
    else:
        if weights.ndim == 3 and beta.ndim == 2:
            beta = beta.dimshuffle('x', 0, 1)
            w_tc = softmax3(weights * beta)
        else:
            # Content based weights:
            w_tc = TT.nnet.softmax(weights * beta)

    if self.use_local_att:
        first_loc_layer = Tanh(self.state_below_local.fprop(state_below) +
                               self.weights_below_local.fprop(weights))
        mean = as_floatX(self.mem_nel) * \
            Sigmoid(self.mean_pred.fprop(first_loc_layer))
        mean = TT.addbroadcast(mean, 1)
        exp_ws = TT.exp(-((time_idxs - mean)**2) / (2.0 * std))
        w_tc = exp_ws * w_tc
        w_tc = w_tc / w_tc.sum(axis=1, keepdims=True)

    if self.use_loc_based_addressing:
        # Location based addressing:
        g_t_pre = fork_outs[self.names[idx]]
        g_t = Sigmoid(g_t_pre).reshape((g_t_pre.shape[0],))

        if (state_below.ndim != g_t.ndim and g_t.ndim == 2
                and state_below.ndim == 3):
            g_t = g_t.reshape((state_below.shape[0], state_below.shape[1]))
        elif (state_below.ndim != g_t.ndim and g_t.ndim == 1
              and state_below.ndim == 2):
            g_t = g_t.reshape((state_below.shape[0],))
        else:
            raise ValueError("Unknown shape for g_t!")

        g_t = TT.shape_padright(g_t)
        w_tg = g_t * w_tc + (1 - g_t) * w_t_before
        shifts_pre = fork_outs[self.names[idx + 1]]

        if shifts_pre.ndim == 2:
            if self.use_multiscale_shifts:
                if self.use_scale_layer:
                    scales = TT.exp(self.scale_layer.fprop(state_below))
                    scales = scales.dimshuffle(0, 'x', 1)
                else:
                    scales = TT.exp(
                        TT.arange(self.scale_size).dimshuffle('x', 'x', 0))

                shifts_pre = shifts_pre.reshape((state_below.shape[0],
                                                 -1,
                                                 self.scale_size))
                shifts_pre = (shifts_pre * scales).sum(-1)

                if self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape(
                        (-1, self.shift_width, 1))
            elif self.shift_width >= 0:
                shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
            else:
                shifts_pre = shifts_pre.reshape(
                    (state_below.shape[0], self.mem_nel))

                # NOTE(review): assumed to belong to this branch because
                # dimshuffle(0, 1, 'x') needs the 2-D tensor built above.
                if state_below.ndim == 3:
                    shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
                    shifts_pre = shifts_pre - \
                        shifts_pre.max(1, keepdims=True).dimshuffle(
                            0, 'x', 'x')
                else:
                    shifts_pre = shifts_pre.dimshuffle(0, 1)
                    shifts_pre = shifts_pre - \
                        shifts_pre.max(1, keepdims=True)
                    shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
        elif shifts_pre.ndim == 1:
            if self.use_multiscale_shifts:
                if self.use_scale_layer:
                    scales = TT.exp(self.scale_layer.fprop(state_below))
                else:
                    scales = TT.exp(TT.arange(self.scale_size))

                shifts_pre = shifts_pre.reshape((-1, self.scale_size))
                shifts_pre = (shifts_pre * scales).sum(-1)

                if self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape(
                        (-1, self.shift_width, 1))
                if self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, 1))
            elif self.shift_width >= 0:
                shifts_pre = shifts_pre.reshape((-1, 1))
            else:
                shifts_pre = shifts_pre.reshape((self.mem_nel,))

                # NOTE(review): nesting assumed by analogy with the 2-D
                # case above.
                if state_below.ndim == 2:
                    shifts_pre = TT.shape_padright(shifts_pre)
                shifts_pre = shifts_pre - shifts_pre.max(0, keepdims=True)

        shifts = TT.exp(shifts_pre)
        if shifts.ndim == 2:
            shifts = shifts / shifts.sum(axis=0, keepdims=True)
        elif shifts.ndim == 3:
            shifts = shifts / shifts.sum(axis=1, keepdims=True)

        CC = CircularConvolveAdvIndexing if self.use_adv_indexing else \
            CircularConvolve
        w_t_hat = CC()(weights=w_tg,
                       shifts=shifts,
                       mem_size=self.mem_nel,
                       shift_width=self.shift_width)

        if self.use_reinforce:
            if w_t_hat.ndim == 2:
                w_t = TT.nnet.softmax(w_t_hat)
            elif w_t_hat.ndim == 3:
                w_t = softmax3(w_t_hat)
        else:
            # The sharpening output is registered right after the gate and
            # shift outputs, so it sits two names after the gate.  A fixed
            # index is wrong as soon as the difference gate is present or
            # the beta output is absent.
            gamma_pre = fork_outs[self.names[idx + 2]]
            assert w_t_hat.ndim == gamma_pre.ndim, (
                "The number of dimensions for "
                " w_t_hat and gamma_pre should "
                " be the same")

            if gamma_pre.ndim != 1:
                gamma_pre = gamma_pre.reshape((gamma_pre.shape[0],))

            gamma_pre = TT.shape_padright(gamma_pre)
            gamma = TT.nnet.softplus(gamma_pre) + const(1)
            w_t = (abs(w_t_hat + const(1e-16))**gamma) + const(1e-42)

            if (state_below.ndim != shifts_pre.ndim and w_t.ndim == 2
                    and state_below.ndim == 3):
                w_t = w_t.reshape((state_below.shape[0],
                                   state_below.shape[1]))
                w_t = w_t.dimshuffle(0, 1, 'x')
            elif (state_below.ndim != w_t.ndim and w_t.ndim == 1
                  and state_below.ndim == 2):
                w_t = w_t.reshape((state_below.shape[0],))
                w_t = w_t.dimshuffle(0, 'x')

            # Re-normalise after sharpening (same expression for 2-D/3-D).
            if w_t.ndim == 2:
                w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6))
            elif w_t.ndim == 3:
                w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6))
    else:
        w_t = w_tc

    return [w_t], [new_pre_weights]
def init_params(self):
    """Create the fork layer and the optional scale / local-attention layers.

    Three parallel lists describe the fork outputs: ``names`` (output
    names), ``self.n_outs`` (output sizes) and ``binit_vals`` (bias
    initial values, ``None`` meaning the layer default).  They are extended
    together so that entry *i* of each list refers to the same output.

    Attributes set here: ``n_outs``, ``shift_size``, ``scale_size`` (when it
    was unset and multiscale shifts are used), ``names``,
    ``state_fork_layer``, ``powerup_layer`` and, depending on the flags,
    ``scale_layer``, ``state_below_local``, ``weights_below_local`` and
    ``mean_pred``.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source.  In particular the sharpening output is
    assumed to be registered only together with location based addressing,
    which is the only place ``fprop`` reads it; confirm against version
    control.
    """
    key_size = self.mem_size + self.address_size
    if not self.use_local_att:
        names = ["fork_state_beta_t", "fork_state_key_t"]
        self.n_outs = [1, key_size]
    else:
        names = ["fork_state_key_t"]
        self.n_outs = [key_size]

    self.shift_size = self.mem_nel

    if self.use_multiscale_shifts:
        logger.info("Using the multiscale shifts.")
        if self.scale_size is None or self.scale_size < -1:
            self.scale_size = int(np.floor(np.log(self.mem_nel)))
        logger.info("Size of the scales is %d" % self.scale_size)
        self.shift_size = self.shift_width * self.scale_size

    # One placeholder per output registered so far.  A fixed two-element
    # list would be one entry too long with local attention (no beta
    # output) and shift every later bias value onto the wrong output.
    binit_vals = [None] * len(names)

    if self.smoothed_diff_weights:
        names.append("fork_state_diff_gate")
        self.n_outs += [1]
        binit_vals += [-0.16]

    if self.use_loc_based_addressing:
        names += ["fork_state_gater_t", "fork_state_shift_hat_t"]
        self.n_outs += [1, self.shift_size]
        binit_vals += [None, None]

        if not self.use_reinforce:
            names += ["fork_state_sharpen_hat_t"]
            self.n_outs += [1]
            binit_vals += [0.001]

    if self.use_scale_layer:
        self.scale_layer = AffineLayer(
            n_in=self.n_hids,
            n_out=self.scale_size,
            weight_initializer=self.weight_initializer,
            bias_initializer=self.bias_initializer,
            use_bias=True,
            name=self.pname("scale_layer"))

        # Overwrite the bias with a normalised ramp scaled by
        # scale_bias_coef.
        pname = self.scale_layer.params.getparamname("bias")
        arng = as_floatX(np.arange(self.scale_size))
        arng = arng / arng.sum()
        self.scale_layer.params[pname] = self.scale_bias_coef * arng
        self.children.extend([self.scale_layer])

    if self.use_local_att:
        bott_size = self.n_hids
        logger.info("Using the local attention.")
        self.state_below_local = AffineLayer(
            n_in=self.n_hids,
            n_out=bott_size,
            weight_initializer=self.weight_initializer,
            bias_initializer=self.bias_initializer,
            use_bias=True,
            name=self.pname("state_below_loc_layer"))
        self.weights_below_local = AffineLayer(
            n_in=self.mem_nel,
            n_out=bott_size,
            weight_initializer=self.weight_initializer,
            bias_initializer=self.bias_initializer,
            use_bias=False,
            name=self.pname("weights_loc_layer"))
        self.mean_pred = AffineLayer(
            n_in=bott_size,
            n_out=1,
            weight_initializer=self.weight_initializer,
            bias_initializer=self.bias_initializer,
            use_bias=True,
            name=self.pname("mean_pred"))
        self.children.extend([self.state_below_local,
                              self.weights_below_local,
                              self.mean_pred])

    # A real list (not a lazy map object) because fprop indexes it.
    names = [self.pname(short_name) for short_name in names]
    self.names = names

    self.state_fork_layer = ForkLayer(
        n_in=self.n_hids,
        n_outs=self.n_outs,
        weight_initializer=self.weight_initializer,
        bias_initializer=self.bias_initializer,
        init_bias_vals=binit_vals,
        names=names)
    self.children.extend([self.state_fork_layer])
    self.powerup_layer = None
    self.merge_params()
def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Build the symbolic update dictionary of this adaptive learning rule.

    For every parameter a set of shared accumulators is created (running
    means of the gradient, of its square, of the gradient difference
    between consecutive steps, of the previous step, plus a per-element
    time constant ``taus_x_t``) and the expressions that refresh them are
    collected in the returned dictionary.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source; confirm against version control.

    Parameters
    ----------
    learning_rate : float
        Not read by this method; the step scale is a shared constant 1.0.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.  When ``self.grad_clip`` is set its values are
        reassigned in place, and every gradient gets renamed to
        ``grad_<param>``.
    lr_scalers : dict
        Not read by this method.

    Returns
    -------
    updates : OrderedDict
        Shared variable -> new value expression.
    tot_norm_up : expression
        Sum over parameters of the L2 norm of the update step.
    tot_param_norm : expression
        Sum over parameters of the L2 norm of the parameter.
    """
    updates = OrderedDict({})
    eps = self.damping
    step = sharedX(0., name="step")

    if self.skip_nan_inf:
        #Replace inf / nan gradient entries by zero so that they do not
        #move the parameter. That might be useful for RNNs.
        grads = OrderedDict({
            p: T.switch(T.or_(T.isinf(grads[p]), T.isnan(grads[p])), 0, grads[p])
            for p in grads.keys()
        })

    #Block-normalize gradients:
    nparams = len(grads.keys())

    #Apply the gradient clipping, this is only sometimes
    #necessary for RNNs and sometimes for very deep networks
    if self.grad_clip:
        assert self.grad_clip > 0.
        assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
        gnorm = sum([g.norm(2) for g in grads.values()])
        notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

        for p, g in grads.iteritems():
            #Rescale when the average per-parameter norm exceeds grad_clip.
            tmpg = T.switch(gnorm / nparams > self.grad_clip,
                            g * self.grad_clip * nparams / gnorm, g)
            #With a non-finite total norm, 0.1 * p is used as the gradient.
            grads[p] = T.switch(notfinite, as_floatX(0.1) * p, tmpg)

    tot_norm_up = 0
    tot_param_norm = 0
    fix_decay = self.slow_decay**(step + 1)

    for param in grads.keys():
        grads[param].name = "grad_%s" % param.name
        mean_grad = sharedX(param.get_value() * 0. + eps,
                            name="mean_grad_%s" % param.name)
        #NOTE(review): mean_corrected_grad is created but never read below.
        mean_corrected_grad = sharedX(param.get_value() * 0 + eps,
                                      name="mean_corrected_grad_%s" % param.name)
        gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)
        prod_taus = sharedX((np.ones_like(param.get_value()) - 2 * eps),
                            name="prod_taus_x_t_" + param.name)
        slow_constant = 2.1

        if self.use_adagrad:
            # sum_square_grad := \sum_i g_i^2
            sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                      name="sum_square_grad_%s" % param.name)
        """ Initialization of accumulators """
        taus_x_t = sharedX(
            (np.ones_like(param.get_value()) + eps) * slow_constant,
            name="taus_x_t_" + param.name)
        #Overwritten on every iteration: ends up holding the last
        #parameter's time constants.
        self.taus_x_t = taus_x_t

        #Variance reduction parameters
        #Numerator of the gamma:
        gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_nume_sqr_" + param.name)
        #Denominator of the gamma:
        gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_deno_sqr_" + param.name)
        #For the covariance parameter := E[\gamma \alpha]_{t-1}
        cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                            name="cov_num_t_" + param.name)
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                   name="msg_" + param.name)
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0.,
                                 name="msd_" + param.name)

        #NOTE(review): old_grad is only bound when use_corrected_grad is
        #true, but it is read unconditionally in the gamma statistics below.
        if self.use_corrected_grad:
            old_grad = sharedX(param.get_value() * 0. + eps)

        #The uncorrected gradient of the previous update:
        old_plain_grad = sharedX(param.get_value() * 0. + eps)
        mean_curvature = sharedX(param.get_value() * 0. + eps)
        mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)
        # Initialize the E[\Delta]_{t-1}
        mean_dx = sharedX(param.get_value() * 0.)

        # Block-wise normalize the gradient:
        norm_grad = grads[param]
        #For the first time-step, assume that delta_x_t := norm_grad
        gnorm = T.sqr(norm_grad).sum()
        cond = T.eq(step, 0)
        gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
        gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)
        norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
        msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
        mdx = cond * norm_grad + (1 - cond) * mean_dx
        new_prod_taus = (prod_taus * (1 - 1 / taus_x_t))
        """ Compute the new updated values. """
        # E[g_i^2]_t
        new_mean_squared_grad = (mean_square_grad * (1 - 1 / taus_x_t) +
                                 T.sqr(norm_grad) / (taus_x_t))
        new_mean_squared_grad.name = "msg_" + param.name
        # E[g_i]_t
        new_mean_grad = (mean_grad * (1 - 1 / taus_x_t) +
                         norm_grad / taus_x_t)
        new_mean_grad.name = "nmg_" + param.name
        #Running means divided by (1 - new_prod_taus).
        mg = new_mean_grad / (1 - new_prod_taus)
        mgsq = new_mean_squared_grad / (1 - new_prod_taus)
        new_gnorm_sqr = (gnorm_sqr_o * self.slow_decay +
                         T.sqr(norm_grad).sum() * (1 - self.slow_decay))

        # Keep the rms for numerator and denominator of gamma.
        new_gamma_nume_sqr = (gamma_nume_sqr * (1 - 1 / taus_x_t) + T.sqr(
            (norm_grad - old_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name
        new_gamma_deno_sqr = (gamma_deno_sqr * (1 - 1 / taus_x_t) + T.sqr(
            (mg - norm_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name
        #gamma is computed from the accumulators of the previous step, not
        #from the new_* expressions above.
        gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) + \
            self.gamma_reg)
        gamma.name = "gamma_" + param.name

        if self.gamma_clip and self.gamma_clip > -1:
            gamma = T.minimum(gamma, self.gamma_clip)

        momentum_step = gamma * mg
        corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

        #For starting the variance reduction.
        if self.start_var_reduction > -1:
            cond = T.le(self.start_var_reduction, step)
            corrected_grad = cond * corrected_grad_cand + (
                1 - cond) * norm_grad
        else:
            corrected_grad = norm_grad

        if self.use_adagrad:
            g = corrected_grad
            # Accumulate gradient
            new_sum_squared_grad = (sum_square_grad + T.sqr(g))
            rms_g_t = T.sqrt(new_sum_squared_grad)
            rms_g_t = T.maximum(rms_g_t, 1.0)

        #Use the gradients from the previous update
        #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
        cur_curvature = norm_grad - old_plain_grad
        #cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
        cur_curvature_sqr = T.sqr(cur_curvature)
        new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) +
                             (cur_curvature / taus_x_t))
        new_curvature_ave.name = "ncurve_ave_" + param.name
        #Average curvature divided by (1 - new_prod_taus)
        nc_ave = new_curvature_ave / (1 - new_prod_taus)
        new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) +
                                 (cur_curvature_sqr / taus_x_t))
        new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name
        #Unbiased average squared curvature
        nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)
        epsilon = 1e-7
        #lr_scalers.get(param, 1.) * learning_rate
        scaled_lr = sharedX(1.0)
        rms_dx_tm1 = T.sqrt(msdx + epsilon)
        rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

        #This is where the update step is being defined
        delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t /
                                  (new_curvature_sqr_ave + epsilon))
        delta_x_t.name = "delta_x_t_" + param.name

        # This part seems to be necessary for only RNNs
        # For feedforward networks this does not seem to be important.
        if self.delta_clip:
            logger.info(
                "Clipping will be applied on the adaptive step size.")
            delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad is disabled.")
                delta_x_t = delta_x_t * corrected_grad
        else:
            logger.info(
                "Clipping will not be applied on the adaptive step size.")
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad will not be used.")
                delta_x_t = delta_x_t * corrected_grad

        new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(
            1 + eps, "stabilized")

        #To compute the E[\Delta^2]_t
        new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) +
                              (T.sqr(delta_x_t) / taus_x_t))
        #To compute the E[\Delta]_t
        new_mean_dx = (mdx * (1 - 1 / taus_x_t) +
                       (delta_x_t / (taus_x_t)))

        #Perform the outlier detection:
        #When the gradient or the curvature is more than two standard
        #deviations away from its running mean, the time constant is set to
        #2.5 where it exceeded 2.5 and is otherwise increased by 1 + eps.
        new_taus_t = T.switch(
            T.or_(
                abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                abs(cur_curvature - nc_ave) >
                (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
            T.switch(new_taus_t > 2.5, sharedX(2.5),
                     new_taus_t + sharedX(1.0) + eps), new_taus_t)

        #Apply the bound constraints on tau:
        new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
        new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)
        new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) +
                         (delta_x_t * cur_curvature) * (1 / taus_x_t))
        update_step = delta_x_t
        tot_norm_up += update_step.norm(2)
        tot_param_norm += param.norm(2)

        # Apply updates
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[mean_dx] = new_mean_dx
        updates[gnorm_sqr] = new_gnorm_sqr
        updates[gamma_nume_sqr] = new_gamma_nume_sqr
        updates[gamma_deno_sqr] = new_gamma_deno_sqr
        updates[taus_x_t] = new_taus_t
        updates[cov_num_t] = new_cov_num_t
        updates[mean_grad] = new_mean_grad
        updates[old_plain_grad] = norm_grad
        updates[mean_curvature] = new_curvature_ave
        updates[mean_curvature_sqr] = new_curvature_sqr_ave

        #NOTE(review): only the parameter update is assumed to be guarded
        #by perform_update; the extent of this block is not recoverable
        #from the collapsed source.
        if self.perform_update:
            updates[param] = param + update_step

        #Same key on every iteration, so the counter advances once per call.
        updates[step] = step + 1
        updates[prod_taus] = new_prod_taus

        if self.use_adagrad:
            updates[sum_square_grad] = new_sum_squared_grad

        if self.use_corrected_grad:
            updates[old_grad] = corrected_grad

    return updates, tot_norm_up, tot_param_norm
def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Build the updates of an RMS-normalised momentum rule.

    When ``self.gradient_clipping`` is not ``None`` the gradients are first
    rescaled by ``clip / max(clip, global_norm)``; the entries of ``grads``
    are reassigned in place.  For every parameter a running average of the
    squared gradient and a velocity are kept, both corrected with
    ``1 - coeff ** (counter + 1)`` terms.  Once the shared counter has
    reached 6000, the learning rate is scaled down whenever the ratio of
    the total update norm to the total parameter norm is at least
    ``self.update_param_norm_ratio``.

    Parameters
    ----------
    learning_rate : float or scalar expression
        Base learning rate.
    grads : dict
        Mapping from parameters to their gradient expressions.
    lr_scalers : dict
        Not read by this method.

    Returns
    -------
    updates : OrderedDict
        Shared variable -> new value expression.
    tot_norm_up : expression
        Sum over parameters of ``learning_rate`` times the L2 norm of the
        normalised velocity.
    tot_param_norm : expression
        Sum over parameters of the L2 norm of the parameter.
    """
    updates = OrderedDict()
    velocity = OrderedDict()
    normalized_velocities = OrderedDict()
    counter = sharedX(0, 'counter')
    tot_norm_up = 0
    tot_param_norm = 0

    clip = self.gradient_clipping
    if clip is not None:
        squared_total = sum(T.sqr(grads[p]).sum() for p in grads.keys())
        global_norm = T.sqrt(squared_total)
        denominator = T.maximum(clip, global_norm)
        for p in grads.keys():
            grads[p] = clip * grads[p] / denominator

    for p in grads.keys():
        sq_avg = sharedX(np.zeros_like(p.get_value()))
        velocity[p] = sharedX(np.zeros_like(p.get_value()))

        # Correction terms for the zero-initialised running averages.
        step_no = counter + 1.
        first_fix = 1. - self.momentum**step_no
        second_fix = 1. - self.averaging_coeff**step_no

        if p.name is not None:
            sq_avg.name = 'avg_grad_sqr_' + p.name

        sq_avg_next = (self.averaging_coeff*sq_avg +
                       (1 - self.averaging_coeff)*T.sqr(grads[p]))
        rms = T.maximum(T.sqrt(sq_avg_next), self.stabilizer)
        velocity_next = (self.momentum * velocity[p] -
                         (1 - self.momentum) * grads[p])
        scaled_velocity = ((velocity_next * T.sqrt(second_fix)) /
                           (rms * first_fix))

        tot_param_norm += p.norm(2)
        tot_norm_up += learning_rate * scaled_velocity.norm(2)

        normalized_velocities[p] = scaled_velocity
        updates[sq_avg] = sq_avg_next
        updates[velocity[p]] = velocity_next

    ratio = tot_norm_up / (tot_param_norm + 1e-7)
    too_large = T.ge(ratio, self.update_param_norm_ratio)
    new_lr = ifelse.ifelse(
        too_large,
        as_floatX(learning_rate * self.update_param_norm_ratio) / ratio,
        as_floatX(learning_rate))
    # The rescaled rate only takes effect after 6000 calls.
    warmed_up = T.ge(counter, 6000)
    new_lr = ifelse.ifelse(warmed_up, new_lr, as_floatX(learning_rate))

    for p in grads.keys():
        updates[p] = p + new_lr * normalized_velocities[p]

    updates[counter] = counter + 1
    return updates, tot_norm_up, tot_param_norm
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Compile the two Theano functions of this adaptive learning rule.

    The first function evaluates the cost and stores the gradients in
    per-parameter shared buffers; the second one reads those buffers and
    applies the accumulator (and, when ``self.perform_update`` is set,
    parameter) updates.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source; confirm against version control.

    Parameters
    ----------
    learning_rate : symbolic variable
        Used as the only input of the compiled update function; its value
        is not read by the update expressions (unused inputs are ignored).
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.  When ``self.grad_clip`` is set its values are
        reassigned in place.
    inp : list
        Inputs of the compiled gradient function.
    cost, errors : expressions
        Returned by the compiled gradient function together with the
        gradient norm and the parameter norm.
    lr_scalers : dict
        Not read by this method.

    Returns
    -------
    f_grad_shared : compiled function
        ``inp -> [cost, errors, gnorm, pnorm]``; fills the gradient buffers.
    f_update : compiled function
        ``[learning_rate] -> [tot_norm_up]``; applies the updates.
    """
    updates = OrderedDict({})
    eps = self.damping
    step = sharedX(0., name="step")

    if self.skip_nan_inf:
        #Replace inf / nan gradient entries by zero so that they do not
        #move the parameter. That might be useful for RNNs.
        grads = OrderedDict({p: T.switch(T.or_(T.isinf(grads[p]),
            T.isnan(grads[p])), 0, grads[p]) for
            p in grads.keys()})

    # Block-normalize gradients:
    nparams = len(grads.keys())

    # Apply the gradient clipping, this is only sometimes
    # necessary for RNNs and sometimes for very deep networks
    if self.grad_clip:
        assert self.grad_clip > 0.
        assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
        gnorm = sum([g.norm(2) for g in grads.values()])
        notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

        for p, g in grads.iteritems():
            # Rescale when the average per-parameter norm exceeds grad_clip.
            tmpg = T.switch(gnorm / nparams > self.grad_clip,
                            g * self.grad_clip * nparams / gnorm , g)
            # With a non-finite total norm, 0.1 * p is used as the gradient.
            grads[p] = T.switch(notfinite, as_floatX(0.1)*p, tmpg)

    tot_norm_up = 0
    # One zero-initialised buffer per parameter, filled by f_grad_shared.
    gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                           name='%s_grad' % p.name)
                           for p, g in grads.iteritems()})
    gsup = [(gshared[p], g) for p, g in grads.iteritems()]
    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)
    fix_decay = self.slow_decay**(step + 1)

    for param in gshared.keys():
        gshared[param].name = "grad_%s" % param.name
        mean_grad = sharedX(param.get_value() * 0. + eps,
                            name="mean_grad_%s" % param.name)
        gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)
        prod_taus = sharedX((np.ones_like(param.get_value()) - 2*eps),
                            name="prod_taus_x_t_" + param.name)
        slow_constant = 2.1

        if self.use_adagrad:
            # sum_square_grad := \sum_i g_i^2
            sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                      name="sum_square_grad_%s" % param.name)
        """ Initialization of accumulators """
        taus_x_t = sharedX((np.ones_like(param.get_value()) + eps) * slow_constant,
                           name="taus_x_t_" + param.name)
        # Overwritten on every iteration: ends up holding the last
        # parameter's time constants.
        self.taus_x_t = taus_x_t

        #Variance reduction parameters
        #Numerator of the gamma:
        gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_nume_sqr_" + param.name)
        #Denominator of the gamma:
        gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_deno_sqr_" + param.name)
        #For the covariance parameter := E[\gamma \alpha]_{t-1}
        cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                            name="cov_num_t_" + param.name)
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                   name="msg_" + param.name)
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)

        # NOTE(review): old_grad is only bound when use_corrected_grad is
        # true, but it is read unconditionally in the gamma statistics below.
        if self.use_corrected_grad:
            old_grad = sharedX(param.get_value() * 0. + eps)

        #The uncorrected gradient of the previous update:
        old_plain_grad = sharedX(param.get_value() * 0. + eps)
        mean_curvature = sharedX(param.get_value() * 0. + eps)
        mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)
        # Initialize the E[\Delta]_{t-1}
        mean_dx = sharedX(param.get_value() * 0.)

        # Block-wise normalize the gradient:
        norm_grad = gshared[param]
        #For the first time-step, assume that delta_x_t := norm_grad
        gnorm = T.sqr(norm_grad).sum()
        cond = T.eq(step, 0)
        gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
        gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)
        norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
        msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
        mdx = cond * norm_grad + (1 - cond) * mean_dx
        new_prod_taus = (
            prod_taus * (1 - 1 / taus_x_t)
        )
        """ Compute the new updated values. """
        # E[g_i^2]_t
        new_mean_squared_grad = (
            mean_square_grad * (1 - 1 / taus_x_t) +
            T.sqr(norm_grad) / (taus_x_t)
        )
        new_mean_squared_grad.name = "msg_" + param.name
        # E[g_i]_t
        new_mean_grad = (
            mean_grad * (1 - 1 / taus_x_t) +
            norm_grad / taus_x_t
        )
        new_mean_grad.name = "nmg_" + param.name
        # Running means divided by (1 - new_prod_taus).
        mg = new_mean_grad / (1 - new_prod_taus)
        mgsq = new_mean_squared_grad / (1 - new_prod_taus)
        new_gnorm_sqr = (
            gnorm_sqr_o * self.slow_decay +
            T.sqr(norm_grad).sum() * (1 - self.slow_decay)
        )

        # Keep the rms for numerator and denominator of gamma.
        new_gamma_nume_sqr = (
            gamma_nume_sqr * (1 - 1 / taus_x_t) +
            T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t
        )
        new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name
        new_gamma_deno_sqr = (
            gamma_deno_sqr * (1 - 1 / taus_x_t) +
            T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t
        )
        new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name
        # gamma is computed from the accumulators of the previous step, not
        # from the new_* expressions above.
        gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) + \
            self.gamma_reg)
        gamma.name = "gamma_" + param.name

        if self.gamma_clip and self.gamma_clip > -1:
            gamma = T.minimum(gamma, self.gamma_clip)

        momentum_step = gamma * mg
        corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

        #For starting the variance reduction.
        if self.start_var_reduction > -1:
            cond = T.le(self.start_var_reduction, step)
            corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
        else:
            corrected_grad = norm_grad

        if self.use_adagrad:
            g = corrected_grad
            # Accumulate gradient
            new_sum_squared_grad = (
                sum_square_grad + T.sqr(g)
            )
            rms_g_t = T.sqrt(new_sum_squared_grad)
            rms_g_t = T.maximum(rms_g_t, 1.0)

        #Use the gradients from the previous update
        #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
        cur_curvature = norm_grad - old_plain_grad
        #cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
        cur_curvature_sqr = T.sqr(cur_curvature)
        new_curvature_ave = (
            mean_curvature * (1 - 1 / taus_x_t) +
            (cur_curvature / taus_x_t)
        )
        new_curvature_ave.name = "ncurve_ave_" + param.name
        #Average curvature divided by (1 - new_prod_taus)
        nc_ave = new_curvature_ave / (1 - new_prod_taus)
        new_curvature_sqr_ave = (
            mean_curvature_sqr * (1 - 1 / taus_x_t) +
            (cur_curvature_sqr / taus_x_t)
        )
        new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name
        #Unbiased average squared curvature
        nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)
        epsilon = 1e-7
        #lr_scalers.get(param, 1.) * learning_rate
        scaled_lr = sharedX(1.0)
        rms_dx_tm1 = T.sqrt(msdx + epsilon)
        rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

        #This is where the update step is being defined
        delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t / (new_curvature_sqr_ave + epsilon))
        delta_x_t.name = "delta_x_t_" + param.name

        # This part seems to be necessary for only RNNs
        # For feedforward networks this does not seem to be important.
        if self.delta_clip:
            logger.info("Clipping will be applied on the adaptive step size.")
            delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad is disabled.")
                delta_x_t = delta_x_t * corrected_grad
        else:
            logger.info("Clipping will not be applied on the adaptive step size.")
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad will not be used.")
                delta_x_t = delta_x_t * corrected_grad

        new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(1 + eps, "stabilized")

        #To compute the E[\Delta^2]_t
        new_mean_square_dx = (
            msdx * (1 - 1 / taus_x_t) +
            (T.sqr(delta_x_t) / taus_x_t)
        )
        #To compute the E[\Delta]_t
        new_mean_dx = (
            mdx * (1 - 1 / taus_x_t) +
            (delta_x_t / (taus_x_t))
        )

        #Perform the outlier detection:
        #When the gradient or the curvature is more than two standard
        #deviations away from its running mean, the time constant is set to
        #2.5 where it exceeded 2.5 and is otherwise increased by 1 + eps.
        new_taus_t = T.switch(T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                                    abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                              T.switch(new_taus_t > 2.5, sharedX(2.5), new_taus_t + sharedX(1.0) + eps),
                              new_taus_t)

        #Apply the bound constraints on tau:
        new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
        new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)
        new_cov_num_t = (
            cov_num_t * (1 - 1 / taus_x_t) +
            (delta_x_t * cur_curvature) * (1 / taus_x_t)
        )
        update_step = delta_x_t
        tot_norm_up += update_step.norm(2)

        # Apply updates
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[mean_dx] = new_mean_dx
        updates[gnorm_sqr] = new_gnorm_sqr
        updates[gamma_nume_sqr] = new_gamma_nume_sqr
        updates[gamma_deno_sqr] = new_gamma_deno_sqr
        updates[taus_x_t] = new_taus_t
        updates[cov_num_t] = new_cov_num_t
        updates[mean_grad] = new_mean_grad
        updates[old_plain_grad] = norm_grad
        updates[mean_curvature] = new_curvature_ave
        updates[mean_curvature_sqr] = new_curvature_sqr_ave

        # NOTE(review): only the parameter update is assumed to be guarded
        # by perform_update; the extent of this block is not recoverable
        # from the collapsed source.
        if self.perform_update:
            updates[param] = param + update_step

        # Same key on every iteration, so the counter advances once per call.
        updates[step] = step + 1
        updates[prod_taus] = new_prod_taus

        if self.use_adagrad:
            updates[sum_square_grad] = new_sum_squared_grad

        if self.use_corrected_grad:
            updates[old_grad] = corrected_grad

    f_update = theano.function([learning_rate],
                               [tot_norm_up],
                               updates=updates,
                               on_unused_input='ignore')
    return f_grad_shared, f_update
def fprop(self,
          inps=None,
          use_mask=True,
          use_cmask=True,
          use_noise=False,
          mdl_name=None):
    """Run the two-level GRU model and return the output distribution.

    A lower GRU reads ``X`` reshaped to
    ``(X.shape[0], X.shape[1] * X.shape[2], -1)``; its last step is masked
    and fed to an upper GRU whose states go through the output layer and a
    softmax.  The result is also stored in ``self.probs``.

    Parameters
    ----------
    inps : list, optional
        ``inps[0]`` is ``X``; ``inps[2]`` and ``inps[3]`` are the mask and
        the question mask (read when ``use_mask``); ``inps[4]`` is the cost
        mask (read when ``use_cmask``).  Defaults to ``self.inps``.
    use_mask, use_cmask : bool
        Whether the corresponding masks are present in ``inps``.
    use_noise : bool
        Forwarded to ``build_model``; layers and dropout are called with
        ``deterministic=not use_noise``.
    mdl_name : str, optional
        Forwarded to ``build_model``.

    Returns
    -------
    tuple
        ``(self.probs, h1)`` where ``h1`` is the upper GRU output.

    Raises
    ------
    ValueError
        If the cost mask or the input mask is missing.
    """
    self.build_model(use_noise=use_noise, mdl_name=mdl_name)

    if not inps:
        inps = self.inps

    # Check the length before indexing so that a short list is reported
    # with the intended message.
    assert (3 + sum([use_mask, use_cmask])) == len(inps), \
        "inputs have illegal shape."

    X = inps[0]

    # Bind the masks up front: they are tested below even when the
    # corresponding flag is off.
    mask = None
    qmask = None
    cmask = None
    if use_mask:
        mask = inps[2]
        qmask = inps[3]
    if use_cmask:
        cmask = inps[4]

    m0 = as_floatX(TT.gt(X, 0))

    if cmask is None:
        raise ValueError("Mask for the answers should not be empty.")
    if mask is None:
        raise ValueError("Input mask is required to build the fact mask.")
    m1 = mask * TT.eq(cmask, 0)

    dropOp = None
    low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
    Xr = X.reshape(low_inp_shp)

    # Unpack the lower projections the same way the upper ones are
    # unpacked below (reset, gater, state).
    grulow_inps = self.grulow_layer.fprop(Xr, deterministic=not use_noise)
    low_vals = grulow_inps.values()
    low_reset_below = low_vals[0].reshape(low_inp_shp)
    low_gater_below = low_vals[1].reshape(low_inp_shp)
    low_state_below = low_vals[2].reshape(low_inp_shp)
    linps = [low_reset_below, low_gater_below, low_state_below]

    inp_shp = (X.shape[1], X.shape[2], -1)
    h0 = self.low_gru_layer.fprop(inps=linps,
                                  mask=m0,
                                  batch_size=self.batch_size)
    # Keep only the last step of the lower GRU and zero the masked slots.
    h0 = m1.dimshuffle(0, 1, 'x') * (h0.reshape(
        (X.shape[0], X.shape[1], X.shape[2], -1))[-1]).reshape(inp_shp)

    if self.dropout:
        if dropOp is None:
            dropOp = Dropout(dropout_prob=self.dropout)
        h0 = dropOp(h0, deterministic=not use_noise)

    gruup_inps = self.gruup_layer.fprop(h0, deterministic=not use_noise)
    up_vals = gruup_inps.values()
    reset_below = up_vals[0].reshape(inp_shp)
    gater_below = up_vals[1].reshape(inp_shp)
    state_below = up_vals[2].reshape(inp_shp)
    uinps = [reset_below, gater_below, state_below]

    h1, _ = self.gru_layer.fprop(inps=uinps,
                                 maskf=m1,
                                 maskq=qmask,
                                 batch_size=self.batch_size)

    if self.dropout:
        if dropOp is None:
            dropOp = Dropout(dropout_prob=self.dropout)
        h1 = dropOp(h1, deterministic=not use_noise)

    out_layer = self.out_layer.fprop(h1, deterministic=not use_noise)
    self.probs = Softmax(out_layer)
    return self.probs, h1
def fprop(self, inps=None, leak_rate=0.05, use_noise=False, mdl_name=None):
    """Build the symbolic forward pass of the NTM-based model.

    The input representation (bag-of-words, GRU over facts, simple RNN over
    facts, or a plain projection) is selected from the ``use_*`` flags,
    passed through ``self.ntm`` and then through the output layers.

    Args:
        inps: either a list ``[X, y, mask, cmask]`` or a mapping with the
            keys ``'X'``, ``'y'``, ``'mask'`` and ``'cmask'``. ``mask`` is
            only read when ``self.use_mask`` is set and ``cmask`` only when
            ``self.use_cost_mask`` is set as well. ``self.inps`` is used
            when ``inps`` is empty.
        leak_rate: slope passed to ``Leaky_Rect`` in the deep-output path.
        use_noise: forwarded to ``build_model``; layers are called with
            ``deterministic=not use_noise``.
        mdl_name: forwarded to ``build_model``.

    Returns:
        Tuple ``(self.probs, self.ntm_outs)``.

    Raises:
        ValueError: if a cost mask is required but ``cmask`` is missing.
    """
    self.build_model(use_noise=use_noise, mdl_name=mdl_name)
    # NOTE(review): evaluation_mode is set to use_noise while the NTM itself
    # is called with use_noise=not use_noise below -- confirm the polarity.
    self.ntm.evaluation_mode = use_noise

    if not inps:
        inps = self.inps

    # The first two inputs are always X and the targets.
    cmask = None
    mask = None
    if isinstance(inps, list):
        X = inps[0]
        y = inps[1]
        if self.use_mask:
            mask = inps[2]
            if self.use_cost_mask:
                cmask = inps[3]
    else:
        X = inps['X']
        y = inps['y']
        if self.use_mask:
            mask = inps['mask']
            if self.use_cost_mask:
                cmask = inps['cmask']

    # NOTE(review): m is only bound when use_cost_mask is set, but the
    # bag-of-words branch below always reads it.
    if self.use_cost_mask:
        if cmask is not None:
            if self.use_bow_cost_mask:
                if mask.ndim == cmask.ndim:
                    m = (mask * TT.eq(cmask, 0)).reshape(
                        (cmask.shape[0] * cmask.shape[1], -1))
                else:
                    m = (mask.dimshuffle(0, 1, 'x') *
                         TT.eq(cmask, 0))[:, :, 0].reshape(
                             (cmask.shape[0] * cmask.shape[1], -1))
            else:
                m = mask
        else:
            raise ValueError("Mask for the answers should not be empty.")

    if X.ndim == 2 and y.ndim == 1:
        # For sequential MNIST.
        if self.permute_order:
            X = X.dimshuffle(1, 0)
            idxs = self.rnd_indxs
            X = X[idxs]
        inp_shp = (X.shape[0], X.shape[1], -1)
    else:
        inp_shp = (X.shape[1], X.shape[2], -1)

    self.ntm_in = None
    if (self.use_bow_input and not self.use_gru_inp_rep and
            not self.use_simple_rnn_inp_rep):
        bow_out = self.bow_layer.fprop(X,
                                       amask=m,
                                       deterministic=not use_noise)
        bow_out = bow_out.reshape((X.shape[1], X.shape[2], -1))
        self.ntm_in = bow_out
    elif self.use_gru_inp_rep:
        m0 = as_floatX(TT.gt(X, 0))
        # NOTE(review): m1 stays unbound here unless both use_mask and
        # use_cost_mask are set, yet it is read unconditionally below.
        if self.use_mask and self.use_cost_mask:
            if cmask is not None:
                m1 = mask * TT.eq(cmask, 0)
            else:
                raise ValueError(
                    "Mask for the answers should not be empty.")

        low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
        Xr = X.reshape(low_inp_shp)
        grufact_inps = self.gru_fact_layer_inps.fprop(Xr)
        low_reset_below = grufact_inps.values()[0].reshape(low_inp_shp)
        low_gater_below = grufact_inps.values()[1].reshape(low_inp_shp)
        low_state_below = grufact_inps.values()[2].reshape(low_inp_shp)
        linps = [low_reset_below, low_gater_below, low_state_below]

        # Count of positive entries of X along the first axis; zeros are
        # replaced by one so the average below never divides by zero.
        m0_part = TT.cast(
            m0.sum(0).reshape(
                (X.shape[1], X.shape[2])).dimshuffle(0, 1, 'x'),
            'float32')
        m0_part = TT.switch(TT.eq(m0_part, as_floatX(0)),
                            as_floatX(1), m0_part)

        h0 = self.gru_fact_layer.fprop(inps=linps,
                                       mask=m0,
                                       batch_size=self.batch_size)
        self.ntm_in = m1.dimshuffle(0, 1, 'x') * (
            (m0.dimshuffle(0, 1, 2, 'x') * h0.reshape(
                (X.shape[0], X.shape[1], X.shape[2], -1))).sum(0) /
            m0_part).reshape(inp_shp)
    elif self.use_simple_rnn_inp_rep:
        m0 = as_floatX(TT.gt(X, 0))
        if cmask is not None:
            m1 = mask * TT.eq(cmask, 0)
        else:
            raise ValueError("Mask for the answers should not be empty.")

        low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
        Xr = X.reshape(low_inp_shp)
        rnnfact_inps = self.rnn_fact_layer_inps.fprop(Xr).reshape(
            low_inp_shp)
        m0 = m0.reshape(low_inp_shp)
        h0 = self.rnn_fact_layer.fprop(inps=rnnfact_inps,
                                       mask=m0,
                                       batch_size=self.batch_size)
        m0_part = TT.cast(
            m0.sum(0).reshape(
                (X.shape[1], X.shape[2])).dimshuffle(0, 1, 'x'),
            'float32')
        # Python's == is not an elementwise comparison on Theano variables
        # (it yields a plain bool), so the zero guard never fired. Use
        # TT.eq, exactly as the GRU branch above does.
        m0_part = TT.switch(TT.eq(m0_part, as_floatX(0)),
                            as_floatX(1), m0_part)
        self.ntm_in = m1.dimshuffle(0, 1, 'x') * (
            h0.reshape(
                (X.shape[0], X.shape[1], X.shape[2], -1)).sum(0) /
            m0_part).reshape(inp_shp)
    else:
        X_proj = self.inp_proj_layer.fprop(X)
        if not self.learn_embeds:
            X_proj = block_gradient(X_proj)
        if self.use_batch_norm:
            X_proj = self.batch_norm_layer.fprop(X_proj,
                                                 inference=not use_noise)
        self.ntm_in = X_proj

    context = None
    if self.use_context:
        if self.use_qmask:
            context = (self.qmask.dimshuffle(0, 1, 'x') *
                       self.ntm_in).sum(0)
        else:
            # NOTE(review): m1 is only bound by the GRU / simple-RNN input
            # branches above.
            m1_part = m1.sum(0).dimshuffle(0, 'x')
            context = self.ntm_in.sum(0) / m1_part

    self.ntm_outs = self.ntm.fprop(self.ntm_in,
                                   mask=mask,
                                   cmask=cmask,
                                   context=context,
                                   batch_size=self.batch_size,
                                   use_mask=self.use_mask,
                                   use_noise=not use_noise)

    h, m_read = self.ntm_outs[0], self.ntm_outs[2]

    if self.use_reinforce:
        self.w_samples, self.r_samples = (self.ntm_outs[-2],
                                          self.ntm_outs[-1])
        # The weights sit at a different offset from the end of ntm_outs
        # depending on smoothed_diff_weights.
        if self.smoothed_diff_weights:
            idx = -6
        else:
            idx = -4
        self.write_weights, self.read_weights = (self.ntm_outs[idx],
                                                 self.ntm_outs[idx + 1])
    else:
        self.write_weights, self.read_weights = (self.ntm_outs[3],
                                                 self.ntm_outs[4])

    if self.anticorrelation:
        acorr = AntiCorrelationConstraint(level=self.anticorrelation)
        rw1 = self.read_weights[:, 0]
        rw2 = self.read_weights[:, 1]
        self.reg += acorr(rw1, rw2, mask=mask)

    if self.correlation_ws:
        logger.info("Applying the correlation constraint.")
        corr_cons = CorrelationConstraint(level=self.correlation_ws)
        self.reg += corr_cons(self.read_weights, self.write_weights, mask,
                              self.qmask)

    if self.use_last_hidden_state:
        h = h.reshape(inp_shp)
        h = h[-1]

    if self.use_deepout:
        merged_out = self.merge_layer.fprop([h, m_read])
        out_layer = Leaky_Rect(merged_out, leak_rate)
        if self.dropout:
            dropOp = Dropout(dropout_prob=self.dropout)
            out_layer = dropOp(out_layer, deterministic=not use_noise)
        out_layer = self.out_layer.fprop(out_layer,
                                         deterministic=not use_noise)
    else:
        if self.use_out_mem:
            if self.dropout:
                dropOp = Dropout(dropout_prob=self.dropout)
                m_read = dropOp(m_read, deterministic=not use_noise)
            mem_out = self.out_mem.fprop(m_read,
                                         deterministic=not use_noise)
            mem_scaler = self.out_scaler.fprop(
                h, deterministic=not use_noise).reshape(
                    (mem_out.shape[0], )).dimshuffle(0, 'x')
            h_out = self.out_layer.fprop(h, deterministic=not use_noise)
            # The memory read-out is gated by a sigmoid of mem_scaler and
            # added to the hidden-state read-out.
            out_layer = h_out + mem_out * Sigmoid(mem_scaler)
        else:
            if self.dropout:
                dropOp = Dropout(dropout_prob=self.dropout)
                h = dropOp(h, deterministic=not use_noise)
            out_layer = self.out_layer.fprop(h,
                                             deterministic=not use_noise)

    if self.predict_bow_out and self.bow_out_layer:
        logger.info("Using the bow output prediction.")
        self.bow_pred_out = Sigmoid(
            self.bow_out_layer.fprop(h, deterministic=not use_noise))

    if self.softmax:
        self.probs = Softmax(out_layer)
    else:
        self.probs = Sigmoid(out_layer)

    if self.ntm.updates:
        self.updates.update(self.ntm.updates)

    self.str_params(logger)
    self.h = h
    return self.probs, self.ntm_outs
def __call__(self, probs, samples, baseline, updates, cost=None,
             cost_mean=None, mask=None, seq_len=20, batch_size=140,
             deterministic=False, dimshuffle_probs=True):
    """Build the REINFORCE gradient for ``probs`` with an input baseline.

    Moving averages of the cost ("center") and of its squared deviation
    ("cost_var") are registered in ``updates`` unless entries with those
    names already exist, in which case the existing update expressions are
    reused. The reward is centered with ``baseline`` and the moving center,
    divided by the clipped standard deviation, and combined with the
    ``lambda2_reg``-weighted ``log(probs) + 1`` term.

    Args:
        probs: symbolic probabilities the gradient is defined for.
        samples: symbolic samples drawn from ``probs``.
        baseline: symbolic input-dependent baseline.
        updates: mapping of shared-variable updates; modified in place.
        cost: symbolic cost used as the reward signal (required).
        cost_mean: mean of the cost; ``cost.mean()`` when omitted.
        mask: optional mask multiplied into the gradient.
        seq_len, batch_size, deterministic: unused; kept for interface
            compatibility.
        dimshuffle_probs: if set, ``probs`` is used with its last two axes
            swapped and the gradient is swapped back at the end.

    Returns:
        Tuple ``(updates, known_grads, new_center, cost_std, policy,
        lambda2_reg)`` where ``known_grads`` maps ``probs`` to its gradient.

    Raises:
        ValueError: if ``cost`` is missing or, for non-4D samples, its rank
            is neither 1 nor 2.
    """
    print("Using the input based baseline")

    # The previous check tested the builtin ``input`` and could never fire.
    if cost is None:
        raise ValueError("cost for the %s should not be empty." %
                         self.__class__.__name__)

    if cost_mean is None:
        cost_mean = cost.mean()

    decay = as_floatX(self.decay)
    one_minus_decay = as_floatX(1 - self.decay)

    # Register a step counter unless one is already present in updates.
    key_step = get_key_byname_from_dict(updates, "step")
    if not key_step:
        step = sharedX(0., name="step")
        updates[step] = step + as_floatX(1)

    # Moving average of the cost.
    key_center = get_key_byname_from_dict(updates, "center")
    if key_center:
        new_center = updates[key_center]
    else:
        if self.generative_pred:
            center = sharedX(np.zeros((self.maxlen,)) + 0.15 + self.eps,
                             name="center")
            new_center = decay * center + one_minus_decay * cost.mean(-1)
        else:
            center = sharedX(0.15 + self.eps, name="center")
            new_center = decay * center + one_minus_decay * cost_mean
        updates[center] = new_center

    # Moving average of the squared deviation from the center.
    key_cvar = get_key_byname_from_dict(updates, "cost_var")
    if key_cvar:
        new_cost_var = updates[key_cvar]
    else:
        cost_var_tot = (cost_mean - new_center) ** 2
        if self.generative_pred:
            cost_var = sharedX(np.zeros((self.maxlen,)) + as_floatX(1.0),
                               name="cost_var")
        else:
            cost_var = sharedX(1.0, name="cost_var")
        new_cost_var = decay * cost_var + one_minus_decay * cost_var_tot
        updates[cost_var] = new_cost_var

    # NOTE: scheduling of lambda2_reg over training steps is disabled; the
    # configured value is used as is.
    lambda2_reg = self.lambda2_reg

    if dimshuffle_probs:
        probsd = probs.dimshuffle(0, 2, 1)
    else:
        probsd = probs

    if samples.ndim == 4:
        reward = cost.dimshuffle(0, 'x', 1, 'x')
        policy = -(TT.log(probsd + 1e-8) * samples).mean((2, 3)).sum()
    else:
        if cost.ndim == 2:
            if dimshuffle_probs:
                reward = cost.dimshuffle(0, 'x', 1)
                if self.generative_pred:
                    new_center = new_center.dimshuffle(0, 'x', 'x')
                    new_cost_var = new_cost_var.dimshuffle(0, 'x', 'x')
                baseline = baseline.dimshuffle(0, 2, 1)
            else:
                reward = cost.dimshuffle(0, 1, 'x')
        elif cost.ndim == 1:
            reward = cost.dimshuffle('x', 0, 'x')
            if dimshuffle_probs:
                baseline = baseline.dimshuffle(0, 2, 1)
            else:
                baseline = baseline.dimshuffle(1, 0, 2)
        else:
            raise ValueError("cost should have 1 or 2 dimensions.")
        # Computed for both cost ranks: previously it was only assigned in
        # the 2-D branch, leaving it unbound at the return for 1-D costs.
        policy = -(TT.log(probsd + 1e-8) * samples).mean((1, 2)).sum()

    # The standard deviation is clipped from below at 1.0, so the division
    # never amplifies the centered reward.
    cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1.0)
    centered_reward = (reward - baseline - new_center) / cost_std
    if cost.ndim == 2:
        centered_reward = TT.addbroadcast(centered_reward, 1)

    N = probs.shape[-1]
    gradp = self.lambda1_reg * (centered_reward) * \
        (samples / (probsd + 1e-8)) + \
        lambda2_reg * (TT.log(probsd + 1e-6) + as_floatX(1))

    if dimshuffle_probs:
        gradp = gradp.dimshuffle(0, 2, 1)

    if mask is not None:
        if self.generative_pred:
            gradp = mask.dimshuffle(0, 1, 'x') * gradp / N
        else:
            gradp = mask.dimshuffle(0, 1, 'x') * gradp

    known_grads = {probs: gradp}
    return updates, known_grads, new_center, cost_std, policy, lambda2_reg
def __call__(self, probs, samples, updates, cost=None, mask=None,
             deterministic=False, child_probs=None, dimshuffle_probs=False,
             child_samples=None):
    """Build the REINFORCE gradient for ``probs`` with a moving baseline.

    A moving average of the cost (or of the squared cost when
    ``self.use_rms_baseline`` is set) and of its squared deviation are
    registered in ``updates`` unless entries named "baseline" / "cost_var"
    already exist, in which case the existing update expressions are
    reused.

    Args:
        probs: symbolic probabilities the gradient is defined for
            (3-D or 4-D).
        samples: symbolic samples drawn from ``probs``.
        updates: mapping of shared-variable updates; modified in place.
        cost: symbolic cost used as the reward signal (required).
        mask: optional mask multiplied into the gradient.
        deterministic: unused; kept for interface compatibility.
        child_probs, child_samples: optional additional probabilities and
            samples whose ratio is added to ``samples / probs``; used only
            when both are given.
        dimshuffle_probs: if set, ``probs`` is used with its last two axes
            swapped and the gradient is swapped back at the end.

    Returns:
        Tuple ``(updates, known_grads, rbaseline, cost_std, policy,
        lambda2_reg)`` where ``known_grads`` maps ``probs`` to its gradient.

    Raises:
        ValueError: if ``cost`` is missing or the ranks of ``probs`` and
            ``cost`` are not one of the supported combinations.
    """
    # The previous check tested the builtin ``input`` and could never fire.
    if cost is None:
        raise ValueError("cost for the %s should not be empty." %
                         self.__class__.__name__)

    decay = as_floatX(self.decay)
    one_minus_decay = as_floatX(1 - self.decay)

    key_baseline = get_key_byname_from_dict(updates, "baseline")
    if key_baseline:
        rbaseline = updates[key_baseline]
        # NOTE(review): new_baseline used to be unbound on this path, which
        # broke the cost_var computation below whenever "cost_var" was not
        # registered yet. Reusing the registered expression is assumed to
        # be the intent -- confirm.
        new_baseline = rbaseline
    else:
        if self.generative_pred:
            baseline = sharedX(np.zeros((self.maxlen,)) + 1.0 + self.eps,
                               name="baseline")
        else:
            baseline = sharedX(0. + 1.0 + self.eps, name="new_baseline")

        key_step = get_key_byname_from_dict(updates, "step")
        # NOTE(review): this is evaluated with the step still equal to the
        # Python constant 0, i.e. it is decay ** 1 and does not change over
        # time. Kept as is because making it depend on the shared step
        # counter would alter the training numerics -- confirm the intent.
        fix_decay = self.decay ** (0 + as_floatX(1))
        if not key_step:
            step = sharedX(0., name="step")
            updates[step] = step + as_floatX(1)

        if self.generative_pred:
            cost_stat = cost.mean(-1)
        else:
            cost_stat = cost.mean()

        if self.use_rms_baseline:
            # Track the mean of the squared cost and report its corrected
            # square root as the baseline.
            new_baseline = decay * baseline + \
                one_minus_decay * cost_stat ** 2
            updates[baseline] = new_baseline
            rbaseline = TT.sqrt(new_baseline / (1 - fix_decay))
        else:
            new_baseline = decay * baseline + one_minus_decay * cost_stat
            updates[baseline] = new_baseline
            rbaseline = new_baseline

    key_cvar = get_key_byname_from_dict(updates, "cost_var")
    if key_cvar:
        new_cost_var = updates[key_cvar]
    else:
        if self.generative_pred:
            cost_var = sharedX(np.zeros((self.maxlen,)) + as_floatX(1.2),
                               name="cost_var")
            cost_var_ave = (cost.mean(-1) - new_baseline) ** 2
        else:
            cost_var = sharedX(as_floatX(1.2), name="cost_var")
            cost_var_ave = (cost.mean() - new_baseline) ** 2
        new_cost_var = decay * cost_var + one_minus_decay * cost_var_ave
        updates[cost_var] = new_cost_var

    # NOTE: scheduling of lambda2_reg over training steps is disabled; the
    # configured value is used as is.
    lambda2_reg = self.lambda2_reg

    if dimshuffle_probs:
        probsd = probs.dimshuffle(0, 2, 1)
    else:
        probsd = probs

    # Bring reward, baseline and variance to a layout that broadcasts
    # against probsd.
    if probs.ndim == 3 and cost.ndim == 1:
        if dimshuffle_probs:
            pattern = ('x', 'x', 0)
        else:
            pattern = ('x', 0, 'x')
        reward = cost.dimshuffle(*pattern)
        if self.generative_pred:
            rbaseline = rbaseline.dimshuffle(*pattern)
            # The reshaped variance used to be stored in cost_std, which is
            # overwritten below, so the reshape was silently lost. Store it
            # in new_cost_var as the 2-D cost branch does.
            new_cost_var = new_cost_var.dimshuffle(*pattern)
    elif probs.ndim == 3 and cost.ndim == 2:
        if dimshuffle_probs:
            reward = cost.dimshuffle(0, 'x', 1)
            if self.generative_pred:
                rbaseline = rbaseline.dimshuffle(0, 'x', 'x')
                new_cost_var = new_cost_var.dimshuffle(0, 'x', 'x')
        else:
            reward = cost.dimshuffle('x', 0, 1)
            if self.generative_pred:
                rbaseline = rbaseline.dimshuffle('x', 0, 'x')
                new_cost_var = new_cost_var.dimshuffle('x', 0, 'x')
    elif probs.ndim == 4 and cost.ndim == 1:
        # Was ``self.cost.ndim``: the rank of the ``cost`` argument is what
        # is being dispatched on everywhere else in this chain.
        reward = cost.dimshuffle('x', 'x', 0, 'x')
    elif probs.ndim == 4:
        reward = cost.dimshuffle(0, 'x', 1, 'x')
    else:
        raise ValueError("Unsupported ranks for probs and cost.")

    centered_cost = reward - rbaseline
    N = probsd.shape[-1]

    if self.use_cost_std:
        cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1.0)
    else:
        cost_std = 1

    if child_probs is not None and child_samples is not None:
        cprobs1 = child_samples / (child_probs + 1e-8) + \
            samples / (probsd + 1e-8)
    else:
        cprobs1 = samples / (probsd + 1e-8)

    gradp = self.lambda1_reg * (centered_cost / cost_std) * \
        (cprobs1) + (lambda2_reg) * (TT.log(probsd + 1e-8) + as_floatX(1))

    if dimshuffle_probs:
        gradp = gradp.dimshuffle(0, 2, 1)

    if mask is not None:
        if dimshuffle_probs:
            gradp = mask.dimshuffle(0, 1, 'x') * gradp
        else:
            gradp = mask.dimshuffle(0, 1, 'x') * gradp / N

    known_grads = {probs: gradp}
    policy = -(TT.log(probsd + 1e-8) * samples).mean((1, 2)).sum()
    return updates, known_grads, rbaseline, cost_std, policy, lambda2_reg
def fprop(self, inps=None, use_mask=True, use_cmask=True, use_noise=False,
          mdl_name=None):
    """Build the symbolic forward pass of the bag-of-words + LSTM model.

    ``bow_layer`` encodes ``X``; ``bowup_layer`` produces the four LSTM
    input projections (looked up via ``self.cnames``), the LSTM output is
    optionally combined with the shifted bag-of-words output through the
    deep-output layers, and the result goes through ``out_layer`` and
    ``Softmax``.

    Args:
        inps: indexable collection of symbolic inputs; ``self.inps`` is used
            when it is empty. Index 0 is read as ``X``, index 2 as ``mask``
            when ``use_mask`` is set, and indices 3 and 4 as ``cmask`` and
            ``qmask`` when ``use_cmask`` is set.
        use_mask: whether ``mask`` is present in ``inps``.
        use_cmask: whether ``cmask`` and ``qmask`` are present in ``inps``.
        use_noise: forwarded to ``build_model``; layers are called with
            ``deterministic=not use_noise``.
        mdl_name: forwarded to ``build_model``.

    Returns:
        Tuple ``(self.probs, h)``: the softmax output and the LSTM hidden
        states.

    Raises:
        ValueError: if ``cmask`` or ``mask`` is not available.
        AssertionError: if ``len(inps)`` does not match the enabled masks.
    """
    self.build_model(use_noise=use_noise, mdl_name=mdl_name)

    if not inps:
        inps = self.inps

    X = inps[0]

    # Start from None so that a disabled mask reaches the explicit
    # ValueError below instead of failing on an unbound local name.
    mask = None
    cmask = None
    qmask = None
    if use_mask:
        mask = inps[2]
    if use_cmask:
        cmask = inps[3]
        qmask = inps[4]

    assert (3 + sum([use_mask, use_cmask])) == len(inps), \
        "inputs have illegal shape."

    if cmask is None:
        raise ValueError("Mask for the answers should not be empty.")
    if mask is None:
        raise ValueError("Mask for the inputs should not be empty.")

    m = mask * TT.eq(cmask.reshape((cmask.shape[0], cmask.shape[1])), 0)

    bow_out = self.bow_layer.fprop(X,
                                   amask=m,
                                   qmask=qmask,
                                   deterministic=not use_noise)

    # Shift the bag-of-words output by one position along the first axis
    # and zero the first slot.
    new_bow = TT.roll(bow_out, 1, axis=0)
    new_bow = TT.set_subtensor(new_bow[0], as_floatX(0))

    bow_outs = self.bowup_layer.fprop(bow_out, deterministic=not use_noise)
    gate_shp = (X.shape[1], X.shape[2], -1)
    forget_below = bow_outs[self.cnames[0]].reshape(gate_shp)
    input_below = bow_outs[self.cnames[1]].reshape(gate_shp)
    output_below = bow_outs[self.cnames[2]].reshape(gate_shp)
    cell_below = bow_outs[self.cnames[3]].reshape(gate_shp)
    inps = [forget_below, input_below, output_below, cell_below]

    h, _ = self.lstm_layer.fprop(inps=inps,
                                 mask=mask,
                                 batch_size=self.batch_size)

    if self.deepout:
        h_deepout = self.deepout_layer_ht.fprop(h)
        emb_deepout = self.deepout_layer_qbow.fprop(new_bow)
        z = Leaky_Rect(h_deepout + emb_deepout, 0.01)
    else:
        z = h

    # Both paths apply the same dropout to z, so it is done once here.
    if self.dropout:
        dropOp = Dropout(dropout_prob=self.dropout)
        z = dropOp(z, deterministic=not use_noise)

    out_layer = self.out_layer.fprop(z, deterministic=not use_noise)
    self.probs = Softmax(out_layer)
    return self.probs, h
def __call__(self, probs, samples, baseline, updates, cost=None, mask=None,
             seq_len=20, batch_size=140, deterministic=False):
    """Build the REINFORCE gradient for ``probs`` with an input baseline.

    Moving averages of ``cost.sum(0).mean()`` ("center") and of its squared
    deviation ("cost_var") are registered in ``updates`` unless entries
    with those names already exist, in which case the existing update
    expressions are reused.

    Args:
        probs: symbolic probabilities the gradient is defined for.
        samples: symbolic samples drawn from ``probs``.
        baseline: symbolic input-dependent baseline.
        updates: mapping of shared-variable updates; modified in place.
        cost: symbolic cost used as the reward signal (required).
        mask: optional mask multiplied into the gradient.
        seq_len, batch_size, deterministic: unused; kept for interface
            compatibility.

    Returns:
        Tuple ``(updates, known_grads, new_center, cost_std, policy,
        lambda2_reg)`` where ``known_grads`` maps ``probs`` to its gradient.

    Raises:
        ValueError: if ``cost`` is missing or, for non-4D samples, its rank
            is neither 1 nor 2.
    """
    # The previous check tested the builtin ``input`` and could never fire.
    if cost is None:
        raise ValueError("cost for the %s should not be empty." %
                         self.__class__.__name__)

    decay = as_floatX(self.decay)
    one_minus_decay = as_floatX(1 - self.decay)

    # Register a step counter unless one is already present in updates.
    key_step = get_key_byname_from_dict(updates, "step")
    if not key_step:
        step = sharedX(0., name="step")
        updates[step] = step + as_floatX(1)

    # Moving average of the summed cost.
    key_center = get_key_byname_from_dict(updates, "center")
    if key_center:
        new_center = updates[key_center]
    else:
        center = sharedX(0.08 + self.eps, name="center")
        new_center = decay * center + one_minus_decay * cost.sum(0).mean()
        updates[center] = new_center

    # Moving average of the squared deviation from the center.
    key_cvar = get_key_byname_from_dict(updates, "cost_var")
    if key_cvar:
        new_cost_var = updates[key_cvar]
    else:
        cost_var_tot = (cost.sum(0).mean() - new_center) ** 2
        cost_var = sharedX(as_floatX(0.5), name="cost_var")
        new_cost_var = decay * cost_var + one_minus_decay * cost_var_tot
        updates[cost_var] = new_cost_var

    # NOTE(review): the lambda2_reg schedule that used to follow was guarded
    # by ``if not self.schedule_h_opts`` and then subscripted that same
    # value, so it could only run in the case where it had to fail. It is
    # left out here and the configured value is used as is; confirm before
    # re-enabling a schedule.
    lambda2_reg = self.lambda2_reg

    if samples.ndim == 4:
        reward = cost.dimshuffle(0, 'x', 1, 'x')
        policy = (TT.log(probs + 1e-8) * samples).mean((2, 3)).sum()
    else:
        if cost.ndim == 2:
            reward = cost.dimshuffle(0, 1, 'x')
        elif cost.ndim == 1:
            reward = cost.dimshuffle('x', 0, 'x')
            baseline = baseline.dimshuffle(1, 0, 2)
        else:
            raise ValueError("cost should have 1 or 2 dimensions.")
        policy = (TT.log(probs + 1e-8) * samples).mean((1, 2)).sum()

    # Clip the standard deviation away from zero before dividing by it.
    cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1e-6)
    centered_reward = (reward - baseline - new_center) / cost_std

    N = probs.shape[-1]
    gradp = self.lambda1_reg * (centered_reward) * \
        (samples / (probs + 1e-8)) + \
        lambda2_reg * (TT.log(probs + 1e-6) + as_floatX(1))

    if mask is not None:
        gradp = mask.dimshuffle(0, 1, 'x') * gradp / N

    known_grads = {probs: gradp}
    return updates, known_grads, new_center, cost_std, policy, lambda2_reg