def get_monitoring_channels(self):
    warnings.warn("Layer.get_monitoring_channels is " +
                  "deprecated. Use get_layer_monitoring_channels " +
                  "instead. Layer.get_monitoring_channels " +
                  "will be removed on or after september 24th 2014",
                  stacklevel=2)

    W, = self.transformer.get_params()
    assert W.ndim == 2
    sq_W = T.sqr(W)

    row_norms = T.sqrt(sq_W.sum(axis=1))
    col_norms = T.sqrt(sq_W.sum(axis=0))

    row_norms_min = row_norms.min()
    row_norms_min.__doc__ = ("The smallest norm of any row of the "
                             "weight matrix W. This is a measure of the "
                             "least influence any visible unit has.")

    return OrderedDict([('row_norms_min', row_norms_min),
                        ('row_norms_mean', row_norms.mean()),
                        ('row_norms_max', row_norms.max()),
                        ('col_norms_min', col_norms.min()),
                        ('col_norms_mean', col_norms.mean()),
                        ('col_norms_max', col_norms.max()),
                        ])
def adam(lr, tparams, grads, inp, cost):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup)

    # Note: the hard-coded step size lr0 below is what is actually used;
    # the `lr` argument only serves as a dummy input to f_update.
    lr0 = 0.0002
    b1 = 0.1    # plays the role of (1 - beta1) in the moment updates
    b2 = 0.001  # plays the role of (1 - beta2)
    e = 1e-8

    updates = []

    i = theano.shared(numpy.float32(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
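# A minimal sketch of the two-function protocol shared by several optimizers
# in this collection: f_grad_shared computes the cost and stores gradients in
# shared variables, then f_update applies the parameter update. The names
# build_model and train_batches are hypothetical stand-ins for surrounding
# code.
lr = tensor.scalar(name='lr')
inp, cost, tparams = build_model()                  # assumed to exist
grads = tensor.grad(cost, wrt=list(tparams.values()))
f_grad_shared, f_update = adam(lr, tparams, grads, inp, cost)

for batch in train_batches:                         # assumed batch iterator
    c = f_grad_shared(*batch)                       # forward/backward pass
    f_update(0.0002)                                # lr is ignored by this adam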
def sgd_updates_adadelta(params, cost, rho=0.95, epsilon=1e-6, norm_lim=9,
                         word_vec_name='Words'):
    """
    adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)
    """
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = []
    for param in params:
        empty = numpy.zeros_like(param.get_value())
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),
                                             name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = theano.shared(value=as_floatX(empty),
                                           name="exp_ups_%s" % param.name)
        gparams.append(gp)
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        # rescale columns whose norm exceeds sqrt(norm_lim); the word
        # embedding matrix is exempt from the constraint
        if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = T.cast(stepped_param * scale, 'float32')
        else:
            updates[param] = stepped_param
    return updates
def get_updates_adadelta(grads, params, decay=0.95):
    decay = constantX(decay)
    print 'build updates with adadelta'
    updates = OrderedDict()
    for param, grad in zip(params, grads):
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(numpy.zeros(param.get_value().shape,
                                               dtype=floatX))
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(numpy.zeros(param.get_value().shape,
                                             dtype=floatX))
        if param.name is not None:
            mean_square_grad.name = 'mean_square_grad_' + param.name
            mean_square_dx.name = 'mean_square_dx_' + param.name

        # Accumulate gradient
        new_mean_squared_grad = \
            decay * mean_square_grad + \
            (1. - decay) * T.sqr(grad)

        # Compute update
        epsilon = constantX(1e-7)
        rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
        rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
        delta_x_t = - rms_dx_tm1 / rms_grad_t * grad

        # Accumulate updates
        new_mean_square_dx = \
            decay * mean_square_dx + \
            (1. - decay) * T.sqr(delta_x_t)

        # Apply update
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[param] = param + delta_x_t
    return updates
def buildUpdatesSimpleMomentum(self, batchTrainer, momentum,
                               batchLearningRate, error):
    deltaParams = T.grad(error, batchTrainer.params)
    updates = []
    parametersTuples = zip(batchTrainer.params,
                           deltaParams,
                           batchTrainer.oldUpdates,
                           batchTrainer.oldMeanSquare,
                           batchTrainer.hasNormConstraint)

    for param, delta, oldUpdate, oldMeanSquare, hasNormConstraint in parametersTuples:
        paramUpdate = momentum * oldUpdate
        if self.rmsprop:
            meanSquare = 0.9 * oldMeanSquare + 0.1 * delta ** 2
            paramUpdate += - batchLearningRate * delta / T.sqrt(meanSquare + 1e-8)
            updates.append((oldMeanSquare, meanSquare))
        else:
            paramUpdate += - batchLearningRate * delta
        newParam = param + paramUpdate

        if self.normConstraint is not None and hasNormConstraint:
            norms = SquaredElementWiseNorm(newParam)
            rescaled = norms > self.normConstraint
            factors = T.ones(norms.shape, dtype=theanoFloat) / T.sqrt(norms) * \
                np.sqrt(self.normConstraint, dtype='float32') - 1.0
            replaceNewParam = (factors * rescaled) * newParam
            replaceNewParam += newParam
            newParam = replaceNewParam
            # paramUpdate = newParam - param
        updates.append((param, newParam))
        updates.append((oldUpdate, paramUpdate))

    return updates
def generate_forward_diffusion_sample(self, X_noiseless):
    """
    Corrupt a training image with t steps worth of Gaussian noise, and
    return the corrupted image, as well as the mean and covariance of the
    posterior q(x^{t-1}|x^t, x^0).
    """
    X_noiseless = X_noiseless.reshape(
        (-1, self.n_colors, self.spatial_width, self.spatial_width))
    n_images = X_noiseless.shape[0].astype('int16')
    rng = Random().theano_rng
    # choose a timestep in [1, self.trajectory_length-1].
    # note the reverse process is fixed for the very
    # first timestep, so we skip it.
    # TODO for some reason random_integer is missing from the Blocks
    # theano random number generator.
    t = T.floor(rng.uniform(size=(1, 1), low=1, high=self.trajectory_length,
                            dtype=theano.config.floatX))
    t_weights = self.get_t_weights(t)
    N = rng.normal(size=(n_images, self.n_colors, self.spatial_width,
                         self.spatial_width),
                   dtype=theano.config.floatX)

    # noise added this time step
    beta_forward = self.get_beta_forward(t)
    # decay in noise variance due to original signal this step
    alpha_forward = 1. - beta_forward
    # compute total decay in the fraction of the variance due to X_noiseless
    alpha_arr = 1. - self.beta_arr
    alpha_cum_forward_arr = T.extra_ops.cumprod(alpha_arr).reshape(
        (self.trajectory_length, 1))
    alpha_cum_forward = T.dot(t_weights.T, alpha_cum_forward_arr)
    # total fraction of the variance due to noise being mixed in
    beta_cumulative = 1. - alpha_cum_forward
    # total fraction of the variance due to noise being mixed in one step ago
    beta_cumulative_prior_step = 1. - alpha_cum_forward / alpha_forward

    # generate the corrupted training data
    X_uniformnoise = X_noiseless + \
        (rng.uniform(size=(n_images, self.n_colors, self.spatial_width,
                           self.spatial_width),
                     dtype=theano.config.floatX) -
         T.constant(0.5, dtype=theano.config.floatX)) * \
        T.constant(self.uniform_noise, dtype=theano.config.floatX)
    X_noisy = X_uniformnoise * T.sqrt(alpha_cum_forward) + \
        N * T.sqrt(1. - alpha_cum_forward)

    # compute the mean and covariance of the posterior distribution
    mu1_scl = T.sqrt(alpha_cum_forward / alpha_forward)
    mu2_scl = 1. / T.sqrt(alpha_forward)
    cov1 = 1. - alpha_cum_forward / alpha_forward
    cov2 = beta_forward / alpha_forward
    lam = 1. / cov1 + 1. / cov2
    mu = (X_uniformnoise * mu1_scl / cov1 + X_noisy * mu2_scl / cov2) / lam
    sigma = T.sqrt(1. / lam)
    sigma = sigma.reshape((1, 1, 1, 1))

    mu.name = 'mu q posterior'
    sigma.name = 'sigma q posterior'
    X_noisy.name = 'X_noisy'
    t.name = 't'

    return X_noisy, t, mu, sigma
def adadelta(lr, tparams, grads, x, mask, y, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost,
                                    updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    # parameter update pairs
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
def adadelta(lr, tparams, grads, inp, cost, extra_ups=[], extra_outs=[],
             exclude_params=set([])):
    '''Adadelta'''
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * np.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(
        inp, [cost] + extra_outs, updates=zgup + rg2up + extra_ups,
        profile=profile)

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tools.itemlist(tparams), updir)
                if p.name not in exclude_params]

    if not isinstance(lr, list):
        lr = [lr]
    f_update = theano.function(lr, [], updates=ru2up + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
def get_mu_sigma(self, X_noisy, t):
    """
    Generate mu and sigma for one step in the reverse trajectory,
    starting from a minibatch of images X_noisy, and at timestep t.
    """
    Z = self.mlp.apply(X_noisy)
    mu_coeff, beta_coeff = self.temporal_readout(Z, t)
    # reverse variance is perturbation around forward variance
    beta_forward = self.get_beta_forward(t)
    # make impact of beta_coeff scaled appropriately with mu_coeff
    beta_coeff_scaled = beta_coeff / np.sqrt(self.trajectory_length).astype(theano.config.floatX)
    beta_reverse = T.nnet.sigmoid(beta_coeff_scaled + util.logit(beta_forward))
    # # reverse mean is decay towards mu_coeff
    # mu = (X_noisy - mu_coeff)*T.sqrt(1. - beta_reverse) + mu_coeff
    # reverse mean is a perturbation around the mean under forward
    # process
    # # DEBUG -- use these lines to test objective is 0 for isotropic Gaussian model
    # beta_reverse = beta_forward
    # mu_coeff = mu_coeff*0
    mu = X_noisy * T.sqrt(1. - beta_forward) + mu_coeff * T.sqrt(beta_forward)
    sigma = T.sqrt(beta_reverse)
    mu.name = 'mu p'
    sigma.name = 'sigma p'
    return mu, sigma
def dev_loss(self, dev_types, dev_lams, ss_ratio, y):
    su_mask = ss_ratio * T.neq(y, 0).reshape((y.shape[0], 1))
    un_mask = T.eq(y, 0).reshape((y.shape[0], 1))
    ss_mask = su_mask + un_mask
    var_fun = lambda x1, x2: T.sum(((x1 - x2) * ss_mask)**2.0) / T.sum(ss_mask)
    tanh_fun = lambda x1, x2: var_fun(T.tanh(x1), T.tanh(x2))
    norm_fun = lambda x1, x2: var_fun(
        (x1 / T.sqrt(T.sum(x1**2.0, axis=1, keepdims=1) + 1e-6)),
        (x2 / T.sqrt(T.sum(x2**2.0, axis=1, keepdims=1) + 1e-6)))
    sigm_fun = lambda x1, x2: var_fun(T.nnet.sigmoid(x1), T.nnet.sigmoid(x2))
    cent_fun = lambda xt, xo: T.sum(T.nnet.binary_crossentropy(
        T.nnet.sigmoid(xo), T.nnet.sigmoid(xt))) / xt.shape[0]
    L = 0.0
    for i in xrange(self.layer_count):
        if (i < (self.layer_count - 1)):
            x1 = self.layers[i].output
            x2 = self.drop_nets[0][i].output
        else:
            x1 = self.layers[i].linear_output
            x2 = self.drop_nets[0][i].linear_output
        if (dev_types[i] == 1):
            L = L + (dev_lams[i] * norm_fun(x1, x2))
        elif (dev_types[i] == 2):
            L = L + (dev_lams[i] * tanh_fun(x1, x2))
        elif (dev_types[i] == 3):
            L = L + (dev_lams[i] * sigm_fun(x1, x2))
        elif (dev_types[i] == 4):
            L = L + (dev_lams[i] * cent_fun(x1, x2))
        else:
            L = L + (dev_lams[i] * var_fun(x1, x2))
    return L
def batchnorm(X, rescale=None, reshift=None, u=None, s=None, e=1e-8):
    """
    batchnorm with support for not using scale and shift parameters
    as well as inference values (u and s) and partial batchnorm (via
    rescale/reshift)
    will detect and use convolutional or fully connected version
    """
    g = rescale
    b = reshift
    if X.ndim == 4:
        if u is not None and s is not None:
            # use normalization params given a priori
            b_u = u.dimshuffle('x', 0, 'x', 'x')
            b_s = s.dimshuffle('x', 0, 'x', 'x')
        else:
            # compute normalization params from input
            b_u = T.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        # batch normalize
        X = (X - b_u) / T.sqrt(b_s + e)
        if g is not None and b is not None:
            # apply rescale and reshift
            X = X * T.exp(0.2 * g.dimshuffle('x', 0, 'x', 'x')) + \
                b.dimshuffle('x', 0, 'x', 'x')
    elif X.ndim == 2:
        if u is None and s is None:
            # compute normalization params from input
            u = T.mean(X, axis=0)
            s = T.mean(T.sqr(X - u), axis=0)
        # batch normalize
        X = (X - u) / T.sqrt(s + e)
        if g is not None and b is not None:
            # apply rescale and reshift
            X = X * T.exp(0.2 * g) + b
    else:
        raise NotImplementedError
    return X
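# A minimal usage sketch for batchnorm. conv_out is assumed to be a
# (batch, channels, rows, cols) activation tensor, and g, b, avg_mean,
# avg_var are assumed per-channel shared variables maintained elsewhere.
h = batchnorm(conv_out, rescale=g, reshift=b)       # training: batch statistics

# at inference time, normalize with precomputed running statistics instead
h_test = batchnorm(conv_out, rescale=g, reshift=b, u=avg_mean, s=avg_var)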
def sample_v_given_hs(self, h_sample, s_sample, rng=None, size=None):
    """
    Generates sample from p(v | h, s)
    """
    v_mean = self.v_given_hs(h_sample, s_sample)

    rng = self.theano_rng if rng is None else rng
    size = size if size else self.batch_size
    if self.flags['truncate_v']:
        v_sample = truncated.truncated_normal(
            size=(size, self.n_v),
            avg=v_mean,
            std=T.sqrt(1. / self.lambd_prec),
            lbound=-self.truncation_bound['v'],
            ubound=self.truncation_bound['v'],
            theano_rng=rng,
            dtype=floatX)
    else:
        v_sample = rng.normal(
            size=(size, self.n_v),
            avg=v_mean,
            std=T.sqrt(1. / self.lambd_prec),
            dtype=floatX)
    return v_sample
def adadelta(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup + rg2up,
                                    profile=profile)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
def updates(self, cost, params, learning_rate=0.1, momentum=0.95,
            rescale=5.):
    grads = T.grad(cost, params)
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)
    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1e-4
    updates = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        # replace non-finite gradients with a small pull towards zero,
        # otherwise rescale so the global norm is capped at `rescale`
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))
        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = self.memory_[n]
        update = momentum * memory - learning_rate * grad / rms_grad
        update2 = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms_grad
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def sample_s_given_ghv(self, g_sample, h_sample, v_sample, rng=None, size=None):
    """
    Generates sample from p(s | g, h, v)
    """
    s_mean = self.s_given_ghv(g_sample, h_sample, v_sample)

    rng = self.theano_rng if rng is None else rng
    size = size if size else self.batch_size
    if self.flags['truncate_s']:
        s_sample = truncated.truncated_normal(
            size=(size, self.n_s),
            avg=s_mean,
            std=T.sqrt(1. / self.alpha_prec),
            lbound=-self.truncation_bound['s'],
            ubound=self.truncation_bound['s'],
            theano_rng=rng,
            dtype=floatX)
    else:
        s_sample = rng.normal(
            size=(size, self.n_s),
            avg=s_mean,
            std=T.sqrt(1. / self.alpha_prec),
            dtype=floatX)
    return s_sample
def sgd_updates_adadelta(params, cost, rho=0.95, epsilon=1e-6, norm_lim=9,
                         word_vec_name='embedding'):
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = []
    for param in params:
        empty = np.zeros_like(param.get_value())
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),
                                             name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = theano.shared(value=as_floatX(empty),
                                           name="exp_ups_%s" % param.name)
        gparams.append(gp)
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
def __init__(self, vocab_size, dim, lr=0.5):
    W = np.asarray(np.random.rand(vocab_size, dim),
                   dtype=theano.config.floatX) / float(dim)
    W1 = np.asarray((np.random.rand(vocab_size, dim)),
                    dtype=theano.config.floatX) / float(dim)
    self.W = theano.shared(W, name='W', borrow=True)
    self.W1 = theano.shared(W1, name='W1', borrow=True)
    # adagrad accumulators, initialized to one
    gW = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX)
    gW1 = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX)
    self.gW = theano.shared(gW, name='gW', borrow=True)
    self.gW1 = theano.shared(gW1, name='gW1', borrow=True)

    X = T.vector()
    fX = T.vector()
    ind_W = T.ivector()
    ind_W1 = T.ivector()

    w = self.W[ind_W, :]
    w1 = self.W1[ind_W1, :]

    # weighted least-squares cost on the selected embedding rows
    cost = T.sum(fX * ((T.sum(w * w1, axis=1) - X) ** 2))
    # gradients w.r.t. the selected rows, clipped to [-5, 5]
    grad = T.clip(T.grad(cost, [w, w1]), -5.0, 5.0)
    updates1 = [(self.gW, T.inc_subtensor(self.gW[ind_W, :], grad[0] ** 2))]
    updates2 = [(self.gW1, T.inc_subtensor(self.gW1[ind_W1, :], grad[1] ** 2))]
    updates3 = [(self.W, T.inc_subtensor(self.W[ind_W, :],
                                         - (lr / T.sqrt(self.gW[ind_W, :])) * grad[0]))]
    updates4 = [(self.W1, T.inc_subtensor(self.W1[ind_W1, :],
                                          - (lr / T.sqrt(self.gW1[ind_W1, :])) * grad[1]))]
    updates = updates1 + updates2 + updates3 + updates4

    self.cost_fn = theano.function(
        inputs=[ind_W, ind_W1, X, fX],
        outputs=cost,
        updates=updates)
def get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=9,
                         word_vec_name='W_emb'):
    """
    adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)
    """
    print "Generating adadelta updates"
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = []
    for param in params:
        exp_sqr_grads[param] = build_shared_zeros(param.shape.eval(),
                                                  name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = build_shared_zeros(param.shape.eval(),
                                                name="exp_ups_%s" % param.name)
        gparams.append(gp)

    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step = -(T.sqrt(exp_su + eps) / T.sqrt(up_exp_sg + eps)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        # if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
        if max_norm and param.name != word_vec_name:
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(max_norm))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
def _get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=9,
                          word_vec_name='W_emb'):
    print "Generating adadelta updates (implementation from dnn)"
    # compute gradients and allocate the accumulators
    gparams = T.grad(cost, params)
    accugrads, accudeltas = [], []
    for param in params:
        accugrads.append(build_shared_zeros(param.shape.eval(), 'accugrad'))
        accudeltas.append(build_shared_zeros(param.shape.eval(), 'accudelta'))

    # compute list of weights updates
    updates = OrderedDict()
    for accugrad, accudelta, param, gparam in zip(accugrads, accudeltas,
                                                  params, gparams):
        # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
        agrad = rho * accugrad + (1 - rho) * gparam * gparam
        dx = - T.sqrt((accudelta + eps) / (agrad + eps)) * gparam
        updates[accudelta] = (rho * accudelta + (1 - rho) * dx * dx)
        if (max_norm > 0) and param.ndim == 2 and param.name != word_vec_name:
            W = param + dx
            col_norms = W.norm(2, axis=0)
            desired_norms = T.clip(col_norms, 0, T.sqrt(max_norm))
            updates[param] = W * (desired_norms / (1e-7 + col_norms))
        else:
            updates[param] = param + dx
        updates[accugrad] = agrad
    return updates
def adadelta(parameters, gradients, rho=0.95, eps=1e-6):
    """
    adadelta : training algorithm
    """
    # create variables to store intermediate updates
    gradients_sq = [theano.shared(numpy.zeros(p.get_value().shape,
                                              dtype=theano.config.floatX))
                    for p in parameters]
    deltas_sq = [theano.shared(numpy.zeros(p.get_value().shape,
                                           dtype=theano.config.floatX))
                 for p in parameters]

    # calculate the new "average" delta for the next iteration
    gradients_sq_new = [rho * g_sq + (1 - rho) * (g ** 2)
                        for g_sq, g in izip(gradients_sq, gradients)]

    # calculate the step in direction. The square root is an approximation
    # to getting the RMS for the average value
    deltas = [(T.sqrt(d_sq + eps) / T.sqrt(g_sq + eps)) * grad
              for d_sq, g_sq, grad in izip(deltas_sq, gradients_sq_new,
                                           gradients)]

    # calculate the new "average" deltas for the next step
    deltas_sq_new = [rho * d_sq + (1 - rho) * (d ** 2)
                     for d_sq, d in izip(deltas_sq, deltas)]

    # prepare the update pairs as lists
    gradient_sq_updates = zip(gradients_sq, gradients_sq_new)
    deltas_sq_updates = zip(deltas_sq, deltas_sq_new)
    parameters_updates = [(p, T.clip(p - d, -15, 15))
                          for p, d in izip(parameters, deltas)]
    return gradient_sq_updates + deltas_sq_updates + parameters_updates
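# Unlike the paired f_grad_shared/f_update optimizers above, this variant
# returns plain update pairs. A minimal compilation sketch, assuming cost,
# parameters and the symbolic inputs x, y come from the surrounding model:
grads = T.grad(cost, parameters)
train_fn = theano.function([x, y], cost,
                           updates=adadelta(parameters, grads))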
def __init__(self, incoming, b=lasagne.init.Constant(0.),
             g=lasagne.init.Constant(1.), W=lasagne.init.Normal(0.05),
             train_g=False, init_stdv=1., nonlinearity=relu, **kwargs):
    super(WeightNormLayer, self).__init__(incoming, **kwargs)
    self.nonlinearity = nonlinearity
    self.init_stdv = init_stdv
    k = self.input_shape[1]
    if b is not None:
        self.b = self.add_param(b, (k,), name="b", regularizable=False)
    if g is not None:
        self.g = self.add_param(g, (k,), name="g", regularizable=False,
                                trainable=train_g)
    if len(self.input_shape) == 4:
        self.axes_to_sum = (0, 2, 3)
        self.dimshuffle_args = ['x', 0, 'x', 'x']
    else:
        self.axes_to_sum = 0
        self.dimshuffle_args = ['x', 0]

    # scale weights in layer below
    incoming.W_param = incoming.W
    #incoming.W_param.set_value(W.sample(incoming.W_param.get_value().shape))
    if incoming.W_param.ndim == 4:
        if isinstance(incoming, Deconv2DLayer):
            W_axes_to_sum = (0, 2, 3)
            W_dimshuffle_args = ['x', 0, 'x', 'x']
        else:
            W_axes_to_sum = (1, 2, 3)
            W_dimshuffle_args = [0, 'x', 'x', 'x']
    else:
        W_axes_to_sum = 0
        W_dimshuffle_args = ['x', 0]
    if g is not None:
        incoming.W = incoming.W_param * \
            (self.g / T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),
                                          axis=W_axes_to_sum))).dimshuffle(*W_dimshuffle_args)
    else:
        incoming.W = incoming.W_param / \
            T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),
                                axis=W_axes_to_sum, keepdims=True))
def _get_model_updates(self):
    alpha = self.params['optimizer/learning_rate']

    updates = dict()
    for name, param in self.network.params.items():
        gradient = self.params[name + '_gradient']
        ms_gradient = self.params[name + '_mean_sqr_gradient']
        ms_velocity = self.params[name + '_mean_sqr_velocity']
        # rms_velocity quantity lags behind rms_gradient by 1 time step,
        # due to the recurrence relationship for velocity.
        rms_gradient = tensor.sqrt(ms_gradient + self._epsilon)
        rms_velocity = tensor.sqrt(ms_velocity + self._epsilon)
        velocity = -gradient * rms_velocity / rms_gradient
        updates[name] = velocity
    self._normalize(updates)

    result = []
    for name, param in self.network.params.items():
        update = updates[name]
        ms_velocity = self.params[name + '_mean_sqr_velocity']
        ms_velocity_new = self._gamma * ms_velocity + \
            (1.0 - self._gamma) * tensor.sqr(update)
        param_new = param + alpha * update
        result.append((ms_velocity, ms_velocity_new))
        result.append((param, param_new))
    return result
def ADAMopt(self, tVars, loss, lr, momentum=0):
    i = T.iscalar('i')
    lr = T.fscalar('lr')
    grads = T.grad(loss, tVars)
    '''ADAM Code from
    https://github.com/danfischetti/deep-recurrent-attentive-writer/blob/master/DRAW/adam.py
    '''
    # The hyperparameters below were undefined in the original snippet.
    # Assumed values are used here; note that in this snippet's convention
    # beta1 and beta2 play the role of (1 - beta1) and (1 - beta2) from the
    # Adam paper, and l is the first-moment decay schedule.
    beta1 = 0.1
    beta2 = 0.001
    epsilon = 1e-8
    l = 1 - 1e-8
    self.m = [theano.shared(name='m',
                            value=np.zeros(param.get_value().shape,
                                           dtype=theano.config.floatX))
              for param in tVars]
    self.v = [theano.shared(name='v',
                            value=np.zeros(param.get_value().shape,
                                           dtype=theano.config.floatX))
              for param in tVars]
    self.t = theano.shared(name='t',
                           value=np.asarray(1).astype(theano.config.floatX))
    updates = [(self.t, self.t + 1)]

    for param, gparam, m, v in zip(tVars, grads, self.m, self.v):
        b1_t = 1 - (1 - beta1) * (l ** (self.t - 1))
        m_t = b1_t * gparam + (1 - b1_t) * m
        updates.append((m, m_t))
        v_t = beta2 * (gparam ** 2) + (1 - beta2) * v
        updates.append((v, v_t))
        m_t_bias = m_t / (1 - (1 - beta1) ** self.t)
        v_t_bias = v_t / (1 - (1 - beta2) ** self.t)
        if param.get_value().ndim == 1:
            # bias vectors get a larger step size
            updates.append((param,
                            param - 5 * lr * m_t_bias / (T.sqrt(v_t_bias) + epsilon)))
        else:
            updates.append((param,
                            param - lr * m_t_bias / (T.sqrt(v_t_bias) + epsilon)))
    return theano.function([], loss, updates=updates)
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    accumulators = [shared_zeros(p.get_value().shape) for p in params]
    delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
    self.updates = []

    n_step = theano.shared(1.0)
    self.updates.append((n_step, n_step + 1))

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
        # gradient noise whose variance anneals with the step count
        g_noise = self.rng.normal(p.shape, 0, T.sqrt(n_step ** -0.55),
                                  dtype='float32')
        g_deviated = g + g_noise

        new_a = self.rho * a + (1 - self.rho) * g_deviated ** 2  # update accumulator
        self.updates.append((a, new_a))

        # use the new accumulator and the *old* delta_accumulator
        update = g_deviated * T.sqrt(d_a + self.epsilon) / \
            T.sqrt(new_a + self.epsilon)

        new_p = p - self.lr * update
        self.updates.append((p, new_p))

        # update delta_accumulator
        new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2
        self.updates.append((d_a, new_d_a))
    return self.updates
def get_updates(self, params, loss, **kwargs):
    grads = self.get_gradients(loss, params, **kwargs)
    self.updates = [(self.iterations, self.iterations + 1.)]
    t = self.iterations + 1
    lr_t = self.lr * T.sqrt(1 - self.beta_2**t) / (1 - self.beta_1**t)
    # n_step = theano.shared(1.0)
    # self.updates.append((n_step, n_step + 1))
    gradients = []
    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)  # zero init of moment
        v = theano.shared(p.get_value() * 0.)  # zero init of velocity
        # g_noise = self.rng.normal(g.shape, 0, T.sqrt(0.5 * n_step ** -0.55),
        #                           dtype='float32')
        # g_deviated = g + g_noise
        g_deviated = g  # for debug purposes
        gradients.append(g)
        m_t = (self.beta_1 * m) + (1 - self.beta_1) * g_deviated
        v_t = (self.beta_2 * v) + (1 - self.beta_2) * (g_deviated**2)
        p_t = p - lr_t * m_t / (T.sqrt(v_t) + self.epsilon)
        self.updates.append((m, m_t))
        self.updates.append((v, v_t))
        self.updates.append((p, p_t))
    # apply constraints
    return self.updates, gradients
def AdadeltaUpdate(params, cost, stepSize=1.0, rho=0.95, epsilon=1e-6,
                   norm_lim=9):
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_update = OrderedDict({})
    g_params = []
    for param in params:
        empty = np.zeros_like(param.get_value())
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),
                                             name='exp_grad_%s' % param.name)
        exp_sqr_update[param] = theano.shared(value=as_floatX(empty),
                                              name='exp_ups_%s' % param.name)
        gp = T.grad(cost, param)
        g_params.append(gp)
    for param, gp in zip(params, g_params):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_update[param]
        # running average of squared gradients
        update_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = update_exp_sg
        step = -(T.sqrt(exp_su + epsilon) / T.sqrt(update_exp_sg + epsilon)) * gp
        stepped_param = param + step * stepSize
        update_exp_su = rho * exp_su + (1 - rho) * T.sqr(step)
        updates[exp_su] = update_exp_su
        if param.get_value(borrow=True).ndim == 2 and param.name != 'wordVec':
            # clip column norms to sqrt(norm_lim)
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
def forward(self, input_org, train=True, update_batch_stat=True,
            finetune=False):
    print "Layer/BatchNormalization"
    ldim, cdim, rdim = self._internal_shape(input_org)
    input = input_org.reshape((ldim, cdim, rdim))
    if train:
        mean = T.mean(input, axis=(0, 2), keepdims=True)
        var = T.mean((input - mean)**2, axis=(0, 2), keepdims=True)

        if update_batch_stat:
            finetune_N = theano.clone(self.finetune_N, share_inputs=False)
            if finetune:
                finetune_N.default_update = finetune_N + 1
                ratio = T.cast(1 - 1.0 / (finetune_N + 1), theano.config.floatX)
            else:
                finetune_N.default_update = 0
                ratio = self.moving_avg_ratio
            m = ldim * rdim
            scale = T.cast(m / (m - 1.0), theano.config.floatX)
            est_mean = theano.clone(self.est_mean, share_inputs=False)
            est_var = theano.clone(self.est_var, share_inputs=False)
            est_mean.default_update = T.cast(
                ratio * self.est_mean + (1 - ratio) * mean,
                theano.config.floatX)
            est_var.default_update = T.cast(
                ratio * self.est_var + (1 - ratio) * scale * var,
                theano.config.floatX)
            # tie the estimators into the graph so their default updates run
            mean += 0 * est_mean
            var += 0 * est_var

        output = self._pbc(self.gamma) * (input - self._pbc(mean)) \
            / T.sqrt(1e-6 + self._pbc(var)) + self._pbc(self.beta)
    else:
        output = self._pbc(self.gamma) * (input - self._pbc(self.est_mean)) \
            / T.sqrt(1e-6 + self._pbc(self.est_var)) + self._pbc(self.beta)
    return output.reshape(input_org.shape)
def applyConstraint(self, param):
    if param.ndim != 4 and param.ndim != 2:
        warnings.warn("Norm constraints are normally applied to matrices "
                      "or 4-dimensional tensors, but currently got "
                      "%d dimensions, please make sure this is the desired "
                      "parameter to apply norm constraints" % param.ndim)

    needFlip = False
    if param.ndim == 4:  # a hack for conv layer filters
        prevShape = param.shape
        # conv layer filter shape is (nChannelOut, nChannelIn, r, c)
        param = param.flatten(2)
        # now it is (nout, nin), which is different from (nin, nout)
        # from fully connected networks, so need to flip here
        needFlip = True

    if needFlip:
        col_norm = T.sqrt(T.sum(T.sqr(param), axis=1, keepdims=True))
    else:
        col_norm = T.sqrt(T.sum(T.sqr(param), axis=0, keepdims=True))

    param /= (col_norm + 1e-7)
    param *= self.norm

    if needFlip:
        param = param.reshape(prevShape)

    return param
def make_functions(inputs, outputs, params, grads, lr):
    shapes = [p.get_value().shape for p in params]
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]
    count = theano.shared(np.float32(0))
    acc_update = [(a, a + g) for a, g in zip(acc_grads, grads)] + \
                 [(count, count + 1.)]

    # deltas = acc_grads
    deltas = [ag / count for ag in acc_grads]
    grads_norms = [T.sqrt(T.sum(g**2)) for g in deltas]
    deltas = [T.switch(T.gt(n, 1.), 1. * g / n, g)
              for n, g in zip(grads_norms, deltas)]

    # param_update = [(p, p - lr * g) for p, g in zip(params, deltas)]
    param_update = updates.adadelta(params, deltas, learning_rate=lr)
    # ,learning_rate=lr,rho=np.float32(0.95)

    clear_update = [(a, np.zeros(s, dtype=np.float32))
                    for a, s in zip(acc_grads, shapes)] + [(count, 0)]
    acc = theano.function(
        inputs=inputs,
        outputs=[outputs, output_ans[ans_lbl]],
        updates=acc_update,
        on_unused_input='warn',
        # mode=theano.compile.MonitorMode(post_func=detect_nan)
    )

    update = theano.function(
        inputs=[lr],
        updates=param_update + clear_update,
        outputs=[T.sqrt(T.sum(T.sqr(w))) for w in deltas],
        on_unused_input='warn',
        # mode=theano.compile.MonitorMode(post_func=detect_nan)
    )
    return acc, update
def get_updates(self, cost, params):
    grads = self.get_gradients(cost, params)

    updates = []
    if self.i is None:
        self.i = sharedasarray(0)
    updates.append((self.i, self.i + 1))

    t = self.i + 1
    lr_t = self.lr * T.sqrt(1 - self.beta2**t) / (1 - self.beta1**t)
    eps_hat = self.eps * T.sqrt(1 - self.beta2**t)

    if self.ms is None:
        self.ms = [sharedzeros(p.get_value().shape) for p in params]
    if self.vs is None:
        self.vs = [sharedzeros(p.get_value().shape) for p in params]

    for p, g, m, v in zip(params, grads, self.ms, self.vs):
        m_t = (self.beta1 * m) + (1. - self.beta1) * g
        v_t = (self.beta2 * v) + (1. - self.beta2) * (g**2)
        p_t = p - lr_t * m_t / (T.sqrt(v_t) + eps_hat)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    return updates
def quantization(W, Wacc, method, Wb):
    if method == "FPN":
        Wb = W

    elif method == "LAB":
        L = (T.sqrt(Wacc) + 1e-8)
        Wb = hard_sigmoid(W)
        Wb = round3(Wb)
        Wb = T.cast(T.switch(Wb, 1., -1.), theano.config.floatX)
        alpha = (T.abs_(L * W).sum() / L.sum()).astype('float32')
        Wb = alpha * Wb

    elif method == "LATa":
        D = (T.sqrt(Wacc) + 1e-8)
        b = T.sgn(Wb)
        # compute the threshold, converge within 10 iterations
        alpha = (T.abs_(b * D * W).sum() / T.abs_(b * D).sum()).astype('float32')
        b = T.switch(T.gt(W / alpha, 0.5), 1.,
                     T.switch(T.lt(W / alpha, -0.5), -1., 0.))

        def OneStep(alpha, b):
            # minimize alpha
            alpha_new = (T.abs_(b * D * W).sum() / T.abs_(b * D).sum()).astype('float32')
            # minimize b
            b_new = T.switch(T.gt(W / alpha_new, 0.5), 1.,
                             T.switch(T.lt(W / alpha_new, -0.5), -1., 0.))
            delta = T.abs_(alpha_new - alpha)
            condition = T.lt(delta, 1e-6)
            return [alpha_new, b_new], theano.scan_module.until(condition)

        [out1, out2], updates = theano.scan(fn=OneStep,
                                            outputs_info=[alpha, b],
                                            n_steps=10)
        Wb = out1[-1] * out2[-1]

    elif method == "LATe":
        D = (T.sqrt(Wacc) + 1e-8)
        thres = findalpha(D, W)
        alpha = thres * 2
        Wt = T.switch(T.gt(W, thres), 1.,
                      T.switch(T.lt(W, -thres), -1., 0.))
        Wb = alpha * Wt

    elif method == "LAT2e":
        D = (T.sqrt(Wacc) + 1e-8)
        thres1, thres2 = findalpha2(D, W)
        alpha1 = thres1 * 2
        Wt1 = T.switch(T.gt(W, thres1), 1., 0.)
        alpha2 = thres2 * 2
        Wt2 = T.switch(T.lt(W, -thres2), -1., 0.)
        Wb = alpha1 * Wt1 + alpha2 * Wt2

    elif method == "LAT2a":
        D = (T.sqrt(Wacc) + 1e-8)
        b1 = T.ge(Wb, 0)
        alpha1 = (T.abs_(b1 * D * W).sum() / T.abs_(b1 * D).sum()).astype('float32')
        b1 = T.switch(T.gt(W / alpha1, 0.5), 1., 0.)
        # Wb1 = alpha1*mask1*Wb
        b2 = T.lt(Wb, 0)
        alpha2 = (T.abs_(b2 * D * W).sum() / T.abs_(b2 * D).sum()).astype('float32')
        b2 = T.switch(T.lt(W / alpha2, -0.5), -1., 0.)

        def OneStep(alpha1, b1, alpha2, b2):
            alpha1_new = (T.abs_(b1 * D * W).sum() / T.abs_(b1 * D).sum()).astype('float32')
            b1_new = T.switch(T.gt(W / alpha1_new, 0.5), 1., 0.)
            alpha2_new = (T.abs_(b2 * D * W).sum() / T.abs_(b2 * D).sum()).astype('float32')
            b2_new = T.switch(T.lt(W / alpha2_new, -0.5), -1., 0.)
            delta1 = T.abs_(alpha1_new - alpha1)
            delta2 = T.abs_(alpha2_new - alpha2)
            # both thresholds must converge; combine the tests symbolically
            condition = T.and_(T.lt(delta1, 1e-6), T.lt(delta2, 1e-6))
            return [alpha1_new, b1_new, alpha2_new, b2_new], \
                theano.scan_module.until(condition)

        [out1, out2, out3, out4], updates = theano.scan(
            fn=OneStep, outputs_info=[alpha1, b1, alpha2, b2], n_steps=10)
        Wb = out1[-1] * out2[-1] + out3[-1] * out4[-1]

    elif method == "LAQ_linear":
        D = (T.sqrt(Wacc) + 1e-8)
        b = T.sgn(Wb)
        alpha = (T.abs_(b * D * W).sum() / T.abs_(b * D).sum()).astype('float32')
        # b = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.))
        m = 3  # number of bits
        n = 2**(m - 1) - 1
        b = round3(T.clip(W / alpha, -1., 1.) * n) / (n)

        def OneStep(alpha, b):
            # minimize alpha
            alpha_new = (T.abs_(b * D * W).sum() / T.abs_(b * D).sum()).astype('float32')
            # minimize b
            # b_new = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.))
            b_new = round3(T.clip(W / alpha_new, -1., 1.) * n) / (n)
            delta = T.abs_(alpha_new - alpha)
            condition = T.lt(delta, 1e-6)
            return [alpha_new, b_new], theano.scan_module.until(condition)

        [out1, out2], updates = theano.scan(fn=OneStep,
                                            outputs_info=[alpha, b],
                                            n_steps=10)
        Wb = out1[-1] * out2[-1]

    elif method == "LAQ_log":
        D = (T.sqrt(Wacc) + 1e-8)
        b = T.sgn(Wb)
        alpha = (T.abs_(b * D * W).sum() / T.abs_(b * D).sum()).astype('float32')
        m = 3  # number of bits
        n = 2**(m - 1) - 1
        tmp = T.clip(W / alpha, -1., 1.)
        # log2(1/2*(2^(-n)+2^(-(n+1)))) - (-n-(n+1))/2 = 0.0849625
        b = T.switch(T.ge(tmp, pow(2, -n)),
                     T.pow(2, round3(T.log2(tmp) - 0.0849625)),
                     T.switch(T.le(tmp, -pow(2, -n)),
                              -T.pow(2, round3(T.log2(-tmp) - 0.0849625)),
                              0.))
        b = T.switch(T.ge(b, pow(2, -(n - 1))), b,
                     T.switch(T.le(b, -pow(2, -(n - 1))), b,
                              T.sgn(b) * pow(2, -(n - 1))))

        def OneStep(alpha, b):
            # minimize alpha
            alpha_new = (T.abs_(b * D * W).sum() / T.abs_(b * D).sum()).astype('float32')
            # minimize b
            tmp_new = T.clip(W / alpha_new, -1., 1.)
            b_new = T.switch(T.ge(tmp_new, pow(2, -n)),
                             T.pow(2, round3(T.log2(tmp_new) - 0.0849625)),
                             T.switch(T.le(tmp_new, -pow(2, -n)),
                                      -T.pow(2, round3(T.log2(-tmp_new) - 0.0849625)),
                                      0.))
            b_new = T.switch(T.ge(b_new, pow(2, -(n - 1))), b_new,
                             T.switch(T.le(b_new, -pow(2, -(n - 1))), b_new,
                                      T.sgn(b_new) * pow(2, -(n - 1))))
            delta = T.abs_(alpha_new - alpha)
            condition = T.lt(delta, 1e-6)
            return [alpha_new, b_new], theano.scan_module.until(condition)

        [out1, out2], updates = theano.scan(fn=OneStep,
                                            outputs_info=[alpha, b],
                                            n_steps=10)
        Wb = out1[-1] * out2[-1]

    return Wb
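# A hypothetical call site for the quantizer above: W is a weight matrix,
# Wacc its accumulated squared gradient (used as the loss-aware scaling D),
# and Wb the previous quantized value, here re-quantized with the "LAB"
# (loss-aware binarization) branch:
Wb_new = quantization(W, Wacc, "LAB", Wb)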
def std_cdf(x):
    """
    Calculates the standard normal cumulative distribution function.
    """
    # erf and sqrt are assumed imported from theano.tensor
    return 0.5 + 0.5 * erf(x / sqrt(2.))
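# A quick sanity check of the identity Phi(x) = 0.5*(1 + erf(x/sqrt(2))),
# sketched under the assumption that std_cdf operates on Theano tensors:
x = T.vector('x')
cdf = theano.function([x], std_cdf(x))
print cdf(np.asarray([0., 1.96], dtype=theano.config.floatX))  # ~[0.5, 0.975]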
def create_optimization_updates(cost, params, updates=None, max_norm=5.0,
                                lr=0.01, eps=1e-6, rho=0.95,
                                method="adadelta", gradients=None):
    """
    Get the updates for a gradient descent optimizer using
    SGD, AdaDelta, or AdaGrad.

    Returns the shared variables for the gradient caches,
    and the updates dictionary for compilation by a
    theano function.

    Inputs
    ------
    cost     theano variable : what to minimize
    params   list            : list of theano variables
                               with respect to which
                               the gradient is taken.
    max_norm float           : cap on excess gradients
    lr       float           : base learning rate for
                               adagrad and SGD
    eps      float           : numerical stability value
                               to not divide by zero
                               sometimes
    rho      float           : adadelta hyperparameter.
    method   str             : 'adagrad', 'adadelta', or 'sgd'.

    Outputs:
    --------
    updates  OrderedDict   : the updates to pass to a
                             theano function
    gsums    list          : gradient caches for Adagrad
                             and Adadelta
    xsums    list          : gradient caches for AdaDelta only
    lr       theano shared : learning rate
    max_norm theano_shared : normalizing clipping value for
                             excessive gradients (exploding).
    """
    lr = theano.shared(np.float64(lr).astype(theano.config.floatX))
    eps = np.float64(eps).astype(theano.config.floatX)
    rho = theano.shared(np.float64(rho).astype(theano.config.floatX))
    if max_norm is not None and max_norm is not False:
        max_norm = theano.shared(
            np.float64(max_norm).astype(theano.config.floatX))

    gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True)))
             if (method == 'adadelta' or method == 'adagrad') else None
             for param in params]
    xsums = [theano.shared(np.zeros_like(param.get_value(borrow=True)))
             if method == 'adadelta' else None
             for param in params]

    gparams = T.grad(cost, params) if gradients is None else gradients

    if updates is None:
        updates = OrderedDict()

    for gparam, param, gsum, xsum in zip(gparams, params, gsums, xsums):
        # clip gradients if they get too big
        if max_norm is not None and max_norm is not False:
            grad_norm = gparam.norm(L=2)
            gparam = (T.minimum(max_norm, grad_norm) / (grad_norm + eps)) * gparam

        if method == 'adadelta':
            updates[gsum] = T.cast(rho * gsum + (1. - rho) * (gparam**2),
                                   theano.config.floatX)
            dparam = -T.sqrt((xsum + eps) / (updates[gsum] + eps)) * gparam
            updates[xsum] = T.cast(rho * xsum + (1. - rho) * (dparam**2),
                                   theano.config.floatX)
            updates[param] = T.cast(param + dparam, theano.config.floatX)
        elif method == 'adagrad':
            updates[gsum] = T.cast(gsum + (gparam**2), theano.config.floatX)
            updates[param] = T.cast(
                param - lr * (gparam / (T.sqrt(updates[gsum] + eps))),
                theano.config.floatX)
        else:
            updates[param] = param - gparam * lr

    if method == 'adadelta':
        lr = rho

    return updates, gsums, xsums, lr, max_norm
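# A minimal sketch of driving create_optimization_updates; cost, params and
# the symbolic inputs x, y are assumed to come from the surrounding model:
updates, gsums, xsums, lr, max_norm = create_optimization_updates(
    cost, params, method='adagrad', lr=0.05)
train_fn = theano.function([x, y], cost, updates=updates)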
def test_net(classifier, num_classes, learning_rate, learning_rate_decay,
             squared_filter_length_limit, n_epochs, timeout, batch_size, x,
             mom_params, dropout, results_file_name, dataset, use_bias,
             random_seed, decay=True, momentum=True, L2=True, plot=False,
             return_classifier=False, augment_schedule=[]):

    [(train_set_x, train_set_y),
     (valid_set_x, valid_set_y),
     (test_set_x, test_set_y)] = dataset

    # extract the params for momentum
    mom_start = mom_params["start"]
    mom_end = mom_params["end"]
    mom_epoch_interval = mom_params["interval"]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    #n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    print '... building the model'

    index = T.lscalar()    # index to a [mini]batch
    epoch = T.scalar()
    y = T.ivector('y')     # the labels are presented as 1D vector of
                           # [int] labels

    rng = np.random.RandomState(random_seed)

    # Build the expression for the cost function.
    if L2:
        lamb = 0.00000001
        cost = classifier.negative_log_likelihood(y)
        dropout_cost = classifier.dropout_negative_log_likelihood(y)
        if use_bias:
            cost += lamb * sum([(classifier.params[i]**2).sum()
                                for i in range(0, len(classifier.params), 2)]) / 2 * batch_size
            dropout_cost += lamb * sum([(classifier.params[i]**2).sum()
                                        for i in range(0, len(classifier.params), 2)]) / 2 * batch_size
        else:
            cost += lamb * sum([(param**2).sum()
                                for param in classifier.params]) / 2 * batch_size
            dropout_cost += lamb * sum([(param**2).sum()
                                        for param in classifier.params]) / 2 * batch_size
    else:
        cost = classifier.negative_log_likelihood(y)
        dropout_cost = classifier.dropout_negative_log_likelihood(y)

    # Compile theano function for testing.
    test_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]},
            on_unused_input='ignore')

    softmax_predictions = theano.function(inputs=[index],
            outputs=classifier.p_y_given_x_(),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]},
            on_unused_input='ignore')

    test_labels = theano.function(inputs=[index],
            outputs=test_set_y[index * batch_size:(index + 1) * batch_size])

    #theano.printing.pydotprint(test_model, outfile="test_file.png",
    #        var_with_name_simple=True)

    # Compile theano function for validation.
    #validate_model = theano.function(inputs=[index],
    #        outputs=classifier.errors(y),
    #        givens={
    #            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
    #            y: valid_set_y[index * batch_size:(index + 1) * batch_size]},
    #        on_unused_input='ignore')
    #theano.printing.pydotprint(validate_model, outfile="validate_file.png",
    #        var_with_name_simple=True)

    # Compute gradients of the model wrt parameters
    gparams = []
    for param in classifier.params:
        # Use the right cost function here to train with or without dropout.
        gparam = T.grad(dropout_cost if dropout else cost, param)
        gparams.append(gparam)

    if momentum:
        print >> sys.stderr, ("Using momentum")
        # ... and allocate memory for momentum'd versions of the gradient
        gparams_mom = []
        for param in classifier.params:
            gparam_mom = theano.shared(
                np.zeros(param.get_value(borrow=True).shape,
                         dtype=theano.config.floatX))
            gparams_mom.append(gparam_mom)

        # Compute momentum for the current epoch
        mom = ifelse(epoch < mom_epoch_interval,
                     mom_start * (1.0 - epoch / mom_epoch_interval) +
                     mom_end * (epoch / mom_epoch_interval),
                     mom_end)

        # Update the step direction using momentum
        updates = OrderedDict()
        for gparam_mom, gparam in zip(gparams_mom, gparams):
            # Misha Denil's original version
            #updates[gparam_mom] = mom * gparam_mom + (1. - mom) * gparam
            # change the update rule to match Hinton's dropout paper
            updates[gparam_mom] = mom * gparam_mom - (1. - mom) * learning_rate * gparam

        # ... and take a step along that direction
        for param, gparam_mom in zip(classifier.params, gparams_mom):
            # Misha Denil's original version
            #stepped_param = param - learning_rate * updates[gparam_mom]
            # since we have included learning_rate in gparam_mom, we don't need it
            # here
            stepped_param = param + updates[gparam_mom]

            # This is a silly hack to constrain the norms of the rows of the weight
            # matrices. This just checks if there are two dimensions to the
            # parameter and constrains it if so... maybe this is a bit silly but it
            # should work for now.
            if param.get_value(borrow=True).ndim == 2:
                #squared_norms = T.sum(stepped_param**2, axis=1).reshape((stepped_param.shape[0],1))
                #scale = T.clip(T.sqrt(squared_filter_length_limit / squared_norms), 0., 1.)
                #updates[param] = stepped_param * scale
                # constrain the norms of the COLUMNs of the weight, according to
                # https://github.com/BVLC/caffe/issues/109
                col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
                desired_norms = T.clip(col_norms, 0,
                                       T.sqrt(squared_filter_length_limit))
                scale = desired_norms / (1e-7 + col_norms)
                updates[param] = stepped_param * scale
            else:
                updates[param] = stepped_param
    else:
        if L2:
            print >> sys.stderr, ("Using gradient descent with L2 regularization")
            updates = [(param_i, param_i - learning_rate * (grad_i + lamb * param_i / batch_size))
                       for param_i, grad_i in zip(classifier.params, gparams)]
        else:
            print >> sys.stderr, ("Using gradient descent")
            updates = [(param_i, param_i - learning_rate * grad_i)
                       for param_i, grad_i in zip(classifier.params, gparams)]

    # Compile theano function for training.  This returns the training cost and
    # updates the model parameters.
    output = dropout_cost if dropout else cost
    train_model = theano.function(inputs=[epoch, index], outputs=output,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]},
            on_unused_input='ignore')
    #theano.printing.pydotprint(train_model, outfile="train_file.png",
    #        var_with_name_simple=True)

    # Theano function to decay the learning rate, this is separate from the
    # training function because we only want to do this once each epoch instead
    # of after each minibatch.
    decay_learning_rate = theano.function(inputs=[], outputs=learning_rate,
            updates={learning_rate: learning_rate * learning_rate_decay})

    ###############
    # TRAIN MODEL #
    ###############

    print '... training'

    best_test_error = np.inf
    best_test_score = np.inf
    best_iter_test = 0
    test_score = 0.
    epoch_counter = 0
    start_time = time.clock()

    plot_training = []
    plot_test = []
    best_model = None   # saves the best model to be returned by function.
    #layer0_weights = None

    train_set_x_backup = train_set_x.get_value()
    while epoch_counter < n_epochs:
        # Train this epoch

        # augment the images
        if len(augment_schedule) > 0:
            opp = augment_schedule[epoch_counter % len(augment_schedule)]
            train_set_x_augment = copy.deepcopy(train_set_x_backup)
            print 'augmenting epoch with operation:', opp
            for i in range(len(train_set_x_augment)):
                # augment even images on even epochs and odd images on odd epochs.
                if (epoch_counter % 2 == 0 and i % 2 == 0) or \
                   (epoch_counter % 2 == 1 and i % 2 == 1):
                    img = train_set_x_augment[i]
                    img = (img * 256.0).astype(dtype=np.uint8)
                    if img.shape[2] == 1:  # if it's a one channel image
                        img = np.reshape(img, (img.shape[0], img.shape[1]))
                    img = augment(img, opp)
                    img = img.astype(dtype=np.float32) / 256.0
                    if len(img.shape) == 2:
                        img = np.reshape(img, (img.shape[0], img.shape[1], 1))
                    train_set_x_augment[i] = img
            train_set_x.set_value(train_set_x_augment)

        epoch_counter = epoch_counter + 1
        minibatch_avg_cost = 0
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost += train_model(epoch_counter, minibatch_index)
        plot_training.append(minibatch_avg_cost / n_train_batches)

        test_losses = [test_model(i) for i in xrange(n_test_batches)]
        this_test_error = np.mean(test_losses)
        plot_test.append(this_test_error)

        print "epoch {}, test error {}%, train error {}, learning_rate={}{}".format(
                epoch_counter, this_test_error * 100.0, plot_training[-1],
                learning_rate.get_value(borrow=True),
                " **" if this_test_error < best_test_error else "")
        #print 'predictions', test_softmax_predictions

        if this_test_error < best_test_error:
            best_test_error = this_test_error
            best_iter_test = epoch_counter
            test_softmax_predictions = [softmax_predictions(i)
                                        for i in xrange(n_test_batches)]
            test_labels_ = [test_labels(i) for i in xrange(n_test_batches)]
            #best_model_ = [param.get_value() for param in classifier.params]
            #best_model = cPickle.dumps(best_model_, protocol=cPickle.HIGHEST_PROTOCOL)  # doesn't work TODO TODO
            if return_classifier:
                best_model = cPickle.dumps(classifier.params,
                                           protocol=cPickle.HIGHEST_PROTOCOL)
            # TODO extract filter images.
            #layer0_weights = classifier.layer0.W.get_value()

        if (timeout is not None) and (epoch_counter - best_iter_test >= timeout):
            break

        if decay:
            new_learning_rate = decay_learning_rate()

    end_time = time.clock()
    print >> sys.stderr, (('Optimization complete. Best test score of %f %% '
                           'obtained at epoch %i') % (best_test_error * 100.,
                                                      best_iter_test))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    preds, lbls, cmc, roc = get_cmc_roc_data(test_softmax_predictions,
                                             test_labels_)

    test_set = test_set_x.get_value()
    misses = []
    hits = []
    for sample in range(len(preds)):
        pred = preds[sample]
        pred_ = zip(pred, range(len(pred)))
        pred_.sort(reverse=True)
        prediction = pred_[0][1]
        lbl = lbls[sample]
        if prediction != lbl:
            misses.append(test_set[sample])
        else:
            hits.append(test_set[sample])
    #for idx, img in enumerate(d):
    #    img_ = (img*256.0).astype(dtype=np.uint8)
    #    cv2.imwrite('missed_img'+str(idx)+'.jpg', img_)

    if plot:
        plot_cmc(cmc)
        plot_roc(roc)
        plot_training_error(plot_training, epoch_counter)
        plot_testing_error(plot_test, epoch_counter)

    return (best_model, hits, misses, roc, cmc, preds, lbls,
            plot_training, plot_test)
def batch_norm(self, h, dim, use_shift=True, use_std=True, use_sample=0.0,
               force_sample=False, index=None, sample_mean=None, gamma=None,
               beta=None):
    x = h
    if h.ndim == 3:
        if index is None:
            index = self.index
        x = h.reshape((h.shape[0] * h.shape[1], h.shape[2]))[(index.flatten() > 0).nonzero()]
    elif h.ndim == 4:  # index is sizes here
        assert index is not None
        x = h.reshape((h.shape[0] * h.shape[1] * h.shape[2], h.shape[3]))
        #x = x[(T.gt(x,numpy.float32(0))>0).nonzero()]
    mean = T.mean(x, axis=0)
    std = T.sqrt(T.mean((x - mean)**2, axis=0))
    if sample_mean is None:
        sample_mean = self.add_param(
            theano.shared(numpy.zeros((dim, ), 'float32'),
                          '%s_%s_mean' % (self.name, h.name)),
            custom_update=mean,
            custom_update_normalized=True)
    self.sample_mean = sample_mean
    sample_std = T.sqrt(T.mean((x - sample_mean)**2, axis=0))
    if not self.train_flag and not force_sample:
        use_sample = 1.0
    mean = T.constant(1. - use_sample, 'float32') * mean + \
        T.constant(use_sample, 'float32') * sample_mean
    std = T.constant(1. - use_sample, 'float32') * std + \
        T.constant(use_sample, 'float32') * sample_std
    if h.ndim == 3:
        mean = mean.dimshuffle('x', 'x', 0).repeat(h.shape[0], axis=0).repeat(h.shape[1], axis=1)
        std = std.dimshuffle('x', 'x', 0).repeat(h.shape[0], axis=0).repeat(h.shape[1], axis=1)
    elif h.ndim == 4:
        mean = mean.dimshuffle('x', 'x', 'x', 0).repeat(
            h.shape[0], axis=0).repeat(h.shape[1], axis=1).repeat(h.shape[2], axis=2)
        std = std.dimshuffle('x', 'x', 'x', 0).repeat(
            h.shape[0], axis=0).repeat(h.shape[1], axis=1).repeat(h.shape[2], axis=2)
    else:
        mean = mean.dimshuffle('x', 0).repeat(h.shape[0], axis=0)
        std = std.dimshuffle('x', 0).repeat(h.shape[0], axis=0)
    bn = (h - mean) / (std + numpy.float32(1e-10))
    if use_std:
        if gamma is None:
            gamma = self.add_param(
                self.shared(
                    numpy.zeros((dim, ), 'float32') + numpy.float32(0.1),
                    "%s_%s_gamma" % (self.name, h.name)))
        self.gamma = gamma
        if h.ndim == 3:
            bn *= gamma.dimshuffle('x', 'x', 0).repeat(h.shape[0], axis=0).repeat(h.shape[1], axis=1)
        elif h.ndim == 4:
            bn *= gamma.dimshuffle('x', 'x', 'x', 0).repeat(
                h.shape[0], axis=0).repeat(h.shape[1], axis=1).repeat(h.shape[2], axis=2)
        else:
            bn *= gamma.dimshuffle('x', 0).repeat(h.shape[0], axis=0)
    if use_shift:
        if beta is None:
            beta = self.add_param(
                self.shared(numpy.zeros((dim, ), 'float32'),
                            "%s_%s_beta" % (self.name, h.name)))
        self.beta = beta
        bn += beta
    return bn
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          patience=10,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=0.01,  # learning rate
          n_words_src=100000,  # source vocabulary size
          n_words=100000,  # target vocabulary size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          datasets=[
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'
          ],
          valid_datasets=['../data/dev/newstest2011.en.tok',
                          '../data/dev/newstest2011.fr.tok'],
          dictionaries=[
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'
          ],
          use_dropout=False,
          reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(datasets[0], datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_datasets[0], valid_datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c**2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words)
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score = gen_sample(tparams, f_init, f_next,
                                               x[:, jj][:, None],
                                               model_options, trng=trng, k=1,
                                               maxlen=30,
                                               stochastic=stochastic,
                                               argmax=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs, **params)

    return valid_err
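# A minimal sketch of how the train() entry point above is typically invoked.
# Every path and hyperparameter below is a placeholder, not the original
# experiment's setting; 'rmsprop' must name one of the optimizer functions in
# this file, since train() resolves it with eval(optimizer).
if __name__ == '__main__':
    valid_err = train(
        dim_word=512,
        dim=1024,
        max_epochs=30,
        optimizer='rmsprop',
        saveto='models/en-fr.npz',
        datasets=['data/train.en.tok', 'data/train.fr.tok'],
        valid_datasets=['data/dev.en.tok', 'data/dev.fr.tok'],
        dictionaries=['data/train.en.tok.pkl', 'data/train.fr.tok.pkl'],
        reload_=False)
    print 'Final validation cost:', valid_err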
def rmsprop(lr, tparams, grads, x, y, cost):
    """
    A variant of SGD that scales the step size by a running average of the
    recent gradient norms.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. parameters
    x : Theano variable
        Model inputs
    y : Theano variable
        Targets
    cost : Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, y], cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / T.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update
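# A minimal usage sketch for the two-function pattern above: one call runs
# forward/backward and stores gradients in shared storage, the other applies
# the parameter update, so the two phases can be timed separately. The tiny
# model here (W, x, y, cost) is hypothetical; only rmsprop() comes from this
# file, and numpy_floatX is re-derived under the assumption that it casts to
# theano.config.floatX.
from collections import OrderedDict
import numpy
import theano
import theano.tensor as T

def numpy_floatX(v):
    return numpy.asarray(v, dtype=theano.config.floatX)

W = theano.shared(numpy_floatX(numpy.random.randn(3, 2)), name='W')
tparams = OrderedDict([('W', W)])
x = T.matrix('x')
y = T.matrix('y')
cost = ((T.dot(x, W) - y) ** 2).mean()
grads = T.grad(cost, wrt=tparams.values())
lr = T.scalar('lr')

f_grad_shared, f_update = rmsprop(lr, tparams, grads, x, y, cost)
c = f_grad_shared(numpy_floatX(numpy.ones((4, 3))),
                  numpy_floatX(numpy.ones((4, 2))))
f_update(0.)  # lr is unused by this rmsprop; on_unused_input='ignore' allows it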
def adadelta(lr, tparams, grads, x, mask, y, cost):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. parameters
    x : Theano variable
        Model inputs
    mask : Theano variable
        Sequence mask
    y : Theano variable
        Targets
    cost : Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning Rate
       Method*, arXiv:1212.5701.
    """
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost,
                                    updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud**2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
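# Worth noting: AdaDelta derives its effective step size from the ratio of the
# two running averages, so the lr argument above never enters the graph;
# on_unused_input='ignore' is what lets theano.function([lr], ...) compile
# anyway. A plain-Python scalar walk-through of one step (hypothetical values,
# rho and eps matching the 0.95 / 1e-6 constants used above):
rho, eps = 0.95, 1e-6
Eg2, Edx2 = 0.0, 0.0   # running averages of g^2 and dx^2
g = 0.5                # current gradient

Eg2 = rho * Eg2 + (1 - rho) * g**2                        # accumulate gradient
dx = -((Edx2 + eps) ** 0.5) / ((Eg2 + eps) ** 0.5) * g    # RMS-ratio step
Edx2 = rho * Edx2 + (1 - rho) * dx**2                     # accumulate update
# dx carries the units of the parameter rather than of the gradient, which is
# the point of tracking Edx2 in the paper.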
def __init__(self, input, n_in, n_hidden, n_out, activation=T.tanh,
             output_type='real'):
    self.input = input
    self.activation = activation
    self.output_type = output_type

    self.batch_size = T.iscalar()

    # theta is a vector of all trainable parameters
    # it represents the value of W, W_in, W_out, h0, bh, by
    theta_shape = n_hidden ** 2 + n_in * n_hidden + n_hidden * n_out + \
        n_hidden + n_hidden + n_out
    self.theta = theano.shared(value=np.zeros(theta_shape,
                                              dtype=theano.config.floatX))

    # Parameters are reshaped views of theta
    param_idx = 0  # pointer to somewhere along parameter vector

    # recurrent weights as a shared variable
    self.W = self.theta[param_idx:(param_idx + n_hidden**2)].reshape(
        (n_hidden, n_hidden))
    self.W.name = 'W'
    W_init = np.asarray(np.random.uniform(size=(n_hidden, n_hidden),
                                          low=-0.01, high=0.01),
                        dtype=theano.config.floatX)
    param_idx += n_hidden**2

    # input to hidden layer weights
    self.W_in = self.theta[param_idx:(param_idx + n_in *
                                      n_hidden)].reshape((n_in, n_hidden))
    self.W_in.name = 'W_in'
    W_in_init = np.asarray(np.random.uniform(size=(n_in, n_hidden),
                                             low=-0.01, high=0.01),
                           dtype=theano.config.floatX)
    param_idx += n_in * n_hidden

    # hidden to output layer weights
    self.W_out = self.theta[param_idx:(param_idx + n_hidden *
                                       n_out)].reshape((n_hidden, n_out))
    self.W_out.name = 'W_out'
    W_out_init = np.asarray(np.random.uniform(size=(n_hidden, n_out),
                                              low=-0.01, high=0.01),
                            dtype=theano.config.floatX)
    param_idx += n_hidden * n_out

    self.h0 = self.theta[param_idx:(param_idx + n_hidden)]
    self.h0.name = 'h0'
    h0_init = np.zeros((n_hidden,), dtype=theano.config.floatX)
    param_idx += n_hidden

    self.bh = self.theta[param_idx:(param_idx + n_hidden)]
    self.bh.name = 'bh'
    bh_init = np.zeros((n_hidden,), dtype=theano.config.floatX)
    param_idx += n_hidden

    self.by = self.theta[param_idx:(param_idx + n_out)]
    self.by.name = 'by'
    by_init = np.zeros((n_out,), dtype=theano.config.floatX)
    param_idx += n_out

    assert param_idx == theta_shape

    # for convenience
    self.params = [self.W, self.W_in, self.W_out, self.h0, self.bh,
                   self.by]

    # shortcut to norms (for monitoring)
    self.l2_norms = {}
    for param in self.params:
        self.l2_norms[param] = T.sqrt(T.sum(param**2))

    # initialize parameters
    # DEBUG_MODE gives division by zero error when we leave parameters
    # as zeros
    self.theta.set_value(np.concatenate([x.ravel() for x in
                                         (W_init, W_in_init, W_out_init,
                                          h0_init, bh_init, by_init)]))

    self.theta_update = theano.shared(
        value=np.zeros(theta_shape, dtype=theano.config.floatX))

    # recurrent function (using tanh activation function) and arbitrary
    # output activation function
    def step(x_t, h_tm1):
        h_t = self.activation(T.dot(x_t, self.W_in) +
                              T.dot(h_tm1, self.W) + self.bh)
        y_t = T.dot(h_t, self.W_out) + self.by
        return h_t, y_t

    # the hidden state `h` for the entire sequence, and the output for the
    # entire sequence `y` (first dimension is always time)
    # Note the implementation of weight-sharing h0 across variable-size
    # batches using T.ones multiplying h0
    # Alternatively, T.alloc approach is more robust
    [self.h, self.y_pred], _ = theano.scan(
        step,
        sequences=self.input,
        outputs_info=[T.alloc(self.h0, self.input.shape[1], n_hidden),
                      None])
    # outputs_info=[T.ones(shape=(self.input.shape[1],
    #                             self.h0.shape[0])) * self.h0, None])

    # L1 norm ; one regularization option is to enforce L1 norm to
    # be small
    self.L1 = 0
    self.L1 += abs(self.W.sum())
    self.L1 += abs(self.W_in.sum())
    self.L1 += abs(self.W_out.sum())

    # square of L2 norm ; one regularization option is to enforce
    # square of L2 norm to be small
    self.L2_sqr = 0
    self.L2_sqr += (self.W**2).sum()
    self.L2_sqr += (self.W_in**2).sum()
    self.L2_sqr += (self.W_out**2).sum()

    if self.output_type == 'real':
        self.loss = lambda y: self.mse(y)
    elif self.output_type == 'binary':
        # push through sigmoid
        self.p_y_given_x = T.nnet.sigmoid(self.y_pred[-1])  # apply sigmoid
        self.y_out = T.round(self.p_y_given_x)  # round to {0,1}
        self.loss = lambda y: self.nll_binary(y)
    elif self.output_type == 'softmax':
        # push through softmax, computing vector of class-membership
        # probabilities in symbolic form
        #
        # T.nnet.softmax will not operate on T.tensor3 types, only matrices
        # We take our n_steps x n_seq x n_classes output from the net
        # and reshape it into a (n_steps * n_seq) x n_classes matrix
        # apply softmax, then reshape back
        y_p = self.y_pred
        y_p_m = T.reshape(y_p, (y_p.shape[0] * y_p.shape[1], -1))
        y_p_s = T.nnet.softmax(y_p_m)
        self.p_y_given_x = T.reshape(y_p_s, y_p.shape)

        # compute prediction as class whose probability is maximal
        self.y_out = T.argmax(self.p_y_given_x, axis=-1)
        self.loss = lambda y: self.nll_multiclass(y)
    else:
        raise NotImplementedError
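# The reshape-softmax-reshape detour in the 'softmax' branch above is worth
# isolating, since T.nnet.softmax only accepts matrices. A standalone sketch
# of the same trick on a hypothetical (n_steps, n_seq, n_classes) tensor:
import numpy as np
import theano
import theano.tensor as T

y_p = T.tensor3('y_p')                      # (n_steps, n_seq, n_classes)
y_p_m = T.reshape(y_p, (y_p.shape[0] * y_p.shape[1], -1))
p = T.reshape(T.nnet.softmax(y_p_m), y_p.shape)
f = theano.function([y_p], p)
out = f(np.random.randn(7, 2, 5).astype(theano.config.floatX))
assert np.allclose(out.sum(axis=-1), 1.0)   # class probabilities sum to one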
def irl(structure, feature_matrix, n_actions, discount,
        transition_probability, trajectories, epochs, learning_rate,
        initialisation="normal", l1=0.1, l2=0.1):
    """
    Find the reward function for the given trajectories.

    structure: Neural network structure tuple, e.g. (10, 3, 3) would be a
        3-layer neural network with 10 inputs.
    feature_matrix: Matrix with the nth row representing the nth state.
        NumPy array with shape (N, D) where N is the number of states and
        D is the dimensionality of the state.
    n_actions: Number of actions A. int.
    discount: Discount factor of the MDP. float.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    trajectories: 3D array of state/action pairs. States are ints, actions
        are ints. NumPy array with shape (T, L, 2) where T is the number of
        trajectories and L is the trajectory length.
    epochs: Number of gradient descent steps. int.
    learning_rate: Gradient descent learning rate. float.
    initialisation: What distribution to use. str in {normal, uniform}.
        Default normal.
    l1: L1 regularisation. Default 0.1. float.
    l2: L2 regularisation. Default 0.1. float.
    -> Reward vector with shape (N,).
    """

    n_states, d_states = feature_matrix.shape
    transition_probability = th.shared(transition_probability, borrow=True)
    trajectories = th.shared(trajectories, borrow=True)

    # Initialise W matrices; b biases.
    n_layers = len(structure) - 1
    weights = []
    hist_w_grads = []  # For AdaGrad.
    biases = []
    hist_b_grads = []  # For AdaGrad.
    for i in range(n_layers):
        # W
        shape = (structure[i + 1], structure[i])
        if initialisation == "normal":
            matrix = th.shared(rn.normal(size=shape), name="W", borrow=True)
        else:
            matrix = th.shared(rn.uniform(size=shape), name="W", borrow=True)
        weights.append(matrix)
        hist_w_grads.append(th.shared(np.zeros(shape), name="hdW",
                                      borrow=True))

        # b
        shape = (structure[i + 1], 1)
        if initialisation == "normal":
            matrix = th.shared(rn.normal(size=shape), name="b", borrow=True)
        else:
            matrix = th.shared(rn.uniform(size=shape), name="b", borrow=True)
        biases.append(matrix)
        hist_b_grads.append(th.shared(np.zeros(shape), name="hdb",
                                      borrow=True))

    # Initialise α weight, β bias.
    if initialisation == "normal":
        α = th.shared(rn.normal(size=(1, structure[-1])), name="alpha",
                      borrow=True)
    else:
        α = th.shared(rn.uniform(size=(1, structure[-1])), name="alpha",
                      borrow=True)
    hist_α_grad = T.zeros(α.shape)  # For AdaGrad.

    adagrad_epsilon = 1e-6  # AdaGrad numerical stability.

    #### Theano symbolic setup. ####

    # Symbolic input.
    s_feature_matrix = T.matrix("x")
    # Feature matrices.
    # All dimensions of the form (d_layer, n_states).
    φs = [s_feature_matrix.T]
    # Forward propagation.
    for W, b in zip(weights, biases):
        φ = T.nnet.sigmoid(
            th.compile.ops.Rebroadcast((0, False), (1, True))(b) +
            W.dot(φs[-1]))
        φs.append(φ)
    # φs[1] = φ1 etc.
    # Reward.
    r = α.dot(φs[-1]).reshape((n_states,))
    # Engineering hack: z-score the reward.
    r = (r - r.mean()) / r.std()

    # Associated feature expectations.
    expected_svf = find_expected_svf(n_states, r, n_actions, discount,
                                     transition_probability, trajectories)
    svf = maxent.find_svf(n_states, trajectories.get_value())

    # Derivatives (backward propagation).
    updates = []
    α_grad = φs[-1].dot(svf - expected_svf).T
    hist_α_grad += α_grad**2
    adj_α_grad = α_grad / (adagrad_epsilon + T.sqrt(hist_α_grad))
    updates.append((α, α + adj_α_grad * learning_rate))

    def grad_for_state(s, theta, svf_diff, r):
        """
        Calculate the gradient with respect to theta for one state.
        """
        regularisation = abs(theta).sum() * l1 + (theta**2).sum() * l2
        return svf_diff[s] * T.grad(r[s], theta) - regularisation, {}

    for i, W in enumerate(weights):
        w_grads, _ = th.scan(fn=grad_for_state,
                             sequences=[T.arange(n_states)],
                             non_sequences=[W, svf - expected_svf, r])
        w_grad = w_grads.sum(axis=0)
        hist_w_grads[i] += w_grad**2
        adj_w_grad = w_grad / (adagrad_epsilon + T.sqrt(hist_w_grads[i]))
        updates.append((W, W + adj_w_grad * learning_rate))

    for i, b in enumerate(biases):
        b_grads, _ = th.scan(fn=grad_for_state,
                             sequences=[T.arange(n_states)],
                             non_sequences=[b, svf - expected_svf, r])
        b_grad = b_grads.sum(axis=0)
        hist_b_grads[i] += b_grad**2
        adj_b_grad = b_grad / (adagrad_epsilon + T.sqrt(hist_b_grads[i]))
        updates.append((b, b + adj_b_grad * learning_rate))

    train = th.function([s_feature_matrix], updates=updates, outputs=r)
    run = th.function([s_feature_matrix], outputs=r)

    for e in range(epochs):
        reward = train(feature_matrix)

    return reward.reshape((n_states,))
def step(inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh): attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1) inp_h1_t += attinp_h1 gat_h1_t += attgat_h1 h1_t = self.rnn1.apply(inp_h1_t, gat_h1_t, h1_tm1, iterate=False) a_t, b_t, k_t = self.h1_to_att.apply(h1_t) if self.attention_type == "softmax": a_t = tensor.nnet.softmax(a_t) + self.epsilon else: a_t = tensor.exp(a_t) + self.epsilon b_t = tensor.exp(b_t) + self.epsilon k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t) a_t_ = a_t a_t = tensor.shape_padright(a_t) b_t = tensor.shape_padright(b_t) k_t_ = tensor.shape_padright(k_t) # batch size X att size X len context if self.attention_type == "softmax": # numpy.sqrt(1/(2*numpy.pi)) is the weird number phi_t = 0.3989422917366028 * tensor.sum( a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1) else: phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1) # batch size X len context X num letters w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1) attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t) attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t) inp_h2_t += attinp_h2 gat_h2_t += attgat_h2 inp_h3_t += attinp_h3 gat_h3_t += attgat_h3 h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t) h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t) to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3] h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h2_t = self.rnn2.apply(inp_h2_t + h1inp_h2, gat_h2_t + h1gat_h2, h2_tm1, iterate=False) h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t) to_normalize = [h2inp_h3, h2gat_h3] h2inp_h3, h2gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h3_t = self.rnn3.apply(inp_h3_t + h1inp_h3 + h2inp_h3, gat_h3_t + h1gat_h3 + h2gat_h3, h3_tm1, iterate=False) return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_
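# The location-based attention in the step function above is the Graves-style
# window (mixture of unnormalized Gaussians over context positions). A NumPy
# sketch of the non-softmax branch for a single batch element, with all values
# hypothetical; u is the context position grid that the surrounding code keeps
# in its enclosing scope:
import numpy as np

att_size, ctx_len = 3, 10
a = np.exp(np.random.randn(att_size))             # component weights > 0
b = np.exp(np.random.randn(att_size))             # inverse widths > 0
k = np.cumsum(np.abs(np.random.randn(att_size)))  # monotone locations
u = np.arange(ctx_len)                            # context position grid

# phi[c] = sum_m a[m] * exp(-b[m] * (k[m] - u[c])**2)
phi = (a[:, None] * np.exp(-b[:, None] * (k[:, None] - u[None, :])**2)).sum(0)
# phi has shape (ctx_len,): soft weights over the context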
def __init__(self, model, algo="sgd"):
    self.model = model
    self.algo = algo
    self.x = self.model.x
    self.y = T.ivector('y')
    self.outc = T.matrix('outc')

    # due to theano bugs
    self.x_d = shared_empty(2)
    self.y_d = shared_empty(1, dtype="int32")
    self.outc_d = shared_empty(2)
    # ---

    # -- target def --
    self.loss = get_loss(self.model.a, self.outc)
    self.err = get_error(get_pred(self.model.a), self.y)
    self.grad_vec = T.concatenate(
        [T.grad(self.loss, p).flatten() for p in self.model.params])

    srng = RandomStreams(seed=234)
    self.grad = {p: T.grad(self.loss, p) for p in self.model.params}
    for p in self.grad:
        self.grad[p] += srng.normal(p.shape) * 1e-4

    self.updates = OrderedDict()

    if self.algo == 'gd':
        self.rate = 10.
        for p in self.model.params:
            self.updates[p] = p - self.rate * self.grad[p]
    elif self.algo == 'adagrad':
        self.rate = 5e-2
        eps = 1e-6
        for p in self.model.params:
            value = p.get_value(borrow=True)
            hist = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=p.broadcastable)
            hist_n = hist + self.grad[p]**2
            self.updates[hist] = hist_n
            # scale by the freshly accumulated history
            self.updates[p] = p - self.rate * self.grad[p] / \
                T.sqrt(hist_n + eps)
    elif self.algo == 'rmsprop':
        self.rate = 1e-2
        eps = 1e-6
        rho = 0.7
        for p in self.model.params:
            value = p.get_value(borrow=True)
            hist = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=p.broadcastable)
            hist_n = rho * hist + (1 - rho) * self.grad[p]**2
            self.updates[hist] = hist_n
            # scale by the freshly accumulated history
            self.updates[p] = p - self.rate * self.grad[p] / \
                T.sqrt(hist_n + eps)
    elif self.algo == 'nag':
        self.rate = 10
        mu = 0.2
        for p in self.model.params:
            value = p.get_value(borrow=True)
            vel = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=p.broadcastable)
            x = mu * vel + self.rate * self.grad[p]
            self.updates[vel] = x
            self.updates[p] = p - self.rate * self.grad[p] - mu * x
    elif self.algo == 'adam':
        self.rate = 4e-2
        beta1 = 0.9
        beta2 = 0.999
        eps = 1e-8
        one = T.constant(1)
        t_prev = theano.shared(np.asarray(0, dtype=theano.config.floatX))
        t = t_prev + 1
        a_t = self.rate * T.sqrt(one - beta2**t) / (one - beta1**t)
        for p in self.model.params:
            value = p.get_value(borrow=True)
            m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=p.broadcastable)
            v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=p.broadcastable)
            m_t = beta1 * m_prev + (one - beta1) * self.grad[p]
            v_t = beta2 * v_prev + (one - beta2) * self.grad[p]**2
            step = a_t * m_t / (T.sqrt(v_t) + eps)
            self.updates[m_prev] = m_t
            self.updates[v_prev] = v_t
            self.updates[p] = p - step
        self.updates[t_prev] = t

    self.print_pls = []
    self.print_pls += [T.mean(self.grad_vec**2)**0.5]

    self.train = theano.function(
        inputs=[],
        outputs=[self.loss, self.err] + self.print_pls,
        updates=self.updates,
        givens={
            self.x: self.x_d,
            self.y: self.y_d,
            self.outc: self.outc_d,
        },
        on_unused_input='warn',
        allow_input_downcast=True)

    self.eva = theano.function(
        inputs=[],
        outputs=[self.loss],
        givens={
            self.x: self.x_d,
            self.outc: self.outc_d
        },
        on_unused_input='warn',
        allow_input_downcast=True)
def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6): grads = [T.grad(cost=cost, wrt=param) for param in params] sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params] updates = OrderedDict() if self.grad_cap > 0: norm = T.cast( T.sqrt( T.sum([ T.sum([T.sum(g**2) for g in g_list]) for g_list in grads ]) + T.sum([T.sum(g**2) for g in sgrads])), theano.config.floatX) grads = [[ T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm, g) for g in g_list ] for g_list in grads] sgrads = [ T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm, g) for g in sgrads ] for p_list, g_list in zip(params, grads): for p, g in zip(p_list, g_list): if self.adapt: if self.adapt == 'adagrad': g = self.adagrad(p, g, updates) if self.adapt == 'rmsprop': g = self.rmsprop(p, g, updates) if self.adapt == 'adadelta': g = self.adadelta(p, g, updates) if self.adapt == 'adam': g = self.adam(p, g, updates) if self.momentum > 0: velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True) velocity2 = self.momentum * velocity - np.float32( self.learning_rate) * (g + self.lmbd * p) updates[velocity] = velocity2 updates[p] = p + velocity2 else: updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) - np.float32( self.learning_rate) * g for i in range(len(sgrads)): g = sgrads[i] fullP = full_params[i] sample_idx = sidxs[i] sparam = sampled_params[i] if self.adapt: if self.adapt == 'adagrad': g = self.adagrad(fullP, g, updates, sample_idx) if self.adapt == 'rmsprop': g = self.rmsprop(fullP, g, updates, sample_idx) if self.adapt == 'adadelta': g = self.adadelta(fullP, g, updates, sample_idx) if self.adapt == 'adam': g = self.adam(fullP, g, updates, sample_idx) if self.lmbd > 0: delta = np.float32( self.learning_rate) * (g + self.lmbd * sparam) else: delta = np.float32(self.learning_rate) * g if self.momentum > 0: velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True) vs = velocity[sample_idx] velocity2 = self.momentum * vs - delta updates[velocity] = T.set_subtensor(vs, velocity2) updates[fullP] = T.inc_subtensor(sparam, velocity2) else: updates[fullP] = T.inc_subtensor(sparam, -delta) return updates
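# The T.inc_subtensor(sparam, ...) updates above are what make the sampled-
# parameter path cheap: only the sampled rows of the full embedding-like
# matrix are written, instead of the whole matrix. A self-contained sketch of
# that primitive, independent of the class above (all names here are local to
# the sketch):
import numpy as np
import theano
import theano.tensor as T

E = theano.shared(np.zeros((5, 3), dtype=theano.config.floatX), name='E')
idx = T.ivector('idx')
rows = E[idx]                           # symbolic view of the sampled rows
delta = T.ones_like(rows)
f = theano.function([idx], [],
                    updates=[(E, T.inc_subtensor(rows, delta))])
f(np.array([0, 2], dtype='int32'))      # only rows 0 and 2 are incremented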
def sample_step(inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t, inp_gat_h2_t,
                inp_cell_h3_t, inp_gat_h3_t, x_tm1, h1_tm1, h2_tm1,
                h3_tm1, k_tm1, w_tm1):
    cell_h1_t = inp_cell_h1_t
    cell_h2_t = inp_cell_h2_t
    cell_h3_t = inp_cell_h3_t
    gat_h1_t = inp_gat_h1_t
    gat_h2_t = inp_gat_h2_t
    gat_h3_t = inp_gat_h3_t

    attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
    cell_h1_t += attinp_h1
    gat_h1_t += attgat_h1

    if self.weak_feedback:
        out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1)
        to_normalize = [out_cell_h1_t, out_gat_h1_t]
        out_cell_h1_t, out_gat_h1_t = \
            [_apply_norm(x, self.layer_norm) for x in to_normalize]
        cell_h1_t += out_cell_h1_t
        gat_h1_t += out_gat_h1_t

    if self.full_feedback:
        out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1)
        out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1)
        to_normalize = [out_cell_h2_t, out_gat_h2_t,
                        out_cell_h3_t, out_gat_h3_t]
        out_cell_h2_t, out_gat_h2_t, \
            out_cell_h3_t, out_gat_h3_t = \
            [_apply_norm(x, self.layer_norm) for x in to_normalize]
        cell_h2_t += out_cell_h2_t
        cell_h3_t += out_cell_h3_t
        gat_h2_t += out_gat_h2_t
        gat_h3_t += out_gat_h3_t

    h1_t = self.rnn1.apply(cell_h1_t, gat_h1_t, h1_tm1, iterate=False)
    a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

    if self.attention_type == "softmax":
        a_t = tensor.nnet.softmax(a_t) + self.epsilon
    else:
        a_t = tensor.exp(a_t) + self.epsilon

    b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon
    k_t = k_tm1 + self.attention_alignment * \
        tensor.exp(k_t) / self.timing_coeff

    a_t_ = a_t
    a_t = tensor.shape_padright(a_t)
    b_t = tensor.shape_padright(b_t)
    k_t_ = tensor.shape_padright(k_t)

    # batch size X att size X len context
    # (u and context_oh come from the enclosing scope)
    if self.attention_type == "softmax":
        # numpy.sqrt(1 / (2 * numpy.pi)) is the weird number
        phi_t = 0.3989422917366028 * tensor.sum(
            a_t * tensor.sqrt(b_t) *
            tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1)
    else:
        phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1)

    # batch size X len context X num letters
    w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

    attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
    attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
    cell_h2_t += attinp_h2
    gat_h2_t += attgat_h2
    cell_h3_t += attinp_h3
    gat_h3_t += attgat_h3

    h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
    h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

    to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
    h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
        [_apply_norm(x, self.layer_norm) for x in to_normalize]

    h2_t = self.rnn2.apply(cell_h2_t + h1inp_h2,
                           gat_h2_t + h1gat_h2,
                           h2_tm1, iterate=False)

    h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)
    to_normalize = [h2inp_h3, h2gat_h3]
    h2inp_h3, h2gat_h3 = \
        [_apply_norm(x, self.layer_norm) for x in to_normalize]

    h3_t = self.rnn3.apply(cell_h3_t + h1inp_h3 + h2inp_h3,
                           gat_h3_t + h1gat_h3 + h2gat_h3,
                           h3_tm1, iterate=False)

    h1_out_t = self.h1_to_readout.apply(h1_t)
    h2_out_t = self.h2_to_readout.apply(h2_t)
    h3_out_t = self.h3_to_readout.apply(h3_t)

    to_normalize = [h1_out_t, h2_out_t, h3_out_t]
    h1_out_t, h2_out_t, h3_out_t = \
        [_apply_norm(x, self.layer_norm) for x in to_normalize]

    readout_t = h1_out_t + h2_out_t + h3_out_t
    readout_t += self.att_to_readout.apply(w_t)

    if self.use_speaker:
        readout_t += spk_readout

    output_t = self.readout_to_output.apply(readout_t)

    if self.which_cost == 'MSE':
        predicted_x_t = output_t
        if self.use_speaker:
            predicted_x_t += spk_output
        # Dummy value for coeff_t
        coeff_t = predicted_x_t
    elif self.which_cost == "GMM":
        mu_t, sigma_t, coeff_t = output_t
        if self.use_speaker:
            mu_t += spk_output[0]
            sigma_t += spk_output[1]
            coeff_t += spk_output[2]
        sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \
            self.epsilon
        coeff_t = tensor.nnet.softmax(
            coeff_t.reshape((-1, self.k_gmm)) *
            (1. + self.sampling_bias)).reshape(coeff_t.shape) + \
            self.epsilon
        predicted_x_t = sample_gmm(mu_t, sigma_t, coeff_t,
                                   self.theano_rng)

    return predicted_x_t, h1_t, h2_t, h3_t, \
        k_t, w_t, coeff_t, phi_t, a_t_
def l2_normalize(x, axis): norm = T.sqrt(T.sum(T.square(x), axis=axis, keepdims=True)) return x / norm
def stdize(layer, input): m = T.mean(input, layer.axes_to_sum) input -= m.dimshuffle(*layer.dimshuffle_args) stdv = T.sqrt(T.mean(T.square(input), axis=layer.axes_to_sum)) input /= stdv.dimshuffle(*layer.dimshuffle_args) return -m / stdv, 1. / stdv, input
def L1sim(left, right): return -T.sum(T.sqrt(T.sqr(left - right)), axis=1)
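# Since T.sqrt(T.sqr(d)) is just an elementwise absolute value, the score
# above can be written more directly; a sketch of the equivalent form,
# assuming T is theano.tensor as elsewhere in this file:
def L1sim_abs(left, right):
    # negative L1 distance between row vectors
    return -T.sum(abs(left - right), axis=1)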
def get_expr_rff_feature_map_component(x, omega, u): phi = T.cos(T.dot(x, omega) + u) * T.sqrt(2.) return phi
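# Averaging many such components approximates a shift-invariant kernel
# (Rahimi & Recht's random Fourier features). A NumPy sketch; D, omega, and u
# are drawn here for the Gaussian/RBF kernel rather than taken from the
# surrounding code:
import numpy as np

d, D = 4, 2000
omega = np.random.randn(d, D)            # spectral samples for the RBF kernel
u = np.random.uniform(0, 2 * np.pi, D)   # random phases

def phi(x):
    # stacking D scaled components gives the full feature map
    return np.sqrt(2.0 / D) * np.cos(x.dot(omega) + u)

x, y = np.random.randn(d), np.random.randn(d)
approx = phi(x).dot(phi(y))
exact = np.exp(-np.sum((x - y)**2) / 2.0)   # k(x, y) = exp(-||x-y||^2 / 2)
# approx and exact should agree to within Monte Carlo error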
def __init__(self, We_initial, params):
    initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
    We = theano.shared(np.asarray(We_initial, dtype=config.floatX))

    g1 = T.imatrix()
    g2 = T.imatrix()
    p1 = T.imatrix()
    p2 = T.imatrix()
    g1mask = T.matrix()
    g2mask = T.matrix()
    p1mask = T.matrix()
    p2mask = T.matrix()

    l_in = lasagne.layers.InputLayer((None, None))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))
    l_emb = lasagne.layers.EmbeddingLayer(l_in,
                                          input_size=We.get_value().shape[0],
                                          output_size=We.get_value().shape[1],
                                          W=We)

    if params.model == "wordaverage":
        l_out = lasagne_average_layer([l_emb, l_mask], tosum=False)
    elif params.model == "maxpool":
        l_out = lasagne_max_layer([l_emb, l_mask], params)
    elif params.model == "lstmavg":
        l_lstm = lasagne.layers.LSTMLayer(l_emb, params.dim, peepholes=True,
                                          learn_init=False, mask_input=l_mask)
        l_out = lasagne_average_layer([l_lstm, l_mask], tosum=False)
    elif params.model == "lstmmax":
        l_lstm = lasagne.layers.LSTMLayer(l_emb, params.dim, peepholes=True,
                                          learn_init=False, mask_input=l_mask)
        l_out = lasagne_max_layer([l_lstm, l_mask], params)
    elif params.model == "bilstmavg":
        l_lstm = lasagne.layers.LSTMLayer(l_emb, params.dim, peepholes=True,
                                          learn_init=False, mask_input=l_mask)
        l_lstmb = lasagne.layers.LSTMLayer(l_emb, params.dim,
                                           learn_init=False,
                                           mask_input=l_mask, backwards=True)
        l_cleanse = lasagne_cleanse_layer([l_lstm, l_mask], to_pool=False)
        l_cleanse_b = lasagne_cleanse_layer([l_lstmb, l_mask], to_pool=False)
        l_concat = lasagne.layers.ConcatLayer([l_cleanse, l_cleanse_b],
                                              axis=params.axis)
        l_out = lasagne_average_layer2([l_concat, l_mask])
    elif params.model == "bilstmmax":
        l_lstm = lasagne.layers.LSTMLayer(l_emb, params.dim, peepholes=True,
                                          learn_init=False, mask_input=l_mask)
        l_lstmb = lasagne.layers.LSTMLayer(l_emb, params.dim,
                                           learn_init=False,
                                           mask_input=l_mask, backwards=True)
        l_cleanse = lasagne_cleanse_layer([l_lstm, l_mask], to_pool=True)
        l_cleanse_b = lasagne_cleanse_layer([l_lstmb, l_mask], to_pool=True)
        l_concat = lasagne.layers.ConcatLayer([l_cleanse, l_cleanse_b],
                                              axis=params.axis)
        l_out = lasagne_max_layer2([l_concat])

    embg1 = lasagne.layers.get_output(l_out, {l_in: g1, l_mask: g1mask},
                                      deterministic=False)
    embg2 = lasagne.layers.get_output(l_out, {l_in: g2, l_mask: g2mask},
                                      deterministic=False)
    embp1 = lasagne.layers.get_output(l_out, {l_in: p1, l_mask: p1mask},
                                      deterministic=False)
    embp2 = lasagne.layers.get_output(l_out, {l_in: p2, l_mask: p2mask},
                                      deterministic=False)

    def fix(x):
        return x * (x > 0) + 1E-10 * (x <= 0)

    # objective function
    g1g2 = (embg1 * embg2).sum(axis=1)
    g1g2norm = T.sqrt(fix(T.sum(embg1 ** 2, axis=1))) * \
        T.sqrt(fix(T.sum(embg2 ** 2, axis=1)))
    g1g2 = g1g2 / g1g2norm

    p1g1 = (embp1 * embg1).sum(axis=1)
    p1g1norm = T.sqrt(fix(T.sum(embp1 ** 2, axis=1))) * \
        T.sqrt(fix(T.sum(embg1 ** 2, axis=1)))
    p1g1 = p1g1 / p1g1norm

    p2g2 = (embp2 * embg2).sum(axis=1)
    p2g2norm = T.sqrt(fix(T.sum(embp2 ** 2, axis=1))) * \
        T.sqrt(fix(T.sum(embg2 ** 2, axis=1)))
    p2g2 = p2g2 / p2g2norm

    costp1g1 = params.margin - g1g2 + p1g1
    costp1g1 = costp1g1 * (costp1g1 > 0)
    costp2g2 = params.margin - g1g2 + p2g2
    costp2g2 = costp2g2 * (costp2g2 > 0)
    cost = costp1g1 + costp2g2

    network_params = lasagne.layers.get_all_params(l_out, trainable=True)
    network_params.pop(0)
    self.final_layer = l_out
    self.all_params = lasagne.layers.get_all_params(l_out, trainable=True)

    if params.LC:
        l2 = 0.5 * params.LC * sum(lasagne.regularization.l2(x)
                                   for x in network_params)
    else:
        l2 = 0
    word_reg = 0.5 * params.LW * lasagne.regularization.l2(We - initial_We)
    cost = T.mean(cost) + l2 + word_reg

    self.feedforward_function = theano.function([g1, g1mask], embg1)
    prediction = g1g2
    self.scoring_function = theano.function([g1, g2, g1mask, g2mask],
                                            prediction)

    grads = theano.gradient.grad(cost, self.all_params)
    updates = params.learner(grads, self.all_params, params.eta)
    self.train_function = theano.function(
        [g1, g2, p1, p2, g1mask, g2mask, p1mask, p2mask],
        cost, updates=updates)

    cost = costp1g1 + costp2g2
    cost = T.mean(cost)
    self.cost_function = theano.function(
        [g1, g2, p1, p2, g1mask, g2mask, p1mask, p2mask], cost)

    print("Num Params:", lasagne.layers.count_params(self.final_layer))
def sqrt(x): x = T.clip(x, 0., np.inf) return T.sqrt(x)
# Fragment of a local contrast normalization (LCN) implementation; x, x_shp,
# inker_shape, outker_shape, div_method, remove_mean, stretch, threshold, and
# EPSILON come from the elided enclosing function. NOTE: the enclosing
# `if outker_shape == inker_shape:` is inferred from the error message in the
# final else branch.
if outker_shape == inker_shape:
    size = np.asarray(x_shp[1] * inker_shape[0] * inker_shape[1],
                      dtype=x.dtype)
    ssq, ssqshp = boxconv((x**2, x_shp), inker_shape, channels=True)
    xs = inker_shape[0] // 2
    ys = inker_shape[1] // 2
    # -- local contrast normalization in regions that are not symmetric
    #    about the pixel being normalized feels weird, but we're
    #    allowing it here.
    xs_inc = (inker_shape[0] + 1) % 2
    ys_inc = (inker_shape[1] + 1) % 2
    if div_method == 'euclidean':
        if remove_mean:
            arr_sum, _shp = boxconv((x, x_shp), inker_shape, channels=True)
            arr_num = (x[:, :, xs - xs_inc:-xs, ys - ys_inc:-ys] -
                       arr_sum / size)
            arr_div = EPSILON + tensor.sqrt(
                tensor.maximum(0, ssq - (arr_sum**2) / size))
        else:
            arr_num = x[:, :, xs - xs_inc:-xs, ys - ys_inc:-ys]
            arr_div = EPSILON + tensor.sqrt(ssq)
    else:
        raise NotImplementedError('div_method', div_method)
else:
    raise NotImplementedError('outker_shape != inker_shape',
                              outker_shape, inker_shape)

if (hasattr(stretch, '__iter__') and (stretch != 1).any()) or stretch != 1:
    arr_num = arr_num * stretch
    arr_div = arr_div * stretch

# XXX: IS THIS 1.0 supposed to be (threshold + EPSILON) ??
arr_div = tensor.switch(arr_div < (threshold + EPSILON), 1.0, arr_div)
def compute_norms(array, norm_axes=None): """ Compute incoming weight vector norms. Parameters ---------- array : numpy array or Theano expression Weight or bias. norm_axes : sequence (list or tuple) The axes over which to compute the norm. This overrides the default norm axes defined for the number of dimensions in `array`. When this is not specified and `array` is a 2D array, this is set to `(0,)`. If `array` is a 3D, 4D or 5D array, it is set to a tuple listing all axes but axis 0. The former default is useful for working with dense layers, the latter is useful for 1D, 2D and 3D convolutional layers. Finally, in case `array` is a vector, `norm_axes` is set to an empty tuple, and this function will simply return the absolute value for each element. This is useful when the function is applied to all parameters of the network, including the bias, without distinction. (Optional) Returns ------- norms : 1D array or Theano vector (1D) 1D array or Theano vector of incoming weight/bias vector norms. Examples -------- >>> array = np.random.randn(100, 200) >>> norms = compute_norms(array) >>> norms.shape (200,) >>> norms = compute_norms(array, norm_axes=(1,)) >>> norms.shape (100,) """ # Check if supported type if not isinstance(array, theano.Variable) and \ not isinstance(array, np.ndarray): raise RuntimeError("Unsupported type {}. " "Only theano variables and numpy arrays " "are supported".format(type(array))) # Compute default axes to sum over ndim = array.ndim if norm_axes is not None: sum_over = tuple(norm_axes) elif ndim == 1: # For Biases that are in 1d (e.g. b of DenseLayer) sum_over = () elif ndim == 2: # DenseLayer sum_over = (0, ) elif ndim in [3, 4, 5]: # Conv{1,2,3}DLayer sum_over = tuple(range(1, ndim)) else: raise ValueError("Unsupported tensor dimensionality {}. " "Must specify `norm_axes`".format(array.ndim)) # Run numpy or Theano norm computation if isinstance(array, theano.Variable): # Apply theano version if it is a theano variable if len(sum_over) == 0: norms = T.abs_(array) # abs if we have nothing to sum over else: norms = T.sqrt(T.sum(array**2, axis=sum_over)) elif isinstance(array, np.ndarray): # Apply the numpy version if ndarray if len(sum_over) == 0: norms = abs(array) # abs if we have nothing to sum over else: norms = np.sqrt(np.sum(array**2, axis=sum_over)) return norms
def cdf(x, miu=0.0, variance=1.0): return 1.0 / 2 * (1.0 + T.erf((x - miu) / T.sqrt(2 * variance)))
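# The function above is the closed-form Gaussian CDF, Phi((x - mu) / sigma),
# expressed through the error function. A quick numeric sanity check of the
# same identity, assuming SciPy is available:
import numpy as np
from scipy.special import erf
from scipy.stats import norm

x, miu, variance = 0.3, 0.0, 1.0
via_erf = 0.5 * (1.0 + erf((x - miu) / np.sqrt(2 * variance)))
assert np.isclose(via_erf, norm.cdf(x, loc=miu, scale=np.sqrt(variance)))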
def _setup_backprop_with(self, dec_init_state, annotations, y, d, y_in_x_inds, eta, l2_reg): def decoder_recurrence(y_t, d_t, cur_y_in_x_inds, h_prev, annotations, *params): h_for_write = self.specs[self.domain].decoder.get_h_for_write( h_prev) scores = self.specs[self.domain].get_attention_scores( h_for_write, annotations) alpha = self.specs[self.domain].get_alpha(scores) c_t = self.specs[self.domain].get_context(alpha, annotations) write_dist = self.specs[self.domain].f_write( h_for_write, c_t, scores) base_p_y_t = write_dist[y_t] if self.spec.attention_copying: copying_p_y_t = T.dot( write_dist[self.specs[self.domain].out_vocabulary.size( ):self.specs[self.domain].out_vocabulary.size() + cur_y_in_x_inds.shape[0]], cur_y_in_x_inds) p_y_t = base_p_y_t + copying_p_y_t else: p_y_t = base_p_y_t h_t = self.specs[self.domain].f_dec(y_t, d_t, c_t, h_prev) return (h_t, p_y_t) dec_results, _ = theano.scan(fn=decoder_recurrence, sequences=[y, d, y_in_x_inds], outputs_info=[dec_init_state, None], non_sequences=[annotations] + self.all_shared) p_y_seq = dec_results[1] log_p_y = T.sum(T.log(p_y_seq)) nll = -log_p_y # Add L2 regularization regularization = l2_reg / 2 * sum(T.sum(p**2) for p in self.params) objective = nll + regularization gradients = T.grad(objective, self.params) # Do the updates here updates = [] if self.specs[self.domain].step_rule in ('adagrad', 'rmsprop'): # Adagrad updates for p, g, c in zip(self.params, gradients, self.grad_cache): grad_norm = g.norm(2) clipped_grad = ifelse(grad_norm >= CLIP_THRESH, g * CLIP_THRESH / grad_norm, g) if self.spec.step_rule == 'adagrad': new_c = c + clipped_grad**2 else: # rmsprop decay_rate = 0.9 # Use fixed decay rate of 0.9 new_c = decay_rate * c + (1.0 - decay_rate) * clipped_grad**2 new_p = p - eta * clipped_grad / T.sqrt(new_c + 1e-4) has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p)) updates.append((p, ifelse(has_non_finite, p, new_p))) updates.append((c, ifelse(has_non_finite, c, new_c))) elif self.specs[self.domain].step_rule == 'nesterov': # Nesterov momentum for p, g, v in zip(self.params, gradients, self.grad_cache): grad_norm = g.norm(2) clipped_grad = ifelse(grad_norm >= CLIP_THRESH, g * CLIP_THRESH / grad_norm, g) new_v = NESTEROV_MU * v - eta * clipped_grad new_p = p - NESTEROV_MU * v + (1 + NESTEROV_MU) * new_v has_non_finite = (T.any(T.isnan(new_p) + T.isinf(new_p)) + T.any(T.isnan(new_v) + T.isinf(new_v))) updates.append((p, ifelse(has_non_finite, p, new_p))) updates.append((v, ifelse(has_non_finite, v, new_v))) else: # Simple SGD updates for p, g in zip(self.params, gradients): grad_norm = g.norm(2) clipped_grad = ifelse(grad_norm >= CLIP_THRESH, g * CLIP_THRESH / grad_norm, g) new_p = p - eta * clipped_grad has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p)) updates.append((p, ifelse(has_non_finite, p, new_p))) return nll, p_y_seq, objective, updates
def cdf(sample, mu=0, sigma=1, eps=1e-6): div = T.sqrt(2) * sigma erf_arg = (sample - mu) / div return .5 * (1 + T.erf(erf_arg))
def nonDeterminstic(self, x): x += self.theanoGenerator.normal(avg=0.0, std=(T.sqrt(T.nnet.sigmoid(x)) + 1e-8)) return x * (x > 0.0)
def _get_compiled_theano_functions():
    # Planet masses: m1, m2
    m1, m2 = T.dscalars(2)
    mstar = 1
    mu1 = m1 * mstar / (mstar + m1)
    mu2 = m2 * mstar / (mstar + m2)
    eta1 = mstar + m1
    eta2 = mstar + m2
    beta1 = mu1 * T.sqrt(eta1 / mstar) / (mu1 + mu2)
    beta2 = mu2 * T.sqrt(eta2 / mstar) / (mu1 + mu2)
    j, k = T.lscalars('j', 'k')
    s = (j - k) / k

    # Dynamical variables:
    dyvars = T.vector()
    s1, s2, psi, phi, Omega, I1, I2, Psi, Phi, Rtilde = [
        dyvars[i] for i in range(10)
    ]

    l1 = phi - 0.5 * k * psi
    l2 = phi + 0.5 * k * psi
    gamma1 = s1 - (1 + s) * l2 + s * l1
    gamma2 = s2 - (1 + s) * l2 + s * l1
    Gamma1 = I1
    Gamma2 = I2
    L1 = Phi / 2 - Psi / k - s * (I1 + I2)
    L2 = Phi / 2 + Psi / k + (s + 1) * (I1 + I2)
    Cz = -1 * Rtilde

    R = L1 + L2 - Gamma1 - Gamma2 - Cz
    G1 = L1 - Gamma1
    G2 = L2 - Gamma2

    r2_by_r1 = (L2 - L1 - Gamma2 + Gamma1) / (L1 + L2 - Gamma1 - Gamma2 - R)
    rho1 = 0.5 * R * (1 + r2_by_r1)
    rho2 = 0.5 * R * (1 - r2_by_r1)

    a1 = (L1 / beta1)**2
    e1 = T.sqrt(1 - (1 - (Gamma1 / L1))**2)
    a2 = (L2 / beta2)**2
    e2 = T.sqrt(1 - (1 - (Gamma2 / L2))**2)

    cos_inc1 = 1 - rho1 / G1
    cos_inc2 = 1 - rho2 / G2
    inc1 = T.arccos(cos_inc1)
    inc2 = T.arccos(cos_inc2)

    l1_r = l1 - Omega
    l2_r = l2 - Omega
    Omega1_r = T.constant(np.pi / 2) - Omega
    Omega2_r = Omega1_r - T.constant(np.pi)
    pomega1 = -1 * gamma1
    pomega2 = -1 * gamma2
    pomega1_r = pomega1 - Omega
    pomega2_r = pomega2 - Omega
    omega1 = pomega1_r - Omega1_r
    omega2 = pomega2_r - Omega2_r

    Hkep = -0.5 * T.sqrt(eta1) * beta1 / a1 - 0.5 * T.sqrt(eta2) * beta2 / a2

    ko = KeplerOp()
    M1 = l1_r - pomega1_r
    M2 = l2_r - pomega2_r
    sinf1, cosf1 = ko(M1, e1 + T.zeros_like(M1))
    sinf2, cosf2 = ko(M2, e2 + T.zeros_like(M2))

    n1 = T.sqrt(eta1 / mstar) * a1**(-3 / 2)
    n2 = T.sqrt(eta2 / mstar) * a2**(-3 / 2)
    Hint_dir, Hint_ind, r1, r2, v1, v2 = calc_Hint_components_sinf_cosf(
        a1, a2, e1, e2, inc1, inc2, omega1, omega2, Omega1_r, Omega2_r,
        n1, n2, sinf1, cosf1, sinf2, cosf2)
    eps = m1 * m2 / (mu1 + mu2) / T.sqrt(mstar)
    Hpert = (Hint_dir + Hint_ind / mstar)
    Htot = Hkep + eps * Hpert

    #####################################################
    # Set parameters for compiling functions with Theano
    #####################################################

    # 'ins' will set the inputs of Theano functions compiled below
    # Note: 'extra_ins' will be passed as values of object attributes
    # of the 'ResonanceEquations' class defined below
    extra_ins = [m1, m2, j, k]
    givens = []
    ins = [dyvars] + extra_ins
    orbels = [a1, e1, inc1, l1_r, pomega1_r, Omega1_r,
              a2, e2, inc2, l2_r, pomega2_r, Omega2_r]

    # Conservative flow
    gradHtot = T.grad(Htot, wrt=dyvars)
    hessHtot = theano.gradient.hessian(Htot, wrt=dyvars)
    Jtens = T.as_tensor(_get_Omega_matrix(5))
    H_flow_vec = Jtens.dot(gradHtot)
    H_flow_jac = Jtens.dot(hessHtot)

    ##########################
    # Compile Theano functions
    ##########################
    orbels_fn = theano.function(inputs=ins, outputs=orbels, givens=givens,
                                on_unused_input='ignore')
    rv1_fn = theano.function(inputs=ins, outputs=r1 + v1, givens=givens,
                             on_unused_input='ignore')
    rv2_fn = theano.function(inputs=ins, outputs=r2 + v2, givens=givens,
                             on_unused_input='ignore')
    Htot_fn = theano.function(inputs=ins, outputs=Htot, givens=givens,
                              on_unused_input='ignore')
    Hpert_fn = theano.function(inputs=ins, outputs=Hpert, givens=givens,
                               on_unused_input='ignore')
    Hpert_components_fn = theano.function(inputs=ins,
                                          outputs=[Hint_dir, Hint_ind],
                                          givens=givens,
                                          on_unused_input='ignore')
    H_flow_vec_fn = theano.function(inputs=ins, outputs=H_flow_vec,
                                    givens=givens,
                                    on_unused_input='ignore')
    H_flow_jac_fn = theano.function(inputs=ins, outputs=H_flow_jac,
                                    givens=givens,
                                    on_unused_input='ignore')

    return dict({
        'orbital_elements': orbels_fn,
        'Hamiltonian': Htot_fn,
        'Hpert': Hpert_fn,
        'Hpert_components': Hpert_components_fn,
        'Hamiltonian_flow': H_flow_vec_fn,
        'Hamiltonian_flow_jacobian': H_flow_jac_fn,
        'positions_and_velocities1': rv1_fn,
        'positions_and_velocities2': rv2_fn
    })
def expectedValueRectified(mean, variance): std = T.sqrt(variance) return std / T.sqrt(2.0 * np.pi) * T.exp( -mean**2 / (2.0 * variance)) + mean * cdf(mean / std)
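# The function above is the mean of a rectified Gaussian:
# E[max(0, X)] = std * pdf(mu/std) + mu * cdf(mu/std) for X ~ N(mu, variance),
# where the first term std/sqrt(2*pi) * exp(-mu^2 / (2*variance)) is exactly
# std * pdf(mu/std). A Monte Carlo sanity-check sketch in plain NumPy:
import numpy as np
from math import erf

mu, variance = 0.7, 2.0
std = np.sqrt(variance)
cdf_val = 0.5 * (1.0 + erf(mu / (std * np.sqrt(2.0))))
closed = std / np.sqrt(2.0 * np.pi) * np.exp(-mu**2 / (2.0 * variance)) \
    + mu * cdf_val
mc = np.maximum(0.0, np.random.normal(mu, std, size=1000000)).mean()
# closed and mc should agree to within Monte Carlo error (~1e-3)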
def calc_Hint_components_sinf_cosf(a1, a2, e1, e2, inc1, inc2,
                                   omega1, omega2, Omega1, Omega2,
                                   n1, n2, sinf1, cosf1, sinf2, cosf2):
    r"""
    Compute the direct and indirect parts of the interaction Hamiltonian,

    .. math::
        -\frac{1}{|r_1 - r_2|} \quad \text{and} \quad v_1 \cdot v_2,

    from a set of input orbital elements.

    Arguments
    ---------
    a1 : float
        inner semi-major axis
    a2 : float
        outer semi-major axis
    e1 : float
        inner eccentricity
    e2 : float
        outer eccentricity
    inc1 : float
        inner inclination
    inc2 : float
        outer inclination
    omega1 : float
        inner argument of periapse
    omega2 : float
        outer argument of periapse
    Omega1 : float
        inner longitude of ascending node
    Omega2 : float
        outer longitude of ascending node
    n1 : float
        inner mean motion
    n2 : float
        outer mean motion
    sinf1 : float
        sine of inner planet true anomaly
    cosf1 : float
        cosine of inner planet true anomaly
    sinf2 : float
        sine of outer planet true anomaly
    cosf2 : float
        cosine of outer planet true anomaly

    Returns
    -------
    (direct, indirect, r1, r2, v1, v2) : tuple
        The direct and indirect parts of the interaction Hamiltonian,
        followed by the Cartesian positions and velocities of both planets.
    """
    r1 = a1 * (1 - e1 * e1) / (1 + e1 * cosf1)
    _x1 = r1 * cosf1
    _y1 = r1 * sinf1
    _z1 = 0.
    x1, y1, z1 = EulerAnglesTransform(_x1, _y1, _z1, Omega1, inc1, omega1)

    vel1 = n1 * a1 / T.sqrt(1 - e1 * e1)
    _u1 = -1 * vel1 * sinf1
    _v1 = vel1 * (e1 + cosf1)
    _w1 = 0.
    u1, v1, w1 = EulerAnglesTransform(_u1, _v1, _w1, Omega1, inc1, omega1)

    r2 = a2 * (1 - e2 * e2) / (1 + e2 * cosf2)
    _x2 = r2 * cosf2
    _y2 = r2 * sinf2
    _z2 = 0.
    x2, y2, z2 = EulerAnglesTransform(_x2, _y2, _z2, Omega2, inc2, omega2)

    vel2 = n2 * a2 / T.sqrt(1 - e2 * e2)  # orbital speed scale, n*a/sqrt(1-e^2)
    _u2 = -1 * vel2 * sinf2
    _v2 = vel2 * (e2 + cosf2)
    _w2 = 0.
    u2, v2, w2 = EulerAnglesTransform(_u2, _v2, _w2, Omega2, inc2, omega2)

    # direct term
    dx = x2 - x1
    dy = y2 - y1
    dz = z2 - z1
    dr2 = dx * dx + dy * dy + dz * dz
    direct = -1 / T.sqrt(dr2)

    # indirect term
    indirect = u1 * u2 + v1 * v2 + w1 * w2

    return direct, indirect, [x1, y1, z1], [x2, y2, z2], \
        [u1, v1, w1], [u2, v2, w2]
def step(input_n, gamma, time_step, cell_previous, hid_previous, *args): hidden = T.dot(hid_previous, W_hid_stacked) # batch normalization of hidden states if deterministic: mean = self.running_mean[time_step] inv_std = self.running_inv_std[time_step] else: mean = hidden.mean(0) inv_std = T.inv(T.sqrt(hidden.var(0) + self.epsilon)) self.running_mean_clone.default_update = \ T.set_subtensor(self.running_mean_clone.default_update[time_step], (1-self.alpha) * self.running_mean_clone.default_update[time_step] + self.alpha * mean) self.running_inv_std_clone.default_update = \ T.set_subtensor(self.running_inv_std_clone.default_update[time_step], (1-self.alpha) * self.running_inv_std_clone.default_update[time_step] + self.alpha * inv_std) mean += 0 * self.running_mean_clone[time_step] inv_std += 0 * self.running_inv_std_clone[time_step] gamma = gamma.dimshuffle('x', 0) mean = mean.dimshuffle('x', 0) inv_std = inv_std.dimshuffle('x', 0) # normalize normalized = (hidden - mean) * (gamma * inv_std) # Calculate gates pre-activations and slice gates = input_n + normalized # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip( gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous*self.W_cell_to_ingate forgetgate += cell_previous*self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate*cell_previous + ingate*cell_input if self.peepholes: outgate += cell*self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate*self.nonlinearity(cell) return [cell, hid]
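# The `mean += 0 * self.running_mean_clone[time_step]` lines in the step
# function above look like no-ops, but they pull the cloned running-statistics
# variables into the computation graph, so their default_update (the
# exponential moving average) actually executes when the compiled function
# runs. A minimal sketch of that default_update mechanism, not the layer's
# actual code:
import numpy as np
import theano
import theano.tensor as T

acc = theano.shared(np.float32(0.0), name='acc')
x = T.fscalar('x')
acc.default_update = acc + x   # side-effect update, applied whenever acc is
                               # part of a compiled graph
y = x + 0 * acc                # "+ 0 * acc" pulls acc into the graph
f = theano.function([x], y)
f(np.float32(2.0))
f(np.float32(3.0))
print(acc.get_value())         # 5.0: the default updates ran as side effects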