def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed): """Create Theano tensor of approximate ELBO by Monte Carlo sampling. """ l = (uw.size / 2).astype('int64') u = uw[:l] w = uw[l:] # Callable tensor logp_ = lambda input: theano.clone(logp, {inarray: input}, strict=False) # Naive Monte-Carlo r = MRG_RandomStreams(seed=random_seed) if n_mcsamples == 1: n = r.normal(size=inarray.tag.test_value.shape) q = n * exp(w) + u elbo = logp_(q) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi)) else: n = r.normal(size=(n_mcsamples, u.tag.test_value.shape[0])) qs = n * exp(w) + u logps, _ = theano.scan(fn=lambda q: logp_(q), outputs_info=None, sequences=[qs]) elbo = tt.mean(logps) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi)) return elbo
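# Hedged sanity check (not part of the original module; assumes numpy and scipy
# are installed): the closed-form entropy term used above,
# sum(w) + 0.5 * l * (1 + log(2*pi)), is the entropy of a diagonal Gaussian
# whose log-standard-deviations are w.
import numpy as np
from scipy.stats import multivariate_normal

w = np.array([0.3, -1.2, 0.0])
closed_form = w.sum() + 0.5 * w.size * (1 + np.log(2.0 * np.pi))
reference = multivariate_normal(mean=np.zeros(w.size),
                                cov=np.diag(np.exp(2 * w))).entropy()
assert np.allclose(closed_form, reference)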
def compute_output(self, network, in_vw): deterministic = network.find_hyperparameter(["deterministic"]) sigma = network.find_hyperparameter(["sigma"], None) if sigma is None: p = network.find_hyperparameter(["dropout_probability", "probability", "p"], 0) if p == 0: sigma = 0 else: # derive gaussian dropout variance from bernoulli dropout # probability sigma = T.sqrt(p / (1 - p)) if deterministic or sigma == 0: network.copy_vw(name="default", previous_vw=in_vw, tags={"output"}) else: mask_shape = in_vw.shape if any(s is None for s in mask_shape): # NOTE: this uses symbolic shape - can be an issue with # theano.clone and random numbers # https://groups.google.com/forum/#!topic/theano-users/P7Mv7Fg0kUs warnings.warn("using symbolic shape for dropout mask, " "which can be an issue with theano.clone") mask_shape = in_vw.variable.shape # TODO save this state so that we can seed the rng srng = MRG_RandomStreams() mask = srng.normal(mask_shape, avg=1.0, std=sigma, dtype=fX) network.create_vw("default", variable=in_vw.variable * mask, shape=in_vw.shape, tags={"output"})
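# Hedged numeric check (illustrative only): multiplicative Gaussian noise with
# std sqrt(p / (1 - p)), as derived above, matches the mean and variance of
# rescaled Bernoulli dropout with drop probability p (keep with probability
# 1 - p, then scale by 1 / (1 - p)).
import numpy as np

p = 0.3
rng = np.random.RandomState(0)
keep = (rng.rand(1000000) > p) / (1.0 - p)
print(keep.mean(), keep.var())      # ~1.0 and ~p/(1-p)
print(1.0, p / (1.0 - p))           # Gaussian-dropout mean and variance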
class GaussianProd(MaskedLayer):
    '''
        Multiply by Gaussian noise.
        Similar to dropout but with Gaussians instead of binomials.
        Keras' built-in version of this layer does not behave the way
        Variational AutoEncoders require, hence this custom layer.
    '''
    def __init__(self, avg=0., std=1., **kwargs):
        super(GaussianProd, self).__init__(**kwargs)
        self.std = std
        self.avg = avg
        self.srng = RandomStreams(seed=np.random.randint(10e6))

    def get_output(self, train=False):
        X = self.get_input(train)
        X *= self.srng.normal(size=X.shape, avg=self.avg, std=self.std,
                              dtype=floatX)
        return X

    def get_config(self):
        return {"name": self.__class__.__name__,
                "avg": self.avg,
                "std": self.std}
def compare_speed(): # To run this speed comparison # cd <directory of this file> # THEANO_FLAGS=device=gpu \ # python -c 'import test_rng_curand; test_rng_curand.compare_speed()' mrg = MRG_RandomStreams() crn = CURAND_RandomStreams(234) N = 1000 * 100 dest = theano.shared(numpy.zeros(N, dtype=theano.config.floatX)) mrg_u = theano.function([], [], updates={dest: mrg.uniform((N,))}, profile='mrg uniform') crn_u = theano.function([], [], updates={dest: crn.uniform((N,))}, profile='crn uniform') mrg_n = theano.function([], [], updates={dest: mrg.normal((N,))}, profile='mrg normal') crn_n = theano.function([], [], updates={dest: crn.normal((N,))}, profile='crn normal') for f in mrg_u, crn_u, mrg_n, crn_n: # don't time the first call, it has some startup cost print('DEBUGPRINT') print('----------') theano.printing.debugprint(f) for i in range(100): for f in mrg_u, crn_u, mrg_n, crn_n: # don't time the first call, it has some startup cost f.fn.time_thunks = (i > 0) f()
def compute_output(self, network, mu_vw, sigma_vw): deterministic = network.find_hyperparameter(["deterministic"], False) if deterministic: res = mu_vw.variable else: # TODO look at shape of both mu and sigma shape = mu_vw.shape if any(s is None for s in shape): # NOTE: this uses symbolic shape - can be an issue with # theano.clone and random numbers # https://groups.google.com/forum/#!topic/theano-users/P7Mv7Fg0kUs warnings.warn("using symbolic shape for random number shape, " "which can be an issue with theano.clone") shape = mu_vw.variable.shape # TODO save this state so that we can seed the rng srng = MRG_RandomStreams() res = srng.normal(shape, avg=mu_vw.variable, std=sigma_vw.variable, dtype=fX) network.create_vw( "default", variable=theano.gradient.disconnected_grad(res), shape=mu_vw.shape, tags={"output"}, )
class GaussianDropout(MaskedLayer): ''' Multiplicative Gaussian Noise Reference: Dropout: A Simple Way to Prevent Neural Networks from Overfitting Srivastava, Hinton, et al. 2014 http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf ''' def __init__(self, p, **kwargs): super(GaussianDropout, self).__init__(**kwargs) self.p = p self.srng = RandomStreams(seed=np.random.randint(10e6)) def get_output(self, train): X = self.get_input(train) if train: # self.p refers to drop probability rather than retain probability (as in paper) to match Dropout layer syntax X *= self.srng.normal(size=X.shape, avg=1.0, std=T.sqrt(self.p / (1.0 - self.p)), dtype=theano.config.floatX) return X def get_config(self): config = {"name": self.__class__.__name__, "p": self.p} base_config = super(GaussianDropout, self).get_config() return dict(list(base_config.items()) + list(config.items()))
def apply_noise(computation_graph, variables, level, seed=None): """Add Gaussian noise to certain variable of a computation graph. Parameters ---------- computation_graph : instance of :class:`ComputationGraph` The computation graph. variables : :class:`~tensor.TensorVariable` Variables to add noise to. level : float Noise level. seed : int, optional The seed with which :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` is initialized, is set to 1 by default. """ if not seed: seed = config.default_seed rng = MRG_RandomStreams(seed) replace = {} for variable in variables: replace[variable] = (variable + rng.normal(variable.shape, std=level)) return computation_graph.replace(replace)
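# Minimal self-contained illustration of the same substitution idea (names here
# are hypothetical and not part of the blocks API): add Gaussian noise to a
# shared weight by cloning the graph with a replacement.
import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

rng = MRG_RandomStreams(1)
W = theano.shared(numpy.ones((3, 3), dtype=theano.config.floatX), name='W')
x = T.matrix('x')
y = T.dot(x, W)
y_noisy = theano.clone(y, replace={W: W + rng.normal(W.shape, std=0.1)})
f = theano.function([x], y_noisy)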
class SimpleSampleLayer(lasagne.layers.MergeLayer): """ Simple sampling layer drawing a single Monte Carlo sample to approximate E_q [log( p(x,z) / q(z|x) )]. This is the approach described in [KINGMA]_. Parameters ---------- mu, log_var : class:`Layer` instances Parameterizing the mean and log(variance) of the distribution to sample from as described in [KINGMA]_. The code assumes that these have the same number of dimensions References ---------- .. [KINGMA] Kingma, Diederik P., and Max Welling. "Auto-encoding variational bayes." arXiv preprint arXiv:1312.6114 (2013). """ def __init__(self, mu, log_var, **kwargs): super(SimpleSampleLayer, self).__init__([mu, log_var], **kwargs) self._srng = RandomStreams( lasagne.random.get_rng().randint(1, 2147462579)) def get_output_shape_for(self, input_shapes): return input_shapes[0] def get_output_for(self, input, **kwargs): mu, log_var = input eps = self._srng.normal(mu.shape) z = mu + T.exp(0.5 * log_var) * eps return z
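# Hedged usage sketch (assumes Lasagne; the layer sizes are arbitrary): wire the
# sampling layer between an encoder that outputs mu / log_var and the rest of
# the network, as in a standard VAE reparameterization.
import lasagne

l_in = lasagne.layers.InputLayer((None, 784))
l_enc = lasagne.layers.DenseLayer(l_in, num_units=128)
l_mu = lasagne.layers.DenseLayer(l_enc, num_units=20, nonlinearity=None)
l_log_var = lasagne.layers.DenseLayer(l_enc, num_units=20, nonlinearity=None)
l_z = SimpleSampleLayer(mu=l_mu, log_var=l_log_var)
z = lasagne.layers.get_output(l_z)   # one sample of z per input row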
class AdditiveDiagonalMND: def __init__(self, init_beta, nvis): """ A conditional distribution that adds gaussian noise with diagonal precision matrix beta to another variable that it conditions on """ self.__dict__.update(locals()) del self.self self.beta = sharedX(np.ones((nvis,))*init_beta) assert self.beta.ndim == 1 self.s_rng = RandomStreams(17) def random_design_matrix(self, X): """ X: a theano variable containing a design matrix of observations of the random vector to condition on.""" Z = self.s_rng.normal(size=X.shape, avg=X, std=1./T.sqrt(self.beta), dtype=config.floatX) return Z def is_symmetric(self): """ A property of conditional distributions P(Y|X) Return true if P(y|x) = P(x|y) for all x,y """ return True
class GaussianNoise(Layer):
    """
    Adds Gaussian noise (mean `avg`, standard deviation `std`) to its input.
    """
    def __init__(self, rng, std=0.1, ndim=0, avg=0, shape_fn=None):
        assert rng is not None, "random number generator should not be empty!"
        super(GaussianNoise, self).__init__(0, 0, rng)

        self.std = std
        self.avg = avg
        self.ndim = ndim
        self.shape_fn = shape_fn
        if self.shape_fn:
            # Name is not important as it is not a parameter of the model
            self.noise_term = theano.shared(
                numpy.zeros((2,) * ndim, dtype=theano.config.floatX),
                name='ndata')
            self.noise_params += [self.noise_term]
            self.noise_params_shape_fn += [shape_fn]
        self.trng = RandomStreams(rng.randint(1e5))

    def fprop(self, x):
        self.out = x
        if self.std:
            if self.shape_fn:
                self.out += self.noise_term
            else:
                self.out += self.trng.normal(self.out.shape, std=self.std,
                                             avg=self.avg,
                                             dtype=self.out.dtype)
        return self.out
class NoiseInputLayer(Layer): def __init__(self, shape, input_var=None, name=None, **kwargs): self.shape = shape self._srng = RandomStreams(get_rng().randint(1, 2147462579)) if any(d is not None and d <= 0 for d in self.shape): raise ValueError(( "Cannot create InputLayer with a non-positive shape " "dimension. shape=%r, self.name=%r") % ( self.shape, name)) ndim = len(shape) if input_var is None: # create the right TensorType for the given number of dimensions input_var_type = T.TensorType(theano.config.floatX, [False] * ndim) var_name = ("%s.input" % name) if name is not None else "input" input_var = input_var_type(var_name) else: # ensure the given variable has the correct dimensionality if input_var.ndim != ndim: raise ValueError("shape has %d dimensions, but variable has " "%d" % (ndim, input_var.ndim)) self.input_var = self._srng.normal(self.shape, avg = 0., std = 0.1) self.name = name self.params = OrderedDict() @Layer.output_shape.getter def output_shape(self): return self.shape
class GaussianBandit(Environment): """ An n-armed bandit whose rewards are drawn from a different Gaussian distribution for each arm. The mean and standard deviation of the reward for each arm is drawn at initialization time from N(0, <corresponding std arg>). (For the standard deviation we use the absolute value of the Gaussian sample) """ def __init__(self, num_arms, mean_std = 1.0, std_std = 1.0): self.rng = np.random.RandomState([2013, 11, 12]) self.means = sharedX(self.rng.randn(num_arms) * mean_std) self.stds = sharedX(np.abs(self.rng.randn(num_arms) * std_std)) self.theano_rng = MRG_RandomStreams(self.rng.randint(2 ** 16)) def get_action_func(self): """ Returns a theano function that takes an action and returns a reward. """ action = T.iscalar() reward_mean = self.means[action] reward_std = self.stds[action] reward = self.theano_rng.normal(avg=reward_mean, std=reward_std, dtype=config.floatX, size=reward_mean.shape) rval = function([action], reward) return rval
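# Hedged usage sketch (assumes the surrounding pylearn2 imports): draw a few
# rewards from one arm of a 3-armed bandit.
bandit = GaussianBandit(num_arms=3)
reward_fn = bandit.get_action_func()
rewards = [reward_fn(0) for _ in range(5)]   # five noisy rewards from arm 0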
def prediction(self, h, bias): srng = RandomStreams(seed=42) prop, mean_x, mean_y, std_x, std_y, rho, bernoulli = \ self.compute_parameters(h, bias) mode = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype), axis=1) v = T.arange(0, mean_x.shape[0]) m_x = mean_x[v, mode] m_y = mean_y[v, mode] s_x = std_x[v, mode] s_y = std_y[v, mode] r = rho[v, mode] # cov = r * (s_x * s_y) normal = srng.normal((h.shape[0], 2)) x = normal[:, 0] y = normal[:, 1] # x_n = T.shape_padright(s_x * x + cov * y + m_x) # y_n = T.shape_padright(s_y * y + cov * x + m_y) x_n = T.shape_padright(m_x + s_x * x) y_n = T.shape_padright(m_y + s_y * (x * r + y * T.sqrt(1.-r**2))) uniform = srng.uniform((h.shape[0],)) pin = T.shape_padright(T.cast(bernoulli > uniform, floatX)) return T.concatenate([x_n, y_n, pin], axis=1)
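# Hedged numpy check (illustrative, separate from the Theano code above): for
# independent standard normals x and y, the pair
# (s_x * x, s_y * (r * x + sqrt(1 - r^2) * y)) has standard deviations
# (s_x, s_y) and correlation r, which is the reparameterization used for the
# correlated bivariate sample above.
import numpy as np

rng = np.random.RandomState(0)
x, y = rng.randn(2, 100000)
s_x, s_y, r = 2.0, 0.5, 0.7
a = s_x * x
b = s_y * (r * x + np.sqrt(1.0 - r ** 2) * y)
assert np.allclose(np.corrcoef(a, b)[0, 1], r, atol=0.01)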
def compute_output(self, network, in_vw): axis = network.find_hyperparameter(["axis"]) deterministic = network.find_hyperparameter(["deterministic"], False) # calculate output shape output_shape = list(in_vw.shape) output_shape.pop(axis) if deterministic: out_var = in_vw.variable.mean(axis=axis) else: # TODO save this state so that we can seed the rng srng = MRG_RandomStreams() if in_vw.shape[axis] is None: # NOTE: this uses symbolic shape - can be an issue with # theano.clone and random numbers # https://groups.google.com/forum/#!topic/theano-users/P7Mv7Fg0kUs warnings.warn("using symbolic shape for random variable size " "which can be an issue with theano.clone") idx = T.argmax(srng.normal([in_vw.symbolic_shape()[axis]])) slices = tuple([slice(None) for _ in range(axis)] + [idx]) out_var = in_vw.variable[slices] network.create_vw( "default", variable=out_var, shape=tuple(output_shape), tags={"output"}, )
class MND(object): """A Multivariate Normal Distribution""" def __init__(self, sigma, mu, seed=42): """ .. todo:: WRITEME properly Parameters ----------- sigma: a numpy ndarray of shape (n,n) mu: a numpy ndarray of shape (n,) seed: the seed for the theano random number generator used to sample from this distribution""" self.sigma = sigma self.mu = mu if not (len(mu.shape) == 1): raise Exception('mu has shape ' + str(mu.shape) + ' (it should be a vector)') self.sigma_inv = solve(self.sigma, N.identity(mu.shape[0]), sym_pos=True) self.L = cholesky(self.sigma) self.s_rng = RandomStreams(seed) #Compute logZ #log Z = log 1/( (2pi)^(-k/2) |sigma|^-1/2 ) # = log 1 - log (2pi^)(-k/2) |sigma|^-1/2 # = 0 - log (2pi)^(-k/2) - log |sigma|^-1/2 # = (k/2) * log(2pi) + (1/2) * log |sigma| k = float(self.mu.shape[0]) self.logZ = 0.5 * (k * N.log(2. * N.pi) + N.log(det(sigma))) def free_energy(self, X): """ .. todo:: WRITEME """ #design matrix format return .5 * T.sum(T.dot(X - self.mu, T.dot(self.sigma_inv, T.transpose(X - self.mu)))) def log_prob(self, X): """ .. todo:: WRITEME """ return - self.free_energy(X) - self.logZ def random_design_matrix(self, m): """ .. todo:: WRITEME """ Z = self.s_rng.normal(size=(m, self.mu.shape[0]), avg=0., std=1., dtype=config.floatX) return self.mu + T.dot(Z, self.L.T)
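# Hedged numpy check of the sampling scheme in random_design_matrix (not part
# of the original class): if Z has i.i.d. N(0, 1) rows and L is the lower
# Cholesky factor of sigma, then mu + Z L^T has covariance sigma.
import numpy as np
from scipy.linalg import cholesky

sigma = np.array([[2.0, 0.3], [0.3, 1.0]])
L = cholesky(sigma, lower=True)
Z = np.random.RandomState(0).randn(200000, 2)
X = Z.dot(L.T)
print(np.cov(X, rowvar=False))   # close to sigma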
def gaussian_noise(input_shape):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    _srng = RandomStreams()
    # symbolic noise variable
    mask = _srng.normal(input_shape, avg=0.0, std=1.0,
                        dtype=theano.config.floatX)
    # get a concrete value from the variable
    mask = mask.eval()
    noise = mask.reshape((-1, 1, 28, 28))
    return noise
def construct_graph_ref(self, args, x, length, popstats=None): p = self.allocate_parameters(args) if args.baseline: def bn(x, gammas, betas): return x + betas else: def bn(x, gammas, betas): mean, var = x.mean(axis=0, keepdims=True), x.var(axis=0, keepdims=True) # if only mean.tag.batchstat, var.tag.batchstat = True, True #var = T.maximum(var, args.epsilon) var = var + args.epsilon return (x - mean) / T.sqrt(var) * gammas + betas def stepfn(x, dummy_h, dummy_c, h, c): # a_mean, b_mean, c_mean, # a_var, b_var, c_var): a_mean, b_mean, c_mean = 0, 0, 0 a_var, b_var, c_var = 0, 0, 0 atilde = T.dot(h, p.Wa) btilde = x a_normal = bn(atilde, p.a_gammas, p.ab_betas) b_normal = bn(btilde, p.b_gammas, 0) ab = a_normal + b_normal g, f, i, o = [fn(ab[:, j * args.num_hidden:(j + 1) * args.num_hidden]) for j, fn in enumerate([self.activation] + 3 * [T.nnet.sigmoid])] c = dummy_c + f * c + i * g c_normal = bn(c, p.c_gammas, p.c_betas) h = dummy_h + o * self.activation(c_normal) return h, c, atilde, btilde, c_normal xtilde = T.dot(x, p.Wx) if args.noise: # prime h with white noise Trng = MRG_RandomStreams() h_prime = Trng.normal((xtilde.shape[1], args.num_hidden), std=args.noise) elif args.summarize: # prime h with mean of example h_prime = x.mean(axis=[0, 2])[:, None] else: h_prime = 0 dummy_states = dict(h=T.zeros((xtilde.shape[0], xtilde.shape[1], args.num_hidden)), c=T.zeros((xtilde.shape[0], xtilde.shape[1], args.num_hidden))) [h, c, atilde, btilde, htilde], _ = theano.scan( stepfn, sequences=[xtilde, dummy_states["h"], dummy_states["c"]], outputs_info=[T.repeat(p.h0[None, :], xtilde.shape[1], axis=0) + h_prime, T.repeat(p.c0[None, :], xtilde.shape[1], axis=0), None, None, None]) return dict(h=h, c=c, atilde=atilde, btilde=btilde, htilde=htilde), [], dummy_states, popstats
def test_normal0(): steps = 50 std = 2. if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or config.mode == 'Mode' and config.linker in ['py']): sample_size = (25, 30) default_rtol = .02 else: sample_size = (999, 50) default_rtol = .01 sample_size_odd = (sample_size[0], sample_size[1] - 1) x = tensor.matrix() for size, const_size, var_input, input, avg, rtol, std_tol in [ (sample_size, sample_size, [], [], -5., default_rtol, default_rtol), (x.shape, sample_size, [x], [np.zeros(sample_size, dtype=config.floatX)], -5., default_rtol, default_rtol), # test odd value (x.shape, sample_size_odd, [x], [np.zeros(sample_size_odd, dtype=config.floatX)], -5., default_rtol, default_rtol), (sample_size, sample_size, [], [], np.arange(np.prod(sample_size), dtype='float32').reshape(sample_size), 10. * std / np.sqrt(steps), default_rtol), # test empty size (scalar) ((), (), [], [], -5., default_rtol, 0.02), # test with few samples at the same time ((1,), (1,), [], [], -5., default_rtol, 0.02), ((3,), (3,), [], [], -5., default_rtol, 0.02), ]: R = MRG_RandomStreams(234) # Note: we specify `nstreams` to avoid a warning. n = R.normal(size=size, avg=avg, std=std, nstreams=rng_mrg.guess_n_streams(size, warn=False)) f = theano.function(var_input, n) f(*input) # Increase the number of steps if size implies only a few samples if np.prod(const_size) < 10: steps_ = steps * 50 else: steps_ = steps basictest(f, steps_, const_size, target_avg=avg, target_std=std, prefix='mrg ', allow_01=True, inputs=input, mean_rtol=rtol, std_tol=std_tol) sys.stdout.flush() RR = theano.tensor.shared_randomstreams.RandomStreams(234) nn = RR.normal(size=size, avg=avg, std=std) ff = theano.function(var_input, nn) basictest(ff, steps_, const_size, target_avg=avg, target_std=std, prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol)
def sample_vp(vparams, draws=1000, model=None, random_seed=20090425, hide_transformed=True): """Draw samples from variational posterior. Parameters ---------- vparams : dict or pymc3.variational.ADVIFit Estimated variational parameters of the model. draws : int Number of random samples. model : pymc3.Model Probabilistic model. random_seed : int Seed of random number generator. hide_transformed : bool If False, transformed variables are also sampled. Default is True. Returns ------- trace : pymc3.backends.base.MultiTrace Samples drawn from the variational posterior. """ model = modelcontext(model) if isinstance(vparams, ADVIFit): vparams = { 'means': vparams.means, 'stds': vparams.stds } # Make dict for replacements of random variables r = MRG_RandomStreams(seed=random_seed) updates = {} for var in model.free_RVs: u = theano.shared(vparams['means'][str(var)]).ravel() w = theano.shared(vparams['stds'][str(var)]).ravel() n = r.normal(size=u.tag.test_value.shape) updates.update({var: (n * w + u).reshape(var.tag.test_value.shape)}) vars = model.free_RVs # Replace some nodes of the graph with variational distributions samples = theano.clone(vars, updates) f = theano.function([], samples) # Random variables which will be sampled vars_sampled = [v for v in model.unobserved_RVs if not str(v).endswith('_')] \ if hide_transformed else \ [v for v in model.unobserved_RVs] varnames = [str(var) for var in model.unobserved_RVs] trace = NDArray(model=model, vars=vars_sampled) trace.setup(draws=draws, chain=0) for i in range(draws): # 'point' is like {'var1': np.array(0.1), 'var2': np.array(0.2), ...} point = {varname: value for varname, value in zip(varnames, f())} trace.record(point) return MultiTrace([trace])
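# Hedged usage sketch (the model is illustrative and the exact ADVI entry point
# varies across historical pymc3 versions): fit ADVI, then draw samples from
# the variational posterior with sample_vp.
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=10)
    pm.Normal('obs', mu=mu, sd=1, observed=[0.1, -0.2, 0.3])
    v_params = pm.variational.advi(n=5000)
    trace = sample_vp(v_params, draws=500, model=model)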
class NormalApproximation(object): def __init__(self, mu=0, std=np.exp(-3),seed=None): """ Approximation that samples network weights from factorized normal distribution. :param mu: prior mean for gaussian weights :param std: prior std for gaussian weights :param seed: random seed """ self.prior_mu = mu self.prior_std = std self.srng = RandomStreams(seed or get_rng().randint(1, 2147462579)) def log_normal(self,x, mean, std, eps=0.0): """computes log-proba of normal distribution""" std += eps return - 0.5 * np.log(2 * np.pi) - T.log(T.abs_(std)) - (x - mean) ** 2 / (2 * std ** 2) def log_prior(self, weights): """ Logarithm of prior probabilities for weights: log P(weights) aka log P(theta) """ return self.log_normal(weights, self.prior_mu, self.prior_std) def log_posterior_approx(self,weights, mean, rho): """ Logarithm of ELBO on posterior probabilities: log q(weights|learned mu and rho) aka log q(theta|x) """ std = T.log1p(T.exp(rho)) #rho to std return self.log_normal(weights, mean, std) def __call__(self, layer, spec, shape, name=None, **tags): # case when user uses default init specs assert tags.get('variational',False) == True, "Please declare param as variational to avoid confusion" if not isinstance(spec, dict): initial_rho = np.log(np.expm1(self.prior_std)) #std to rho assert np.isfinite(initial_rho),"too small std to initialize correctly. Please pass explicit"\ " initializer (dict with {'mu':mu_init, 'rho':rho_init})." spec = {'mu': spec,'rho':init.Constant(initial_rho)} mu_spec,rho_spec = spec['mu'],spec['rho'] rho = layer.add_param(rho_spec, shape,name=(name or 'unk')+'.rho', **tags) mean = layer.add_param(mu_spec, shape,name=(name or 'unk')+'.mu', **tags) #Reparameterization trick e = self.srng.normal(shape, std=1) W = mean + T.log1p(T.exp(rho)) * e #KL divergence KL(q,p) = E_(w~q(w|x)) [log q(w|x) - log P(w)] aka variational cost q_p = T.sum(self.log_posterior_approx(W, mean, rho) - self.log_prior(W)) #accumulate variational cost layer._bbwrap_var_cost += q_p return W
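# Hedged numeric check (illustrative): the rho <-> std mapping assumed above is
# the softplus and its inverse, std = log(1 + exp(rho)) and
# rho = log(exp(std) - 1).
import numpy as np

std = 0.05
rho = np.log(np.expm1(std))                      # std -> rho, as in __call__
assert np.allclose(np.log1p(np.exp(rho)), std)   # rho -> std, as in log_posterior_approx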
def fprop(self, state_below, add_noise=True): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) self.x = state_below # linear part if isinstance(self.x, S.SparseVariable): z = S.dot(self.x,self.W[0]) + self.b[0] else: z = T.dot(self.x,self.W[0]) + self.b[0] self.z = self.activate(z, self.expert_activation) # first layer non-linear part if isinstance(self.x, S.SparseVariable): h = S.dot(self.x,self.W[1]) + self.b[1] else: h = T.dot(self.x,self.W[1]) + self.b[1] # activate hidden units of non-linear part self.h = self.activate(h, self.hidden_activation) noise = 0. if add_noise: rng = MRG_RandomStreams(self.mlp.rng.randint(2**15)) noise = rng.normal(size = self.z.shape, std=self.noise_stdev , dtype=self.z.type.dtype) # second layer non-linear part self.a = T.dot(self.h,self.W[2]) + self.b[2] + noise # activate non-linear part self.m_mean = self.activate(self.a, self.gater_activation) # how many are over 0: self.effective_sparsity = T.cast(T.gt(self.m_mean, 0), theano.config.floatX).mean() # mix output of linear part with output of non-linear part self.p = self.m_mean * self.z if self.layer_name is not None: self.z.name = self.layer_name + '_z' self.h.name = self.layer_name + '_h' self.a.name = self.layer_name + '_a' self.m_mean.name = self.layer_name + '_m_mean' self.p.name = self.layer_name + '_p' return self.p
class SampleLayer(lasagne.layers.MergeLayer): """ Samplelayer supporting importance sampling as described in [BURDA]_ and multiple monte carlo samples for the approximation of E_q [log( p(x,z) / q(z|x) )] Parameters ---------- mu, log_var : class:`Layer` instances Parameterizing the mean and log(variance) of the distribution to sample from as described in [BURDA]. The code assumes that these have the same number of dimensions eq_samples: Int or T.scalar Number of Monte Carlo samples used to estimate the expectation over q(z|x) in eq. (8) in [BURDA] iw_samples: Int or T.scalar Number of importance samples in the sum over k in eq. (8) in [BURDA] References ---------- .. [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov. "Importance Weighted Autoencoders." arXiv preprint arXiv:1509.00519 (2015). """ def __init__(self, mu, log_var, eq_samples=1, iw_samples=1, **kwargs): super(SampleLayer, self).__init__([mu, log_var], **kwargs) self.eq_samples = eq_samples self.iw_samples = iw_samples self._srng = RandomStreams( lasagne.random.get_rng().randint(1, 2147462579)) def get_output_shape_for(self, input_shapes): batch_size, num_latent = input_shapes[0] if isinstance(batch_size, int) and \ isinstance(self.iw_samples, int) and \ isinstance(self.eq_samples, int): out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent) else: out_dim = (None, num_latent) return out_dim def get_output_for(self, input, **kwargs): mu, log_var = input batch_size, num_latent = mu.shape eps = self._srng.normal( [batch_size, self.eq_samples, self.iw_samples, num_latent], dtype=theano.config.floatX) z = mu.dimshuffle(0,'x','x',1) + \ T.exp(0.5 * log_var.dimshuffle(0,'x','x',1)) * eps return z.reshape((-1,num_latent))
class NormalRandom(object):
    """Implements normal random sampling in Theano"""

    def __init__(self):
        self._rng = RandomStreams(seed=self.seed or 123456)

    def _sample(self, shape, dtype):
        return self._rng.normal(
            size=shape, avg=self.mean, std=self.std, dtype=dtype)
def get_samples_and_objectives(self, model, data): space, sources = self.get_data_specs(model) space.validate(data) assert isinstance(model, AdversaryPair) g = model.generator d = model.discriminator # Note: this assumes data is design matrix X = data m = data.shape[space.get_batch_axis()] y1 = T.alloc(1, m, 1) y0 = T.alloc(0, m, 1) # NOTE: if this changes to optionally use dropout, change the inference # code below to use a non-dropped-out version. S, z, other_layers = g.sample_and_noise(m, default_input_include_prob=self.generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale, all_g_layers=(self.infer_layer is not None)) if self.noise_both != 0.: rng = MRG_RandomStreams(2014 / 6 + 2) S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob, self.discriminator_input_include_probs, self.discriminator_default_input_scale, self.discriminator_input_scales) y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob, self.discriminator_input_include_probs, self.discriminator_default_input_scale, self.discriminator_input_scales) d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0)) if self.no_drop_in_d_for_g: y_hat0_no_drop = d.dropout_fprop(S) g_obj = d.layers[-1].cost(y1, y_hat0_no_drop) else: g_obj = d.layers[-1].cost(y1, y_hat0) if self.blend_obj: g_obj = (self.zurich_coeff * g_obj - self.minimax_coeff * d_obj) / (self.zurich_coeff + self.minimax_coeff) if model.inferer is not None: # Change this if we ever switch to using dropout in the # construction of S. S_nograd = block_gradient(S) # Redundant as long as we have custom get_gradients pred = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob, self.inference_input_include_probs, self.inference_default_input_scale, self.inference_input_scales) if self.infer_layer is None: target = z else: target = other_layers[self.infer_layer] i_obj = model.inferer.layers[-1].cost(target, pred) else: i_obj = 0 return S, d_obj, g_obj, i_obj
def apply(self, x): # lazy hack h0 = self.parameters[0] c0 = self.parameters[1] Wa = self.parameters[2] Wx = self.parameters[3] if self.baseline: ab_betas = self.parameters[4] h_betas = self.parameters[5] a_gammas = None b_gammas = None h_gammas = None else: a_gammas = self.parameters[4] b_gammas = self.parameters[5] h_gammas = self.parameters[6] ab_betas = self.parameters[7] h_betas = self.parameters[8] xtilde = tensor.dot(x, Wx) if self.noise: # prime h with white noise Trng = MRG_RandomStreams() h_prime = Trng.normal((xtilde.shape[1], self.state_dim), std=args.noise) #elif args.summarize: # # prime h with summary of example # Winit = theano.shared(orthogonal((nclasses, self.state_dim)), name="Winit") # parameters.append(Winit) # h_prime = tensor.dot(x, Winit).mean(axis=0) else: h_prime = 0 dummy_states = dict(h=tensor.zeros((xtilde.shape[0], xtilde.shape[1], self.state_dim)), c=tensor.zeros((xtilde.shape[0], xtilde.shape[1], self.state_dim))) def stepfn(xtilde, dummy_h, dummy_c, h, c): atilde = tensor.dot(h, Wa) btilde = xtilde a = self.bn(atilde, a_gammas, ab_betas) b = self.bn(btilde, b_gammas, 0) ab = a + b g, f, i, o = [fn(ab[:, j * self.state_dim:(j + 1) * self.state_dim]) for j, fn in enumerate([self.children[0].apply] + 3 * [tensor.nnet.sigmoid])] c = dummy_c + f * c + i * g htilde = c h = dummy_h + o * self.children[0].apply(self.bn(htilde, h_gammas, h_betas)) return h, c, atilde, btilde, htilde [h, c, atilde, btilde, htilde], _ = theano.scan( stepfn, sequences=[xtilde, dummy_states["h"], dummy_states["c"]], outputs_info=[tensor.repeat(h0[None, :], xtilde.shape[1], axis=0) + h_prime, tensor.repeat(c0[None, :], xtilde.shape[1], axis=0), None, None, None]) #return dict(h=h, c=c, atilde=atilde, btilde=btilde, htilde=htilde), dummy_states, parameters return h
def apply_noise(computation_graph, variables, level, seed=None): if not seed: seed = config.default_seed rng = MRG_RandomStreams(seed) replace = {} for variable in variables: replace[variable] = (variable + level*rng.normal(variable.shape)) return computation_graph.replace(replace)
class ESGD(RmsProp): r'''Equilibrated SGD computes a diagonal preconditioner for gradient descent. The ESGD method uses the same general strategy as SGD, in the sense that all gradient-based methods make small parameter adjustments using local derivative information. The difference here is that as gradients are computed during each parameter update, an exponential moving average of diagonal preconditioner values is maintained as well. At each update, the EWMA is used to compute the root-mean-square (RMS) diagonal preconditioner value that's been seen in the recent past. The actual gradient is normalized by this preconditioner before being applied to update the parameters. .. math:: \begin{eqnarray*} r &\sim& \mathcal{N}(0, 1) \\ Hr &=& \frac{\partial^2 \mathcal{L}}{\partial^2\theta}r \\ D_{t+1} &=& \gamma D_t + (1 - \gamma) (Hr)^2 \\ v_{t+1} &=& \mu v_t - \frac{\alpha}{\sqrt{D_{t+1} + \epsilon}} \frac{\partial\mathcal{L}}{\partial\theta} \\ \theta_{t+1} &=& \theta_t + v_{t+1} \end{eqnarray*} Like :class:`Rprop` and the :class:`ADADELTA`--:class:`RmsProp` family, this learning method effectively maintains a sort of parameter-specific momentum value. The primary difference between this method and :class:`RmsProp` is that ESGD treats the normalizing fraction explicitly as a preconditioner for the diaonal of the Hessian, and estimates this diagonal by drawing a vector of standard normal values at every training step. The primary difference between this implementation and the algorithm described in the paper (see below) is the use of an EWMA to decay the diagonal values over time, while in the paper the diagonal is divided by the training iteration. In this implementation, :math:`\epsilon` is set to 1e-4. The weight parameter :math:`\gamma` for the EWMA window is computed from the ``rms_halflife`` keyword argument, such that the actual EWMA weight varies inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln 2}{h}}`. The implementation here is modeled after Dauphin, de Vries, Chung & Bengio (2014), "RMSProp and equilibrated adaptive learning rates for non-convex optimization," http://arxiv.org/pdf/1502.04390.pdf. ''' def __init__(self, *args, **kwargs): self.rng = RandomStreams() super(ESGD, self).__init__(*args, **kwargs) def learning_updates(self): eps = 1e-4 # more or less from the paper for param, grad in zip(self.params, self.clipped_gradients()): D_tm1 = self.shared_like(param, 'D_ewma') vel_tm1 = self.shared_like(param, 'vel') Hv = TT.Rop(grad, param, self.rng.normal(param.shape)) D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv vel_t = self.momentum * vel_tm1 - grad * self.learning_rate / TT.sqrt(D_t + eps) yield D_tm1, D_t yield param, param + vel_t
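# Standalone illustration of the Hessian-vector product used in the ESGD update
# (hedged sketch, assumes Theano; variable names are arbitrary): Rop(grad, w, r)
# computes H r without ever forming the Hessian H.
import numpy
import theano
import theano.tensor as TT

w = theano.shared(numpy.array([1.0, 2.0], dtype=theano.config.floatX))
loss = (w ** 2).sum()
g = TT.grad(loss, w)
r = TT.vector('r')
Hv = TT.Rop(g, w, r)
f = theano.function([r], Hv)
# For loss = sum(w^2), the Hessian is 2*I, so f(r) == 2*r.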
def SGLD(loss, params, learning_rate, log_prior, N): """Apply the SGLD MCMC sampler""" g_lik = get_or_compute_grads(-N*loss, params) g_prior = get_or_compute_grads(log_prior, params) smrg = MRG_RandomStreams() updates = OrderedDict() for param, gl, gp in zip(params, g_lik, g_prior): eta = T.sqrt(learning_rate)*smrg.normal(size=param.shape) delta = 0.5*learning_rate*(gl + gp) + eta updates[param] = param + delta return updates
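# Hedged numpy sketch of a single SGLD step (illustrative, independent of the
# Theano code above): the parameter moves along half the stochastic gradient of
# the log-posterior plus N(0, learning_rate) injected noise, matching
# delta = 0.5 * lr * (gl + gp) + sqrt(lr) * normal() above.
import numpy as np

rng = np.random.RandomState(0)
theta, lr, N = 0.0, 1e-3, 1000
grad_loglik = -(theta - 1.0) * N    # toy gradient of N * mean log-likelihood
grad_logprior = -theta              # gradient of a N(0, 1) log-prior
theta = theta + 0.5 * lr * (grad_loglik + grad_logprior) + np.sqrt(lr) * rng.randn()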
def Santa(tparams, cost, inps, lr, eidx, nframes, max_epoch, rho=0.95, anne_rate=0.5, e=1e-8, clip_norm=5): """ The implementation of Santa algorithm. tparams: theano shared variables, params that we need to optimize cost: cost function, the cross-entropy loss in our case inps: input theano variables lr: learning rate, in our case, we choose it to be 1.*1e-3, or 2.*1e-4 eidx: the current epochs we are running, used to decide when to change from exploration to refinement nframes: how many time-steps we have in the training dataset. max_epoch: the maximum of epochs we run rho, anne_rate, e, clip_norm: hyper-parameters we used in all the algorithms. """ trng = RandomStreams(123) grads = tensor.grad(cost, tparams.values()) norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads])) if tensor.ge(norm, clip_norm): grads = [g*clip_norm/norm for g in grads] gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()] gsup = [(gs, g) for gs, g in zip(gshared, grads)] f_grad_shared = theano.function(inps, cost, updates=gsup) updates = [] i = theano.shared(numpy_floatX(0.)) i_t = i + 1. for p, g in zip(tparams.values(), gshared): m = theano.shared(p.get_value() * 0.) v = theano.shared(p.get_value() * 0.) alpha = theano.shared(np.ones(p.get_value().shape)*.5) alpha_t = alpha + (m**2 - lr/(i_t ** anne_rate)) * tensor.lt(eidx, 0.15*max_epoch) v_t = rho * v + (1.-rho) * (g ** 2) pcder = tensor.sqrt(tensor.sqrt(v_t)+e) eps = trng.normal(p.get_value().shape, avg = 0.0, std = 1.0, dtype=theano.config.floatX) m_t = -lr*g/pcder + (1. - alpha_t) * m + (tensor.sqrt(2*lr*v_t/(i_t ** anne_rate)/nframes) *eps) * tensor.lt(eidx, 0.15*max_epoch) p_t = p + (m_t/ pcder) updates.append((alpha, alpha_t)) updates.append((m, m_t)) updates.append((v, v_t)) updates.append((p, p_t)) updates.append((i, i_t)) f_update = theano.function([lr,eidx,nframes,max_epoch], [], updates=updates) return f_grad_shared, f_update
def _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, n_mcsamples, random_seed): """Return expression of approximate ELBO based on Monte Carlo sampling. """ if random_seed is None: r = MRG_RandomStreams(gen_random_state()) else: r = MRG_RandomStreams(seed=random_seed) if uw_l is not None: l_g = (uw_g.size / 2).astype('int64') u_g = uw_g[:l_g] w_g = uw_g[l_g:] l_l = (uw_l.size / 2).astype('int64') u_l = uw_l[:l_l] w_l = uw_l[l_l:] def logp_(z_g, z_l): return theano.clone(logp, {inarray_g: z_g, inarray_l: z_l}, strict=False) if n_mcsamples == 1: n_g = r.normal(size=inarray_g.tag.test_value.shape) z_g = n_g * tt.exp(w_g) + u_g n_l = r.normal(size=inarray_l.tag.test_value.shape) z_l = n_l * tt.exp(w_l) + u_l elbo = logp_(z_g, z_l) + \ tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi)) + \ tt.sum(w_l) + 0.5 * l_l * (1 + np.log(2.0 * np.pi)) else: ns_g = r.normal(size=inarray_g.tag.test_value.shape) zs_g = ns_g * tt.exp(w_g) + u_g ns_l = r.normal(size=inarray_l.tag.test_value.shape) zs_l = ns_l * tt.exp(w_l) + u_l logps, _ = theano.scan(fn=lambda z_g, z_l: logp_(z_g, z_l), outputs_info=None, sequences=zip(zs_g, zs_l)) elbo = tt.mean(logps) + \ tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi)) + \ tt.sum(w_l) + 0.5 * l_l * (1 + np.log(2.0 * np.pi)) else: l_g = (uw_g.size / 2).astype('int64') u_g = uw_g[:l_g] w_g = uw_g[l_g:] def logp_(z_g): return theano.clone(logp, {inarray_g: z_g}, strict=False) if n_mcsamples == 1: n_g = r.normal(size=inarray_g.tag.test_value.shape) z_g = n_g * tt.exp(w_g) + u_g elbo = logp_(z_g) + \ tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi)) else: n_g = r.normal(size=(n_mcsamples, u_g.tag.test_value.shape[0])) zs_g = n_g * tt.exp(w_g) + u_g logps, _ = theano.scan(fn=lambda q: logp_(q), outputs_info=None, sequences=[zs_g]) elbo = tt.mean(logps) + \ tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi)) return elbo
class LadderAE(): def __init__(self, p): self.p = p self.init_weights_transpose = False self.default_lr = p.lr self.shareds = OrderedDict() self.rstream = RandomStreams(seed=p.seed) self.rng = np.random.RandomState(seed=p.seed) n_layers = len(p.encoder_layers) assert n_layers > 1, "Need to define encoder layers" assert n_layers == len(p.denoising_cost_x), ( "Number of denoising costs does not match with %d layers: %s" % (n_layers, str(p.denoising_cost_x))) def one_to_all(x): """ (5.,) -> 5 -> (5., 5., 5.) ('relu',) -> 'relu' -> ('relu', 'relu', 'relu') """ if type(x) is tuple and len(x) == 1: x = x[0] if type(x) is float: x = (np.float32(x), ) * n_layers if type(x) is str: x = (x, ) * n_layers return x p.decoder_spec = one_to_all(p.decoder_spec) p.f_local_noise_std = one_to_all(p.f_local_noise_std) acts = one_to_all(p.get('act', 'relu')) assert n_layers == len(p.decoder_spec), "f and g need to match" assert (n_layers == len(acts)), ( "Not enough activations given. Requires %d. Got: %s" % (n_layers, str(acts))) acts = acts[:-1] + ('softmax', ) def parse_layer(spec): """ 'fc:5' -> ('fc', 5) '5' -> ('fc', 5) 5 -> ('fc', 5) 'convv:3:2:2' -> ('convv', [3,2,2]) """ if type(spec) is not str: return "fc", spec spec = spec.split(':') l_type = spec.pop(0) if len(spec) >= 2 else "fc" spec = list(map(int, spec)) spec = spec[0] if len(spec) == 1 else spec return l_type, spec enc = list(map(parse_layer, p.encoder_layers)) self.layers = list(enumerate(zip(enc, p.decoder_spec, acts))) def weight(self, init, name, cast_float32=True, for_conv=False): weight = self.shared(init, name, cast_float32, role=WEIGHT) if for_conv: return weight.dimshuffle('x', 0, 'x', 'x') return weight def bias(self, init, name, cast_float32=True, for_conv=False): b = self.shared(init, name, cast_float32, role=BIAS) if for_conv: return b.dimshuffle('x', 0, 'x', 'x') return b def shared(self, init, name, cast_float32=True, role=PARAMETER, **kwargs): p = self.shareds.get(name) if p is None: p = shared_param(init, name, cast_float32, role, **kwargs) self.shareds[name] = p return p def counter(self): name = 'counter' p = self.shareds.get(name) update = [] if p is None: p_max_val = np.float32(10) p = self.shared(np.float32(1), name, role=BNPARAM) p_max = self.shared(p_max_val, name + '_max', role=BNPARAM) update = [(p, T.clip(p + np.float32(1), np.float32(0), p_max)), (p_max, p_max_val)] return (p, update) def noise_like(self, x): noise = self.rstream.normal(size=x.shape, avg=0.0, std=1.0) return T.cast(noise, dtype=floatX) def rand_init(self, in_dim, out_dim): """ Random initialization for fully connected layers """ W = self.rng.randn(in_dim, out_dim) / np.sqrt(in_dim) return W def rand_init_conv(self, dim): """ Random initialization for convolution filters """ fan_in = np.prod(dtype=floatX, a=dim[1:]) bound = np.sqrt(3. / max(1.0, (fan_in))) W = np.asarray(self.rng.uniform(low=-bound, high=bound, size=dim), dtype=floatX) return W def new_activation_dict(self): return AttributeDict({'z': {}, 'h': {}, 's': {}, 'm': {}}) def annotate_update(self, update, tag_to): a = Annotation() for (var, up) in update: a.updates[var] = up add_annotation(tag_to, a) def apply(self, input_labeled, target_labeled, input_unlabeled): self.layer_counter = 0 input_dim = self.p.encoder_layers[0] # Store the dimension tuples in the same order as layers. 
layers = self.layers self.layer_dims = {0: input_dim} self.lr = self.shared(self.default_lr, 'learning_rate', role=None) self.costs = costs = AttributeDict() self.costs.denois = AttributeDict() self.act = AttributeDict() self.error = AttributeDict() top = len(layers) - 1 N = input_labeled.shape[0] self.join = lambda l, u: T.concatenate([l, u], axis=0) self.labeled = lambda x: x[:N] if x is not None else x self.unlabeled = lambda x: x[N:] if x is not None else x self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x)) input_concat = self.join(input_labeled, input_unlabeled) def encoder(input_, path_name, input_noise_std=0, noise_std=[]): h = input_ logger.info(' 0: noise %g' % input_noise_std) if input_noise_std > 0.: h = h + self.noise_like(h) * input_noise_std d = AttributeDict() d.unlabeled = self.new_activation_dict() d.labeled = self.new_activation_dict() d.labeled.z[0] = self.labeled(h) d.unlabeled.z[0] = self.unlabeled(h) prev_dim = input_dim for i, (spec, _, act_f) in layers[1:]: d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h) noise = noise_std[i] if i < len(noise_std) else 0. curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f, path_name=path_name, noise_std=noise) assert self.layer_dims.get(i) in (None, curr_dim) self.layer_dims[i] = curr_dim d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z) d.unlabeled.s[i] = s d.unlabeled.m[i] = m prev_dim = curr_dim d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h) return d # Clean, supervised logger.info('Encoder: clean, labeled') clean = self.act.clean = encoder(input_concat, 'clean') # Corrupted, supervised logger.info('Encoder: corr, labeled') corr = self.act.corr = encoder(input_concat, 'corr', input_noise_std=self.p.super_noise_std, noise_std=self.p.f_local_noise_std) est = self.act.est = self.new_activation_dict() # Decoder path in opposite order logger.info('Decoder: z_corr -> z_est') for i, ((_, spec), l_type, act_f) in layers[::-1]: z_corr = corr.unlabeled.z[i] z_clean = clean.unlabeled.z[i] z_clean_s = clean.unlabeled.s.get(i) z_clean_m = clean.unlabeled.m.get(i) fspec = layers[i + 1][1][0] if len(layers) > i + 1 else (None, None) if i == top: ver = corr.unlabeled.h[i] ver_dim = self.layer_dims[i] top_g = True else: ver = est.z.get(i + 1) ver_dim = self.layer_dims.get(i + 1) top_g = False z_est = self.g(z_lat=z_corr, z_ver=ver, in_dims=ver_dim, out_dims=self.layer_dims[i], l_type=l_type, num=i, fspec=fspec, top_g=top_g) if z_est is not None: # Denoising cost if z_clean_s and self.p.zestbn == 'bugfix': z_est_norm = (z_est - z_clean_m ) / T.sqrt(z_clean_s + np.float32(1e-10)) elif z_clean_s is None or self.p.zestbn == 'no': z_est_norm = z_est else: assert False, 'Not supported path' se = SquaredError('denois' + str(i)) costs.denois[i] = se.apply(z_est_norm.flatten(2), z_clean.flatten(2)) \ / np.prod(self.layer_dims[i], dtype=floatX) costs.denois[i].name = 'denois' + str(i) denois_print = 'denois %.2f' % self.p.denoising_cost_x[i] else: denois_print = '' # Store references for later use est.h[i] = self.apply_act(z_est, act_f) est.z[i] = z_est est.s[i] = None est.m[i] = None logger.info(' g%d: %10s, %s, dim %s -> %s' % (i, l_type, denois_print, self.layer_dims.get(i + 1), self.layer_dims.get(i))) # Costs y = target_labeled.flatten() costs.class_clean = CategoricalCrossEntropy().apply( y, clean.labeled.h[top]) costs.class_clean.name = 'cost_class_clean' costs.class_corr = CategoricalCrossEntropy().apply( y, corr.labeled.h[top]) costs.class_corr.name = 'cost_class_corr' # This will be used for training 
costs.total = costs.class_corr * 1.0 for i in range(top + 1): if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0: costs.total += costs.denois[i] * self.p.denoising_cost_x[i] costs.total.name = 'cost_total' # Classification error mr = MisclassificationRate() self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.) self.error.clean.name = 'error_rate_clean' def apply_act(self, input, act_name): if input is None: return input act = { 'relu': lambda x: T.maximum(0, x), 'leakyrelu': lambda x: T.switch(x > 0., x, 0.1 * x), 'linear': lambda x: x, 'softplus': lambda x: T.log(1. + T.exp(x)), 'sigmoid': lambda x: T.nnet.sigmoid(x), 'softmax': lambda x: T.nnet.softmax(x), }.get(act_name) assert act, 'unknown act %s' % act_name if act_name == 'softmax': input = input.flatten(2) return act(input) def annotate_bn(self, var, id, var_type, mb_size, size, norm_ax): var_shape = np.array((1, ) + size) out_dim = np.prod(var_shape) / np.prod(var_shape[list(norm_ax)]) # Flatten the var - shared variable updating is not trivial otherwise, # as theano seems to believe a row vector is a matrix and will complain # about the updates orig_shape = var.shape var = var.flatten() # Here we add the name and role, the variables will later be identified # by these values var.name = id + '_%s_clean' % var_type add_role(var, BNPARAM) shared_var = self.shared(np.zeros(out_dim), name='shared_%s' % var.name, role=None) # Update running average estimates. When the counter is reset to 1, it # will clear its memory cntr, c_up = self.counter() one = np.float32(1) run_avg = lambda new, old: one / cntr * new + (one - one / cntr) * old if var_type == 'mean': new_value = run_avg(var, shared_var) elif var_type == 'var': mb_size = T.cast(mb_size, 'float32') new_value = run_avg(mb_size / (mb_size - one) * var, shared_var) else: raise NotImplemented('Unknown batch norm var %s' % var_type) # Add the counter update to the annotated update if it is the first # instance of a counter self.annotate_update([(shared_var, new_value)] + c_up, var) return var.reshape(orig_shape) def f(self, h, in_dim, spec, num, act_f, path_name, noise_std=0): assert path_name in ['clean', 'corr'] # Generates identifiers used for referencing shared variables. # E.g. clean and corrupted encoders will end up using the same # variable name and hence sharing parameters gen_id = lambda s: '_'.join(['f', str(num), s]) layer_type, _ = spec # Pooling if layer_type in ['maxpool', 'globalmeanpool']: z, output_size = self.f_pool(h, spec, in_dim) norm_ax = (0, -2, -1) # after pooling, no activation func for now unless its softmax act_f = "linear" if act_f != "softmax" else act_f # Convolution elif layer_type in ['convv', 'convf']: z, output_size = self.f_conv(h, spec, in_dim, gen_id('W')) norm_ax = (0, -2, -1) # Fully connected elif layer_type == "fc": h = h.flatten(2) if h.ndim > 2 else h _, dim = spec W = self.weight(self.rand_init(np.prod(in_dim), dim), gen_id('W')) z, output_size = T.dot(h, W), (dim, ) norm_ax = (0, ) else: raise ValueError("Unknown layer spec: %s" % layer_type) m = s = None is_normalizing = True if is_normalizing: keep_dims = True z_l = self.labeled(z) z_u = self.unlabeled(z) m = z_u.mean(norm_ax, keepdims=keep_dims) s = z_u.var(norm_ax, keepdims=keep_dims) m_l = z_l.mean(norm_ax, keepdims=keep_dims) s_l = z_l.var(norm_ax, keepdims=keep_dims) if path_name == 'clean': # Batch normalization estimates the mean and variance of # validation and test sets based on the training set # statistics. 
The following annotates the computation of # running average to the graph. m_l = self.annotate_bn(m_l, gen_id('bn'), 'mean', z_l.shape[0], output_size, norm_ax) s_l = self.annotate_bn(s_l, gen_id('bn'), 'var', z_l.shape[0], output_size, norm_ax) z = self.join((z_l - m_l) / T.sqrt(s_l + np.float32(1e-10)), (z_u - m) / T.sqrt(s + np.float32(1e-10))) if noise_std > 0: z += self.noise_like(z) * noise_std # z for lateral connection z_lat = z b_init, c_init = 0.0, 1.0 b_c_size = output_size[0] # Add bias if act_f != 'linear': z += self.bias(b_init * np.ones(b_c_size), gen_id('b'), for_conv=len(output_size) > 1) if is_normalizing: # Add free parameter (gamma in original Batch Normalization paper) # if needed by the activation. For instance ReLU does't need one # and we only add it to softmax if hyperparameter top_c is set. if (act_f not in ['relu', 'leakyrelu', 'linear', 'softmax'] or (act_f == 'softmax' and self.p.top_c is True)): c = self.weight(c_init * np.ones(b_c_size), gen_id('c'), for_conv=len(output_size) > 1) z *= c h = self.apply_act(z, act_f) logger.info(' f%d: %s, %s,%s noise %.2f, params %s, dim %s -> %s' % (num, layer_type, act_f, ' BN,' if is_normalizing else '', noise_std, spec[1], in_dim, output_size)) return output_size, z_lat, m, s, h def f_pool(self, x, spec, in_dim): layer_type, dims = spec num_filters = in_dim[0] if "globalmeanpool" == layer_type: y, output_size = global_meanpool_2d(x, num_filters) # scale the variance to match normal conv layers with xavier init y = y * np.float32(in_dim[-1]) * np.float32(np.sqrt(3)) else: assert dims[0] != 1 or dims[1] != 1 y, output_size = maxpool_2d(x, in_dim, poolsize=(dims[1], dims[1]), poolstride=(dims[0], dims[0])) return y, output_size def f_conv(self, x, spec, in_dim, weight_name): layer_type, dims = spec num_filters = dims[0] filter_size = (dims[1], dims[1]) stride = (dims[2], dims[2]) bm = 'full' if 'convf' in layer_type else 'valid' num_channels = in_dim[0] W = self.weight( self.rand_init_conv((num_filters, num_channels) + filter_size), weight_name) if stride != (1, 1): f = GpuCorrMM(subsample=stride, border_mode=bm, pad=(0, 0)) y = f(gpu_contiguous(x), gpu_contiguous(W)) else: assert self.p.batch_size == self.p.valid_batch_size y = conv2d(x, W, image_shape=(2 * self.p.batch_size, ) + in_dim, filter_shape=((num_filters, num_channels) + filter_size), border_mode=bm) output_size = ( (num_filters, ) + ConvOp.getOutputShape(in_dim[1:], filter_size, stride, bm)) return y, output_size def g(self, z_lat, z_ver, in_dims, out_dims, l_type, num, fspec, top_g): f_layer_type, dims = fspec is_conv = f_layer_type is not None and ('conv' in f_layer_type or 'pool' in f_layer_type) gen_id = lambda s: '_'.join(['g', str(num), s]) in_dim = np.prod(dtype=floatX, a=in_dims) out_dim = np.prod(dtype=floatX, a=out_dims) num_filters = out_dims[0] if is_conv else out_dim if l_type[-1] in ['0']: g_type, u_type = l_type[:-1], l_type[-1] else: g_type, u_type = l_type, None # Mapping from layer above: u if u_type in ['0'] or z_ver is None: if z_ver is None and u_type not in ['0']: logger.warn('Decoder %d:%s without vertical input' % (num, g_type)) u = None else: if top_g: u = z_ver elif is_conv: u = self.g_deconv(z_ver, in_dims, out_dims, gen_id('W'), fspec) else: W = self.weight(self.rand_init(in_dim, out_dim), gen_id('W')) u = T.dot(z_ver, W) # Batch-normalize u if u is not None: norm_ax = (0, ) if u.ndim <= 2 else (0, -2, -1) keep_dims = True u -= u.mean(norm_ax, keepdims=keep_dims) u /= T.sqrt(u.var(norm_ax, keepdims=keep_dims) + np.float32(1e-10)) # 
Define the g function if not is_conv: z_lat = z_lat.flatten(2) bi = lambda inits, name: self.bias( inits * np.ones(num_filters), gen_id(name), for_conv=is_conv) wi = lambda inits, name: self.weight( inits * np.ones(num_filters), gen_id(name), for_conv=is_conv) if g_type == '': z_est = None elif g_type == 'i': z_est = z_lat elif g_type in ['sig']: sigval = bi(0., 'c1') + wi(1., 'c2') * z_lat if u is not None: sigval += wi(0., 'c3') * u + wi(0., 'c4') * z_lat * u sigval = T.nnet.sigmoid(sigval) z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat + wi(1., 'b1') * sigval if u is not None: z_est += wi(0., 'a3') * u + wi(0., 'a4') * z_lat * u elif g_type in ['lin']: a1 = wi(1.0, 'a1') b = bi(0.0, 'b') z_est = a1 * z_lat + b elif g_type in ['relu']: assert u is not None b = bi(0., 'b') x = u + b z_est = self.apply_act(x, 'relu') elif g_type in ['sigmoid']: assert u is not None b = bi(0., 'b') c = wi(1., 'c') z_est = self.apply_act((u + b) * c, 'sigmoid') elif g_type in ['comparison_g2']: # sig without the uz cross term sigval = bi(0., 'c1') + wi(1., 'c2') * z_lat if u is not None: sigval += wi(0., 'c3') * u sigval = T.nnet.sigmoid(sigval) z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat + wi(1., 'b1') * sigval if u is not None: z_est += wi(0., 'a3') * u elif g_type in ['comparison_g3']: # sig without the sigmoid nonlinearity z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat if u is not None: z_est += wi(0., 'a3') * u + wi(0., 'a4') * z_lat * u elif g_type in ['comparison_g4']: # No mixing between z_lat and u before final sum, otherwise similar # to sig def nonlin(inp, in_name='input', add_bias=True): w1 = wi(1., 'w1_%s' % in_name) b1 = bi(0., 'b1') w2 = wi(1., 'w2_%s' % in_name) b2 = bi(0., 'b2') if add_bias else 0 w3 = wi(0., 'w3_%s' % in_name) return w2 * T.nnet.sigmoid(b1 + w1 * inp) + w3 * inp + b2 z_est = nonlin(z_lat, 'lat') if u is None else \ nonlin(z_lat, 'lat') + nonlin(u, 'ver', False) elif g_type in ['comparison_g5', 'gauss']: # Gaussian assumption on z: (z - mu) * v + mu if u is None: b1 = bi(0., 'b1') w1 = wi(1., 'w1') z_est = w1 * z_lat + b1 else: a1 = bi(0., 'a1') a2 = wi(1., 'a2') a3 = bi(0., 'a3') a4 = bi(0., 'a4') a5 = bi(0., 'a5') a6 = bi(0., 'a6') a7 = wi(1., 'a7') a8 = bi(0., 'a8') a9 = bi(0., 'a9') a10 = bi(0., 'a10') mu = a1 * T.nnet.sigmoid(a2 * u + a3) + a4 * u + a5 v = a6 * T.nnet.sigmoid(a7 * u + a8) + a9 * u + a10 z_est = (z_lat - mu) * v + mu else: raise NotImplementedError("unknown g type: %s" % str(g_type)) # Reshape the output if z is for conv but u from fc layer if (z_est is not None and type(out_dims) == tuple and len(out_dims) > 1.0 and z_est.ndim < 4): z_est = z_est.reshape((z_est.shape[0], ) + out_dims) return z_est def g_deconv(self, z_ver, in_dims, out_dims, weight_name, fspec): """ Inverse operation for each type of f used in convnets """ f_type, f_dims = fspec assert z_ver is not None num_channels = in_dims[0] if in_dims is not None else None num_filters, width, height = out_dims[:3] if f_type in ['globalmeanpool']: u = T.addbroadcast(z_ver, 2, 3) assert in_dims[1] == 1 and in_dims[2] == 1, \ "global pooling needs in_dims (1,1): %s" % str(in_dims) elif f_type in ['maxpool']: sh, str, size = z_ver.shape, f_dims[0], f_dims[1] assert str == size, "depooling requires stride == size" u = T.zeros((sh[0], sh[1], sh[2] * str, sh[3] * str), dtype=z_ver.dtype) for x in range(str): for y in range(str): u = T.set_subtensor(u[:, :, x::str, y::str], z_ver) u = u[:, :, :width, :height] elif f_type in ['convv', 'convf']: filter_size, str = (f_dims[1], f_dims[1]), f_dims[2] W_shape = 
(num_filters, num_channels) + filter_size W = self.weight(self.rand_init_conv(W_shape), weight_name) if str > 1: # upsample if strided version sh = z_ver.shape u = T.zeros((sh[0], sh[1], sh[2] * str, sh[3] * str), dtype=z_ver.dtype) u = T.set_subtensor(u[:, :, ::str, ::str], z_ver) else: u = z_ver # no strides, only deconv u = conv2d(u, W, filter_shape=W_shape, border_mode='valid' if 'convf' in f_type else 'full') u = u[:, :, :width, :height] else: raise NotImplementedError('Layer %s has no convolutional decoder' % f_type) return u
def random_normal(shape, mean=0.0, std=1.0, dtype=_FLOATX, seed=None): if seed is None: seed = np.random.randint(10e6) rng = RandomStreams(seed=seed) return rng.normal(size=shape, avg=mean, std=std, dtype=dtype)
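# Hedged usage sketch (assumes a working Theano backend and a float32 _FLOATX):
# draw a (2, 3) tensor of N(0.5, 0.1^2) samples and evaluate it.
sample = random_normal((2, 3), mean=0.5, std=0.1, seed=42)
print(sample.eval().shape)   # (2, 3)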
else: params = lasagne.layers.get_all_params([l_dec_x_mu, l_dec_x_log_var], trainable=True) for p in params: print p, p.get_value().shape params_count = lasagne.layers.count_params([l_dec_x_mu, l_dec_x_log_var], trainable=True) print 'Number of parameters:', params_count # random generation for visualization from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams srng_ran = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579)) srng_ran_share = theano.tensor.shared_randomstreams.RandomStreams(1234) sym_nimages = T.iscalar('nimages') ran_z = srng_ran.normal((sym_nimages, latent_size)) if dataset in ['sample', 'fixed', 'caltech', 'ocr_letter', 'omniglot']: random_x_mean = lasagne.layers.get_output(l_dec_x_mu, {l_z: ran_z}, deterministic=True) random_x = srng_ran_share.binomial(n=1, p=random_x_mean, dtype=theano.config.floatX) else: random_x_mean, random_x_log_var = lasagne.layers.get_output( [l_dec_x_mu, l_dec_x_log_var], {l_z: ran_z}, deterministic=True) random_x = srng_ran_share.normal(size=(sym_nimages, num_features), avg=random_x_mean, std=T.exp(0.5 * random_x_log_var)) generate_model = theano.function(inputs=[sym_nimages], outputs=[random_x_mean, random_x])
def _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, n_mcsamples, random_seed): """Return expression of approximate ELBO based on Monte Carlo sampling. """ if random_seed is None: r = MRG_RandomStreams(gen_random_state()) else: r = MRG_RandomStreams(seed=random_seed) normal_const = floatX(1 + np.log(2.0 * np.pi)) elbo = 0 # Sampling local variational parameters if uw_l is not None: l_l = (uw_l.size / 2).astype('int64') l_l_ = (uw_l.size / 2).astype(floatX_str) u_l = uw_l[:l_l] w_l = uw_l[l_l:] ns_l = r.normal(size=(n_mcsamples, inarray_l.tag.test_value.shape[0])) zs_l = ns_l * tt.exp(w_l) + u_l elbo += tt.sum(w_l) + 0.5 * l_l_ * normal_const else: zs_l = None # Sampling global variational parameters if uw_g is not None: l_g = (uw_g.size / 2).astype('int64') l_g_ = (uw_g.size / 2).astype(floatX_str) u_g = uw_g[:l_g] w_g = uw_g[l_g:] ns_g = r.normal(size=(n_mcsamples, inarray_g.tag.test_value.shape[0])) zs_g = ns_g * tt.exp(w_g) + u_g elbo += tt.sum(w_g) + 0.5 * l_g_ * normal_const else: zs_g = None if (zs_l is not None) and (zs_g is not None): def logp_(z_g, z_l): return theano.clone(logp, OrderedDict({ inarray_g: z_g, inarray_l: z_l }), strict=False) sequences = [zs_g, zs_l] elif zs_l is not None: def logp_(z_l): return theano.clone(logp, OrderedDict({inarray_l: z_l}), strict=False) sequences = [zs_l] else: def logp_(z_g): return theano.clone(logp, OrderedDict({inarray_g: z_g}), strict=False) sequences = [zs_g] logps, _ = theano.scan(fn=logp_, outputs_info=None, sequences=sequences) elbo += tt.mean(logps) return elbo
class GaussianDropoutLayer(lasagne.layers.Layer): ''' Puts a gaussian prior on the weights of the previous layer ''' def __init__(self, incoming, p=lasagne.init.Constant(-10), log_alpha=None, mask=None, n_samples=None, shared_axes=(), **kwargs): super(GaussianDropoutLayer, self).__init__( incoming, **kwargs) self._srng = RandomStreams(get_rng().randint(1, 2147462579)) self.shared_axes = tuple(shared_axes) if log_alpha is None: if isinstance(p, Number): p = np.atleast_1d(p) if callable(p): p_shape = self.input_shape[1:] else: p_shape = p.shape p = lasagne.utils.create_param(p, p_shape, name='p') p = p.get_value() log_alpha = np.log(p/(1-p)) # add log_alpha as trainable parameter if isinstance(log_alpha, Number): log_alpha = np.atleast_1d(log_alpha) if callable(log_alpha): log_alpha_shape = self.input_shape[1:] elif isinstance(log_alpha, tt.sharedvar.SharedVariable): log_alpha_shape = log_alpha.get_value().shape else: log_alpha_shape = log_alpha.shape self.log_alpha = self.add_param( log_alpha, log_alpha_shape, name='log_alpha', regularizable=False) # init mask to shape compatible with log_alpha mask_shape = [2] + list(self.input_shape[1:]) # the mask should be drawn from a normal (1, alpha) distribution sq_alpha = np.exp(0.5*self.log_alpha.get_value()) mask = sq_alpha*np.random.normal(1, 1, mask_shape).astype(floatX) self.mask = self.add_param( mask, mask_shape, name='mask', trainable=False, regularzable=False) self.mask_updates = None def get_output_for(self, input, deterministic=False, fixed_dropout_masks=False, **kwargs): if deterministic: return input else: # use nonsymbolic shape for dropout mask if possible mask_shape = self.input_shape if any(s is None for s in mask_shape): mask_shape = input.shape # apply dropout, respecting shared axes if self.shared_axes: shared_axes = tuple(a if a >= 0 else a + input.ndim for a in self.shared_axes) mask_shape = tuple(1 if a in shared_axes else s for a, s in enumerate(mask_shape)) mask = self._srng.normal( mask_shape, avg=0, std=1, dtype=input.dtype) if self.shared_axes: bcast = tuple(bool(s == 1) for s in mask_shape) mask = tt.patternbroadcast(mask, bcast) if self.mask is not None and fixed_dropout_masks: # the user may update the shared mask value however they want, # but here we provide an update expression. note that if the # batch size changes, the update will only have an effect at # the next call causing a shape mis-match in the elementwise # product. To avoid this, the user should update the masks # before performing a forward pass on this layer. self.mask_updates = mask # make sure that we use the local shared variable as the mask mask = self.mask sq_alpha = tt.exp(0.5*self.log_alpha) return input * (1 + sq_alpha * mask)
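The log_alpha = log(p / (1 - p)) initialisation above follows the usual Gaussian-dropout correspondence: a multiplicative mask 1 + sqrt(alpha) * eps with eps ~ N(0, 1) has the same mean (1) and variance (p / (1 - p)) as an inverted Bernoulli dropout mask with drop probability p. A quick NumPy check of that moment match, illustrative only:

import numpy as np

p = 0.25
alpha = p / (1.0 - p)
rng = np.random.RandomState(0)
bern = rng.binomial(1, 1.0 - p, size=1000000) / (1.0 - p)   # inverted dropout mask
gauss = 1.0 + np.sqrt(alpha) * rng.normal(size=1000000)     # Gaussian dropout mask
print(bern.mean(), bern.var())    # ~1.0, ~0.333
print(gauss.mean(), gauss.var())  # ~1.0, ~0.333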
latents) embedded = lib.ops.embedding.Embedding('Embedding', 256, CONV_DIM, images) embedded = embedded.dimshuffle(0, 1, 4, 2, 3) embedded = embedded.reshape( (embedded.shape[0], embedded.shape[1] * embedded.shape[2], embedded.shape[3], embedded.shape[4])) mu_and_logsig1 = E1(embedded) mu1, logsig1 = split(mu_and_logsig1) if VANILLA: latents1 = mu1 else: eps = T.cast(theano_srng.normal(mu1.shape), theano.config.floatX) latents1 = mu1 + (eps * T.exp(logsig1)) outputs1 = D1(latents1) reconst_cost = T.nnet.categorical_crossentropy( T.nnet.softmax( outputs1.reshape( (-1, 256, N_CHANNELS, HEIGHT, WIDTH)).dimshuffle(0, 2, 3, 4, 1).reshape( (-1, 256))), images.flatten()).mean() # Layer 2
def __init__(self, rng, input, n_in, n_out, num_MC, num_FF, Domain_number=None, number="1", Domain_consideration=True): #make sure the input also comes in as 100*N*D #DATA=input #N=DATA.shape[1] #n_in_D=DATA.shape[2] srng = RandomStreams(seed=234) #Define hyperparameters lhyp_values = np.zeros(n_in + 1, dtype=theano.config.floatX) self.lhyp = theano.shared(value=lhyp_values, name='lhyp' + number, borrow=True) self.sf2, self.l = T.exp(self.lhyp[0]), T.exp(self.lhyp[1:1 + n_in]) if Domain_consideration: ls_value = np.zeros(Domain_number, dtype=theano.config.floatX) + np.log( 0.1, dtype=theano.config.floatX) else: ls_value = np.zeros(1, dtype=theano.config.floatX) + np.log( 0.1, dtype=theano.config.floatX) self.ls = theano.shared(value=ls_value, name='ls' + number, borrow=True) self.beta = T.exp(self.ls) #Define prior omega #prior_mean_Omega.append(tf.zeros([self.d_in[i],1])) log_prior_var_Omega = T.tile(1 / (self.l)**0.5, (num_FF, 1)).T #Define posterior omega #get samples from omega sample_value = np.random.randn(1, n_in, num_FF) sample_Omega_epsilon_0 = theano.shared(value=sample_value, name='sample_Omega' + number) #sample_Omega_epsilon_0 = srng.normal((1,n_in,num_FF)) Omega_sample = sample_Omega_epsilon_0 * log_prior_var_Omega[None, :, :] Omega_samples = T.tile(Omega_sample, (num_MC, 1, 1)) #Define prior W prior_mean_W = T.zeros(2 * num_FF) log_prior_var_W = T.ones(2 * num_FF) #Define posterior W mean_mu_value = np.random.randn(2 * num_FF, n_out) * 1e-2 self.mean_mu = theano.shared(value=mean_mu_value, name='mean_mu' + number, borrow=True) log_var_value = np.zeros((2 * num_FF, n_out)) self.log_var_W = theano.shared(value=log_var_value, name='q_W' + number, borrow=True) #get samples from W sample_Omega_epsilon = srng.normal((num_MC, 2 * num_FF, n_out)) W_samples = sample_Omega_epsilon * (T.exp( self.log_var_W)**0.5)[None, :, :] + self.mean_mu[None, :, :] # calculate layer N_MC*N*D_out F_next, updates = theano.scan( fn=lambda a, b, c: self.passage(a, b, c, num_FF), sequences=[input, Omega_samples, W_samples]) #output self.output = F_next #KL-divergence #Omega #W self.KL_W = self.DKL_gaussian(self.mean_mu, self.log_var_W, prior_mean_W, log_prior_var_W) #parameter_setting self.all_params = [self.lhyp, self.ls, self.mean_mu, self.log_var_W] self.hyp_params = [self.lhyp, self.ls] self.variational_params = [self.mean_mu, self.log_var_W]
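DKL_gaussian is not shown in this excerpt; the standard closed form for the KL divergence between diagonal Gaussians that such a helper would presumably compute is sketched below, as an assumption about its behaviour and written in NumPy for clarity:

import numpy as np

def dkl_diag_gaussian(mean_q, log_var_q, mean_p, log_var_p):
    # KL( N(mean_q, exp(log_var_q)) || N(mean_p, exp(log_var_p)) ), summed over dims
    return 0.5 * np.sum(
        (np.exp(log_var_q) + (mean_q - mean_p) ** 2) / np.exp(log_var_p)
        - 1.0 + log_var_p - log_var_q)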
def sample_vp(vparams, draws=1000, model=None, local_RVs=None, random_seed=None, hide_transformed=True, progressbar=True): """Draw samples from variational posterior. Parameters ---------- vparams : dict or pymc3.variational.ADVIFit Estimated variational parameters of the model. draws : int Number of random samples. model : pymc3.Model Probabilistic model. random_seed : int or None Seed of random number generator. None to use current seed. hide_transformed : bool If False, transformed variables are also sampled. Default is True. Returns ------- trace : pymc3.backends.base.MultiTrace Samples drawn from the variational posterior. """ import warnings warnings.warn( 'Old ADVI interface and sample_vp is deprecated and will ' 'be removed in future, use pm.fit and pm.sample_approx instead', DeprecationWarning, stacklevel=2) model = pm.modelcontext(model) if isinstance(vparams, ADVIFit): vparams = {'means': vparams.means, 'stds': vparams.stds} ds = model.deterministics def get_transformed(v): return v if v not in ds else v.transformed def rvs(x): return [get_transformed(v) for v in x] if x is not None else [] global_RVs = list(set(model.free_RVs) - set(rvs(local_RVs))) # Make dict for replacements of random variables if random_seed is None: r = MRG_RandomStreams(gen_random_state()) else: r = MRG_RandomStreams(seed=random_seed) updates = {} for v in global_RVs: u = theano.shared(vparams['means'][str(v)]).ravel() w = theano.shared(vparams['stds'][str(v)]).ravel() n = r.normal(size=u.tag.test_value.shape) updates.update({v: (n * w + u).reshape(v.tag.test_value.shape)}) if local_RVs is not None: for v_, (uw, _) in local_RVs.items(): v = get_transformed(v_) u = uw[0].ravel() w = uw[1].ravel() n = r.normal(size=u.tag.test_value.shape) updates.update( {v: (n * tt.exp(w) + u).reshape(v.tag.test_value.shape)}) # Replace some nodes of the graph with variational distributions vars = model.free_RVs samples = theano.clone(vars, updates) f = theano.function([], samples) # Random variables which will be sampled vars_sampled = pm.util.get_default_varnames( model.unobserved_RVs, include_transformed=not hide_transformed) varnames = [str(var) for var in model.unobserved_RVs] trace = pm.sampling.NDArray(model=model, vars=vars_sampled) trace.setup(draws=draws, chain=0) range_ = trange(draws) if progressbar else range(draws) for _ in range_: # 'point' is like {'var1': np.array(0.1), 'var2': np.array(0.2), ...} point = {varname: value for varname, value in zip(varnames, f())} trace.record(point) return MultiTrace([trace])
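The sampling function above is built by cloning the model graph and substituting each free random variable with its reparameterised Gaussian draw. A minimal, model-free sketch of that theano.clone substitution, using toy variables rather than the PyMC3 machinery:

import numpy as np
import theano
import theano.tensor as tt
from theano.sandbox.rng_mrg import MRG_RandomStreams

x = tt.vector('x')
y = (x ** 2).sum()                          # some downstream expression of x

r = MRG_RandomStreams(seed=1)
u = theano.shared(np.zeros(3))              # variational means
w = theano.shared(0.1 * np.ones(3))         # variational stds
z = r.normal(size=(3,)) * w + u             # reparameterised sample

y_sampled = theano.clone(y, {x: z}, strict=False)  # replace x by the sampler
draw = theano.function([], y_sampled)
print(draw())                               # a fresh stochastic evaluation per call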
class VAELayer(Layer): def __init__(self, incoming, encoder, decoder, x_distribution='bernoulli', pz_distribution='gaussian', qz_distribution='gaussian', latent_size=50, W=init.Normal(0.01), b=init.Normal(0.01), **kwargs): super(VAELayer, self).__init__(incoming, **kwargs) num_batch, n_features = self.input_shape self.num_batch = num_batch self.n_features = n_features self.x_distribution = x_distribution self.pz_distribution = pz_distribution self.qz_distribution = qz_distribution self.encoder = encoder self.decoder = decoder self._srng = RandomStreams() if self.x_distribution not in ['gaussian', 'bernoulli']: raise NotImplementedError if self.pz_distribution not in ['gaussian', 'gaussianmarg']: raise NotImplementedError if self.qz_distribution not in ['gaussian', 'gaussianmarg']: raise NotImplementedError self.params_encoder = lasagne.layers.get_all_params(encoder) self.params_decoder = lasagne.layers.get_all_params(decoder) for p in self.params_encoder: p.name = "VAELayer encoder :" + p.name for p in self.params_decoder: p.name = "VAELayer decoder :" + p.name self.num_hid_enc = encoder.output_shape[1] self.num_hid_dec = decoder.output_shape[1] self.latent_size = latent_size self.W_enc_to_z_mu = self.add_param(W, (self.num_hid_enc, latent_size)) self.b_enc_to_z_mu = self.add_param(b, (latent_size, )) self.W_enc_to_z_logsigma = self.add_param( W, (self.num_hid_enc, self.latent_size)) self.b_enc_to_z_logsigma = self.add_param(b, (latent_size, )) self.W_dec_to_x_mu = self.add_param( W, (self.num_hid_dec, self.n_features)) self.b_dec_to_x_mu = self.add_param(b, (self.n_features, )) self.W_params = [ self.W_enc_to_z_mu, self.W_enc_to_z_logsigma, self.W_dec_to_x_mu ] + self.params_encoder + self.params_decoder self.bias_params = [ self.b_enc_to_z_mu, self.b_enc_to_z_logsigma, self.b_dec_to_x_mu ] params_tmp = [] if self.x_distribution == 'gaussian': self.W_dec_to_x_logsigma = self.add_param( W, (self.num_hid_dec, self.n_features)) self.b_dec_to_x_logsigma = self.add_param(b, (self.n_features, )) self.W_params += [self.W_dec_to_x_logsigma] self.bias_params += [self.b_dec_to_x_logsigma] self.W_dec_to_x_logsigma.name = "VAE: W_dec_to_x_logsigma" self.b_dec_to_x_logsigma.name = "VAE: b_dec_to_x_logsigma" params_tmp = [self.W_dec_to_x_logsigma, self.b_dec_to_x_logsigma] self.params = self.params_encoder + [self.W_enc_to_z_mu, self.b_enc_to_z_mu, self.W_enc_to_z_logsigma, self.b_enc_to_z_logsigma] + self.params_decoder + \ [self.W_dec_to_x_mu, self.b_dec_to_x_mu] + params_tmp self.W_enc_to_z_mu.name = "VAELayer: W_enc_to_z_mu" self.W_enc_to_z_logsigma.name = "VAELayer: W_enc_to_z_logsigma" self.W_dec_to_x_mu.name = "VAELayer: W_dec_to_x_mu" self.b_enc_to_z_mu.name = "VAELayer: b_enc_to_z_mu" self.b_enc_to_z_logsigma.name = "VAELayer: b_enc_to_z_logsigma" self.b_dec_to_x_mu.name = "VAELayer: b_dec_to_x_mu" def get_params(self): return self.params def get_output_shape_for(self, input_shape): dec_out_shp = self.decoder.get_output_shape_for( (self.num_batch, self.num_hid_dec)) if self.x_distribution == 'bernoulli': return dec_out_shp elif self.x_distribution == 'gaussian': return [dec_out_shp, dec_out_shp] def _encoder_output(self, x, *args, **kwargs): return lasagne.layers.get_output(self.encoder, x, **kwargs) def decoder_output(self, z, *args, **kwargs): h_decoder = lasagne.layers.get_output(self.decoder, z, **kwargs) if self.x_distribution == 'gaussian': mu_decoder = T.dot(h_decoder, self.W_dec_to_x_mu) + self.b_dec_to_x_mu log_sigma_decoder = T.dot( h_decoder, self.W_dec_to_x_logsigma) + 
self.b_dec_to_x_logsigma decoder_out = mu_decoder, log_sigma_decoder elif self.x_distribution == 'bernoulli': # TODO: Finish writing the output of the decoder for a bernoulli distributed x. decoder_out = T.nnet.sigmoid( T.dot(h_decoder, self.W_dec_to_x_mu) + self.b_dec_to_x_mu) else: raise NotImplementedError return decoder_out def get_z_mu_sigma(self, x, *args, **kwargs): h_encoder = self._encoder_output(x, *args, **kwargs) mu_encoder = T.dot(h_encoder, self.W_enc_to_z_mu) + self.b_enc_to_z_mu log_sigma_encoder = (T.dot(h_encoder, self.W_enc_to_z_logsigma) + self.b_enc_to_z_logsigma) eps = self._srng.normal(log_sigma_encoder.shape) # TODO: Calculate the sampled z. z = mu_encoder + T.exp(0.5 * log_sigma_encoder) * eps return z, mu_encoder, log_sigma_encoder def get_log_distributions(self, x, *args, **kwargs): # sample z from q(z|x). h_encoder = self._encoder_output(x, *args, **kwargs) mu_encoder = T.dot(h_encoder, self.W_enc_to_z_mu) + self.b_enc_to_z_mu log_sigma_encoder = (T.dot(h_encoder, self.W_enc_to_z_logsigma) + self.b_enc_to_z_logsigma) eps = self._srng.normal(log_sigma_encoder.shape) z = mu_encoder + T.exp(0.5 * log_sigma_encoder) * eps # forward pass z through decoder to generate p(x|z). decoder_out = self.decoder_output(z, *args, **kwargs) if self.x_distribution == 'bernoulli': x_mu = decoder_out log_px_given_z = -T.nnet.binary_crossentropy(x_mu, x) elif self.x_distribution == 'gaussian': x_mu, x_logsigma = decoder_out log_px_given_z = normal2(x, x_mu, x_logsigma) # sample prior distribution p(z). if self.pz_distribution == 'gaussian': log_pz = standard_normal(z) elif self.pz_distribution == 'gaussianmarg': log_pz = -0.5 * (T.log(2 * np.pi) + (T.sqr(mu_encoder) + T.exp(log_sigma_encoder))) # variational approximation distribution q(z|x) if self.qz_distribution == 'gaussian': log_qz_given_x = normal2(z, mu_encoder, log_sigma_encoder) elif self.qz_distribution == 'gaussianmarg': log_qz_given_x = -0.5 * (T.log(2 * np.pi) + 1 + log_sigma_encoder) # sum over dim 1 to get shape (,batch_size) log_px_given_z = log_px_given_z.sum( axis=1, dtype=theano.config.floatX) # sum over x log_pz = log_pz.sum(axis=1, dtype=theano.config.floatX) # sum over latent vars log_qz_given_x = log_qz_given_x.sum( axis=1, dtype=theano.config.floatX) # sum over latent vars return log_pz, log_qz_given_x, log_px_given_z def draw_sample(self, z=None, *args, **kwargs): if z is None: # draw random z z = self._srng.normal((self.num_batch, self.latent_size)) return self.decoder_output(z, *args, **kwargs)
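With the 'gaussianmarg' options above, log_pz minus log_qz_given_x collapses, per latent dimension, to the analytic negative KL between the Gaussian posterior and a standard-normal prior: -KL = 0.5 * (1 + log sigma^2 - mu^2 - sigma^2). A one-line NumPy check of that algebra, illustrative only:

import numpy as np

mu, log_sigma2 = 0.3, -0.5
log_pz = -0.5 * (np.log(2 * np.pi) + mu ** 2 + np.exp(log_sigma2))
log_qz = -0.5 * (np.log(2 * np.pi) + 1 + log_sigma2)
neg_kl = 0.5 * (1 + log_sigma2 - mu ** 2 - np.exp(log_sigma2))
print(np.isclose(log_pz - log_qz, neg_kl))  # True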
class SampleLayer(lasagne.layers.MergeLayer): """ Sampling layer supporting importance sampling as described in [BURDA]_ and multiple Monte Carlo samples for the approximation of E_q [log( p(x,z) / q(z|x) )]. Parameters ---------- mu : class:`Layer` instance Parameterizing the mean of the distribution to sample from as described in [BURDA]_. log_var : class:`Layer` instance By default assumed to parametrize log(sigma^2) of the distribution to sample from as described in [BURDA]_ which is transformed to sigma using the nonlinearity function as described below. Effectively this means that the nonlinearity function controls what log_var parametrizes. A few common examples: -nonlinearity = lambda x: T.exp(0.5*x) => log_var = log(sigma^2)[default] -nonlinearity = lambda x: T.sqrt(x) => log_var = sigma^2 -nonlinearity = lambda x: x => log_var = sigma eq_samples : int or T.scalar Number of Monte Carlo samples used to estimate the expectation over q(z|x) in eq. (8) in [BURDA]_. iw_samples : int or T.scalar Number of importance samples in the sum over k in eq. (8) in [BURDA]_. nonlinearity : callable or None The nonlinearity that is applied to the log_var input layer to transform it into a standard deviation. By default we assume that log_var = log(sigma^2) and hence the corresponding nonlinearity is f(x) = T.exp(0.5*x) such that T.exp(0.5*log(sigma^2)) = sigma seed : int seed to random stream Methods ---------- seed : Helper function to change the random seed after init is called References ---------- .. [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov. "Importance Weighted Autoencoders." arXiv preprint arXiv:1509.00519 (2015). """ def __init__(self, mean, log_var, eq_samples=1, iw_samples=1, nonlinearity=lambda x: T.exp(0.5 * x), seed=lasagne.random.get_rng().randint(1, 2147462579), **kwargs): super(SampleLayer, self).__init__([mean, log_var], **kwargs) self.eq_samples = eq_samples self.iw_samples = iw_samples self.nonlinearity = nonlinearity self._srng = RandomStreams(seed) def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): self._srng.seed(seed) def get_output_shape_for(self, input_shapes): batch_size, num_latent = input_shapes[0] if isinstance(batch_size, int) and \ isinstance(self.iw_samples, int) and \ isinstance(self.eq_samples, int): out_dim = (batch_size * self.eq_samples * self.iw_samples, num_latent) else: out_dim = (None, num_latent) return out_dim def get_output_for(self, input, **kwargs): mu, log_var = input batch_size, num_latent = mu.shape eps = self._srng.normal( [batch_size, self.eq_samples, self.iw_samples, num_latent], dtype=theano.config.floatX) z = mu.dimshuffle(0,'x','x',1) + \ self.nonlinearity( log_var.dimshuffle(0,'x','x',1)) * eps return z.reshape((-1, num_latent))
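A minimal wiring sketch for the layer above, assuming standard Lasagne layers for the encoder; the output has batch_size * eq_samples * iw_samples rows, as described in the docstring, and the layer sizes here are arbitrary:

import theano.tensor as T
import lasagne

x = T.matrix('x')
l_in = lasagne.layers.InputLayer((None, 784), input_var=x)
l_enc = lasagne.layers.DenseLayer(l_in, num_units=128)
l_mu = lasagne.layers.DenseLayer(l_enc, num_units=32, nonlinearity=None)
l_log_var = lasagne.layers.DenseLayer(l_enc, num_units=32, nonlinearity=None)
l_z = SampleLayer(l_mu, l_log_var, eq_samples=1, iw_samples=5)
z = lasagne.layers.get_output(l_z)  # shape: (batch * 1 * 5, 32)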
) l1_mu_and_log_sigma = lib.ops.mlp.MLP( 'L1Encoder', input_dim=FRAME_SIZE*EMBED_DIM, hidden_dim=L1_DIM, output_dim=2*L1_LATENT, n_layers=N_LAYERS, inputs=embedded.reshape((-1, FRAME_SIZE*EMBED_DIM)) ) l1_mu, l1_log_sigma = l1_mu_and_log_sigma[:,::2], l1_mu_and_log_sigma[:,1::2] if VANILLA: l1_latents = l1_mu else: eps = T.cast(theano_srng.normal(l1_mu.shape), theano.config.floatX) l1_latents = l1_mu + (eps * T.exp(l1_log_sigma)) def L1Decoder(latents): outputs = lib.ops.mlp.MLP( 'L1Decoder', input_dim=L1_LATENT, hidden_dim=L1_DIM, output_dim=FRAME_SIZE*EMBED_DIM, n_layers=N_LAYERS, inputs=latents ) outputs = outputs.reshape((-1, FRAME_SIZE, EMBED_DIM)) outputs = lib.ops.linear.Linear('L1DecoderOutput', input_dim=EMBED_DIM, output_dim=Q_LEVELS,
class PosteriorGP(object): def __init__(self, inducing_pts, x_test, kernel, symbolic_kernel, init_params=None, random_seed=101): self.rng = RandomStreams(seed=random_seed) self.len_test = len(x_test) self.cov_vec = CovVec(inducing_pts, kernel, symbolic_kernel) self.post_mean = PosteriorMean(inducing_pts, kernel, symbolic_kernel) # input symbolic variables self.t_idx_train = T.imatrix() self.t_w_train = T.matrix() self.t_idx_test = T.imatrix() self.t_w_test = T.matrix() self.t_y_train = T.vector() if init_params is None: #init_params = [np.log(np.array([2., 10.])), np.log(0.3)] init_params = [np.array([-0.7, 5]), -2.5] log_gp_params, log_indep_noise = init_params self.log_gp_params = theano.shared(log_gp_params) self.log_indep_noise = theano.shared(log_indep_noise) self.gp_params = T.exp(self.log_gp_params) self.indep_noise = T.exp(self.log_indep_noise) # collection of symbolic variables derived from data self.data_variables = [ self.t_idx_train, self.t_w_train, self.t_idx_test, self.t_w_test, self.t_y_train ] # GP hyperparameters and noise parameter self.params = [self.log_gp_params, self.log_indep_noise] def set_params(self, params): log_gp_params, log_indep_noise = params self.log_gp_params.set_value(log_gp_params) self.log_indep_noise.set_value(log_indep_noise) def mean(self): mu = self.post_mean(self.t_idx_train, self.t_w_train, self.t_idx_test, self.t_w_test, self.gp_params, self.indep_noise, self.t_y_train) return mu def cov_rand_proj(self, n_sample=10, n_lanczos_basis=10): cov_vec = self.cov_vec if n_sample == 1: cov_vec.use_single_sample() def linear_op(zs): return cov_vec(self.t_idx_train, self.t_w_train, self.t_idx_test, self.t_w_test, self.gp_params, self.indep_noise, zs) eps = self.rng.normal(size=(n_sample, self.len_test)) cov_zs = lanczos(linear_op, eps, n_lanczos_basis, n_sample) return cov_zs def cov_proj(self, eps, n_sample=10, n_lanczos_basis=10): cov_vec = self.cov_vec def linear_op(zs): return cov_vec(self.t_idx_train, self.t_w_train, self.t_idx_test, self.t_w_test, self.gp_params, self.indep_noise, zs) cov_zs = lanczos(linear_op, eps, n_lanczos_basis, n_sample) return cov_zs
def construct_graph_popstats(self, args, x, drops_state, drops_cell, length, popstats=None): p = self.allocate_parameters(args) def stepfn(x, drops_state, drops_cell, dummy_h, dummy_c, pop_means_a, pop_means_b, pop_means_c, pop_vars_a, pop_vars_b, pop_vars_c, h, c): atilde = T.dot(h, p.Wa) btilde = x if args.baseline: a_normal, a_mean, a_var = bn(atilde, 1.0, p.ab_betas, pop_means_a, pop_vars_a, args) b_normal, b_mean, b_var = bn(btilde, 1.0, 0, pop_means_b, pop_vars_b, args) else: a_normal, a_mean, a_var = bn(atilde, p.a_gammas, p.ab_betas, pop_means_a, pop_vars_a, args) b_normal, b_mean, b_var = bn(btilde, p.b_gammas, 0, pop_means_b, pop_vars_b, args) ab = a_normal + b_normal g, f, i, o = [ fn(ab[:, j * args.num_hidden:(j + 1) * args.num_hidden]) for j, fn in enumerate([self.activation] + 3 * [T.nnet.sigmoid]) ] if args.elephant: c_n = dummy_c + f * c + drops_cell * (i * g) else: c_n = dummy_c + f * c + i * g if args.zoneout: c_n_z = c_n * drops_cell + (1 - drops_cell) * c else: c_n_z = c_n if args.baseline: c_normal, c_mean, c_var = bn(c_n, 1.0, p.c_betas, pop_means_c, pop_vars_c, args) else: c_normal, c_mean, c_var = bn(c_n, p.c_gammas, p.c_betas, pop_means_c, pop_vars_c, args) h_n = dummy_h + o * self.activation(c_normal) ## Zoneout if args.zoneout: h = h_n * drops_state + (1 - drops_state) * h c = c_n_z else: h = h_n c = c_n return (h, c, atilde, btilde, c, a_mean, b_mean, c_mean, a_var, b_var, c_var) xtilde = T.dot(x, p.Wx) if args.noise: # prime h with white noise Trng = MRG_RandomStreams() h_prime = Trng.normal((xtilde.shape[1], args.num_hidden), std=args.noise) elif args.summarize: # prime h with mean of example h_prime = x.mean(axis=[0, 2])[:, None] else: h_prime = 0 dummy_states = dict(h=T.zeros( (xtilde.shape[0], xtilde.shape[1], args.num_hidden)), c=T.zeros((xtilde.shape[0], xtilde.shape[1], args.num_hidden))) if popstats is None: popstats = OrderedDict() for key, size in zip( "abc", [4 * args.num_hidden, 4 * args.num_hidden, args.num_hidden]): for stat, init in zip("mean var".split(), [0, 1]): name = "%s_%s" % (key, stat) popstats[name] = theano.shared(init + np.zeros( ( length, size, ), dtype=theano.config.floatX), name=name) popstats_seq = [ popstats['a_mean'], popstats['b_mean'], popstats['c_mean'], popstats['a_var'], popstats['b_var'], popstats['c_var'] ] [ h, c, atilde, btilde, htilde, batch_mean_a, batch_mean_b, batch_mean_c, batch_var_a, batch_var_b, batch_var_c ], _ = theano.scan( stepfn, sequences=[ xtilde, drops_cell, drops_state, dummy_states["h"], dummy_states["c"] ] + popstats_seq, outputs_info=[ T.repeat(p.h0[None, :], xtilde.shape[1], axis=0) + h_prime, T.repeat(p.c0[None, :], xtilde.shape[1], axis=0), None, None, None, None, None, None, None, None, None ]) batchstats = OrderedDict() batchstats['a_mean'] = batch_mean_a batchstats['b_mean'] = batch_mean_b batchstats['c_mean'] = batch_mean_c batchstats['a_var'] = batch_var_a batchstats['b_var'] = batch_var_b batchstats['c_var'] = batch_var_c updates = OrderedDict() if not args.use_population_statistics: alpha = 1e-2 for key in "abc": for stat, init in zip("mean var".split(), [0, 1]): name = "%s_%s" % (key, stat) print name popstats[name].tag.estimand = batchstats[name] updates[popstats[name]] = (alpha * batchstats[name] + (1 - alpha) * popstats[name]) return dict(h=h, c=c), updates, dummy_states, popstats
def test_grad_strided(): rng = np.random.RandomState([2012, 10, 9]) batch_size = 5 rows = 9 cols = 9 channels = 3 filter_rows = 3 filter_cols = filter_rows num_filters = 16 stride = 3 images = shared(rng.uniform( -1., 1., (channels, rows, cols, batch_size)).astype('float32'), name='images') filters = shared(rng.uniform( -1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'), name='filters') gpu_images = gpu_from_host(images) gpu_filters = gpu_from_host(filters) output = FilterActs(stride=stride)(gpu_images, gpu_filters) output = host_from_gpu(output) images_bc01 = images.dimshuffle(3, 0, 1, 2) filters_bc01 = filters.dimshuffle(3, 0, 1, 2) filters_bc01 = filters_bc01[:, :, ::-1, ::-1] output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid', subsample=(stride, stride)) output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0) checker = function([], [output, output_conv2d]) output_numpy, output_conv2d_numpy = checker() if output_numpy.shape != output_conv2d_numpy.shape: raise AssertionError( "theano and cuda convnet follow different conventions for this input size, so we can't test cuda convnet by matching it against theano for these inputs" ) # Proper random projection, like verify_grad does. theano_rng = MRG_RandomStreams(2013 * 5 * 4) cost_weights = theano_rng.normal(size=output_conv2d.shape, dtype=output_conv2d.dtype) cost = (cost_weights * output).sum() # XXX: use verify_grad images_grad, filters_grad = grad(cost, [images, filters]) reference_cost = (cost_weights * output_conv2d).sum() images_conv2d_grad, filters_conv2d_grad = grad(reference_cost, [images, filters]) f = function( [], [images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad]) images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad = f() warnings.warn( """test_match_valid_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. Probably theano CPU 64 bit is OK but it's worth checking the others.""" ) # XXX: Refactor if np.abs(images_grad - images_conv2d_grad).max() > 1.15e-5: print "=== IMAGES GRADIENT ===" assert type(images_grad) == type(images_conv2d_grad) assert images_grad.dtype == images_conv2d_grad.dtype if images_grad.shape != images_conv2d_grad.shape: print 'cuda-convnet shape: ', images_grad.shape print 'theano shape: ', images_conv2d_grad.shape assert False err = np.abs(images_grad - images_conv2d_grad) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (images_grad.min(), images_grad.max()) print 'theano value range: ', (images_conv2d_grad.min(), images_conv2d_grad.max()) assert False if np.abs(filters_grad - filters_conv2d_grad).max() > 1e-5: print "=== FILTERS GRADIENT ===" assert type(filters_grad) == type(filters_conv2d_grad) assert filters_grad.dtype == filters_conv2d_grad.dtype if filters_grad.shape != filters_conv2d_grad.shape: print 'cuda-convnet shape: ', filters_grad.shape print 'theano shape: ', filters_conv2d_grad.shape assert False err = np.abs(filters_grad - filters_conv2d_grad) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (filters_grad.min(), filters_grad.max()) print 'theano value range: ', (filters_conv2d_grad.min(), filters_conv2d_grad.max()) assert False
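The test compares two implementations through a random linear projection of their outputs: with cost = sum(R * f(x)) for a fixed Gaussian R, matching costs and matching gradients strongly suggest the two graphs compute the same function. A minimal sketch of the same idea using only stock Theano ops, with toy expressions rather than cuda-convnet:

import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

x = theano.shared(np.random.randn(4, 3).astype('float32'))
f1 = T.tanh(x)                                       # two graphs believed equivalent
f2 = (T.exp(x) - T.exp(-x)) / (T.exp(x) + T.exp(-x))

rng = MRG_RandomStreams(2013)
R = rng.normal(size=f1.shape, dtype=f1.dtype)        # one shared random projection
g1 = T.grad((R * f1).sum(), x)
g2 = T.grad((R * f2).sum(), x)
max_abs_diff = theano.function([], abs(g1 - g2).max())
print(max_abs_diff())                                # tiny, up to float32 noise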
def __init__(self, N_tot, D_in, D_out, M, Domain_number, Ydim, Hiddenlayerdim1, Hiddenlayerdim2, num_MC): ######################################## # set type self.Xlabel = T.matrix('Xlabel') self.X = T.matrix('X') self.Y = T.matrix('Y') self.Weight = T.matrix('Weight') Ydim = self.Y.shape[1] N = self.X.shape[0] self.Ntot = N_tot ############################################# #set up X for the back-constrained (BC) model; this should also be turned into a layer later; the MCsample copies are generated here self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D_in, n_out=Hiddenlayerdim1, activation=T.nnet.relu, number='_x') self.hiddenLayer_hidden = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output, n_in=Hiddenlayerdim1, n_out=Hiddenlayerdim2, activation=T.nnet.relu, number='_h') self.hiddenLayer_m = HiddenLayer(rng=rng, input=self.hiddenLayer_hidden.output, n_in=Hiddenlayerdim2, n_out=D_out, activation=T.nnet.relu, number='_m') self.hiddenLayer_S = HiddenLayer(rng=rng, input=self.hiddenLayer_hidden.output, n_in=Hiddenlayerdim2, n_out=D_out, activation=T.nnet.relu, number='_S') self.loc_params = [] self.loc_params.extend(self.hiddenLayer_x.params) self.loc_params.extend(self.hiddenLayer_hidden.params) self.loc_params.extend(self.hiddenLayer_m.params) self.loc_params.extend(self.hiddenLayer_S.params) self.local_params = {} for i in self.loc_params: self.local_params[str(i)] = i #when we use the back constrained model.... srng = RandomStreams(seed=234) sample_latent_epsilon = srng.normal((num_MC, N, D_out)) latent_samples = sample_latent_epsilon * ( T.exp(self.hiddenLayer_S.output)** 0.5)[None, :, :] + self.hiddenLayer_m.output[None, :, :] #for the ordinary supervised case, just copy the input num_MC times #self.Data_input=T.tile(self.X,(num_MC,1,1)) self.Data_input = latent_samples ########################################## ####inference on the X side #self.Gaussian_layer_X=KernelLayer(self.Data_input, D_in=D_out, D_out=D_in,num_MC=num_MC,inducing_number=M,Domain_number=None,Domain_consideration=False,number='_X') self.Gaussian_layer_X = KernelLayer(self.Data_input, D_in=D_out, D_out=D_in, num_MC=num_MC, inducing_number=M, Domain_number=Domain_number, Domain_consideration=True, number='_X') self.params = self.Gaussian_layer_X.params self.Z_params_list = self.Gaussian_layer_X.Z_params_list self.global_param_list = self.Gaussian_layer_X.global_params_list self.hyp_list = self.Gaussian_layer_X.hyp_params_list self.hidden_layer = self.Gaussian_layer_X.output ############################################################################################## ###computation on the Y side #self.Gaussian_layer_Y=KernelLayer(self.hidden_layer,D_in=D_out,D_out=Ydim,num_MC=num_MC,inducing_number=M,Domain_number=None,Domain_consideration=False,number='_Y') #self.params.extend(self.Gaussian_layer_Y.params) #self.Z_params_list.extend(self.Gaussian_layer_Y.Z_params_list) #self.global_param_list.extend(self.Gaussian_layer_Y.global_params_list) #self.hyp_list.extend(self.Gaussian_layer_Y.hyp_params_list) ########################################### ###objective function #self.LL = self.Gaussian_layer_X.liklihood_nodomain(self.X)*N_tot/(N) self.LL = self.Gaussian_layer_X.likelihood_domain( self.X, self.Xlabel) * N_tot / (N) self.KL_U = self.Gaussian_layer_X.KL_U #self.KL_UY=self.Gaussian_layer_Y.KL_U #y=self.Gaussian_layer_Y.softmax_class() #self.LLY = -T.mean(T.nnet.categorical_crossentropy(y, self.Y))*N #self.LLY=T.sum(T.log(T.maximum(T.sum(self.Y * y, 1), 1e-16))) #self.error = self.Gaussian_layer_Y.error_classification(self.Y) self.KL_latent_dim = self.KLD_X( self.hiddenLayer_m.output, T.exp( self.hiddenLayer_S.output)) * N_tot / (N) #pred = 
T.mean(self.Gaussian_layer_X.output,0) #self.error = (T.mean((self.Y - pred)**2,0))**0.5 ########################################### #domain checker: MMD and class classification #self.MMD=self.Gaussian_layer_Y.MMD_class_penalty(self.Y,self.Xlabel) ########################################## #store the parameters self.hyp_params = {} for i in self.hyp_list: self.hyp_params[str(i)] = i self.Z_params = {} for i in self.Z_params_list: self.Z_params[str(i)] = i self.global_params = {} for i in self.global_param_list: self.global_params[str(i)] = i self.params.extend(self.loc_params) self.wrt = {} for i in self.params: self.wrt[str(i)] = i
class SLmodel(): #This is a test of my idea to adapt the proposal distribution by #maximizing the entropy of the weights def __init__(self, nx, ns, nh, npcl, xvar=1.0): #for this model I assume one linear generative model and a #combination of nh linear dynamical models #generative matrix init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32') #init_W=np.asarray(np.eye(2),dtype='float32') #always normalize the columns of W to be unit length init_W = init_W / np.sqrt(np.sum(init_W**2, axis=0)) #observed variable means init_c = np.asarray(np.zeros(nx), dtype='float32') #dynamical matrices #init_M=np.asarray(np.random.randn(ns,ns*nh)/2.0,dtype='float32') init_M = np.asarray((np.tile(np.eye(ns), (1, nh))), dtype='float32') #state-variable variances #(covariance matrix of state variable noise assumed to be diagonal) init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32') #Switching parameter matrix init_A = np.asarray(np.zeros((ns, nh)), dtype='float32') #priors for switching variable init_ph = np.asarray(np.zeros(nh), dtype='float32') #parameters for proposal distribution init_D = np.asarray(np.eye(ns), dtype='float32') init_E = np.asarray(np.random.randn(nx, ns) / 100.0, dtype='float32') init_k = np.asarray(np.zeros(ns), dtype='float32') init_sig = np.asarray(np.ones(ns), dtype='float32') init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32') init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32') init_h_now[:, 0] = 1.0 init_weights_now = np.asarray(np.ones(npcl) / float(npcl), dtype='float32') init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32') init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32') init_h_past[:, 0] = 1.0 init_weights_past = np.asarray(np.ones(npcl) / float(npcl), dtype='float32') self.W = theano.shared(init_W) self.c = theano.shared(init_c) self.M = theano.shared(init_M) self.b = theano.shared(init_b) self.A = theano.shared(init_A) self.ph = theano.shared(init_ph) self.D = theano.shared(init_D) self.E = theano.shared(init_E) self.k = theano.shared(init_k) self.sig = theano.shared(init_sig) #this is to help vectorize operations self.sum_mat = T.as_tensor_variable( np.asarray((np.tile(np.eye(ns), nh)).T, dtype='float32')) self.s_now = theano.shared(init_s_now) self.h_now = theano.shared(init_h_now) self.weights_now = theano.shared(init_weights_now) self.s_past = theano.shared(init_s_past) self.h_past = theano.shared(init_h_past) self.weights_past = theano.shared(init_weights_past) self.xvar = np.asarray(xvar, dtype='float32') self.nx = nx #dimensionality of observed variables self.ns = ns #dimensionality of latent variables self.nh = nh #number of (linear) dynamical modes self.npcl = npcl #numer of particles in particle filter self.theano_rng = RandomStreams() self.params = [self.W, self.M, self.b, self.A, self.c, self.ph] self.rel_lrates = np.asarray([0.1, 1.0, 0.01, 10.0, 0.1, 1.0], dtype='float32') self.meta_params = [self.D, self.E, self.k, self.sig] self.meta_rel_lrates = [1.0, 1.0, 1.0, 1.0] def sample_proposal_s(self, s, h, xp): s_pred = self.get_prediction(s, h) n = self.theano_rng.normal(size=T.shape(s)) prop_mean = T.dot(s_pred, self.D) + T.reshape(T.dot(xp, self.E), (1, self.ns)) + self.k s_prop = prop_mean + n * T.reshape(T.exp(self.sig / 2.0), (1, self.ns)) #I compute the term inside the exponent for the pdf of the proposal distrib prop_term = -T.sum(n**2) / 2.0 return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast( prop_term, 'float32'), prop_mean def calc_h_probs(self, s): #this function takes an 
np by ns matrix of s samples #and returns an nh by np set of h probabilities exp_terms = T.dot(s, self.A) + T.reshape(self.ph, (1, self.nh)) #re-centering for numerical stability exp_terms_recentered = exp_terms - T.max(exp_terms, axis=1) #exponentiation and normalization rel_probs = T.exp(exp_terms) probs = rel_probs.T / T.sum(rel_probs, axis=1) return probs.T def proposal_loss(self, s_pred, s_samps, xp, weights): #estimates the KL divergence between the proposal distribution #and the true posterior (minus one term, which we assume does not #depend on the proposal distribution). #prop means should be symblolic variables since we need to #compute the derivatives of D and E through this function prop_means = T.dot(s_pred, self.D) + T.reshape(T.dot( xp, self.E), (1, self.ns)) + self.k #np by ns diffs = (prop_means - s_samps) scl_diffs = diffs * T.reshape(T.exp(-self.sig), (1, self.ns)) energies = 0.5 * T.sum(diffs * scl_diffs, axis=1) tot = T.sum(energies * weights) + 0.5 * T.sum(self.sig) return tot def forward_filter_step(self, xp): #need to sample from the proposal distribution first s_samps, s_pred, prop_terms, prop_means = self.sample_proposal_s( self.s_now, self.h_now, xp) updates = {} #now that we have samples from the proposal distribution, we need to reweight them h_probs = self.calc_h_probs(s_samps) h_samps = self.theano_rng.multinomial(pvals=h_probs) recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1)) x_terms = -T.sum( (recons - T.reshape(xp, (self.nx, 1)))**2, axis=0) / (2.0 * self.xvar**2) s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1) / 2.0 energies = x_terms + s_terms - prop_terms #to avoid exponentiating large or very small numbers, I #"re-center" the reweighting factors by adding a constant, #as this has no impact on the resulting new weights energies_recentered = energies - T.max(energies) alpha = T.exp(energies_recentered) #these are the reweighting factors new_weights_unnorm = self.weights_now * alpha normalizer = T.sum(new_weights_unnorm) new_weights = new_weights_unnorm / normalizer #need to normalize new weights #gradient updates for the proposal distribution parameters lrate = 1e-2 loss = self.proposal_loss(s_pred, s_samps, xp, new_weights) gparams = T.grad(loss, self.meta_params, consider_constant=[s_pred, s_samps, xp, new_weights]) # constructs the update dictionary for gparam, param, rel_lr in zip(gparams, self.meta_params, self.meta_rel_lrates): updates[param] = T.cast(param - gparam * lrate * rel_lr, 'float32') updates[self.h_past] = T.cast(self.h_now, 'float32') updates[self.s_past] = T.cast(self.s_now, 'float32') updates[self.h_now] = T.cast(h_samps, 'float32') updates[self.s_now] = T.cast(s_samps, 'float32') updates[self.weights_past] = T.cast(self.weights_now, 'float32') updates[self.weights_now] = T.cast(new_weights, 'float32') #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates #return normalizer, energies_recentered, updates #return h_samps, updates return updates def get_prediction(self, s, h): s_dot_M = T.dot(s, self.M) #this is np by nh*ns s_pred = T.dot(s_dot_M * T.extra_ops.repeat(h, self.ns, axis=1), self.sum_mat) #should be np by ns return T.cast(s_pred, 'float32') def sample_joint(self, sp): t2_samp = self.theano_rng.multinomial( pvals=T.reshape(self.weights_now, (1, self.npcl))).T s2_samp = T.cast( T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32') h2_samp = T.cast( T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32') diffs = self.b * (s2_samp - 
sp) sqr_term = T.sum(diffs**2, axis=1) alpha = T.exp(-sqr_term) probs_unnorm = self.weights_past * alpha probs = probs_unnorm / T.sum(probs_unnorm) t1_samp = self.theano_rng.multinomial( pvals=T.reshape(probs, (1, self.npcl))).T s1_samp = T.cast( T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32') h1_samp = T.cast( T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32') return [s1_samp, h1_samp, s2_samp, h2_samp] #def sample_posterior(self, n_samps): #sp, updates = theano.scan(fn=self.get_prediction, #outputs_info=[None], #sequences=[self.s_past, self.h_past], #n_steps=self.npcl) ##sp should be np by ns #[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint, #outputs_info=[None, None, None, None], #non_sequences=[sp], #n_steps=n_samps) #return [s1_samps, h1_samps, s2_samps, h2_samps] def h_energy_step(self, s, h): #helper function for self.calc_mean_h_energy exp_A_i = T.reshape( T.sum(self.exp_A * T.reshape(h, (self.nh, 1)), axis=0), (self.ns, 1)) mu_i = T.reshape(T.sum(self.mu * T.reshape(h, (self.nh, 1)), axis=0), (self.ns, 1)) ln_Z_h_i = T.sum(self.ln_Z_h * T.reshape(h, (self.nh, 1))) ph_i = T.sum(self.ph * T.reshape(h, (self.nh, 1))) diff = T.reshape(T.reshape(s, (self.ns, 1)) - mu_i, (self.ns, 1)) diff_dot_exp_A_i = diff * exp_A_i gterm = -0.5 * T.sum(T.sum(diff_dot_exp_A_i * diff)) energy = gterm + ln_Z_h_i + ph_i return energy def calc_mean_h_energy(self, s, h): #you give this function a set of samples of s and h, #it gives you the average energy of those samples exp_terms = T.dot(s, self.A) + T.reshape(self.ph, (1, self.nh)) #np by nh energies = T.sum(h * exp_terms, axis=1) + T.log( T.sum(T.exp(exp_terms), axis=1)) #should be np by 1 energy = T.mean(energies) return energy def update_params(self, x1, x2, n_samps, lrate): #this function samples from the joint posterior and performs # a step of gradient ascent on the log-likelihood sp = self.get_prediction(self.s_past, self.h_past) #sp should be np by ns [s1_samps, h1_samps, s2_samps, h2_samps ], updates = theano.scan(fn=self.sample_joint, outputs_info=[None, None, None, None], non_sequences=[sp], n_steps=n_samps) x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1)) x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1)) s_pred = self.get_prediction(s1_samps, h1_samps) hterm1 = self.calc_mean_h_energy(s1_samps, h1_samps) #hterm2=self.calc_mean_h_energy(s2_samps, h2_samps) sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred))**2, axis=1)) / 2.0 #xterm1=-T.mean(T.sum((x1_recons-T.reshape(x1,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2)) xterm2 = -T.mean( T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) / (2.0 * self.xvar**2)) #energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2)) energy = hterm1 + xterm2 + sterm gparams = T.grad( energy, self.params, consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps]) # constructs the update dictionary for gparam, param, rel_lr in zip(gparams, self.params, self.rel_lrates): #gnat=T.dot(param, T.dot(param.T,param)) updates[param] = T.cast(param + gparam * lrate * rel_lr, 'float32') #make sure W has unit-length columns #new_W=updates[self.W] #updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32') #MIGHT NEED TO NORMALIZE A return energy, updates def get_ESS(self): return 1.0 / T.sum(self.weights_now**2) def resample_step(self): idx = self.theano_rng.multinomial( pvals=T.reshape(self.weights_now, (1, self.npcl))).T s_samp = T.sum(self.s_now * 
T.addbroadcast(idx, 1), axis=0) h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0) return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32') def resample(self): [s_samps, h_samps], updates = theano.scan(fn=self.resample_step, outputs_info=[None, None], n_steps=self.npcl) updates[self.s_now] = T.cast(s_samps, 'float32') updates[self.h_now] = T.cast(h_samps, 'float32') updates[self.weights_now] = T.cast( T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'), 'float32') #dtype paranoia return updates def simulate_step(self, s): s = T.reshape(s, (1, self.ns)) #get h probabilities h_probs = self.calc_h_probs(s) #h_samp=self.theano_rng.multinomial(pvals=T.reshape(h_probs,(self.nh,1))) h_samp = self.theano_rng.multinomial(pvals=h_probs) sp = self.get_prediction(s, h_samp) xp = T.dot(self.W, sp.T) + T.reshape(self.c, (self.nx, 1)) return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp def simulate_forward(self, n_steps): s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)), axis=0) s0 = T.reshape(s0, (1, self.ns)) [sp, xp, hs], updates = theano.scan(fn=self.simulate_step, outputs_info=[s0, None, None], n_steps=n_steps) return sp, xp, hs, updates
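forward_filter_step above exponentiates the particle energies only after subtracting their maximum; the normalised weights are unchanged by that shift, but overflow is avoided. A small NumPy illustration of the trick:

import numpy as np

energies = np.array([1000.0, 1001.0, 999.0])     # raw np.exp would overflow
weights_now = np.ones(3) / 3.0
alpha = np.exp(energies - energies.max())        # recentered reweighting factors
new_weights = weights_now * alpha
new_weights /= new_weights.sum()
print(new_weights)  # same result the raw energies would give, had they been representable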
class RNNCluster(rnn.RNNBase): """RNNCluster combines sampling-based RNN with item clustering. Parameters ---------- n_clusters: int Number of clusters loss: "Blackout", "CCE", "BPR" or "BPRelu" Determines the loss function, among: - BPR, as used in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016 - TOP1, defined in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016 - Blackout, discriminative loss function defined in "BlackOut: Speeding up Recurrent Neural Network Language Models With Very Large Vocabularies", Ji, S. et al., 2015 (equation 6) - BPRelu, approximation of BPR based on relu/hinge non-linearities - CCE, categorical cross-entropy computed on the set of samples cluster_type: "mix", "softmax" or "sigmoid" Determines whether items can belong to multiple clusters. - mix, items belong to at least one cluster, possibly many. - softmax, items belong to one and only one cluster. - sigmoid, items belong to zero, one or multiple clusters. sampling: int Number of samples. cluster_sampling: int If cluster_sampling > 0, the recommendation loss and the clustering loss use different samples. In that case, cluster_sampling is the number of samples used by the clustering loss. sampling_bias: float Items are sampled with a probability proportional to their frequency to the power of the sampling_bias. predict_with_clusters: bool Set to false during testing if you want to ignore the clustering. cluster_selection_noise: float If cluster_selection_noise > 0, a random gaussian noise (whose std is cluster_selection_noise) is added to the cluster selection output during training. Can help to explore a large number of clusters. init_scale: float Initial scale of the softmax and sigmoid functions used in the cluster selection process. scale_growing_rate: float After each training epoch, the scale of the softmax and sigmoid functions is multiplied by the scale_growing_rate. max_scale: float Maximum allowed scale. See classes SequenceNoise, RecurrentLayers, SelectTargets and update manager for options common to the other RNN methods. 
""" def __init__(self, n_clusters=10, loss="Blackout", cluster_type='mix', sampling=100, cluster_sampling=-1, sampling_bias=0., predict_with_clusters=True, cluster_selection_noise=0., init_scale=1., scale_growing_rate=1., max_scale=50, **kwargs): super(RNNCluster, self).__init__(**kwargs) self.n_clusters = n_clusters self.init_scale = np.cast[theano.config.floatX](init_scale) self.effective_scale = np.cast[theano.config.floatX](init_scale) self.scale_growing_rate = np.cast[theano.config.floatX]( scale_growing_rate) self.max_scale = np.cast[theano.config.floatX](max_scale) self.cluster_type = cluster_type self.sampling_bias = sampling_bias self.loss = loss self.cluster_selection_noise = cluster_selection_noise self.predict_with_clusters = predict_with_clusters if self.loss == "Blackout": self._loss = self._blackout_loss elif self.loss == 'lin': self._loss = self._lin_loss elif self.loss == 'BPRelu': self._loss = self._BPRelu_loss elif self.loss == 'BPR': self._loss = self._BPR_loss elif self.loss == 'TOP1': self._loss = self._TOP1_loss elif self.loss == 'CCE': self._loss = self._cce_loss else: raise ValueError('Unknown cluster loss') self.n_samples = int(sampling) self.n_cluster_samples = int(cluster_sampling) self._srng = MRG_RandomStreams(lasagne.random.get_rng().randint( 1, 2147462579)) self.name = "RNN Cluster with categorical cross entropy" self.metrics = { 'recall': { 'direction': 1 }, 'cluster_recall': { 'direction': 1 }, 'sps': { 'direction': 1 }, 'cluster_sps': { 'direction': 1 }, 'ignored_items': { 'direction': -1 }, 'assr': { 'direction': 1 }, 'cluster_use': { 'direction': 1 }, 'cluster_use_std': { 'direction': -1 }, 'cluster_size': { 'direction': 1 } } def _get_model_filename(self, epochs): '''Return the name of the file to save the current model ''' filename = "rnn_clusters" + str(self.n_clusters) + "_sc" + str( self.init_scale) if self.scale_growing_rate != 1.: filename += "-" + str(self.scale_growing_rate) + "-" + str( self.max_scale) filename += "_" if self.sampling_bias > 0.: filename += "p" + str(self.sampling_bias) filename += "s" + str(self.n_samples) if self.n_cluster_samples > 0: filename += "_" if self.sampling_bias > 0.: filename += "p" + str(self.sampling_bias) filename += "cs" + str(self.n_cluster_samples) if self.cluster_type == 'softmax': filename += "_softmax" elif self.cluster_type == 'mix': filename += "_mix" if self.cluster_selection_noise > 0.: filename += '_n' + str(self.cluster_selection_noise) filename += "_c" + self.loss return filename + "_" + self._common_filename(epochs) def _blackout_loss(self, predictions, n_targets): targets = np.arange(n_targets) predictions = T.nnet.softmax(predictions) pos = T.nnet.categorical_crossentropy(predictions, targets) neg = T.log(1 - predictions) return pos - neg[:, targets.shape[0]:].sum(axis=-1) def _cce_loss(self, predictions, n_targets): targets = np.arange(n_targets) predictions = T.nnet.softmax(predictions) pos = T.nnet.categorical_crossentropy(predictions, targets) return pos def _lin_loss(self, predictions, n_targets): neg = predictions[:, n_targets:].sum(axis=-1) pos = T.diag(predictions) return neg - pos def _BPR_loss(self, predictions, n_targets): diff = (predictions - T.diag(predictions).dimshuffle([0, 'x']))[:, n_targets:] return -(T.log(T.nnet.sigmoid(-diff))).mean(axis=-1) def _BPRelu_loss(self, predictions, n_targets): diff = (predictions - T.diag(predictions).dimshuffle([0, 'x']))[:, n_targets:] return lasagne.nonlinearities.leaky_rectify(diff + 0.5).mean(axis=-1) def _TOP1_loss(self, predictions, 
n_targets): diff = (predictions - T.diag(predictions).dimshuffle([0, 'x']))[:, n_targets:] reg = T.sqr(predictions[:, n_targets:]) return (T.nnet.sigmoid(diff) + T.nnet.sigmoid(reg)).mean(axis=-1) def _create_ini_clusters(self): c = 0.1 * np.random.randn(self.n_items, self.n_clusters) # c = -2 * np.random.random((self.n_items, self.n_clusters)) - 1 # for i, j in enumerate(np.random.choice(self.n_clusters, self.n_items)): # c[i,j] *= -1 # print(np.round(c[:5, :], 2)) return c.astype(theano.config.floatX) def _prepare_networks(self, n_items): ''' Prepares the building blocks of the RNN, but does not compile them: self.l_in : input layer self.l_mask : mask of the input layer self.target : target of the network self.l_out : output of the network self.cost : cost function ''' self.n_items = n_items # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length, self._input_size())) # The input is completed by a mask to inform the LSTM of the length of the sequence self.l_mask = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length)) # recurrent layer if not self.use_movies_features: l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=self.n_items + self._n_optional_features(), only_return_final=True) else: l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=None, only_return_final=True) # Theano tensor for the targets self.target = T.ivector('target_output') self.exclude = T.fmatrix('excluded_items') self.samples = T.ivector('samples') self.cluster_samples = T.ivector('cluster_samples') self.user_representation_layer = l_recurrent # The sliced output is then passed through linear layer to obtain the right output size self.l_out = BlackoutLayer(l_recurrent, num_units=self.n_items, num_outputs=self.n_samples, nonlinearity=None, W=lasagne.init.GlorotUniform()) # lasagne.layers.get_output produces a variable for the output of the net network_output = lasagne.layers.get_output(self.l_out, targets=self.target, samples=self.samples) # loss function self.cost = self._loss(network_output, self.batch_size).mean() # Cluster learning self.T_scale = theano.shared(self.effective_scale) scaled_softmax = lambda x: lasagne.nonlinearities.softmax(x * self. 
T_scale) self.cluster_selection_layer = lasagne.layers.DenseLayer( l_recurrent, b=None, num_units=self.n_clusters, nonlinearity=None) cluster_selection = lasagne.layers.get_output( self.cluster_selection_layer) if self.cluster_selection_noise > 0.: cluster_selection = cluster_selection + self._srng.normal( cluster_selection.shape, avg=0.0, std=self.cluster_selection_noise) cluster_selection = scaled_softmax(cluster_selection) self.cluster_repartition = theano.shared(self._create_ini_clusters()) if self.cluster_type == 'softmax': target_and_samples_clusters = scaled_softmax( self.cluster_repartition[ T.concatenate([self.target, self.cluster_samples]), :]) elif self.cluster_type == 'mix': target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) + \ T.nnet.sigmoid(self.T_scale*self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) else: target_and_samples_clusters = T.nnet.sigmoid( self.T_scale * self.cluster_repartition[ T.concatenate([self.target, self.cluster_samples]), :]) cluster_score = cluster_selection.dot(target_and_samples_clusters.T) self.cost_clusters = self._loss(cluster_score, self.batch_size).mean() def _compile_train_function(self): ''' Compile self.train. self.train recieves a sequence and a target for every steps of the sequence, compute error on every steps, update parameter and return global cost (i.e. the error). ''' print("Compiling train...") # Compute AdaGrad updates for training all_params = lasagne.layers.get_all_params(self.l_out, trainable=True) updates = self.updater(self.cost, all_params) params_clusters = self.cluster_selection_layer.get_params( trainable=True) params_clusters.append(self.cluster_repartition) updates.update(self.updater(self.cost_clusters, params_clusters)) # Compile network self.train_function = theano.function([ self.l_in.input_var, self.l_mask.input_var, self.target, self.samples, self.cluster_samples, self.exclude ], self.cost, updates=updates, allow_input_downcast=True, name="Train_function", on_unused_input='ignore') print("Compilation done.") def _get_hard_clusters(self): if self.cluster_type == 'softmax': return lasagne.nonlinearities.softmax(100. * self.cluster_repartition) elif self.cluster_type == 'mix': # Clipping is used to avoid the sum of sigmoid and softmax to produce a cluster indicator of 2 return (lasagne.nonlinearities.softmax( 100. * self.cluster_repartition) + T.nnet.sigmoid(100. * self.cluster_repartition)).clip( 0, 1) else: return T.nnet.sigmoid(100. 
* self.cluster_repartition) def _compile_predict_function(self): ''' Compile self.predict, the deterministic rnn that output the prediction at the end of the sequence ''' print("Compiling predict...") if self.predict_with_clusters: cluster_selection = lasagne.layers.get_output( self.cluster_selection_layer, deterministic=True)[0, :].argmax() user_representation = lasagne.layers.get_output( self.user_representation_layer, deterministic=True) theano_predict_function = theano.function( [self.l_in.input_var, self.l_mask.input_var], [user_representation, cluster_selection], allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') def cluster_predict_function(sequence, mask, k, exclude): u, c = theano_predict_function(sequence, mask) scores = u[0].dot( self.clusters_embeddings[c]) + self.clusters_bias[c] cluster_index_exclude = [] for i in exclude: if i in self.clusters_reverse_index[c]: cluster_index_exclude.append( self.clusters_reverse_index[c][i]) scores[cluster_index_exclude] = -np.inf # find top k according to output effective_k = min(k, len(self.clusters[c])) return list(self.clusters[c][np.argpartition( -scores, range(effective_k))[:effective_k]]), len(self.clusters[c]) self.predict_function = cluster_predict_function else: items_score = lasagne.nonlinearities.softmax( lasagne.layers.get_output(self.l_out, deterministic=True)) user_representation = lasagne.layers.get_output( self.user_representation_layer, deterministic=True) theano_predict_function = theano.function( [self.l_in.input_var, self.l_mask.input_var], user_representation, allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') def no_cluster_predict_function(sequence, mask, k, exclude): u = theano_predict_function(sequence, mask) scores = u[0].dot(self.l_out.W.get_value( borrow=True)) + self.l_out.b.get_value(borrow=True) scores[exclude] = -np.inf # find top k according to output return list(np.argpartition(-scores, range(k))[:k]), self.n_items self.predict_function = no_cluster_predict_function print("Compilation done.") def _compile_test_function(self): ''' Compile self.test_function, the deterministic rnn that output the precision@10 ''' print("Compiling test...") items_score1 = lasagne.nonlinearities.softmax( lasagne.layers.get_output(self.l_out, deterministic=True)) cluster_selection = lasagne.layers.get_output( self.cluster_selection_layer, deterministic=True)[0, :].argmax() items_clusters = self._get_hard_clusters() used_items = items_clusters[:, cluster_selection] items_score2 = items_score1 * used_items if self.interactions_are_unique: items_score1 *= (1 - self.exclude) items_score2 *= (1 - self.exclude) theano_test_function = theano.function([ self.l_in.input_var, self.l_mask.input_var, self.target, self.samples, self.cluster_samples, self.exclude ], [items_score1, items_score2, cluster_selection, used_items.sum()], allow_input_downcast=True, name="Test_function", on_unused_input='ignore') def precision_test_function(theano_inputs): k = 10 scores1, scores2, c_select, n_used_items = theano_test_function( *theano_inputs) ids1 = np.argpartition(-scores1, range(k), axis=-1)[0, :k] ids2 = np.argpartition(-scores2, range(k), axis=-1)[0, :k] return ids1, ids2, c_select, n_used_items self.test_function = precision_test_function print("Compilation done.") def _popularity_sample(self): if not hasattr(self, '_cumsum'): self._cumsum = np.cumsum( np.power(self.dataset.item_popularity, self.sampling_bias)) return bisect(self._cumsum, random.uniform(0, self._cumsum[-1])) def 
_prepare_input(self, sequences): ''' Sequences is a list of [user_id, input_sequence, targets] ''' batch_size = len(sequences) # Shape return variables X = np.zeros((batch_size, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN mask = np.zeros( (batch_size, self.max_length) ) # mask of the input (to deal with sequences of different length) Y = np.zeros((batch_size, ), dtype='int32') # output target exclude = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) for i, sequence in enumerate(sequences): user_id, in_seq, target = sequence seq_features = np.array( map(lambda x: self._get_features(x, user_id), in_seq)) X[i, :len(in_seq), :] = seq_features # Copy sequences into X mask[i, :len(in_seq)] = 1 Y[i] = target[0][0] # id of the first and only target exclude[i, [j[0] for j in in_seq]] = 1 if self.sampling_bias > 0.: samples = np.array( [self._popularity_sample() for i in range(self.n_samples)], dtype='int32') if self.n_cluster_samples > 0: cluster_samples = np.array([ self._popularity_sample() for i in range(self.n_cluster_samples) ], dtype='int32') else: cluster_samples = samples else: samples = np.random.choice(self.n_items, self.n_samples).astype('int32') if self.n_cluster_samples > 0: cluster_samples = np.random.choice( self.n_items, self.n_cluster_samples).astype('int32') else: cluster_samples = samples # scale if not hasattr(self, '_last_epoch'): self._last_epoch = self.dataset.training_set.epochs else: if self.dataset.training_set.epochs > self._last_epoch + 1 and self.scale_growing_rate != 1.: self.effective_scale *= self.scale_growing_rate**int( self.dataset.training_set.epochs - self._last_epoch) self._last_epoch += int(self.dataset.training_set.epochs - self._last_epoch) print("New scale: ", self.effective_scale) self.T_scale.set_value(self.effective_scale) return (X, mask.astype(theano.config.floatX), Y, samples, cluster_samples, exclude) def _compute_validation_metrics(self, metrics): clusters = np.zeros(self.n_clusters, dtype="int") used_items = [] ev = evaluation.Evaluator(self.dataset, k=10) ev_clusters = evaluation.Evaluator(self.dataset, k=10) for batch, goal in self._gen_mini_batch( self.dataset.validation_set(epochs=1), test=True): pred1, pred2, cl, i = self.test_function(batch) ev.add_instance(goal, pred1) ev_clusters.add_instance(goal, pred2) clusters[cl] += 1 used_items.append(i) if self.cluster_type == 'softmax': ignored_items = 0 cluster_size = np.histogram( self.cluster_repartition.get_value(borrow=True).argmax(axis=1), bins=range(self.n_clusters + 1))[0].tolist() elif self.cluster_type == 'mix': ignored_items = 0 sig_clusters = self.cluster_repartition.get_value(borrow=True) > 0. 
softmax_clusters = self.cluster_repartition.get_value( borrow=True).argmax(axis=1) for i in range(self.n_items): sig_clusters[i, softmax_clusters[i]] = True cluster_size = sig_clusters.sum(axis=0) else: ignored_items = (self.cluster_repartition.get_value( borrow=True).max(axis=1) < 0.).sum() cluster_size = (self.cluster_repartition.get_value(borrow=True) > 0.).sum(axis=0) metrics['recall'].append(ev.average_recall()) metrics['cluster_recall'].append(ev_clusters.average_recall()) metrics['sps'].append(ev.sps()) metrics['cluster_sps'].append(ev_clusters.sps()) metrics['assr'].append(self.n_items / np.mean(used_items)) metrics['ignored_items'].append(ignored_items) metrics['cluster_use'].append(clusters) metrics['cluster_use_std'].append(np.std(clusters)) metrics['cluster_size'].append(cluster_size) return metrics def _print_progress(self, iterations, epochs, start_time, train_costs, metrics, validation_metrics): '''Print learning progress in terminal ''' print(self.name, iterations, "batchs, ", epochs, " epochs in", time() - start_time, "s") print("Last train cost : ", train_costs[-1]) for m in self.metrics.keys(): print(m, ': ', metrics[m][-1]) if m in validation_metrics: print( 'Best ', m, ': ', max(np.array(metrics[m]) * self.metrics[m]['direction']) * self.metrics[m]['direction']) print('-----------------') # Print on stderr for easier recording of progress print(iterations, epochs, time() - start_time, train_costs[-1], metrics['sps'][-1], metrics['cluster_sps'][-1], metrics['recall'][-1], metrics['cluster_recall'][-1], metrics['assr'][-1], metrics['ignored_items'][-1], metrics['cluster_use_std'][-1], file=sys.stderr) def prepare_tests(self): '''Take the soft clustering and make actual clusters. ''' cluster_membership = self.cluster_repartition.get_value(borrow=True) item_embeddings = self.l_out.W.get_value(borrow=True) item_bias = self.l_out.b.get_value(borrow=True) self.clusters = [[] for i in range(self.n_clusters)] for i in range(cluster_membership.shape[0]): no_cluster = True best_cluster = 0 best_val = cluster_membership[i, 0] for j in range(self.n_clusters): if cluster_membership[i, j] > 0: self.clusters[j].append(i) no_cluster = False elif cluster_membership[i, j] > best_val: best_val = cluster_membership[i, j] best_cluster = j if no_cluster: self.clusters[best_cluster].append(i) self.clusters = [np.array(c) for c in self.clusters] self.clusters_reverse_index = [] for c in self.clusters: self.clusters_reverse_index.append( {c[j]: j for j in range(len(c))}) self.clusters_embeddings = [ item_embeddings[:, c] for c in self.clusters ] self.clusters_bias = [item_bias[c] for c in self.clusters] def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) ''' if exclude is None: exclude = [] # Compile network if needed if not hasattr(self, 'predict_function'): self._compile_predict_function() # Prepare RNN input max_length_seq = sequence[-min(self.max_length, len(sequence)):] X = np.zeros((1, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN X[0, :len(max_length_seq), :] = np.array( map(lambda x: self._get_features(x, user_id), max_length_seq)) mask = np.zeros( (1, self.max_length) ) # mask of the input (to deal with sequences of different length) mask[0, :len(max_length_seq)] = 1 # Run RNN if self.interactions_are_unique: should_exclude = [i[0] for i in sequence] else: should_exclude = [] should_exclude.extend(exclude) return self.predict_function(X, 
mask.astype(theano.config.floatX), k, should_exclude) def save(self, filename): '''Save the parameters of a network into a file ''' print('Save model in ' + filename) if not os.path.exists(os.path.dirname(filename)): os.makedirs(os.path.dirname(filename)) param = lasagne.layers.get_all_param_values(self.l_out) param.append(self.cluster_repartition.get_value(borrow=True)) param.append([ p.get_value(borrow=True) for p in self.cluster_selection_layer.get_params() ]) f = file(filename, 'wb') cPickle.dump(param, f, protocol=cPickle.HIGHEST_PROTOCOL) f.close() def load(self, filename): '''Load parameters values form a file ''' f = file(filename, 'rb') param = cPickle.load(f) f.close() lasagne.layers.set_all_param_values( self.l_out, [i.astype(theano.config.floatX) for i in param[:-2]]) self.cluster_repartition.set_value(param[-2]) for p, v in zip(self.cluster_selection_layer.get_params(), param[-1]): p.set_value(v) self.prepare_tests()
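# Hedged illustration (not part of the model above): prepare_tests converts the soft
# cluster-membership matrix into hard clusters by keeping every positive entry and
# falling back to the argmax when an item has no positive score. A minimal numpy
# re-statement of that rule, with made-up names and toy values:
import numpy as np

def hard_clusters_from_membership(membership):
    """membership: (n_items, n_clusters) array of soft scores."""
    n_items, n_clusters = membership.shape
    clusters = [[] for _ in range(n_clusters)]
    for i in range(n_items):
        positive = np.flatnonzero(membership[i] > 0)
        if positive.size > 0:
            for j in positive:
                clusters[j].append(i)
        else:
            clusters[int(membership[i].argmax())].append(i)
    return [np.array(c) for c in clusters]

# Example: 4 items, 2 clusters; item 3 has no positive score and goes to its argmax cluster.
print(hard_clusters_from_membership(
    np.array([[0.5, -1.0], [-0.2, 0.3], [0.1, 0.2], [-0.5, -0.1]])))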
class Conv2DVarDropOutARD(ConvLayer): def __init__(self, incoming, num_filters, filter_size, stride=(1, 1), pad=0, untie_biases=False, Wconv=GlorotUniform(), b=Constant(0.), nonlinearity=nonlinearities.rectify, flip_filters=False, convolution=T.nnet.conv2d, ard_init=-10, **kwargs): super(Conv2DVarDropOutARD, self).__init__(incoming, num_filters, filter_size, stride, pad, untie_biases, Wconv, b, nonlinearity, flip_filters) self.convolution = convolution self.reg = True self.shape = self.get_W_shape() self.log_sigma2 = self.add_param(Constant(ard_init), self.shape, name="ls2") self._srng = RandomStreams(get_rng().randint(1, 2147462579)) @staticmethod def clip(mtx, to=8): mtx = T.switch(T.le(mtx, -to), -to, mtx) mtx = T.switch(T.ge(mtx, to), to, mtx) return mtx def convolve(self, input, deterministic=False, train_clip=False, thresh=3, **kwargs): log_alpha = self.clip(self.log_sigma2 - T.log(self.W**2 + 1e-8)) conv_mode = 'conv' if self.flip_filters else 'cross' border_mode = self.pad clip_mask = T.ge(log_alpha, thresh) if border_mode == 'same': border_mode = tuple(s // 2 for s in self.filter_size) if deterministic: conved = dnn.dnn_conv(img=input, kerns=T.switch(T.ge(log_alpha, thresh), 0, self.W), subsample=self.stride, border_mode=border_mode, conv_mode=conv_mode) else: W = self.W if train_clip: W = T.switch(clip_mask, 0, W) conved_mu = dnn.dnn_conv(img=input, kerns=W, subsample=self.stride, border_mode=border_mode, conv_mode=conv_mode) conved_si = T.sqrt(1e-8 + dnn.dnn_conv(img=input * input, kerns=T.exp(log_alpha) * W * W, subsample=self.stride, border_mode=border_mode, conv_mode=conv_mode)) conved = conved_mu + conved_si * self._srng.normal( conved_mu.shape, avg=0, std=1) return conved def eval_reg(self, **kwargs): k1, k2, k3 = 0.63576, 1.8732, 1.48695 C = -k1 log_alpha = self.clip(self.log_sigma2 - T.log(self.W**2)) mdkl = k1 * T.nnet.sigmoid(k2 + k3 * log_alpha) - 0.5 * T.log1p( T.exp(-log_alpha)) + C return -T.sum(mdkl) def get_ard(self, thresh=3, **kwargs): log_alpha = self.log_sigma2.get_value() - 2 * np.log( np.abs(self.W.get_value())) return '%.4f' % (np.sum(log_alpha > thresh) * 1.0 / log_alpha.size) def get_reg(self): log_alpha = self.log_sigma2.get_value() - 2 * np.log( np.abs(self.W.get_value())) return '%.1f, %.1f' % (log_alpha.min(), log_alpha.max())
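# Hedged illustration of the ARD read-out used by get_ard/get_reg above:
# log_alpha = log_sigma2 - 2*log|W|, and weights whose log_alpha exceeds a
# threshold (3 by default) are treated as pruned. Standalone numpy sketch with
# made-up values:
import numpy as np

W = np.array([[0.8, 1e-4], [0.05, 2.0]])
log_sigma2 = np.array([[-10.0, -2.0], [0.0, -10.0]])

log_alpha = log_sigma2 - 2 * np.log(np.abs(W))
pruned = log_alpha > 3  # same rule as the layer's thresh=3 default
print(log_alpha)
print("sparsity: %.2f" % pruned.mean())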
class NeuralNetSvi: """Implements a feedforward neural network trained using stochastic variational inference. Supports various types of layers and loss functions.""" def __init__(self, n_inputs): """Constructs a net with a given number of inputs and no layers.""" assert isposint( n_inputs), 'Number of inputs must be a positive integer.' self.n_inputs = n_inputs self.n_outputs = n_inputs self.n_units = [n_inputs] self.n_layers = 0 self.n_params = 0 self.mWs = [] self.mbs = [] self.sWs = [] self.sbs = [] self.uas = [] self.mas = [] self.zas = [] self.hs = [tt.matrix('x')] self.mps = self.mWs + self.mbs self.sps = self.sWs + self.sbs self.parms = self.mps + self.sps self.input = self.hs[0] self.output = self.hs[-1] self.srng = RandomStreams() self.eval_f = None self.eval_f_rand = None def addLayer(self, n_units, type): """Adds a new layer to the network, :param n_units: number of units in the layer :param type: a string specification of the activation function """ # check number of units assert isposint(n_units), 'Number of units must be a positive integer.' # choose activation function if type == 'logistic': if dtype == 'float32': clipvalue = 15.0 else: clipvalue = 19.0 actfun = lambda t: tt.nnet.sigmoid( tt.clip(t, -clipvalue, clipvalue)) elif type == 'tanh': if dtype == 'float32': clipvalue = 9.0 else: clipvalue = 19.0 actfun = lambda t: tt.tanh(tt.clip(t, -clipvalue, clipvalue)) elif type == 'linear': actfun = lambda t: t elif type == 'relu': actfun = tt.nnet.relu elif type == 'softplus': actfun = tt.nnet.softplus elif type == 'softmax': actfun = tt.nnet.softmax else: raise ValueError(type + ' is not a supported activation function type.') n_prev_units = self.n_outputs self.n_outputs = n_units self.n_units.append(n_units) self.n_layers += 1 self.n_params += 2 * (n_prev_units + 1) * n_units mW = theano.shared((rng.randn(n_prev_units, n_units) / np.sqrt(n_prev_units + 1)).astype(dtype), name='mW' + str(self.n_layers)) mb = theano.shared(np.zeros(n_units, dtype=dtype), name='mb' + str(self.n_layers)) sW = theano.shared(-5.0 * np.ones([n_prev_units, n_units], dtype=dtype), name='sW' + str(self.n_layers)) sb = theano.shared(-5.0 * np.ones(n_units, dtype=dtype), name='sb' + str(self.n_layers)) ua = self.srng.normal((self.hs[-1].shape[0], n_units), dtype=dtype) ma = tt.dot(self.hs[-1], mW) + mb sa = tt.dot(self.hs[-1]**2, tt.exp(2 * sW)) + tt.exp(2 * sb) za = tt.sqrt(sa) * ua + ma h = actfun(za) h.name = 'h' + str(self.n_layers) self.mWs.append(mW) self.mbs.append(mb) self.sWs.append(sW) self.sbs.append(sb) self.uas.append(ua) self.mas.append(ma) self.zas.append(za) self.hs.append(h) self.mps = self.mWs + self.mbs self.sps = self.sWs + self.sbs self.parms = self.mps + self.sps self.output = self.hs[-1] self.eval_f = None self.eval_f_rand = None def removeLayer(self): """Removes a layer from the network.""" assert self.n_layers > 0, 'There is no layer to remove.' 
n_params_to_rem = 2 * self.n_outputs * (self.n_units[-2] + 1) self.n_outputs = self.n_units[-2] self.n_units.pop() self.n_layers -= 1 self.n_params -= n_params_to_rem self.mWs.pop() self.mbs.pop() self.sWs.pop() self.sbs.pop() self.uas.pop() self.mas.pop() self.zas.pop() self.hs.pop() self.mps = self.mWs + self.mbs self.sps = self.sWs + self.sbs self.parms = self.mps + self.sps self.output = self.hs[-1] self.eval_f = None self.eval_f_rand = None def eval(self, x, rand=False): """Evaluate net at locations in x.""" if rand: # compile theano computation graph, if haven't already done so if self.eval_f_rand == None: n_data = tt.iscalar('n_data') uas = [ tt.tile(self.srng.normal((n_units, ), dtype=dtype), [n_data, 1]) for n_units in self.n_units[1:] ] self.eval_f_rand = theano.function(inputs=[self.hs[0], n_data], outputs=self.hs[-1], givens=zip(self.uas, uas)) return self.eval_f_rand(x.astype(dtype), x.shape[0]) else: # compile theano computation graph, if haven't already done so if self.eval_f == None: self.eval_f = theano.function(inputs=[self.hs[0]], outputs=self.hs[-1], givens=zip(self.zas, self.mas)) return self.eval_f(x.astype(dtype)) def printInfo(self): """Prints some useful info about the net.""" print 'Number of inputs =', self.n_inputs print 'Number of outputs =', self.n_outputs print 'Number of units =', self.n_units print 'Number of layers =', self.n_layers print 'Number of params =', self.n_params print 'Data type =', dtype def visualize_weights(self, layer, imsize, layout): """ Displays the weights of a specified layer as images. :param layer: the layer whose weights to display :param imsize: the image size :param layout: number of rows and columns for each page :return: none """ helper.disp_imdata(self.mWs[layer].get_value().T, imsize, layout) plt.show(block=False) def visualize_activations(self, x, layers=None): """ Visualizes the activations of specified layers caused by a given data minibatch. :param x: a minibatch of data :param layers: list of layers to visualize activations of; defaults to the whole net except the input layer :return: none """ if layers is None: layers = xrange(self.n_layers) forwprop = theano.function(inputs=[self.hs[0]], outputs=self.hs[1:]) hs = forwprop(x.astype(dtype)) for l in layers: fig = plt.figure() ax = fig.add_subplot(1, 1, 1) ax.imshow(hs[l], cmap='gray', interpolation='none') ax.set_title('Layer ' + str(l)) ax.set_xlabel('layer units') ax.set_ylabel('data points') plt.show(block=False) def param_hist(self, layers=None): """ Displays a histogram of weights and biases for specified layers. :param layers: list of layers to show histograms for; defaults to the whole net :return: none """ if layers is None: layers = xrange(self.n_layers) for l in layers: fig, axs = plt.subplots(2, 2) nbins = int(np.sqrt(self.mWs[l].get_value().size)) axs[0, 0].hist(self.mWs[l].get_value().flatten(), nbins, normed=True) axs[0, 0].set_title('weight means, layer ' + str(l)) axs[1, 0].hist(self.sWs[l].get_value().flatten(), nbins, normed=True) axs[1, 0].set_title('weight log stds, layer ' + str(l)) nbins = int(np.sqrt(self.mbs[l].get_value().size)) axs[0, 1].hist(self.mbs[l].get_value(), nbins, normed=True) axs[0, 1].set_title('bias means, layer ' + str(l)) axs[1, 1].hist(self.sbs[l].get_value(), nbins, normed=True) axs[1, 1].set_title('bias log stds, layer ' + str(l)) plt.show(block=False)
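# Hedged, standalone sketch of the stochastic layer built in addLayer above:
# activations are sampled as za = ma + sqrt(sa) * ua, with ma = h.mW + mb and
# sa = h**2 . exp(2*sW) + exp(2*sb), ua ~ N(0, 1). Names and shapes here are
# made up for illustration; this is not the NeuralNetSvi class itself.
import numpy as np
import theano
import theano.tensor as tt
from theano.tensor.shared_randomstreams import RandomStreams

floatX = theano.config.floatX
srng = RandomStreams()
h = tt.matrix('h')
n_in, n_out = 5, 3
mW = theano.shared(np.random.randn(n_in, n_out).astype(floatX))
mb = theano.shared(np.zeros(n_out, dtype=floatX))
sW = theano.shared(-5.0 * np.ones((n_in, n_out), dtype=floatX))
sb = theano.shared(-5.0 * np.ones(n_out, dtype=floatX))

ua = srng.normal((h.shape[0], n_out), dtype=floatX)
ma = tt.dot(h, mW) + mb
sa = tt.dot(h ** 2, tt.exp(2 * sW)) + tt.exp(2 * sb)
za = tt.sqrt(sa) * ua + ma

f = theano.function([h], za)
print(f(np.ones((2, n_in), dtype=floatX)).shape)  # (2, 3)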
def test_undefined_grad(): srng = MRG_RandomStreams(seed=1234) # checking uniform distribution low = tensor.scalar() out = srng.uniform((), low=low) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, low) high = tensor.scalar() out = srng.uniform((), low=0, high=high) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, high) out = srng.uniform((), low=low, high=high) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, (low, high)) # checking binomial distribution prob = tensor.scalar() out = srng.binomial((), p=prob) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, prob) # checking multinomial distribution prob1 = tensor.scalar() prob2 = tensor.scalar() p = [theano.tensor.as_tensor_variable([prob1, 0.5, 0.25])] out = srng.multinomial(size=None, pvals=p, n=4)[0] assert_raises(theano.gradient.NullTypeGradError, theano.grad, theano.tensor.sum(out), prob1) p = [theano.tensor.as_tensor_variable([prob1, prob2])] out = srng.multinomial(size=None, pvals=p, n=4)[0] assert_raises(theano.gradient.NullTypeGradError, theano.grad, theano.tensor.sum(out), (prob1, prob2)) # checking choice p = [theano.tensor.as_tensor_variable([prob1, prob2, 0.1, 0.2])] out = srng.choice(a=None, size=1, p=p, replace=False)[0] assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0], (prob1, prob2)) p = [theano.tensor.as_tensor_variable([prob1, prob2])] out = srng.choice(a=None, size=1, p=p, replace=False)[0] assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0], (prob1, prob2)) p = [theano.tensor.as_tensor_variable([prob1, 0.2, 0.3])] out = srng.choice(a=None, size=1, p=p, replace=False)[0] assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0], prob1) # checking normal distribution avg = tensor.scalar() out = srng.normal((), avg=avg) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, avg) std = tensor.scalar() out = srng.normal((), avg=0, std=std) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, std) out = srng.normal((), avg=avg, std=std) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, (avg, std))
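# Hedged aside, not part of the test above: the test checks that gradients with
# respect to distribution parameters fed directly into MRG_RandomStreams ops are
# undefined (NullTypeGradError). The usual workaround is the reparameterization
# trick: draw a parameter-free eps ~ N(0, 1) and form avg + std * eps outside the
# RNG op, which does have well-defined gradients. Minimal sketch:
import theano
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=1234)
avg = tensor.scalar('avg')
std = tensor.scalar('std')
eps = srng.normal(())                        # no symbolic parameters inside the op
out = avg + std * eps
g_avg, g_std = theano.grad(out, [avg, std])  # defined: 1 and eps respectively
f = theano.function([avg, std], [out, g_avg, g_std],
                    allow_input_downcast=True)
print(f(0.0, 2.0))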
class LadderAE(): def __init__(self): self.input_dim = 784 # self.denoising_cost_x = (500.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) # self.denoising_cost_x = (4000.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) self.denoising_cost_x = (1000, 10, 0.1, 0.1, 0.1, 0.1, 0.1) self.noise_std = (0.3,) * 7 # self.noise_std = (0.55, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) self.default_lr = 0.002 self.shareds = OrderedDict() self.rstream = RandomStreams(seed=1) self.rng = np.random.RandomState(seed=1) self.layers = [(0, (('fc', 784), 'relu')), (1, (('fc', 1000), 'relu')), (2, (('fc', 500), 'relu')), (3, (('fc', 250), 'relu')), (4, (('fc', 250), 'relu')), (5, (('fc', 250), 'relu')), (6, (('fc', 10), 'softmax'))] def counter(self): name = 'counter' p = self.shareds.get(name) update = [] if p is None: p_max_val = np.float32(10) p = self.shared(np.float32(1), name, role=BNPARAM) p_max = self.shared(p_max_val, name + '_max', role=BNPARAM) update = [(p, T.clip(p + np.float32(1), np.float32(0), p_max)), (p_max, p_max_val)] return (p, update) def annotate_bn(self, var, id, var_type, mb_size, size): var_shape = np.array((1, size)) out_dim = np.prod(var_shape) / np.prod(var_shape[0]) # Flatten the var - shared variable updating is not trivial otherwise, # as theano seems to believe a row vector is a matrix and will complain # about the updates orig_shape = var.shape var = var.flatten() # Here we add the name and role, the variables will later be identified # by these values var.name = id + '_%s_clean' % var_type add_role(var, BNPARAM) shared_var = self.shared(np.zeros(out_dim), name='shared_%s' % var.name, role=None) # Update running average estimates. When the counter is reset to 1, it # will clear its memory cntr, c_up = self.counter() one = np.float32(1) run_avg = lambda new, old: one / cntr * new + (one - one / cntr) * old if var_type == 'mean': new_value = run_avg(var, shared_var) elif var_type == 'var': mb_size = T.cast(mb_size, 'float32') new_value = run_avg(mb_size / (mb_size - one) * var, shared_var) else: raise NotImplemented('Unknown batch norm var %s' % var_type) def annotate_update(update, tag_to): a = Annotation() for (var, up) in update: a.updates[var] = up add_annotation(tag_to, a) # Add the counter update to the annotated update if it is the first # instance of a counter annotate_update([(shared_var, new_value)] + c_up, var) return var.reshape(orig_shape) def shared(self, init, name, cast_float32=True, role=PARAMETER, **kwargs): p = self.shareds.get(name) if p is None: p = shared_param(init, name, cast_float32, role, **kwargs) self.shareds[name] = p return p def new_activation_dict(self): return AttributeDict({'z': {}, 'h': {}, 's': {}, 'm': {}}) def encoder(self, input_, noise_std): z = input_ d = self.new_activation_dict() z = z + (self.rstream.normal(size=z.shape).astype(floatX) * noise_std[0]) d.z[0] = z h = z d.h[0] = h prev_dim = self.input_dim for i, (spec, act_f) in self.layers[1:]: layer_type, dim = spec noise = noise_std[i] if i < len(noise_std) else 0. 
z, m, s, h = self.f(h, prev_dim, layer_type, dim, i, act_f, noise) self.layer_dims[i] = dim d.z[i] = z d.s[i] = s d.m[i] = m d.h[i] = h prev_dim = dim return d def decoder(self, clean, corr, batch_size): get_unlabeled = lambda x: x[batch_size:] if x is not None else x est = self.new_activation_dict() costs = AttributeDict() costs.denois = AttributeDict() for i, ((_, spec), act_f) in self.layers[::-1]: z_corr = get_unlabeled(corr.z[i]) z_clean = get_unlabeled(clean.z[i]) z_clean_s = get_unlabeled(clean.s.get(i)) z_clean_m = get_unlabeled(clean.m.get(i)) # It's the last layer if i == len(self.layers) - 1: fspec = (None, None) ver = get_unlabeled(corr.h[i]) ver_dim = self.layer_dims[i] top_g = True else: fspec = self.layers[i + 1][1][0] ver = est.z.get(i + 1) ver_dim = self.layer_dims.get(i + 1) top_g = False z_est = self.g(z_lat=z_corr, z_ver=ver, in_dims=ver_dim, out_dims=self.layer_dims[i], num=i, fspec=fspec, top_g=top_g) # For semi-supervised version if z_clean_s: z_est_norm = (z_est - z_clean_m) / z_clean_s else: z_est_norm = z_est z_est_norm = z_est se = SquaredError('denois' + str(i)) costs.denois[i] = se.apply(z_est_norm.flatten(2), z_clean.flatten(2)) \ / np.prod(self.layer_dims[i], dtype=floatX) costs.denois[i].name = 'denois' + str(i) # Store references for later use est.z[i] = z_est est.h[i] = apply_act(z_est, act_f) est.s[i] = None est.m[i] = None return est, costs def apply(self, input_lb, input_un, target): batch_size = input_lb.shape[0] get_labeled = lambda x: x[:batch_size] if x is not None else x input = T.concatenate([input_lb, input_un], axis=0) self.layer_dims = {0: self.input_dim} self.lr = self.shared(self.default_lr, 'learning_rate', role=None) top = len(self.layers) - 1 clean = self.encoder(input, noise_std=[0]) corr = self.encoder(input, noise_std=self.noise_std) ests, costs = self.decoder(clean, corr, batch_size) # Costs y = target.flatten() costs.class_clean = CategoricalCrossEntropy().apply( y, get_labeled(clean.h[top])) costs.class_clean.name = 'CE_clean' costs.class_corr = CategoricalCrossEntropy().apply( y, get_labeled(corr.h[top])) costs.class_corr.name = 'CE_corr' costs.total = costs.class_corr * 1.0 for i in range(len(self.layers)): costs.total += costs.denois[i] * self.denoising_cost_x[i] costs.total.name = 'Total_cost' self.costs = costs # Classification error mr = MisclassificationRate() self.error = mr.apply(y, get_labeled(clean.h[top])) * np.float32(100.) self.error.name = 'Error_rate' def rand_init(self, in_dim, out_dim): return self.rng.randn(in_dim, out_dim) / np.sqrt(in_dim) def apply_layer(self, layer_type, input_, in_dim, out_dim, layer_name): # Since we pass this path twice (clean and corr encoder), we # want to make sure that parameters of both layers are shared. 
layer = self.shareds.get(layer_name) if layer is None: if layer_type == 'fc': linear = Linear(use_bias=False, name=layer_name, input_dim=in_dim, output_dim=out_dim, seed=1) linear.weights_init = Glorot(self.rng, in_dim, out_dim) linear.initialize() layer = linear self.shareds[layer_name] = layer return layer.apply(input_) def f(self, h, in_dim, layer_type, dim, num, act_f, noise_std): layer_name = 'f_' + str(num) + '_' z = self.apply_layer(layer_type, h, in_dim, dim, layer_name) m = s = None m = z.mean(0, keepdims=True) s = z.var(0, keepdims=True) # if noise_std == 0: # m = self.annotate_bn(m, layer_name + 'bn', 'mean', # z.shape[0], dim) # s = self.annotate_bn(s, layer_name + 'bn', 'var', # z.shape[0], dim) z = (z - m) / T.sqrt(s + np.float32(1e-10)) z_lat = z + self.rstream.normal(size=z.shape).astype( floatX) * noise_std z = z_lat # Add bias if act_f != 'linear': z += self.shared(0.0 * np.ones(dim), layer_name + 'b', role=BIAS) # Add Gamma parameter if necessary. (Not needed for all act_f) if (act_f in ['sigmoid', 'tanh', 'softmax']): c = self.shared(1.0 * np.ones(dim), layer_name + 'c', role=WEIGHT) z *= c h = apply_act(z, act_f) return z_lat, m, s, h def g(self, z_lat, z_ver, in_dims, out_dims, num, fspec, top_g): f_layer_type, dims = fspec layer_name = 'g_' + str(num) + '_' in_dim = np.prod(dtype=floatX, a=in_dims) out_dim = np.prod(dtype=floatX, a=out_dims) if top_g: u = z_ver else: u = self.apply_layer(f_layer_type, z_ver, in_dim, out_dim, layer_name) u -= u.mean(0, keepdims=True) u /= T.sqrt(u.var(0, keepdims=True) + np.float32(1e-10)) z_lat = z_lat.flatten(2) bi = lambda inits, name: self.shared(inits * np.ones(out_dim), layer_name + name, role=BIAS) wi = lambda inits, name: self.shared(inits * np.ones(out_dim), layer_name + name, role=WEIGHT) type_ = 'wierd' if type_ == 'wierd': sigval = (bi(0., 'c1') + wi(1., 'c2') * z_lat + wi(0., 'c3') * u + wi(0., 'c4') * z_lat * u) sigval = T.nnet.sigmoid(sigval) z_est = (bi(0., 'a1') + wi(1., 'a2') * z_lat + wi(0., 'a3') * u + wi(0., 'a4') * z_lat * u + wi(1., 'b1') * sigval) elif type_ == 'simple': # if num != 6: # z_lat = z_lat * 0.0 # wu = wi(1., 'a3') * u # else: wu = wi(0., 'a3') * u wz = wi(1., 'a2') * z_lat wzu = wi(0., 'a4') * z_lat * u z_est = (bi(0., 'a1') + wz + wu + wzu) elif type_ == 'yoshua': wz = wi(1., 'a2') * z_lat wu = wi(0., 'a3') * u b = wi(1., 'b1') batch_size = u[:, 0:1].shape srng = T.shared_randomstreams.RandomStreams( self.rng.randint(999999)) mask = srng.binomial(n=1, p=0.5, size=batch_size) mask = T.addbroadcast(mask, 1) z_est = (mask * wz + (1 - mask) * wu) + b if (type(out_dims) == tuple and len(out_dims) > 1.0 and z_est.ndim < 4): z_est = z_est.reshape((z_est.shape[0],) + out_dims) return z_est
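# Hedged, standalone sketch of the encoder corruption used above: activations are
# batch-normalized and then corrupted with additive Gaussian noise drawn from a
# Theano random stream. Names below are made up; this is not the Blocks-based
# LadderAE itself.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

floatX = theano.config.floatX
rstream = RandomStreams(seed=1)

x = T.matrix('x')
m = x.mean(0, keepdims=True)
s = x.var(0, keepdims=True)
z_clean = (x - m) / T.sqrt(s + np.float32(1e-10))
noise_std = np.float32(0.3)
z_corr = z_clean + rstream.normal(size=z_clean.shape).astype(floatX) * noise_std

f = theano.function([x], [z_clean, z_corr])
clean, corr = f(np.random.randn(4, 5).astype(floatX))
print(np.abs(corr - clean).mean())  # about noise_std * sqrt(2 / pi)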
def test_normal0(): steps = 50 std = 2. if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or config.mode == 'Mode' and config.linker in ['py']): sample_size = (25, 30) default_rtol = .02 else: sample_size = (999, 50) default_rtol = .01 sample_size_odd = (sample_size[0], sample_size[1] - 1) x = tensor.matrix() for size, const_size, var_input, input, avg, rtol, std_tol in [ (sample_size, sample_size, [], [], -5., default_rtol, default_rtol), (x.shape, sample_size, [x], [np.zeros(sample_size, dtype=config.floatX)], -5., default_rtol, default_rtol), # test odd value (x.shape, sample_size_odd, [x], [np.zeros(sample_size_odd, dtype=config.floatX)], -5., default_rtol, default_rtol), (sample_size, sample_size, [], [], np.arange(np.prod(sample_size), dtype='float32').reshape(sample_size), 10. * std / np.sqrt(steps), default_rtol), # test empty size (scalar) ((), (), [], [], -5., default_rtol, 0.02), # test with few samples at the same time ((1, ), (1, ), [], [], -5., default_rtol, 0.02), ((3, ), (3, ), [], [], -5., default_rtol, 0.02), ]: R = MRG_RandomStreams(234) # Note: we specify `nstreams` to avoid a warning. n = R.normal(size=size, avg=avg, std=std, nstreams=rng_mrg.guess_n_streams(size, warn=False)) f = theano.function(var_input, n) f(*input) # Increase the number of steps if size implies only a few samples if np.prod(const_size) < 10: steps_ = steps * 50 else: steps_ = steps basictest(f, steps_, const_size, target_avg=avg, target_std=std, prefix='mrg ', allow_01=True, inputs=input, mean_rtol=rtol, std_tol=std_tol) sys.stdout.flush() RR = theano.tensor.shared_randomstreams.RandomStreams(234) nn = RR.normal(size=size, avg=avg, std=std) ff = theano.function(var_input, nn) basictest(ff, steps_, const_size, target_avg=avg, target_std=std, prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol)
class vdrvc(object): def __init__(self): self._srng = RandomStreams(42) self.theta = None self.log_alpha = None def score(self, X, t): return acc(np.argmax(X.dot(self.theta.T), axis=1), t) def predict(self, X): return np.argmax(X.dot(self.theta.T), axis=1) def fit(self, X, t, num_classes, batch_size, max_iter=1000, display_each=100, lr=1e-2, beta=0.95): N, d = X.shape def create_theano_loss(d): X, t = T.dmatrix('X'), T.dvector('t') log_sigma2 = theano.shared(np.ones((num_classes, d))) theta = theano.shared(np.random.randn(num_classes, d)) # Change parametrization log_alpha = log_sigma2 - T.log(theta**2) la, alpha = log_alpha, T.exp(log_alpha) # -KL(q || prior) mD_KL = -(0.5 * T.log1p(T.exp(-la)) - (0.03 + 1.0 / (1.0 + T.exp(-(1.5 * (la + 1.3)))) * 0.64)).sum() # NLL through Local Reparametrization mu, si = T.dot(X, theta.T), T.sqrt( T.dot(X * X, (alpha * theta * theta).T)) activation = mu + self._srng.normal(mu.shape, avg=0, std=1) * si predictions = T.nnet.softmax(activation) ell = -T.sum( categorical_crossentropy(predictions, one_hot(t, num_classes))) # Objective Negative SGVLB nlb = -(N / batch_size * ell + mD_KL) # Optimization Method and Function Compiling opt = lasagne.updates.adam(nlb, [log_sigma2, theta], learning_rate=lr, beta1=beta) lbf = function([X, t], nlb, updates=opt) return lbf, theta, log_sigma2 lbf, theta, log_sigma2 = create_theano_loss(d) # Main loop for i in range(max_iter): if batch_size != N: idx = np.random.choice(X.shape[0], batch_size) loss = lbf(X[idx], t[idx]) else: loss = lbf(X, t) if display_each and i % display_each == 0: self.theta = theta.get_value() self.log_alpha = log_sigma2.get_value() - 2 * np.log( np.abs(self.theta)) acc_, ard_ = acc( self.predict(X), t), np.sum(self.log_alpha > 5) * 1.0 / self.log_alpha.size print('iter = %.4f' % i, 'vlb = %.4f' % loss, 'acc = %.4f' % acc_, 'ard = %.4f' % ard_) return self
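# Hedged usage sketch for the vdrvc classifier above; it assumes the class and the
# module-level helpers it relies on (acc, one_hot, categorical_crossentropy) are in
# scope, and uses made-up toy data and hyperparameters.
import numpy as np

X = np.random.randn(200, 10)
t = (X[:, 0] > 0).astype(np.float64)  # toy binary labels
clf = vdrvc().fit(X, t, num_classes=2, batch_size=200,
                  max_iter=200, display_each=50, lr=1e-2)
print("train acc:", clf.score(X, t))
print("fraction pruned:", (clf.log_alpha > 5).mean())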
def get_samples_and_objectives(self, model, data): space, sources = self.get_data_specs(model) space.validate(data) assert isinstance(model, AdversaryPair) g = model.generator d = model.discriminator # Note: this assumes data is design matrix X = data m = data.shape[space.get_batch_axis()] y1 = T.alloc(1, m, 1) y0 = T.alloc(0, m, 1) # NOTE: if this changes to optionally use dropout, change the inference # code below to use a non-dropped-out version. S, z, other_layers = g.sample_and_noise( m, default_input_include_prob=self. generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale, all_g_layers=(self.infer_layer is not None)) if self.noise_both != 0.: rng = MRG_RandomStreams(2014 / 6 + 2) S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob, self.discriminator_input_include_probs, self.discriminator_default_input_scale, self.discriminator_input_scales) y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob, self.discriminator_input_include_probs, self.discriminator_default_input_scale, self.discriminator_input_scales) # d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0)) pos_mask = y_hat1 < .5 + self.d_eps neg_mask = y_hat0 > .5 - self.d_eps pos_cost_matrix = d.layers[-1].cost_matrix(y1, y_hat1) neg_cost_matrix = d.layers[-1].cost_matrix(y0, y_hat0) pos_cost = (pos_mask * pos_cost_matrix).mean() neg_cost = (neg_mask * neg_cost_matrix).mean() d_obj = 0.5 * (pos_cost + neg_cost) if self.no_drop_in_d_for_g: y_hat0_no_drop = d.dropout_fprop(S) g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0_no_drop) else: g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0) assert g_cost_mat.ndim == 2 assert y_hat0.ndim == 2 mask = y_hat0 < 0.5 + self.g_eps masked_cost = g_cost_mat * mask g_obj = masked_cost.mean() if model.inferer is not None: # Change this if we ever switch to using dropout in the # construction of S. S_nograd = block_gradient( S) # Redundant as long as we have custom get_gradients pred = model.inferer.dropout_fprop( S_nograd, self.inference_default_input_include_prob, self.inference_input_include_probs, self.inference_default_input_scale, self.inference_input_scales) if self.infer_layer is None: target = z else: target = other_layers[self.infer_layer] i_obj = model.inferer.layers[-1].cost(target, pred) else: i_obj = 0 return S, d_obj, g_obj, i_obj
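# Hedged, standalone illustration of the `noise_both` trick used above: the same
# MRG stream adds Gaussian noise to both the real batch X and the generated batch S
# before the discriminator sees them. Shapes and the noise level are made up.
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

noise_both = 0.1
rng = MRG_RandomStreams(2014)
X = T.matrix('X')
S = T.matrix('S')
X_noisy = X + rng.normal(size=X.shape, dtype=X.dtype) * noise_both
S_noisy = S + rng.normal(size=S.shape, dtype=S.dtype) * noise_both

f = theano.function([X, S], [X_noisy, S_noisy])
xn, sn = f(np.zeros((4, 3), dtype=theano.config.floatX),
           np.ones((4, 3), dtype=theano.config.floatX))
print(xn.std(), sn.std())  # both roughly noise_both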
class QueuelessVariationalQueueManager(QueueManager): """ A variational-autoencoder-based manager which does not use the queue, with a configurable loss """ def __init__(self, feature_size, period=None, variational_loss_scale=1): """ Initialize the manager. Parameters: feature_size: The width of a feature period: Period for queue activations variational_loss_scale: Factor by which to scale variational loss """ self._feature_size = feature_size self._period = period self._srng = MRG_RandomStreams(np.random.randint(0, 1024)) self._variational_loss_scale = np.array(variational_loss_scale, np.float32) @property def activation_width(self): return self.feature_size * 2 @property def feature_size(self): return self._feature_size def helper_sample(self, input_activations): n_batch, n_time, _ = input_activations.shape means = input_activations[:, :, :self.feature_size] stdevs = abs(input_activations[:, :, self.feature_size:]) + constants.EPSILON wiggle = self._srng.normal(means.shape) vects = means + (stdevs * wiggle) strengths = T.zeros((n_batch, n_time)) if self._period is None: strengths = T.set_subtensor(strengths[:, -1], 1) else: strengths = T.set_subtensor( strengths[:, self._period - 1::self._period], 1) return strengths, vects, means, stdevs, {} def get_strengths_and_vects(self, input_activations): strengths, vects, means, stdevs, _ = self.helper_sample( input_activations) return strengths, vects def process(self, input_activations, extra_info=False): strengths, vects, means, stdevs, sample_info = self.helper_sample( input_activations) means_sq = means**2 variance = stdevs**2 loss_parts = 1 + T.log(variance) - means_sq - variance if self._period is None: loss_parts = loss_parts[:, -1] else: loss_parts = loss_parts[:, self._period - 1::self._period] variational_loss = -0.5 * T.sum( loss_parts) * self._variational_loss_scale info = {"variational_loss": variational_loss} info.update(sample_info) if extra_info: return variational_loss, strengths, vects, info else: return variational_loss, strengths, vects
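# Hedged, standalone restatement of the sampling and KL term used by the manager
# above: split activations into means and (absolute-valued) stdevs, reparameterize
# with an MRG normal draw, and penalize -0.5 * sum(1 + log(var) - mean^2 - var).
# Sizes are made up and the queue/period logic is omitted.
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(7)
feature_size = 4
acts = T.tensor3('acts')  # (batch, time, 2 * feature_size)
means = acts[:, :, :feature_size]
stdevs = abs(acts[:, :, feature_size:]) + 1e-6
vects = means + stdevs * srng.normal(means.shape)

variance = stdevs ** 2
kl = -0.5 * T.sum(1 + T.log(variance) - means ** 2 - variance)

f = theano.function([acts], [vects, kl])
v, k = f(np.random.randn(2, 3, 2 * feature_size).astype(theano.config.floatX))
print(v.shape, float(k))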
class Generator(Model): def __init__(self, mlp, noise="gaussian", monitor_ll=False, ll_n_samples=100, ll_sigma=0.2): Model.__init__(self) self.__dict__.update(locals()) del self.self self.theano_rng = MRG_RandomStreams(2014 * 5 + 27) def get_input_space(self): return self.mlp.get_input_space() def dropout_fprop(self, sample_data, default_input_include_prob=1., default_input_scale=1., all_g_layers=False): if all_g_layers: rval = self.mlp.dropout_fprop( sample_data, default_input_include_prob=default_input_include_prob, default_input_scale=default_input_scale, return_all=all_g_layers) other_layers, rval = rval[:-1], rval[-1] else: rval = self.mlp.dropout_fprop( sample_data, default_input_include_prob=default_input_include_prob, default_input_scale=default_input_scale) other_layers = None return rval, other_layers def sample_and_noise(self, num_samples, default_input_include_prob=1., default_input_scale=1., all_g_layers=False): n = self.mlp.get_input_space().get_total_dimension() noise = self.get_noise((num_samples, n)) formatted_noise = VectorSpace(n).format_as(noise, self.mlp.get_input_space()) rval, other_layers = self.dropout_fprop( formatted_noise, default_input_include_prob=default_input_include_prob, default_input_scale=default_input_scale, all_g_layers=all_g_layers) return rval, formatted_noise, other_layers def sample(self, num_samples, default_input_include_prob=1., default_input_scale=1.): sample, _, _ = self.sample_and_noise(num_samples, default_input_include_prob, default_input_scale) return sample def get_monitoring_channels(self, data): if data is None: m = 100 else: m = data.shape[0] n = self.mlp.get_input_space().get_total_dimension() noise = self.get_noise((m, n)) rval = OrderedDict() try: rval.update(self.mlp.get_monitoring_channels((noise, None))) except Exception: warnings.warn( "something went wrong with generator.mlp's monitoring channels" ) if self.monitor_ll: rval['ll'] = T.cast( self.ll(data, self.ll_n_samples, self.ll_sigma), theano.config.floatX).mean() rval['nll'] = -rval['ll'] return rval def get_noise(self, size): # Allow just requesting batch size if isinstance(size, int): size = (size, self.get_input_space().get_total_dimension()) if not hasattr(self, 'noise'): self.noise = "gaussian" if self.noise == "uniform": return self.theano_rng.uniform(low=-np.sqrt(3), high=np.sqrt(3), size=size, dtype='float32') elif self.noise == "gaussian": return self.theano_rng.normal(size=size, dtype='float32') elif self.noise == "spherical": noise = self.theano_rng.normal(size=size, dtype='float32') noise = noise / T.maximum(1e-7, T.sqrt( T.sqr(noise).sum(axis=1))).dimshuffle(0, 'x') return noise else: raise NotImplementedError(self.noise) def get_params(self): return self.mlp.get_params() def get_output_space(self): return self.mlp.get_output_space() def ll(self, data, n_samples, sigma): samples = self.sample(n_samples) output_space = self.mlp.get_output_space() if 'Conv2D' in str(output_space): samples = output_space.convert(samples, output_space.axes, ('b', 0, 1, 'c')) samples = samples.flatten(2) data = output_space.convert(data, output_space.axes, ('b', 0, 1, 'c')) data = data.flatten(2) parzen = theano_parzen(data, samples, sigma) return parzen def _modify_updates(self, updates): self.mlp.modify_updates(updates) def get_lr_scalers(self): return self.mlp.get_lr_scalers() def __setstate__(self, state): self.__dict__.update(state) if 'monitor_ll' not in state: self.monitor_ll = False
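# Hedged, standalone sketch of the three noise sources offered by Generator.get_noise
# above: uniform on [-sqrt(3), sqrt(3)], standard Gaussian, and Gaussian projected
# onto the unit sphere. The size is made up.
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

theano_rng = MRG_RandomStreams(2014 * 5 + 27)
size = (5, 8)
uniform = theano_rng.uniform(low=-np.sqrt(3), high=np.sqrt(3), size=size,
                             dtype='float32')
gaussian = theano_rng.normal(size=size, dtype='float32')
spherical = gaussian / T.maximum(
    1e-7, T.sqrt(T.sqr(gaussian).sum(axis=1))).dimshuffle(0, 'x')

f = theano.function([], [uniform, spherical])
u, s = f()
print(u.min(), u.max(), np.sqrt((s ** 2).sum(axis=1)))  # row norms ~ 1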
class ESGD(RMSProp): r'''Equilibrated SGD computes a diagonal Hessian preconditioner. Notes ----- The ESGD method uses the same general strategy as all first-order stochastic gradient methods, in the sense that these methods make small parameter adjustments iteratively using local derivative information. The difference here is that as gradients are computed during each parameter update, an exponentially-weighted moving average (EWMA) of estimates of the diagonal of the Hessian (the matrix of second derivatives) is maintained as well. At each update, the EWMA is used to compute the root-mean-square (RMS) diagonal value that's been seen in the recent past. The actual gradient is scaled by the inverse of this diagonal preconditioner before being applied to update the parameters. Intuitively, this causes the algorithm to "reshape" the loss function in parameter space, such that directions of steep gradient (i.e., large diagonal values) and directions of shallow gradient (i.e., small diagonal values) are scaled to be approximately the same slope. The diagonal estimates are computed using a nice trick: A vector :math:`r \sim \mathcal{N}(0, 1)` consisting of standard normal values is sampled randomly at each update step, and the value of :math:`Hr` is computed symbolically. These vector values tend to approximate the diagonal of the Hessian. Because :math:`Hr` is itself a vector, the full Hessian :math:`H` does not need to be computed or stored. .. math:: \begin{eqnarray*} r &\sim& \mathcal{N}(0, 1) \\ Hr &=& \frac{\partial^2 \mathcal{L}}{\partial p^2}r \\ D_{t+1} &=& \gamma D_t + (1 - \gamma) (Hr)^2 \\ p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{D_{t+1} + \epsilon}} \frac{\partial\mathcal{L}}{\partial p} \end{eqnarray*} Like :class:`Rprop` and the :class:`ADADELTA`--:class:`RMSProp` family, this learning method effectively maintains a sort of parameter-specific learning rate for each parameter in the loss. In this implementation, :math:`\epsilon` regularizes the RMS values; it is specified using the ``rms_regularizer`` parameter. The weight parameter :math:`\gamma` for the EWMA is computed from the ``rms_halflife`` keyword argument, such that the actual EWMA weight varies inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln 2}{h}}`. The primary difference between this implementation and the algorithm described in the paper (see below) is the use of an EWMA to decay the diagonal values over time, while in the paper the diagonal is divided by the training iteration. The EWMA halflife should be set to something reasonably large to ensure that this method emulates the method described in the original paper. References ---------- .. [Daup14] Y. Dauphin, H. de Vries, J. Chung & Y. Bengio. (2014) "RMSProp and equilibrated adaptive learning rates for non-convex optimization." http://arxiv.org/abs/1502.04390 ''' def __init__(self, *args, **kwargs): self.rng = RandomStreams() super(ESGD, self).__init__(*args, **kwargs) def _get_updates_for(self, param, grad): D_tm1 = shared_like(param, 'D_ewma') Hv = TT.Rop(grad, param, self.rng.normal(param.shape)) D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv den = TT.sqrt(D_t) + self.epsilon yield D_tm1, D_t yield param, param - grad * self.learning_rate / den
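# Hedged, standalone illustration of the equilibration estimate described in the
# docstring above: draw r ~ N(0, 1), form the Hessian-vector product Hr with T.Rop
# applied to the gradient, and square it to estimate the (squared) diagonal. Toy
# quadratic loss with made-up values, so the diagonal is recovered exactly.
import numpy as np
import theano
import theano.tensor as TT
from theano.tensor.shared_randomstreams import RandomStreams

rng = RandomStreams(1234)
d = TT.constant(np.array([1.0, 4.0, 9.0]))      # Hessian diagonal of the toy loss
p = theano.shared(np.array([1.0, -2.0, 0.5]), name='p')
loss = 0.5 * (d * p * p).sum()
grad = TT.grad(loss, p)
r = rng.normal(p.shape)
Hr = TT.Rop(grad, p, r)                          # Hessian-vector product, here d * r
f = theano.function([], Hr / r)                  # recovers the diagonal for this loss
print(f())  # ~ [1., 4., 9.]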
def test_match_grad_valid_conv(): # Tests that weightActs is the gradient of FilterActs # with respect to the weights. for partial_sum in [0, 1, 4]: rng = np.random.RandomState([2012, 10, 9]) batch_size = 3 rows = 7 cols = 9 channels = 8 filter_rows = 4 filter_cols = filter_rows num_filters = 16 images = shared(rng.uniform( -1., 1., (channels, rows, cols, batch_size)).astype('float32'), name='images') filters = shared(rng.uniform(-1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'), name='filters') gpu_images = gpu_from_host(images) gpu_filters = gpu_from_host(filters) output = FilterActs(partial_sum=partial_sum)(gpu_images, gpu_filters) output = host_from_gpu(output) images_bc01 = images.dimshuffle(3, 0, 1, 2) filters_bc01 = filters.dimshuffle(3, 0, 1, 2) filters_bc01 = filters_bc01[:, :, ::-1, ::-1] output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid') output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0) theano_rng = MRG_RandomStreams(2013 + 1 + 31) coeffs = theano_rng.normal(avg=0., std=1., size=output_conv2d.shape, dtype='float32') cost_conv2d = (coeffs * output_conv2d).sum() weights_grad_conv2d = T.grad(cost_conv2d, filters) cost = (coeffs * output).sum() hid_acts_grad = T.grad(cost, output) weights_grad = WeightActs(partial_sum=partial_sum)( gpu_images, gpu_from_host(hid_acts_grad), as_tensor_variable( (4, 4)))[0] weights_grad = host_from_gpu(weights_grad) f = function( [], [output, output_conv2d, weights_grad, weights_grad_conv2d]) output, output_conv2d, weights_grad, weights_grad_conv2d = f() if np.abs(output - output_conv2d).max() > 8e-6: assert type(output) == type(output_conv2d) assert output.dtype == output_conv2d.dtype if output.shape != output_conv2d.shape: print 'cuda-convnet shape: ', output.shape print 'theano shape: ', output_conv2d.shape assert False err = np.abs(output - output_conv2d) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (output.min(), output.max()) print 'theano value range: ', (output_conv2d.min(), output_conv2d.max()) assert False warnings.warn( """test_match_grad_valid_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. Probably theano CPU 64 bit is OK but it's worth checking the others.""" ) if np.abs(weights_grad - weights_grad_conv2d).max() > 8.6e-6: if type(weights_grad) != type(weights_grad_conv2d): raise AssertionError("weights_grad is of type " + str(weights_grad)) assert weights_grad.dtype == weights_grad_conv2d.dtype if weights_grad.shape != weights_grad_conv2d.shape: print 'cuda-convnet shape: ', weights_grad.shape print 'theano shape: ', weights_grad_conv2d.shape assert False err = np.abs(weights_grad - weights_grad_conv2d) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (weights_grad.min(), weights_grad.max()) print 'theano value range: ', (weights_grad_conv2d.min(), weights_grad_conv2d.max()) assert False
def test_normal0(): steps = 50 std = 2. if mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']: sample_size = (25, 30) default_rtol = .02 else: sample_size = (999, 50) default_rtol = .01 sample_size_odd = (sample_size[0], sample_size[1] - 1) x = tensor.matrix() for size, const_size, var_input, input, avg, rtol in [ (sample_size, sample_size, [], [], -5., default_rtol), (x.shape, sample_size, [x], [numpy.zeros(sample_size, dtype=config.floatX)], -5., default_rtol), #test odd value (sample_size_odd, sample_size_odd, [], [], -5., default_rtol), #test odd value (x.shape, sample_size_odd, [x], [numpy.zeros(sample_size_odd, dtype=config.floatX)], -5., default_rtol), (sample_size, sample_size, [], [], numpy.arange(numpy.prod(sample_size), dtype='float32').reshape(sample_size), 10. * std / numpy.sqrt(steps)), ]: #print '' #print 'ON CPU:' R = MRG_RandomStreams(234, use_cuda=False) # Note: we specify `nstreams` to avoid a warning. n = R.normal(size=size, avg=avg, std=std, nstreams=rng_mrg.guess_n_streams(size, warn=False)) f = theano.function(var_input, n, mode=mode) #theano.printing.debugprint(f) out = f(*input) #print 'random?[:10]\n', out[0, 0:10] basictest(f, steps, const_size, target_avg=avg, target_std=std, prefix='mrg ', allow_01=True, inputs=input, mean_rtol=rtol) sys.stdout.flush() if mode != 'FAST_COMPILE' and cuda_available: #print '' #print 'ON GPU:' R = MRG_RandomStreams(234, use_cuda=True) n = R.normal(size=size, avg=avg, std=std, dtype='float32', nstreams=rng_mrg.guess_n_streams(size, warn=False)) #well, it's really that this test w GPU doesn't make sense otw assert n.dtype == 'float32' f = theano.function( var_input, theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(n), borrow=True), mode=mode_with_gpu) #theano.printing.debugprint(f) sys.stdout.flush() gpu_out = numpy.asarray(f(*input)) #print 'random?[:10]\n', gpu_out[0, 0:10] #print '----' sys.stdout.flush() basictest(f, steps, const_size, target_avg=avg, target_std=std, prefix='gpu mrg ', allow_01=True, inputs=input, mean_rtol=rtol) # Need to allow some rounding error as their is float # computation that are done on the gpu vs cpu assert numpy.allclose(out, gpu_out, rtol=5e-6, atol=5e-6) #print '' #print 'ON CPU w NUMPY:' RR = theano.tensor.shared_randomstreams.RandomStreams(234) nn = RR.normal(size=size, avg=avg, std=std) ff = theano.function(var_input, nn) basictest(ff, steps, const_size, target_avg=avg, target_std=std, prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol)
class SGHMCSampler(object): def __init__(self, rng=None, precondition=False, ignore_burn_in=False): if rng: self._srng = rng else: self._srng = RandomStreams(np.random.randint(1, 2147462579)) self.precondition = precondition self.prepared = False self.ignore_burn_in = ignore_burn_in self.steps_burn_in = 0 self.requires_burn_in = self.precondition self.optim_params = [] self.initial_values = [] def _store_initial_values(self, *params): self.optim_params = [] self.initial_values = [] for param in params: self.optim_params.append(param) self.initial_values.append(param.get_value()) def prepare_updates(self, cost, params, epsilon, mdecay=0.05, inputs=[], scale_grad=1., A=None, **kwargs): self.updates = [] self.burn_in_updates = [] grads = T.grad(cost, params) self.params = params self.cost = cost self.count = sharedX(0) self.epsilon = sharedX(np.float32(epsilon)) self.mdecay = sharedX(np.float32(mdecay)) self.inputs = inputs self.scale_grad = theano.shared(np.float32(scale_grad)) if A is not None: # calculate mdecay based on A #raise NotImplementedError("TODO") eps_scaled = epsilon / np.sqrt(scale_grad) new_mdecay = A * eps_scaled self.mdecay.set_value(np.float32(new_mdecay)) print("You specified A of {} -> changing mdecay to {}".format( A, mdecay)) for theta, grad in zip(params, grads): xi = sharedX(theta.get_value() * 0. + 1, broadcastable=theta.broadcastable) g = sharedX(theta.get_value() * 0. + 1, broadcastable=theta.broadcastable) g2 = sharedX(theta.get_value() * 0. + 1, broadcastable=theta.broadcastable) p = sharedX(theta.get_value() * 0., broadcastable=theta.broadcastable) r_t = 1. / (xi + 1.) self._store_initial_values(xi, g, g2, p) if self.precondition: g_t = (1. - r_t) * g + r_t * grad g2_t = (1. - r_t) * g2 + r_t * grad**2 xi_t = 1. + xi * (1. - g * g / (g2 + 1e-16)) Minv = 1. / (T.sqrt(g2 + 1e-16) + 1e-16) self.burn_in_updates.append((g, g_t)) self.burn_in_updates.append((g2, g2_t)) self.burn_in_updates.append((xi, xi_t)) noise = 0. else: Minv = 1. noise = 0. self.epsilon_scaled = self.epsilon / T.sqrt(self.scale_grad) noise_scale = 2. * self.epsilon_scaled**2 * self.mdecay * Minv - 2. * self.epsilon_scaled**3 * T.square( Minv) * noise sigma = T.sqrt(T.maximum(noise_scale, 1e-16)) sample_t = self._srng.normal(size=theta.shape) * sigma p_t = p - self.epsilon**2 * Minv * grad - self.mdecay * p + sample_t theta_t = theta + p_t self.updates.append((theta, theta_t)) self.updates.append((p, p_t)) self.prepared = True if self.ignore_burn_in: self.updates += self.burn_in_updates return self.updates else: return self.updates, self.burn_in_updates def step(self, *inp): if not self.prepared: raise RuntimeError( "You called step() without a prior call to prepare_updates()") if not hasattr(self, "step_fun"): print("... compiling theano function") self.step_fun = theano.function(self.inputs, self.cost, updates=self.updates) if not self.ignore_burn_in and self.steps_burn_in < 1 and self.requires_burn_in: raise RuntimeError( "Your sampler requires a burn_in please run step_burn_in() for a few steps" ) nll = self.step_fun(*inp) return self.params, nll def step_burn_in(self, *inp): if not self.prepared: raise RuntimeError( "You called step_burn_in() without a prior call to prepare_updates()" ) if not hasattr(self, "step_fun_burn_in"): print("... 
compiling theano function") if self.ignore_burn_in: self.step_fun_burn_in = theano.function(self.inputs, self.cost, updates=self.updates) else: self.step_fun_burn_in = theano.function(self.inputs, self.cost, updates=self.updates + self.burn_in_updates) nll = self.step_fun_burn_in(*inp) self.steps_burn_in += 1 return self.params, nll def reset(self, n_samples, epsilon, reset_opt_params=False, **kwargs): if self.prepared: self.epsilon.set_value(np.float32(epsilon)) self.scale_grad.set_value(np.float32(n_samples)) if hasattr(self, "mdecay"): if "mdecay" in kwargs: self.mdecay.set_value(np.float32(kwargs["mdecay"])) elif "A" in kwargs: eps_scaled = self.epsilon.get_value() / np.sqrt(n_samples) new_mdecay = kwargs["A"] * eps_scaled self.mdecay.set_value(np.float32(new_mdecay)) if reset_opt_params: for param, value in zip(self.optim_params, self.initial_values): param.set_value(value) else: raise RuntimeError("reset called before prepare")
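# Hedged usage sketch for the SGHMCSampler above on a toy 1-D Gaussian target; the
# hyperparameters and shapes are made up, and it assumes the module-level sharedX
# helper used by the class is available. The chain should drift toward the target
# mean of 3.0; exact numbers depend on epsilon and the number of steps.
import numpy as np
import theano
import theano.tensor as T

theta = theano.shared(np.zeros(1, dtype=np.float32), name='theta')
cost = 0.5 * T.sum((theta - np.float32(3.0)) ** 2)  # negative log N(3, 1), up to a constant

sampler = SGHMCSampler(precondition=False, ignore_burn_in=True)
sampler.prepare_updates(cost, [theta], epsilon=1e-2, mdecay=0.05,
                        inputs=[], scale_grad=1.)
samples = []
for _ in range(2000):
    _, nll = sampler.step()
    samples.append(float(theta.get_value()[0]))
print(np.mean(samples[1000:]))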