def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        # use stored mean and std
        mean = self.mean
        std = self.std
    else:
        # use this batch's mean and std
        mean = input.mean(self.axes, keepdims=True)
        std = input.std(self.axes, keepdims=True)
        # and update the stored mean and std:
        # we create (memory-aliased) clones of the stored mean and std
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them
        running_mean.default_update = (1 - self.alpha) * running_mean + self.alpha * mean
        running_std.default_update = (1 - self.alpha) * running_std + self.alpha * std
        # and include them in the graph so their default updates will be
        # applied (although the expressions will be optimized away later)
        mean += 0 * running_mean
        std += 0 * running_std
    std += self.epsilon
    mean = T.addbroadcast(mean, *self.axes)
    std = T.addbroadcast(std, *self.axes)
    beta = T.addbroadcast(self.beta, *self.axes)
    gamma = T.addbroadcast(self.gamma, *self.axes)
    normalized = (input - mean) * (gamma / std) + beta
    return self.nonlinearity(normalized)
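# Hedged, minimal sketch of the default_update trick used above, isolated from any layer
# code. The names (x, running_stat, f) are illustrative and not from the original snippet;
# it assumes Theano and NumPy are importable.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
running_stat = theano.shared(np.asarray(0.0, dtype=theano.config.floatX))
# memory-aliased clone of the shared variable, so an update can be attached to this use only
stat_clone = theano.clone(running_stat, share_inputs=False)
stat_clone.default_update = 0.9 * stat_clone + 0.1 * x.mean()
# the `0 *` term pulls the clone into the graph; theano.function then collects its
# default_update and applies it to the shared storage on every call
out = x.mean() + 0 * stat_clone
f = theano.function([x], out)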
def get_output_for(self, input, deterministic=False,
                   batch_norm_use_averages=None,
                   batch_norm_update_averages=None, **kwargs):
    self.count = self.count + 1
    self.alpha = 5.0 / (10 + self.count)
    # self.alpha = 1.0 / (self.count^2)

    input_mean = input.mean(self.axes)
    input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))

    # Decide whether to use the stored averages or mini-batch statistics
    if batch_norm_use_averages is None:
        batch_norm_use_averages = deterministic
    use_averages = batch_norm_use_averages

    if use_averages:
        mean = self.mean
        inv_std = self.inv_std
    else:
        mean = input_mean
        inv_std = input_inv_std

    # Decide whether to update the stored averages
    if batch_norm_update_averages is None:
        batch_norm_update_averages = not deterministic
    update_averages = batch_norm_update_averages

    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_inv_std = theano.clone(self.inv_std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_inv_std.default_update = ((1 - self.alpha) * running_inv_std +
                                          self.alpha * input_inv_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        inv_std += 0 * running_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(input.ndim - len(self.axes)))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized
def _init_exprs(self):
    # Here we need to replace the input with a corrupted version. If we do
    # so naively by calling clone on the loss, the targets (which are
    # identical to the inputs in the sense of identity in programming)
    # will be replaced as well. Instead, we just want to have the inputs
    # replaced. Thus we first clone the output of the model and replace
    # the input with the corrupted input. This will not change the
    # targets. Afterwards, we put that corruption into the loss as well.
    super(DenoisingAutoEncoder, self)._init_exprs()
    if self.noise_type == 'gauss':
        corrupted_inpt = corrupt.gaussian_perturb(
            self.exprs['inpt'], self.c_noise)
    elif self.noise_type == 'mask':
        corrupted_inpt = corrupt.mask(
            self.exprs['inpt'], self.c_noise)

    output_from_corrupt = theano.clone(
        self.exprs['output'],
        {self.exprs['inpt']: corrupted_inpt}
    )

    score = self.exprs['loss']
    loss = theano.clone(
        self.exprs['loss'],
        {self.exprs['output']: output_from_corrupt})

    self.exprs.update(get_named_variables(locals(), overwrite=True))
def apply_replacements(self, node, deterministic=False,
                       include=None, exclude=None,
                       more_replacements=None):
    """
    Replace variables in graph with variational approximation. By default, replaces all variables

    Parameters
    ----------
    node : Theano Variables (or Theano expressions)
        node or nodes for replacements
    deterministic : bool
        whether to use zeros as initial distribution
        if True - zero initial point will produce constant latent variables
    include : list
        latent variables to be replaced
    exclude : list
        latent variables to be excluded for replacements
    more_replacements : dict
        add custom replacements to graph, e.g. change input source

    Returns
    -------
    node(s) with replacements
    """
    replacements = self.construct_replacements(
        include, exclude, more_replacements
    )
    node = theano.clone(node, replacements, strict=False)
    posterior = self.random(no_rand=deterministic)
    return theano.clone(node, {self.input: posterior}, strict=False)
def __init__(self, freq, activation, input, target_idx, task_loss,
             surrogate_loss, hyperparameter, learning_rate,
             batch_generator, n_batches, factor=1.5, n_updates=10):
    Extension.__init__(self, 'adapt_zloss', freq)
    self.batch_generator = batch_generator
    self.n_batches = n_batches
    self.learning_rate = learning_rate
    self.hyperparameter = hyperparameter
    self.factor = factor
    self.n_updates = n_updates

    # grad = theano.grad(surrogate_loss, activation)
    # new_activation = activation - learning_rate * grad

    self.fun_activation = theano.function([input], activation)

    activation_bis = tensor.matrix()
    surr_loss_bis = theano.clone(surrogate_loss,
                                 replace={activation: activation_bis})
    grad = theano.grad(surr_loss_bis, activation_bis)
    new_activation = activation_bis - 100*learning_rate * grad
    task_loss_bis = theano.clone(task_loss,
                                 replace={activation: new_activation})

    self.fun_update_task_loss = theano.function(
        [activation_bis, target_idx], [task_loss_bis, new_activation])
def _make_loss_functions(self, mode=None):
    """Return pair (f_loss, f_d_loss) of functions.

     - f_loss returns the current loss,
     - f_d_loss returns the gradient of that loss wrt parameters,
    """
    rng = T.shared_randomstreams.RandomStreams()

    # Drop out inpts.
    inpt = self.exprs['inpt']
    inpt_dropped_out = corrupt.mask(inpt, self.p_dropout_inpt, rng)
    givens = {inpt: inpt_dropped_out}
    loss = theano.clone(self.exprs['loss'], givens)

    n_layers = len(self.n_hiddens)
    for i in range(n_layers - 1):
        # Drop out hidden.
        hidden = self.exprs['hidden_%i' % i]
        hidden_dropped_out = corrupt.mask(hidden, self.p_dropout_hidden, rng)
        givens = {hidden: hidden_dropped_out}
        loss = theano.clone(loss, givens)

    d_loss = T.grad(loss, self.parameters.flat)

    f_loss = self.function(['inpt', 'target'], loss, explicit_pars=True,
                           mode=mode)
    f_d_loss = self.function(['inpt', 'target'], d_loss, explicit_pars=True,
                             mode=mode)
    return f_loss, f_d_loss
def filter_and_prob(inpt, transition, emission,
                    visible_noise_mean, visible_noise_cov,
                    hidden_noise_mean, hidden_noise_cov,
                    initial_hidden, initial_hidden_cov):
    step = forward_step(
        transition, emission,
        visible_noise_mean, visible_noise_cov,
        hidden_noise_mean, hidden_noise_cov)

    hidden_mean_0 = T.zeros_like(hidden_noise_mean).dimshuffle('x', 0)
    hidden_cov_0 = T.zeros_like(hidden_noise_cov).dimshuffle('x', 0, 1)
    f0, F0, ll0 = step(inpt[0], hidden_mean_0, hidden_cov_0)
    replace = {hidden_noise_mean: initial_hidden,
               hidden_noise_cov: initial_hidden_cov}
    f0 = theano.clone(f0, replace)
    F0 = theano.clone(F0, replace)
    ll0 = theano.clone(ll0, replace)

    (f, F, ll), _ = theano.scan(
        step,
        sequences=inpt[1:],
        outputs_info=[f0, F0, None])

    ll = ll.sum(axis=0)

    f = T.concatenate([T.shape_padleft(f0), f])
    F = T.concatenate([T.shape_padleft(F0), F])
    ll += ll0

    return f, F, ll
def forward(self, input_org, train=True, update_batch_stat=True, finetune=False):
    print("Layer/BatchNormalization")
    ldim, cdim, rdim = self._internal_shape(input_org)
    input = input_org.reshape((ldim, cdim, rdim))
    if train:
        mean = T.mean(input, axis=(0, 2), keepdims=True)
        var = T.mean((input-mean)**2, axis=(0, 2), keepdims=True)

        if update_batch_stat:
            finetune_N = theano.clone(self.finetune_N, share_inputs=False)
            if finetune:
                finetune_N.default_update = finetune_N+1
                ratio = T.cast(1-1.0/(finetune_N+1), theano.config.floatX)
            else:
                finetune_N.default_update = 0
                ratio = self.moving_avg_ratio
            m = ldim*rdim
            scale = T.cast(m/(m-1.0), theano.config.floatX)
            est_mean = theano.clone(self.est_mean, share_inputs=False)
            est_var = theano.clone(self.est_var, share_inputs=False)
            est_mean.default_update = T.cast(ratio*self.est_mean + (1-ratio)*mean, theano.config.floatX)
            est_var.default_update = T.cast(ratio*self.est_var + (1-ratio)*scale*var, theano.config.floatX)
            mean += 0 * est_mean
            var += 0 * est_var
        output = self._pbc(self.gamma) * (input - self._pbc(mean)) \
            / T.sqrt(1e-6+self._pbc(var)) + self._pbc(self.beta)
    else:
        output = self._pbc(self.gamma) * (input - self._pbc(self.est_mean)) \
            / T.sqrt(1e-6+self._pbc(self.est_var)) + self._pbc(self.beta)
    return output.reshape(input_org.shape)
def __call__(self, z):
    if z.ndim > 1:
        a = theano.scan(
            lambda z_: theano.clone(self.op.apply(self.tf),
                                    {self.op.input: z_}, strict=False),
            sequences=z, n_steps=z.shape[0])[0].mean()
    else:
        a = theano.clone(self.op.apply(self.tf),
                         {self.op.input: z}, strict=False)
    return tt.abs_(a)
def safe_clone(cost, replace):
    params = replace.keys()
    nw_vals = replace.values()
    dummy_params = [x.type() for x in params]
    dummy_cost = theano.clone(cost,
                              replace=dict(zip(params, dummy_params)))
    return theano.clone(dummy_cost,
                        replace=dict(zip(dummy_params, nw_vals)))
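# Hedged usage sketch for safe_clone above. The variables x and cost are illustrative and
# not part of the original snippet; assumes Theano is importable.
import theano
import theano.tensor as T

x = T.vector('x')
cost = (x ** 2).sum()
# Evaluate the cost at x + 1: the replacement expression itself mentions x, and routing
# the substitution through fresh dummy variables keeps the replacement value from being
# rewritten along with the rest of the graph.
shifted_cost = safe_clone(cost, {x: x + 1})
f = theano.function([x], shifted_cost)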
def step(input, mask, cumsum_grad_att, extra_grad_h, h, h_pre, update,
         grad_h, C, *prev_grad_params):
    """
    A single timestep of the backward pass.

    Parameters
    ----------
    input: (batch_size, n_in)
    mask: (batch_size,)
    cumsum_grad_att: (batch_size, n_hidden)
    h: (batch_size, n_hidden)
    h_pre: (batch_size, n_hidden)
    update: (batch_size, n_hidden)
    grad_h: (batch_size, n_hidden)
    C: (batch_size, n_hidden, n_hidden)
    *prev_grad_params

    Returns
    -------
    grad_input: (batch_size, n_in)
    grad_h_pre: (batch_size, n_hidden)
    C_pre: (batch_size, n_hidden, n_hidden)
    gradients with respect to the params (both of the recurrent and the
        update rule)
    """
    C_pre = self.attention_update_rule.restore_previous_matrix(C, update)

    att_grads = theano.clone(
        output=[u_grad_h] + u_grad_params,
        replace={u_h: h,
                 u_mask: mask,
                 u_C_pre: C_pre,
                 u_grad_att: cumsum_grad_att,
                 u_query: h})
    grad_h_att = att_grads[0]
    grad_params_att = att_grads[1:]

    grad_h_att *= 1000 / T.sum(seq_mask, axis=0)[:, None]

    grad_h_att = T.switch(mask[:, None], grad_h_att, .0)

    rec_grads = theano.clone(
        output=[back_grad_input, back_grad_h_pre] + back_grad_params,
        replace={back_input: input,
                 back_mask: mask,
                 back_h_pre: h_pre,
                 back_grad_h: extra_grad_h + grad_h + grad_h_att})

    grad_input = rec_grads[0]
    grad_h_pre = rec_grads[1]
    grad_params_rec = rec_grads[2:]

    grad_params = grad_params_att + grad_params_rec

    scan_outputs = [grad_input, grad_h_pre, C_pre]
    for prev_grad, grad in zip(prev_grad_params, grad_params):
        scan_outputs.append(prev_grad + grad)

    return tuple(scan_outputs)
def _elbo_t_new(logp, uw_g, uw_l, inarray_g, inarray_l,
                n_mcsamples, random_seed):
    """Return expression of approximate ELBO based on Monte Carlo sampling.
    """
    r = MRG_RandomStreams(seed=random_seed)

    if uw_l is not None:
        l_g = (uw_g.size/2).astype('int64')
        u_g = uw_g[:l_g]
        w_g = uw_g[l_g:]
        l_l = (uw_l.size/2).astype('int64')
        u_l = uw_l[:l_l]
        w_l = uw_l[l_l:]

        logp_ = lambda z_g, z_l: theano.clone(
            logp, {inarray_g: z_g, inarray_l: z_l}, strict=False
        )

        if n_mcsamples == 1:
            n_g = r.normal(size=inarray_g.tag.test_value.shape)
            z_g = n_g * tt.exp(w_g) + u_g
            n_l = r.normal(size=inarray_l.tag.test_value.shape)
            z_l = n_l * tt.exp(w_l) + u_l

            elbo = logp_(z_g, z_l) + \
                tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi)) + \
                tt.sum(w_l) + 0.5 * l_l * (1 + np.log(2.0 * np.pi))
        else:
            ns_g = r.normal(size=inarray_g.tag.test_value.shape)
            zs_g = ns_g * tt.exp(w_g) + u_g
            ns_l = r.normal(size=inarray_l.tag.test_value.shape)
            zs_l = ns_l * tt.exp(w_l) + u_l

            logps, _ = theano.scan(fn=lambda z_g, z_l: logp_(z_g, z_l),
                                   outputs_info=None,
                                   sequences=zip(zs_g, zs_l))
            elbo = tt.mean(logps) + \
                tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi)) + \
                tt.sum(w_l) + 0.5 * l_l * (1 + np.log(2.0 * np.pi))
    else:
        l_g = (uw_g.size/2).astype('int64')
        u_g = uw_g[:l_g]
        w_g = uw_g[l_g:]

        logp_ = lambda z_g: theano.clone(logp, {inarray_g: z_g}, strict=False)

        if n_mcsamples == 1:
            n_g = r.normal(size=inarray_g.tag.test_value.shape)
            z_g = n_g * tt.exp(w_g) + u_g

            elbo = logp_(z_g) + \
                tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi))
        else:
            n_g = r.normal(size=(n_mcsamples, u_g.tag.test_value.shape[0]))
            zs_g = n_g * tt.exp(w_g) + u_g

            logps, _ = theano.scan(fn=lambda q: logp_(q),
                                   outputs_info=None,
                                   sequences=[zs_g])
            elbo = tt.mean(logps) + \
                tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi))

    return elbo
def get_output_for(self, input, deterministic=False, **kwargs):
    input_mean = input.mean(self.axes)
    input_var = input.var(self.axes)

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = kwargs.get('batch_norm_use_averages',
                              deterministic)
    if use_averages:
        mean = self.mean
        var = self.var
    else:
        mean = input_mean
        var = input_var

    # Decide whether to update the stored averages
    update_averages = kwargs.get('batch_norm_update_averages',
                                 not deterministic)
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_var = theano.clone(self.var, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_var.default_update = ((1 - self.alpha) * running_var +
                                      self.alpha * input_var)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        var += 0 * running_var

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(self.beta.ndim))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = self.beta.dimshuffle(pattern)
    gamma = self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    std = T.sqrt(var + self.epsilon)
    std = std.dimshuffle(pattern)

    # normalize
    # normalized = (input - mean) * (gamma / std) + beta
    normalized = T.nnet.batch_normalization(input, gamma=gamma, beta=beta,
                                            mean=mean, std=std,
                                            mode=self.mode)
    return self.nonlinearity(normalized)
def get_output_for(self, input, deterministic=False,
                   collect=False, **kwargs):
    if collect:
        # use this batch's mean and var
        if self.stat_indices is None:
            mean = input.mean(self.axes, keepdims=True)
            var = input.var(self.axes, keepdims=True)
        else:
            mean = input[self.stat_indices].mean(self.axes, keepdims=True)
            var = input[self.stat_indices].var(self.axes, keepdims=True)
        # and update the stored mean and var:
        # we create (memory-aliased) clones of the stored mean and var
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_var = theano.clone(self.var, share_inputs=False)

        # set a default update for them
        if self.alpha != 'single_pass':
            running_mean.default_update = (
                (1 - self.alpha) * running_mean + self.alpha * mean)
            running_var.default_update = (
                (1 - self.alpha) * running_var + self.alpha * var)
        else:
            print("Collecting using single pass...")
            # this is ugly, figure out what can be safely removed...
            running_mean.default_update = (0 * running_mean + 1.0 * mean)
            running_var.default_update = (0 * running_var + 1.0 * var)

        # and include them in the graph so their default updates will be
        # applied (although the expressions will be optimized away later)
        mean += 0 * running_mean
        var += 0 * running_var

    elif deterministic:
        # use stored mean and var
        mean = self.mean
        var = self.var
    else:
        # use this batch's mean and var
        mean = input.mean(self.axes, keepdims=True)
        var = input.var(self.axes, keepdims=True)

    mean = T.addbroadcast(mean, *self.axes)
    var = T.addbroadcast(var, *self.axes)
    normalized = (input - mean) / T.sqrt(var + self.epsilon)

    if self.return_stats:
        return [normalized, mean, var]
    else:
        return normalized
def _apply(self, x):
    import theano

    input_shape = K.shape(x)
    is_training = K.is_training(x)
    ndim = K.ndim(x)
    self.config(input_shape=input_shape)
    # ====== training mode ====== #
    input_mean = K.mean(x, self.axes)
    input_inv_std = K.inv(K.sqrt(K.var(x, self.axes) + self.epsilon))

    # Decide whether to use the stored averages or mini-batch statistics
    if not is_training:
        mean = self.mean
        inv_std = self.inv_std
    else:  # update the stored averages
        mean = input_mean
        inv_std = input_inv_std
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_inv_std = theano.clone(self.inv_std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_inv_std.default_update = ((1 - self.alpha) * running_inv_std +
                                          self.alpha * input_inv_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        inv_std += 0 * running_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(ndim - len(self.axes)))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else K.dimshuffle(self.beta, pattern)
    gamma = 1 if self.gamma is None else K.dimshuffle(self.gamma, pattern)
    mean = K.dimshuffle(mean, pattern)
    inv_std = K.dimshuffle(inv_std, pattern)

    # normalize
    normalized = (x - mean) * (gamma * inv_std) + beta
    # set shape for output
    K.add_shape(normalized, input_shape)
    return self.activation(normalized)
def set_size_and_deterministic(self, node, s, d):
    initial_local = self._initial_part_matrix('local', s, d)
    initial_global = self._initial_part_matrix('global', s, d)

    # optimizations
    if isinstance(s, int) and (s == 1) or s is None:
        node = theano.clone(node, {
            self.logp: self.single_symbolic_logp
        })

    out = theano.clone(node, {
        self.symbolic_initial_local_matrix: initial_local,
        self.symbolic_initial_global_matrix: initial_global,
    })
    try_to_set_test_value(node, out, None)
    return out
def clone(**new_inputs):
    new_obj = utils.copy(self)
    # Reorder inputs
    assert len(new_obj.inputs) == len(new_inputs.items())
    pairs = [(x, new_inputs[x.name]) for x in inputs]
    new_obj.inputs = new_inputs.values()
    new_obj.out = theano.clone(new_obj.out, replace=pairs)
    if hasattr(new_obj, 'cost'):
        new_obj.cost = theano.clone(new_obj.cost, replace=pairs)
    if hasattr(new_obj, 'grads'):
        new_obj.grads = theano.clone(new_obj.grads, replace=pairs)
    if hasattr(new_obj, 'sample'):
        new_obj.sample = theano.clone(new_obj.sample, replace=pairs)
    return new_obj
def clone(self, **new_inputs):
    new_obj = utils.copy(self)
    # Reorder inputs
    assert len(new_obj.inputs) == len(new_inputs.items())
    # TODO: error with inputs arg here. corrected missing self argument,
    # this method must not be used
    pairs = [(x, new_inputs[x.name]) for x in inputs]
    new_obj.inputs = new_inputs.values()
    new_obj.out = theano.clone(new_obj.out, replace=pairs)
    if hasattr(new_obj, 'cost'):
        new_obj.cost = theano.clone(new_obj.cost, replace=pairs)
    if hasattr(new_obj, 'grads'):
        new_obj.grads = theano.clone(new_obj.grads, replace=pairs)
    if hasattr(new_obj, 'sample'):
        new_obj.sample = theano.clone(new_obj.sample, replace=pairs)
    return new_obj
def single_symbolic_logp(self):
    logp = self.to_flat_input(self.model.logpt)
    loc = self.symbolic_random_local_matrix[0]
    glob = self.symbolic_random_global_matrix[0]
    iloc = self.local_input
    iglob = self.global_input
    return theano.clone(logp, {iloc: loc, iglob: glob})
def symbolic_log_q_W_local(self):
    mu, rho = self.__local_mu_rho
    mu = self.scale_grad(mu)
    rho = self.scale_grad(rho)
    z = self.symbolic_random_local_matrix
    logp = log_normal(z, mu, rho=rho)
    if self.local_size == 0:
        scaling = tt.constant(1, mu.dtype)
    else:
        scaling = []
        for var in self.local_vars:
            scaling.append(tt.repeat(var.scaling, var.dsize))
        scaling = tt.concatenate(scaling)
    # we need only dimensions here
    # from incoming unobserved
    # to get rid of input_view
    # I replace it with the first row
    # of total_random matrix
    # that always exists
    scaling = self.to_flat_input(scaling)
    scaling = theano.clone(
        scaling, {
            self.local_input: self.symbolic_random_local_matrix[0],
            self.global_input: self.symbolic_random_global_matrix[0]
        })
    logp *= scaling
    logp = logp.sum(1)
    return logp  # shape (s,)
def logp(self, z):
    factors = ([tt.sum(var.logpt) for var in self.model.basic_RVs] +
               [tt.sum(var) for var in self.model.potentials])
    p = self.approx.to_flat_input(tt.add(*factors))
    p = theano.clone(p, {self.input: z})
    return p
def test_cloning_available(self):
    gop = generator(integers())
    res = gop ** 2
    shared = theano.shared(floatX(10))
    res1 = theano.clone(res, {gop: shared})
    f = theano.function([], res1)
    assert f() == np.float32(100)
def sample_node(self, node, size=100, more_replacements=None):
    """
    Samples given node or nodes over shared posterior

    Parameters
    ----------
    node : Theano Variables (or Theano expressions)
    size : scalar
        number of samples
    more_replacements : dict
        add custom replacements to graph, e.g. change input source

    Returns
    -------
    sampled node(s) with replacements
    """
    if more_replacements is not None:   # pragma: no cover
        node = theano.clone(node, more_replacements, strict=False)
    posterior = self.random(size)
    node = self.to_flat_input(node)

    def sample(z):
        return theano.clone(node, {self.input: z}, strict=False)

    nodes, _ = theano.scan(sample, posterior, n_steps=size)
    return nodes
def fuse(building_blocks, fuse_dim=4, input_variables=None,
         entry_expression=None, output_expressions=-1,
         input_dtype='float32'):
    num_blocks = len(building_blocks)

    if isinstance(output_expressions, numbers.Number):
        output_expressions = [output_expressions]

    # account for indices -1, -2 etc
    output_expressions = [oe % num_blocks for oe in output_expressions]

    if fuse_dim == 4:
        fuse_block = T.tensor4
    else:
        fuse_block = T.matrix

    if input_variables is None and entry_expression is None:
        input_variables = fuse_block(dtype=input_dtype)
        entry_expression = input_variables

    current_expression = entry_expression
    outputs = []

    for i, block in enumerate(building_blocks):
        if not hasattr(block, "expression_"):
            block._build_expression()
        current_expression = theano.clone(
            block.expression_,
            replace={block.input_: current_expression},
            strict=False)
        if i in output_expressions:
            outputs.append(current_expression)

    return outputs, input_variables
def gradIminibatch_srng(self, x, srng, num_samples, model_type='iwae'):
    # rep_x = T.extra_ops.repeat(x, num_samples, axis=0)
    rep_x = t_repeat(x, num_samples, axis=0)  # works marginally faster than theano's T.extra_ops.repeat
    q_samples = self.q_samplesIx_srng(rep_x, srng)

    log_ws = self.log_weightsIq_samples(q_samples)

    log_ws_matrix = log_ws.reshape((x.shape[0], num_samples))
    log_ws_minus_max = log_ws_matrix - T.max(log_ws_matrix, axis=1, keepdims=True)

    ws = T.exp(log_ws_minus_max)
    ws_normalized = ws / T.sum(ws, axis=1, keepdims=True)
    ws_normalized_vector = T.reshape(ws_normalized, log_ws.shape)

    dummy_vec = T.vector(dtype=theano.config.floatX)

    if model_type in ['vae', 'VAE']:
        print("Training a VAE")
        return collections.OrderedDict([(
            param,
            T.grad(T.sum(log_ws)/T.cast(num_samples, log_ws.dtype), param)
        ) for param in self.params])
    else:
        print("Training an IWAE")
        # Gradient of the importance-weighted bound: differentiate w.r.t. a dummy
        # vector and then substitute the normalized weights via theano.clone, so no
        # gradient flows through the weights themselves.
        return collections.OrderedDict([(
            param,
            theano.clone(
                T.grad(T.dot(log_ws, dummy_vec), param),
                replace={dummy_vec: ws_normalized_vector})
        ) for param in self.params])
def __call__(self, z, **kwargs):
    if 'more_tf_params' in kwargs:
        m = -1
    else:
        m = 1
    if z.ndim > 1:
        a = theano.scan(
            lambda z_: theano.clone(
                self.op.apply(self.tf),
                {self.op.input: z_}, strict=False),
            sequences=z, n_steps=z.shape[0])[0].mean()
    else:
        a = theano.clone(
            self.op.apply(self.tf),
            {self.op.input: z}, strict=False)
    return m * self.op.T(a)
def test_cloning_available(self):
    gop = pm.Minibatch(np.arange(100), 1)
    res = gop ** 2
    shared = theano.shared(np.array([10]))
    res1 = theano.clone(res, {gop: shared})
    f = theano.function([], res1)
    assert f() == np.array([100])
def test_gt_grad():
    """A user test that failed.

    Something about it made Elemwise.grad return something that was
    too complicated for get_scalar_constant_value to recognize as being 0,
    so gradient.grad reported that it was not a valid gradient of an
    integer.
    """
    floatX = config.floatX
    T = theano.tensor

    input_ = T.vector(dtype=floatX)
    random_values = numpy.random.RandomState(1234).uniform(
        low=-1, high=1, size=(2, 2))
    W_values = numpy.asarray(random_values, dtype=floatX)
    W = theano.shared(value=W_values, name='weights')
    correct_score = T.dot(input_, W)
    wrong_input = T.vector(dtype=floatX)
    wrong_score = theano.clone(correct_score, {input_: wrong_input})
    # Hinge loss
    scores = T.ones_like(correct_score) - correct_score + wrong_score
    cost = (scores * (scores > 0)).sum()
    T.grad(cost, input_)
def score_function(self, sc_n_mc=None, more_replacements=None, fn_kwargs=None):   # pragma: no cover
    R"""Compiles a scoring function that takes no inputs and returns the loss

    Parameters
    ----------
    sc_n_mc : `int`
        number of scoring MC samples
    more_replacements:
        Apply custom replacements before compiling a function
    fn_kwargs: `dict`
        arbitrary kwargs passed to theano.function

    Returns
    -------
    theano.function
    """
    if fn_kwargs is None:
        fn_kwargs = {}
    if not self.op.RETURNS_LOSS:
        raise NotImplementedError('%s does not have loss' % self.op)
    if more_replacements is None:
        more_replacements = {}
    loss = theano.clone(self(sc_n_mc), more_replacements, strict=False)
    return theano.function([], loss, **fn_kwargs)
def logp_(z_g, z_l):
    return theano.clone(logp, OrderedDict({
        inarray_g: z_g,
        inarray_l: z_l
    }), strict=False)
def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
    """Create Theano tensor of approximate ELBO by Monte Carlo sampling.
    """
    l = (uw.size / 2).astype('int64')
    u = uw[:l]
    w = uw[l:]

    # Callable tensor
    logp_ = lambda input: theano.clone(logp, {inarray: input}, strict=False)

    # Naive Monte-Carlo
    if random_seed is None:
        r = MRG_RandomStreams(gen_random_state())
    else:
        r = MRG_RandomStreams(seed=random_seed)

    if n_mcsamples == 1:
        n = r.normal(size=inarray.tag.test_value.shape)
        q = n * tt.exp(w) + u
        elbo = logp_(q) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))
    else:
        n = r.normal(size=(n_mcsamples, u.tag.test_value.shape[0]))
        qs = n * tt.exp(w) + u
        logps, _ = theano.scan(fn=lambda q: logp_(q),
                               outputs_info=None,
                               sequences=[qs])
        elbo = tt.mean(logps) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))

    return elbo
def __init__(self, rng, P_input, L2_input, **kwargs):
    # symbol declaration, initialization and definition
    x_1_tm1, x_t = (
        sparse.csr_matrix("x_1_tm1", dtype=theano.config.floatX),
        sparse.csr_matrix("x_t", dtype=theano.config.floatX)
    ) if P_input is None else P_input[:2]

    # elements of history
    shape = kwargs.get("shape")
    if shape is not None:
        dict_size = shape[0]
        # consume the first entry of the shape argument before passing the rest on
        if len(shape) <= 1:
            del kwargs["shape"]
        else:
            kwargs["shape"] = shape[1:]
    else:
        dict_size = (16, 1, 32, 32)
    D_1_tm1 = theano.shared(rng.normal(size=dict_size).astype(theano.config.floatX))
    Dx_1_tm1 = sparse.dot(x_1_tm1, D_1_tm1)  # array access=dot operation
    super(SequenceCNN, self).__init__(rng=rng, inputsymbol=Dx_1_tm1, **kwargs)  # attaches new elements into the fgraph
    self.L2_output_1_tm1 = self.L2_output

    # elements of current time
    D_t = theano.shared(rng.normal(size=dict_size).astype(theano.config.floatX))
    Dx_t = sparse.dot(x_t, D_t)  # array access=dot operation
    self.L2_output_t = theano.clone(self.L2_output_1_tm1, replace={Dx_1_tm1: Dx_t})

    # element preparation for model building
    self.P_input = (x_1_tm1, x_t)
    self.params += [D_1_tm1, D_t]
    self.L2_output = self.L2_output_1_tm1 * self.L2_output_t
def check_mat_rop_lop(self, y, out_shape):
    vx = numpy.asarray(self.rng.uniform(size=self.mat_in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.mat_in_shape),
                       theano.config.floatX)
    yv = tensor.Rop(y, self.mx, self.mv)
    rop_f = function([self.mx, self.mv], yv)
    sy, _ = theano.scan(
        lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
        sequences=tensor.arange(y.shape[0]),
        non_sequences=[y, self.mx, self.mv])
    scan_f = function([self.mx, self.mv], sy)

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    self.check_nondiff_rop(
        theano.clone(y, replace={self.mx: break_op(self.mx)}))

    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)
    yv = tensor.Lop(y, self.mx, self.v)
    lop_f = function([self.mx, self.v], yv)
    sy = tensor.grad((self.v * y).sum(), self.mx)
    scan_f = function([self.mx, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
    """Create Theano tensor of approximate ELBO by Monte Carlo sampling.
    """
    l = (uw.size / 2).astype('int64')
    u = uw[:l]
    w = uw[l:]

    # Callable tensor
    logp_ = lambda input: theano.clone(logp, {inarray: input}, strict=False)

    # Naive Monte-Carlo
    r = MRG_RandomStreams(seed=random_seed)

    if n_mcsamples == 1:
        n = r.normal(size=inarray.tag.test_value.shape)
        q = n * exp(w) + u
        elbo = logp_(q) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))
    else:
        n = r.normal(size=(n_mcsamples, u.tag.test_value.shape[0]))
        qs = n * exp(w) + u
        logps, _ = theano.scan(fn=lambda q: logp_(q),
                               outputs_info=None,
                               sequences=[qs])
        elbo = tt.mean(logps) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))

    return elbo
def get_output_for(self, input, deterministic=False, **kwargs): """ Binary dense layer dot product computation """ if(self.xnor): # binarize the input bin_input, beta = binarize_fc_input(input) # compute weight scaling factor. self.Wb, alpha = binarize_fc_weights(self.W) if not deterministic: old_alpha = theano.clone(self.xalpha, share_inputs=False) old_alpha.default_update = alpha alpha += 0*old_alpha else: alpha = self.xalpha #W_full_precision = self.Wb * alpha.dimshuffle('x', 0) Wr = self.W self.W = self.Wb fc_out = super(DenseLayer, self).get_output_for(bin_input, **kwargs) # scale the output by alpha and beta # FIXME: Actually we are scaling after adding bias here. Need to scale first and then add bias. # The super class method automatically adds bias. Somehow need to overcome this.. # may subtract the bias, scale by alpha and beta ans then add bias ? fc_out = fc_out * beta.dimshuffle(0, 'x') fc_out = fc_out * alpha.dimshuffle('x', 0) #self.W = W_full_precision self.W = Wr else: fc_out = super(DenseLayer, self).get_output_for(input, **kwargs) return fc_out
def testBackward():
    # x_s = theano.shared(np.random.normal(0.0, 0.1, size=(1, patchSize*patchSize)).astype('float32'))
    x_s = theano.shared(np.zeros((1, patchSize*patchSize), dtype='float32'))
    y_s = theano.shared(np.ones((1,), dtype='int32'))

    c = classifier.validation_cost(y) + 0.01*T.sum(abs(x))
    loss = theano.clone(c, {x: x_s, y: y_s})

    upd = lasagne.updates.rmsprop(loss, [x_s], learning_rate=0.01)
    func = theano.function(inputs=[], outputs=loss, updates=upd)

    for i in range(10000):
        res = func()
        print("Loss: {0}".format(res))
        # if i%100 == 0:
        #     img = x_s.get_value(borrow=False)
        #     img = img.reshape((patchSize, patchSize))
        #     plt.imshow(img, cmap='gray')
        #     plt.show()

    img = x_s.get_value(borrow=False)
    img = img.reshape((patchSize, patchSize))
    plt.imshow(img, cmap='gray')
    plt.show()
def corrupt(exprs, name, typ, pars):
    f_corrupt = lookup(typ, _corrupt)
    if 'true_loss' not in exprs:
        exprs['true_loss'] = exprs['loss']
    uncorrupted = exprs[name]
    corrupted = f_corrupt(uncorrupted, **pars)
    exprs['loss'] = theano.clone(exprs['loss'], {uncorrupted: corrupted})
def cpu_to_gpu_graph(inputs, outputs):
    """ Converts a cpu-only subgraph into a gpu-only subgraph

    >>> x, y = theano.tensor.matrix('x'), theano.tensor.matrix('y')
    >>> z = theano.tensor.dot(x, y)

    >>> gpu_inputs, gpu_outputs = cpu_to_gpu_graph((x,y), (z,))
    >>> f = theano.function(gpu_inputs, gpu_outputs)
    >>> theano.printing.debugprint(f)
    GpuDot22 [@A] ''   0
     |gpu_x [@B]
     |gpu_y [@C]
    """
    math_opt = theano.compile.optdb.query('-inplace', '+fast_run', '-gpu')
    gpu_opt = cuda.opt.gpu_optimizer.query('+gpu', '-inplace', '-async')
    gpu_comm = cuda.opt.gpu_cut_copies.query('+gpu')

    gpu_inputs, cpu_inputs = zip(*map(cpu_to_gpu_var, inputs))
    outputs2 = theano.clone(outputs, replace=dict(zip(inputs, cpu_inputs)))
    gpu_outputs = map(theano.sandbox.cuda.basic_ops.gpu_from_host, outputs2)

    fgraph = theano.FunctionGraph(gpu_inputs, gpu_outputs)
    math_opt.optimize(fgraph)
    gpu_opt.optimize(fgraph)
    gpu_comm.optimize(fgraph)
    fgraph.disown()

    for go, co in zip(gpu_outputs, outputs):
        go.name = gpu_name(co.name)

    return tuple(gpu_inputs), tuple(gpu_outputs)
def prior_dlogp(vars, model, flat_view):
    """Returns the gradient of the prior on the parameters as a vector of size D x 1"""
    terms = tt.concatenate(
        [theano.grad(var.logpt, var).flatten() for var in vars], axis=0)
    dlogp = theano.clone(terms, flat_view.replacements, strict=False)
    return dlogp
def get_output(self, input, **kwargs):
    input_mean = input.mean(self.axes)
    input_invstd = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = self.deterministic
    if use_averages:
        mean = self.mean
        invstd = self.invstd
    else:
        mean = input_mean
        invstd = input_invstd

    # Decide whether to update the stored averages
    update_averages = self.update_averages and not use_averages
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_invstd = theano.clone(self.invstd, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_invstd.default_update = (
            (1 - self.alpha) * running_invstd + self.alpha * input_invstd)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        invstd += 0 * running_invstd

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(list(range(input.ndim - len(self.axes))))
    pattern = [
        'x' if input_axis in self.axes else next(param_axes)
        for input_axis in range(input.ndim)
    ]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    invstd = invstd.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * invstd) + beta
    return self.activation(normalized)
def __init__(self, components):
    """Constructor.

    Parameters
    ----------
    * `components` [list of `DistributionMixin`]:
        The components to join together.
    """
    super(Join, self).__init__()
    self.components = components

    for i, component in enumerate(components):
        # Add component parameters, constants and observeds
        if isinstance(component, TheanoDistribution):
            for p_i in component.parameters_:
                self.parameters_.add(p_i)
            for c_i in component.constants_:
                self.constants_.add(c_i)
            for o_i in component.observeds_:
                self.observeds_.add(o_i)

    # Derive and override pdf and nll analytically if possible
    if all([hasattr(c, "pdf_") for c in self.components]):
        # pdf
        c0 = self.components[0]
        self.pdf_ = theano.clone(c0.pdf_, {c0.X: self.X[:, 0:c0.ndim]})
        start = c0.ndim

        for c in self.components[1:]:
            self.pdf_ *= theano.clone(
                c.pdf_, {c.X: self.X[:, start:start+c.ndim]})
            start += c.ndim

        self._make(self.pdf_, "pdf")

    if all([hasattr(c, "nll_") for c in self.components]):
        # nll
        c0 = self.components[0]
        self.nll_ = theano.clone(c0.nll_, {c0.X: self.X[:, 0:c0.ndim]})
        start = c0.ndim

        for c in self.components[1:]:
            self.nll_ += theano.clone(
                c.nll_, {c.X: self.X[:, start:start+c.ndim]})
            start += c.ndim

        self._make(self.nll_, "nll")
def gradIminibatch_srng(self, x, srng, num_samples, model_type='iwae',
                        backward_pass='******'):
    # rep_x = T.extra_ops.repeat(x, num_samples, axis=0)
    rep_x = t_repeat(
        x, num_samples,
        axis=0)  # works marginally faster than theano's T.extra_ops.repeat
    q_samples = self.q_samplesIx_srng(rep_x, srng)

    log_ws = self.log_weightsIq_samples(q_samples)

    log_ws_matrix = log_ws.reshape((x.shape[0], num_samples))

    # for alpha divergence (take 0 <= alpha <= 1)
    # see the math to show why we can directly set alpha = 1,
    # with reparameterization trick
    if backward_pass == 'full':
        log_ws_matrix *= (1.0 - self.alpha)
        log_ws_minus_max = log_ws_matrix - T.max(
            log_ws_matrix, axis=1, keepdims=True)
        ws = T.exp(log_ws_minus_max)
        ws_normalized = ws / T.sum(ws, axis=1, keepdims=True)
        ws_normalized_vector = T.reshape(ws_normalized, log_ws.shape)
        dummy_vec = T.vector(dtype=theano.config.floatX)
    else:
        # just take the particle that has the largest (unnormalised) weight
        # NOTE: might pick different particles for different datapoint!
        log_ws_max = log_ws_matrix.max(axis=1)

    if backward_pass == 'max':
        print("Training an AAE with largest particle")
        return collections.OrderedDict([
            (param,
             T.grad(T.sum(log_ws_max) / T.cast(1, log_ws.dtype), param))
            for param in self.params
        ])
    elif model_type in ['vae', 'VAE']:
        print("Training a VAE")
        return collections.OrderedDict([
            (param,
             T.grad(T.sum(log_ws) / T.cast(num_samples, log_ws.dtype), param))
            for param in self.params
        ])
    else:
        print("Training an AAE with alpha = %.2f, k = %d" % (self.alpha,
                                                             num_samples))
        return collections.OrderedDict([
            (param,
             theano.clone(T.grad(T.dot(log_ws, dummy_vec), param),
                          replace={dummy_vec: ws_normalized_vector}))
            for param in self.params
        ])
def inner_replacer(graph):
    new_graph = replacer(graph)

    other_inputs = []
    constants = []
    for input_ in gof.graph.inputs([new_graph]):
        if isinstance(input_, gof.Variable):
            if isinstance(input_, gof.Constant):
                constants.append(input_)
            else:
                other_inputs.append(input_)

    # foreign inputs are fgraph inputs and shared variables that we need
    # to access through inner inputs
    foreign_inputs = list(set(other_inputs) - set(outer_to_inner.values()))

    # skip further processing if there is nothing to do
    if not constants and not foreign_inputs:
        return new_graph

    replacements = []

    # constants just need to be replaced by copies that the inner
    # `fg` can take ownership of
    for input_ in constants:
        new_input = input_.clone()
        new_input.name = f"{new_input.name}_copied"
        replacements.append((input_, new_input))

    for outer_input in foreign_inputs:
        if getattr(outer_input, "update", False):
            # when theano.scan() constructs a scan node, it detects
            # shared variables with updates and returns these updates
            # to the user. we need to do the same thing for every new
            # use of such a variable that is introduced. it's hard to
            # do that at this point.
            # shared variables with updates inside the inner graph of
            # OpFromGraph are not supported at all, so we don't support
            # introducing those either.
            raise NotImplementedError(
                f"Replacement introduces shared variable {outer_input} "
                "which has an update associated with it into "
                f"the inner graph of {containing_op}. This is not currently "
                "supported.")
        # if this foreign input is not already available
        # as an inner input, connect it through a new
        # inner input
        if outer_input not in outer_to_inner.keys():
            inner_input = utils.safe_new(outer_input, tag="_copy")
            outer_to_inner[outer_input] = inner_input
            extra_inner_inputs.append(inner_input)
            extra_outer_inputs.append(outer_input)

    replacements.extend(outer_to_inner.items())

    (new_graph,) = theano.clone([new_graph],
                                share_inputs=True,
                                replace=replacements)
    return new_graph
def normalizing_constant(self):
    t = self.to_flat_input(
        tt.max([v.scaling for v in self.model.basic_RVs]))
    t = theano.clone(t, {self.input: tt.zeros(self.total_size)})
    # if not scale_cost_to_minibatch: t=1
    t = tt.switch(self.scale_cost_to_minibatch, t,
                  tt.constant(1, dtype=t.dtype))
    return t
def set_size_and_deterministic(self, node, s, d): """ Replaces self.symbolic_n_samples and self._deterministic_flag with non symbolic input. Used whenever user specifies `sample size` and `deterministic` option """ initial_local = self._initial_part_matrix('local', s, d) initial_global = self._initial_part_matrix('global', s, d) # optimizations if isinstance(s, int) and (s == 1) or s is None: node = theano.clone(node, {self.logp: self.single_symbolic_logp}) return theano.clone( node, { self.symbolic_initial_local_matrix: initial_local, self.symbolic_initial_global_matrix: initial_global, })
def symbsample_X(self, Y=None, X=None):
    """
    TODO: Write docstring
    """
    if Y is None:
        Y = self.Y
    if X is None:
        X = self.lat_ev_model.get_X()

    Xgen = self.lat_ev_model.get_X()
    Nsamps, Tbins = Y.shape[0], Y.shape[1]

    TheChol = theano.clone(self.TheChol, replace={self.Y: Y, Xgen: X})
    postX = theano.clone(self.postX, replace={self.Y: Y, Xgen: X})

    normSamps = srng.normal([Nsamps, Tbins, self.xDim])
    noise, _ = theano.scan(
        lambda tc1, tc2, ns: blk_chol_inv(tc1, tc2, ns,
                                          lower=False, transpose=True),
        sequences=(TheChol[0], TheChol[1], normSamps))
    return postX + noise
def forward_pass(self, z0):
    ret = theano.clone(self.forward, {self.root.z0: z0})
    try:
        ret.tag.test_value = np.random.normal(
            size=z0.tag.test_value.shape).astype(self.z0.dtype)
    except AttributeError:
        ret.tag.test_value = self.root.z0.tag.test_value
    return ret
def make_L_fn(self, loss, params):
    grads = theano.grad(loss, params)
    params_next = [x - 1. / self.L * g for x, g in zip(params, grads)]
    loss_next = theano.clone(loss, replace=zip(params, params_next))
    sq_sum = sum((g**2).sum() for g in grads)
    return theano.function([self.input_var, self.target_var],
                           [loss_next, sq_sum])
def logp_norm(self, z):
    t = self.approx.normalizing_constant
    factors = ([tt.sum(var.logpt) / t for var in self.model.basic_RVs] +
               [tt.sum(var) / t for var in self.model.potentials])
    logpt = tt.add(*factors)
    p = self.approx.to_flat_input(logpt)
    p = theano.clone(p, {self.input: z})
    return p
def __call__(self, input):
    """
    Replace the single input of the symbolic tensor with the passed argument.

    Parameters
    ----------
    input : TensorVariable
    """
    oldinput, = inputvars(self.tensor)
    return theano.clone(self.tensor, {oldinput: input}, strict=False)
def check_rop_lop(self, y, out_shape):
    """
    As check_mat_rop_lop, except the input is self.x which is a
    vector. The output is still a vector.
    """
    # TEST ROP
    vx = np.asarray(self.rng.uniform(size=self.in_shape),
                    theano.config.floatX)
    vv = np.asarray(self.rng.uniform(size=self.in_shape),
                    theano.config.floatX)

    yv = tensor.Rop(y, self.x, self.v)
    rop_f = function([self.x, self.v], yv, on_unused_input="ignore")
    J, _ = theano.scan(
        lambda i, y, x: tensor.grad(y[i], x),
        sequences=tensor.arange(y.shape[0]),
        non_sequences=[y, self.x],
    )
    sy = tensor.dot(J, self.v)

    scan_f = function([self.x, self.v], sy, on_unused_input="ignore")

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert np.allclose(v1, v2), "ROP mismatch: %s %s" % (v1, v2)

    known_fail = False
    try:
        tensor.Rop(theano.clone(y, replace={self.x: break_op(self.x)}),
                   self.x, self.v)
    except ValueError:
        known_fail = True

    # TEST LOP
    vx = np.asarray(self.rng.uniform(size=self.in_shape),
                    theano.config.floatX)
    vv = np.asarray(self.rng.uniform(size=out_shape),
                    theano.config.floatX)

    yv = tensor.Lop(y, self.x, self.v)
    lop_f = function([self.x, self.v], yv, on_unused_input="ignore")
    J, _ = theano.scan(
        lambda i, y, x: tensor.grad(y[i], x),
        sequences=tensor.arange(y.shape[0]),
        non_sequences=[y, self.x],
    )
    sy = tensor.dot(self.v, J)

    scan_f = function([self.x, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert np.allclose(v1, v2), "LOP mismatch: %s %s" % (v1, v2)

    if known_fail:
        pytest.skip("Rop does not handle non-differentiable inputs "
                    "correctly. Bug exposed by fixing Add.grad method.")
def test_gen_cloning_with_shape_change(self, datagen):
    gen = generator(datagen)
    gen_r = tt_rng().normal(size=gen.shape).T
    X = gen.dot(gen_r)
    res, _ = theano.scan(lambda x: x.sum(), X, n_steps=X.shape[0])
    assert res.eval().shape == (50,)
    shared = theano.shared(datagen.data.astype(gen.dtype))
    res2 = theano.clone(res, {gen: shared**2})
    assert res2.eval().shape == (1000,)
def compute_Entropy(self, Y=None, X=None):
    if Y is None:
        Y = self.Y
    if X is None:
        X = self.X

    Xgen = self.lat_ev_model.get_X()
    Nsamps, Tbins = Y.shape[0], Y.shape[1]

    LnDeterminant = theano.clone(self.LnDeterminant,
                                 replace={self.Y: Y, Xgen: X})
    Entropy = (0.5*LnDeterminant +
               0.5*Nsamps*Tbins*(1 + np.log(2*np.pi))*self.xDim)
    # Yuanjun has xDim here so I put it but I don't think this is right.
    return Entropy
def logp_norm(self):
    sized_symbolic_logp = self.approx.sized_symbolic_logp
    if self.use_histogram:
        sized_symbolic_logp = theano.clone(
            sized_symbolic_logp,
            dict(zip(self.approx.symbolic_randoms,
                     self.approx.collect('histogram'))))
    return sized_symbolic_logp / self.approx.symbolic_normalizing_constant
def _batch_normalization(input_variable, name, mode_switch, alpha=0.5,
                         strict=True):
    """Based on batch normalization by Jan Schluter for Lasagne"""
    raise ValueError("NYI")
    G_name = name + '_G'
    B_name = name + '_B'
    list_of_names = [G_name, B_name]
    if not names_in_graph(list_of_names, graph):
        input_dim = calc_expected_dims(graph, input_variable)[-1]
        np_G = np_ones((input_dim,))
        np_B = np_zeros((input_dim,))
        add_arrays_to_graph([np_G, np_B], list_of_names, graph,
                            strict=strict)
    else:
        if strict:
            raise AttributeError(
                "Name %s already found in graph with strict mode!" % name)
    G, B = fetch_from_graph(list_of_names, graph)
    eps = 1E-20
    batch_mean = input_variable.mean(axis=0, keepdims=True)
    batch_std = input_variable.std(axis=0, keepdims=True)
    running_mean_shape = calc_expected_dims(graph, batch_mean)
    running_std_shape = calc_expected_dims(graph, batch_std)
    running_mean = theano.clone(batch_mean, share_inputs=True)
    running_std = theano.clone(batch_std, share_inputs=True)
    running_mean, running_std = add_random_to_graph(
        [running_mean, running_std],
        [running_mean_shape, running_std_shape],
        [name + '_running_mean', name + '_running_std'],
        graph)
    running_mean.default_update = ((1 - alpha) * running_mean +
                                   alpha * batch_mean)
    running_std.default_update = ((1 - alpha) * running_std +
                                  alpha * batch_std)
    running_mean = tensor.addbroadcast(running_mean, 0)
    running_std = tensor.addbroadcast(running_std, 0)
    batch_mean += 0 * running_mean
    batch_std += 0 * running_std
    # include running_{mean, std} in computation graph for updates...
    fixed = (input_variable - running_mean) / (running_std + eps)
    batch = (input_variable - batch_mean) / (batch_std + eps)
    normed = (1 - mode_switch) * batch + mode_switch * fixed
    out = G * normed + B
    return out
def check_rop_lop(self, y, out_shape):
    """
    As check_mat_rop_lop, except the input is self.x which is a
    vector. The output is still a vector.
    """
    # TEST ROP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)

    yv = tensor.Rop(y, self.x, self.v)
    rop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(J, self.v)

    scan_f = function([self.x, self.v], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    known_fail = False
    try:
        self.check_nondiff_rop(
            theano.clone(y, replace={self.x: break_op(self.x)}))
    except AssertionError:
        known_fail = True

    # TEST LOP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)

    yv = tensor.Lop(y, self.x, self.v)
    lop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(self.v, J)

    scan_f = function([self.x, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))

    if known_fail:
        raise KnownFailureTest("Rop doesn't handle non-differentiable "
                               "inputs correctly. Bug exposed by fixing "
                               "Add.grad method.")
def get_output_for(self, input, deterministic=False, **kwargs):
    beta = self.beta
    if not deterministic:
        self_beta = theano.clone(self.beta, share_inputs=False)
        input_beta = ttt.percentile(input, self.perc)
        self_beta.default_update = ((1 - self.alpha) * self_beta +
                                    self.alpha * input_beta)
        beta += 0 * self_beta
    # thresholding
    return theano.tensor.nnet.sigmoid(self.tight*(input - beta + self.bias))