def loss(self, n_samples, regularization_strength, mix, mu, sigma):
    log_sum_loss = -tensor.sum(tensor.log(
        tensor.sum(mix * tensor.inv(np.sqrt(2 * np.pi) * sigma) *
                   tensor.exp(tensor.neg(tensor.sqr(mu - self.target_vector)) *
                              tensor.inv(2 * tensor.sqr(sigma))), axis=0)
    ))
    # reg_loss = tensor.sum(tensor.sqr(self.layers.values()[0].W))
    # for layer in self.layers.values()[1:]:
    #     reg_loss += tensor.sum(tensor.sqr(layer.W))
    # regularization = 1/n_samples * regularization_strength/2 * reg_loss
    return log_sum_loss  # + regularization
def get_output_for(self, input, deterministic=False,
                   batch_norm_use_averages=None,
                   batch_norm_update_averages=None, **kwargs):
    self.count = self.count + 1
    self.alpha = 5.0 / (10 + self.count)
    # self.alpha = 1.0 / (self.count^2)

    input_mean = input.mean(self.axes)
    input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))

    # Decide whether to use the stored averages or mini-batch statistics
    if batch_norm_use_averages is None:
        batch_norm_use_averages = deterministic
    use_averages = batch_norm_use_averages

    if use_averages:
        mean = self.mean
        inv_std = self.inv_std
    else:
        mean = input_mean
        inv_std = input_inv_std

    # Decide whether to update the stored averages
    if batch_norm_update_averages is None:
        batch_norm_update_averages = not deterministic
    update_averages = batch_norm_update_averages

    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_inv_std = theano.clone(self.inv_std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_inv_std.default_update = ((1 - self.alpha) * running_inv_std +
                                          self.alpha * input_inv_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        inv_std += 0 * running_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(input.ndim - len(self.axes)))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized
def normal_log_likelihood_per_component(x, mu, sigma, mixing):
    return (
        MINUS_HALF_LOG_2PI
        - T.log(sigma)
        - 0.5 * T.inv(sigma**2) * (x - mu)**2
        + T.log(mixing)
    )
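# A minimal sketch (an assumption, not part of the original code): combining the
# per-component terms above into a mixture negative log-likelihood with a
# numerically stable log-sum-exp. The component axis and the function name
# mixture_nll are illustrative assumptions.
def mixture_nll(x, mu, sigma, mixing, component_axis=-1):
    log_p = normal_log_likelihood_per_component(x, mu, sigma, mixing)
    max_log_p = T.max(log_p, axis=component_axis, keepdims=True)
    log_mix = (T.max(log_p, axis=component_axis) +
               T.log(T.sum(T.exp(log_p - max_log_p), axis=component_axis)))
    return -T.mean(log_mix)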
def __init():
    dataset = T.matrix("dataset", dtype=config.globalFloatType())
    trans_dataset = T.transpose(dataset)
    dot_mul = T.dot(dataset, trans_dataset)
    l2 = T.sqrt(T.sum(T.square(dataset), axis=1))
    # p = printing.Print("l2")
    # l2 = p(l2)
    l2_inv2 = T.inv(l2).dimshuffle(['x', 0])
    # p = printing.Print("l2_inv2")
    # l2_inv2 = p(l2_inv2)
    l2_inv1 = T.transpose(l2_inv2)
    # p = printing.Print("l2_inv1")
    # l2_inv1 = p(l2_inv1)
    l2_inv = T.dot(l2_inv1, l2_inv2)
    # p = printing.Print("l2_inv")
    # l2_inv = p(l2_inv)
    affinty = (T.mul(dot_mul, l2_inv) + 1) / 2
    globals()['__affinty_fun'] = theano.function(
        [dataset],
        [affinty],
        allow_input_downcast=True
    )
def set_generator_update_function(generator_rnn_model,
                                  generator_mean_model,
                                  generator_std_model,
                                  generator_optimizer,
                                  grad_clipping):
    # input data (time length * num_samples * input_dims)
    source_data = tensor.tensor3(name='source_data', dtype=floatX)
    target_data = tensor.tensor3(name='target_data', dtype=floatX)

    # set generator input data list
    generator_input_data_list = [source_data, ]

    # get generator hidden data
    hidden_data = generator_rnn_model[0].forward(generator_input_data_list, is_training=True)[0]

    # get generator output data
    output_mean_data = get_tensor_output(input=hidden_data,
                                         layers=generator_mean_model,
                                         is_training=True)
    output_std_data = get_tensor_output(input=hidden_data,
                                        layers=generator_std_model,
                                        is_training=True)

    generator_cost = -0.5 * tensor.inv(2.0 * tensor.sqr(output_std_data)) * tensor.sqr(output_mean_data - target_data)
    generator_cost += -0.5 * tensor.log(2.0 * tensor.sqr(output_std_data) * numpy.pi)

    # set generator update
    generator_updates_cost = generator_cost.mean()
    generator_updates_dict = get_model_updates(layers=generator_rnn_model + generator_mean_model + generator_std_model,
                                               cost=generator_updates_cost,
                                               optimizer=generator_optimizer,
                                               use_grad_clip=grad_clipping)

    gradient_dict = get_model_gradients(generator_rnn_model + generator_mean_model + generator_std_model,
                                        generator_updates_cost)
    gradient_norm = 0.
    for grad in gradient_dict:
        gradient_norm += tensor.sum(grad**2)
    gradient_norm = tensor.sqrt(gradient_norm)

    # set generator update inputs
    generator_updates_inputs = [source_data, target_data, ]

    # set generator update outputs
    generator_updates_outputs = [generator_cost, gradient_norm]

    # set generator update function
    generator_updates_function = theano.function(inputs=generator_updates_inputs,
                                                 outputs=generator_updates_outputs,
                                                 updates=generator_updates_dict,
                                                 on_unused_input='ignore')
    return generator_updates_function
def energy_function(feature_data, is_train=True):
    # feature-wise std
    feature_std_inv = T.inv(T.nnet.softplus(feature_std) + 1e-10)

    # energy hidden-feature
    e = softplus(T.dot(feature_data * feature_std_inv, linear_w0) + linear_b0)
    e = T.sum(-e, axis=1)

    # energy feature prior
    e += 0.5 * T.sum(T.sqr(feature_std_inv) * T.sqr(feature_data - feature_mean), axis=1)

    return e
def set_generator_update_function(generator_rnn_model,
                                  generator_mean_model,
                                  generator_std_model,
                                  generator_optimizer,
                                  grad_clipping):
    # input data (time length * num_samples * input_dims)
    source_data = tensor.tensor3(name="source_data", dtype=floatX)
    target_data = tensor.tensor3(name="target_data", dtype=floatX)

    # set generator input data list
    generator_input_data_list = [source_data]

    # get generator hidden data
    hidden_data = generator_rnn_model[0].forward(generator_input_data_list, is_training=True)[0]
    hidden_data = hidden_data.dimshuffle(0, 2, 1, 3).flatten(3)

    # get generator output data
    output_mean_data = get_tensor_output(input=hidden_data,
                                         layers=generator_mean_model,
                                         is_training=True)
    # output_std_data = get_tensor_output(input=hidden_data,
    #                                     layers=generator_std_model,
    #                                     is_training=True)
    output_std_data = 0.22

    # get generator cost (time_length x num_samples x hidden_size)
    generator_cost = 0.5 * tensor.inv(2.0 * tensor.sqr(output_std_data)) * tensor.sqr(output_mean_data - target_data)
    generator_cost += tensor.log(output_std_data) + 0.5 * tensor.log(2.0 * numpy.pi)
    generator_cost = tensor.sum(generator_cost, axis=2)

    # set generator update
    generator_updates_cost = generator_cost.mean()
    generator_updates_dict = get_model_updates(
        layers=generator_rnn_model + generator_mean_model,
        cost=generator_updates_cost,
        optimizer=generator_optimizer,
        use_grad_clip=grad_clipping,
    )

    gradient_dict = get_model_gradients(generator_rnn_model + generator_mean_model,
                                        generator_updates_cost)
    gradient_norm = 0.0
    for grad in gradient_dict:
        gradient_norm += tensor.sum(grad ** 2)
    gradient_norm = tensor.sqrt(gradient_norm)

    # set generator update inputs
    generator_updates_inputs = [source_data, target_data]

    # set generator update outputs
    generator_updates_outputs = [generator_cost, gradient_norm]

    # set generator update function
    generator_updates_function = theano.function(
        inputs=generator_updates_inputs,
        outputs=generator_updates_outputs,
        updates=generator_updates_dict,
        on_unused_input="ignore",
    )
    return generator_updates_function
def logsum_loss(self, n_samples, l1_regularization_strength, l2_regularization_strength):
    log_sum_loss = -tensor.sum(tensor.log(
        tensor.sum(self.mix * tensor.inv(np.sqrt(2 * np.pi) * self.sigma) *
                   tensor.exp(tensor.neg(tensor.sqr(self.mu - self.target_vector)) *
                              tensor.inv(2 * tensor.sqr(self.sigma))), axis=0)
    ))
    # use the symbolic absolute value so the L1 term stays in the Theano graph
    l1_reg_loss = tensor.sum(tensor.abs_(self.layers.values()[0].W))
    for layer in self.layers.values()[1:]:
        l1_reg_loss += tensor.sum(tensor.abs_(layer.W))
    l2_reg_loss = tensor.sum(tensor.sqr(self.layers.values()[0].W))
    for layer in self.layers.values()[1:]:
        l2_reg_loss += tensor.sum(tensor.sqr(layer.W))
    # use float division so the scaling does not truncate to zero under Python 2
    l1_regularization = 1.0 / n_samples * l1_regularization_strength / 2 * l1_reg_loss
    l2_regularization = 1.0 / n_samples * l2_regularization_strength / 2 * l2_reg_loss
    return log_sum_loss + l1_regularization + l2_regularization
def __spectral_matrix(self, covariance):
    egvalues, egmatrix = T.nlinalg.eig(covariance)
    egmatrix_inv = T.nlinalg.matrix_inverse(egmatrix)
    diag_sqr_inv = T.nlinalg.alloc_diag(
        T.inv(
            T.sqrt(
                T.switch(T.eq(egvalues, 0), 0.001, egvalues)
            )
        )
    )
    return egmatrix.dot(diag_sqr_inv).dot(egmatrix_inv)
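# A small numeric sanity check (an illustrative assumption, not from the original
# class): mirroring the construction above in numpy, the returned matrix is an
# inverse square root of the covariance, so M C M should be close to the identity
# for a well-conditioned symmetric positive-definite C.
import numpy as np

rng = np.random.RandomState(0)
B = rng.randn(4, 4)
C = B.dot(B.T) + 4.0 * np.eye(4)          # well-conditioned SPD "covariance"
w, E = np.linalg.eigh(C)                  # eigendecomposition (cf. T.nlinalg.eig)
M = E.dot(np.diag(1.0 / np.sqrt(w))).dot(np.linalg.inv(E))
assert np.allclose(M.dot(C).dot(M), np.eye(4))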
def standardize(layer, offset, scale, shared_axes='auto'):
    """
    Convenience function for standardizing inputs by applying a fixed offset
    and scale. This is usually useful when you want the input to your network
    to, say, have zero mean and unit standard deviation over the feature
    dimensions. This layer allows you to include the appropriate statistics to
    achieve this normalization as part of your network, and applies them to
    its input. The statistics are supplied as the `offset` and `scale`
    parameters, which are applied to the input by subtracting `offset` and
    dividing by `scale`, sharing dimensions as specified by the `shared_axes`
    argument.

    Parameters
    ----------
    layer : a :class:`Layer` instance or a tuple
        The layer feeding into this layer, or the expected input shape.
    offset : Theano shared variable, expression, or numpy array
        The offset to apply (via subtraction) to the axis/axes being
        standardized.
    scale : Theano shared variable, expression or numpy array
        The scale to apply (via division) to the axis/axes being standardized.
    shared_axes : 'auto', int or tuple of int
        The axis or axes to share the offset and scale over. If ``'auto'``
        (the default), share over all axes except for the second: this will
        share scales over the minibatch dimension for dense layers, and
        additionally over all spatial dimensions for convolutional layers.

    Examples
    --------
    Assuming your training data exists in a 2D numpy ndarray called
    ``training_data``, you can use this function to scale input features to
    the [0, 1] range based on the training set statistics like so:

    >>> import lasagne
    >>> import numpy as np
    >>> training_data = np.random.standard_normal((100, 20))
    >>> input_shape = (None, training_data.shape[1])
    >>> l_in = lasagne.layers.InputLayer(input_shape)
    >>> offset = training_data.min(axis=0)
    >>> scale = training_data.max(axis=0) - training_data.min(axis=0)
    >>> l_std = standardize(l_in, offset, scale, shared_axes=0)

    Alternatively, to z-score your inputs based on training set statistics,
    you could set ``offset = training_data.mean(axis=0)`` and
    ``scale = training_data.std(axis=0)`` instead.
    """
    # Subtract the offset
    layer = BiasLayer(layer, -offset, shared_axes)
    # Do not optimize the offset parameter
    layer.params[layer.b].remove('trainable')
    # Divide by the scale
    layer = ScaleLayer(layer, T.inv(scale), shared_axes)
    # Do not optimize the scales parameter
    layer.params[layer.scales].remove('trainable')
    return layer
def test_dnn_batchnorm_train():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    for mode in ('per-activation', 'spatial'):
        for vartype in (T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
            x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out, x_mean, x_invstd = dnn.dnn_batch_normalization_train(
                x, scale, bias, mode, eps)
            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            x_mean2 = x.mean(axis=axes, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
            scale2 = T.addbroadcast(scale, *axes)
            bias2 = T.addbroadcast(bias, *axes)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, dy],
                                [out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
                                grads + grads2, mode=mode_with_gpu)
            # run
            for data_shape in ((10, 20, 30, 40), (4, 3, 1, 1), (1, 1, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')
                outputs = f(X, Scale, Bias, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 3])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 3])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 3])  # invstd
                # compare gradients
                utt.assert_allclose(outputs[6], outputs[6 + 3])  # dx
                utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=3e-3)  # dscale
                utt.assert_allclose(outputs[8], outputs[8 + 3])  # dbias
def get_symbolic_thermal_hmm_params(log_prior_c: types.TheanoVector,
                                    log_trans_tcc: types.TheanoTensor3,
                                    log_emission_tc: types.TheanoMatrix,
                                    temperature: tt.scalar):
    inv_temperature = tt.inv(temperature)

    thermal_log_prior_c = inv_temperature * log_prior_c
    thermal_log_prior_c -= pm.math.logsumexp(thermal_log_prior_c)
    thermal_log_trans_tcc = inv_temperature * log_trans_tcc
    thermal_log_trans_tcc -= pm.math.logsumexp(thermal_log_trans_tcc, axis=-1)
    thermal_log_emission_tc = inv_temperature * log_emission_tc

    return thermal_log_prior_c, thermal_log_trans_tcc, thermal_log_emission_tc
def predict(self, X1, y1, X2):
    cov_train = self.compute_cov_s(X1, self.N)
    cov_test = self.compute_cov_s(X2, self.M)
    cov_te_tr = self.compute_cov(X1, X2, self.N, self.M)
    cov_tr_te = cov_te_tr.T

    # Invert the noise-regularized training covariance. Note that T.inv is the
    # elementwise reciprocal, so the GP predictive equations need the matrix
    # inverse here.
    arg0 = T.nlinalg.matrix_inverse(cov_train + self.noise**2 * T.identity_like(cov_train))
    # arg0 = T.nlinalg.matrix_inverse(cov_train)
    arg1 = T.dot(cov_te_tr, arg0)
    mu = T.dot(arg1, y1)
    sigma = cov_test - T.dot(arg1, cov_tr_te)
    return mu, T.diag(sigma)
def logp(X):
    '''log-probability of a batch of Gaussians'''
    # print(X.shape.eval(), mu.shape.eval())
    err = T.reshape(X, (-1, 2)) - T.reshape(mu, (-1, 2))  # shaped as (n*m, 2)
    # Precision matrix; T.inv is the elementwise reciprocal, so use the proper
    # matrix inverse here (cf. np.linalg.inv(cov)).
    S = T.nlinalg.matrix_inverse(cov)
    E = (T.reshape(err, (-1, 2, 1)) *
         S *
         T.reshape(err, (-1, 1, 2))).sum()
    return - E / 2
def _whiten_input(self, n):
    X = T.matrix('X', dtype=theano.config.floatX)
    cov = T.dot(X.T, X) / (n - 1)
    eigenvalues, eigenvectors = T.nlinalg.eig(cov)
    V = eigenvectors
    D = eigenvalues
    D_prime = T.nlinalg.alloc_diag(T.inv(T.sqrt(D + self.e_zca)))
    M = T.dot(V, T.dot(D_prime, V.T))
    # now the input has been rotated: each column is a sample
    return theano.function(inputs=[X], outputs=T.dot(M, X.T))
def addFullBNLayerTrain(x, gamma, beta, mean=None, var=None):
    fsize = gamma.get_value().shape[0]
    ep = 1e-5
    momentum = 0.9
    if mean is None:
        mean = theano.shared(np.zeros((fsize,)))
        var = theano.shared(np.ones((fsize,)))
    input_mean = T.mean(x, axis=0)
    input_var = T.var(x, axis=0)
    inv_std = T.inv(T.sqrt(input_var + ep))
    updates = []
    updates.append((mean, momentum * mean + (1 - momentum) * input_mean))
    updates.append((var, momentum * var + (1 - momentum) * (x.shape[0] / (x.shape[0] - 1) * input_var)))
    o = (x - input_mean) * gamma * inv_std + beta
    return o, mean, var, updates
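# A minimal inference-time counterpart (an assumption, not part of the original
# code): normalize with the accumulated running statistics returned by the
# training-time helper above instead of the current mini-batch statistics.
def addFullBNLayerTest(x, gamma, beta, mean, var):
    ep = 1e-5
    inv_std = T.inv(T.sqrt(var + ep))
    return (x - mean) * gamma * inv_std + beta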
def set_generator_evaluation_function(generator_rnn_model,
                                      generator_mean_model,
                                      generator_std_model):
    # input data (time length * num_samples * input_dims)
    source_data = tensor.tensor3(name='source_data', dtype=floatX)
    target_data = tensor.tensor3(name='target_data', dtype=floatX)

    # set generator input data list
    generator_input_data_list = [source_data, ]

    # get generator hidden data
    hidden_data = generator_rnn_model[0].forward(generator_input_data_list, is_training=True)[0]
    hidden_data = hidden_data.dimshuffle(0, 2, 1, 3)
    hidden_data = hidden_data[:, :, -1, :].flatten(3)

    # get generator output data
    output_mean_data = get_tensor_output(input=hidden_data,
                                         layers=generator_mean_model,
                                         is_training=True)
    output_std_data = get_tensor_output(input=hidden_data,
                                        layers=generator_std_model,
                                        is_training=True)
    # output_std_data = 0.22

    # get generator cost (time_length x num_samples x hidden_size)
    generator_cost = 0.5 * tensor.inv(2.0 * tensor.sqr(output_std_data)) * tensor.sqr(output_mean_data - target_data)
    generator_cost += tensor.log(output_std_data) + 0.5 * tensor.log(2.0 * numpy.pi)
    generator_cost = tensor.sum(generator_cost, axis=2)

    # set generator evaluate inputs
    generator_evaluate_inputs = [source_data, target_data, ]

    # set generator evaluate outputs
    generator_evaluate_outputs = [generator_cost, ]

    # set generator evaluate function
    generator_evaluate_function = theano.function(inputs=generator_evaluate_inputs,
                                                  outputs=generator_evaluate_outputs,
                                                  on_unused_input='ignore')
    return generator_evaluate_function
def fprop(X, test):
    btest = tensor.lt(0, test)
    X_means = X.mean([0, 2, 3])
    X_inv_stds = tensor.inv(tensor.sqrt(X.var([0, 2, 3])) + epsilon)
    means_clone = theano.clone(means, share_inputs=False)
    inv_stds_clone = theano.clone(inv_stds, share_inputs=False)
    means_clone.default_update = ifelse(btest, means, lerp(means, X_means, alpha))
    inv_stds_clone.default_update = ifelse(btest, inv_stds, lerp(inv_stds, X_inv_stds, alpha))
    X_means += 0 * means_clone
    X_inv_stds += 0 * inv_stds_clone
    X_means = ifelse(btest, means, X_means)
    X_inv_stds = ifelse(btest, inv_stds, X_inv_stds)
    return (X - ds(X_means)) * ds(X_inv_stds) * ds(gammas)
def _build(self, input_tensor):
    self._instantiate_parameters(
        input_tensor.shape, input_tensor.dtype)

    input_tensor_ = input_tensor.unwrap()

    mean_acc = self.get_parameter_variable('mean').unwrap()
    var_acc = self.get_parameter_variable('var').unwrap()
    scale = self.get_parameter_variable('scale').unwrap()
    offset = self.get_parameter_variable('offset').unwrap()

    if self.args['learn']:
        decay = self.args['decay']
        mean_in = input_tensor_.mean(axis=self._axes)
        var_in = input_tensor_.var(self._axes)

        new_mean_acc = decay * mean_acc + (1 - decay) * mean_in
        new_var_acc = decay * var_acc + (1 - decay) * var_in

        self._update_operations.append(
            wrapper.Operation(
                op={mean_acc: new_mean_acc},
                name='update_mean',
            )
        )
        self._update_operations.append(
            wrapper.Operation(
                op={var_acc: new_var_acc},
                name='update_var',
            )
        )

        mean_acc = new_mean_acc
        var_acc = new_var_acc

    mean_acc = mean_acc.dimshuffle(self._pattern)
    var_acc = var_acc.dimshuffle(self._pattern)
    scale = scale.dimshuffle(self._pattern)
    offset = offset.dimshuffle(self._pattern)

    stdi = T.inv(T.sqrt(var_acc + self.args['epsilon']))
    output = scale * (input_tensor_ - mean_acc) * stdi + offset
    return wrapper.Tensor(output, shape=input_tensor.shape, name='output')
def __call__(self, x):
    # list() so .remove() also works under Python 3, where range() is lazy
    axes = list(range(x.ndim))
    axes.remove(self.axis)
    axes = tuple(axes)

    input_mean = x.mean(axes)
    input_inv_std = T.inv(T.sqrt(x.var(axes) + self.epsilon))

    if self.train:
        mean = input_mean
        inv_std = input_inv_std
    else:
        if self.collect:
            mean = self.mean
            inv_std = self.inv_std
        else:
            mean = input_mean
            inv_std = input_inv_std

    self.updates = {}
    if self.train:
        if self.collect:
            self.updates[self.mean] = (
                1 - self.alpha) * self.mean + self.alpha * input_mean
            self.updates[self.inv_std] = (
                1 - self.alpha) * self.inv_std + self.alpha * input_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(x.ndim - len(axes)))
    pattern = [
        'x' if input_axis in axes else next(param_axes)
        for input_axis in range(x.ndim)
    ]

    # apply dimshuffle pattern to all parameters
    beta = self.beta.dimshuffle(pattern)
    gamma = self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (x - mean) * (gamma * inv_std) + beta
    return normalized
def nll(mu, sigma, mixing, y):
    """Computes the mean of negative log likelihood for P(y|x)

    y = T.matrix('y')            # (minibatch_size, output_size)
    mu = T.tensor3('mu')         # (minibatch_size, output_size, n_components)
    sigma = T.matrix('sigma')    # (minibatch_size, n_components)
    mixing = T.matrix('mixing')  # (minibatch_size, n_components)
    """
    # multivariate Gaussian
    exponent = -0.5 * T.inv(sigma) * T.sum((y.dimshuffle(0, 1, 'x') - mu)**2, axis=1)
    normalizer = (2 * np.pi * sigma)
    exponent = exponent + T.log(mixing) - (y.shape[1] * .5) * T.log(normalizer)
    max_exponent = T.max(exponent, axis=1, keepdims=True)
    mod_exponent = exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=1)
    log_gauss = max_exponent + T.log(gauss_mix)
    res = -T.mean(log_gauss)
    return res
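# A hypothetical usage sketch (the symbolic variables follow the docstring above;
# the compile step itself is an assumption, not part of the original snippet):
y = T.matrix('y')            # (minibatch_size, output_size)
mu = T.tensor3('mu')         # (minibatch_size, output_size, n_components)
sigma = T.matrix('sigma')    # (minibatch_size, n_components)
mixing = T.matrix('mixing')  # (minibatch_size, n_components)
cost = nll(mu, sigma, mixing, y)
cost_fn = theano.function([mu, sigma, mixing, y], cost)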
def NLL(sigma, mixing, y):
    """Computes the mean of negative log likelihood for P(y|x)

    y = T.matrix('y')            # (minibatch_size, output_size)
    mu = T.tensor3('mu')         # (minibatch_size, output_size, n_components)
    sigma = T.matrix('sigma')    # (minibatch_size, n_components)
    mixing = T.matrix('mixing')  # (minibatch_size, n_components)
    """
    # multivariate Gaussian
    exponent = -0.5 * T.inv(sigma) * T.sum(y ** 2, axis=1)
    normalizer = 2 * np.pi * sigma
    exponent = exponent + T.log(mixing) - (y.shape[1] * 0.5) * T.log(normalizer)
    max_exponent = T.max(exponent, axis=1)
    mod_exponent = exponent - max_exponent[:, None]
    gauss_mix = T.sum(T.exp(mod_exponent), axis=1)
    log_gauss = max_exponent + T.log(gauss_mix)
    res = -T.mean(log_gauss)
    return res
def set_generator_evaluation_function(generator_rnn_model,
                                      generator_mean_model,
                                      generator_std_model):
    # input data (time length * num_samples * input_dims)
    source_data = tensor.tensor3(name='source_data', dtype=floatX)
    target_data = tensor.tensor3(name='target_data', dtype=floatX)

    # set generator input data list
    generator_input_data_list = [source_data, ]

    # get generator hidden data
    hidden_data = generator_rnn_model[0].forward(generator_input_data_list, is_training=True)[0]

    # get generator output data
    output_mean_data = get_tensor_output(input=hidden_data,
                                         layers=generator_mean_model,
                                         is_training=True)
    output_std_data = get_tensor_output(input=hidden_data,
                                        layers=generator_std_model,
                                        is_training=True)

    generator_cost = -0.5 * tensor.inv(2.0 * tensor.sqr(output_std_data)) * tensor.sqr(output_mean_data - target_data)
    generator_cost += -0.5 * tensor.log(2.0 * tensor.sqr(output_std_data) * numpy.pi)

    # set generator evaluate inputs
    generator_evaluate_inputs = [source_data, target_data, ]

    # set generator evaluate outputs
    generator_evaluate_outputs = [generator_cost, ]

    # set generator evaluate function
    generator_evaluate_function = theano.function(inputs=generator_evaluate_inputs,
                                                  outputs=generator_evaluate_outputs,
                                                  on_unused_input='ignore')
    return generator_evaluate_function
def NLL(mu, sigma, mixing, y):
    """Computes the mean of negative log likelihood for P(y|x)

    y = T.matrix('y')            # (minibatch_size, output_size)
    mu = T.tensor3('mu')         # (minibatch_size, output_size, n_components)
    sigma = T.matrix('sigma')    # (minibatch_size, n_components)
    mixing = T.matrix('mixing')  # (minibatch_size, n_components)
    """
    # multivariate Gaussian
    exponent = -0.5 * T.inv(sigma) * T.sum(
        (y.dimshuffle(0, 1, 'x') - mu)**2, axis=1)
    normalizer = (2 * np.pi * sigma)
    exponent = exponent + T.log(mixing) - (y.shape[1] * .5) * T.log(normalizer)
    max_exponent = T.max(exponent, axis=1, keepdims=True)
    mod_exponent = exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=1)
    log_gauss = max_exponent + T.log(gauss_mix)
    res = -T.mean(log_gauss)
    return res
def output(self, input_value):
    epsilon = asfloat(self.epsilon)
    alpha = asfloat(self.alpha)
    gamma, beta = self.gamma, self.beta

    ndim = input_value.ndim
    axes = self.axes

    running_mean = self.running_mean
    running_inv_std = self.running_inv_std

    input_mean = input_value.mean(axes)
    input_var = input_value.var(axes)
    input_inv_std = T.inv(T.sqrt(input_var + epsilon))

    self.updates = [(
        running_inv_std,
        asfloat(1 - alpha) * running_inv_std + alpha * input_inv_std
    ), (
        running_mean,
        asfloat(1 - alpha) * running_mean + alpha * input_mean
    )]

    if not self.training_state:
        mean = running_mean
        inv_std = running_inv_std
    else:
        mean = input_mean
        inv_std = input_inv_std

    opposite_axes = find_opposite_axes(axes, ndim)

    beta = dimshuffle(beta, ndim, opposite_axes)
    gamma = dimshuffle(gamma, ndim, opposite_axes)
    mean = dimshuffle(mean, ndim, opposite_axes)
    inv_std = dimshuffle(inv_std, ndim, opposite_axes)

    normalized_value = (input_value - mean) * inv_std
    return gamma * normalized_value + beta
def get_output_for(self, input, deterministic=False, **kwargs):
    input_mean = input.mean(self.axes)
    input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))

    mean = input_mean
    inv_std = input_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(input.ndim - len(self.axes)))
    pattern = [
        'x' if input_axis in self.axes else next(param_axes)
        for input_axis in range(input.ndim)
    ]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized
def generate_functions(A, y, gamma):
    tA = T.matrix('A')
    ty = T.vector('y')
    tx = T.vector('x')
    ttheta = T.vector('theta')
    tx0 = T.vector('x0')
    tx1 = T.vector('x1')
    tbetas = T.vector('betas')

    error = lambda x: T.sum((T.dot(tA, x) - ty)**2)
    derror = lambda x: T.grad(error(x), x)
    penalty = lambda x: x.norm(1)
    loss = lambda x: error(x) + penalty(x)

    entering_index = T.argmax(abs(derror(tx)))
    txs, _ = theano.map(lambda b, x0, x1: (1 - b) * x0 + b * x1,
                        [tbetas], [tx0, tx1])

    return {
        "select_entering": theano.function([tx],
                                           [entering_index, derror(tx)[entering_index]],
                                           givens={tA: A, ty: y}),
        # The QP optimum (A^T A)^{-1} (A^T y - gamma/2 * theta) needs a matrix
        # inverse; T.inv is only the elementwise reciprocal.
        "qp_optimum": theano.function([tA, ttheta],
                                      T.dot(T.nlinalg.matrix_inverse(T.dot(tA.T, tA)),
                                            T.dot(tA.T, ty) - gamma / 2 * ttheta),
                                      givens={ty: y}),
        "txs": theano.function([tbetas, tx0, tx1], txs),
        "select_candidate": theano.function([tA, tbetas, tx0, tx1],
                                            txs[T.argmin(theano.map(loss, [txs])[0])],
                                            givens={ty: y}),
        "optimal_nz": theano.function([tA, tx],
                                      derror(tx) + gamma * T.sgn(tx),
                                      givens={ty: y}),
        "optimal_z": theano.function([tA, tx], abs(derror(tx)), givens={ty: y}),
    }
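# A hypothetical usage sketch (the data shapes and gamma value are assumptions):
# compile the helpers for one fixed problem (A, y, gamma), then ask which
# coordinate of the current solution has the largest-magnitude gradient.
import numpy as np

A = np.random.randn(50, 10).astype(theano.config.floatX)
y = np.random.randn(50).astype(theano.config.floatX)
fns = generate_functions(A, y, gamma=0.1)

x = np.zeros(10, dtype=theano.config.floatX)
entering_index, entering_grad = fns["select_entering"](x)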
def normalize_batch_in_training(x, gamma, beta,
                                reduction_axes, epsilon=0.0001):
    '''Compute mean and std for batch then apply batch_normalization on batch.
    '''
    dev = theano.config.device
    use_cudnn = (ndim(x) < 5 and reduction_axes == [0, 2, 3] and
                 (dev.startswith('cuda') or dev.startswith('gpu')))
    if use_cudnn:
        broadcast_beta = beta.dimshuffle('x', 0, 'x', 'x')
        broadcast_gamma = gamma.dimshuffle('x', 0, 'x', 'x')
        try:
            normed, mean, stdinv = theano.sandbox.cuda.dnn.dnn_batch_normalization_train(
                x, broadcast_gamma, broadcast_beta, 'spatial', epsilon)
            var = T.inv(stdinv ** 2)
            return normed, T.flatten(mean), T.flatten(var)
        except AttributeError:
            pass

    var = x.var(reduction_axes)
    mean = x.mean(reduction_axes)

    target_shape = []
    for axis in range(ndim(x)):
        if axis in reduction_axes:
            target_shape.append(1)
        else:
            target_shape.append(x.shape[axis])
    target_shape = T.stack(*target_shape)

    broadcast_mean = T.reshape(mean, target_shape)
    broadcast_var = T.reshape(var, target_shape)
    broadcast_beta = T.reshape(beta, target_shape)
    broadcast_gamma = T.reshape(gamma, target_shape)
    normed = batch_normalization(x, broadcast_mean, broadcast_var,
                                 broadcast_beta, broadcast_gamma,
                                 epsilon)
    return normed, mean, var
def test_batch_normalization_train():
    utt.seed_rng()

    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, running_mean, running_var = (
                vartype(n) for n in ('x', 'scale', 'bias',
                                     'running_mean', 'running_var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            out, x_mean, x_invstd, out_running_mean, out_running_var = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = running_mean * (1 - running_average_factor) + \
                x_mean2 * running_average_factor
            out_running_var2 = running_var * (1 - running_average_factor) + \
                (m / (m - 1)) * x_var2 * running_average_factor
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, running_mean, running_var, dy],
                                [out, x_mean, x_invstd,
                                 out_running_mean, out_running_var,
                                 out2, x_mean2, x_invstd2,
                                 out_running_mean2, out_running_var2] +
                                grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (2, 3, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                utt.assert_allclose(numpy.nan_to_num(outputs[4]),
                                    numpy.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
def logp(self, value):
    u = self.u
    k = self.k
    logp = -tt.pow(value / u, k) - tt.log(u) - gammaln(1 + tt.inv(k))
    return bound(logp, value > 0, u > 0, k > 0)
def f1_score_theano(X, W, b=None):
    XW = T.dot(X, W.T)
    XX = T.abs_(X).sum(axis=1).reshape((-1, 1))
    WW = T.abs_(W).sum(axis=1).reshape((1, -1))
    return T.inv(XW / XX) + T.inv(XW / WW)
theano.config.floatX), name='mc', borrow=True, broadcastable=(True, False, False),)

mw = theano.shared(
    value=numpy.log(model_weights.copy().astype(theano.config.floatX)),
    name='mw', borrow=True,)
Wc = theano.shared(
    value=numpy.zeros((n_out, sigma_in, n_components,),
                      dtype=theano.config.floatX),
    name='Wc', borrow=True,)

invsigma_given_x = tensor.inv(tensor.maximum(tensor.nnet.softplus(theano.dot(x, Wc) + mc), 1e-8))
f = theano.function(
    inputs=[x, ],
    outputs=invsigma_given_x,
)

p_mix_given_x = tensor.nnet.softmax(mw)
p_mix_given_x = tensor.log(p_mix_given_x / (tensor.sum(p_mix_given_x, axis=1)[:, None] + 10 * EPS) + EPS)

log_exponent = tensor.sum((y**2)[:, :, None] * invsigma_given_x, axis=1)
f = theano.function(
    inputs=[x, y],
    outputs=log_exponent,
)

dim_constant = -0.5 * WINSIZE * tensor.log(2 * numpy.pi) + p_mix_given_x
lpr = dim_constant + 0.5 * (tensor.sum(tensor.log(invsigma_given_x), axis=1) - log_exponent)
def __init__(self, numpy_rng, n_ins=784, n_outs=24,
             l1_reg=None, l2_reg=None,
             hidden_layers_sizes=[500, 500],
             hidden_activation='tanh', output_activation='linear',
             var_floor=0.01, n_component=1, beta_opt=False,
             use_rprop=0, rprop_init_update=0.001,
             eff_sample_size=0.8, mean_log_det=-100.0):

    logger = logging.getLogger("Multi-stream DNN initialization")

    self.sigmoid_layers = []
    self.params = []
    self.delta_params = []
    self.final_layers = []
    self.n_outs = n_outs
    self.n_layers = len(hidden_layers_sizes)
    self.output_activation = output_activation
    self.var_floor = var_floor
    self.use_rprop = use_rprop
    self.rprop_init_update = rprop_init_update
    self.l1_reg = l1_reg
    self.l2_reg = l2_reg
    self.beta_opt = beta_opt
    self.eff_sample_size = eff_sample_size
    self.mean_log_det = mean_log_det

    assert self.n_layers > 0

    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    self.y = T.matrix('y')

    for i in range(self.n_layers):
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]

        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.tanh)  ##T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)
        self.delta_params.extend(sigmoid_layer.delta_params)

    hidden_output_size = hidden_layers_sizes[-1]

    self.final_layer = MixtureDensityOutputLayer(rng=numpy_rng,
                                                 input=sigmoid_layer.output,
                                                 n_in=hidden_output_size,
                                                 n_out=self.n_outs,
                                                 n_component=n_component,
                                                 var_floor=self.var_floor)
    self.params.extend(self.final_layer.params)
    self.delta_params.extend(self.final_layer.delta_params)

    ### Maximum likelihood
    self.finetune_cost = 0.0
    self.errors = 0.0

    epsd = self.eff_sample_size**(-2.0 / (n_outs + 2.0))
    beta = (epsd - 1.0) + math.sqrt(epsd * (epsd - 1.0))

    if self.beta_opt:
        assert n_component == 1, "beta optimisation only implemented for single-component MDNs"
        for i in range(n_component):  # n_component
            sigma = self.final_layer.sigma[:, i * n_outs:(i + 1) * n_outs]
            mu = self.final_layer.mu[:, i * n_outs:(i + 1) * n_outs]
            mix_weight = self.final_layer.mix[:, i]

            xEx = -0.5 * beta * T.sum(((self.y - mu)**2) * T.inv(sigma), axis=1)
            exponent = (0.5 * (n_outs + 2.0) * T.log(1 + beta)) + xEx
            point_fit = T.exp(exponent) - beta

            log_det_mult = -0.5 * beta * T.sum(T.log(sigma), axis=1)
            log_det_mult += (0.5 * beta * self.mean_log_det)  # normalise by mean_log_det

            beta_obj = (mix_weight**2) * point_fit * T.exp(log_det_mult)
            self.finetune_cost += -T.mean(beta_obj)

            # lines to compute debugging information for later printing
            #self.errors = T.min(T.min(T.log(sigma), axis=1))
            #self.errors = T.mean(T.sum(T.log(sigma), axis=1))  # computes mean_log_det
            #self.errors = -xEx  # (vector quantity) should be about 0.5 * beta * n_outs
            #self.errors = point_fit  # (vector quantity) should be about one
            #self.errors = T.mean(T.exp(exponent)) / T.exp(T.max(exponent))  # fraction of the data used, should be about efficiency
            #self.errors = T.mean(point_fit)  # should be about one
            #self.errors = log_det_mult  # (vector quantity) about zero, or always less if using Rprop
            #self.errors = beta_obj  # (vector quantity) objective function terms
            #self.errors = self.finetune_cost  # disable this line below when debugging
    else:
        all_mix_prob = []
        print(n_component)
        for i in range(n_component):  # n_component
            sigma = self.final_layer.sigma[:, i * n_outs:(i + 1) * n_outs]
            mu = self.final_layer.mu[:, i * n_outs:(i + 1) * n_outs]
            mix_weight = self.final_layer.mix[:, i]

            xEx = -0.5 * T.sum(((self.y - mu)**2) * T.inv(sigma), axis=1)
            normaliser = 0.5 * (n_outs * T.log(2 * numpy.pi) + T.sum(T.log(sigma), axis=1))
            exponent = xEx + T.log(mix_weight) - normaliser
            all_mix_prob.append(exponent)

        max_exponent = T.max(all_mix_prob, axis=0, keepdims=True)
        mod_exponent = T.as_tensor_variable(all_mix_prob) - max_exponent
        self.finetune_cost = -T.mean(max_exponent + T.log(T.sum(T.exp(mod_exponent), axis=0)))
        #self.errors = self.finetune_cost

    if self.l2_reg is not None:
        for i in range(self.n_layers - 1):
            W = self.params[i * 2]
            self.finetune_cost += self.l2_reg * T.sqr(W).sum()
        self.finetune_cost += self.l2_reg * T.sqr(self.final_layer.W_mu).sum()
        self.finetune_cost += self.l2_reg * T.sqr(self.final_layer.W_sigma).sum()
        self.finetune_cost += self.l2_reg * T.sqr(self.final_layer.W_mix).sum()

    self.errors = self.finetune_cost  # disable this line if debugging beta_opt