def DSSIM(p, y, eps=1e-7):
    # Taken/Modified from https://github.com/fchollet/keras/issues/4292
    # Nan issue : T.maximum(x, eps)
    y_patch = neigh.images2neibs(y, [4, 4], mode='ignore_borders')
    p_patch = neigh.images2neibs(p, [4, 4], mode='ignore_borders')
    y_mean = T.mean(y_patch, axis=-1)
    p_mean = T.mean(p_patch, axis=-1)
    y_var = T.var(y_patch, axis=-1, corrected=True)
    p_var = T.var(p_patch, axis=-1, corrected=True)
    y_std = T.sqrt(T.maximum(y_var, eps))
    p_std = T.sqrt(T.maximum(p_var, eps))
    c1 = 0.01 ** 2
    c2 = 0.02 ** 2
    num = (2 * y_mean * p_mean + c1) * (2 * y_std * p_std + c2)
    denom = (T.pow(y_mean, 2) + T.pow(p_mean, 2) + c1) * (y_var + p_var + c2)
    ssim = num / T.maximum(denom, eps)
    return T.mean(1.0 - ssim)
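# A minimal usage sketch for the DSSIM loss above (this example is an
# assumption, not part of the original snippet): it binds `neigh` to
# theano.tensor.nnet.neighbours and compiles the loss for batches of 4D
# image tensors. Identical inputs should give a loss near 0.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import neighbours as neigh

p = T.tensor4('p')  # predictions: (batch, channels, rows, cols)
y = T.tensor4('y')  # targets, same shape
dssim_fn = theano.function([p, y], DSSIM(p, y))

imgs = np.random.rand(2, 1, 8, 8).astype(theano.config.floatX)
print(dssim_fn(imgs, imgs))  # ~0.0 for identical images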
def computeA(self, symmetric_double_encoder, params):
    regularization = 0

    if self._layer == -1:
        for layer in symmetric_double_encoder:
            hidden_x = layer.output_forward_x
            hidden_y = layer.output_forward_y

            cov_x = Tensor.dot(hidden_x, hidden_x.T)
            cov_y = Tensor.dot(hidden_y, hidden_y.T)

            regularization += Tensor.mean(Tensor.sum(abs(cov_x), axis=1, dtype=Tensor.config.floatX)) + \
                Tensor.mean(Tensor.sum(abs(cov_y), axis=1, dtype=Tensor.config.floatX))

    elif self._layer < len(symmetric_double_encoder):
        hidden_x = symmetric_double_encoder[self._layer].output_forward_x
        hidden_y = symmetric_double_encoder[self._layer].output_forward_y

        var_x = Tensor.var(hidden_x, axis=1)
        var_y = Tensor.var(hidden_y, axis=1)

        norm_x = Tensor.mean(Tensor.sum(hidden_x ** 2, axis=1, dtype=Tensor.config.floatX))
        norm_y = Tensor.mean(Tensor.sum(hidden_y ** 2, axis=1, dtype=Tensor.config.floatX))

        regularization -= norm_x
        regularization -= norm_y

        # cov_x = Tensor.dot(hidden_x.T, hidden_x)
        # cov_y = Tensor.dot(hidden_y.T, hidden_y)
        # regularization -= ((Tensor.sum(abs(cov_x))) + (Tensor.sum(abs(cov_y))))

    return self.weight * regularization
def build(self, output, tparams=None, BNparams=None):
    if self.BN_mode:
        self.BN_eps = npt(self.BN_eps)
        if not hasattr(self, 'BN_mean'):
            self.BN_mean = T.mean(output)
        if not hasattr(self, 'BN_std'):
            m2 = (1 + 1 / (T.prod(output.shape) - 1)).astype(floatX)
            self.BN_std = T.sqrt(m2 * T.var(output) + self.BN_eps)

        if self.BN_mode == 2:
            t_mean = T.mean(output, axis=[0, 2, 3], keepdims=True)
            t_var = T.var(output, axis=[0, 2, 3], keepdims=True)
            BN_mean = BNparams[p_(self.prefix, 'mean')].dimshuffle('x', 0, 'x', 'x')
            BN_std = BNparams[p_(self.prefix, 'std')].dimshuffle('x', 0, 'x', 'x')
            output = ifelse(
                self.training,
                (output - t_mean) / T.sqrt(t_var + self.BN_eps),
                (output - BN_mean) / BN_std)
            output *= tparams[p_(self.prefix, 'BN_scale')].dimshuffle('x', 0, 'x', 'x')
            output += tparams[p_(self.prefix, 'BN_shift')].dimshuffle('x', 0, 'x', 'x')
        elif self.BN_mode == 1:
            t_mean = T.mean(output)
            t_var = T.var(output)
            output = ifelse(
                self.training,
                (output - t_mean) / T.sqrt(t_var + self.BN_eps),
                (output - BNparams[p_(self.prefix, 'mean')]) / BNparams[p_(self.prefix, 'std')])
            output *= tparams[p_(self.prefix, 'BN_scale')]
            output += tparams[p_(self.prefix, 'BN_shift')]

    self.output = self.activation(output)
def __init__(self, network):
    self.network = network
    self.parameters = network.parameters
    num_trials = self.parameters.num_trials
    n_layers = network.n_layers

    self.channels = {}
    for channel in self.training_values:
        self.channels[channel] = np.zeros((n_layers, num_trials))
    for channel in self.training_mean_std:
        self.channels[channel] = np.zeros((n_layers, num_trials, 2))

    outputs = []
    for layer in range(n_layers):
        if layer == 0:
            X = self.network.X
        else:
            X = self.network.Y[layer - 1]
        Y = self.network.Y[layer]
        Q = self.network.Q[layer]
        W = self.network.W[layer]
        theta = self.network.theta[layer]

        y_bar = Y.mean()
        Cyy_bar = (Y.T.dot(Y) / network.parameters.batch_size).mean()
        outputs.extend([y_bar, Cyy_bar])

        X_rec = Y.dot(Q.T)
        X_rec_norm = T.sqrt(T.sum(T.sqr(X_rec), axis=1, keepdims=True))
        X_norm = T.sqrt(T.sum(T.sqr(X), axis=1, keepdims=True))
        X_rec_bar = X_rec_norm.mean()
        X_rec_std = X_rec_norm.std()
        outputs.extend([X_rec_bar, X_rec_std])

        X_bar = X_norm.mean()
        X_std = X_norm.std()
        outputs.extend([X_bar, X_std])

        SNR_Norm = T.mean(T.var(X, axis=0)) / T.mean(
            T.var(X - X_rec * X_norm / X_rec_norm, axis=0))
        SNR = T.mean(T.var(X, axis=0)) / T.mean(T.var(X - X_rec_norm, axis=0))
        outputs.extend([SNR, SNR_Norm])

        Q_norm = T.sqrt(T.sum(T.sqr(Q), axis=0))
        Q_bar = Q_norm.mean()
        Q_std = Q_norm.std()
        outputs.extend([Q_bar, Q_std])

        W_bar = W.mean()
        W_std = W.std()
        outputs.extend([W_bar, W_std])

        theta_bar = theta.mean()
        theta_std = theta.std()
        outputs.extend([theta_bar, theta_std])

    self.f = theano.function([], outputs)
def decorate(self, layer):
    if self.onTrain:
        std = tt.sqrt(tt.var(layer.outputs) + self.espilon)
        layer.outputs = (layer.outputs - tt.mean(layer.outputs)) / std
    if self.onTest:
        std = tt.sqrt(tt.var(layer.testOutputs) + self.espilon)
        layer.testOutputs = (layer.testOutputs - tt.mean(layer.testOutputs)) / std
def instance(self, train_x, infer_x, dropout=None, epsilon=1e-8, **kwargs):
    """Returns (train_output, inference_output, statistics_updates,
    train_reconstruction, infer_reconstruction)"""
    # dropout
    dropout = dropout or 0.
    mask = self.srng.binomial(n=1, p=1 - dropout, size=train_x.shape)
    # cast because int * float32 = float64 which does not run on GPU
    train_x = train_x * T.cast(mask, theano.config.floatX)

    # outputs with batch-specific normalization
    train_lin_output = T.dot(train_x, self.t_W) + self.t_b
    train_lin_output.name = self.subname("trainLinOutput")
    batch_mean = T.mean(train_lin_output, axis=0)
    offset_output = train_lin_output - batch_mean
    batch_var = T.var(offset_output, axis=0)
    batch_sd = T.sqrt(batch_var + epsilon)
    normalized_lin_output = offset_output / batch_sd
    train_output = self.activation_fn(self.gamma * normalized_lin_output + self.beta)
    train_output.name = self.subname("trainOutput")

    # reconstruct batch-specific output
    W_T = self.t_W.T
    W_T.name = self.subname("W_T")
    recon_lin_output = T.dot(train_output, W_T) + self.t_decode_b
    recon_lin_output.name = self.subname("reconLinOutput")
    decode_batch_mean = T.mean(recon_lin_output, axis=0)
    recon_offset_output = recon_lin_output - decode_batch_mean
    decode_batch_var = T.var(recon_offset_output, axis=0)
    decode_batch_sd = T.sqrt(decode_batch_var + epsilon)
    normalized_recon_lin_output = recon_offset_output / decode_batch_sd
    reconstructed_output = self.activation_fn(
        self.decode_gamma * normalized_recon_lin_output + self.decode_beta)

    # outputs with rolling-average normalization
    infer_lin_output = T.dot(infer_x, self.t_W) + self.t_b
    infer_lin_output.name = self.subname("inferLinOutput")
    sd = T.sqrt(self.variance + epsilon)
    normalized_infer_lin_output = infer_lin_output - self.mean
    inference_output = self.activation_fn(
        self.gamma / sd * normalized_infer_lin_output + self.beta)
    inference_output.name = self.subname("inferenceOutput")

    # reconstruct rolling-average output
    recon_infer_lin_output = T.dot(inference_output, W_T) + self.t_decode_b
    recon_infer_lin_output.name = self.subname("reconInferLinOutput")
    decode_sd = T.sqrt(self.decode_variance + epsilon)
    normalized_recon_infer_lin_output = recon_infer_lin_output - self.decode_mean
    recon_infer_output = self.activation_fn(
        self.decode_gamma / decode_sd * normalized_recon_infer_lin_output + self.decode_beta)

    # save exponential moving average for batch mean/variance
    statistics_updates = [
        (self.mean, self.alpha * self.mean + (1.0 - self.alpha) * batch_mean),
        (self.variance, self.alpha * self.variance + (1.0 - self.alpha) * batch_var),
        (self.decode_mean, self.alpha * self.decode_mean + (1.0 - self.alpha) * decode_batch_mean),
        (self.decode_variance, self.alpha * self.decode_variance + (1.0 - self.alpha) * decode_batch_var),
    ]

    return train_output, inference_output, statistics_updates, reconstructed_output, recon_infer_output
def f_prop(self, x):
    if x.ndim == 2:
        mean = T.mean(x, axis=0, keepdims=True)
        std = T.sqrt(T.var(x, axis=0, keepdims=True) + self.epsilon)
    elif x.ndim == 4:
        mean = T.mean(x, axis=(0, 2, 3), keepdims=True)
        std = T.sqrt(T.var(x, axis=(0, 2, 3), keepdims=True) + self.epsilon)
    normalized_x = (x - mean) / std
    self.z = self.gamma * normalized_x + self.beta
    return self.z
def LayerNormalization(x, gamma, mask, estimated_mean=0.0, estimated_var=1.0):
    assert x.ndim == 3 or x.ndim == 2
    if x.ndim == 3:
        x_mean = T.mean(x, axis=2).dimshuffle(0, 1, 'x')
        x_var = T.var(x, axis=2).dimshuffle(0, 1, 'x')
        return gamma * ((x - x_mean) / T.sqrt(x_var + 1e-7)), x_mean[0, 0], x_var[0, 0]
    elif x.ndim == 2:
        x_mean = T.mean(x, axis=1).dimshuffle(0, 'x')
        x_var = T.var(x, axis=1).dimshuffle(0, 'x')
        return gamma * ((x - x_mean) / T.sqrt(x_var + 1e-7)), x_mean[0], x_var[0]
def get_result(self, input):
    # returns BN result for given input.
    epsilon = 1e-06

    if self.mode == 0:
        if self.run_mode == 0:
            now_mean = T.mean(input, axis=0)
            now_var = T.var(input, axis=0)
            now_normalize = (input - now_mean) / T.sqrt(now_var + epsilon)  # should be broadcastable..
            output = self.gamma * now_normalize + self.beta
            # print ('norm.shape =')
            # print (now_normalize.shape.eval({x: np.random.rand(2,2).astype(dtype=theano.config.floatX)}))
            # mean, var update
            self.mean = self.momentum * self.mean + (1.0 - self.momentum) * now_mean
            self.var = self.momentum * self.var + (1.0 - self.momentum) * \
                (self.input_shape[0] / (self.input_shape[0] - 1) * now_var)
        else:
            output = self.gamma * (input - self.mean) / T.sqrt(self.var + epsilon) + self.beta
    else:
        # in CNN mode, gamma and beta exists for every single channel separately.
        # for each channel, calculate mean and std for (mini_batch_size * row * column) elements.
        # then, each channel has own scalar gamma/beta parameters.
        if self.run_mode == 0:
            now_mean = T.mean(input, axis=(0, 2, 3))
            now_var = T.var(input, axis=(0, 2, 3))
            # mean, var update
            self.mean = self.momentum * self.mean + (1.0 - self.momentum) * now_mean
            self.var = self.momentum * self.var + (1.0 - self.momentum) * \
                (self.input_shape[0] / (self.input_shape[0] - 1) * now_var)
        else:
            now_mean = self.mean
            now_var = self.var
        # change shape to fit input shape
        now_mean = self.change_shape(now_mean)
        now_var = self.change_shape(now_var)
        now_gamma = self.change_shape(self.gamma)
        now_beta = self.change_shape(self.beta)
        output = now_gamma * (input - now_mean) / T.sqrt(now_var + epsilon) + now_beta

    return output
def build_trainer(input_data, input_mask, target_data, target_mask,
                  network_params, output_layer, cond_layer_list, feat_reg,
                  updater, learning_rate, load_updater_params=None):
    output_score = get_output(output_layer, deterministic=False)
    frame_prd_idx = T.argmax(output_score, axis=-1)

    one_hot_target = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                            nb_class=output_dim,
                                            dtype=floatX)

    output_score = T.reshape(x=output_score, newshape=(-1, output_dim), ndim=2)
    output_score = output_score - T.max(output_score, axis=-1, keepdims=True)
    output_score = output_score - T.log(T.sum(T.exp(output_score), axis=-1, keepdims=True))

    train_ce = -T.sum(T.mul(one_hot_target, output_score), axis=-1) * T.flatten(target_mask, 1)

    train_loss = T.sum(train_ce) / target_mask.shape[0]
    frame_loss = T.sum(train_ce) / T.sum(target_mask)
    frame_accr = T.sum(T.eq(frame_prd_idx, target_data) * target_mask) / T.sum(target_mask)

    train_feat_loss = 0
    for cond_layer in cond_layer_list:
        sample_feat = cond_layer.get_sample_feat()
        sample_feat_cost = T.var(sample_feat, axis=0)
        sample_feat_cost = -T.mean(sample_feat_cost)
        train_feat_loss += sample_feat_cost
    train_feat_loss /= len(cond_layer_list)

    train_total_loss = train_loss + train_feat_loss * feat_reg

    network_grads = theano.grad(cost=train_total_loss, wrt=network_params)
    network_grads_norm = T.sqrt(sum(T.sum(grad ** 2) for grad in network_grads))

    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, updater_params = updater(loss_or_grads=network_grads,
                                            params=network_params,
                                            learning_rate=train_lr,
                                            load_params_dict=load_updater_params)

    training_fn = theano.function(inputs=[input_data, input_mask, target_data, target_mask],
                                  outputs=[frame_loss, frame_accr, train_feat_loss, network_grads_norm],
                                  updates=train_updates)

    return training_fn, train_lr, updater_params
def get_stats(input, stat=None):
    """
    Returns a dictionary mapping the name of the statistic to the result on
    the input. Currently gets mean, var, std, min, max, l1, l2.

    Parameters
    ----------
    input : tensor
        Theano tensor to grab stats for.
    stat : str or list(str), optional
        Name(s) of the statistic(s) to compile; if None, all are returned.

    Returns
    -------
    dict
        Dictionary of all the statistics expressions
        {string_name: theano expression}
    """
    stats = {
        'mean': T.mean(input),
        'var': T.var(input),
        'std': T.std(input),
        'min': T.min(input),
        'max': T.max(input),
        'l1': input.norm(L=1),
        'l2': input.norm(L=2),
        # 'num_nonzero': T.sum(T.nonzero(input)),
    }
    stat_list = raise_to_list(stat)
    compiled_stats = {}
    if stat_list is None:
        return stats

    for stat in stat_list:
        if isinstance(stat, string_types) and stat in stats:
            compiled_stats.update({stat: stats[stat]})

    return compiled_stats
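# Hedged usage sketch for get_stats (an illustration; it assumes the
# snippet's own imports such as theano.tensor as T, raise_to_list, and
# string_types are in scope, and that raise_to_list wraps a single name
# into a list):
import theano
import theano.tensor as T

v = T.vector('v')
all_stats = get_stats(v)  # full dict of statistic expressions
f = theano.function([v], [all_stats['mean'], all_stats['std'], all_stats['l2']])
mean_only = get_stats(v, stat='mean')  # {'mean': <theano expression>}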
def ZCA(data, n_component=2):
    '''
    m is the number of data points
    n is the dimension of the data

    :param data: <numpy matrix, (m,n)> input data
    :param n_component: <int> number of dimensions to be extracted
    :return:
    '''
    # data standardization
    x = T.matrix('x')
    eps = T.scalar('eps')
    y = (x - T.mean(x, axis=0)) / T.sqrt(T.var(x) + eps)
    standardize = th.function([x, eps], y)

    # zca whitening
    x_n = T.matrix('x_n')    # normalized input
    eps2 = T.scalar('eps2')  # small eps to prevent div by zero
    x_cov = T.dot(x_n.T, x_n) / x_n.shape[0]  # covariance of input
    u, s, v = T.nlinalg.svd(x_cov)
    z = T.dot(T.dot(u, T.nlinalg.diag(1. / T.sqrt(s + eps2))), u.T)
    x_zca = T.dot(x_n, z.T[:, :n_component])
    zca_whiten = th.function([x_n, eps2], x_zca)

    return zca_whiten(standardize(data, 0.1), 0.01)
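# Hedged usage sketch for ZCA (an illustration, not from the original
# source; it assumes the function's module imports theano as th and
# theano.tensor as T): rows are samples, columns are features.
import numpy as np
import theano

data = np.random.randn(100, 10).astype(theano.config.floatX)
whitened = ZCA(data, n_component=2)  # -> array of shape (100, 2)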
def decorate(self, layer):
    if not hasattr(layer, "batchnorm_W") or not hasattr(layer, "batchnorm_b"):
        self.paramShape = layer.getOutputShape()  # (layer.nbOutputs, )
        self.WInitialization.initialize(self)
        self.bInitialization.initialize(self)
        layer.batchnorm_W = self.W
        layer.batchnorm_b = self.b

    mu = tt.mean(layer.outputs)
    sigma = tt.sqrt(tt.var(layer.outputs) + self.epsilon)
    layer.outputs = layer.batchnorm_W * ((layer.outputs - mu) / sigma) + layer.batchnorm_b

    mu = tt.mean(layer.testOutputs)
    sigma = tt.sqrt(tt.var(layer.testOutputs) + self.epsilon)
    layer.testOutputs = layer.batchnorm_W * ((layer.testOutputs - mu) / sigma) + layer.batchnorm_b
def get_output_for(self, input, moving_avg_hooks=None,
                   deterministic=False, *args, **kwargs):
    if deterministic is False:
        m = T.mean(input, axis=self.axis, keepdims=True)
        v = T.sqrt(T.var(input, axis=self.axis, keepdims=True) + self.epsilon)
        m.name = "tensor:mean"
        v.name = "tensor:variance"

        key = "BatchNormLayer:movingavg"
        if key not in moving_avg_hooks:
            moving_avg_hooks[key] = []
        moving_avg_hooks[key].append(
            [[m, v], [self.mean_inference, self.variance_inference]])
    else:
        m = self.mean_inference
        v = self.variance_inference

    input_norm = (input - m) / v             # normalize
    y = self.gamma * input_norm + self.beta  # scale and shift
    return self.nonlinearity(y)
def _compute_training_statistics(self, input_):
    if self.n_iter:
        axes = (0,) + tuple(
            (i + 1) for i, b in enumerate(self.population_mean[0].broadcastable) if b)
    else:
        axes = (0,) + tuple(
            (i + 1) for i, b in enumerate(self.population_mean.broadcastable) if b)
    mean = input_.mean(axis=axes, keepdims=True)
    if self.n_iter:
        assert mean.broadcastable[1:] == self.population_mean[0].broadcastable
    else:
        assert mean.broadcastable[1:] == self.population_mean.broadcastable
    stdev = tensor.sqrt(
        tensor.var(input_, axis=axes, keepdims=True) +
        numpy.cast[theano.config.floatX](self.epsilon))
    if self.n_iter:
        assert stdev.broadcastable[1:] == self.population_stdev[0].broadcastable
    else:
        assert stdev.broadcastable[1:] == self.population_stdev.broadcastable
    add_role(mean, BATCH_NORM_MINIBATCH_ESTIMATE)
    add_role(stdev, BATCH_NORM_MINIBATCH_ESTIMATE)
    return mean, stdev
def process(self, input, tparams, BNparams):
    mode = 'full' if self.border_mode == 'same' else self.border_mode
    output = conv.conv2d(
        input=input,
        filters=tparams[p_(self.prefix, 'W')],
        image_shape=[self.batch_size, self.n_in[0]] + self.image_shape,
        filter_shape=[self.n_out] + self.n_in,
        border_mode=mode,
        subsample=self.stride)

    if self.border_mode == 'same':
        a1 = (self.filter_size[0] - 1) // 2
        b1 = (self.filter_size[1] - 1) // 2
        a2 = self.filter_size[0] - a1
        b2 = self.filter_size[1] - b1
        if a2 == 1:
            if b2 == 1:
                output = output[:, :, a1:, b1:]
            else:
                output = output[:, :, a1:, b1:-b2 + 1]
        else:
            if b2 == 1:
                output = output[:, :, a1:-a2 + 1, b1:]
            else:
                output = output[:, :, a1:-a2 + 1, b1:-b2 + 1]

    if self.with_bias:
        output += tparams[p_(self.prefix, 'b')].dimshuffle('x', 0, 'x', 'x')

    self.BN_mean = T.mean(output, axis=[0, 2, 3])
    m2 = (1 + 1 / (T.prod(output.shape) / self.n_out - 1)).astype(floatX)
    self.BN_std = T.sqrt(m2 * T.var(output, axis=[0, 2, 3]) + npt(self.BN_eps))
    return output
def __init__(self, rng, layers, mc_samples=None):
    self.layers = layers
    self.params = [param for layer in self.layers for param in layer.params]
    self.cost = self.layers[-1].cost  # function pointer

    if mc_samples is None:
        # Standard dropout network.
        try:
            self.preds = self.layers[-1].preds
            self.error = self.layers[-1].error  # function pointer
        except AttributeError:
            print('Could not access network outputs'
                  ' - did you pass a (non-dropout) input?')
    else:
        # mc_dropout network.
        self.mc_samples = mc_samples
        mc_outputs, _ = theano.scan(
            lambda: self.layers[-1].output_dropout,
            outputs_info=None,
            n_steps=self.mc_samples)
        self.predictive_distribution_mean = T.mean(mc_outputs, axis=0)
        self.predictive_distribution_var = T.var(mc_outputs, axis=0)
        self.preds = T.argmax(self.predictive_distribution_mean, axis=1)
        self.error = self.__error_mc

    self.L1 = T.sum([abs(layer.W).sum() for layer in self.layers])
    self.L2_sqr = T.sum([(layer.W ** 2).sum() for layer in self.layers])
def batch_norm(X, gamma, beta, m_shared, v_shared, test, add_updates):
    reshape = False
    if X.ndim > 2:
        output_shape = X.shape
        X = X.flatten(2)
        reshape = True

    if test is False:
        m = T.mean(X, axis=0, keepdims=True)
        v = T.sqrt(T.var(X, axis=0, keepdims=True) + self.epsilon)
        mulfac = 1.0 / 1000
        if m_shared in add_updates:
            add_updates[m_shared] = (1.0 - mulfac) * add_updates[m_shared] + mulfac * m
            add_updates[v_shared] = (1.0 - mulfac) * add_updates[v_shared] + mulfac * v
        else:
            add_updates[m_shared] = (1.0 - mulfac) * m_shared + mulfac * m
            add_updates[v_shared] = (1.0 - mulfac) * v_shared + mulfac * v
    else:
        m = m_shared
        v = v_shared

    X_hat = (X - m) / v
    y = gamma * X_hat + beta

    if reshape:
        y = T.reshape(y, output_shape)
    return y
def layer_var(self):
    # square of L2 norm ; one regularization option is to enforce
    # square of L2 norm to be small
    var = []
    for layer in self.layers:
        var.append(T.var(layer.W))
    return var
def layer_normalization(x, bias=None, scale=None, eps=1e-5):
    """
    Layer Normalization, https://arxiv.org/abs/1607.06450
    x is mean and variance normalized along its feature dimension.
    After that, we allow a bias and a rescale. This is supposed to be trainable.
    :param x: 3d tensor (time,batch,dim) (or any ndim, last dim is expected to be dim)
    :param bias: 1d tensor (dim) or None
    :param scale: 1d tensor (dim) or None
    """
    mean = T.mean(x, axis=x.ndim - 1, keepdims=True)
    std = T.sqrt(T.var(x, axis=x.ndim - 1, keepdims=True) + numpy.float32(eps))
    assert mean.ndim == std.ndim == x.ndim
    output = (x - mean) / std
    assert output.ndim == x.ndim
    if scale is not None:
        assert scale.ndim == 1
        scale = scale.dimshuffle(*(('x',) * (x.ndim - 1) + (0,)))
        assert scale.ndim == x.ndim
        output = output * scale
    if bias is not None:
        assert bias.ndim == 1
        bias = bias.dimshuffle(*(('x',) * (x.ndim - 1) + (0,)))
        assert bias.ndim == x.ndim
        output = output + bias
    return output
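# Hedged usage sketch for layer_normalization (the variable names here are
# illustrative, not from the original source): normalize a
# (time, batch, dim) tensor with a trainable per-feature scale and bias.
import numpy
import theano
import theano.tensor as T

dim = 4
x = T.tensor3('x')
scale = theano.shared(numpy.ones(dim, dtype=theano.config.floatX), name='ln_scale')
bias = theano.shared(numpy.zeros(dim, dtype=theano.config.floatX), name='ln_bias')
y = layer_normalization(x, bias=bias, scale=scale)
f = theano.function([x], y)
out = f(numpy.zeros((5, 2, dim), dtype=theano.config.floatX))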
def normalize_samples(self, x, gamma, beta):
    OutputLog().write('Normalizing Samples')
    mean = Tensor.mean(x, axis=1, keepdims=True)
    var = Tensor.var(x, axis=1, keepdims=True)
    normalized_output = (x - mean) / Tensor.sqrt(var + self.epsilon)
    return normalized_output / gamma + beta
def ln(input, alpha, beta=None):
    output = (input - T.mean(input, axis=1, keepdims=True)) / \
        T.sqrt(T.var(input, axis=1, keepdims=True) + eps)
    output *= alpha[None, :]
    if beta is not None:
        output += beta[None, :]
    return output
def _normalize_input(self):
    X = T.matrix('X')
    results, updates = theano.scan(
        lambda x_i: (x_i - T.mean(x_i)) / T.sqrt(T.var(x_i) + 10),
        sequences=[X]
    )
    return theano.function(inputs=[X], outputs=results)
def instance(self, train_x, infer_x, dropout=None, epsilon=1e-8, **kwargs):
    """Returns (train_output, inference_output, statistics_updates)"""
    # dropout
    dropout = dropout or 0.
    mask = self.srng.binomial(n=1, p=1 - dropout, size=train_x.shape)
    # cast because int * float32 = float64 which does not run on GPU
    train_x = train_x * T.cast(mask, theano.config.floatX)

    # outputs with batch-specific normalization
    train_lin_output = T.dot(train_x, self.t_W) + self.t_b
    batch_mean = T.mean(train_lin_output, axis=0)
    offset_output = train_lin_output - batch_mean
    batch_var = T.var(offset_output, axis=0)
    normalized_lin_output = offset_output / T.sqrt(batch_var + epsilon)
    train_output = self.activation_fn(self.gamma * normalized_lin_output + self.beta)

    # outputs with rolling-average normalization
    infer_lin_output = T.dot(infer_x, self.t_W) + self.t_b
    sd = T.sqrt(self.variance + epsilon)
    inference_output = self.activation_fn(
        self.gamma / sd * infer_lin_output + (self.beta - (self.gamma * self.mean) / sd))

    # save exponential moving average for batch mean/variance
    statistics_updates = [
        (self.mean, self.alpha * self.mean + (1.0 - self.alpha) * batch_mean),
        (self.variance, self.alpha * self.variance + (1.0 - self.alpha) * batch_var)
    ]

    return train_output, inference_output, statistics_updates
def get_output_for(self, input, deterministic=False, **kwargs):
    beta = self.beta
    gamma = self.gamma
    means = self.means
    stdevs = self.stdevs

    output_shape = input.shape
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    if deterministic == False:
        m = T.mean(input, axis=0, keepdims=False)
        s = T.sqrt(T.var(input, axis=0, keepdims=False) + self.eta)
        means.default_update = self.alpha * means + (1 - self.alpha) * m
        Es = self.alpha * stdevs + (1 - self.alpha) * s
        u = self.batch_size / (self.batch_size - 1)
        stdevs.default_update = u * Es
    else:
        m = means
        s = stdevs

    output = input - m
    output /= s

    # transform normalized outputs based on learned shift and scale
    if self.learn_transform is True:
        output = gamma * output + beta

    output = output.reshape(output_shape)
    return self.nonlinearity(output)
def add_param(self, param, name="", constraints=True,
              custom_update=None, custom_update_normalized=False,
              custom_update_exp_average=0, custom_update_condition=None,
              custom_update_accumulate_batches=None):
    """
    :type param: theano.SharedVariable
    :type name: str
    :rtype: theano.SharedVariable
    """
    param = super(Layer, self).add_param(param, name)
    if custom_update:
        # Handled in Device and Updater.
        param.custom_update = custom_update
        param.custom_update_normalized = custom_update_normalized
        param.custom_update_exp_average = custom_update_exp_average
        param.custom_update_condition = custom_update_condition
        param.custom_update_accumulate_batches = custom_update_accumulate_batches
    if constraints:
        if 'L1' in self.attrs and self.attrs['L1'] > 0:
            self.constraints += T.constant(self.attrs['L1'], name="L1", dtype='floatX') * abs(param).sum()
        if 'L2' in self.attrs and self.attrs['L2'] > 0:
            self.constraints += T.constant(self.attrs['L2'], name="L2", dtype='floatX') * (param ** 2).sum()
        if self.attrs.get('L2_eye', 0) > 0:
            L2_eye = T.constant(self.attrs['L2_eye'], name="L2_eye", dtype='floatX')
            if param.ndim == 2:
                eye = tiled_eye(param.shape[0], param.shape[1], dtype=param.dtype)
                self.constraints += L2_eye * ((param - eye) ** 2).sum()
            else:  # standard L2
                self.constraints += L2_eye * (param ** 2).sum()
        if 'varreg' in self.attrs and self.attrs['varreg'] > 0:
            self.constraints += self.attrs['varreg'] * \
                (1.0 * T.sqrt(T.var(param)) - 1.0 / numpy.sum(param.get_value().shape)) ** 2
    return param
def get_output_for(self, input, moving_avg_hooks=None,
                   deterministic=False, *args, **kwargs):
    if deterministic is False:
        m = T.mean(input, axis=0, keepdims=True)
        m.name = "tensor:mean"
        v = T.sqrt(T.var(input, axis=0, keepdims=True) + self.epsilon)
        v.name = "tensor:variance"
        R = T.dot((input - m).T, (input - m))

        key = "WhiteningLayer:movingavg"
        if key not in moving_avg_hooks:
            moving_avg_hooks[key] = []
        moving_avg_hooks[key].append([[self.R_inference], [self.W]])

        key = "BatchNormalizationLayer:movingavg"
        if key not in moving_avg_hooks:
            moving_avg_hooks[key] = []
        moving_avg_hooks[key].append(
            [[m, v, R], [self.mean_inference, self.variance_inference, self.R_inference]])
    else:
        m = self.mean_inference
        v = self.variance_inference

    input_hat = T.dot((input - m), self.W.T)  # normalize
    y = input_hat / self.gamma + self.beta    # scale and shift
    return y
def __init__(self, inputData, image_shape):
    self.input = inputData
    num_out = image_shape[-3]
    epsilon = 0.01
    self.image_shape = image_shape

    gamma_values = numpy.ones((num_out,), dtype=theano.config.floatX)
    self.gamma_vals = theano.shared(value=gamma_values, borrow=True)
    beta_values = numpy.zeros((num_out,), dtype=theano.config.floatX)
    self.beta_vals = theano.shared(value=beta_values, borrow=True)

    batch_mean = T.mean(self.input, keepdims=True, axis=(0, -2, -1))
    batch_var = T.var(self.input, keepdims=True, axis=(0, -2, -1)) + epsilon
    self.batch_mean = self.adjustVals(batch_mean)
    batch_var = self.adjustVals(batch_var)
    self.batch_var = T.pow(batch_var, 0.5)  # per-channel std
    batch_normalize = (inputData - self.batch_mean) / self.batch_var

    if self.input.ndim == 5:
        self.beta = self.beta_vals.dimshuffle('x', 'x', 0, 'x', 'x')
        self.gamma = self.gamma_vals.dimshuffle('x', 'x', 0, 'x', 'x')
    else:
        self.beta = self.beta_vals.dimshuffle('x', 0, 'x', 'x')
        self.gamma = self.gamma_vals.dimshuffle('x', 0, 'x', 'x')

    self.output = batch_normalize * self.gamma + self.beta
    # self.output = inputData - self.batch_mean
    self.params = [self.gamma_vals, self.beta_vals]
def make_consensus(self, networks, axis=2):
    cns = self.attrs['consensus']
    if cns == 'max':
        return T.max(networks, axis=axis)
    elif cns == 'min':
        return T.min(networks, axis=axis)
    elif cns == 'mean':
        return T.mean(networks, axis=axis)
    elif cns == 'flat':
        if self.depth == 1:
            return networks
        if axis == 2:
            return networks.flatten(ndim=3)
            # return T.reshape(networks, (networks.shape[0], networks.shape[1], T.prod(networks.shape[2:])))
        else:
            return networks.flatten(ndim=2)
            # return T.reshape(networks, (networks.shape[0], T.prod(networks.shape[1:])))
    elif cns == 'sum':
        return T.sum(networks, axis=axis, acc_dtype=theano.config.floatX)
    elif cns == 'prod':
        return T.prod(networks, axis=axis)
    elif cns == 'var':
        return T.var(networks, axis=axis)
    elif cns == 'project':
        p = self.add_param(self.create_random_uniform_weights(
            self.attrs['n_out'], 1, self.attrs['n_out'] + self.depth + 1))
        return T.tensordot(p, networks, [[1], [axis]])
    elif cns == 'random':
        idx = self.rng.random_integers(size=(1,), low=0, high=self.depth)
        if axis == 0:
            return networks[idx]
        if axis == 1:
            return networks[:, idx]
        if axis == 2:
            return networks[:, :, idx]
        if axis == 3:
            return networks[:, :, :, idx]
        assert False, "axis too large"
    else:
        assert False, "consensus method unknown: " + cns
def get_output_for(self, input, moving_avg_hooks=None,
                   deterministic=False, *args, **kwargs):
    reshape = False
    if input.ndim > 2:
        output_shape = input.shape
        reshape = True
        input = input.flatten(2)

    if deterministic is False:
        m = T.mean(input, axis=0, keepdims=True)
        v = T.sqrt(T.var(input, axis=0, keepdims=True) + self.epsilon)
        m.name = "tensor:mean-" + self.name
        v.name = "tensor:variance-" + self.name

        key = "BatchNormalizationLayer:movingavg"
        if key not in moving_avg_hooks:
            # moving_avg_hooks[key] = {}
            moving_avg_hooks[key] = []
        # moving_avg_hooks[key][self.name] = [[m, v], [self.mean_inference, self.variance_inference]]
        moving_avg_hooks[key].append(
            [[m, v], [self.mean_inference, self.variance_inference]])
    else:
        m = self.mean_inference
        v = self.variance_inference

    input_hat = (input - m) / v             # normalize
    y = self.gamma * input_hat + self.beta  # scale and shift

    if reshape:  # input.ndim > 2
        y = T.reshape(y, output_shape)
    return self.nonlinearity(y)
def add_param(self, param, name="", constraints=True,
              custom_update=None, custom_update_normalized=False,
              custom_update_exp_average=0, custom_update_condition=None,
              custom_update_accumulate_batches=None, live_update=None):
    """
    :type param: theano.SharedVariable
    :type name: str
    :rtype: theano.SharedVariable
    """
    param = super(Layer, self).add_param(param, name)
    param.live_update = live_update
    if custom_update:
        # Handled in Device and Updater.
        param.custom_update = custom_update
        param.custom_update_normalized = custom_update_normalized
        param.custom_update_exp_average = custom_update_exp_average
        param.custom_update_condition = custom_update_condition
        param.custom_update_accumulate_batches = custom_update_accumulate_batches
    if constraints:
        if 'L1' in self.attrs and self.attrs['L1'] > 0:
            self.constraints += T.constant(self.attrs['L1'], name="L1", dtype='floatX') * abs(param).sum()
        if 'L2' in self.attrs and self.attrs['L2'] > 0:
            self.constraints += T.constant(self.attrs['L2'], name="L2", dtype='floatX') * (param ** 2).sum()
        if self.attrs.get('L2_eye', 0) > 0:
            L2_eye = T.constant(self.attrs['L2_eye'], name="L2_eye", dtype='floatX')
            if param.ndim == 2:
                eye = tiled_eye(param.shape[0], param.shape[1], dtype=param.dtype)
                self.constraints += L2_eye * ((param - eye) ** 2).sum()
            else:  # standard L2
                self.constraints += L2_eye * (param ** 2).sum()
        if 'varreg' in self.attrs and self.attrs['varreg'] > 0:
            self.constraints += self.attrs['varreg'] * \
                (1.0 * T.sqrt(T.var(param)) - 1.0 / numpy.sum(param.get_value().shape)) ** 2
    return param
def normalise(X):
    eps = 1e-4
    X_m = T.mean(X, keepdims=True, axis=0)
    X_var = T.var(X, keepdims=True, axis=0)
    X = (X - X_m) / T.sqrt(X_var + eps)
    return X
def __init__(self, inputData, image_shape):
    self.input = inputData
    num_out = image_shape[1]
    epsilon = 0.01
    self.image_shape = image_shape

    gamma_values = numpy.ones((num_out,), dtype=theano.config.floatX)
    self.gamma_vals = theano.shared(value=gamma_values, borrow=True)
    beta_values = numpy.zeros((num_out,), dtype=theano.config.floatX)
    self.beta_vals = theano.shared(value=beta_values, borrow=True)

    batch_mean = T.mean(self.input, keepdims=True, axis=(0, 2, 3))
    batch_var = T.var(self.input, keepdims=True, axis=(0, 2, 3)) + epsilon
    self.batch_mean = self.adjustVals(batch_mean)
    batch_var = self.adjustVals(batch_var)
    self.batch_var = T.pow(batch_var, 0.5)  # per-channel std
    batch_normalize = (inputData - self.batch_mean) / self.batch_var

    self.beta = self.beta_vals.dimshuffle('x', 0, 'x', 'x')
    self.gamma = self.gamma_vals.dimshuffle('x', 0, 'x', 'x')
    self.output = batch_normalize * self.gamma + self.beta
    # self.output = inputData - self.batch_mean
    self.params = [self.gamma_vals, self.beta_vals]
def activations(self, dataset):
    prev_activations = self._prev_layer.activations(dataset)

    if prev_activations.ndim == 2:
        # flat dataset: (example, vector)
        mean = T.mean(prev_activations, axis=0)
        variance = T.var(prev_activations, axis=0)
    elif prev_activations.ndim == 3:
        # sequence dataset: (seq num, example, vector)
        mean = T.mean(prev_activations, axis=1).dimshuffle(0, 'x', 1)
        variance = T.var(prev_activations, axis=1).dimshuffle(0, 'x', 1)

    normalized = (prev_activations - mean) / T.sqrt(variance + self.EPSILON)
    scaled_and_shifted = (normalized * self._scale) + self._shift
    return scaled_and_shifted
def kmeans(train_set_x=None):
    if train_set_x is None:
        train_set_x = T.matrix('train_set_x')

    ########################
    # Normalize the inputs #
    ########################

    epsilon_norm = 10
    epsilon_zca = 0.015
    K = 500

    train_set_x = (train_set_x - T.mean(train_set_x, axis=0)) / \
        T.sqrt(T.var(train_set_x, axis=0) + epsilon_norm)

    #####################
    # Whiten the inputs #
    #####################

    # a simple choice of whitening transform is the ZCA whitening transform
    # epsilon_zca is a small constant
    # for contrast-normalized data, setting epsilon_zca to 0.01 for 16-by-16
    # pixel patches, or to 0.1 for 8-by-8 pixel patches,
    # is a good starting point
    cov = T.dot(train_set_x, T.transpose(train_set_x)) / train_set_x.shape[1]
    U, S, V = linalg.svd(cov)
    tmp = T.dot(U, T.diag(1 / T.sqrt(S + epsilon_zca)))
    tmp = T.dot(tmp, T.transpose(U))
    whitened_x = T.dot(tmp, train_set_x)

    ######################
    # Training the Model #
    ######################

    # Initialization
    dimension_size = whitened_x.shape[0]
    num_samples = whitened_x.shape[1]
    srng = RandomStreams(seed=234)

    D = srng.normal(size=(dimension_size, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    # typically 10 iterations is enough
    num_iteration = 15

    # compute new centroids, D_new
    for i in xrange(num_iteration):
        dx = T.dot(D.T, whitened_x)
        arg_max_dx = T.argmax(dx, axis=0)
        s = dx[arg_max_dx, T.arange(num_samples)]

        S = T.zeros((K, num_samples))
        S = T.set_subtensor(S[arg_max_dx, T.arange(num_samples)], s)
        D = T.dot(whitened_x, T.transpose(S)) + D
        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    return D
def fprop(self, input):
    """Propagate the input through the layer."""
    output = input - T.mean(input, axis=1, keepdims=True)
    output = output / T.sqrt(T.var(input, axis=1, keepdims=True) + 1e-5)
    output = self.alpha.dimshuffle('x', 0) * output + \
        self.beta.dimshuffle('x', 0)  # scale and shift
    return output
def Kmeans(X_train=None, K=300, epsilon_whitening=0.015):
    if X_train is None:
        X_train = T.matrix('X_train')

    ########################
    # Normalize the inputs #
    ########################

    # A constant added to the variance to avoid division by zero
    epsilon_norm = 10

    # We subtract from each training sample (each column in X_train) its mean
    X_train = (X_train - T.mean(X_train, axis=0)) / \
        T.sqrt(T.var(X_train, axis=0) + epsilon_norm)

    #####################
    # Whiten the inputs #
    #####################

    sigma = T.dot(X_train, T.transpose(X_train)) / X_train.shape[1]
    U, s, V = linalg.svd(sigma, full_matrices=False)
    tmp = T.dot(U, T.diag(1 / T.sqrt(s + epsilon_whitening)))
    tmp = T.dot(tmp, T.transpose(U))
    X_Whitened = T.dot(tmp, X_train)

    ######################
    # Training the Model #
    ######################

    # Initialization
    dimensions = X_Whitened.shape[0]
    samples = X_Whitened.shape[1]
    srng = RandomStreams(seed=234)

    # We initialize the centroids by sampling them from a normal
    # distribution, and then normalizing them to unit length
    # D \in R^{n \times k}
    D = srng.normal(size=(dimensions, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    iterations = 30

    for i in xrange(iterations):
        # Initialize new point representations
        # for every pass of the algorithm
        S = T.zeros((K, samples))

        tmp = T.dot(D.T, X_Whitened)
        res = T.argmax(tmp, axis=0)
        max_values = tmp[res, T.arange(samples)]
        S = T.set_subtensor(S[res, T.arange(samples)], max_values)

        D = T.dot(X_Whitened, T.transpose(S))
        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    return D
def sample_elbo(model, population=None, samples=1, pi=1, vp=None):
    """pi*KL[q(w|mu,rho)||p(w)] + E_q[log p(D|w)]
    approximated by Monte Carlo sampling

    Parameters
    ----------
    model : pymc3.Model
    population : dict - maps observed_RV to its population size
        if not provided defaults to full population
    samples : number of Monte Carlo samples used for approximation,
        defaults to 1
    pi : additional coefficient for KL[q(w|mu,rho)||p(w)]
        as proposed in [1]_
    vp : gelato.variational.utils.VariatioanalParams
        tuple, holding nodes mappings with shared params,
        if None - new will be created

    Returns
    -------
    (E_q[elbo], V_q[elbo], updates, VariationalParams)
        mean, variance of elbo, updates for random streams, shared dicts

    Notes
    -----
    You can pass tensors for `pi` and `samples` to control them while training

    References
    ----------
    .. [1] Charles Blundell et al: "Weight Uncertainty in Neural Networks"
        arXiv preprint arXiv:1505.05424
    """
    if population is None:
        population = dict()
    if vp is None:
        vp = variational_replacements(model.root)

    x = flatten(vp.mapping.values())
    mu = flatten(vp.shared.means.values())
    rho = flatten(vp.shared.rhos.values())

    def likelihood(var):
        tot = population.get(var, population.get(var.name))
        logpt = tt.sum(var.logpt)
        if tot is not None:
            tot = tt.as_tensor(tot)
            logpt *= tot / var.size
        return logpt

    log_p_D = tt.add(*map(likelihood, model.root.observed_RVs))
    log_p_W = model.root.varlogpt + tt.sum(model.root.potentials)
    log_q_W = tt.sum(log_normal3(x, mu, rho))
    _elbo_ = log_p_D + pi * (log_p_W - log_q_W)
    _elbo_ = apply_replacements(_elbo_, vp)
    samples = tt.as_tensor(samples)
    elbos, updates = theano.scan(fn=lambda: _elbo_,
                                 outputs_info=None,
                                 n_steps=samples)
    return tt.mean(elbos), tt.var(elbos), updates, vp
def fprop(self, x, can_fit, eval):
    """
    x : input to the layer
    can_fit :
    eval :
    """
    # shape the input as a matrix (batch_size, n_inputs)
    self.x = x.flatten(2)

    # apply dropout mask
    if self.dropout < 1.:
        if eval == False:
            # The cast is important because
            # int * float32 = float64 which pulls things off the gpu
            # very slow ??
            # srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))
            srng = theano.sandbox.rng_mrg.MRG_RandomStreams(self.rng.randint(999999))
            mask = T.cast(srng.binomial(n=1, p=self.dropout, size=T.shape(self.x)),
                          theano.config.floatX)
            # apply the mask
            self.x = self.x * mask
        else:
            self.x = self.x * self.dropout

    # binarize the weights
    self.Wb = self.binarize_weights(self.W, eval)
    z = T.dot(self.x, self.Wb)

    # for BN updates
    self.z = z

    # batch normalization
    if self.BN == True:
        self.batch_mean = T.mean(z, axis=0)
        self.batch_var = T.var(z, axis=0)
        if can_fit == True:
            mean = self.batch_mean
            var = self.batch_var
        else:
            mean = self.mean
            var = self.var
        z = (z - mean) / (T.sqrt(var + self.BN_epsilon))
        z = self.a * z
        self.z = z + self.b

    # activation function
    y = self.activation(self.z)
    return y
def mom(cost, params, learning_rate, runningGradientStats, activations, runningActGrad):
    updates = []

    resNumber = 0
    for res in params:
        insideNumber = 0
        for current_params in res:
            p_no = 0
            for p in current_params:  # weight bias gamma beta
                p_no += 1
                if p_no == 4:
                    break
                g = T.grad(cost, p)
                updates.append((p, T.clip(p - learning_rate * g, -1.0, 1.0)))
                # now update weight gradient stats
                if p_no == 1:
                    mu = T.mean(g)
                    sigma2 = T.var(g)
                    updates.append((runningGradientStats[resNumber][insideNumber][0],
                                    0.9 * runningGradientStats[resNumber][insideNumber][0] + 0.1 * mu))
                    updates.append((runningGradientStats[resNumber][insideNumber][1],
                                    0.9 * runningGradientStats[resNumber][insideNumber][1] + 0.1 * sigma2))
            insideNumber += 1
        resNumber += 1

    resNumber = 0
    for res in activations:
        insideNumber = 0
        for a in res:
            g = T.grad(cost, a)
            mu = T.mean(g)
            sigma2 = T.var(g)
            updates.append((runningActGrad[resNumber][insideNumber][0],
                            0.9 * runningActGrad[resNumber][insideNumber][0] + 0.1 * mu))
            updates.append((runningActGrad[resNumber][insideNumber][1],
                            0.9 * runningActGrad[resNumber][insideNumber][1] + 0.1 * sigma2))
            insideNumber += 1
        resNumber += 1

    return updates
def _ln(self, x, lnb, lns):
    _eps = np.float32(1e-5)
    out = (x - T.mean(x, axis=-1, keepdims=True)) / \
        T.sqrt(T.var(x, axis=-1, keepdims=True) + _eps)
    out = lns * out + lnb
    return out
def compute_output(self, network, in_vw):
    super(MonitorVarianceNode, self).compute_output(network, in_vw)
    if network.find_hyperparameter(["monitor"]):
        network.create_vw(
            "var",
            variable=T.var(in_vw.variable),
            shape=(),
            tags={"monitor"},
        )
def fprop(self, input):
    """Propagate input through the layer."""
    if self.layer == 'fc':
        # Training time
        if self.run_mode == 0:
            mean_t = T.mean(input, axis=0)  # Compute mean
            var_t = T.var(input, axis=0)    # Compute variance
            # Subtract mean and divide by std
            norm_t = (input - mean_t) / T.sqrt(var_t + self.epsilon)
            # Add parameters
            output = self.gamma * norm_t + self.beta
            # Update mean and variance
            self.mean = self.momentum * self.mean + \
                (1.0 - self.momentum) * mean_t
            self.var = self.momentum * self.var + (1.0 - self.momentum) \
                * (self.input_shape[0] / (self.input_shape[0] - 1) * var_t)
        # Test time - use statistics from the training data
        else:
            output = self.gamma * (input - self.mean) / \
                T.sqrt(self.var + self.epsilon) + self.beta
    elif self.layer == 'conv':
        if self.run_mode == 0:
            # Mean across every channel
            mean_t = T.mean(input, axis=(0, 2, 3))
            var_t = T.var(input, axis=(0, 2, 3))
            # mean, var update
            self.mean = self.momentum * self.mean + \
                (1.0 - self.momentum) * mean_t
            self.var = self.momentum * self.var + (1.0 - self.momentum) * \
                (self.input_shape[0] / (self.input_shape[0] - 1) * var_t)
        else:
            mean_t = self.mean
            var_t = self.var
        # change shape to fit input shape
        mean_t = self.change_shape(mean_t)
        var_t = self.change_shape(var_t)
        gamma_t = self.change_shape(self.gamma)
        beta_t = self.change_shape(self.beta)
        output = gamma_t * (input - mean_t) / \
            T.sqrt(var_t + self.epsilon) + beta_t

    return output
def forward(self, x, train=True):
    if train or (not self.moving):
        if x.ndim == 2:
            mean = T.mean(x, axis=0)
            var = T.var(x, axis=0)
        elif x.ndim == 4:
            mean = T.mean(x, axis=(0, 2, 3))
            var = T.var(x, axis=(0, 2, 3))
        else:
            raise ValueError('input.shape must be (batch_size, dim) '
                             'or (batch_size, filter_num, h, w).')

        if self.moving:
            bs = x.shape[0].astype(theano.config.floatX)
            mean_inf_next = (self.momentum * self.mean_inf +
                             (1 - self.momentum) * mean)
            var_inf_next = (self.momentum * self.var_inf +
                            (1 - self.momentum) * var * bs / (bs - 1.))
            self.updates = [(self.mean_inf, mean_inf_next),
                            (self.var_inf, var_inf_next)]
        else:
            self.updates = []
    else:
        mean = self.mean_inf
        var = self.var_inf

    if x.ndim == 4:
        mean = mean.dimshuffle('x', 0, 'x', 'x')
        var = var.dimshuffle('x', 0, 'x', 'x')

    output = (x - mean) / T.sqrt(var + self.eps)

    if self.gamma is not None:
        if x.ndim == 4:
            output *= self.gamma.dimshuffle('x', 0, 'x', 'x')
        else:
            output *= self.gamma
    if self.beta is not None:
        if x.ndim == 4:
            output += self.beta.dimshuffle('x', 0, 'x', 'x')
        else:
            output += self.beta

    return output
def perform(self, x):
    EPSI = 1e-5
    S = self.params[0]
    b = self.params[1]
    x_ln = (x - T.mean(x, axis=-1, keepdims=True)) / \
        T.sqrt(T.var(x, axis=-1, keepdims=True) + EPSI)
    if x.ndim == 3:
        return x_ln * S.dimshuffle('x', 'x', 0) + b.dimshuffle('x', 'x', 0)
    else:
        return x_ln * S.dimshuffle('x', 0) + b.dimshuffle('x', 0)
def _compute_training_statistics(self, input_):
    axes = (0,) + tuple((i + 1) for i, b in
                        enumerate(self.population_mean.broadcastable) if b)
    mean = input_.mean(axis=axes, keepdims=True)
    assert mean.broadcastable[1:] == self.population_mean.broadcastable
    stdev = tensor.sqrt(tensor.var(input_, axis=axes, keepdims=True) +
                        numpy.cast[theano.config.floatX](self.epsilon))
    assert stdev.broadcastable[1:] == self.population_stdev.broadcastable
    add_role(mean, BATCH_NORM_MINIBATCH_ESTIMATE)
    add_role(stdev, BATCH_NORM_MINIBATCH_ESTIMATE)
    return mean, stdev
def get_output(self, train):
    X = self.get_input(train)
    if self.mode == 0:
        X_normed = (X - self.running_mean) / self.running_std
    elif self.mode == 1:
        m = T.mean(X, self.axis, keepdims=True)
        std = T.sqrt(T.var(X, self.axis, keepdims=True) + self.epsilon)
        X_normed = (X - m) / std
    out = self.gamma * X_normed + self.beta
    return out