def cosine_distances(test, support):
  """Computes pairwise cosine distances between provided tensors

  Parameters
  ----------
  test: tf.Tensor
    Of shape (n_test, n_feat)
  support: tf.Tensor
    Of shape (n_support, n_feat)

  Returns
  -------
  tf.Tensor:
    Of shape (n_test, n_support)
  """
  rnorm_test = tf.rsqrt(
      tf.reduce_sum(tf.square(test), 1, keep_dims=True)) + 1e-7
  rnorm_support = tf.rsqrt(
      tf.reduce_sum(tf.square(support), 1, keep_dims=True)) + 1e-7
  test_normalized = test * rnorm_test
  support_normalized = support * rnorm_support

  # Transpose for mul
  support_normalized_t = tf.transpose(support_normalized, perm=[1, 0])
  g = tf.matmul(test_normalized, support_normalized_t)  # Gram matrix
  return g
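# Usage sketch (our toy shapes, not from the source). Note that the returned
# Gram matrix of L2-normalized rows is a cosine *similarity*, despite the name.
test = tf.random_normal([4, 16])      # 4 query embeddings
support = tf.random_normal([10, 16])  # 10 support embeddings
sims = cosine_distances(test, support)  # shape (4, 10)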
def _apply_noisy_update(self, mom, grad, var):
  # Compute and apply the gradient update following
  # preconditioned Langevin dynamics
  stddev = tf.where(
      tf.squeeze(self._counter > self._burnin),
      tf.cast(tf.rsqrt(self._learning_rate), grad.dtype),
      tf.zeros([], grad.dtype))

  # Keep an exponentially weighted moving average of squared gradients.
  # Not thread safe
  decay_tensor = tf.cast(self._decay_tensor, grad.dtype)
  new_mom = decay_tensor * mom + (1. - decay_tensor) * tf.square(grad)
  preconditioner = tf.rsqrt(
      new_mom + tf.cast(self._diagonal_bias, grad.dtype))

  # Compute gradients of the preconditioner
  _, preconditioner_grads = diag_jacobian(
      xs=var,
      ys=preconditioner,
      parallel_iterations=self._parallel_iterations)

  mean = 0.5 * (preconditioner * grad * tf.cast(self._data_size, grad.dtype)
                - preconditioner_grads[0])
  stddev *= tf.sqrt(preconditioner)
  result_shape = tf.broadcast_dynamic_shape(tf.shape(mean), tf.shape(stddev))
  with tf.control_dependencies([tf.assign(mom, new_mom)]):
    return tf.random_normal(shape=result_shape,
                            mean=mean,
                            stddev=stddev,
                            dtype=grad.dtype)
def batch_normalized_linear_layer(state_below, scope_name, n_inputs,
                                  n_outputs, stddev, wd, eps=.00001,
                                  test=False):
  """A linear layer with batch normalization."""
  with tf.variable_scope(scope_name) as scope:
    weight = _variable_with_weight_decay(
        "weights", shape=[n_inputs, n_outputs],
        stddev=stddev, wd=wd
    )
    act = tf.matmul(state_below, weight)
    # get moments
    act_mean, act_variance = tf.nn.moments(act, [0])
    # get mean and variance variables
    mean = _variable_on_cpu('bn_mean', [n_outputs],
                            tf.constant_initializer(0.0), trainable=False)
    variance = _variable_on_cpu('bn_variance', [n_outputs],
                                tf.constant_initializer(1.0), trainable=False)
    # assign the moments
    if not test:
      assign_mean = mean.assign(act_mean)
      assign_variance = variance.assign(act_variance)
      act_bn = tf.mul((act - act_mean), tf.rsqrt(act_variance + eps),
                      name=scope.name + "_bn")
    else:
      act_bn = tf.mul((act - mean), tf.rsqrt(variance + eps),
                      name=scope.name + "_bn")

    beta = _variable_on_cpu("beta", [n_outputs],
                            tf.constant_initializer(0.0))
    gamma = _variable_on_cpu("gamma", [n_outputs],
                             tf.constant_initializer(1.0))
    bn = tf.add(tf.mul(act_bn, gamma), beta)
    # output = tf.nn.relu(bn, name=scope.name)
    output = randomized_relu(bn, .1, name=scope.name,
                             is_training=(not test))
    if not test:
      output = control_flow_ops.with_dependencies(
          dependencies=[assign_mean, assign_variance],
          output_tensor=output)
    _activation_summary(output)
    return output
def batch_normalized_conv_layer(state_below, scope_name, n_inputs, n_outputs,
                                filter_shape, stddev, wd, eps=.00001,
                                test=False):
  """Convolutional layer with batch normalization."""
  with tf.variable_scope(scope_name) as scope:
    kernel = _variable_with_weight_decay(
        "weights",
        shape=[filter_shape[0], filter_shape[1], n_inputs, n_outputs],
        stddev=stddev, wd=wd
    )
    conv = tf.nn.conv2d(state_below, kernel, [1, 1, 1, 1], padding='SAME')
    # get moments
    conv_mean, conv_variance = tf.nn.moments(conv, [0, 1, 2])
    # get mean and variance variables
    mean = _variable_on_cpu("bn_mean", [n_outputs],
                            tf.constant_initializer(0.0), False)
    variance = _variable_on_cpu("bn_variance", [n_outputs],
                                tf.constant_initializer(1.0), False)
    # assign the moments
    if not test:
      assign_mean = mean.assign(conv_mean)
      assign_variance = variance.assign(conv_variance)
      conv_bn = tf.mul((conv - conv_mean), tf.rsqrt(conv_variance + eps),
                       name=scope.name + "_bn")
    else:
      conv_bn = tf.mul((conv - mean), tf.rsqrt(variance + eps),
                       name=scope.name + "_bn")

    beta = _variable_on_cpu("beta", [n_outputs],
                            tf.constant_initializer(0.0))
    gamma = _variable_on_cpu("gamma", [n_outputs],
                             tf.constant_initializer(1.0))
    bn = tf.add(tf.mul(conv_bn, gamma), beta)
    # output = tf.nn.relu(bn, name=scope.name)
    output = randomized_relu(bn, .1, name=scope.name,
                             is_training=(not test))
    if not test:
      output = control_flow_ops.with_dependencies(
          dependencies=[assign_mean, assign_variance],
          output_tensor=output)
    _activation_summary(output)
    return output
def _resource_apply_dense(self, grad, var):
  grad_squared = tf.square(grad) + 1e-30
  grad_squared_mean = tf.reduce_mean(grad_squared)
  decay_rate = self._decay_rate
  update_scale = self._learning_rate
  if self._multiply_by_parameter_scale:
    update_scale *= self._parameter_scale(var)
  # HACK: Make things dependent on grad.
  # This confounds the XLA rewriter and keeps it from fusing computations
  # across different variables.  This fusion is bad for HBM usage, since
  # it causes the gradients to persist in memory.
  decay_rate += grad_squared_mean * 1e-30
  update_scale += grad_squared_mean * 1e-30
  # END HACK
  mixing_rate = 1.0 - decay_rate
  shape = var.get_shape().as_list()
  updates = []
  if self._should_use_factored_second_moment_estimate(shape):
    grad_squared_row_mean = tf.reduce_mean(grad_squared, 1)
    grad_squared_col_mean = tf.reduce_mean(grad_squared, 0)
    vr = self.get_slot(var, "vr")
    new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
    vc = self.get_slot(var, "vc")
    new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
    vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking)
    vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking)
    updates = [vr_update, vc_update]
    long_term_mean = tf.reduce_mean(new_vr)
    r_factor = tf.rsqrt(new_vr / long_term_mean)
    c_factor = tf.rsqrt(new_vc)
    x = grad * tf.expand_dims(r_factor, 1) * tf.expand_dims(c_factor, 0)
  else:
    v = self.get_slot(var, "v")
    new_v = decay_rate * v + mixing_rate * grad_squared
    v_update = tf.assign(v, new_v, use_locking=self._use_locking)
    updates = [v_update]
    x = grad * tf.rsqrt(new_v)
  if self._clipping_threshold is not None:
    clipping_denom = tf.maximum(1.0,
                                reduce_rms(x) / self._clipping_threshold)
    x /= clipping_denom
  subtrahend = update_scale * x
  if self._beta1:
    m = self.get_slot(var, "m")
    new_m = self._beta1 * m + (1.0 - self._beta1) * subtrahend
    updates.append(tf.assign(m, new_m, use_locking=self._use_locking))
    subtrahend = new_m
  var_update = tf.assign_sub(var, subtrahend, use_locking=self._use_locking)
  updates = [var_update] + updates
  return tf.group(*updates)
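# Toy sketch (our numbers) of the factored second-moment estimate above:
# Adafactor keeps only row means `vr` and column means `vc` of grad**2 and
# preconditions with the rank-1 reconstruction vr * vc / mean(vr), which is
# algebraically what the r_factor/c_factor product computes.
g = tf.constant([[1., 2.], [3., 4.]])
g2 = tf.square(g)
vr = tf.reduce_mean(g2, 1)  # row means of squared gradient
vc = tf.reduce_mean(g2, 0)  # column means of squared gradient
v_approx = tf.expand_dims(vr, 1) * tf.expand_dims(vc, 0) / tf.reduce_mean(vr)
x = g * tf.rsqrt(v_approx)  # preconditioned update direction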
def _opsBatchNorm(self, x, m, v, beta, gamma, epsilon,
                  scale_after_normalization):
  y = (x - m) * tf.rsqrt(v + epsilon)
  if scale_after_normalization:
    y = gamma * y
  y += beta
  return y
def l2_normalize(incoming, dim, epsilon=1e-12, name="l2_normalize"):
    """ L2 Normalization.

    Normalizes along dimension `dim` using an L2 norm.

    For a 1-D tensor with `dim = 0`, computes
    ```
    output = x / sqrt(max(sum(x**2), epsilon))
    ```

    For `x` with more dimensions, independently normalizes each 1-D slice
    along dimension `dim`.

    Arguments:
        incoming: `Tensor`. Incoming Tensor.
        dim: `int`. Dimension along which to normalize.
        epsilon: `float`. A lower bound value for the norm. Will use
            `sqrt(epsilon)` as the divisor if `norm < sqrt(epsilon)`.
        name: `str`. A name for this layer (optional).

    Returns:
        A `Tensor` with the same shape as `x`.
    """
    with tf.variable_op_scope([incoming], name) as name:
        x = tf.convert_to_tensor(incoming, name="x")
        square_sum = tf.reduce_sum(tf.square(x), [dim], keep_dims=True)
        x_inv_norm = tf.rsqrt(tf.maximum(square_sum, epsilon))
        return tf.mul(x, x_inv_norm, name=name)
def ae_latent_softmax(latents_pred, latents_discrete_hot, vocab_size,
                      hparams):
  """Latent prediction and loss.

  Args:
    latents_pred: Tensor of shape [..., depth].
    latents_discrete_hot: Tensor of shape [..., vocab_size].
    vocab_size: an int representing the vocab size.
    hparams: tf.contrib.training.HParams.

  Returns:
    sample: Tensor of shape [...], a sample from a multinomial distribution.
    loss: Tensor of shape [...], the softmax cross-entropy.
  """
  with tf.variable_scope("latent_logits"):
    latents_logits = tf.layers.dense(latents_pred, vocab_size,
                                     name="logits_dense")
    if hparams.logit_normalization:
      latents_logits *= tf.rsqrt(
          1e-8 + tf.reduce_mean(tf.square(latents_logits)))
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=latents_discrete_hot, logits=latents_logits)

    # TODO(trandustin): tease this out from ae_latent_softmax.
    # we use just the loss portion to anchor prior / encoder on text.
    sample = multinomial_sample(latents_logits, vocab_size,
                                hparams.sampling_method,
                                hparams.sampling_temp)
    return sample, loss
def layer_norm_all(h, batch_size, base, num_units, scope='layer_norm',
                   reuse=False, gamma_start=1.0, epsilon=1e-3,
                   use_bias=True):
  """Layer Norm (faster version, but not using defun)."""
  # Performs layer norm on multiple base at once (ie, i, g, j, o for lstm)
  # Reshapes h in to perform layer norm in parallel
  h_reshape = tf.reshape(h, [batch_size, base, num_units])
  mean = tf.reduce_mean(h_reshape, [2], keep_dims=True)
  var = tf.reduce_mean(tf.square(h_reshape - mean), [2], keep_dims=True)
  epsilon = tf.constant(epsilon)
  rstd = tf.rsqrt(var + epsilon)
  h_reshape = (h_reshape - mean) * rstd
  # reshape back to original
  h = tf.reshape(h_reshape, [batch_size, base * num_units])
  with tf.variable_scope(scope):
    if reuse:
      tf.get_variable_scope().reuse_variables()
    gamma = tf.get_variable(
        'ln_gamma', [4 * num_units],
        initializer=tf.constant_initializer(gamma_start))
    if use_bias:
      beta = tf.get_variable(
          'ln_beta', [4 * num_units],
          initializer=tf.constant_initializer(0.0))
  if use_bias:
    return gamma * h + beta
  return gamma * h
def BatchClipByL2norm(t, upper_bound, name=None):
  """Clip an array of tensors by L2 norm.

  Shrink each dimension-0 slice of tensor (for matrix it is each row) such
  that the l2 norm is at most upper_bound. Here we clip each row as it
  corresponds to each example in the batch.

  Args:
    t: the input tensor.
    upper_bound: the upper bound of the L2 norm.
    name: optional name.
  Returns:
    the clipped tensor.
  """
  assert upper_bound > 0
  with tf.op_scope([t, upper_bound], name, "batch_clip_by_l2norm") as name:
    saved_shape = tf.shape(t)
    batch_size = tf.slice(saved_shape, [0], [1])
    t2 = tf.reshape(t, tf.concat(0, [batch_size, [-1]]))
    upper_bound_inv = tf.fill(tf.slice(saved_shape, [0], [1]),
                              tf.constant(1.0 / upper_bound))
    # Add a small number to avoid divide by 0
    l2norm_inv = tf.rsqrt(tf.reduce_sum(t2 * t2, [1]) + 0.000001)
    scale = tf.minimum(l2norm_inv, upper_bound_inv) * upper_bound
    clipped_t = tf.matmul(tf.diag(scale), t2)
    clipped_t = tf.reshape(clipped_t, saved_shape, name=name)
  return clipped_t
def _norm(x, g=None, b=None, e=1e-5, axis=[1]):
    u = tf.reduce_mean(x, axis=axis, keepdims=True)
    s = tf.reduce_mean(tf.square(x - u), axis=axis, keepdims=True)
    x = (x - u) * tf.rsqrt(s + e)
    if g is not None and b is not None:
        x = x * g + b
    return x
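# Usage sketch for `_norm` (our example): layer norm over the feature axis of
# a [batch, features] tensor, with learned gain/bias broadcast as
# [1, features].
h = tf.random_normal([8, 512])
g = tf.get_variable('ln_g', [1, 512], initializer=tf.ones_initializer())
b = tf.get_variable('ln_b', [1, 512], initializer=tf.zeros_initializer())
h_normed = _norm(h, g=g, b=b, axis=[1])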
def compute_next_h_d(self, meta_opt, w_bot, w_top, bias, x, z, d,
                     backward_w):
  """Propagate error back down the network while computing hidden state."""
  if z is None:
    z = x
  h = meta_opt.compute_h(x, z, d, bias, w_bot,
                         w_top)  # [bs x 60 x h_channels]

  # compute the next d
  delta = meta_opt.next_delta(z, h, d)

  if backward_w is not None:

    def delta_matmul(w, delta):
      d = tf.transpose(delta, [0, 2, 1])  # [bs x delta_channels x n_units]
      d = snt.BatchApply(lambda x: tf.matmul(x, w, transpose_b=True))(d)
      d = tf.transpose(d, [0, 2, 1])
      return d

    # replace the "backward pass" with a random matrix.
    d = delta_matmul(backward_w, delta)  # [bs x 60 x delta_channels]
    var = tf.reduce_mean(tf.square(d), [2], keepdims=True)
    d = d * tf.rsqrt(1e-6 + var)

  return h, d
def multihead_attn(q, k, v):
    # q, k, v have shape [batch, heads, sequence, features]
    w = tf.matmul(q, k, transpose_b=True)
    w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype))
    w = tf.nn.softmax(w)
    a = tf.matmul(w, v)
    return a
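# Usage sketch (our shapes): the rsqrt factor is the 1/sqrt(d_k) scaling of
# scaled dot-product attention; static shapes are required because of
# v.shape[-1].value.
q = tf.random_normal([2, 8, 10, 64])  # [batch, heads, sequence, features]
k = tf.random_normal([2, 8, 10, 64])
v = tf.random_normal([2, 8, 10, 64])
a = multihead_attn(q, k, v)  # [2, 8, 10, 64]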
def simple_attention(target, source, bias=None, summaries=True):
  """A simple attention function.

  Args:
    target: a `Tensor` with shape `[batch, target_timesteps, depth]` or
      `[batch, target_timesteps_1, target_timesteps_2, depth]`
    source: a `Tensor` with shape `[batch, source_timesteps, depth]` or
      `[batch, source_timesteps_1, source_timesteps_2, depth]`
    bias: an optional `Tensor` with shape `[batch, timesteps, 1, 1]` used
      to mask the attention to not attend to padding of input.
    summaries: Boolean, whether to output summaries.

  Returns:
    a `Tensor` with same shape as `target`
  """
  with tf.name_scope("simple_attention", [target, source]):
    target_shape = tf.shape(target)
    source_shape = tf.shape(source)
    target = tf.reshape(target, [
        target_shape[0], target_shape[1] * target_shape[2], target_shape[3]
    ])
    source = tf.reshape(source, [
        source_shape[0], source_shape[1] * source_shape[2], source_shape[3]
    ])
    attention = tf.matmul(target, source, transpose_b=True)
    attention *= tf.rsqrt(tf.to_float(tf.shape(target)[2]))
    if bias is not None:
      attention += tf.expand_dims(tf.squeeze(bias, axis=[2, 3]), axis=1)
    attention = tf.nn.softmax(attention)
    if summaries and not tf.get_variable_scope().reuse:
      tf.summary.image("attention", tf.expand_dims(attention, 3),
                       max_outputs=5)
    attended = tf.matmul(attention, source)
    return tf.reshape(attended, target_shape)
def layer_norm(x: tf.Tensor, epsilon: float = 1e-6) -> tf.Tensor:
    """Layer normalize the tensor x, averaging over the last dimension.

    Implementation based on tensor2tensor.

    Arguments:
        x: The ``Tensor`` to normalize.
        epsilon: The smoothing parameter of the normalization.

    Returns:
        The normalized tensor.
    """
    with tf.variable_scope("LayerNorm"):
        gamma = get_variable(
            name="gamma",
            shape=[x.get_shape()[-1]],
            dtype=tf.float32,
            initializer=tf.ones_initializer())
        beta = get_variable(
            name="beta",
            shape=[x.get_shape()[-1]],
            dtype=tf.float32,
            initializer=tf.zeros_initializer())

        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=[-1],
                                  keepdims=True)
        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        return norm_x * gamma + beta
def inst_norm(x, train, data_format='NHWC', name=None, affine=False,
              act=lrelu, epsilon=1e-5):
    with tf.variable_scope(name, default_name='Inst', reuse=None) as vs:
        if x.get_shape().ndims == 4 and data_format == 'NCHW':
            x = nchw_to_nhwc(x)

        if x.get_shape().ndims == 4:
            mean_dim = [1, 2]
        else:  # 2
            mean_dim = [1]
        mu, sigma_sq = tf.nn.moments(x, mean_dim, keep_dims=True)
        inv = tf.rsqrt(sigma_sq + epsilon)
        normalized = (x - mu) * inv

        if affine:
            var_shape = [x.get_shape()[-1]]
            shift = slim.model_variable('shift', shape=var_shape,
                                        initializer=tf.zeros_initializer)
            scale = slim.model_variable('scale', shape=var_shape,
                                        initializer=tf.ones_initializer)
            out = scale * normalized + shift
        else:
            out = normalized

        if x.get_shape().ndims == 4 and data_format == 'NCHW':
            out = nhwc_to_nchw(out)

        if act is None:
            return out
        else:
            return act(out)
def layer_norm(x, num_units, scope='layer_norm', reuse=False,
               gamma_start=1.0, epsilon=1e-3, use_bias=True):
  """Calculate layer norm."""
  axes = [1]
  mean = tf.reduce_mean(x, axes, keep_dims=True)
  x_shifted = x - mean
  var = tf.reduce_mean(tf.square(x_shifted), axes, keep_dims=True)
  inv_std = tf.rsqrt(var + epsilon)
  with tf.variable_scope(scope):
    if reuse:
      tf.get_variable_scope().reuse_variables()
    gamma = tf.get_variable(
        'ln_gamma', [num_units],
        initializer=tf.constant_initializer(gamma_start))
    if use_bias:
      beta = tf.get_variable(
          'ln_beta', [num_units], initializer=tf.constant_initializer(0.0))
  output = gamma * x_shifted * inv_std
  if use_bias:
    output += beta
  return output
def ae_latent_softmax(latents_pred, latents_discrete_hot, hparams):
  """Latent prediction and loss.

  Args:
    latents_pred: Tensor of shape [..., depth].
    latents_discrete_hot: Tensor of shape [..., vocab_size].
    hparams: tf.contrib.training.HParams.

  Returns:
    sample: Tensor of shape [...], a sample from a multinomial distribution.
    loss: Tensor of shape [...], the softmax cross-entropy.
  """
  vocab_size = 2**hparams.bottleneck_bits
  with tf.variable_scope("latent_logits"):
    latents_logits = tf.layers.dense(latents_pred, vocab_size,
                                     name="logits_dense")
    if hparams.logit_normalization:
      latents_logits *= tf.rsqrt(
          1e-8 + tf.reduce_mean(tf.square(latents_logits)))
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=latents_discrete_hot, logits=latents_logits)
    sample = multinomial_sample(latents_logits, vocab_size,
                                hparams.sampling_method,
                                hparams.sampling_temp)
    return sample, loss
def full_batchnorm(pre_activations, batch, epsilon=1e-8, train=True,
                   beta_init=tf.constant_initializer(0),
                   gamma_init=tf.constant_initializer(1)):
  """Does full batch normalisation of pre-activations.

  Expects to be given something pre-nonlinearity. This is only set up for
  feed-forward nets; to work properly for recurrent nets we would need to
  know which step we are up to, since the paper calculates population
  statistics at every time step.

  Args:
    pre_activations: the logits to be normalised. We assume this is of
      shape [batch_size, num_units].
    batch: the data which generated the logits, which we need to calculate
      statistics used to normalise.
    train: if true, the statistics will be recalculated for each batch. If
      not, the average from the training phase will be used.

  Returns:
    batch normalised activations.
  """
  # get beta and gamma
  num_units = pre_activations.get_shape()[1]  # shape is [batch_size, num_units]
  beta = tf.get_variable('beta', [num_units])
  gamma = tf.get_variable('gamma', [num_units])
  mean, variance = tf.nn.moments(pre_activations, [0])
  isqr = tf.rsqrt(variance + epsilon)
  centered = pre_activations - mean
  return beta + gamma * centered * isqr
def grad(grad_ys):
    large_float_like_x = np.sqrt(np.finfo(x.dtype.as_numpy_dtype()).max)
    safe_grads = tf.where(
        tf.equal(x, 0),
        tf.fill(x.shape, large_float_like_x),
        0.5 * tf.rsqrt(x))
    return grad_ys * safe_grads
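# The `grad` closure above reads like the gradient half of a custom-gradient
# sqrt that caps the derivative at x == 0. A self-contained sketch under that
# assumption (the wrapper and its name are ours, not the original code):
import numpy as np
import tensorflow as tf

@tf.custom_gradient
def safe_sqrt(x):
    def grad(grad_ys):
        large_float_like_x = np.sqrt(np.finfo(x.dtype.as_numpy_dtype()).max)
        safe_grads = tf.where(
            tf.equal(x, 0),
            tf.fill(tf.shape(x), large_float_like_x),
            0.5 * tf.rsqrt(x))
        return grad_ys * safe_grads
    return tf.sqrt(x), grad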
def clip_weight_norm(t, clip_norm, name=None):
    with tf.op_scope([t, clip_norm], name, "clip_weight_norm") as scope:
        l2norm_inv = tf.rsqrt(tf.reduce_sum(t * t, 0))
        tclip = tf.identity(t * clip_norm * tf.minimum(
            l2norm_inv, tf.constant(1.0 / clip_norm)))
    return tclip
def call(self, x, epsilon=1e-6):
    dtype = x.dtype
    x = tf.cast(x=x, dtype=tf.float32)
    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
    norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
    result = norm_x * self.scale + self.bias
    return tf.cast(x=result, dtype=dtype)
def true_log_joint(features, prior_precision, w, y):
  log_prob = tf.reduce_sum(tfd.Normal(
      loc=0., scale=tf.rsqrt(prior_precision)).log_prob(w))
  log_prob += tf.reduce_sum(tfd.Normal(
      loc=tf.tensordot(features, w, [[1], [0]]),
      scale=1.).log_prob(y))
  return log_prob
def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5,
                use_scale=True, use_bias=True):
    """
    Batch Renormalization layer, as described in the paper:
    `Batch Renormalization: Towards Reducing Minibatch Dependence in
    Batch-Normalized Models <https://arxiv.org/abs/1702.03275>`_.

    Args:
        x (tf.Tensor): a NHWC or NC tensor.
        rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections.
        decay (float): decay rate of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.
        use_scale, use_bias (bool): whether to use the extra affine transformation or not.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape of x.

    Variable Names:

    * ``beta``: the bias term.
    * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.
    """
    shape = x.get_shape().as_list()
    assert len(shape) in [2, 4]
    n_out = shape[-1]
    if len(shape) == 2:
        x = tf.reshape(x, [-1, 1, 1, n_out])
    beta, gamma, moving_mean, moving_var = get_bn_variables(
        n_out, use_scale, use_bias, tf.constant_initializer(1.0))

    ctx = get_current_tower_context()
    use_local_stat = ctx.is_training
    # for BatchRenorm, use_local_stat should always be is_training, unless a
    # different usage comes out in the future.

    if use_local_stat:
        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
            x, gamma, beta, epsilon=epsilon, is_training=True)
        inv_sigma = tf.rsqrt(moving_var, 'inv_sigma')
        r = tf.stop_gradient(tf.clip_by_value(
            tf.sqrt(batch_var) * inv_sigma, 1.0 / rmax, rmax))
        d = tf.stop_gradient(tf.clip_by_value(
            (batch_mean - moving_mean) * inv_sigma, -dmax, dmax))
        xn = xn * r + d
    else:
        xn = tf.nn.batch_normalization(
            x, moving_mean, moving_var, beta, gamma, epsilon)

    if len(shape) == 2:
        xn = tf.squeeze(xn, [1, 2])
    if ctx.is_main_training_tower:
        return update_bn_ema(xn, batch_mean, batch_var,
                             moving_mean, moving_var, decay)
    else:
        return tf.identity(xn, name='output')
def get_function(points, mu, sigma):
    # f_ik [n,k]
    div = coef * tf.rsqrt(tf.batch_matrix_determinant(sigma))  # ((2pi)^p*|S_k|)^-1/2 [k]
    div = tf.tile(tf.reshape(div, [1, k]), [n, 1])  # [n,k]
    diff = tf.sub(tf.tile(points, [k, 1, 1]),
                  tf.tile(mu, [n, 1, 1]))  # x_i-u_k [n*k, p, 1]
    sigma = tf.tile(sigma, [n, 1, 1])  # [n*k,p,p]
    exp = tf.exp(-0.5 * tf.batch_matmul(
        tf.transpose(diff, perm=[0, 2, 1]),
        tf.batch_matmul(tf.batch_matrix_inverse(sigma), diff)
    ))  # e^(d'*S^-1*d)_ik [n*k, 1, 1]
    exp = tf.reshape(exp, [n, k])
    # Multivariate normal distribution evaluated for each vector, for each
    # cluster parameter. Hence the [n,k] shape.
    return tf.mul(div, exp)
def model(features, prior_precision):
  w = ed.Normal(loc=0.,
                scale=tf.rsqrt(prior_precision),
                sample_shape=features.shape[1],
                name="w")
  y = ed.Normal(loc=tf.tensordot(features, w, [[1], [0]]),
                scale=1.,
                name="y")
  return y
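# Usage sketch tying `model` to `true_log_joint` above (assumes ed is
# tfp.edward2 and tfd is tfp.distributions, as the snippets suggest). The two
# log-joint values should agree up to numerical error:
features = tf.random_normal([100, 55])
w_val = tf.random_normal([55])
y_val = tf.random_normal([100])
log_joint = ed.make_log_joint_fn(model)
lp_a = log_joint(features, prior_precision=1., w=w_val, y=y_val)
lp_b = true_log_joint(features, 1., w_val, y_val)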
def normalize(self, x, train=True):
    """Returns a batch-normalized version of x."""
    if train:
        mean, variance = tf.nn.moments(x, [0])
        assign_mean = self.mean.assign(mean)
        assign_variance = self.variance.assign(
            tf.mul(variance, self.keep_prob_prior))
        with tf.control_dependencies([assign_mean, assign_variance]):
            act_bn = tf.mul((x - mean), tf.rsqrt(variance + self.epsilon),
                            name="act_bn")
            return tf.add(tf.mul(act_bn, self.gamma), self.beta)
    else:
        mean = self.ewma_trainer.average(self.mean) or self.epsilon
        variance = self.ewma_trainer.average(self.variance) or self.epsilon
        local_beta = tf.identity(self.beta)
        local_gamma = tf.identity(self.gamma)
        act_bn = tf.mul((x - mean), tf.rsqrt(variance + self.epsilon),
                        name="act1_bn")
        return tf.add(tf.mul(act_bn, local_gamma), local_beta)
def get_weightnormed_matrix(shape, axis=1, name='weightnorm',
                            V_init=tf.random_normal_initializer(stddev=0.015),
                            train_gains=True, dtype=tf.float32,
                            trainable=True, squared=False):
    """Returns a matrix weightnormed across a given index.

    Adds 2 trainable variables:
      - V, a matrix, initialised with the default init
      - g, a vector, initialised to 1s

    returns g * V / elementwise l2 norm of V.

    Args:
        shape: sequence of 2 ints. We are only dealing with matrices here.
        axis: how to do the normalising, defaults to 1, which is likely to
            be what you want if your data is `[batch_size x d]`.
        name: name for the scope, defaults to weightnorm
        V_init: initialiser for the unnormalised part of the matrix.
        train_gains: if false, gains will always be one.
        dtype: type for the created variables.
        trainable: whether the matrix should be added to the tensorflow
            trainable variables collection.
        squared: if true, don't take the square root and just divide by the
            squared norm.

    Returns:
        Tensor: the matrix whose rows or columns will never exceed the
            learned norm.
    """
    if len(shape) != 2:
        raise ValueError(
            'Expected two dimensional shape, but it is {}'.format(shape))
    with tf.name_scope(name):
        unnormed_w = tf.get_variable(name + '_V', shape,
                                     trainable=trainable,
                                     initializer=V_init,
                                     dtype=dtype)
        if axis:
            gains = tf.get_variable(name + '_g', [shape[0], 1],
                                    trainable=train_gains,
                                    initializer=tf.constant_initializer(1.0),
                                    dtype=dtype)
        else:
            gains = 1.0
        sqr_norms = tf.reduce_sum(
            tf.square(unnormed_w), axis=axis, keep_dims=True)
        if not squared:
            inv_norms = tf.rsqrt(sqr_norms)
        else:
            inv_norms = 1.0 / sqr_norms
        return gains * unnormed_w * inv_norms
def instance_norm(input, name="instance_norm"):
    with tf.variable_scope(name):
        depth = input.get_shape()[3]
        scale = tf.get_variable(
            "scale", [depth],
            initializer=tf.random_normal_initializer(1.0, 0.02,
                                                     dtype=tf.float32))
        offset = tf.get_variable(
            "offset", [depth], initializer=tf.constant_initializer(0.0))
        mean, variance = tf.nn.moments(input, axes=[1, 2], keep_dims=True)
        epsilon = 1e-5
        inv = tf.rsqrt(variance + epsilon)
        normalized = (input - mean) * inv
        return scale * normalized + offset
def batch_norm(x, name="batch_norm"):
    eps = 1e-6
    with tf.variable_scope(name):
        nchannels = x.get_shape()[3]
        scale = tf.get_variable(
            "scale", [nchannels],
            initializer=tf.random_normal_initializer(1.0, 0.02,
                                                     dtype=tf.float32))
        center = tf.get_variable(
            "center", [nchannels],
            initializer=tf.constant_initializer(0.0, dtype=tf.float32))
        ave, dev = tf.nn.moments(x, axes=[1, 2], keep_dims=True)
        inv_dev = tf.rsqrt(dev + eps)
        normalized = (x - ave) * inv_dev * scale + center
        return normalized
def training(features, labels, mode):
    dist_min = 9999999.0
    dist_max = 0.0
    dist = []

    cost_train = net_archs.conv_net(features, labels, dropout,
                                    reuse=False, is_training=True)
    cost_test = net_archs.conv_net(features, labels, dropout,
                                   reuse=True, is_training=False)

    logits_test = rrt_star_module.rrt_star(features['images'], cost_train,
                                           features['labels'], 0)
    dist_paths_rrt, dissimilarity = metric_path_module.metric_path(
        logits_test, features['labels'], features['images'], 0)
    mse_rrt = tf.losses.mean_squared_error(logits_test, features['labels'])
    dist.append(dist_paths_rrt)
    dist_min = tf.cond(dist_paths_rrt < dist_min,
                       lambda: dist_paths_rrt, lambda: dist_min)
    dist_max = tf.cond(dist_paths_rrt > dist_max,
                       lambda: dist_paths_rrt, lambda: dist_max)

    for i in range(N_REP_NET - 1):
        logits_test_i = rrt_star_module.rrt_star(
            features['images'], cost_train, features['labels'], i + 1)
        dist_paths_rrt_aux, dissimilarity_aux = metric_path_module.metric_path(
            logits_test_i, features['labels'], features['images'], 0)
        mse_rrt += tf.losses.mean_squared_error(logits_test_i,
                                                features['labels'])
        logits_test += logits_test_i
        dissimilarity += dissimilarity_aux
        dist.append(dist_paths_rrt_aux)
        dist_paths_rrt += dist_paths_rrt_aux
        dist_min = tf.cond(dist_paths_rrt_aux < dist_min,
                           lambda: dist_paths_rrt_aux, lambda: dist_min)
        dist_max = tf.cond(dist_paths_rrt_aux > dist_max,
                           lambda: dist_paths_rrt_aux, lambda: dist_max)

    logits_test /= N_REP_NET
    dist_paths_rrt /= N_REP_NET
    dissimilarity /= N_REP_NET
    mse_rrt /= N_REP_NET

    tf_dist = tf.stack(dist)
    size_dist = float(N_REP_NET)
    stddev = tf.sqrt(
        tf.reduce_sum(tf.pow(tf_dist - dist_paths_rrt, 2)) / size_dist)
    stderror = stddev * tf.rsqrt(size_dist)

    mse_cost = tf.losses.mean_squared_error(cost_test,
                                            1.0 - features['labels'])
    log_likelihood = tf.reduce_sum(
        tf.multiply(cost_test, features['labels'] - logits_test + sum_eps))
    loss_op = log_likelihood

    log_image = tf.log(
        tf.clip_by_value(tf.reshape(cost_test, shape=[-1, 200, 200, 1]),
                         0, 0.1))
    tf.summary.image('images',
                     tf.reshape(features['images'], shape=[-1, 200, 200, 1]))
    tf.summary.image('rrt_in',
                     tf.reshape(cost_test, shape=[-1, 200, 200, 1]))
    tf.summary.image('rrt_out',
                     tf.reshape(logits_test, shape=[-1, 200, 200, 1]))
    tf.summary.image('label',
                     tf.reshape(features['labels'], shape=[-1, 200, 200, 1]))
    #~ tf.summary.scalar('dist_between_paths', dist_paths_rrt)
    #~ tf.summary.scalar('dist_min', dist_min)
    #~ tf.summary.scalar('dist_max', dist_max)
    #~ tf.summary.scalar('dissimilarity', dissimilarity)
    tf.summary.scalar('mse_rrt', mse_rrt)
    tf.summary.scalar('mse_cost', mse_cost)
    tf.summary.scalar('log_likelihood', log_likelihood)
    #~ tf.summary.scalar('stderror', stderror)

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(mse_rrt,
                                  global_step=tf.train.get_global_step())

    if mode == tf.estimator.ModeKeys.PREDICT:
        # estim_specs = tf.estimator.EstimatorSpec(mode, predictions=cost_test)
        estim_specs = tf.estimator.EstimatorSpec(mode,
                                                 predictions=logits_test)
    elif mode == tf.estimator.ModeKeys.EVAL:
        metrics = {
            'mse_rrt': tf.metrics.mean(mse_rrt),
            'mse_cost': tf.metrics.mean(mse_cost),
            'log_likelihood': tf.metrics.mean(log_likelihood)
        }
        estim_specs = tf.estimator.EstimatorSpec(mode, loss=loss_op,
                                                 eval_metric_ops=metrics)
    else:
        estim_specs = tf.estimator.EstimatorSpec(mode=mode,
                                                 predictions=logits_test,
                                                 loss=mse_rrt,
                                                 train_op=train_op)
    return estim_specs
def pixel_norm(x, epsilon=1e-8):
    return x * tf.rsqrt(
        tf.reduce_mean(tf.square(x), axis=-1, keepdims=True) + epsilon)
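# Quick property check (our example): after pixel norm, the mean square over
# the feature axis is ~1 at every position.
x = tf.random_normal([2, 8])
y = pixel_norm(x)
mean_sq = tf.reduce_mean(tf.square(y), axis=-1)  # each entry ~= 1.0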
def model_fn(features, labels, mode, params):
    tf.logging.info('*** Features ***')
    for name in sorted(features.keys()):
        tf.logging.info(
            '  name = %s, shape = %s' % (name, features[name].shape))

    inputs = features['input_ids']
    targets = features['target_ids']
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    model = modeling.TransformerModel(bert_config)
    (llh, logits, pred_ids), _ = model(
        inputs, target_ids=targets, training=is_training)

    total_loss = padded_cross_entropy_loss(
        logits,
        targets,
        bert_config['label_smoothing'],
        bert_config['vocab_size'],
    )

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (
            assignment_map,
            initialized_variable_names,
        ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        if use_tpu:

            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint,
                                              assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info('**** Trainable Variables ****')
    print(initialized_variable_names)
    for var in tvars:
        init_string = ''
        if var.name in initialized_variable_names:
            init_string = ', *INIT_FROM_CKPT*'
        tf.logging.info(
            '  name = %s, shape = %s%s', var.name, var.shape, init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        init_lr = learning_rate
        global_step = tf.train.get_global_step()
        lr = (
            init_lr / 0.01 *
            tf.rsqrt(tf.maximum(tf.to_float(global_step), 10000)))
        optimizer = adafactor.AdafactorOptimizer(
            learning_rate=lr,
            decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
            beta1=0.0,
        )
        if use_tpu:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
        train_op = optimizer.minimize(total_loss, global_step=global_step)

        # if not bert_config['use_bias']:
        #     logging.info('Fixing position embedding, i.e. not trainable.')
        #     posemb = 'pegasus/embeddings/position_embeddings'
        #     tvars = list(
        #         filter(lambda v: v.name.split(':')[0] != posemb, tvars)
        #     )
        # gradients = optimizer.compute_gradients(total_loss, tvars)
        # train_op = optimization.create_optimizer(
        #     total_loss,
        #     learning_rate,
        #     num_train_steps,
        #     num_warmup_steps,
        #     use_tpu,
        # )

        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn,
        )
    elif mode == tf.estimator.ModeKeys.EVAL:
        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=None,
            scaffold_fn=scaffold_fn,
        )
    else:
        raise ValueError(
            'Only TRAIN and EVAL modes are supported: %s' % (mode))

    return output_spec
def __call__(self, x):
    assert [x.shape[-1]] == self.g.shape == self.b.shape
    u = tf.reduce_mean(x, axis=-1, keepdims=True)
    v = tf.reduce_mean(tf.squared_difference(x, u), axis=-1, keepdims=True)
    return (x - u) * tf.rsqrt(v + self.eps) * self.g + self.b
def call(self, x, epsilon=1e-6):
    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
    norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
    return norm_x * self.scale + self.bias
def _learning_rate_default(self, multiply_by_parameter_scale):
    learning_rate = tf.minimum(tf.rsqrt(step_num() + 1.0), 0.01)
    if not multiply_by_parameter_scale:
        learning_rate *= 0.05
    return learning_rate
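# The schedule above is min(1/sqrt(step + 1), 0.01): flat at 0.01 for roughly
# the first 10k steps, then inverse-sqrt decay (worked values, ours):
#   step = 100    -> min(0.0995, 0.01) = 0.01
#   step = 10**4  -> min(0.0100, 0.01) ~= 0.01
#   step = 10**6  -> min(0.0010, 0.01) = 0.001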
def apply_cmvn(feats, mean, variance, epsilon=1e-9):
    '''TF: apply CMVN on feature'''
    return (feats - mean) * tf.rsqrt(variance + epsilon)
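# Usage sketch (our example): global CMVN with precomputed corpus statistics,
# e.g. from tf.nn.moments over batch and time; mean/variance of shape [banks]
# broadcast against [batch, time, banks].
feats = tf.random_normal([32, 100, 40])
mean, variance = tf.nn.moments(feats, axes=[0, 1])
feats_cmvn = apply_cmvn(feats, mean, variance)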
def tfe_rsqrt(t):
    return tf.rsqrt(t)
def fit_one_step(
    model_matrix,
    response,
    model,
    model_coefficients_start=None,
    predicted_linear_response_start=None,
    l2_regularizer=None,
    dispersion=None,
    offset=None,
    learning_rate=None,
    fast_unsafe_numerics=True,
    name=None):
  """Runs one step of Fisher scoring.

  Args:
    model_matrix: (Batch of) `float`-like, matrix-shaped `Tensor` where each
      row represents a sample's features.
    response: (Batch of) vector-shaped `Tensor` where each element represents
      a sample's observed response (to the corresponding row of features).
      Must have same `dtype` as `model_matrix`.
    model: `tfp.glm.ExponentialFamily`-like instance used to construct the
      negative log-likelihood loss, gradient, and expected Hessian (i.e., the
      Fisher information matrix).
    model_coefficients_start: Optional (batch of) vector-shaped `Tensor`
      representing the initial model coefficients, one for each column in
      `model_matrix`. Must have same `dtype` as `model_matrix`.
      Default value: Zeros.
    predicted_linear_response_start: Optional `Tensor` with `shape`, `dtype`
      matching `response`; represents `offset` shifted initial linear
      predictions based on `model_coefficients_start`.
      Default value: `offset` if `model_coefficients is None`, and
      `tfp.math.matvecmul(model_matrix, model_coefficients_start) + offset`
      otherwise.
    l2_regularizer: Optional scalar `Tensor` representing L2 regularization
      penalty, i.e.,
      `loss(w) = sum{-log p(y[i]|x[i],w) : i=1..n} + l2_regularizer ||w||_2^2`.
      Default value: `None` (i.e., no L2 regularization).
    dispersion: Optional (batch of) `Tensor` representing `response`
      dispersion, i.e., as in,
      `p(y|theta) := exp((y theta - A(theta)) / dispersion)`. Must broadcast
      with rows of `model_matrix`.
      Default value: `None` (i.e., "no dispersion").
    offset: Optional `Tensor` representing constant shift applied to
      `predicted_linear_response`. Must broadcast to `response`.
      Default value: `None` (i.e., `tf.zeros_like(response)`).
    learning_rate: Optional (batch of) scalar `Tensor` used to dampen
      iterative progress. Typically only needed if optimization diverges,
      should be no larger than `1` and typically very close to `1`.
      Default value: `None` (i.e., `1`).
    fast_unsafe_numerics: Optional Python `bool` indicating if solve should
      be based on Cholesky or QR decomposition.
      Default value: `True` (i.e., "prefer speed via Cholesky decomposition").
    name: Python `str` used as name prefix to ops created by this function.
      Default value: `"fit_one_step"`.

  Returns:
    model_coefficients: (Batch of) vector-shaped `Tensor`; represents the
      next estimate of the model coefficients, one for each column in
      `model_matrix`.
    predicted_linear_response: `response`-shaped `Tensor` representing linear
      predictions based on new `model_coefficients`, i.e.,
      `tfp.math.matvecmul(model_matrix, model_coefficients_next) + offset`.
  """
  graph_deps = [model_matrix, response, model_coefficients_start,
                predicted_linear_response_start, dispersion, learning_rate]
  with tf.name_scope(name, 'fit_one_step', graph_deps):
    [
        model_matrix,
        response,
        model_coefficients_start,
        predicted_linear_response_start,
        offset,
    ] = prepare_args(
        model_matrix,
        response,
        model_coefficients_start,
        predicted_linear_response_start,
        offset)

    # Compute: mean, grad(mean, predicted_linear_response_start), and
    # variance.
    mean, variance, grad_mean = model(predicted_linear_response_start)

    # If either `grad_mean` or `variance` is non-finite or zero, then we'll
    # replace it with a value such that the row is zeroed out. Although this
    # procedure may seem circuitous, it is necessary to ensure this algorithm
    # is itself differentiable.
    is_valid = (tf.is_finite(grad_mean) & tf.not_equal(grad_mean, 0.) &
                tf.is_finite(variance) & (variance > 0.))

    def mask_if_invalid(x, mask):
      mask = tf.fill(tf.shape(x),
                     value=np.array(mask, x.dtype.as_numpy_dtype))
      return tf.where(is_valid, x, mask)

    # Run one step of iteratively reweighted least-squares.
    # Compute "`z`", the adjusted predicted linear response.
    # z = predicted_linear_response_start
    #     + learning_rate * (response - mean) / grad_mean
    z = (response - mean) / mask_if_invalid(grad_mean, 1.)
    # TODO(jvdillon): Rather than use learning rate, we should consider using
    # backtracking line search.
    if learning_rate is not None:
      z *= learning_rate[..., tf.newaxis]
    z += predicted_linear_response_start

    # Compute "`w`", the per-sample weight.
    if dispersion is not None:
      # For convenience, we'll now scale the variance by the dispersion
      # factor.
      variance *= dispersion
    w = (mask_if_invalid(grad_mean, 0.) *
         tf.rsqrt(mask_if_invalid(variance, np.inf)))

    a = model_matrix * w[..., tf.newaxis]
    b = z * w
    # Solve `min{ || A @ model_coefficients - b ||_2**2 : model_coefficients }`
    # where `@` denotes `matmul`.

    if l2_regularizer is None:
      l2_regularizer = np.array(0, a.dtype.as_numpy_dtype)
    else:
      l2_regularizer_ = distributions_util.maybe_get_static_value(
          l2_regularizer, a.dtype.as_numpy_dtype)
      if l2_regularizer_ is not None:
        l2_regularizer = l2_regularizer_

    def _embed_l2_regularization():
      """Adds synthetic observations to implement L2 regularization."""
      # `tf.matrix_solve_ls` does not respect the `l2_regularization` argument
      # when `fast_unsafe_numerics` is `False`. This function adds synthetic
      # observations to the data to implement the regularization instead.
      # Adding observations `sqrt(l2_regularizer) * I` is mathematically
      # equivalent to adding the term
      # `-l2_regularizer ||coefficients||_2**2` to the log-likelihood.
      num_model_coefficients = num_cols(model_matrix)
      batch_shape = tf.shape(model_matrix)[:-2]
      eye = tf.eye(
          num_model_coefficients, batch_shape=batch_shape, dtype=a.dtype)
      a_ = tf.concat([a, tf.sqrt(l2_regularizer) * eye], axis=-2)
      b_ = distributions_util.pad(
          b, count=num_model_coefficients, axis=-1, back=True)
      # Return l2_regularizer=0 since its now embedded.
      l2_regularizer_ = np.array(0, a.dtype.as_numpy_dtype)
      return a_, b_, l2_regularizer_

    a, b, l2_regularizer = smart_cond.smart_cond(
        smart_reduce_all([not(fast_unsafe_numerics), l2_regularizer > 0.]),
        _embed_l2_regularization,
        lambda: (a, b, l2_regularizer))

    model_coefficients_next = tf.matrix_solve_ls(
        a,
        b[..., tf.newaxis],
        fast=fast_unsafe_numerics,
        l2_regularizer=l2_regularizer,
        name='model_coefficients_next')
    model_coefficients_next = model_coefficients_next[..., 0]

    # TODO(b/79122261): The approach used in `matrix_solve_ls` could be made
    # faster by avoiding explicitly forming Q and instead keeping the
    # factorization in 'implicit' form with stacked (rescaled) Householder
    # vectors underneath the 'R' and then applying the (accumulated)
    # reflectors in the appropriate order to apply Q'. However, we don't
    # presently do this because we lack core TF functionality. For reference,
    # the vanilla QR approach is:
    #   q, r = tf.linalg.qr(a)
    #   c = tf.matmul(q, b, adjoint_a=True)
    #   model_coefficients_next = tf.matrix_triangular_solve(
    #       r, c, lower=False, name='model_coefficients_next')

    predicted_linear_response_next = calculate_linear_predictor(
        model_matrix,
        model_coefficients_next,
        offset,
        name='predicted_linear_response_next')

    return model_coefficients_next, predicted_linear_response_next
def testRenames(self):
    self.assertAllClose(1.04719755, tf.acos(0.5))
    self.assertAllClose(0.5, tf.rsqrt(4.0))
def bottom(self, x):
    """Use batchnorm instead of CMVN and shorten the stft with strided convs.

    Args:
      x: float32 tensor with shape [batch_size, len, 1, freqs * channels]

    Returns:
      float32 tensor with shape [batch_size, shorter_len, 1, hidden_size]
    """
    inputs = x
    p = self._model_hparams

    num_mel_bins = p.audio_num_mel_bins
    num_channels = 3 if p.audio_add_delta_deltas else 1

    with tf.variable_scope(self.name):
      if p.audio_preproc_in_bottom:
        # Compute filterbanks
        with tf.variable_scope("fbanks"):
          waveforms = tf.squeeze(inputs, [2, 3])
          mel_fbanks = common_audio.compute_mel_filterbank_features(
              waveforms,
              sample_rate=p.audio_sample_rate,
              dither=p.audio_dither,
              preemphasis=p.audio_preemphasis,
              frame_length=p.audio_frame_length,
              frame_step=p.audio_frame_step,
              lower_edge_hertz=p.audio_lower_edge_hertz,
              upper_edge_hertz=p.audio_upper_edge_hertz,
              num_mel_bins=p.audio_num_mel_bins,
              apply_mask=True)
          if p.audio_add_delta_deltas:
            mel_fbanks = common_audio.add_delta_deltas(mel_fbanks)
          x = tf.reshape(
              mel_fbanks,
              common_layers.shape_list(mel_fbanks)[:2] +
              [num_mel_bins, num_channels])

          nonpadding_mask = 1. - common_attention.embedding_to_padding(x)
          num_of_nonpadding_elements = tf.reduce_sum(
              nonpadding_mask) * num_mel_bins * num_channels

          # This replaces CMVN estimation on data
          var_epsilon = 1e-09
          mean = tf.reduce_sum(
              x, axis=[1], keepdims=True) / num_of_nonpadding_elements
          variance = (
              num_of_nonpadding_elements * mean**2. -
              2. * mean * tf.reduce_sum(x, axis=[1], keepdims=True) +
              tf.reduce_sum(x**2, axis=[1], keepdims=True)
          ) / num_of_nonpadding_elements
          x = (x - mean) * tf.rsqrt(variance + var_epsilon) * tf.expand_dims(
              nonpadding_mask, -1)
      else:
        x = inputs

      # The convention is that the models are flattened along the spatial
      # dimensions, thus the speech preprocessor treats frequencies and
      # channels as image colors (last axis)
      x.set_shape([None, None, num_mel_bins, num_channels])

      # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding?
      x = tf.pad(x, [[0, 0], [0, 8], [0, 0], [0, 0]])
      for _ in range(2):
        x = tf.layers.conv2d(x, 128, (3, 3), (2, 2), use_bias=False)
        x = common_layers.layer_norm(x)
        x = tf.nn.relu(x)

      xshape = common_layers.shape_list(x)
      # apply a conv that will remove all frequencies and at the same time
      # project the output into desired hidden_size
      x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]])
      x = tf.layers.conv2d(x, p.hidden_size, (3, xshape[2]), use_bias=False)

      assert common_layers.shape_list(x)[2] == 1
      x = common_layers.layer_norm(x)
      x = tf.nn.relu(x)
    return x
def call(self, inputs, training=None):
    axis = self.axis
    input_shape = K.int_shape(inputs)
    ndim = len(input_shape)
    dim = input_shape[axis]
    dtype = K.dtype(inputs)

    if ndim > 2:
        image_axes = list(ax for ax in range(1, ndim) if ax != axis)
        if self.image_size is not None:
            scale_squared_norm = tf.cast(1.0 / self.image_size, dtype=dtype)
        else:
            num_pixels = K.prod([K.shape(inputs)[ax] for ax in image_axes])
            scale_squared_norm = 1.0 / K.cast(num_pixels, dtype=dtype)
        if self.scale_coe != 1.0:
            scale_squared_norm /= self.scale_coe

    broadcast_shape = [1] * ndim
    broadcast_shape[axis] = dim

    def unitized_inference():
        broadcasted_moving_mean = K.reshape(self.moving_mean,
                                            broadcast_shape)
        broadcasted_moving_variance = K.reshape(self.moving_variance,
                                                broadcast_shape)
        broadcasted_moving_variance += self.epsilon
        scale = tf.rsqrt(broadcasted_moving_variance)
        centered_inputs = inputs - broadcasted_moving_mean
        if ndim > 2:
            squared_inputs = tf.reduce_mean(centered_inputs**2,
                                            image_axes, True)
        else:
            squared_inputs = centered_inputs**2
        normalized_inputs = squared_inputs / broadcasted_moving_variance
        squared_norm = tf.reduce_sum(normalized_inputs, [axis], True)
        if ndim > 2:
            squared_norm *= scale_squared_norm
        alpha = K.reshape(self.alpha, broadcast_shape)
        scale *= alpha * tf.rsqrt(squared_norm + self.epsilon) + (1 - alpha)
        if self.scale:
            scale *= K.reshape(self.gamma, broadcast_shape)
        outputs = scale * centered_inputs
        if self.center:
            outputs += K.reshape(self.beta, broadcast_shape)
        return outputs

    if training in {0, False}:
        unitized_inputs = unitized_inference()
    else:
        reduction_axes = list(ax for ax in range(ndim) if ax != axis)
        mean = tf.reduce_mean(inputs, reduction_axes, False)
        broadcasted_mean = K.reshape(mean, broadcast_shape)
        centered_inputs = inputs - broadcasted_mean
        if ndim > 2:
            squared_inputs = tf.reduce_mean(centered_inputs**2,
                                            image_axes, True)
        else:
            squared_inputs = centered_inputs**2
        broadcasted_variance = tf.reduce_mean(squared_inputs, [0], True)
        sample_size = K.prod(
            [K.shape(inputs)[axis] for axis in reduction_axes])
        sample_size = K.cast(sample_size, dtype=dtype)
        broadcasted_variance *= sample_size / (
            sample_size - (1.0 + self.epsilon))
        variance = tf.squeeze(broadcasted_variance, reduction_axes)
        self.add_update(
            [
                K.moving_average_update(self.moving_mean, mean,
                                        self.momentum),
                K.moving_average_update(self.moving_variance, variance,
                                        self.momentum)
            ],
            inputs)
        broadcasted_variance += self.epsilon
        scale = tf.rsqrt(broadcasted_variance)
        normalized_inputs = squared_inputs / broadcasted_variance
        squared_norm = tf.reduce_sum(normalized_inputs, [axis], True)
        if ndim > 2:
            squared_norm *= scale_squared_norm
        alpha = K.reshape(self.alpha, broadcast_shape)
        scale *= alpha * tf.rsqrt(squared_norm + self.epsilon) + (1 - alpha)
        if self.scale:
            scale *= K.reshape(self.gamma, broadcast_shape)
        unitized_inputs = scale * centered_inputs
        if self.center:
            unitized_inputs += K.reshape(self.beta, broadcast_shape)

    return K.in_train_phase(unitized_inputs, unitized_inference,
                            training=training)
def pixel_norm(x, epsilon=1e-8):
    with tf.variable_scope("PixelNorm"):
        return x * tf.rsqrt(
            tf.reduce_mean(tf.square(x), axis=1, keepdims=True) + epsilon)
def modulated_conv2d_layer(x, y, fmaps, kernel, up=False, down=False,
                           demodulate=True, resample_kernel=None, gain=1,
                           use_wscale=True, lrmul=1, fused_modconv=True,
                           weight_var='weight', mod_weight_var='mod_weight',
                           mod_bias_var='mod_bias'):
    assert not (up and down)
    assert kernel >= 1 and kernel % 2 == 1

    # Get weight.
    w = get_weight([kernel, kernel, x.shape[1].value, fmaps], gain=gain,
                   use_wscale=use_wscale, lrmul=lrmul, weight_var=weight_var)
    ww = w[np.newaxis]  # [BkkIO] Introduce minibatch dimension.

    # Modulate.
    s = dense_layer(y, fmaps=x.shape[1].value,
                    weight_var=mod_weight_var)  # [BI] Transform incoming W to style.
    s = apply_bias_act(s, bias_var=mod_bias_var) + 1  # [BI] Add bias (initially 1).
    ww *= tf.cast(s[:, np.newaxis, np.newaxis, :, np.newaxis],
                  w.dtype)  # [BkkIO] Scale input feature maps.

    # Demodulate.
    if demodulate:
        d = tf.rsqrt(tf.reduce_sum(tf.square(ww), axis=[1, 2, 3]) +
                     1e-8)  # [BO] Scaling factor.
        ww *= d[:, np.newaxis, np.newaxis, np.newaxis, :]  # [BkkIO] Scale output feature maps.

    # Reshape/scale input.
    if fused_modconv:
        # Fused => reshape minibatch to convolution groups.
        x = tf.reshape(x, [1, -1, x.shape[2], x.shape[3]])
        w = tf.reshape(tf.transpose(ww, [1, 2, 3, 0, 4]),
                       [ww.shape[1], ww.shape[2], ww.shape[3], -1])
    else:
        # [BIhw] Not fused => scale input activations.
        x *= tf.cast(s[:, :, np.newaxis, np.newaxis], x.dtype)

    # Convolution with optional up/downsampling.
    if up:
        x = upsample_conv_2d(x, tf.cast(w, x.dtype), data_format='NCHW',
                             k=resample_kernel)
    elif down:
        x = conv_downsample_2d(x, tf.cast(w, x.dtype), data_format='NCHW',
                               k=resample_kernel)
    else:
        x = tf.nn.conv2d(x, tf.cast(w, x.dtype), data_format='NCHW',
                         strides=[1, 1, 1, 1], padding='SAME')

    # Reshape/scale output.
    if fused_modconv:
        # Fused => reshape convolution groups back to minibatch.
        x = tf.reshape(x, [-1, fmaps, x.shape[2], x.shape[3]])
    elif demodulate:
        # [BOhw] Not fused => scale output activations.
        x *= tf.cast(d[:, :, np.newaxis, np.newaxis], x.dtype)
    return x
def normalize_vector(d, scope=None):
    with tf.name_scope(scope, 'norm_vec'):
        ndim = len(d.shape)
        # list(...) keeps the axis argument valid under Python 3, where
        # range() is no longer a list.
        output = d * tf.rsqrt(1e-6 + tf.reduce_sum(
            tf.square(d), axis=list(range(1, ndim)), keep_dims=True))
        return output
def apply_local_cmvn(feats, epsilon=1e-9):
    '''feats: (NHWC)'''
    mean = tf.expand_dims(keras_backend.mean(feats, axis=1), axis=1)
    var = tf.expand_dims(keras_backend.var(feats, axis=1), axis=1)
    feats = (feats - mean) * tf.rsqrt(var + epsilon)
    return feats
else:
    ff_loss_reg = ff_loss
opt = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    ff_loss_reg, global_step=batch)

op_list = []
if FLAGS.cges:
    # Normalization parameter
    glayerwise = utils.glayerwise
    elayerwise = utils.elayerwise
    for vind, var in enumerate(S_vars):
        # GS
        group_sum = tf.reduce_sum(tf.square(var), -1)
        g_param = learning_rate * FLAGS.lamb * (FLAGS.mu - vind * FLAGS.chvar)
        gl_comp = 1. - g_param * glayerwise[vind] * tf.rsqrt(group_sum)
        gl_plus = tf.cast(gl_comp > 0, tf.float32) * gl_comp
        gl_stack = tf.stack(
            [gl_plus for _ in range(var.get_shape()[-1])], -1)
        gl_op = gl_stack * var
        # ES
        e_param = learning_rate * FLAGS.lamb * (
            (1. - FLAGS.mu) + vind * FLAGS.chvar)
        W_sum = e_param * elayerwise[vind] * tf.reduce_sum(tf.abs(gl_op), -1)
        W_sum_stack = tf.stack(
            [W_sum for _ in range(gl_op.get_shape()[-1])], -1)
        el_comp = tf.abs(gl_op) - W_sum_stack
        el_plus = tf.cast(el_comp > 0, tf.float32) * el_comp
        cges_op = var.assign(el_plus * tf.sign(gl_op))
        op_list.append(cges_op)
def layer_norm_compute_python(x, epsilon, scale, bias):
  """Layer norm raw computation."""
  mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)
  variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True)
  norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
  return norm_x * scale + bias
def layerfrn(input_tensor, name, eps=1e-6, learn_eps=True, scale=True):
    """Filter response normalization (FRN) followed by a thresholded
    linear unit (TLU).

    :param input_tensor:
    :param name:
    :param eps:
    :param learn_eps:
    :param scale:
    :return:
    """
    with tf.variable_scope(name_or_scope=name):
        input_channels = input_tensor.get_shape().as_list()[-1]
        # compute norm
        norm_square = tf.pow(input_tensor, 2, name='power')
        norm_square = tf.reduce_mean(input_tensor=norm_square,
                                     axis=[1, 2], keepdims=True)
        if scale:
            gamma = tf.get_variable(name='gamma',
                                    shape=[1, 1, 1, input_channels],
                                    dtype=tf.float32,
                                    initializer=tf.ones_initializer(),
                                    trainable=True)
        else:
            gamma = tf.get_variable(name='gamma',
                                    shape=[1, 1, 1, input_channels],
                                    dtype=tf.float32,
                                    initializer=tf.ones_initializer(),
                                    trainable=False)
        beta = tf.get_variable(name='beta',
                               shape=[1, 1, 1, input_channels],
                               dtype=tf.float32,
                               initializer=tf.zeros_initializer(),
                               trainable=True)
        # apply frn
        if learn_eps:
            eps_ = tf.get_variable(name='eps',
                                   shape=[1, 1, 1, input_channels],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(eps),
                                   trainable=True)
        else:
            eps_ = tf.get_variable(name='eps',
                                   shape=[1, 1, 1, input_channels],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(eps),
                                   trainable=False)
        frn = input_tensor * tf.rsqrt(norm_square + tf.abs(eps_))
        frn = gamma * frn + beta
        # apply tlu
        t_thresh = tf.get_variable(name='t_thresh',
                                   shape=[1, 1, 1, input_channels],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(eps),
                                   trainable=True)
        frn_output = tf.maximum(frn, t_thresh, 'frn_output')
    return frn_output
def _resource_apply_dense(self, grad, var):
  grad = tf.to_float(grad)
  grad_squared = tf.square(grad) + 1e-30
  grad_squared_mean = tf.reduce_mean(grad_squared)
  decay_rate = self._decay_rate
  update_scale = self._learning_rate
  old_val = var
  if var.dtype.base_dtype == tf.bfloat16:
    old_val = tf.to_float(self._parameter_encoding.decode(old_val))
  if self._multiply_by_parameter_scale:
    update_scale *= tf.to_float(self._parameter_scale(old_val))
  # HACK: Make things dependent on grad.
  # This confounds the XLA rewriter and keeps it from fusing computations
  # across different variables.  This fusion is bad for HBM usage, since
  # it causes the gradients to persist in memory.
  decay_rate += grad_squared_mean * 1e-30
  update_scale += grad_squared_mean * 1e-30
  # END HACK
  mixing_rate = 1.0 - decay_rate
  shape = var.get_shape().as_list()
  updates = []
  if self._should_use_factored_second_moment_estimate(shape):
    grad_squared_row_mean = tf.reduce_mean(grad_squared, -1)
    grad_squared_col_mean = tf.reduce_mean(grad_squared, -2)
    vr = self.get_slot(var, "vr")
    new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
    vc = self.get_slot(var, "vc")
    new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
    vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking)
    vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking)
    updates = [vr_update, vc_update]
    long_term_mean = tf.reduce_mean(new_vr, -1, keep_dims=True)
    r_factor = tf.rsqrt(new_vr / long_term_mean)
    c_factor = tf.rsqrt(new_vc)
    x = grad * tf.expand_dims(r_factor, -1) * tf.expand_dims(c_factor, -2)
  else:
    v = self.get_slot(var, "v")
    new_v = decay_rate * v + mixing_rate * grad_squared
    v_update = tf.assign(v, new_v, use_locking=self._use_locking)
    updates = [v_update]
    x = grad * tf.rsqrt(new_v)
  if self._clipping_threshold is not None:
    clipping_denom = tf.maximum(1.0,
                                reduce_rms(x) / self._clipping_threshold)
    x /= clipping_denom
  subtrahend = update_scale * x
  if self._beta1:
    m = self.get_slot(var, "m")
    new_m = self._beta1 * tf.to_float(m) + (1.0 - self._beta1) * subtrahend
    subtrahend = new_m
    new_m = tf.cast(new_m, var.dtype)
    updates.append(tf.assign(m, new_m, use_locking=self._use_locking))
  new_val = tf.to_float(old_val) - subtrahend
  if var.dtype.base_dtype == tf.bfloat16:
    new_val = self._parameter_encoding.encode(new_val,
                                              self._quantization_noise)
  if self._simulated_quantize_bits:
    new_val = quantization.simulated_quantize(
        var - subtrahend, self._simulated_quantize_bits,
        self._quantization_noise)
  var_update = tf.assign(var, new_val, use_locking=self._use_locking)
  updates = [var_update] + updates
  return tf.group(*updates)
def __call__(self, x, name=None):
    with scope(name or self.name):
        mean, var = tf.nn.moments(x, 1, keep_dims=True)
        return (x - mean) * tf.rsqrt(var + 1e-12) * self.gain + self.bias
def __call__(self, shape, dtype=None, partition_info=None):
    del partition_info
    dtype = self.dtype if dtype is None else dtype
    std = tf.rsqrt(tf.cast(tf.reduce_prod(shape[:-1]), tf.float32) + 1e-7)
    return tf.random_normal(shape, stddev=std, dtype=dtype)
def center_y(y):
    y -= tf.reduce_mean(y)
    y *= tf.rsqrt(
        tf.reduce_mean(tf.reduce_sum(y**2, axis=[1], keep_dims=True)))
    return y
def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5,
                use_scale=True, use_bias=True, data_format='NHWC'):
    """
    Batch Renormalization layer, as described in the paper:
    `Batch Renormalization: Towards Reducing Minibatch Dependence in
    Batch-Normalized Models <https://arxiv.org/abs/1702.03275>`_.

    Args:
        x (tf.Tensor): a NHWC or NC tensor.
        rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections.
        decay (float): decay rate of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.
        use_scale, use_bias (bool): whether to use the extra affine transformation or not.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape of x.

    Variable Names:

    * ``beta``: the bias term.
    * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.
    """
    shape = x.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'NHWC'  # error using NCHW? (see #190)
    if data_format == 'NCHW':
        n_out = shape[1]
    else:
        n_out = shape[-1]  # channel
    assert n_out is not None, \
        "Input to BatchRenorm cannot have unknown channels!"
    beta, gamma, moving_mean, moving_var = get_bn_variables(
        n_out, use_scale, use_bias, tf.constant_initializer(1.0))

    ctx = get_current_tower_context()
    use_local_stat = ctx.is_training
    # for BatchRenorm, use_local_stat should always be is_training, unless a
    # different usage comes out in the future.

    if use_local_stat:
        if ndims == 2:
            x = tf.reshape(x, [-1, 1, 1, n_out])
        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
            x, gamma, beta, epsilon=epsilon,
            is_training=True, data_format=data_format)
        inv_sigma = tf.rsqrt(moving_var, 'inv_sigma')
        r = tf.stop_gradient(tf.clip_by_value(
            tf.sqrt(batch_var) * inv_sigma, 1.0 / rmax, rmax))
        d = tf.stop_gradient(tf.clip_by_value(
            (batch_mean - moving_mean) * inv_sigma, -dmax, dmax))
        r = reshape_for_bn(r, ndims, n_out, data_format)
        d = reshape_for_bn(d, ndims, n_out, data_format)
        xn = xn * r + d
        if ndims == 2:
            xn = tf.squeeze(xn, [1, 2])
    else:
        if ndims == 4 and data_format == 'NCHW':
            [g, b, mm, mv] = [reshape_for_bn(_, ndims, n_out, data_format)
                              for _ in [gamma, beta, moving_mean, moving_var]]
            xn = tf.nn.batch_normalization(x, mm, mv, b, g, epsilon)
        else:
            xn = tf.nn.batch_normalization(
                x, moving_mean, moving_var, beta, gamma, epsilon)

    # training also needs EMA, so ideally we should maintain it on every tower
    if ctx.is_main_training_tower or ctx.has_own_variables:
        ret = update_bn_ema(xn, batch_mean, batch_var,
                            moving_mean, moving_var, decay)
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(
        mean=moving_mean, variance=moving_var)
    if use_scale:
        vh.gamma = gamma
    if use_bias:
        vh.beta = beta
    return ret
def adaptive_instance_norm(x, mu, sigma):
    mean, variance = tf.nn.moments(x, axes=[1, 2], keep_dims=True)
    inv = tf.rsqrt(variance + EPS)
    return sigma * (x - mean) * inv + mu
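# AdaIN usage sketch (our shapes): the style statistics (mu, sigma) typically
# come from another feature map's moments; EPS is assumed to be a module-level
# constant as in the snippet above.
content = tf.random_normal([1, 32, 32, 64])
style = tf.random_normal([1, 32, 32, 64])
mu, var = tf.nn.moments(style, axes=[1, 2], keep_dims=True)
stylized = adaptive_instance_norm(content, mu, tf.sqrt(var))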
def layer_pixel_norm(self, net, args, options):
    epsilon = 1e-8
    return net * tf.rsqrt(
        tf.reduce_mean(tf.square(net), axis=1, keepdims=True) + epsilon)
def pixel_norm(x):
    with tf.variable_scope('PixelNorm'):
        return x * tf.rsqrt(
            tf.reduce_mean(tf.square(x), axis=1, keep_dims=True) + 1e-8)
def pixel_norm(x, epsilon=1e-8):
    with tf.variable_scope('PixelNorm'):
        epsilon = tf.constant(epsilon, dtype=x.dtype, name='epsilon')
        return x * tf.rsqrt(
            tf.reduce_mean(tf.square(x), axis=1, keepdims=True) + epsilon)
def __l1_normalize(x, dim, epsilon=1e-12, name=None):
    square_sum = tf.reduce_sum(tf.abs(x), [dim], keep_dims=True)
    x_inv_norm = tf.rsqrt(tf.maximum(square_sum, epsilon))
    return tf.mul(x, x_inv_norm, name=name)
def __call__(self, inp):
    with tf.variable_scope(self.name):
        mean = tf.reduce_mean(inp, axis=[-1], keep_dims=True)
        variance = tf.reduce_mean(tf.square(inp - mean),
                                  axis=[-1], keep_dims=True)
        norm_x = (inp - mean) * tf.rsqrt(variance + self.epsilon)
        return norm_x * self.scale + self.bias