def _resource_apply_sparse(self, grad, var, indices): var_dtype = var.dtype.base_dtype lr_t = self._decayed_lr(var_dtype) beta_1_t = self._get_hyper('beta_1', var_dtype) beta_2_t = self._get_hyper('beta_2', var_dtype) local_step = math_ops.cast(self.iterations + 1, var_dtype) beta_1_power = math_ops.pow(beta_1_t, local_step) beta_2_power = math_ops.pow(beta_2_t, local_step) epsilon_t = self._get_hyper('epsilon', var_dtype) lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)) # m_t = beta1 * m + (1 - beta1) * g_t m = self.get_slot(var, 'm') m_scaled_g_values = grad * (1 - beta_1_t) m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking) with ops.control_dependencies([m_t]): m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) # m_bar = (1 - beta1) * g_t + beta1 * m_t m_bar = m_scaled_g_values + beta_1_t * array_ops.gather(m_t, indices) # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) v = self.get_slot(var, 'v') v_scaled_g_values = (grad * grad) * (1 - beta_2_t) v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking) with ops.control_dependencies([v_t]): v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) v_t_slice = array_ops.gather(v_t, indices) v_sqrt = math_ops.sqrt(v_t_slice) var_update = self._resource_scatter_add(var, indices, -lr * m_bar / (v_sqrt + epsilon_t)) return control_flow_ops.group(*[var_update, m_bar, v_t])
def _resource_apply_sparse(self, grad, var, indices): beta1_power, beta2_power = self._get_beta_accumulators() beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype) beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype) lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) # \\(m := beta1 * m + (1 - beta1) * g_t\\) m = self.get_slot(var, "m") m_t_slice = beta1_t * array_ops.gather(m, indices) + (1 - beta1_t) * grad m_update_op = resource_variable_ops.resource_scatter_update(m.handle, indices, m_t_slice) # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) v = self.get_slot(var, "v") v_t_slice = (beta2_t * array_ops.gather(v, indices) + (1 - beta2_t) * math_ops.square(grad)) v_update_op = resource_variable_ops.resource_scatter_update(v.handle, indices, v_t_slice) # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) var_slice = lr * m_t_slice / (math_ops.sqrt(v_t_slice) + epsilon_t) var_update_op = resource_variable_ops.resource_scatter_sub(var.handle, indices, var_slice) return control_flow_ops.group(var_update_op, m_update_op, v_update_op)
def _apply_dense(self, grad, var): beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype) beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype) lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) # m_t = beta1 * m + (1 - beta1) * g_t m = self.get_slot(var, "m") m_scaled_g_values = grad * (1 - beta1_t) m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values, use_locking=self._use_locking) # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) v = self.get_slot(var, "v") v_scaled_g_values = (grad * grad) * (1 - beta2_t) v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values, use_locking=self._use_locking) # amsgrad vhat = self.get_slot(var, "vhat") vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat)) v_sqrt = math_ops.sqrt(vhat_t) var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking) return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])
def _apply_sparse(self, grad, var): beta1_power, beta2_power = self._get_beta_accumulators() beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype) beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype) lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) # m := beta1 * m + (1 - beta1) * g_t m = self.get_slot(var, "m") m_t = state_ops.scatter_update(m, grad.indices, beta1_t * array_ops.gather(m, grad.indices) + (1 - beta1_t) * grad.values, use_locking=self._use_locking) # v := beta2 * v + (1 - beta2) * (g_t * g_t) v = self.get_slot(var, "v") v_t = state_ops.scatter_update(v, grad.indices, beta2_t * array_ops.gather(v, grad.indices) + (1 - beta2_t) * math_ops.square(grad.values), use_locking=self._use_locking) # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t)) m_t_slice = array_ops.gather(m_t, grad.indices) v_t_slice = array_ops.gather(v_t, grad.indices) denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t var_update = state_ops.scatter_sub(var, grad.indices, lr * m_t_slice / denominator_slice, use_locking=self._use_locking) return control_flow_ops.group(var_update, m_t, v_t)
def _apply_sparse_shared(self, grad, var, indices, scatter_add): beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype) beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype) lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) # m_t = beta1 * m + (1 - beta1) * g_t m = self.get_slot(var, "m") m_scaled_g_values = grad * (1 - beta1_t) m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking) with ops.control_dependencies([m_t]): m_t = scatter_add(m, indices, m_scaled_g_values) # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) v = self.get_slot(var, "v") v_scaled_g_values = (grad * grad) * (1 - beta2_t) v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking) with ops.control_dependencies([v_t]): v_t = scatter_add(v, indices, v_scaled_g_values) v_sqrt = math_ops.sqrt(v_t) var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking) return control_flow_ops.group(*[var_update, m_t, v_t])
def _apply_rms_spectral(self, grad, var): # see if variable updates need something special # might have to resize the variables (they are suposedly flat) rms = self.get_slot(var, "rms") mom = self.get_slot(var, "momentum") momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype) lr = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype) decay = math_ops.cast(self._decay_tensor, var.dtype.base_dtype) epsilon = math_ops.cast(self._epsilon_tensor, var.dtype.base_dtype) rms_update = rms.assign(decay * rms + (1 - decay) * math_ops.square(grad)) aux = math_ops.sqrt(math_ops.sqrt(rms_update)+epsilon) #sharpGrad = (self._sharpOp(grad / aux) if min(grad.get_shape()) < self._svd_approx_size # else self._approxSharp(grad / aux, self._svd_approx_size)) sharpGrad = self._sharpOp(grad / aux) update = (lr * (sharpGrad / aux)) mom_update = mom.assign(mom * momentum + update) var_update = var.assign_sub(mom_update) return control_flow_ops.group(*[var_update, rms_update, mom_update])
def _apply_sparse_shared(self, grad, var, indices, scatter_update, scatter_sub): beta1_power, beta2_power = self._get_beta_accumulators() beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype) beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype) lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) # \\(m := beta1 * m + (1 - beta1) * g_t\\) m = self.get_slot(var, "m") m_t = scatter_update(m, indices, beta1_t * array_ops.gather(m, indices) + (1 - beta1_t) * grad) # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) v = self.get_slot(var, "v") v_t = scatter_update(v, indices, beta2_t * array_ops.gather(v, indices) + (1 - beta2_t) * math_ops.square(grad)) # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) m_t_slice = array_ops.gather(m_t, indices) v_t_slice = array_ops.gather(v_t, indices) denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t var_update = scatter_sub(var, indices, lr * m_t_slice / denominator_slice) return control_flow_ops.group(var_update, m_t, v_t)
def _stddev(self): if distribution_util.is_diagonal_scale(self.scale): return np.sqrt(2) * math_ops.abs(self.scale.diag_part()) elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate) and self.scale.is_self_adjoint): return np.sqrt(2) * math_ops.sqrt(array_ops.matrix_diag_part( self.scale.matmul(self.scale.to_dense()))) else: return np.sqrt(2) * math_ops.sqrt(array_ops.matrix_diag_part( self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)))
def _iter_body(i, mat_y, unused_old_mat_y, mat_z, unused_old_mat_z, err, unused_old_err): current_iterate = 0.5 * (3.0 * identity - math_ops.matmul(mat_z, mat_y)) current_mat_y = math_ops.matmul(mat_y, current_iterate) current_mat_z = math_ops.matmul(current_iterate, mat_z) # Compute the error in approximation. mat_sqrt_a = current_mat_y * math_ops.sqrt(norm) mat_a_approx = math_ops.matmul(mat_sqrt_a, mat_sqrt_a) residual = mat_a - mat_a_approx current_err = math_ops.sqrt(math_ops.reduce_sum(residual * residual)) / norm return i + 1, current_mat_y, mat_y, current_mat_z, mat_z, current_err, err
def _stddev(self): if (isinstance(self.scale, linalg.LinearOperatorIdentity) or isinstance(self.scale, linalg.LinearOperatorScaledIdentity) or isinstance(self.scale, linalg.LinearOperatorDiag)): return math_ops.abs(self.scale.diag_part()) elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate) and self.scale.is_self_adjoint): return math_ops.sqrt(array_ops.matrix_diag_part( self.scale.apply(self.scale.to_dense()))) else: # TODO(b/35040238): Remove transpose once LinOp supports `transpose`. return math_ops.sqrt(array_ops.matrix_diag_part( self.scale.apply(array_ops.matrix_transpose(self.scale.to_dense()))))
def segment_sqrt_n(data, segment_ids, num_segments, name=None): """For docs, see: _RAGGED_SEGMENT_DOCSTRING.""" with ops.name_scope(name, 'RaggedSegmentSqrtN', [data, segment_ids, num_segments]): total = segment_sum(data, segment_ids, num_segments) ones = ragged_tensor.RaggedTensor.from_nested_row_splits( array_ops.ones_like(data.flat_values), data.nested_row_splits) count = segment_sum(ones, segment_ids, num_segments) if ragged_tensor.is_ragged(total): return total.with_flat_values( total.flat_values / math_ops.sqrt(count.flat_values)) else: return total / math_ops.sqrt(count)
def matrix_square_root(mat_a, mat_a_size, iter_count=100, ridge_epsilon=1e-4): """Iterative method to get matrix square root. Stable iterations for the matrix square root, Nicholas J. Higham Page 231, Eq 2.6b http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.6.8799&rep=rep1&type=pdf Args: mat_a: the symmetric PSD matrix whose matrix square root be computed mat_a_size: size of mat_a. iter_count: Maximum number of iterations. ridge_epsilon: Ridge epsilon added to make the matrix positive definite. Returns: mat_a^0.5 """ def _iter_condition(i, unused_mat_y, unused_old_mat_y, unused_mat_z, unused_old_mat_z, err, old_err): # This method require that we check for divergence every step. return math_ops.logical_and(i < iter_count, err < old_err) def _iter_body(i, mat_y, unused_old_mat_y, mat_z, unused_old_mat_z, err, unused_old_err): current_iterate = 0.5 * (3.0 * identity - math_ops.matmul(mat_z, mat_y)) current_mat_y = math_ops.matmul(mat_y, current_iterate) current_mat_z = math_ops.matmul(current_iterate, mat_z) # Compute the error in approximation. mat_sqrt_a = current_mat_y * math_ops.sqrt(norm) mat_a_approx = math_ops.matmul(mat_sqrt_a, mat_sqrt_a) residual = mat_a - mat_a_approx current_err = math_ops.sqrt(math_ops.reduce_sum(residual * residual)) / norm return i + 1, current_mat_y, mat_y, current_mat_z, mat_z, current_err, err identity = linalg_ops.eye(math_ops.to_int32(mat_a_size)) mat_a = mat_a + ridge_epsilon * identity norm = math_ops.sqrt(math_ops.reduce_sum(mat_a * mat_a)) mat_init_y = mat_a / norm mat_init_z = identity init_err = norm _, _, prev_mat_y, _, _, _, _ = control_flow_ops.while_loop( _iter_condition, _iter_body, [ 0, mat_init_y, mat_init_y, mat_init_z, mat_init_z, init_err, init_err + 1.0 ]) return prev_mat_y * math_ops.sqrt(norm)
def testIdentifyGradientWorksOnMultipleLosses(self): grad_debugger_1 = debug_gradients.GradientsDebugger() grad_debugger_2 = debug_gradients.GradientsDebugger() y = math_ops.add(self.w, -1.0, name="y") debug_y = grad_debugger_1.identify_gradient(y) z1 = math_ops.square(debug_y, name="z1") debug_y = grad_debugger_2.identify_gradient(y) z2 = math_ops.sqrt(debug_y, name="z2") with grad_debugger_1: gradient_descent.GradientDescentOptimizer(0.1).minimize(z1) with grad_debugger_2: gradient_descent.GradientDescentOptimizer(0.1).minimize(z2) dz1_dy = grad_debugger_1.gradient_tensor(y) dz2_dy = grad_debugger_2.gradient_tensor(y) self.assertIsInstance(dz1_dy, ops.Tensor) self.assertIsInstance(dz2_dy, ops.Tensor) self.assertIsNot(dz1_dy, dz2_dy) self.sess.run(variables.global_variables_initializer()) self.assertAllClose(5.0 ** 2, self.sess.run(z1)) self.assertAllClose(5.0 ** 0.5, self.sess.run(z2)) self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy)) self.assertAllClose(0.5 * (5.0 ** -0.5), self.sess.run(dz2_dy))
def __call__(self, step): with ops.name_scope(self.name, "NoisyLinearCosineDecay", [self.initial_learning_rate, step]) as name: initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype decay_steps = math_ops.cast(self.decay_steps, dtype) initial_variance = math_ops.cast(self.initial_variance, dtype) variance_decay = math_ops.cast(self.variance_decay, dtype) num_periods = math_ops.cast(self.num_periods, dtype) alpha = math_ops.cast(self.alpha, dtype) beta = math_ops.cast(self.beta, dtype) global_step_recomp = math_ops.cast(step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) linear_decayed = (decay_steps - global_step_recomp) / decay_steps variance = initial_variance / ( math_ops.pow(1.0 + global_step_recomp, variance_decay)) std = math_ops.sqrt(variance) noisy_linear_decayed = ( linear_decayed + random_ops.random_normal( linear_decayed.shape, stddev=std)) completed_fraction = global_step_recomp / decay_steps fraction = 2.0 * num_periods * completed_fraction cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) noisy_linear_cosine_decayed = ( (alpha + noisy_linear_decayed) * cosine_decayed + beta) return math_ops.multiply( initial_learning_rate, noisy_linear_cosine_decayed, name=name)
def _variance(self): x = math_ops.sqrt(self.df) * self.scale_operator_pd.to_dense() d = array_ops.expand_dims(array_ops.matrix_diag_part(x), -1) v = math_ops.square(x) + math_ops.matmul(d, d, adjoint_b=True) if self.cholesky_input_output_matrices: return linalg_ops.cholesky(v) return v
def entropy_matched_cauchy_scale(covariance): """Approximates a similar Cauchy distribution given a covariance matrix. Since Cauchy distributions do not have moments, entropy matching provides one way to set a Cauchy's scale parameter in a way that provides a similar distribution. The effect is dividing the standard deviation of an independent Gaussian by a constant very near 3. To set the scale of the Cauchy distribution, we first select the diagonals of `covariance`. Since this ignores cross terms, it overestimates the entropy of the Gaussian. For each of these variances, we solve for the Cauchy scale parameter which gives the same entropy as the Gaussian with that variance. This means setting the (univariate) Gaussian entropy 0.5 * ln(2 * variance * pi * e) equal to the Cauchy entropy ln(4 * pi * scale) Solving, we get scale = sqrt(variance * (e / (8 pi))). Args: covariance: A [batch size x N x N] batch of covariance matrices to produce Cauchy scales for. Returns: A [batch size x N] set of Cauchy scale parameters for each part of the batch and each dimension of the input Gaussians. """ return math_ops.sqrt(math.e / (8. * math.pi) * array_ops.matrix_diag_part(covariance))
def _cholesky_diag(diag_operator): return linear_operator_diag.LinearOperatorDiag( math_ops.sqrt(diag_operator.diag), is_non_singular=True, is_self_adjoint=True, is_positive_definite=True, is_square=True)
def __call__(self, shape, dtype=None, partition_info=None): if dtype is None: dtype = self.dtype # Check the shape if len(shape) < 3 or len(shape) > 5: raise ValueError("The tensor to initialize must be at least " "three-dimensional and at most five-dimensional") if shape[-2] > shape[-1]: raise ValueError("In_filters cannot be greater than out_filters.") # Generate a random matrix a = random_ops.random_normal([shape[-1], shape[-1]], dtype=dtype, seed=self.seed) # Compute the qr factorization q, r = linalg_ops.qr(a, full_matrices=False) # Make Q uniform d = array_ops.diag_part(r) q *= math_ops.sign(d) q = q[:shape[-2], :] q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype)) if len(shape) == 3: weight = array_ops.scatter_nd([[(shape[0]-1)//2]], array_ops.expand_dims(q, 0), shape) elif len(shape) == 4: weight = array_ops.scatter_nd([[(shape[0]-1)//2, (shape[1]-1)//2]], array_ops.expand_dims(q, 0), shape) else: weight = array_ops.scatter_nd([[(shape[0]-1)//2, (shape[1]-1)//2, (shape[2]-1)//2]], array_ops.expand_dims(q, 0), shape) return weight
def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name): """Find max_norm given norm and previous average.""" with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]): log_norm = math_ops.log(norm + epsilon) def moving_average(name, value, decay): moving_average_variable = vs.get_variable( name, shape=value.get_shape(), dtype=value.dtype, initializer=init_ops.zeros_initializer(), trainable=False) return moving_averages.assign_moving_average( moving_average_variable, value, decay, zero_debias=False) # quicker adaptation at the beginning if global_step is not None: n = math_ops.to_float(global_step) decay = math_ops.minimum(decay, n / (n + 1.)) # update averages mean = moving_average("mean", log_norm, decay) sq_mean = moving_average("sq_mean", math_ops.square(log_norm), decay) variance = sq_mean - math_ops.square(mean) std = math_ops.sqrt(math_ops.maximum(epsilon, variance)) max_norms = math_ops.exp(mean + std_factor * std) return max_norms, mean
def __init__(self, logits, targets=None, seed=None): dist = categorical.Categorical(logits=logits) self._logits = logits self._probs = dist.probs self._sqrt_probs = math_ops.sqrt(self._probs) super(CategoricalLogitsNegativeLogProbLoss, self).__init__( dist, targets=targets, seed=seed)
def compute_pi_tracenorm(left_cov, right_cov): """Computes the scalar constant pi for Tikhonov regularization/damping. pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) ) See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details. Args: left_cov: The left Kronecker factor "covariance". right_cov: The right Kronecker factor "covariance". Returns: The computed scalar constant pi for these Kronecker Factors (as a Tensor). """ def _trace(cov): if len(cov.shape) == 1: # Diagonal matrix. return math_ops.reduce_sum(cov) elif len(cov.shape) == 2: # Full matrix. return math_ops.trace(cov) else: raise ValueError( "What's the trace of a Tensor of rank %d?" % len(cov.shape)) # Instead of dividing by the dim of the norm, we multiply by the dim of the # other norm. This works out the same in the ratio. left_norm = _trace(left_cov) * right_cov.shape.as_list()[0] right_norm = _trace(right_cov) * left_cov.shape.as_list()[0] return math_ops.sqrt(left_norm / right_norm)
def _symmetric_matrix_square_root(mat, eps=1e-10): """Compute square root of a symmetric matrix. Note that this is different from an elementwise square root. We want to compute M' where M' = sqrt(mat) such that M' * M' = mat. Also note that this method **only** works for symmetric matrices. Args: mat: Matrix to take the square root of. eps: Small epsilon such that any element less than eps will not be square rooted to guard against numerical instability. Returns: Matrix square root of mat. """ # Unlike numpy, tensorflow's return order is (s, u, v) s, u, v = linalg_ops.svd(mat) # sqrt is unstable around 0, just use 0 in such case si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s)) # Note that the v returned by Tensorflow is v = V # (when referencing the equation A = U S V^T) # This is unlike Numpy which returns v = V^T return math_ops.matmul( math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
def _dkwm_cdf_envelope(n, error_rate, name=None): """Computes the CDF envelope that the DKWM inequality licenses. The [Dvoretzky-Kiefer-Wolfowitz-Massart inequality] (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval) gives a stochastic bound on the distance between the true cumulative distribution function (CDF) of any distribution and its empirical CDF. To wit, for `n` iid samples from any distribution with CDF F, ```none P(sup_x |F_n(x) - F(x)| > eps) < 2exp(-2n eps^2) ``` This function computes the envelope size `eps` as a function of the number of samples `n` and the desired limit on the left-hand probability above. Args: n: Tensor of numbers of samples drawn. error_rate: Floating-point tensor of admissible rates of mistakes. name: A name for this operation (optional). Returns: eps: Tensor of maximum distances the true CDF can be from the empirical CDF. This scales as `O(sqrt(-log(error_rate)))` and as `O(1 / sqrt(n))`. The shape is the broadcast of `n` and `error_rate`. """ with ops.name_scope(name, "dkwm_cdf_envelope", [n, error_rate]): n = math_ops.cast(n, dtype=error_rate.dtype) return math_ops.sqrt(-gen_math_ops.log(error_rate / 2.) / (2. * n))
def test_normal_integral_mean_and_var_correctly_estimated(self): n = int(1e6) with self.test_session(): mu_p = constant_op.constant([-1.0, 1.0], dtype=dtypes.float64) mu_q = constant_op.constant([0.0, 0.0], dtype=dtypes.float64) sigma_p = constant_op.constant([0.5, 0.5], dtype=dtypes.float64) sigma_q = constant_op.constant([1.0, 1.0], dtype=dtypes.float64) p = distributions.Normal(loc=mu_p, scale=sigma_p) q = distributions.Normal(loc=mu_q, scale=sigma_q) # Compute E_p[X]. e_x = monte_carlo.expectation_importance_sampler( f=lambda x: x, log_p=p.log_prob, sampling_dist_q=q, n=n, seed=42) # Compute E_p[X^2]. e_x2 = monte_carlo.expectation_importance_sampler( f=math_ops.square, log_p=p.log_prob, sampling_dist_q=q, n=n, seed=42) stddev = math_ops.sqrt(e_x2 - math_ops.square(e_x)) # Relative tolerance (rtol) chosen 2 times as large as minimim needed to # pass. # Convergence of mean is +- 0.003 if n = 100M # Convergence of stddev is +- 0.00001 if n = 100M self.assertEqual(p.batch_shape, e_x.get_shape()) self.assertAllClose(p.mean().eval(), e_x.eval(), rtol=0.01) self.assertAllClose(p.stddev().eval(), stddev.eval(), rtol=0.02)
def testCovarianceFromSampling(self): alpha = np.array([[1., 2, 3], [2.5, 4, 0.01]], dtype=np.float32) with self.test_session() as sess: dist = dirichlet_lib.Dirichlet(alpha) # batch_shape=[2], event_shape=[3] x = dist.sample(int(250e3), seed=1) sample_mean = math_ops.reduce_mean(x, 0) x_centered = x - sample_mean[None, ...] sample_cov = math_ops.reduce_mean(math_ops.matmul( x_centered[..., None], x_centered[..., None, :]), 0) sample_var = array_ops.matrix_diag_part(sample_cov) sample_stddev = math_ops.sqrt(sample_var) [ sample_mean_, sample_cov_, sample_var_, sample_stddev_, analytic_mean, analytic_cov, analytic_var, analytic_stddev, ] = sess.run([ sample_mean, sample_cov, sample_var, sample_stddev, dist.mean(), dist.covariance(), dist.variance(), dist.stddev(), ]) self.assertAllClose(sample_mean_, analytic_mean, atol=0., rtol=0.04) self.assertAllClose(sample_cov_, analytic_cov, atol=0., rtol=0.06) self.assertAllClose(sample_var_, analytic_var, atol=0., rtol=0.03) self.assertAllClose(sample_stddev_, analytic_stddev, atol=0., rtol=0.02)
def _sample_n(self, n, seed): batch_shape = self.batch_shape_tensor() event_shape = self.event_shape_tensor() batch_ndims = array_ops.shape(batch_shape)[0] ndims = batch_ndims + 3 # sample_ndims=1, event_ndims=2 shape = array_ops.concat([[n], batch_shape, event_shape], 0) # Complexity: O(nbk**2) x = random_ops.random_normal(shape=shape, mean=0., stddev=1., dtype=self.dtype, seed=seed) # Complexity: O(nbk) # This parametrization is equivalent to Chi2, i.e., # ChiSquared(k) == Gamma(alpha=k/2, beta=1/2) expanded_df = self.df * array_ops.ones( self.scale_operator.batch_shape_tensor(), dtype=self.df.dtype.base_dtype) g = random_ops.random_gamma(shape=[n], alpha=self._multi_gamma_sequence( 0.5 * expanded_df, self.dimension), beta=0.5, dtype=self.dtype, seed=distribution_util.gen_new_seed( seed, "wishart")) # Complexity: O(nbk**2) x = array_ops.matrix_band_part(x, -1, 0) # Tri-lower. # Complexity: O(nbk) x = array_ops.matrix_set_diag(x, math_ops.sqrt(g)) # Make batch-op ready. # Complexity: O(nbk**2) perm = array_ops.concat([math_ops.range(1, ndims), [0]], 0) x = array_ops.transpose(x, perm) shape = array_ops.concat([batch_shape, [event_shape[0]], [-1]], 0) x = array_ops.reshape(x, shape) # Complexity: O(nbM) where M is the complexity of the operator solving a # vector system. E.g., for LinearOperatorDiag, each matmul is O(k**2), so # this complexity is O(nbk**2). For LinearOperatorLowerTriangular, # each matmul is O(k^3) so this step has complexity O(nbk^3). x = self.scale_operator.matmul(x) # Undo make batch-op ready. # Complexity: O(nbk**2) shape = array_ops.concat([batch_shape, event_shape, [n]], 0) x = array_ops.reshape(x, shape) perm = array_ops.concat([[ndims - 1], math_ops.range(0, ndims - 1)], 0) x = array_ops.transpose(x, perm) if not self.cholesky_input_output_matrices: # Complexity: O(nbk^3) x = math_ops.matmul(x, x, adjoint_b=True) return x
def fn(x): if not state: two = constant_op.constant(2.0) four = two * two two_again = math_ops.sqrt(four) state.append(variables.Variable(two_again + four)) return state[0] * x
def testSampleConsistentStats(self): loc = np.float32([[-1., 1], [1, -1]]) scale = np.float32([1., 0.5]) n_samp = 1e4 with self.test_session() as sess: ind = independent_lib.Independent( distribution=mvn_diag_lib.MultivariateNormalDiag( loc=loc, scale_identity_multiplier=scale), reduce_batch_ndims=1) x = ind.sample(int(n_samp), seed=42) sample_mean = math_ops.reduce_mean(x, axis=0) sample_var = math_ops.reduce_mean( math_ops.squared_difference(x, sample_mean), axis=0) sample_std = math_ops.sqrt(sample_var) sample_entropy = -math_ops.reduce_mean(ind.log_prob(x), axis=0) [ sample_mean_, sample_var_, sample_std_, sample_entropy_, actual_mean_, actual_var_, actual_std_, actual_entropy_, actual_mode_, ] = sess.run([ sample_mean, sample_var, sample_std, sample_entropy, ind.mean(), ind.variance(), ind.stddev(), ind.entropy(), ind.mode(), ]) self.assertAllClose(sample_mean_, actual_mean_, rtol=0.02, atol=0.) self.assertAllClose(sample_var_, actual_var_, rtol=0.04, atol=0.) self.assertAllClose(sample_std_, actual_std_, rtol=0.02, atol=0.) self.assertAllClose(sample_entropy_, actual_entropy_, rtol=0.01, atol=0.) self.assertAllClose(loc, actual_mode_, rtol=1e-6, atol=0.)
def stddev(self, name="stddev"): """Standard deviation. Standard deviation is defined as, ```none stddev = E[(X - E[X])**2]**0.5 ``` where `X` is the random variable associated with this distribution, `E` denotes expectation, and `stddev.shape = batch_shape + event_shape`. Args: name: Python `str` prepended to names of ops created by this function. Returns: stddev: Floating-point `Tensor` with shape identical to `batch_shape + event_shape`, i.e., the same shape as `self.mean()`. """ with self._name_scope(name): try: return self._stddev() except NotImplementedError as original_exception: try: return math_ops.sqrt(self._variance()) except NotImplementedError: raise original_exception
def _prob(self, x): y = (x - self.mu) / self.sigma half_df = 0.5 * self.df return (math_ops.exp(math_ops.lgamma(0.5 + half_df) - math_ops.lgamma(half_df)) / (math_ops.sqrt(self.df) * math.sqrt(math.pi) * self.sigma) * math_ops.pow(1. + math_ops.square(y) / self.df, -(0.5 + half_df)))
def _renorm_correction_and_moments(self, mean, variance, training): """Returns the correction and update values for renorm.""" stddev = math_ops.sqrt(variance + self.epsilon) # Compute the average mean and standard deviation, as if they were # initialized with this batch's moments. mixed_renorm_mean = (self.renorm_mean + (1. - self.renorm_mean_weight) * mean) mixed_renorm_stddev = (self.renorm_stddev + (1. - self.renorm_stddev_weight) * stddev) # Compute the corrections for batch renorm. r = stddev / mixed_renorm_stddev d = (mean - mixed_renorm_mean) / mixed_renorm_stddev # Ensure the corrections use pre-update moving averages. with ops.control_dependencies([r, d]): mean = array_ops.identity(mean) stddev = array_ops.identity(stddev) rmin, rmax, dmax = [ self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax'] ] if rmin is not None: r = math_ops.maximum(r, rmin) if rmax is not None: r = math_ops.minimum(r, rmax) if dmax is not None: d = math_ops.maximum(d, -dmax) d = math_ops.minimum(d, dmax) # When not training, use r=1, d=0. r = tf_utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r)) d = tf_utils.smart_cond(training, lambda: d, lambda: array_ops.zeros_like(d)) def _update_renorm_variable(var, weight, value): """Updates a moving average and weight, returns the unbiased value.""" value = array_ops.identity(value) def _do_update(): """Updates the var and weight, returns their updated ratio.""" # Update the variables without zero debiasing. The debiasing will be # accomplished by dividing the exponential moving average by the weight. # For example, after a single update, the moving average would be # (1-decay) * value. and the weight will be 1-decay, with their ratio # giving the value. # Make sure the weight is not updated until before r and d computation. with ops.control_dependencies([value]): weight_value = array_ops.constant(1., dtype=weight.dtype) new_var = self._assign_moving_average(var, value, self.renorm_momentum) new_weight = self._assign_moving_average( weight, weight_value, self.renorm_momentum) # TODO(yuefengz): the updates to var and weighted can not be batched # together if we fetch their updated values here. Consider calculating # new values and delaying the updates. return new_var / new_weight def _fake_update(): return array_ops.identity(var) return tf_utils.smart_cond(training, _do_update, _fake_update) # TODO(yuefengz): colocate the operations new_mean = _update_renorm_variable(self.renorm_mean, self.renorm_mean_weight, mean) new_stddev = _update_renorm_variable(self.renorm_stddev, self.renorm_stddev_weight, stddev) # Make sqrt(moving_variance + epsilon) = new_stddev. new_variance = math_ops.square(new_stddev) - self.epsilon return (r, d, new_mean, new_variance)
def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disable=redefined-builtin """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`. Types I, II, III and IV are supported. Type I is implemented using a length `2N` padded `tf.signal.rfft`. Type II is implemented using a length `2N` padded `tf.signal.rfft`, as described here: [Type 2 DCT using 2N FFT padded (Makhoul)] (https://dsp.stackexchange.com/a/10606). Type III is a fairly straightforward inverse of Type II (i.e. using a length `2N` padded `tf.signal.irfft`). Type IV is calculated through 2N length DCT2 of padded signal and picking the odd indices. @compatibility(scipy) Equivalent to [scipy.fftpack.dct] (https://docs.scipy.org/doc/scipy-1.4.0/reference/generated/scipy.fftpack.dct.html) for Type-I, Type-II, Type-III and Type-IV DCT. @end_compatibility Args: input: A `[..., samples]` `float32`/`float64` `Tensor` containing the signals to take the DCT of. type: The DCT type to perform. Must be 1, 2, 3 or 4. n: The length of the transform. If length is less than sequence length, only the first n elements of the sequence are considered for the DCT. If n is greater than the sequence length, zeros are padded and then the DCT is computed as usual. axis: For future expansion. The axis to compute the DCT along. Must be `-1`. norm: The normalization to apply. `None` for no normalization or `'ortho'` for orthonormal normalization. name: An optional name for the operation. Returns: A `[..., samples]` `float32`/`float64` `Tensor` containing the DCT of `input`. Raises: ValueError: If `type` is not `1`, `2`, `3` or `4`, `axis` is not `-1`, `n` is not `None` or greater than 0, or `norm` is not `None` or `'ortho'`. ValueError: If `type` is `1` and `norm` is `ortho`. [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform """ _validate_dct_arguments(input, type, n, axis, norm) with _ops.name_scope(name, "dct", [input]): input = _ops.convert_to_tensor(input) zero = _ops.convert_to_tensor(0.0, dtype=input.dtype) seq_len = (tensor_shape.dimension_value(input.shape[-1]) or _array_ops.shape(input)[-1]) if n is not None: if n <= seq_len: input = input[..., 0:n] else: rank = len(input.shape) padding = [[0, 0] for _ in range(rank)] padding[rank - 1][1] = n - seq_len padding = _ops.convert_to_tensor(padding, dtype=_dtypes.int32) input = _array_ops.pad(input, paddings=padding) axis_dim = (tensor_shape.dimension_value(input.shape[-1]) or _array_ops.shape(input)[-1]) axis_dim_float = _math_ops.cast(axis_dim, input.dtype) if type == 1: dct1_input = _array_ops.concat([input, input[..., -2:0:-1]], axis=-1) dct1 = _math_ops.real(fft_ops.rfft(dct1_input)) return dct1 if type == 2: scale = 2.0 * _math_ops.exp( _math_ops.complex( zero, -_math_ops.range(axis_dim_float) * _math.pi * 0.5 / axis_dim_float)) # TODO(rjryan): Benchmark performance and memory usage of the various # approaches to computing a DCT via the RFFT. dct2 = _math_ops.real( fft_ops.rfft(input, fft_length=[2 * axis_dim])[..., :axis_dim] * scale) if norm == "ortho": n1 = 0.5 * _math_ops.rsqrt(axis_dim_float) n2 = n1 * _math.sqrt(2.0) # Use tf.pad to make a vector of [n1, n2, n2, n2, ...]. weights = _array_ops.pad(_array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]], constant_values=n2) dct2 *= weights return dct2 elif type == 3: if norm == "ortho": n1 = _math_ops.sqrt(axis_dim_float) n2 = n1 * _math.sqrt(0.5) # Use tf.pad to make a vector of [n1, n2, n2, n2, ...]. weights = _array_ops.pad(_array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]], constant_values=n2) input *= weights else: input *= axis_dim_float scale = 2.0 * _math_ops.exp( _math_ops.complex( zero, _math_ops.range(axis_dim_float) * _math.pi * 0.5 / axis_dim_float)) dct3 = _math_ops.real( fft_ops.irfft(scale * _math_ops.complex(input, zero), fft_length=[2 * axis_dim]))[..., :axis_dim] return dct3 elif type == 4: # DCT-2 of 2N length zero-padded signal, unnormalized. dct2 = dct(input, type=2, n=2 * axis_dim, axis=axis, norm=None) # Get odd indices of DCT-2 of zero padded 2N signal to obtain # DCT-4 of the original N length signal. dct4 = dct2[..., 1::2] if norm == "ortho": dct4 *= _math.sqrt(0.5) * _math_ops.rsqrt(axis_dim_float) return dct4
def _get_train_ops(self, loss, tf_variables, global_step, grad_bound=1.25, lr_init=1e-3, lr_dec=0.9, start_decay_step=10000, decay_steps=100, optimizer_type="adam"): """Loss optimizer. Args: loss: scalar tf tensor tf_variables: list of training variables, typically tf.trainable_variables() global_step: global_step grad_bound: max gradient norm lr_init: initial learning rate lr_dec: leaning rate decay coefficient start_decay_step: start decaying learning rate after this many steps decay_steps: apply decay rate factor at this step intervals optimizer_type: optimizer type should be either adam or sgd Returns: train_op: training op learning_rate: scalar learning rate tensor grad_norm: l2 norm of the gradient vector all_grad_norms: l2 norm of each component """ lr_gstep = global_step - start_decay_step def f1(): return constant_op.constant(lr_init) def f2(): return learning_rate_decay.exponential_decay( lr_init, lr_gstep, decay_steps, lr_dec, True) learning_rate = control_flow_ops.cond(math_ops.less( global_step, start_decay_step), f1, f2, name="learning_rate") if optimizer_type == "adam": opt = adam.AdamOptimizer(learning_rate) elif optimizer_type == "sgd": opt = gradient_descent.GradientDescentOptimizer(learning_rate) grads_and_vars = opt.compute_gradients(loss, tf_variables) grad_norm = clip_ops.global_norm([g for g, v in grads_and_vars]) all_grad_norms = {} clipped_grads = [] clipped_rate = math_ops.maximum(grad_norm / grad_bound, 1.0) for g, v in grads_and_vars: if g is not None: if isinstance(g, tf_ops.IndexedSlices): clipped = g.values / clipped_rate norm_square = math_ops.reduce_sum(clipped * clipped) clipped = tf_ops.IndexedSlices(clipped, g.indices) else: clipped = g / clipped_rate norm_square = math_ops.reduce_sum(clipped * clipped) all_grad_norms[v.name] = math_ops.sqrt(norm_square) clipped_grads.append((clipped, v)) train_op = opt.apply_gradients(clipped_grads, global_step) return train_op, learning_rate, grad_norm, all_grad_norms
def l2norm(v): return math_ops.sqrt(l2norm_squared(v))
def _resource_apply_sparse(self, grad, var, indices): var_dtype = var.dtype.base_dtype lr_t = self._decayed_lr(var_dtype) rms = self.get_slot(var, "rms") rho = self._get_hyper("rho", var_dtype) momentum = self._get_hyper("momentum", var_dtype) epsilon = self._get_hyper("epsilon", var_dtype) if self._momentum: mom = self.get_slot(var, "momentum") if self.centered: mg = self.get_slot(var, "mg") return training_ops.resource_sparse_apply_centered_rms_prop( var.handle, mg.handle, rms.handle, mom.handle, lr_t, rho, momentum, epsilon, grad, indices, use_locking=self._use_locking) else: return training_ops.resource_sparse_apply_rms_prop( var.handle, rms.handle, mom.handle, lr_t, rho, momentum, epsilon, grad, indices, use_locking=self._use_locking) else: rms_scaled_g_values = (grad * grad) * (1. - rho) rms_t = state_ops.assign(rms, rms * rho, use_locking=self._use_locking) with ops.control_dependencies([rms_t]): rms_t = self._resource_scatter_add(rms, indices, rms_scaled_g_values) rms_slice = array_ops.gather(rms_t, indices) denom_slice = rms_slice if self.centered: mg = self.get_slot(var, "mg") mg_scaled_g_values = grad * (1. - rho) mg_t = state_ops.assign(mg, mg * rho, use_locking=self._use_locking) with ops.control_dependencies([mg_t]): mg_t = self._resource_scatter_add(mg, indices, mg_scaled_g_values) mg_slice = array_ops.gather(mg_t, indices) denom_slice = rms_slice - math_ops.square(mg_slice) var_update = self._resource_scatter_add( var, indices, -lr_t * grad / (math_ops.sqrt(denom_slice) + epsilon)) if self.centered: return control_flow_ops.group(*[var_update, rms_t, mg_t]) return control_flow_ops.group(*[var_update, rms_t])
def embedding_lookup_sparse(params, sp_ids, sp_weights, partition_strategy="mod", name=None, combiner=None, max_norm=None): """Computes embeddings for the given ids and weights. This op assumes that there is at least one id for each row in the dense tensor represented by sp_ids (i.e. there are no rows with empty features), and that all the indices of sp_ids are in canonical row-major order. It also assumes that all id values lie in the range [0, p0), where p0 is the sum of the size of params along dimension 0. Args: params: A single tensor representing the complete embedding tensor, or a list of P tensors all of same shape except for the first dimension, representing sharded embedding tensors. Alternatively, a `PartitionedVariable`, created by partitioning along dimension 0. Each element must be appropriately sized for the given `partition_strategy`. sp_ids: N x M `SparseTensor` of int64 ids where N is typically batch size and M is arbitrary. sp_weights: either a `SparseTensor` of float / double weights, or `None` to indicate all weights should be taken to be 1. If specified, `sp_weights` must have exactly the same shape and indices as `sp_ids`. partition_strategy: A string specifying the partitioning strategy, relevant if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default is `"mod"`. See `tf.nn.embedding_lookup` for more details. name: Optional name for the op. combiner: A string specifying the reduction op. Currently "mean", "sqrtn" and "sum" are supported. "sum" computes the weighted sum of the embedding results for each row. "mean" is the weighted sum divided by the total weight. "sqrtn" is the weighted sum divided by the square root of the sum of the squares of the weights. max_norm: If not `None`, each embedding is clipped if its l2-norm is larger than this value, before combining. Returns: A dense tensor representing the combined embeddings for the sparse ids. For each row in the dense tensor represented by `sp_ids`, the op looks up the embeddings for all ids in that row, multiplies them by the corresponding weight, and combines these embeddings as specified. In other words, if `shape(combined params) = [p0, p1, ..., pm]` and `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]` then `shape(output) = [d0, d1, ..., dn-1, p1, ..., pm]`. For instance, if params is a 10x20 matrix, and sp_ids / sp_weights are ```python [0, 0]: id 1, weight 2.0 [0, 1]: id 3, weight 0.5 [1, 0]: id 0, weight 1.0 [2, 3]: id 1, weight 3.0 ``` with `combiner`="mean", then the output will be a 3x20 matrix where ```python output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5) output[1, :] = (params[0, :] * 1.0) / 1.0 output[2, :] = (params[1, :] * 3.0) / 3.0 ``` Raises: TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is neither `None` nor `SparseTensor`. ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}. """ if combiner is None: logging.warn("The default value of combiner will change from \"mean\" " "to \"sqrtn\" after 2016/11/01.") combiner = "mean" if combiner not in ("mean", "sqrtn", "sum"): raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'") if isinstance(params, variables.PartitionedVariable): params = list(params) # Iterate to get the underlying Variables. if not isinstance(params, list): params = [params] if not isinstance(sp_ids, sparse_tensor.SparseTensor): raise TypeError("sp_ids must be SparseTensor") ignore_weights = sp_weights is None if not ignore_weights: if not isinstance(sp_weights, sparse_tensor.SparseTensor): raise TypeError("sp_weights must be either None or SparseTensor") sp_ids.values.get_shape().assert_is_compatible_with( sp_weights.values.get_shape()) sp_ids.indices.get_shape().assert_is_compatible_with( sp_weights.indices.get_shape()) sp_ids.dense_shape.get_shape().assert_is_compatible_with( sp_weights.dense_shape.get_shape()) # TODO(yleon): Add enhanced node assertions to verify that sp_ids and # sp_weights have equal indices and shapes. with ops.name_scope(name, "embedding_lookup_sparse", params + [sp_ids]) as name: segment_ids = sp_ids.indices[:, 0] if segment_ids.dtype != dtypes.int32: segment_ids = math_ops.cast(segment_ids, dtypes.int32) ids = sp_ids.values ids, idx = array_ops.unique(ids) embeddings = embedding_lookup( params, ids, partition_strategy=partition_strategy, max_norm=max_norm) if embeddings.dtype in (dtypes.float16, dtypes.bfloat16): embeddings = math_ops.cast(embeddings, dtypes.float32) if not ignore_weights: weights = sp_weights.values if weights.dtype != embeddings.dtype: weights = math_ops.cast(weights, embeddings.dtype) embeddings = array_ops.gather(embeddings, idx) # Reshape weights to allow broadcast ones = array_ops.fill( array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1) bcast_weights_shape = array_ops.concat([array_ops.shape(weights), ones], 0) orig_weights_shape = weights.get_shape() weights = array_ops.reshape(weights, bcast_weights_shape) # Set the weight shape, since after reshaping to bcast_weights_shape, # the shape becomes None. if embeddings.get_shape().ndims is not None: weights.set_shape( orig_weights_shape.concatenate( [1 for _ in range(embeddings.get_shape().ndims - 1)])) embeddings *= weights if combiner == "sum": embeddings = math_ops.segment_sum(embeddings, segment_ids, name=name) elif combiner == "mean": embeddings = math_ops.segment_sum(embeddings, segment_ids) weight_sum = math_ops.segment_sum(weights, segment_ids) embeddings = math_ops.divide(embeddings, weight_sum, name=name) elif combiner == "sqrtn": embeddings = math_ops.segment_sum(embeddings, segment_ids) weights_squared = math_ops.pow(weights, 2) weight_sum = math_ops.segment_sum(weights_squared, segment_ids) weight_sum_sqrt = math_ops.sqrt(weight_sum) embeddings = math_ops.divide(embeddings, weight_sum_sqrt, name=name) else: assert False, "Unrecognized combiner" else: assert idx is not None if combiner == "sum": embeddings = math_ops.sparse_segment_sum( embeddings, idx, segment_ids, name=name) elif combiner == "mean": embeddings = math_ops.sparse_segment_mean( embeddings, idx, segment_ids, name=name) elif combiner == "sqrtn": embeddings = math_ops.sparse_segment_sqrt_n( embeddings, idx, segment_ids, name=name) else: assert False, "Unrecognized combiner" return embeddings
def std(self, name="std"): with ops.name_scope(self.name): with ops.name_scope(name, values=[self.range()]): return self.range() / math_ops.sqrt(12.)
def norm(tensor, ord='euclidean', axis=None, keep_dims=False, name=None): r"""Computes the norm of vectors, matrices, and tensors. This function can compute 3 different matrix norms (Frobenius, 1-norm, and inf-norm) and up to 9218868437227405311 different vectors norms. Args: tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128` ord: Order of the norm. Supported values are 'fro', 'euclidean', `0`, `1, `2`, `np.inf` and any positive real number yielding the corresponding p-norm. Default is 'euclidean' which is equivalent to Frobenius norm if `tensor` is a matrix and equivalent to 2-norm for vectors. Some restrictions apply, a) The Frobenius norm `fro` is not defined for vectors, b) If axis is a 2-tuple (matrix-norm), only 'euclidean', 'fro', `1`, `np.inf` are supported. See the description of `axis` on how to compute norms for a batch of vectors or matrices stored in a tensor. axis: If `axis` is `None` (the default), the input is considered a vector and a single vector norm is computed over the entire set of values in the tensor, i.e. `norm(tensor, ord=ord)` is equivalent to `norm(reshape(tensor, [-1]), ord=ord)`. If `axis` is a Python integer, the input is considered a batch of vectors, and `axis`t determines the axis in `tensor` over which to compute vector norms. If `axis` is a 2-tuple of Python integers it is considered a batch of matrices and `axis` determines the axes in `tensor` over which to compute a matrix norm. Negative indices are supported. Example: If you are passing a tensor that can be either a matrix or a batch of matrices at runtime, pass `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are computed. keep_dims: If True, the axis indicated in `axis` are kept with size 1. Otherwise, the dimensions in `axis` are removed from the output shape. name: The name of the op. Returns: output: A `Tensor` of the same type as tensor, containing the vector or matrix norms. If `keep_dims` is True then the rank of output is equal to the rank of `tensor`. Otherwise, if `axis` is none the output is a scalar, if `axis` is an integer, the rank of `output` is one less than the rank of `tensor`, if `axis` is a 2-tuple the rank of `output` is two less than the rank of `tensor`. Raises: ValueError: If `ord` or `axis` is invalid. @compatibility(numpy) Mostly equivalent to np.linalg.norm. Not supported: ord <= 0, 2-norm for matrices, nuclear norm. Other differences: a) If axis is `None`, treats the the flattened `tensor` as a vector regardless of rank. b) Explicitly supports 'euclidean' norm as the default, including for higher order tensors. @end_compatibility """ is_matrix_norm = ((isinstance(axis, tuple) or isinstance(axis, list)) and len(axis) == 2) if is_matrix_norm: axis = tuple(axis) if (not isinstance(axis[0], int) or not isinstance(axis[1], int) or axis[0] == axis[1]): raise ValueError( "'axis' must be None, an integer, or a tuple of 2 unique integers" ) # TODO(rmlarsen): Implement matrix 2-norm using tf.svd(). supported_matrix_norms = ['euclidean', 'fro', 1, np.inf] if ord not in supported_matrix_norms: raise ValueError( "'ord' must be a supported matrix norm in %s, got %s" % (supported_matrix_norms, ord)) else: if not (isinstance(axis, int) or axis is None): raise ValueError( "'axis' must be None, an integer, or a tuple of 2 unique integers" ) supported_vector_norms = ['euclidean', 1, 2, np.inf] if (not np.isreal(ord) or ord <= 0) and ord not in supported_vector_norms: raise ValueError("'ord' must be a supported vector norm, got %s" % ord) if axis is not None: axis = (axis, ) with ops.name_scope(name, 'norm', [tensor]): tensor = ops.convert_to_tensor(tensor) if ord in ['fro', 'euclidean', 2, 2.0]: # TODO(rmlarsen): Move 2-norm to a separate clause once we support it for # matrices. result = math_ops.sqrt( math_ops.reduce_sum(math_ops.square(tensor), axis, keep_dims=True)) else: result = math_ops.abs(tensor) if ord == 1: sum_axis = None if axis is None else axis[0] result = math_ops.reduce_sum(result, sum_axis, keep_dims=True) if is_matrix_norm: result = math_ops.reduce_max(result, axis[-1], keep_dims=True) elif ord == np.inf: if is_matrix_norm: result = math_ops.reduce_sum(result, axis[1], keep_dims=True) max_axis = None if axis is None else axis[0] result = math_ops.reduce_max(result, max_axis, keep_dims=True) else: # General p-norms (positive p only) result = math_ops.pow( math_ops.reduce_sum(math_ops.pow(result, ord), axis, keep_dims=True), 1.0 / ord) if not keep_dims: result = array_ops.squeeze(result, axis) return result
def _resource_apply_dense(self, grad, var): var_dtype = var.dtype.base_dtype lr_t = array_ops.identity(self._get_hyper('learning_rate', var_dtype)) beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype)) beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype)) epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype) m = self.get_slot(var, 'm') v = self.get_slot(var, 'v') local_step = math_ops.cast(self.iterations + 1, var_dtype) next_step = math_ops.cast(self.iterations + 2, var_dtype) decay_base = math_ops.cast(0.96, var_dtype) # Learning rate multipliers if self.lr_multipliers is not None: lr_t = _apply_lr_multiplier(self, lr_t, var) # print(lr_t) # Due to the recommendations in [2], i.e. warming momentum schedule momentum_cache_t = beta_1_t * (1. - 0.5 * ( math_ops.pow(decay_base, self._initial_decay * local_step))) momentum_cache_t_1 = beta_1_t * (1. - 0.5 * ( math_ops.pow(decay_base, self._initial_decay * next_step))) m_schedule_new = math_ops.cast(self._m_cache_read, var_dtype) * momentum_cache_t if var_dtype is self._m_cache.dtype: m_schedule_new = array_ops.identity(state_ops.assign( self._m_cache, m_schedule_new, use_locking=self._use_locking)) m_schedule_next = m_schedule_new * momentum_cache_t_1 # the following equations given in [1] g_prime = grad / (1. - m_schedule_new) m_t = beta_1_t * m + (1. - beta_1_t) * grad m_t_prime = m_t / (1. - m_schedule_next) v_t = beta_2_t * v + (1. - beta_2_t) * math_ops.square(grad) v_t_prime = v_t / (1. - math_ops.pow(beta_2_t, local_step)) m_t_bar = (1. - momentum_cache_t) * g_prime + ( momentum_cache_t * m_t_prime) m_t = state_ops.assign(m, m_t, use_locking=self._use_locking) v_t = state_ops.assign(v, v_t, use_locking=self._use_locking) var_t = math_ops.sub(var, self.eta_t * lr_t * m_t_bar / ( math_ops.sqrt(v_t_prime + epsilon_t))) # Weight decays if var.name in self.weight_decays.keys(): var_t = _apply_weight_decays(self, var, var_t) var_update = state_ops.assign(var, var_t, use_locking=self._use_locking) # Cosine annealing (iteration_done, t_cur_update, eta_t_update ) = _update_t_cur_eta_t_v2(self, lr_t, var) if iteration_done and not self._init_notified: self._init_notified = True updates = [var_update, m_t, v_t] if iteration_done: updates += [t_cur_update] if self.use_cosine_annealing and iteration_done: updates += [eta_t_update] return control_flow_ops.group(*updates)
def _batch_sqrt_solve(self, rhs): diag_mat = array_ops.expand_dims(self._diag, -1) return rhs / math_ops.sqrt(diag_mat)
def _batch_sqrt_matmul(self, x, transpose_x=False): if transpose_x: x = array_ops.matrix_transpose(x) diag_mat = array_ops.expand_dims(self._diag, -1) return math_ops.sqrt(diag_mat) * x
def embedding_lookup_sparse(params, sp_ids, sp_weights, partition_strategy="mod", name=None, combiner="mean"): """Computes embeddings for the given ids and weights. This op assumes that there is at least one id for each row in the dense tensor represented by sp_ids (i.e. there are no rows with empty features), and that all the indices of sp_ids are in canonical row-major order. It also assumes that all id values lie in the range [0, p0), where p0 is the sum of the size of params along dimension 0. Args: params: A single tensor representing the complete embedding tensor, or a list of P tensors all of same shape except for the first dimension, representing sharded embedding tensors. sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId), where N is typically batch size and M is arbitrary. sp_weights: either a SparseTensor of float / double weights, or None to indicate all weights should be taken to be 1. If specified, sp_weights must have exactly the same shape and indices as sp_ids. partition_strategy: A string specifying the partitioning strategy, relevant if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default is `"mod"`. See `tf.nn.embedding_lookup` for more details. name: Optional name for the op. combiner: A string specifying the reduction op. Currently "mean", "sqrtn" and "sum" are supported. "sum" computes the weighted sum of the embedding results for each row. "mean" is the weighted sum divided by the total weight. "sqrtn" is the weighted sum divided by the square root of the sum of the squares of the weights. Returns: A dense tensor representing the combined embeddings for the sparse ids. For each row in the dense tensor represented by sp_ids, the op looks up the embeddings for all ids in that row, multiplies them by the corresponding weight, and combines these embeddings as specified. In other words, if shape(combined params) = [p0, p1, ..., pm] and shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn] then shape(output) = [d0, d1, ..., dn-1, p1, ..., pm]. For instance, if params is a 10x20 matrix, and sp_ids / sp_weights are [0, 0]: id 1, weight 2.0 [0, 1]: id 3, weight 0.5 [1, 0]: id 0, weight 1.0 [2, 3]: id 1, weight 3.0 with combiner="mean", then the output will be a 3x20 matrix where output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5) output[1, :] = params[0, :] * 1.0 output[2, :] = params[1, :] * 3.0 Raises: TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither None nor SparseTensor. ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}. """ if combiner not in ("mean", "sqrtn", "sum"): raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'") if not isinstance(params, list): params = [params] if not isinstance(sp_ids, ops.SparseTensor): raise TypeError("sp_ids must be SparseTensor") ignore_weights = sp_weights is None if not ignore_weights: if not isinstance(sp_weights, ops.SparseTensor): raise TypeError("sp_weights must be either None or SparseTensor") sp_ids.values.get_shape().assert_is_compatible_with( sp_weights.values.get_shape()) sp_ids.indices.get_shape().assert_is_compatible_with( sp_weights.indices.get_shape()) sp_ids.shape.get_shape().assert_is_compatible_with( sp_weights.shape.get_shape()) # TODO(yleon): Add enhanced node assertions to verify that sp_ids and # sp_weights have equal indices and shapes. with ops.op_scope(params + [sp_ids], name, "embedding_lookup_sparse") as name: segment_ids = sp_ids.indices[:, 0] if segment_ids.dtype != dtypes.int32: segment_ids = math_ops.cast(segment_ids, dtypes.int32) ids = sp_ids.values if ignore_weights: ids, idx = array_ops.unique(ids) else: idx = None embeddings = embedding_lookup( params, ids, partition_strategy=partition_strategy) if not ignore_weights: weights = sp_weights.values if weights.dtype != embeddings.dtype: weights = math_ops.cast(weights, embeddings.dtype) # Reshape weights to allow broadcast ones = array_ops.fill( array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1) bcast_weights_shape = array_ops.concat(0, [ array_ops.shape(weights), ones]) orig_weights_shape = weights.get_shape() weights = array_ops.reshape(weights, bcast_weights_shape) # Set the weight shape, since after reshaping to bcast_weights_shape, # the shape becomes None. if embeddings.get_shape().ndims is not None: weights.set_shape(orig_weights_shape.concatenate( [1 for _ in range(embeddings.get_shape().ndims - 1)])) embeddings *= weights if combiner == "sum": embeddings = math_ops.segment_sum(embeddings, segment_ids, name=name) elif combiner == "mean": embeddings = math_ops.segment_sum(embeddings, segment_ids) weight_sum = math_ops.segment_sum(weights, segment_ids) embeddings = math_ops.div(embeddings, weight_sum, name=name) elif combiner == "sqrtn": embeddings = math_ops.segment_sum(embeddings, segment_ids) weights_squared = math_ops.pow(weights, 2) weight_sum = math_ops.segment_sum(weights_squared, segment_ids) weight_sum_sqrt = math_ops.sqrt(weight_sum) embeddings = math_ops.div(embeddings, weight_sum_sqrt, name=name) else: assert False, "Unrecognized combiner" else: assert idx is not None if combiner == "sum": embeddings = math_ops.sparse_segment_sum(embeddings, idx, segment_ids, name=name) elif combiner == "mean": embeddings = math_ops.sparse_segment_mean(embeddings, idx, segment_ids, name=name) elif combiner == "sqrtn": embeddings = math_ops.sparse_segment_sqrt_n(embeddings, idx, segment_ids, name=name) else: assert False, "Unrecognized combiner" return embeddings
def _std(self): return math_ops.sqrt(self.variance())
def exp_map(self, v, x): # eq (9), v is the gradient vnorm = math_ops.sqrt(self.lorentz_scalar_product(v, v)) return math_ops.cosh(vnorm) * x + math_ops.sinh(vnorm) * v / vnorm
def _mean(self): if self.cholesky_input_output_matrices: return (math_ops.sqrt(self.df) * self.scale_operator.to_dense()) return self.df * self._square_scale_operator()
def _apply_dense(self, grad, var): lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) g = [self.get_slot(var, "g%d" % i) for i in range(self._keep_num + 1)] v = self.get_slot(var, "v") z = self.get_slot(var, "z") b2p = self.get_slot(var, "b2p") if self._pred_g_op == 'none': v_t = state_ops.assign(v, v * beta2_t + tf.square(g[0]) * (1 - beta2_t), use_locking=self._use_locking) elif self._pred_g_op == 'max': v_t = state_ops.assign( v, v * beta2_t + tf.reduce_max(tf.square(g[0])) * (1 - beta2_t), use_locking=self._use_locking) elif self._pred_g_op == 'mean': v_t = state_ops.assign( v, v * beta2_t + tf.reduce_mean(tf.square(g[0])) * (1 - beta2_t), use_locking=self._use_locking) else: assert False with ops.control_dependencies([v_t]): g_t = state_ops.assign(g[-1], grad, use_locking=self._use_locking) for i in range(self._keep_num): with ops.control_dependencies([g_t]): g_t = state_ops.assign(g[i], g[i + 1], use_locking=self._use_locking) with ops.control_dependencies([g_t]): # m_t = tf.reduce_sum([g[-self._mov_num-1+i]*self.s[i] for i in range(self._mov_num)], axis=0) m_t = tf.reduce_sum( [g[-i - 2] * self.s[-i - 1] for i in range(self._mov_num)], axis=0) # m_t = tf.reduce_mean(g[:self._keep_num], axis=0) with ops.control_dependencies([v_t]): z_t = state_ops.assign( z, tf.cast(tf.logical_or(v_t > 0.0, z > 0.0), tf.float32)) b2p_t = state_ops.assign(b2p, b2p * beta2_t * tf.sign(z_t) + (1.0 - tf.sign(z_t)), use_locking=self._use_locking) b2_fix = tf.maximum(1e-8, 1.0 - b2p_t) step_t = z_t * m_t / (math_ops.sqrt(v_t / b2_fix) + epsilon_t) # if var.name == self.first_var.name: #'discriminator/final_linear/w:0': # idx = 0 # step_t = tf.Print(step_t, [z_t[idx]], 'z_t', summarize=1000) # step_t = tf.Print(step_t, [g[i][idx] for i in range(len(g))], 'g', summarize=1000) # step_t = tf.Print(step_t, [grad[idx]], 'grad', summarize=1000) # step_t = tf.Print(step_t, [b2p_t[idx]], 'b2p_t', summarize=1000) # step_t = tf.Print(step_t, [b2_fix], 'beta2_fix', summarize=1000) # step_t = tf.Print(step_t, [m_t[idx]], 'm_t', summarize=1000) # step_t = tf.Print(step_t, [tf.sqrt(v_t / b2_fix)[idx]], 'v_t', summarize=1000) # step_t = tf.Print(step_t, [step_t], 'step', summarize=1000) var_update = state_ops.assign_sub(var, lr_t * step_t, use_locking=self._use_locking) return control_flow_ops.group(*([var_update]))
def embedding_lookup_sparse( params, sp_ids, sp_weights, partition_strategy=None, # no used name="embedding_lookup_sparse", combiner="mean", max_norm=None, return_trainable=False, ): """Provides a dynamic version of embedding_lookup_sparse similar with tf.nn.embedding_lookup_sparse. This op assumes that there is at least one id for each row in the dense tensor represented by sp_ids (i.e. there are no rows with empty features), and that all the indices of sp_ids are in canonical row-major order. It also assumes that all id values lie in the range [0, p0), where p0 is the sum of the size of params along dimension 0. Args: params: A single `dynamic_embedding.Variable` instance representing the complete embedding tensor. sp_ids: N x M `SparseTensor` of int64 ids where N is typically batch size and M is arbitrary. sp_weights: either a `SparseTensor` of float / double weights, or `None` to indicate all weights should be taken to be 1. If specified, `sp_weights` must have exactly the same shape and indices as `sp_ids`. partition_strategy: No used. name: Optional name for the op. combiner: A string specifying the reduction op. Currently "mean", "sqrtn" and "sum" are supported. "sum" computes the weighted sum of the embedding results for each row. "mean" is the weighted sum divided by the total weight. "sqrtn" is the weighted sum divided by the square root of the sum of the squares of the weights. max_norm: If not `None`, each embedding is clipped if its l2-norm is larger than this value, before combining. return_trainable: optional, If True, also return TrainableWrapper create by `dynamic_embedding.embedding_lookup` Returns: combined_embeddings: A dense tensor representing the combined embeddings for the sparse ids. For each row in the dense tensor represented by `sp_ids`, the op looks up the embeddings for all ids in that row, multiplies them by the corresponding weight, and combines these embeddings as specified. In other words, if `shape(combined params) = [+infinity, dim]` and `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]` then `shape(output) = [d0, dim]`. For instance, if params dim=20, and sp_ids / sp_weights are ```python [0, 0]: id 1, weight 2.0 [0, 1]: id 3, weight 0.5 [1, 0]: id 0, weight 1.0 [2, 3]: id 1, weight 3.0 ``` with `combiner`="mean", then the output will be a 3x20 matrix where ```python output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5) output[1, :] = (params[0, :] * 1.0) / 1.0 output[2, :] = (params[1, :] * 3.0) / 3.0 ``` trainable_wrap: A TrainableWrapper object used to fill the Optimizers `var_list` Only provided if `return_trainable` is True. Raises: TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is neither `None` nor `SparseTensor`. ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}. """ if combiner not in ("mean", "sqrtn", "sum"): raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'") if not isinstance(sp_ids, sparse_tensor.SparseTensor): raise TypeError("sp_ids must be SparseTensor") ignore_weights = sp_weights is None if not ignore_weights: if not isinstance(sp_weights, sparse_tensor.SparseTensor): raise TypeError("sp_weights must be either None or SparseTensor") scope = variable_scope.get_variable_scope() full_name = scope.name + "/" + name if scope.name else name with ops.name_scope(full_name + "/"): segment_ids = sp_ids.indices[:, 0] if segment_ids.dtype != dtypes.int32: segment_ids = math_ops.cast(segment_ids, dtypes.int32) ids = sp_ids.values ids, idx = array_ops.unique(ids) embeddings, trainable_ = embedding_lookup( params, ids, name=name + "/embedding_lookup", partition_strategy=partition_strategy, max_norm=max_norm, return_trainable=True, ) if embeddings.dtype in (dtypes.float16, dtypes.bfloat16): embeddings = math_ops.cast(embeddings, dtypes.float32) if not ignore_weights: weights = sp_weights.values if weights.dtype != embeddings.dtype: weights = math_ops.cast(weights, embeddings.dtype) embeddings = array_ops.gather(embeddings, idx) # Reshape weights to allow broadcast ones = array_ops.fill( array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1) bcast_weights_shape = array_ops.concat( [array_ops.shape(weights), ones], 0) orig_weights_shape = weights.get_shape() weights = array_ops.reshape(weights, bcast_weights_shape) # Set the weight shape, since after reshaping to bcast_weights_shape, # the shape becomes None. if embeddings.get_shape().ndims is not None: weights.set_shape( orig_weights_shape.concatenate( [1 for _ in range(embeddings.get_shape().ndims - 1)])) embeddings *= weights if combiner == "sum": embeddings = math_ops.segment_sum(embeddings, segment_ids, name=name) elif combiner == "mean": embeddings = math_ops.segment_sum(embeddings, segment_ids) weight_sum = math_ops.segment_sum(weights, segment_ids) embeddings = math_ops.div(embeddings, weight_sum, name=name) elif combiner == "sqrtn": embeddings = math_ops.segment_sum(embeddings, segment_ids) weights_squared = math_ops.pow(weights, 2) weight_sum = math_ops.segment_sum(weights_squared, segment_ids) weight_sum_sqrt = math_ops.sqrt(weight_sum) embeddings = math_ops.div(embeddings, weight_sum_sqrt, name=name) else: assert False, "Unrecognized combiner" else: assert idx is not None if combiner == "sum": embeddings = math_ops.sparse_segment_sum(embeddings, idx, segment_ids, name=name) elif combiner == "mean": embeddings = math_ops.sparse_segment_mean(embeddings, idx, segment_ids, name=name) elif combiner == "sqrtn": embeddings = math_ops.sparse_segment_sqrt_n(embeddings, idx, segment_ids, name=name) else: assert False, "Unrecognized combiner" return (embeddings, trainable_) if return_trainable else embeddings
def std(self, name="std"): with ops.name_scope(self.name): with ops.op_scope([self.range()], name): return self.range() / math_ops.sqrt(12.)
def _stddev(self): return math_ops.sqrt(self.concentration) / self.rate
def moving_stddev_initializer(*args, **kwargs): return math_ops.sqrt( self.moving_variance_initializer(*args, **kwargs))
def diagonal_only_frechet_classifier_distance_from_activations( real_activations, generated_activations): """Classifier distance for evaluating a generative model. This is based on the Frechet Inception distance, but for an arbitrary classifier. This technique is described in detail in https://arxiv.org/abs/1706.08500. Given two Gaussian distribution with means m and m_w and covariance matrices C and C_w, this function calcuates |m - m_w|^2 + (sigma + sigma_w - 2(sigma x sigma_w)^(1/2)) which captures how different the distributions of real images and generated images (or more accurately, their visual features) are. Note that unlike the Inception score, this is a true distance and utilizes information about real world images. In this variant, we compute diagonal-only covariance matrices. As a result, instead of computing an expensive matrix square root, we can do something much simpler, and has O(n) vs O(n^2) space complexity. Note that when computed using sample means and sample covariance matrices, Frechet distance is biased. It is more biased for small sample sizes. (e.g. even if the two distributions are the same, for a small sample size, the expected Frechet distance is large). It is important to use the same sample size to compute frechet classifier distance when comparing two generative models. Args: real_activations: Real images to use to compute Frechet Inception distance. generated_activations: Generated images to use to compute Frechet Inception distance. Returns: The diagonal-only Frechet Inception distance. A floating-point scalar of the same type as the output of the activations. Raises: ValueError: If the shape of the variance and mean vectors are not equal. """ real_activations.shape.assert_has_rank(2) generated_activations.shape.assert_has_rank(2) activations_dtype = real_activations.dtype if activations_dtype != dtypes.float64: real_activations = math_ops.cast(real_activations, dtypes.float64) generated_activations = math_ops.cast(generated_activations, dtypes.float64) # Compute mean and covariance matrices of activations. m, var = nn_impl.moments(real_activations, axes=[0]) m_w, var_w = nn_impl.moments(generated_activations, axes=[0]) actual_shape = var.get_shape() expected_shape = m.get_shape() if actual_shape != expected_shape: raise ValueError('shape: {} must match expected shape: {}'.format( actual_shape, expected_shape)) # Compute the two components of FID. # First the covariance component. # Here, note that trace(A + B) = trace(A) + trace(B) trace = math_ops.reduce_sum((var + var_w) - 2.0 * math_ops.sqrt(math_ops.multiply(var, var_w))) # Next the distance between means. mean = math_ops.reduce_sum(math_ops.squared_difference( m, m_w)) # Equivalent to L2 but more stable. dofid = trace + mean if activations_dtype != dtypes.float64: dofid = math_ops.cast(dofid, activations_dtype) return dofid
def _renorm_correction_and_moments(self, mean, variance, training): """Returns the correction and update values for renorm.""" stddev = math_ops.sqrt(variance + self.epsilon) # Compute the average mean and standard deviation, as if they were # initialized with this batch's moments. mixed_renorm_mean = (self.renorm_mean + (1. - self.renorm_mean_weight) * mean) mixed_renorm_stddev = (self.renorm_stddev + (1. - self.renorm_stddev_weight) * stddev) # Compute the corrections for batch renorm. r = stddev / mixed_renorm_stddev d = (mean - mixed_renorm_mean) / mixed_renorm_stddev # Ensure the corrections use pre-update moving averages. with ops.control_dependencies([r, d]): mean = array_ops.identity(mean) stddev = array_ops.identity(stddev) rmin, rmax, dmax = [ self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax'] ] if rmin is not None: r = math_ops.maximum(r, rmin) if rmax is not None: r = math_ops.minimum(r, rmax) if dmax is not None: d = math_ops.maximum(d, -dmax) d = math_ops.minimum(d, dmax) # When not training, use r=1, d=0, and decay=1 meaning no updates. r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r)) d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d)) decay = _smart_select(training, lambda: self.renorm_momentum, lambda: 1.) def _update_renorm_variable(var, weight, value): """Updates a moving average and weight, returns the unbiased value.""" # Update the variables without zero debiasing. The debiasing will be # accomplished by dividing the exponential moving average by the weight. # For example, after a single update, the moving average would be # (1-decay) * value. and the weight will be 1-decay, with their ratio # giving value. # Make sure the weight is not updated until before r and d computation. value = array_ops.identity(value) with ops.control_dependencies([value]): weight_value = array_ops.constant(1., dtype=weight.dtype) new_var = moving_averages.assign_moving_average(var, value, decay, zero_debias=False) new_weight = moving_averages.assign_moving_average( weight, weight_value, decay, zero_debias=False) return new_var / new_weight with ops.colocate_with(self.moving_mean): new_mean = _update_renorm_variable(self.renorm_mean, self.renorm_mean_weight, mean) with ops.colocate_with(self.moving_variance): new_stddev = _update_renorm_variable(self.renorm_stddev, self.renorm_stddev_weight, stddev) # Make sqrt(moving_variance + epsilon) = new_stddev. new_variance = math_ops.square(new_stddev) - self.epsilon return (r, d, new_mean, new_variance)
def _update_clip_coeff(self, grads_and_vars, precon_grads_and_vars): sq_norm_grad = self._squared_fisher_norm(grads_and_vars, precon_grads_and_vars) sq_norm_up = sq_norm_grad * self._learning_rate**2 return math_ops.minimum( 1., math_ops.sqrt(self._norm_constraint / sq_norm_up))
def kernel_classifier_distance_and_std_from_activations( real_activations, generated_activations, max_block_size=1024, dtype=None): """Kernel "classifier" distance for evaluating a generative model. This methods computes the kernel classifier distance from activations of real images and generated images. This can be used independently of the kernel_classifier_distance() method, especially in the case of using large batches during evaluation where we would like to precompute all of the activations before computing the classifier distance, or if we want to compute multiple metrics based on the same images. It also returns a rough estimate of the standard error of the estimator. This technique is described in detail in https://arxiv.org/abs/1801.01401. Given two distributions P and Q of activations, this function calculates E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')] - 2 E_{X ~ P, Y ~ Q}[k(X, Y)] where k is the polynomial kernel k(x, y) = ( x^T y / dimension + 1 )^3. This captures how different the distributions of real and generated images' visual features are. Like the Frechet distance (and unlike the Inception score), this is a true distance and incorporates information about the target images. Unlike the Frechet score, this function computes an *unbiased* and asymptotically normal estimator, which makes comparing estimates across models much more intuitive. The estimator used takes time quadratic in max_block_size. Larger values of max_block_size will decrease the variance of the estimator but increase the computational cost. This differs slightly from the estimator used by the original paper; it is the block estimator of https://arxiv.org/abs/1307.1954. The estimate of the standard error will also be more reliable when there are more blocks, i.e. when max_block_size is smaller. NOTE: the blocking code assumes that real_activations and generated_activations are both in random order. If either is sorted in a meaningful order, the estimator will behave poorly. Args: real_activations: 2D Tensor containing activations of real data. Shape is [batch_size, activation_size]. generated_activations: 2D Tensor containing activations of generated data. Shape is [batch_size, activation_size]. max_block_size: integer, default 1024. The distance estimator splits samples into blocks for computational efficiency. Larger values are more computationally expensive but decrease the variance of the distance estimate. Having a smaller block size also gives a better estimate of the standard error. dtype: If not None, coerce activations to this dtype before computations. Returns: The Kernel Inception Distance. A floating-point scalar of the same type as the output of the activations. An estimate of the standard error of the distance estimator (a scalar of the same type). """ real_activations.shape.assert_has_rank(2) generated_activations.shape.assert_has_rank(2) real_activations.shape[1].assert_is_compatible_with( generated_activations.shape[1]) if dtype is None: dtype = real_activations.dtype assert generated_activations.dtype == dtype else: real_activations = math_ops.cast(real_activations, dtype) generated_activations = math_ops.cast(generated_activations, dtype) # Figure out how to split the activations into blocks of approximately # equal size, with none larger than max_block_size. n_r = array_ops.shape(real_activations)[0] n_g = array_ops.shape(generated_activations)[0] n_bigger = math_ops.maximum(n_r, n_g) n_blocks = math_ops.cast(math_ops.ceil(n_bigger / max_block_size), dtypes.int32) v_r = n_r // n_blocks v_g = n_g // n_blocks n_plusone_r = n_r - v_r * n_blocks n_plusone_g = n_g - v_g * n_blocks sizes_r = array_ops.concat([ array_ops.fill([n_blocks - n_plusone_r], v_r), array_ops.fill([n_plusone_r], v_r + 1), ], 0) sizes_g = array_ops.concat([ array_ops.fill([n_blocks - n_plusone_g], v_g), array_ops.fill([n_plusone_g], v_g + 1), ], 0) zero = array_ops.zeros([1], dtype=dtypes.int32) inds_r = array_ops.concat([zero, math_ops.cumsum(sizes_r)], 0) inds_g = array_ops.concat([zero, math_ops.cumsum(sizes_g)], 0) dim = math_ops.cast(real_activations.shape[1], dtype) def compute_kid_block(i): """Computes the ith block of the KID estimate.""" r_s = inds_r[i] r_e = inds_r[i + 1] r = real_activations[r_s:r_e] m = math_ops.cast(r_e - r_s, dtype) g_s = inds_g[i] g_e = inds_g[i + 1] g = generated_activations[g_s:g_e] n = math_ops.cast(g_e - g_s, dtype) k_rr = (math_ops.matmul(r, r, transpose_b=True) / dim + 1)**3 k_rg = (math_ops.matmul(r, g, transpose_b=True) / dim + 1)**3 k_gg = (math_ops.matmul(g, g, transpose_b=True) / dim + 1)**3 return (-2 * math_ops.reduce_mean(k_rg) + (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) / (m * (m - 1)) + (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) / (n * (n - 1))) ests = map_fn.map_fn(compute_kid_block, math_ops.range(n_blocks), dtype=dtype, back_prop=False) mn = math_ops.reduce_mean(ests) # nn_impl.moments doesn't use the Bessel correction, which we want here n_blocks_ = math_ops.cast(n_blocks, dtype) var = control_flow_ops.cond( math_ops.less_equal(n_blocks, 1), lambda: array_ops.constant(float('nan'), dtype=dtype), lambda: math_ops.reduce_sum(math_ops.square(ests - mn)) / (n_blocks_ - 1)) return mn, math_ops.sqrt(var / n_blocks_)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None): var_dtype = var.dtype.base_dtype lr_t = array_ops.identity(self._get_hyper('learning_rate', var_dtype)) beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype)) beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype)) epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype) m = self.get_slot(var, 'm') v = self.get_slot(var, 'v') local_step = math_ops.cast(self.iterations + 1, var_dtype) next_step = math_ops.cast(self.iterations + 2, var_dtype) decay_base = math_ops.cast(0.96, var_dtype) # Learning rate multipliers if self.lr_multipliers is not None: lr_t = _apply_lr_multiplier(self, lr_t, var) momentum_cache_t = beta_1_t * (1. - 0.5 * ( math_ops.pow(decay_base, self._initial_decay * local_step))) momentum_cache_t_1 = beta_1_t * (1. - 0.5 * ( math_ops.pow(decay_base, self._initial_decay * next_step))) m_schedule_new = math_ops.cast(self._m_cache_read, var_dtype) * momentum_cache_t if var_dtype is self._m_cache.dtype: m_schedule_new = array_ops.identity(state_ops.assign( self._m_cache, m_schedule_new, use_locking=self._use_locking)) m_schedule_next = m_schedule_new * momentum_cache_t_1 m_scaled_g_values = grad * (1. - beta_1_t) m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking) with ops.control_dependencies([m_t]): m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) m_t_slice = array_ops.gather(m_t, indices) m_t_prime = m_t_slice / (1. - m_schedule_next) g_prime = grad / (1. - m_schedule_new) m_t_bar = (1. - momentum_cache_t) * g_prime + ( momentum_cache_t_1 * m_t_prime) v_scaled_g_values = (grad * grad) * (1. - beta_2_t) v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking) with ops.control_dependencies([v_t]): v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) v_t_slice = array_ops.gather(v_t, indices) v_t_prime_denominator = 1. - math_ops.pow(beta_2_t, local_step) v_t_prime = v_t_slice / v_t_prime_denominator v_prime_sqrt_plus_eps = math_ops.sqrt(v_t_prime) + epsilon_t var_t = self._resource_scatter_add( var, indices, -self.eta_t * lr_t * m_t_bar / v_prime_sqrt_plus_eps) # Weight decays if var.name in self.weight_decays.keys(): var_t = _apply_weight_decays(self, var, var_t) var_update = state_ops.assign(var, var_t, use_locking=self._use_locking) # Cosine annealing (iteration_done, t_cur_update, eta_t_update ) = _update_t_cur_eta_t_v2(self, lr_t, var) if iteration_done and not self._init_notified: self._init_notified = True updates = [var_update, m_t_bar, v_t] if iteration_done: updates += [t_cur_update] if self.use_cosine_annealing and iteration_done: updates += [eta_t_update] return control_flow_ops.group(*updates)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None): var_device, var_dtype = var.device, var.dtype.base_dtype coefficients = ((apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype)) rms = self.get_slot(var, "rms") if self._momentum: mom = self.get_slot(var, "momentum") if self.centered: mg = self.get_slot(var, "mg") return gen_training_ops.ResourceSparseApplyCenteredRMSProp( var=var.handle, mg=mg.handle, ms=rms.handle, mom=mom.handle, lr=coefficients["lr_t"], rho=coefficients["rho"], momentum=coefficients["momentum"], epsilon=coefficients["epsilon"], grad=grad, indices=indices, use_locking=self._use_locking) else: return gen_training_ops.ResourceSparseApplyRMSProp( var=var.handle, ms=rms.handle, mom=mom.handle, lr=coefficients["lr_t"], rho=coefficients["rho"], momentum=coefficients["momentum"], epsilon=coefficients["epsilon"], grad=grad, indices=indices, use_locking=self._use_locking) else: rms_scaled_g_values = (grad * grad) * coefficients["one_minus_rho"] rms_t = state_ops.assign(rms, rms * coefficients["rho"], use_locking=self._use_locking) with ops.control_dependencies([rms_t]): rms_t = self._resource_scatter_add(rms, indices, rms_scaled_g_values) rms_slice = array_ops.gather(rms_t, indices) denom_slice = rms_slice if self.centered: mg = self.get_slot(var, "mg") mg_scaled_g_values = grad * coefficients["one_minus_rho"] mg_t = state_ops.assign(mg, mg * coefficients["rho"], use_locking=self._use_locking) with ops.control_dependencies([mg_t]): mg_t = self._resource_scatter_add(mg, indices, mg_scaled_g_values) mg_slice = array_ops.gather(mg_t, indices) denom_slice = rms_slice - math_ops.square(mg_slice) var_update = self._resource_scatter_add( var, indices, coefficients["neg_lr_t"] * grad / (math_ops.sqrt(denom_slice) + coefficients["epsilon"])) if self.centered: return control_flow_ops.group(*[var_update, rms_t, mg_t]) return control_flow_ops.group(*[var_update, rms_t])
def wasserstein_gradient_penalty( real_data, generated_data, generator_inputs, discriminator_fn, discriminator_scope, epsilon=1e-10, target=1.0, one_sided=False, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, add_summaries=False): """The gradient penalty for the Wasserstein discriminator loss. See `Improved Training of Wasserstein GANs` (https://arxiv.org/abs/1704.00028) for more details. Args: real_data: Real data. generated_data: Output of the generator. generator_inputs: Exact argument to pass to the generator, which is used as optional conditioning to the discriminator. discriminator_fn: A discriminator function that conforms to TFGAN API. discriminator_scope: If not `None`, reuse discriminators from this scope. epsilon: A small positive number added for numerical stability when computing the gradient norm. target: Optional Python number or `Tensor` indicating the target value of gradient norm. Defaults to 1.0. one_sided: If `True`, penalty proposed in https://arxiv.org/abs/1709.08894 is used. Defaults to `False`. weights: Optional `Tensor` whose rank is either 0, or the same rank as `real_data` and `generated_data`, and must be broadcastable to them (i.e., all dimensions must be either `1`, or the same as the corresponding dimension). scope: The scope for the operations performed in computing the loss. loss_collection: collection to which this loss will be added. reduction: A `tf.losses.Reduction` to apply to loss. add_summaries: Whether or not to add summaries for the loss. Returns: A loss Tensor. The shape depends on `reduction`. Raises: ValueError: If the rank of data Tensors is unknown. """ with ops.name_scope(scope, 'wasserstein_gradient_penalty', (real_data, generated_data)) as scope: real_data = ops.convert_to_tensor(real_data) generated_data = ops.convert_to_tensor(generated_data) if real_data.shape.ndims is None: raise ValueError('`real_data` can\'t have unknown rank.') if generated_data.shape.ndims is None: raise ValueError('`generated_data` can\'t have unknown rank.') differences = generated_data - real_data batch_size = differences.shape[0].value or array_ops.shape( differences)[0] alpha_shape = [batch_size] + [1] * (differences.shape.ndims - 1) alpha = random_ops.random_uniform(shape=alpha_shape) interpolates = real_data + (alpha * differences) with ops.name_scope( None): # Clear scope so update ops are added properly. # Reuse variables if variables already exists. with variable_scope.variable_scope( discriminator_scope, 'gpenalty_dscope', reuse=variable_scope.AUTO_REUSE): disc_interpolates = discriminator_fn(interpolates, generator_inputs) if isinstance(disc_interpolates, tuple): # ACGAN case: disc outputs more than one tensor disc_interpolates = disc_interpolates[0] gradients = gradients_impl.gradients(disc_interpolates, interpolates)[0] gradient_squares = math_ops.reduce_sum( math_ops.square(gradients), axis=list(range(1, gradients.shape.ndims))) # Propagate shape information, if possible. if isinstance(batch_size, int): gradient_squares.set_shape([batch_size] + gradient_squares.shape.as_list()[1:]) # For numerical stability, add epsilon to the sum before taking the square # root. Note tf.norm does not add epsilon. slopes = math_ops.sqrt(gradient_squares + epsilon) penalties = slopes / target - 1.0 if one_sided: penalties = math_ops.maximum(0., penalties) penalties_squared = math_ops.square(penalties) penalty = losses.compute_weighted_loss(penalties_squared, weights, scope=scope, loss_collection=loss_collection, reduction=reduction) if add_summaries: summary.scalar('gradient_penalty_loss', penalty) return penalty
def std(self, name="std"): """Standard deviation of the distribution.""" with ops.name_scope(self.name): with ops.name_scope(name, values=[self._n, self._p]): return math_ops.sqrt(self.variance())
def _sqrt_to_dense(self): return array_ops.matrix_diag(math_ops.sqrt(self._diag))
def embedding_lookup_sparse_with_distributed_aggregation( params, sp_ids, sp_weights, partition_strategy="mod", name=None, combiner=None, max_norm=None): """Computes embeddings for the given ids and weights. Embeddings belonging to same param are aggregated on that device first. This op is intended to decrease data transmission and improve parallelism. See `tf.nn.embedding_lookup_sparse` for the functionality and example of this op. Args: params: A single tensor representing the complete embedding tensor, or a list of P tensors all of same shape except for the first dimension, representing sharded embedding tensors. Alternatively, a `PartitionedVariable`, created by partitioning along dimension 0. Each element must be appropriately sized for the given `partition_strategy`. sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId), where N is typically batch size and M is arbitrary. sp_weights: either a SparseTensor of float / double weights, or None to indicate all weights should be taken to be 1. If specified, sp_weights must have exactly the same shape and indices as sp_ids. partition_strategy: A string specifying the partitioning strategy, relevant if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default is `"mod"`. See `tf.nn.embedding_lookup` for more details. name: Optional name for the op. combiner: A string specifying the reduction op. Currently "mean", "sqrtn" and "sum" are supported. "sum" computes the weighted sum of the embedding results for each row. "mean" is the weighted sum divided by the total weight. "sqrtn" is the weighted sum divided by the square root of the sum of the squares of the weights. max_norm: If not None, each embedding is normalized to have l2 norm equal to max_norm before combining. Returns: A dense tensor representing the combined embeddings for the sparse ids. For each row in the dense tensor represented by sp_ids, the op looks up the embeddings for all ids in that row, multiplies them by the corresponding weight, and combines these embeddings as specified. Raises: TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither None nor SparseTensor. ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}. """ if combiner is None: logging.warn("The default value of combiner will change from \"mean\" " "to \"sqrtn\" after 2016/11/01.") combiner = "mean" if combiner not in ("mean", "sqrtn", "sum"): raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'") if isinstance(params, variables.PartitionedVariable): params = list(params) # Iterate to get the underlying Variables. if not isinstance(params, list): params = [params] if not isinstance(sp_ids, sparse_tensor.SparseTensor): raise TypeError("sp_ids must be SparseTensor") ignore_weights = sp_weights is None if not ignore_weights: if not isinstance(sp_weights, sparse_tensor.SparseTensor): raise TypeError("sp_weights must be either None or SparseTensor") sp_ids.values.get_shape().assert_is_compatible_with( sp_weights.values.get_shape()) sp_ids.indices.get_shape().assert_is_compatible_with( sp_weights.indices.get_shape()) sp_ids.dense_shape.get_shape().assert_is_compatible_with( sp_weights.dense_shape.get_shape()) # TODO(yleon): Add enhanced node assertions to verify that sp_ids and # sp_weights have equal indices and shapes. with ops.name_scope(name, "embedding_lookup_sparse", params + [sp_ids]) as name: segment_ids = sp_ids.indices[:, 0] if segment_ids.dtype != dtypes.int32: segment_ids = math_ops.cast(segment_ids, dtypes.int32) ids = sp_ids.values if ignore_weights: ids, idx = array_ops.unique(ids) else: idx = None weights = None if ignore_weights else sp_weights.values embeddings = _embedding_lookup_with_distributed_aggregation( params, ids, partition_strategy=partition_strategy, max_norm=max_norm, weights=weights, idx=idx, segment_ids=segment_ids) # Set weights to all one if ignore weights. if ignore_weights: weights = array_ops.fill([array_ops.shape(segment_ids)[0]], 1) if weights.dtype != embeddings.dtype: weights = math_ops.cast(weights, embeddings.dtype) # Reshape weights. ones = array_ops.fill( array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1) bcast_weights_shape = array_ops.concat( [array_ops.shape(weights), ones], 0) orig_weights_shape = weights.get_shape() weights = array_ops.reshape(weights, bcast_weights_shape) if embeddings.get_shape().ndims is not None: weights.set_shape( orig_weights_shape.concatenate( [1 for _ in range(embeddings.get_shape().ndims - 1)])) if combiner == "mean": weight_sum = math_ops.segment_sum(weights, segment_ids) embeddings = math_ops.div(embeddings, weight_sum) elif combiner == "sqrtn": weights_squared = math_ops.pow(weights, 2) weight_sum = math_ops.segment_sum(weights_squared, segment_ids) weight_sum_sqrt = math_ops.sqrt(weight_sum) embeddings = math_ops.div(embeddings, weight_sum_sqrt) elif combiner != "sum": assert False, "Unrecognized combiner" return embeddings