def testConcurrentExecutesWithoutError(self):
  with self.test_session(use_gpu=True) as sess:
    all_ops = []
    for compute_uv_ in True, False:
      for full_matrices_ in True, False:
        matrix1 = random_ops.random_normal([5, 5], seed=42)
        matrix2 = random_ops.random_normal([5, 5], seed=42)
        if compute_uv_:
          s1, u1, v1 = linalg_ops.svd(
              matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_)
          s2, u2, v2 = linalg_ops.svd(
              matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_)
          all_ops += [s1, u1, v1, s2, u2, v2]
        else:
          s1 = linalg_ops.svd(
              matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_)
          s2 = linalg_ops.svd(
              matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_)
          all_ops += [s1, s2]
    val = sess.run(all_ops)
    for i in range(2):
      s = 6 * i
      self.assertAllEqual(val[s], val[s + 3])  # s1 == s2
      self.assertAllEqual(val[s + 1], val[s + 4])  # u1 == u2
      self.assertAllEqual(val[s + 2], val[s + 5])  # v1 == v2
    for i in range(2):
      s = 12 + 2 * i
      self.assertAllEqual(val[s], val[s + 1])  # s1 == s2
def testExecuteMultipleWithoutError(self):
  all_ops = []
  shape = [6, 5]
  seed = [42, 24]
  for compute_uv_ in True, False:
    for full_matrices_ in True, False:
      matrix1 = stateless_random_ops.stateless_random_normal(shape, seed)
      matrix2 = stateless_random_ops.stateless_random_normal(shape, seed)
      self.assertAllEqual(matrix1, matrix2)
      if compute_uv_:
        s1, u1, v1 = linalg_ops.svd(
            matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_)
        s2, u2, v2 = linalg_ops.svd(
            matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_)
        all_ops += [s1, s2, u1, u2, v1, v2]
      else:
        s1 = linalg_ops.svd(
            matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_)
        s2 = linalg_ops.svd(
            matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_)
        all_ops += [s1, s2]
  val = self.evaluate(all_ops)
  for i in range(0, len(val), 2):
    self.assertAllEqual(val[i], val[i + 1])
def testSpectralNormalize(self):
  weights = variable_scope.get_variable(
      'w', dtype=dtypes.float32, shape=[2, 3, 50, 100])
  weights = math_ops.multiply(weights, 10.0)
  normalized_weights = spectral_normalization.spectral_normalize(
      weights, power_iteration_rounds=1)

  unnormalized_sigma = linalg_ops.svd(
      array_ops.reshape(weights, [-1, weights.shape[-1]]),
      compute_uv=False)[..., 0]
  normalized_sigma = linalg_ops.svd(
      array_ops.reshape(normalized_weights, [-1, weights.shape[-1]]),
      compute_uv=False)[..., 0]

  with self.cached_session() as sess:
    sess.run(variables.global_variables_initializer())
    s0 = sess.run(unnormalized_sigma)

    for i in range(50):
      sigma = sess.run(normalized_sigma)
      if i < 1:
        s1 = sigma
      if i < 5:
        s5 = sigma
      if i < 10:
        s10 = sigma
      s50 = sigma

    self.assertAlmostEqual(1., s50, 0)
    self.assertGreater(abs(s10 - 1.), abs(s50 - 1.))
    self.assertGreater(abs(s5 - 1.), abs(s10 - 1.))
    self.assertGreater(abs(s1 - 1.), abs(s5 - 1.))
    self.assertGreater(abs(s0 - 1.), abs(s1 - 1.))
def benchmarkSVDOp(self):
  for shape_ in self.shapes:
    with ops.Graph().as_default(), \
        session.Session(config=benchmark.benchmark_config()) as sess, \
        ops.device("/cpu:0"):
      matrix_value = np.random.uniform(
          low=-1.0, high=1.0, size=shape_).astype(np.float32)
      matrix = variables.Variable(matrix_value)
      u, s, v = linalg_ops.svd(matrix)
      variables.global_variables_initializer().run()
      self.run_op_benchmark(
          sess,
          control_flow_ops.group(u, s, v),
          min_iters=25,
          name="SVD_cpu_{shape}".format(shape=shape_))

    if test.is_gpu_available(True):
      with ops.Graph().as_default(), \
          session.Session(config=benchmark.benchmark_config()) as sess, \
          ops.device("/device:GPU:0"):
        matrix_value = np.random.uniform(
            low=-1.0, high=1.0, size=shape_).astype(np.float32)
        matrix = variables.Variable(matrix_value)
        u, s, v = linalg_ops.svd(matrix)
        variables.global_variables_initializer().run()
        self.run_op_benchmark(
            sess,
            control_flow_ops.group(u, s, v),
            min_iters=25,
            name="SVD_gpu_{shape}".format(shape=shape_))
def testConcurrentExecutesWithoutError(self):
  with self.session(use_gpu=True) as sess:
    all_ops = []
    for compute_uv_ in True, False:
      for full_matrices_ in True, False:
        matrix1 = random_ops.random_normal([5, 5], seed=42)
        matrix2 = random_ops.random_normal([5, 5], seed=42)
        if compute_uv_:
          s1, u1, v1 = linalg_ops.svd(
              matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_)
          s2, u2, v2 = linalg_ops.svd(
              matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_)
          all_ops += [s1, u1, v1, s2, u2, v2]
        else:
          s1 = linalg_ops.svd(
              matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_)
          s2 = linalg_ops.svd(
              matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_)
          all_ops += [s1, s2]
    val = self.evaluate(all_ops)
    for i in range(2):
      s = 6 * i
      self.assertAllEqual(val[s], val[s + 3])  # s1 == s2
      self.assertAllEqual(val[s + 1], val[s + 4])  # u1 == u2
      self.assertAllEqual(val[s + 2], val[s + 5])  # v1 == v2
    for i in range(2):
      s = 12 + 2 * i
      self.assertAllEqual(val[s], val[s + 1])  # s1 == s2
def Test(self):
  if not use_static_shape_ and context.executing_eagerly():
    return
  is_complex = dtype_ in (np.complex64, np.complex128)
  is_single = dtype_ in (np.float32, np.complex64)
  tol = 3e-4 if is_single else 1e-12
  if test.is_gpu_available():
    # The gpu version returns results that are much less accurate.
    tol *= 100
  np.random.seed(42)
  x_np = np.random.uniform(
      low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_)
  if is_complex:
    x_np += 1j * np.random.uniform(
        low=-1.0, high=1.0,
        size=np.prod(shape_)).reshape(shape_).astype(dtype_)

  if use_static_shape_:
    x_tf = constant_op.constant(x_np)
  else:
    x_tf = array_ops.placeholder(dtype_)

  if compute_uv_:
    s_tf, u_tf, v_tf = linalg_ops.svd(
        x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_)
    if use_static_shape_:
      s_tf_val, u_tf_val, v_tf_val = self.evaluate([s_tf, u_tf, v_tf])
    else:
      with self.session(use_gpu=True) as sess:
        s_tf_val, u_tf_val, v_tf_val = sess.run(
            [s_tf, u_tf, v_tf], feed_dict={x_tf: x_np})
  else:
    s_tf = linalg_ops.svd(
        x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_)
    if use_static_shape_:
      s_tf_val = self.evaluate(s_tf)
    else:
      with self.session(use_gpu=True) as sess:
        s_tf_val = sess.run(s_tf, feed_dict={x_tf: x_np})

  if compute_uv_:
    u_np, s_np, v_np = np.linalg.svd(
        x_np, compute_uv=compute_uv_, full_matrices=full_matrices_)
  else:
    s_np = np.linalg.svd(
        x_np, compute_uv=compute_uv_, full_matrices=full_matrices_)
  # We explicitly avoid the situation where numpy eliminates a first
  # dimension that is equal to one.
  s_np = np.reshape(s_np, s_tf_val.shape)

  CompareSingularValues(self, s_np, s_tf_val, tol)
  if compute_uv_:
    CompareSingularVectors(self, u_np, u_tf_val, min(shape_[-2:]), tol)
    CompareSingularVectors(self, np.conj(np.swapaxes(v_np, -2, -1)), v_tf_val,
                           min(shape_[-2:]), tol)
    CheckApproximation(self, x_np, u_tf_val, s_tf_val, v_tf_val,
                       full_matrices_, tol)
    CheckUnitary(self, u_tf_val, tol)
    CheckUnitary(self, v_tf_val, tol)
def Test(self):
  np.random.seed(1)
  x_np = np.random.uniform(
      low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_)
  if is_complex:
    x_np += 1j * np.random.uniform(
        low=-1.0, high=1.0,
        size=np.prod(shape_)).reshape(shape_).astype(dtype_)

  for compute_uv in False, True:
    for full_matrices in False, True:
      with self.test_session(use_gpu=use_gpu_) as sess:
        if use_static_shape_:
          x_tf = constant_op.constant(x_np)
        else:
          x_tf = array_ops.placeholder(dtype_)

        if compute_uv:
          s_tf, u_tf, v_tf = linalg_ops.svd(
              x_tf, compute_uv=compute_uv, full_matrices=full_matrices)
          if use_static_shape_:
            s_tf_val, u_tf_val, v_tf_val = sess.run([s_tf, u_tf, v_tf])
          else:
            s_tf_val, u_tf_val, v_tf_val = sess.run(
                [s_tf, u_tf, v_tf], feed_dict={x_tf: x_np})
        else:
          s_tf = linalg_ops.svd(
              x_tf, compute_uv=compute_uv, full_matrices=full_matrices)
          if use_static_shape_:
            s_tf_val = sess.run(s_tf)
          else:
            s_tf_val = sess.run(s_tf, feed_dict={x_tf: x_np})

        if compute_uv:
          u_np, s_np, v_np = np.linalg.svd(
              x_np, compute_uv=compute_uv, full_matrices=full_matrices)
        else:
          s_np = np.linalg.svd(
              x_np, compute_uv=compute_uv, full_matrices=full_matrices)
        # We explicitly avoid the situation where numpy eliminates a first
        # dimension that is equal to one.
        s_np = np.reshape(s_np, s_tf_val.shape)

        CompareSingularValues(self, s_np, s_tf_val)
        if compute_uv:
          CompareSingularVectors(self, u_np, u_tf_val, min(shape_[-2:]))
          CompareSingularVectors(self, np.conj(np.swapaxes(v_np, -2, -1)),
                                 v_tf_val, min(shape_[-2:]))
          CheckApproximation(self, x_np, u_tf_val, s_tf_val, v_tf_val,
                             full_matrices)
          CheckUnitary(self, u_tf_val)
          CheckUnitary(self, v_tf_val)
def Test(self):
  is_complex = dtype_ in (np.complex64, np.complex128)
  is_single = dtype_ in (np.float32, np.complex64)
  tol = 3e-4 if is_single else 1e-12
  if test.is_gpu_available():
    # The gpu version returns results that are much less accurate.
    tol *= 100
  np.random.seed(42)
  x_np = np.random.uniform(
      low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_)
  if is_complex:
    x_np += 1j * np.random.uniform(
        low=-1.0, high=1.0,
        size=np.prod(shape_)).reshape(shape_).astype(dtype_)

  with self.test_session(use_gpu=True) as sess:
    if use_static_shape_:
      x_tf = constant_op.constant(x_np)
    else:
      x_tf = array_ops.placeholder(dtype_)

    if compute_uv_:
      s_tf, u_tf, v_tf = linalg_ops.svd(
          x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_)
      if use_static_shape_:
        s_tf_val, u_tf_val, v_tf_val = sess.run([s_tf, u_tf, v_tf])
      else:
        s_tf_val, u_tf_val, v_tf_val = sess.run(
            [s_tf, u_tf, v_tf], feed_dict={x_tf: x_np})
    else:
      s_tf = linalg_ops.svd(
          x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_)
      if use_static_shape_:
        s_tf_val = sess.run(s_tf)
      else:
        s_tf_val = sess.run(s_tf, feed_dict={x_tf: x_np})

    if compute_uv_:
      u_np, s_np, v_np = np.linalg.svd(
          x_np, compute_uv=compute_uv_, full_matrices=full_matrices_)
    else:
      s_np = np.linalg.svd(
          x_np, compute_uv=compute_uv_, full_matrices=full_matrices_)
    # We explicitly avoid the situation where numpy eliminates a first
    # dimension that is equal to one.
    s_np = np.reshape(s_np, s_tf_val.shape)

    CompareSingularValues(self, s_np, s_tf_val, tol)
    if compute_uv_:
      CompareSingularVectors(self, u_np, u_tf_val, min(shape_[-2:]), tol)
      CompareSingularVectors(self, np.conj(np.swapaxes(v_np, -2, -1)),
                             v_tf_val, min(shape_[-2:]), tol)
      CheckApproximation(self, x_np, u_tf_val, s_tf_val, v_tf_val,
                         full_matrices_, tol)
      CheckUnitary(self, u_tf_val, tol)
      CheckUnitary(self, v_tf_val, tol)
def testWrongDimensions(self):
  # The input to svd should be a tensor of at least rank 2.
  scalar = constant_op.constant(1.)
  with self.assertRaisesRegexp(ValueError,
                               "Shape must be at least rank 2 but is rank 0"):
    linalg_ops.svd(scalar)
  vector = constant_op.constant([1., 2.])
  with self.assertRaisesRegexp(ValueError,
                               "Shape must be at least rank 2 but is rank 1"):
    linalg_ops.svd(vector)
def testWrongDimensions(self):
  # The input to svd should be a tensor of at least rank 2.
  scalar = constant_op.constant(1.)
  with self.assertRaisesRegex(
      (ValueError, errors_impl.InvalidArgumentError), "rank.* 2.*0"):
    linalg_ops.svd(scalar)
  vector = constant_op.constant([1., 2.])
  with self.assertRaisesRegex(
      (ValueError, errors_impl.InvalidArgumentError), "rank.* 2.*1"):
    linalg_ops.svd(vector)
def _initializer(shape, dtype=_assert_float_dtype(dtype), partition_info=None):
  # Check the shape
  if len(shape) < 2:
    raise ValueError('the tensor to initialize must be '
                     'at least two-dimensional')
  # Flatten the input shape with the last dimension remaining its
  # original shape so it works for conv2d
  num_rows = 1
  for dim in shape[:-1]:
    num_rows *= dim
  num_cols = shape[-1]
  flat_shape = (num_rows, num_cols)

  # Generate a random matrix
  a = random_ops.random_uniform(flat_shape, dtype=dtype, seed=seed)
  # Compute the svd
  _, svd_u, svd_v = linalg_ops.svd(a, full_matrices=False)
  # Pick the appropriate singular value decomposition
  if num_rows > num_cols:
    q_t = svd_u
  else:
    # Tensorflow departs from numpy conventions such that we need to
    # transpose axes here
    q_t = array_ops.transpose(svd_v)
  return gain * array_ops.reshape(q_t, shape)
def _testSvdCorrectness(self, dtype, shape):
  np.random.seed(1)
  x_np = np.random.uniform(low=-1.0, high=1.0, size=shape).astype(dtype)
  m, n = shape[-2], shape[-1]
  _, s_np, _ = np.linalg.svd(x_np)
  with self.session() as sess:
    x_tf = array_ops.placeholder(dtype)
    with self.test_scope():
      s, u, v = linalg_ops.svd(x_tf, full_matrices=True)
    s_val, u_val, v_val = sess.run([s, u, v], feed_dict={x_tf: x_np})
    u_diff = np.matmul(u_val, np.swapaxes(u_val, -1, -2)) - np.eye(m)
    v_diff = np.matmul(v_val, np.swapaxes(v_val, -1, -2)) - np.eye(n)
    # Check u_val and v_val are orthogonal matrices.
    self.assertLess(np.linalg.norm(u_diff), 1e-2)
    self.assertLess(np.linalg.norm(v_diff), 1e-2)
    # Check that the singular values are correct, i.e., close to the ones
    # from numpy.linalg.svd.
    self.assertLess(np.linalg.norm(s_val - s_np), 1e-2)
    # The tolerance is set based on our tests on numpy's svd. As our tests
    # have batch dimensions and all our operations are on float32, we set
    # the tolerance a bit larger. Numpy's svd calls LAPACK's svd, which
    # operates on double precision.
    self.assertLess(
        np.linalg.norm(self._compute_usvt(s_val, u_val, v_val) - x_np), 2e-2)

    # Check behavior with compute_uv=False. We expect to still see 3 outputs,
    # with a sentinel scalar 0 in the last two outputs.
    with self.test_scope():
      no_uv_s, no_uv_u, no_uv_v = gen_linalg_ops.svd(
          x_tf, full_matrices=True, compute_uv=False)
    no_uv_s_val, no_uv_u_val, no_uv_v_val = sess.run(
        [no_uv_s, no_uv_u, no_uv_v], feed_dict={x_tf: x_np})
    self.assertAllClose(no_uv_s_val, s_val, atol=1e-4, rtol=1e-4)
    self.assertEqual(no_uv_u_val.shape, tensor_shape.TensorShape([0]))
    self.assertEqual(no_uv_v_val.shape, tensor_shape.TensorShape([0]))
def _compute_power_svd(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name): """Computes mat_h = mat_g^alpha using svd. mat_g is a symmetric PSD matrix. Args: var: the variable we are updating. mat_g: the symmetric PSD matrix whose power it to be computed mat_g_size: size of mat_g alpha: a real number mat_h_slot_name: name of slot to store the power, if needed. Returns: mat_h = mat_g^alpha Stores mat_h in the appropriate slot, if it exists. Note that mat_g is PSD. So we could use linalg_ops.self_adjoint_eig. """ if mat_g_size == 1: mat_h = math_ops.pow(mat_g + self._epsilon, alpha) else: damping = self._epsilon * linalg_ops.eye(math_ops.to_int32(mat_g_size)) diag_d, mat_u, mat_v = linalg_ops.svd(mat_g + damping, full_matrices=True) mat_h = math_ops.matmul( mat_v * math_ops.pow(math_ops.maximum(diag_d, self._epsilon), alpha), array_ops.transpose(mat_u)) if mat_h_slot_name is not None: return state_ops.assign(self.get_slot(var, mat_h_slot_name), mat_h) return mat_h
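# Illustrative NumPy sketch (not part of the optimizer code above): for a
# symmetric PSD matrix G, a matrix power G^alpha can be computed from its SVD
# because the SVD of a PSD matrix coincides with its eigendecomposition.
# Shown here for alpha = -0.5, mirroring the V * diag(d^alpha) * U^T product
# used above.
import numpy as np

rng = np.random.RandomState(0)
b = rng.randn(4, 4)
g = b @ b.T + 1e-3 * np.eye(4)          # symmetric PSD, mildly damped

u, d, vt = np.linalg.svd(g)             # for PSD g, left/right vectors agree
g_inv_sqrt = (vt.T * d**(-0.5)) @ u.T   # V diag(d^-0.5) U^T
assert np.allclose(g_inv_sqrt @ g_inv_sqrt @ g, np.eye(4), atol=1e-8)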
def testComputeSpectralNorm(self):
  weights = variable_scope.get_variable(
      'w', dtype=dtypes.float32, shape=[2, 3, 50, 100])
  weights = math_ops.multiply(weights, 10.0)
  s = linalg_ops.svd(
      array_ops.reshape(weights, [-1, weights.shape[-1]]), compute_uv=False)
  true_sn = s[..., 0]
  estimated_sn = spectral_normalization.compute_spectral_norm(weights)

  with self.cached_session() as sess:
    sess.run(variables.global_variables_initializer())
    np_true_sn = sess.run(true_sn)
    for i in range(50):
      est = sess.run(estimated_sn)
      if i < 1:
        np_est_1 = est
      if i < 4:
        np_est_5 = est
      if i < 9:
        np_est_10 = est
      np_est_50 = est

    # Check that the estimate improves with more iterations.
    self.assertAlmostEqual(np_true_sn, np_est_50, 0)
    self.assertGreater(
        abs(np_true_sn - np_est_10), abs(np_true_sn - np_est_50))
    self.assertGreater(
        abs(np_true_sn - np_est_5), abs(np_true_sn - np_est_10))
    self.assertGreater(abs(np_true_sn - np_est_1), abs(np_true_sn - np_est_5))
def __call__(self, shape, dtype=None, partition_info=None):
  if dtype is None:
    dtype = self.dtype
  # Check the shape
  if len(shape) < 2:
    raise ValueError("The tensor to initialize must be "
                     "at least two-dimensional")
  # Flatten the input shape with the last dimension remaining
  # its original shape so it works for conv2d
  num_rows = 1
  for dim in shape[:-1]:
    num_rows *= dim
  num_cols = shape[-1]
  flat_shape = (num_rows, num_cols)

  # Generate a random matrix
  a = random_ops.random_uniform(flat_shape, dtype=dtype, seed=self.seed)
  # Compute the svd
  _, u, v = linalg_ops.svd(a, full_matrices=False)
  # Pick the appropriate singular value decomposition
  if num_rows > num_cols:
    q = u
  else:
    # Tensorflow departs from numpy conventions
    # such that we need to transpose axes here
    q = array_ops.transpose(v)
  return self.gain * array_ops.reshape(q, shape)
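# Independent NumPy sketch of the same idea as the initializer above:
# orthogonalize a random matrix via SVD and verify that the flattened kernel
# has orthonormal columns (the shorter dimension). Shapes and names below are
# purely illustrative.
import numpy as np

rng = np.random.RandomState(0)
shape = (3, 3, 16, 8)                 # e.g. a conv2d kernel
num_rows = int(np.prod(shape[:-1]))   # 144
num_cols = shape[-1]                  # 8
a = rng.uniform(-1.0, 1.0, size=(num_rows, num_cols))

u, _, vt = np.linalg.svd(a, full_matrices=False)
q = u if num_rows > num_cols else vt  # semi-orthogonal factor
assert np.allclose(q.T @ q, np.eye(num_cols), atol=1e-10)
kernel = q.reshape(shape)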
def Test(self):

  def RandomInput():
    np.random.seed(42)
    a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_)
    if dtype_ in [np.complex64, np.complex128]:
      a += 1j * np.random.uniform(
          low=-1.0, high=1.0, size=shape_).astype(dtype_)
    return a

  # Optimal stepsize for central difference is O(epsilon^{1/3}).
  # See Equation (21) in:
  # http://www.karenkopecky.net/Teaching/eco613614/Notes_NumericalDifferentiation.pdf
  # TODO(rmlarsen): Move step size control to gradient checker.
  epsilon = np.finfo(dtype_).eps
  delta = 0.25 * epsilon**(1.0 / 3.0)
  if dtype_ in [np.float32, np.complex64]:
    tol = 3e-2
  else:
    tol = 1e-6
  if compute_uv_:
    funcs = [
        lambda a: _NormalizingSvd(a, full_matrices_)[0],
        lambda a: _NormalizingSvd(a, full_matrices_)[1],
        lambda a: _NormalizingSvd(a, full_matrices_)[2]
    ]
  else:
    funcs = [lambda a: linalg_ops.svd(a, compute_uv=False)]

  for f in funcs:
    theoretical, numerical = gradient_checker_v2.compute_gradient(
        f, [RandomInput()], delta=delta)
    self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
def _testSvdCorrectness(self, dtype, shape):
  np.random.seed(1)
  x_np = np.random.uniform(low=-1.0, high=1.0, size=shape).astype(dtype)
  m, n = shape[-2], shape[-1]
  _, s_np, _ = np.linalg.svd(x_np)
  with self.cached_session() as sess:
    x_tf = array_ops.placeholder(dtype)
    with self.test_scope():
      s, u, v = linalg_ops.svd(x_tf, full_matrices=True)
    s_val, u_val, v_val = sess.run([s, u, v], feed_dict={x_tf: x_np})
    u_diff = np.matmul(u_val, np.swapaxes(u_val, -1, -2)) - np.eye(m)
    v_diff = np.matmul(v_val, np.swapaxes(v_val, -1, -2)) - np.eye(n)
    # Check u_val and v_val are orthogonal matrices.
    self.assertLess(np.linalg.norm(u_diff), 1e-2)
    self.assertLess(np.linalg.norm(v_diff), 1e-2)
    # Check that the singular values are correct, i.e., close to the ones
    # from numpy.linalg.svd.
    self.assertLess(np.linalg.norm(s_val - s_np), 1e-2)
    # The tolerance is set based on our tests on numpy's svd. As our tests
    # have batch dimensions and all our operations are on float32, we set
    # the tolerance a bit larger. Numpy's svd calls LAPACK's svd, which
    # operates on double precision.
    self.assertLess(
        np.linalg.norm(self._compute_usvt(s_val, u_val, v_val) - x_np), 2e-2)

    # Check behavior with compute_uv=False. We expect to still see 3 outputs,
    # with a sentinel scalar 0 in the last two outputs.
    with self.test_scope():
      no_uv_s, no_uv_u, no_uv_v = gen_linalg_ops.svd(
          x_tf, full_matrices=True, compute_uv=False)
    no_uv_s_val, no_uv_u_val, no_uv_v_val = sess.run(
        [no_uv_s, no_uv_u, no_uv_v], feed_dict={x_tf: x_np})
    self.assertAllClose(no_uv_s_val, s_val, atol=1e-4, rtol=1e-4)
    self.assertEqual(no_uv_u_val, 0.0)
    self.assertEqual(no_uv_v_val, 0.0)
def _symmetric_matrix_square_root(mat, eps=1e-10): """Compute square root of a symmetric matrix. Note that this is different from an elementwise square root. We want to compute M' where M' = sqrt(mat) such that M' * M' = mat. Also note that this method **only** works for symmetric matrices. Args: mat: Matrix to take the square root of. eps: Small epsilon such that any element less than eps will not be square rooted to guard against numerical instability. Returns: Matrix square root of mat. """ # Unlike numpy, tensorflow's return order is (s, u, v) s, u, v = linalg_ops.svd(mat) # sqrt is unstable around 0, just use 0 in such case si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s)) # Note that the v returned by Tensorflow is v = V # (when referencing the equation A = U S V^T) # This is unlike Numpy which returns v = V^T return math_ops.matmul( math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
def _testSvdCorrectness(self, dtype, shape):
  np.random.seed(1)
  x_np = np.random.uniform(low=-1.0, high=1.0, size=shape).astype(dtype)
  m, n = shape[-2], shape[-1]
  _, s_np, _ = np.linalg.svd(x_np)
  with self.cached_session() as sess:
    x_tf = array_ops.placeholder(dtype)
    with self.test_scope():
      s, u, v = linalg_ops.svd(x_tf, full_matrices=True)
    s_val, u_val, v_val = sess.run([s, u, v], feed_dict={x_tf: x_np})
    u_diff = np.matmul(u_val, np.swapaxes(u_val, -1, -2)) - np.eye(m)
    v_diff = np.matmul(v_val, np.swapaxes(v_val, -1, -2)) - np.eye(n)
    # Check u_val and v_val are orthogonal matrices.
    self.assertLess(np.linalg.norm(u_diff), 1e-2)
    self.assertLess(np.linalg.norm(v_diff), 1e-2)
    # Check that the singular values are correct, i.e., close to the ones
    # from numpy.linalg.svd.
    self.assertLess(np.linalg.norm(s_val - s_np), 1e-2)
    # The tolerance is set based on our tests on numpy's svd. As our tests
    # have batch dimensions and all our operations are on float32, we set
    # the tolerance a bit larger. Numpy's svd calls LAPACK's svd, which
    # operates on double precision.
    self.assertLess(
        np.linalg.norm(self._compute_usvt(s_val, u_val, v_val) - x_np), 2e-2)
def _compute_power_svd(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name): """Computes mat_h = mat_g^alpha using svd. mat_g is a symmetric PSD matrix. Args: var: the variable we are updating. mat_g: the symmetric PSD matrix whose power it to be computed mat_g_size: size of mat_g alpha: a real number mat_h_slot_name: name of slot to store the power, if needed. Returns: mat_h = mat_g^alpha Stores mat_h in the appropriate slot, if it exists. Note that mat_g is PSD. So we could use linalg_ops.self_adjoint_eig. """ if mat_g_size == 1: mat_h = math_ops.pow(mat_g + self._epsilon, alpha) else: damping = self._epsilon * linalg_ops.eye( math_ops.cast(mat_g_size, dtypes.int32)) diag_d, mat_u, mat_v = linalg_ops.svd(mat_g + damping, full_matrices=True) mat_h = math_ops.matmul( mat_v * math_ops.pow(math_ops.maximum(diag_d, self._epsilon), alpha), array_ops.transpose(mat_u)) if mat_h_slot_name is not None: return state_ops.assign(self.get_slot(var, mat_h_slot_name), mat_h) return mat_h
def _symmetric_matrix_square_root(mat, eps=1e-10): """Compute square root of a symmetric matrix. Note that this is different from an elementwise square root. We want to compute M' where M' = sqrt(mat) such that M' * M' = mat. Also note that this method **only** works for symmetric matrices. Args: mat: Matrix to take the square root of. eps: Small epsilon such that any element less than eps will not be square rooted to guard against numerical instability. Returns: Matrix square root of mat. """ # Unlike numpy, tensorflow's return order is (s, u, v) s, u, v = linalg_ops.svd(mat) # sqrt is unstable around 0, just use 0 in such case si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s)) # Note that the v returned by Tensorflow is v = V # (when referencing the equation A = U S V^T) # This is unlike Numpy which returns v = V^T return math_ops.matmul(math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
def _symmetric_matrix_square_root(mat, eps=1e-10):
  s, u, v = linalg_ops.svd(mat)
  si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s))
  return math_ops.matmul(
      math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
def testThrowDeterminismError(self):
  shape = [6, 5]
  seed = [42, 24]
  matrix1 = stateless_random_ops.stateless_random_normal(shape, seed)
  with test_util.deterministic_ops():
    if test_util.is_gpu_available(cuda_only=True):
      with self.assertRaisesRegex(
          errors_impl.UnimplementedError,
          "Determinism is not yet supported for Svd."):
        self.evaluate(linalg_ops.svd(matrix1))
def testBadInputs(self):
  # SVD of a matrix containing NaN or Inf entries should produce NaN outputs.
  for bad_val in [np.nan, np.inf]:
    matrix = np.array([[1, bad_val], [0, 1]])
    s, u, v = linalg_ops.svd(matrix, compute_uv=True)
    s, u, v = self.evaluate([s, u, v])
    for i in range(2):
      self.assertTrue(np.isnan(s[i]))
      for j in range(2):
        self.assertTrue(np.isnan(u[i, j]))
        self.assertTrue(np.isnan(v[i, j]))
def DISABLED_testBadInputs(self):
  # TODO(b/185822300): re-enable after the bug is fixed in CUDA-11.x
  # SVD of a matrix containing NaN or Inf entries should produce NaN outputs.
  for bad_val in [np.nan, np.inf]:
    matrix = np.array([[1, bad_val], [0, 1]])
    s, u, v = linalg_ops.svd(matrix, compute_uv=True)
    s, u, v = self.evaluate([s, u, v])
    for i in range(2):
      self.assertTrue(np.isnan(s[i]))
      for j in range(2):
        self.assertTrue(np.isnan(u[i, j]))
        self.assertTrue(np.isnan(v[i, j]))
def _cond(self):
  if not self.is_self_adjoint:
    # In general the condition number is the ratio of the
    # absolute value of the largest and smallest singular values.
    vals = linalg_ops.svd(self.to_dense(), compute_uv=False)
  else:
    # For self-adjoint matrices, and in general normal matrices,
    # we can use eigenvalues.
    vals = math_ops.abs(self._eigvals())

  return (math_ops.reduce_max(vals, axis=-1) /
          math_ops.reduce_min(vals, axis=-1))
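# Small NumPy sanity check (illustrative only) of the idea used in the
# non-self-adjoint branch above: the 2-norm condition number is the ratio of
# the largest to the smallest singular value.
import numpy as np

m = np.array([[3.0, 0.0], [0.0, 0.5]])
s = np.linalg.svd(m, compute_uv=False)
cond = s.max() / s.min()              # 6.0 for this diagonal example
assert np.isclose(cond, np.linalg.cond(m))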
def testTwoInputsSameOp(self):
  g = ops.Graph()
  with g.as_default():
    m = array_ops.placeholder(dtypes.float32)
    s, u, v = linalg_ops.svd(m)
    ss = math_ops.reduce_sum(s)
    uu = math_ops.reduce_sum(u)
    vv = math_ops.reduce_sum(v)
    result = ss + uu + vv
  f = function._graph_to_function_def(
      g,
      g.get_operations()[1:],  # skip the placeholder
      [s, u, v],
      [result])
  self.assertEqual(len(f.signature.input_arg), 3)
def testTwoInputsSameOp(self):
  g = ops.Graph()
  with g.as_default():
    m = array_ops.placeholder(dtypes.float32)
    s, u, v = linalg_ops.svd(m)
    ss = math_ops.reduce_sum(s)
    uu = math_ops.reduce_sum(u)
    vv = math_ops.reduce_sum(v)
    result = ss + uu + vv
  f = graph_to_function_def.graph_to_function_def(
      g,
      g.get_operations()[1:],  # skip the placeholder
      [s, u, v],
      [result])
  self.assertEqual(len(f.signature.input_arg), 3)
def _assert_non_singular(self):
  """Private default implementation of _assert_non_singular."""
  logging.warn(
      "Using (possibly slow) default implementation of assert_non_singular."
      " Requires conversion to a dense matrix and O(N^3) operations.")
  if self._can_use_cholesky():
    return self.assert_positive_definite()
  else:
    singular_values = linalg_ops.svd(self.to_dense(), compute_uv=False)
    # TODO(langmore) Add .eig and .cond as methods.
    cond = (math_ops.reduce_max(singular_values, axis=-1) /
            math_ops.reduce_min(singular_values, axis=-1))
    return check_ops.assert_less(
        cond,
        self._max_condition_number_to_be_non_singular(),
        message="Singular matrix up to precision epsilon.")
def test_cond(self):
  with self.test_session(graph=ops.Graph()) as sess:
    # svd does not work with zero dimensional matrices, so we'll
    # skip
    if 0 in shapes_info.shape[-2:]:
      return

    # ROCm platform does not yet support complex types
    if test.is_built_with_rocm() and ((dtype == dtypes.complex64) or
                                      (dtype == dtypes.complex128)):
      return

    sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
    # Ensure self-adjoint and PD so we get finite condition numbers.
    operator, mat = self.operator_and_matrix(
        shapes_info,
        dtype,
        use_placeholder=use_placeholder,
        ensure_self_adjoint_and_pd=True)
    # Eigenvalues are real, so we'll cast these to float64 and sort
    # for comparison.
    op_cond = operator.cond()
    s = math_ops.abs(linalg_ops.svd(mat, compute_uv=False))
    mat_cond = math_ops.reduce_max(s, axis=-1) / math_ops.reduce_min(
        s, axis=-1)
    op_cond_v, mat_cond_v = sess.run([op_cond, mat_cond])

    atol_override = {
        dtypes.float16: 1e-2,
        dtypes.float32: 1e-3,
        dtypes.float64: 1e-6,
        dtypes.complex64: 1e-3,
        dtypes.complex128: 1e-6,
    }
    rtol_override = {
        dtypes.float16: 1e-2,
        dtypes.float32: 1e-3,
        dtypes.float64: 1e-4,
        dtypes.complex64: 1e-3,
        dtypes.complex128: 1e-6,
    }
    atol = atol_override[dtype]
    rtol = rtol_override[dtype]
    self.assertAllClose(op_cond_v, mat_cond_v, atol=atol, rtol=rtol)
def _sliced_wasserstein_svd(a, b):
  """Compute the approximate sliced Wasserstein distance using an SVD.

  This is not part of the paper, it's a variant with possibly more accurate
  measure.

  Args:
    a: (matrix) Distribution "a" of samples (row, col).
    b: (matrix) Distribution "b" of samples (row, col).

  Returns:
    Float containing the approximate distance between "a" and "b".
  """
  s = array_ops.shape(a)
  # Random projection matrix.
  sig, u = linalg_ops.svd(array_ops.concat([a, b], 0))[:2]
  proj_a, proj_b = array_ops.split(u * sig, 2, axis=0)
  proj_a = _sort_rows(proj_a[:, ::-1], s[0])
  proj_b = _sort_rows(proj_b[:, ::-1], s[0])
  # Pairwise Wasserstein distance.
  wdist = math_ops.reduce_mean(math_ops.abs(proj_a - proj_b))
  return wdist
def _NormalizingSvd(tf_a):
  tf_s, tf_u, tf_v = linalg_ops.svd(tf_a, compute_uv=True, full_matrices=True)
  # Singular vectors are only unique up to an arbitrary phase. We normalize
  # the vectors such that the first component of u (if m >= n) or v (if n > m)
  # has phase 0.
  m = tf_a.shape[-2]
  n = tf_a.shape[-1]
  if m >= n:
    top_rows = tf_u[..., 0:1, :]
  else:
    top_rows = tf_v[..., 0:1, :]
  if tf_u.dtype.is_complex:
    angle = -math_ops.angle(top_rows)
    phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
  else:
    phase = math_ops.sign(top_rows)
  tf_u *= phase[..., :m]
  tf_v *= phase[..., :n]
  return tf_s, tf_u, tf_v
def _NormalizingSvd(tf_a):
  tf_s, tf_u, tf_v = linalg_ops.svd(
      tf_a, compute_uv=True, full_matrices=full_matrices_)
  # Singular vectors are only unique up to an arbitrary phase. We normalize
  # the vectors such that the first component of u (if m >= n) or v (if n > m)
  # has phase 0.
  m = tf_a.shape[-2]
  n = tf_a.shape[-1]
  if m >= n:
    top_rows = tf_u[..., 0:1, :]
  else:
    top_rows = tf_v[..., 0:1, :]
  if tf_u.dtype.is_complex:
    angle = -math_ops.angle(top_rows)
    phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
  else:
    phase = math_ops.sign(top_rows)
  tf_u *= phase[..., :m]
  tf_v *= phase[..., :n]
  return tf_s, tf_u, tf_v
def __call__(self, shape, dtype=None, partition_info=None):
  if dtype is None:
    dtype = self.dtype
  # Check the shape
  if len(shape) < 2:
    raise ValueError("The tensor to initialize must be "
                     "at least two-dimensional")
  # Flatten the input shape with the last dimension remaining
  # its original shape so it works for conv2d
  num_rows = 1
  for dim in shape[:-1]:
    num_rows *= dim
  num_cols = shape[-1]
  flat_shape = (num_rows, num_cols)

  # Generate a random matrix
  a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
  # Compute the svd
  u, _, vt = linalg_ops.svd(a, full_matrices=False)
  return self.gain * array_ops.reshape(vt, shape)
def _matrix_square_root(mat, eps=1e-10): """Compute symmetric square root of matrix. Equivalent to matrix square root when matrix is invertible; note that this is different from an elementwise square root. We want to compute M' where M' = sqrt(mat) such that M' * M' = mat. Args: mat: Matrix to take the square root of. eps: Small epsilon such that any element less than eps will not be square rooted to guard against numerical instability. Returns: Matrix square root of mat. """ s, u, v = linalg_ops.svd(mat) # sqrt is unstable around 0, just use 0 in such case si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s)) return math_ops.matmul( math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
def _matrix_square_root(mat, eps=1e-10): """Compute symmetric square root of matrix. Equivalent to matrix square root when matrix is invertible; note that this is different from an elementwise square root. We want to compute M' where M' = sqrt(mat) such that M' * M' = mat. Args: mat: Matrix to take the square root of. eps: Small epsilon such that any element less than eps will not be square rooted to guard against numerical instability. Returns: Matrix square root of mat. """ s, u, v = linalg_ops.svd(mat) # sqrt is unstable around 0, just use 0 in such case si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s)) return math_ops.matmul(math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
def Test(self):
  np.random.seed(42)
  a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_)
  if dtype_ in [np.complex64, np.complex128]:
    a += 1j * np.random.uniform(
        low=-1.0, high=1.0, size=shape_).astype(dtype_)
  # Optimal stepsize for central difference is O(epsilon^{1/3}).
  # See Equation (21) in:
  # http://www.karenkopecky.net/Teaching/eco613614/Notes_NumericalDifferentiation.pdf
  # TODO(rmlarsen): Move step size control to gradient checker.
  epsilon = np.finfo(dtype_).eps
  delta = 0.1 * epsilon**(1.0 / 3.0)
  if dtype_ in [np.float32, np.complex64]:
    tol = 3e-2
  else:
    tol = 1e-6
  with self.test_session(use_gpu=True):
    tf_a = constant_op.constant(a)
    if compute_uv_:
      tf_s, tf_u, tf_v = _NormalizingSvd(tf_a)
      outputs = [tf_s, tf_u, tf_v]
    else:
      tf_s = linalg_ops.svd(tf_a, compute_uv=False)
      outputs = [tf_s]
    for b in outputs:
      x_init = np.random.uniform(
          low=-1.0, high=1.0, size=shape_).astype(dtype_)
      if dtype_ in [np.complex64, np.complex128]:
        x_init += 1j * np.random.uniform(
            low=-1.0, high=1.0, size=shape_).astype(dtype_)
      theoretical, numerical = gradient_checker.compute_gradient(
          tf_a,
          tf_a.get_shape().as_list(),
          b,
          b.get_shape().as_list(),
          x_init_value=x_init,
          delta=delta)
      self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
def Test(self):
  np.random.seed(42)
  a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_)
  if dtype_ in [np.complex64, np.complex128]:
    a += 1j * np.random.uniform(
        low=-1.0, high=1.0, size=shape_).astype(dtype_)
  # Optimal stepsize for central difference is O(epsilon^{1/3}).
  # See Equation (21) in:
  # http://www.karenkopecky.net/Teaching/eco613614/Notes_NumericalDifferentiation.pdf
  # TODO(rmlarsen): Move step size control to gradient checker.
  epsilon = np.finfo(dtype_).eps
  delta = 0.1 * epsilon**(1.0 / 3.0)
  if dtype_ in [np.float32, np.complex64]:
    tol = 3e-2
  else:
    tol = 1e-6
  with self.session(use_gpu=True):
    tf_a = constant_op.constant(a)
    if compute_uv_:
      tf_s, tf_u, tf_v = _NormalizingSvd(tf_a, full_matrices_)
      outputs = [tf_s, tf_u, tf_v]
    else:
      tf_s = linalg_ops.svd(tf_a, compute_uv=False)
      outputs = [tf_s]
    for b in outputs:
      x_init = np.random.uniform(
          low=-1.0, high=1.0, size=shape_).astype(dtype_)
      if dtype_ in [np.complex64, np.complex128]:
        x_init += 1j * np.random.uniform(
            low=-1.0, high=1.0, size=shape_).astype(dtype_)
      theoretical, numerical = gradient_checker.compute_gradient(
          tf_a,
          tf_a.get_shape().as_list(),
          b,
          b.get_shape().as_list(),
          x_init_value=x_init,
          delta=delta)
      self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
def _SvdGrad(op, grad_s, grad_u, grad_v): """Gradient for the singular value decomposition.""" # The derivation for the compute_uv=False case, and most of # the derivation for the full_matrices=True case, are in # Giles' paper (see reference at top of file). A derivation for # the full_matrices=False case is available at # https://j-towns.github.io/papers/svd-derivative.pdf a = op.inputs[0] a_shape = a.get_shape().with_rank_at_least(2) grad_s_mat = array_ops.matrix_diag(grad_s) if not op.get_attr("compute_uv"): s, u, v = linalg_ops.svd(a, compute_uv=True) grad_a = math_ops.matmul(u, math_ops.matmul(grad_s_mat, v, adjoint_b=True)) grad_a.set_shape(a_shape) return grad_a full_matrices = op.get_attr("full_matrices") # TODO(rmlarsen): Make this work with complex types. if a.dtype.is_complex: raise NotImplementedError( "SVD gradient is not implemented for complex types and " "compute_uv=True.") grad_u_shape = grad_u.get_shape().with_rank_at_least(2) grad_v_shape = grad_v.get_shape().with_rank_at_least(2) m = a_shape.dims[-2].merge_with(grad_u_shape[-2]) n = a_shape.dims[-1].merge_with(grad_v_shape[-2]) batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with( grad_v_shape[:-2]) a_shape = batch_shape.concatenate([m, n]) m = a_shape.dims[-2].value n = a_shape.dims[-1].value # TODO(rmlarsen): Make this work with placeholders. if m is None or n is None: raise NotImplementedError( "SVD gradient has not been implemented for input with unknown " "inner matrix shape.") s = op.outputs[0] u = op.outputs[1] v = op.outputs[2] use_adjoint = False if m > n: # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the # Hermitian transpose of the gradient at the end. use_adjoint = True m, n = n, m u, v = v, u grad_u, grad_v = grad_v, grad_u with ops.control_dependencies([grad_s, grad_u, grad_v]): if full_matrices and abs(m - n) > 1: raise NotImplementedError( "svd gradient is not implemented for abs(m - n) > 1 " "when full_matrices is True") s_mat = array_ops.matrix_diag(s) s2 = math_ops.square(s) # NOTICE: Because of the term involving f, the gradient becomes # infinite (or NaN in practice) when singular values are not unique. # Mathematically this should not be surprising, since for (k-fold) # degenerate singular values, the corresponding singular vectors are # only defined up a (k-dimensional) subspace. In practice, this can # lead to numerical instability when singular values are close but not # exactly equal. 
    f = array_ops.matrix_set_diag(
        math_ops.reciprocal(
            array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)),
        array_ops.zeros_like(s))
    s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s))

    v1 = v[..., :, :m]
    grad_v1 = grad_v[..., :, :m]

    u_gu = math_ops.matmul(u, grad_u, adjoint_a=True)
    v_gv = math_ops.matmul(v1, grad_v1, adjoint_a=True)

    f_u = f * u_gu
    f_v = f * v_gv

    term1_nouv = (
        grad_s_mat + math_ops.matmul(f_u + _linalg.adjoint(f_u), s_mat) +
        math_ops.matmul(s_mat, f_v + _linalg.adjoint(f_v)))

    term1 = math_ops.matmul(u, math_ops.matmul(term1_nouv, v1, adjoint_b=True))

    if m == n:
      grad_a_before_transpose = term1
    else:
      gv1t = array_ops.matrix_transpose(grad_v1)
      gv1t_v1 = math_ops.matmul(gv1t, v1)
      term2_nous = gv1t - math_ops.matmul(gv1t_v1, v1, adjoint_b=True)

      if full_matrices:
        v2 = v[..., :, m:n]
        grad_v2 = grad_v[..., :, m:n]

        v1t_gv2 = math_ops.matmul(v1, grad_v2, adjoint_a=True)
        term2_nous -= math_ops.matmul(v1t_gv2, v2, adjoint_b=True)

      u_s_inv = math_ops.matmul(u, s_inv_mat)
      term2 = math_ops.matmul(u_s_inv, term2_nous)

      grad_a_before_transpose = term1 + term2

    if use_adjoint:
      grad_a = array_ops.matrix_transpose(grad_a_before_transpose)
    else:
      grad_a = grad_a_before_transpose

    grad_a.set_shape(a_shape)
    return grad_a
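# A small NumPy sanity check (illustrative only, independent of the TF code
# above) of the singular-value part of the gradient, i.e. the compute_uv=False
# branch: d s_i / d A = u_i v_i^T, so for a scalar loss L = sum(w_i * s_i) the
# gradient is dL/dA = U diag(w) V^T. The names and shapes below are arbitrary.
import numpy as np

rng = np.random.RandomState(0)
a = rng.randn(4, 3)
w = rng.randn(3)                      # arbitrary upstream gradient for s

u, s, vt = np.linalg.svd(a, full_matrices=False)
analytic = (u * w) @ vt               # U diag(w) V^T

eps = 1e-6
numeric = np.zeros_like(a)
for i in range(a.shape[0]):
  for j in range(a.shape[1]):
    da = np.zeros_like(a)
    da[i, j] = eps
    s_plus = np.linalg.svd(a + da, compute_uv=False)
    s_minus = np.linalg.svd(a - da, compute_uv=False)
    numeric[i, j] = np.dot(w, (s_plus - s_minus) / (2 * eps))

assert np.allclose(analytic, numeric, atol=1e-5)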
def _sharpOp(self, vector):
  s, u, v = linalg_ops.svd(vector)
  return (math_ops.matmul(u, array_ops.transpose(v)) *
          math_ops.reduce_sum(s))
def _SvdGrad(op, grad_s, grad_u, grad_v): """Gradient for Svd based on Giles' algorithm. Reference at top of file.""" if op.get_attr("compute_uv") and not op.get_attr("full_matrices"): raise NotImplementedError( "SVD gradient is not implemented for compute_uv=True and " "full_matrices=False.") a = op.inputs[0] a_shape = a.get_shape().with_rank_at_least(2) if op.get_attr("compute_uv"): # TODO(rmlarsen): Make this work with complex types. if a.dtype.is_complex: raise NotImplementedError( "SVD gradient is not implemented for complex types and " "compute_uv=True.") grad_u_shape = grad_u.get_shape().with_rank_at_least(2) grad_v_shape = grad_v.get_shape().with_rank_at_least(2) m = a_shape[-2].merge_with(grad_u_shape[-2]) n = a_shape[-1].merge_with(grad_v_shape[-2]) batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with( grad_v_shape[:-2]) a_shape = batch_shape.concatenate([m, n]) m = a_shape[-2].value n = a_shape[-1].value # TODO(rmlarsen): Make this work with placeholders. if m is None or n is None: raise NotImplementedError( "SVD gradient has not been implemented for input with unknown " "inner matrix shape.") if not op.get_attr("full_matrices") or not op.get_attr("compute_uv"): s, u, v = linalg_ops.svd(a, compute_uv=True, full_matrices=True) else: s = op.outputs[0] u = op.outputs[1] v = op.outputs[2] use_adjoint = False if m > n: # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the # Hermitian transpose of the gradient at the end. use_adjoint = True m, n = n, m u, v = v, u grad_u, grad_v = grad_v, grad_u with ops.control_dependencies([grad_s, grad_u, grad_v]): grad_s_mat = array_ops.matrix_diag(grad_s) if not op.get_attr("compute_uv"): if use_adjoint: grad_a = math_ops.matmul( v[..., :, :m], math_ops.matmul(u, grad_s_mat), adjoint_b=True) else: grad_a = math_ops.matmul(u, math_ops.matmul( grad_s_mat, v[..., :, :m], adjoint_b=True)) grad_a.set_shape(a_shape) return grad_a # TODO(rmlarsen): Define a gradient that is numerically stable for # abs(m-n) > 1. Currently this does not work because there are effectively # multiple singular values with value zero. I am not sure if this is a true # instability or if it simply throws off the finite difference gradient # checker. if abs(m - n) > 1: raise NotImplementedError( "svd gradient is not implemented for abs(m - n) > 1") s_mat = array_ops.matrix_diag(s) s2 = math_ops.square(s) # NOTICE: Because of the term involving f, the gradient becomes # infinite (or NaN in practice) when singular values are not unique. # Mathematically this should not be surprising, since for (k-fold) # degenerate singular values, the corresponding singular vectors are # only defined up a (k-dimensional) subspace. In practice, this can # lead to numerical instability when singular values are close but not # exactly equal. 
    f = array_ops.matrix_set_diag(
        math_ops.reciprocal(
            array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)),
        array_ops.zeros_like(s))
    s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s))
    u_gu = math_ops.matmul(u, grad_u, adjoint_a=True)
    v_gv = math_ops.matmul(v, grad_v, adjoint_a=True)

    if m == n:
      f_u = f * u_gu
      f_v = f * v_gv
    else:
      dv2 = array_ops.matrix_transpose(
          v_gv[..., m:n, :m]) - v_gv[..., :m, m:n]
      f_u = f * u_gu
      f_v = f * v_gv[..., :m, :m]

    grad_a_nouv = (
        grad_s_mat + math_ops.matmul(f_u + _linalg.adjoint(f_u), s_mat) +
        math_ops.matmul(s_mat, f_v + _linalg.adjoint(f_v)))

    if m != n:
      grad_a_nouv = array_ops.concat(
          [grad_a_nouv, math_ops.matmul(s_inv_mat, dv2)], -1)

    if use_adjoint:
      # Use (U X V^H)^H = V (U X)^H.
      grad_a = math_ops.matmul(
          v, math_ops.matmul(u, grad_a_nouv), adjoint_b=True)
    else:
      grad_a = math_ops.matmul(
          u, math_ops.matmul(grad_a_nouv, v, adjoint_b=True))

    grad_a.set_shape(a_shape)
    return grad_a
def _SvdGrad(op, grad_s, grad_u, grad_v): """Gradient for the singular value decomposition.""" # The derivation for the compute_uv=False case, and most of # the derivation for the full_matrices=True case, are in # Giles' paper (see reference at top of file). A derivation for # the full_matrices=False case is available at # https://j-towns.github.io/papers/svd-derivative.pdf a = op.inputs[0] a_shape = a.get_shape().with_rank_at_least(2) grad_s_mat = array_ops.matrix_diag(grad_s) if not op.get_attr("compute_uv"): s, u, v = linalg_ops.svd(a, compute_uv=True) grad_a = math_ops.matmul( u, math_ops.matmul(grad_s_mat, v, adjoint_b=True)) grad_a.set_shape(a_shape) return grad_a full_matrices = op.get_attr("full_matrices") # TODO(rmlarsen): Make this work with complex types. if a.dtype.is_complex: raise NotImplementedError( "SVD gradient is not implemented for complex types and " "compute_uv=True.") grad_u_shape = grad_u.get_shape().with_rank_at_least(2) grad_v_shape = grad_v.get_shape().with_rank_at_least(2) m = a_shape[-2].merge_with(grad_u_shape[-2]) n = a_shape[-1].merge_with(grad_v_shape[-2]) batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with( grad_v_shape[:-2]) a_shape = batch_shape.concatenate([m, n]) m = a_shape[-2].value n = a_shape[-1].value # TODO(rmlarsen): Make this work with placeholders. if m is None or n is None: raise NotImplementedError( "SVD gradient has not been implemented for input with unknown " "inner matrix shape.") s = op.outputs[0] u = op.outputs[1] v = op.outputs[2] use_adjoint = False if m > n: # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the # Hermitian transpose of the gradient at the end. use_adjoint = True m, n = n, m u, v = v, u grad_u, grad_v = grad_v, grad_u with ops.control_dependencies([grad_s, grad_u, grad_v]): if full_matrices and abs(m - n) > 1: raise NotImplementedError( "svd gradient is not implemented for abs(m - n) > 1 " "when full_matrices is True") s_mat = array_ops.matrix_diag(s) s2 = math_ops.square(s) # NOTICE: Because of the term involving f, the gradient becomes # infinite (or NaN in practice) when singular values are not unique. # Mathematically this should not be surprising, since for (k-fold) # degenerate singular values, the corresponding singular vectors are # only defined up a (k-dimensional) subspace. In practice, this can # lead to numerical instability when singular values are close but not # exactly equal. 
    f = array_ops.matrix_set_diag(
        math_ops.reciprocal(
            array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)),
        array_ops.zeros_like(s))
    s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s))

    v1 = v[..., :, :m]
    grad_v1 = grad_v[..., :, :m]

    u_gu = math_ops.matmul(u, grad_u, adjoint_a=True)
    v_gv = math_ops.matmul(v1, grad_v1, adjoint_a=True)

    f_u = f * u_gu
    f_v = f * v_gv

    term1_nouv = (
        grad_s_mat + math_ops.matmul(f_u + _linalg.adjoint(f_u), s_mat) +
        math_ops.matmul(s_mat, f_v + _linalg.adjoint(f_v)))

    term1 = math_ops.matmul(
        u, math_ops.matmul(term1_nouv, v1, adjoint_b=True))

    if m == n:
      grad_a_before_transpose = term1
    else:
      gv1t = array_ops.matrix_transpose(grad_v1)
      gv1t_v1 = math_ops.matmul(gv1t, v1)
      term2_nous = gv1t - math_ops.matmul(gv1t_v1, v1, adjoint_b=True)

      if full_matrices:
        v2 = v[..., :, m:n]
        grad_v2 = grad_v[..., :, m:n]

        v1t_gv2 = math_ops.matmul(v1, grad_v2, adjoint_a=True)
        term2_nous -= math_ops.matmul(v1t_gv2, v2, adjoint_b=True)

      u_s_inv = math_ops.matmul(u, s_inv_mat)
      term2 = math_ops.matmul(u_s_inv, term2_nous)

      grad_a_before_transpose = term1 + term2

    if use_adjoint:
      grad_a = array_ops.matrix_transpose(grad_a_before_transpose)
    else:
      grad_a = grad_a_before_transpose

    grad_a.set_shape(a_shape)
    return grad_a
def _finish(self, state):
  var_dtype = self._variables[0].dtype.base_dtype
  # Update global step.
  global_step = self._get_global_step(state)
  update_global_step = state_ops.assign_add(global_step, 1.)

  # Update the first moment estimate.
  beta1 = state.get_hyper("beta1", dtype=var_dtype)
  moment1 = self._get_moment1(state)
  flat_grad = self._get_flat_grad(state)
  # moment1_t := beta1 * moment1_{t-1} + (1 - beta1) * flat_grad_t
  update_moment1 = moment1.assign(beta1 * moment1 + (1. - beta1) * flat_grad)

  # Update the gradient buffer.
  window = state.get_hyper("window")
  grad_buffer = self._get_grad_buffer(state)
  next_grad_index = math_ops.floormod(
      math_ops.to_int32(update_global_step - 1.), window)
  # grad_buffer[(t-1) % window] := moment1_t
  update_grad_buffer = state_ops.scatter_update(grad_buffer, next_grad_index,
                                                update_moment1)

  # Compute the update step.
  eps = state.get_hyper("eps", dtype=var_dtype)
  svd_eps = state.get_hyper("svd_eps", dtype=var_dtype)
  sigma_eps = state.get_hyper("sigma_eps", dtype=var_dtype)
  lr = state.get_hyper("lr", dtype=var_dtype)
  denom = math_ops.sqrt(
      math_ops.minimum(
          ops.convert_to_tensor(update_global_step),
          ops.convert_to_tensor(math_ops.cast(window, dtype=var_dtype))))
  moment1_2d = array_ops.expand_dims(update_moment1, -1)

  # m = grad_buffer^T / sqrt(min(t, window))
  # m has shape [model dimension, window], where model dimension is the sum
  # of the dimensions of the flattened variables.
  m = array_ops.transpose(math_ops.divide(update_grad_buffer, denom))

  # sigma, u, _ = SVD(m^Tm + I * svd_eps)
  mm = math_ops.matmul(m, m, transpose_a=True)
  damping = math_ops.cast(linalg_ops.eye(window), dtype=var_dtype) * svd_eps
  sigma, u, _ = linalg_ops.svd(mm + damping)
  sigma_sqrt = math_ops.sqrt(sigma)
  sigma_sqrt_min = math_ops.reduce_min(sigma_sqrt)

  # sigma_sqrt_inv = 1 / (\sqrt{sigma} + sigma_eps) ^ 3
  # We add sigma_eps to alleviate numerical instability.
  # Note that (m^Tm)^(-3/2) = u diag(sigma_sqrt_inv) u^T.
  sigma_sqrt_inv = math_ops.divide(
      math_ops.cast(1.0, dtype=var_dtype),
      math_ops.pow(sigma_sqrt + sigma_eps, 3))

  # In full matrix AdaGrad, the update step computes (mm^T)^(-1/2)g, where the
  # inversion of a model dimension by model dimension matrix is needed. To
  # speed up this computation we calculate the following instead:
  # m(m^Tm)^(-3/2)m^T moment1 = m u diag(sigma_sqrt_inv) u^T m^T moment1.
  new_step = array_ops.expand_dims(
      array_ops.zeros(flat_grad.get_shape(), dtype=var_dtype), -1)
  head = math_ops.matmul(
      m,
      math_ops.matmul(
          u,
          math_ops.matmul(
              array_ops.diag(sigma_sqrt_inv),
              math_ops.matmul(
                  u,
                  math_ops.matmul(m, moment1_2d, transpose_a=True),
                  transpose_a=True))))

  # When inverting (mm^t)^(1/2), we also add epsilon * I regularization for
  # degenerate cases. We expand ((mm^t)^(1/2) + epsilon * I)^(-1) using
  # Woodbury's identity.
  # For full derivation please see paper at
  # https://arxiv.org/pdf/1806.02958.pdf
  tail = moment1_2d - math_ops.matmul(
      m,
      math_ops.matmul(
          u,
          math_ops.matmul(
              array_ops.diag(
                  math_ops.divide(
                      math_ops.cast(1.0, dtype=var_dtype), sigma)),
              math_ops.matmul(
                  u,
                  math_ops.matmul(m, moment1_2d, transpose_a=True),
                  transpose_a=True))))
  scaled_tail = math_ops.divide(tail, sigma_sqrt_min)

  update_new_step = control_flow_ops.cond(
      sigma_sqrt_min > eps, lambda: math_ops.add(head, scaled_tail),
      lambda: math_ops.add(new_step, head))

  # Update each variable.
  update_step = []
  for var in self._variables:
    dim = self.shape_dict[var.name]
    start_index = self.index_dict[var.name]
    end_index = start_index + dim
    var_update_correct_shape = array_ops.reshape(
        update_new_step[start_index:end_index], var.get_shape())
    var_updated = state_ops.assign_sub(var, lr * var_update_correct_shape)
    update_step.append(var_updated)

  return control_flow_ops.group(update_step)
def posdef_eig_svd(mat):
  """Computes the singular values and left singular vectors of a matrix."""
  evals, evecs, _ = linalg_ops.svd(mat)

  return evals, evecs
def make_inverse_update_ops(self):
  """Create and return update ops corresponding to registered computations."""
  # TODO(b/69918258): Add correctness tests for this method.
  # pylint: disable=invalid-name

  ops = super(FullyConnectedMultiKF, self).make_inverse_update_ops()

  if (len(self._option1quants_by_damping) +
      len(self._option2quants_by_damping)):

    # Note that C0 and C1 are stand-ins for A0 and A1, or G0 and G1, from
    # the pseudo-code in the original paper. Because the computations for
    # the A and G case are essentially the same they can both be performed
    # by the same class (this one).

    C1 = self.get_cov_dt1()

    # Get the eigendecomposition of C0 (= self.get_cov())
    eigen_e, eigen_V = self.get_eigendecomp()

    # TODO(b/69678661): Note, there is an implicit assumption here that C1
    # and C0 (as represented here by its eigen-decomp) are consistent. This
    # could fail to be the case if self._cov and self._cov_dt1 are not
    # updated consistently, or are somehow read between or during the cov
    # updates. Can this possibly happen? Is there a way to prevent it?

    for damping, (Lmat_var,
                  psi_var) in self._option1quants_by_damping.items():

      invsqrtC0 = math_ops.matmul(
          eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)

      # Might need to enforce symmetry lost due to numerical issues.
      invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0

      # The following line imposes the symmetry assumed by "Option 1" on C1.
      # Strangely the code can work okay with this line commented out,
      # depending on how psd_eig is defined. I'm not sure why.
      C1 = (C1 + array_ops.transpose(C1)) / 2.0

      # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means hat{Psi})
      hPsi = math_ops.matmul(math_ops.matmul(invsqrtC0, C1), invsqrtC0)

      # Compute the decomposition U*diag(psi)*U^T = hPsi
      psi, U = utils.posdef_eig(hPsi)

      # L = C0^(-1/2) * U
      Lmat = math_ops.matmul(invsqrtC0, U)

      ops.append(Lmat_var.assign(Lmat))
      ops.append(psi_var.assign(psi))

    for damping, (Pmat_var, Kmat_var,
                  mu_var) in self._option2quants_by_damping.items():

      # compute C0^(-1/2)
      invsqrtC0 = math_ops.matmul(
          eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)

      # Might need to enforce symmetry lost due to numerical issues.
      invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0

      # Compute the product C0^(-1/2) * C1
      invsqrtC0C1 = math_ops.matmul(invsqrtC0, C1)

      # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means hat{Psi})
      hPsi = math_ops.matmul(invsqrtC0C1, invsqrtC0)

      # Compute the decomposition E*diag(mu)*E^T = hPsi^T * hPsi
      # Note that we use the notation mu instead of "m" for the eigenvalues.
      # Instead of computing the product hPsi^T * hPsi and then doing an
      # eigen-decomposition of this we just compute the SVD of hPsi and then
      # square the singular values to get the eigenvalues. For a
      # justification of this approach, see:
      # https://en.wikipedia.org/wiki/Singular-value_decomposition#Relation_to_eigenvalue_decomposition
      sqrtmu, _, E = linalg_ops.svd(hPsi)
      mu = math_ops.square(sqrtmu)

      # Mathematically, the eigenvalues should not exceed 1.0, but due to
      # numerical issues, or possible issues with inconsistent values of C1
      # and (the eigen-decomposition of) C0, they might. So we enforce this
      # condition.
      mu = math_ops.minimum(mu, 1.0)

      # P = (C0^(-1/2) * C1)^T * C0^(-1/2) = C_1^T * C_0^(-1)
      Pmat = math_ops.matmul(invsqrtC0C1, invsqrtC0, transpose_a=True)

      # K = C_0^(-1/2) * E
      Kmat = math_ops.matmul(invsqrtC0, E)

      ops.append(Pmat_var.assign(Pmat))
      ops.append(Kmat_var.assign(Kmat))
      ops.append(mu_var.assign(mu))

  return [control_flow_ops.group(*ops)]