def test_defining_spd_operator_by_taking_real_part(self): with self.cached_session() as sess: # S is real and positive. s = linear_operator_test_util.random_uniform( shape=(10, 2, 3, 4), dtype=dtypes.float32, minval=1., maxval=2.) # Let S = S1 + S2, the Hermitian and anti-hermitian parts. # S1 = 0.5 * (S + S^H), S2 = 0.5 * (S - S^H), # where ^H is the Hermitian transpose of the function: # f(n0, n1, n2)^H := ComplexConjugate[f(N0-n0, N1-n1, N2-n2)]. # We want to isolate S1, since # S1 is Hermitian by construction # S1 is real since S is # S1 is positive since it is the sum of two positive kernels # IDFT[S] = IDFT[S1] + IDFT[S2] # = H1 + H2 # where H1 is real since it is Hermitian, # and H2 is imaginary since it is anti-Hermitian. ifft_s = fft_ops.ifft3d(math_ops.cast(s, dtypes.complex64)) # Throw away H2, keep H1. real_ifft_s = math_ops.real(ifft_s) # This is the perfect spectrum! # spectrum = DFT[H1] # = S1, fft_real_ifft_s = fft_ops.fft3d( math_ops.cast(real_ifft_s, dtypes.complex64)) # S1 is Hermitian ==> operator is real. # S1 is real ==> operator is self-adjoint. # S1 is positive ==> operator is positive-definite. operator = linalg.LinearOperatorCirculant3D(fft_real_ifft_s) # Allow for complex output so we can check operator has zero imag part. self.assertEqual(operator.dtype, dtypes.complex64) matrix, matrix_t = sess.run([ operator.to_dense(), array_ops.matrix_transpose(operator.to_dense()) ]) operator.assert_positive_definite().run() # Should not fail. np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6) self.assertAllClose(matrix, matrix_t) # Just to test the theory, get S2 as well. # This should create an imaginary operator. # S2 is anti-Hermitian ==> operator is imaginary. # S2 is real ==> operator is self-adjoint. imag_ifft_s = math_ops.imag(ifft_s) fft_imag_ifft_s = fft_ops.fft3d( 1j * math_ops.cast(imag_ifft_s, dtypes.complex64)) operator_imag = linalg.LinearOperatorCirculant3D(fft_imag_ifft_s) matrix, matrix_h = sess.run([ operator_imag.to_dense(), array_ops.matrix_transpose(math_ops.conj(operator_imag.to_dense())) ]) self.assertAllClose(matrix, matrix_h) np.testing.assert_allclose(0, np.real(matrix), atol=1e-7)
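# Illustrative NumPy sketch (not part of the test above): the same Hermitian /
# anti-Hermitian split in 1-D. For a real, positive spectrum s, the spectrum of
# real(IDFT[s]) is the Hermitian part S1[k] = 0.5 * (s[k] + conj(s[(N - k) % N])),
# which is real here because s is real.
import numpy as np

s = np.random.uniform(1., 2., size=8)        # real, positive spectrum
h1 = np.real(np.fft.ifft(s))                 # keep H1, drop the anti-Hermitian part
s1 = np.fft.fft(h1)                          # S1 = DFT[H1]
s_rev = np.roll(s[::-1], 1)                  # s[(N - k) % N]
np.testing.assert_allclose(np.imag(s1), 0., atol=1e-12)
np.testing.assert_allclose(np.real(s1), 0.5 * (s + s_rev), atol=1e-12)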
def testNonBatchMatrix(self): matrix = [[1, 2, 3], [4, 5, 6]] # Shape (2, 3) expected_transposed = [[1, 4], [2, 5], [3, 6]] # Shape (3, 2) with self.test_session(): transposed = array_ops.matrix_transpose(matrix) self.assertEqual((3, 2), transposed.get_shape()) self.assertAllEqual(expected_transposed, transposed.eval())
def __call__(self, shape, dtype=None, partition_info=None): if dtype is None: dtype = self.dtype # Check the shape if len(shape) < 2: raise ValueError("The tensor to initialize must be " "at least two-dimensional") # Flatten the input shape with the last dimension remaining # its original shape so it works for conv2d num_rows = 1 for dim in shape[:-1]: num_rows *= dim num_cols = shape[-1] flat_shape = (num_cols, num_rows) if num_rows < num_cols else (num_rows, num_cols) # Generate a random matrix a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed) # Compute the qr factorization q, r = linalg_ops.qr(a, full_matrices=False) # Make Q uniform d = array_ops.diag_part(r) q *= math_ops.sign(d) if num_rows < num_cols: q = array_ops.matrix_transpose(q) return self.gain * array_ops.reshape(q, shape)
def adjoint(matrix, name=None): """Transposes the last two dimensions of and conjugates tensor `matrix`. For example: ```python x = tf.constant([[1 + 1j, 2 + 2j, 3 + 3j], [4 + 4j, 5 + 5j, 6 + 6j]]) tf.linalg.adjoint(x) # [[1 - 1j, 4 - 4j], # [2 - 2j, 5 - 5j], # [3 - 3j, 6 - 6j]] ``` Args: matrix: A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`, or `complex128` with shape `[..., M, M]`. name: A name to give this `Op` (optional). Returns: The adjoint (a.k.a. Hermitian transpose a.k.a. conjugate transpose) of matrix. """ with ops.name_scope(name, 'adjoint', [matrix]): matrix = ops.convert_to_tensor(matrix, name='matrix') return array_ops.matrix_transpose(matrix, conjugate=True)
def _overdetermined(op, grad): """Gradients for the overdetermined case of MatrixSolveLs. This is the backprop for the solution to the normal equations of the first kind: X = F(A, B) = (A^T * A + lambda * I)^{-1} * A^T * B which solve the least squares problem min ||A * X - B||_F^2 + lambda ||X||_F^2. """ a = op.inputs[0] b = op.inputs[1] l2_regularizer = math_ops.cast(op.inputs[2], a.dtype.base_dtype) x = op.outputs[0] a_shape = array_ops.shape(a) batch_shape = a_shape[:-2] n = a_shape[-1] identity = linalg_ops.eye(n, batch_shape=batch_shape, dtype=a.dtype) gramian = math_ops.matmul(a, a, adjoint_a=True) + l2_regularizer * identity chol = linalg_ops.cholesky(gramian) # Temporary z = (A^T * A + lambda * I)^{-1} * grad. z = linalg_ops.cholesky_solve(chol, grad) xzt = math_ops.matmul(x, z, adjoint_b=True) zx_sym = xzt + array_ops.matrix_transpose(xzt) grad_a = -math_ops.matmul(a, zx_sym) + math_ops.matmul(b, z, adjoint_b=True) grad_b = math_ops.matmul(a, z) return (grad_a, grad_b, None)
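# Hedged NumPy sketch (helper name is mine, not library code): the forward map
# this gradient assumes, i.e. the regularized normal equations of the first kind
# X = (A^T A + lambda I)^{-1} A^T B, solved with a Cholesky factorization of the
# Gramian as in the code above.
import numpy as np

def regularized_lstsq(a, b, l2_regularizer):
  """Returns x = (a^T a + lambda I)^{-1} a^T b via a Cholesky solve."""
  n = a.shape[-1]
  gramian = a.T @ a + l2_regularizer * np.eye(n)
  chol = np.linalg.cholesky(gramian)
  # Two triangular solves play the role of cholesky_solve(chol, a^T b).
  y = np.linalg.solve(chol, a.T @ b)
  return np.linalg.solve(chol.T, y)

a = np.random.randn(6, 3)
b = np.random.randn(6, 2)
x = regularized_lstsq(a, b, 0.1)
# x satisfies the normal equations (A^T A + lambda I) x = A^T B.
np.testing.assert_allclose((a.T @ a + 0.1 * np.eye(3)) @ x, a.T @ b, atol=1e-10)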
def __call__(self, shape, dtype=dtypes.float32): """Returns a tensor object initialized as specified by the initializer. Args: shape: Shape of the tensor. dtype: Optional dtype of the tensor. Only floating point types are supported. Raises: ValueError: If the dtype is not floating point or the input shape is not valid. """ dtype = _assert_float_dtype(dtype) # Check the shape if len(shape) < 2: raise ValueError("The tensor to initialize must be " "at least two-dimensional") # Flatten the input shape with the last dimension remaining # its original shape so it works for conv2d num_rows = 1 for dim in shape[:-1]: num_rows *= dim num_cols = shape[-1] flat_shape = (max(num_cols, num_rows), min(num_cols, num_rows)) # Generate a random matrix a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed) # Compute the qr factorization q, r = gen_linalg_ops.qr(a, full_matrices=False) # Make Q uniform d = array_ops.diag_part(r) q *= math_ops.sign(d) if num_rows < num_cols: q = array_ops.matrix_transpose(q) return self.gain * array_ops.reshape(q, shape)
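# Rough NumPy analogue of the steps in __call__ above (an illustrative sketch
# with my own variable names, not the initializer itself): QR of a Gaussian
# matrix, with the sign of diag(R) folded into Q so the resulting distribution
# over orthogonal matrices is uniform.
import numpy as np

num_rows, num_cols = 6, 3
a = np.random.randn(max(num_cols, num_rows), min(num_cols, num_rows))
q, r = np.linalg.qr(a)          # reduced QR, like full_matrices=False
q *= np.sign(np.diag(r))        # "Make Q uniform"
np.testing.assert_allclose(q.T @ q, np.eye(q.shape[1]), atol=1e-10)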
def _unvec_by(y, num_col):
  """Unstack a vector to form a matrix with a specified number of columns."""
  return array_ops.matrix_transpose(
      array_ops.reshape(
          y,
          array_ops.concat(
              [array_ops.shape(y)[:-1], [num_col, -1]], axis=0)))
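# Small NumPy sketch (assumptions mine, not library code): this unvec is the
# inverse of column-stacking ("vec") when implemented with a row-major reshape,
# hence the final transpose.
import numpy as np

x = np.arange(6.).reshape(2, 3)          # a 2 x 3 matrix
vec_x = x.T.reshape(-1)                  # stack columns: shape (6,)
num_col = 3
unvec_x = vec_x.reshape(num_col, -1).T   # mirrors _unvec_by: reshape, then transpose
np.testing.assert_array_equal(unvec_x, x)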
def testNonBatchMatrixDynamicallyDefined(self): matrix = [[1, 2, 3], [4, 5, 6]] # Shape (2, 3) expected_transposed = [[1, 4], [2, 5], [3, 6]] # Shape (3, 2) with self.test_session(): matrix_ph = array_ops.placeholder(dtypes.int32) transposed = array_ops.matrix_transpose(matrix_ph) self.assertAllEqual( expected_transposed, transposed.eval(feed_dict={matrix_ph: matrix}))
def testConjugate(self): m = [[1 + 1j, 2 + 2j, 3 + 3j], [4 + 4j, 5 + 5j, 6 + 6j]] expected_transposed = [[1 - 1j, 4 - 4j], [2 - 2j, 5 - 5j], [3 - 3j, 6 - 6j]] with self.test_session(): matrix = ops.convert_to_tensor(m) transposed = array_ops.matrix_transpose(matrix, conjugate=True) self.assertEqual((3, 2), transposed.get_shape()) self.assertAllEqual(expected_transposed, transposed.eval())
def _covariance(self): if (isinstance(self.scale, linalg.LinearOperatorIdentity) or isinstance(self.scale, linalg.LinearOperatorScaledIdentity) or isinstance(self.scale, linalg.LinearOperatorDiag)): return array_ops.matrix_diag(math_ops.square(self.scale.diag_part())) else: # TODO(b/35040238): Remove transpose once LinOp supports `transpose`. return self.scale.apply(array_ops.matrix_transpose(self.scale.to_dense()))
def sign_magnitude_positive_definite( raw, off_diagonal_scale=0., overall_scale=0.): """Constructs a positive definite matrix from an unconstrained input matrix. We want to keep the whole matrix on a log scale, but also allow off-diagonal elements to be negative, so the sign of off-diagonal elements is modeled separately from their magnitude (using the lower and upper triangles respectively). Specifically: for i < j, we have: output_cholesky[i, j] = raw[j, i] / (abs(raw[j, i]) + 1) * exp((off_diagonal_scale + overall_scale + raw[i, j]) / 2) output_cholesky[i, i] = exp((raw[i, i] + overall_scale) / 2) output = output_cholesky^T * output_cholesky where raw, off_diagonal_scale, and overall_scale are un-constrained real-valued variables. The resulting values are stable around zero due to the exponential (and the softsign keeps the function smooth). Args: raw: A [..., M, M] Tensor. off_diagonal_scale: A scalar or [...] shaped Tensor controlling the relative scale of off-diagonal values in the output matrix. overall_scale: A scalar or [...] shaped Tensor controlling the overall scale of the output matrix. Returns: The `output` matrix described above, a [..., M, M] positive definite matrix. """ raw = ops.convert_to_tensor(raw) diagonal = array_ops.matrix_diag_part(raw) def _right_pad_with_ones(tensor, target_rank): # Allow broadcasting even if overall_scale and off_diagonal_scale have batch # dimensions tensor = ops.convert_to_tensor(tensor, dtype=raw.dtype.base_dtype) return array_ops.reshape(tensor, array_ops.concat( [ array_ops.shape(tensor), array_ops.ones( [target_rank - array_ops.rank(tensor)], dtype=target_rank.dtype) ], axis=0)) # We divide the log values by 2 to compensate for the squaring that happens # when transforming Cholesky factors into positive definite matrices. sign_magnitude = (gen_math_ops.exp( (raw + _right_pad_with_ones(off_diagonal_scale, array_ops.rank(raw)) + _right_pad_with_ones(overall_scale, array_ops.rank(raw))) / 2.) * nn.softsign(array_ops.matrix_transpose(raw))) sign_magnitude.set_shape(raw.get_shape()) cholesky_factor = array_ops.matrix_set_diag( input=array_ops.matrix_band_part(sign_magnitude, 0, -1), diagonal=gen_math_ops.exp((diagonal + _right_pad_with_ones( overall_scale, array_ops.rank(diagonal))) / 2.)) return math_ops.matmul(cholesky_factor, cholesky_factor, transpose_a=True)
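# Hedged NumPy sketch of the formula in the docstring above (illustration only,
# names are mine): build output_cholesky from an unconstrained `raw`, then check
# that output = output_cholesky^T @ output_cholesky is positive definite.
import numpy as np

raw = np.random.randn(4, 4)
off_diagonal_scale, overall_scale = 0., 0.
softsign = raw.T / (np.abs(raw.T) + 1.)                     # raw[j, i] / (abs(raw[j, i]) + 1)
magnitude = np.exp((raw + off_diagonal_scale + overall_scale) / 2.)
chol = np.triu(softsign * magnitude, k=1)                   # strict upper triangle
chol = chol + np.diag(np.exp((np.diag(raw) + overall_scale) / 2.))
output = chol.T @ chol
assert np.all(np.linalg.eigvalsh(output) > 0)               # positive definite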
def test_cholesky(self): z = random_ops.random_normal([2, 3, 3]) x = (math_ops.matmul(z, array_ops.matrix_transpose(z)) # Ensure pos. def. + linalg_ops.eye(3)) # Ensure well-conditioned. def loop_fn(i): return linalg_ops.cholesky(array_ops.gather(x, i)) self._test_loop_fn(loop_fn, 2)
def _GradWithInverseL(l, l_inverse, grad): middle = math_ops.matmul(l, grad, adjoint_a=True) middle = array_ops.matrix_set_diag(middle, 0.5 * array_ops.matrix_diag_part(middle)) middle = array_ops.matrix_band_part(middle, -1, 0) grad_a = math_ops.matmul( math_ops.matmul(l_inverse, middle, adjoint_a=True), l_inverse) grad_a += math_ops.conj(array_ops.matrix_transpose(grad_a)) return grad_a * 0.5
def TriAngSolveCompositeGrad(l, grad):
  # Gradient is l^{-H} @ ((l^{H} @ grad) * (tril(ones)-1/2*eye)) @ l^{-1}

  # Compute ((l^{H} @ grad) * (tril(ones)-1/2*eye)) = middle
  middle = math_ops.matmul(l, grad, adjoint_a=True)
  middle = array_ops.matrix_set_diag(middle,
                                     0.5 * array_ops.matrix_diag_part(middle))
  middle = array_ops.matrix_band_part(middle, -1, 0)

  # Compute l^{-H} @ middle = z
  l_inverse_middle = linalg_ops.matrix_triangular_solve(l, middle, adjoint=True)

  # We need to compute z @ l^{-1}. With matrix_triangular_solve we actually
  # compute its conjugate transpose, l^{-H} @ z^{H}. Since we symmetrize by
  # adding grad_a^{H} below, we can omit the final conjugate transpose here.
  z_h = math_ops.conj(array_ops.matrix_transpose(l_inverse_middle))
  grad_a = linalg_ops.matrix_triangular_solve(l, z_h, adjoint=True)
  grad_a += math_ops.conj(array_ops.matrix_transpose(grad_a))
  return grad_a * 0.5
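# Minimal NumPy rendering of the closed-form gradient above, real case (an
# illustrative sketch with my own helper name, not the TensorFlow kernel),
# cross-checked against the explicit-inverse form used by _GradWithInverseL.
import numpy as np

def cholesky_grad_np(l, grad):
  middle = np.tril(l.T @ grad)
  middle -= 0.5 * np.diag(np.diag(middle))        # halve the diagonal
  z = np.linalg.solve(l.T, middle)                # l^{-H} @ middle
  grad_a = np.linalg.solve(l.T, z.T)              # (z @ l^{-1})^H, transpose omitted
  return 0.5 * (grad_a + grad_a.T)

rnd = np.random.randn(3, 3)
l = np.linalg.cholesky(rnd @ rnd.T + 3. * np.eye(3))
w = np.random.randn(3, 3)                         # incoming gradient w.r.t. L
middle_ref = np.tril(l.T @ w) - 0.5 * np.diag(np.diag(l.T @ w))
l_inv = np.linalg.inv(l)
grad_ref = l_inv.T @ middle_ref @ l_inv
grad_ref = 0.5 * (grad_ref + grad_ref.T)
np.testing.assert_allclose(cholesky_grad_np(l, w), grad_ref, atol=1e-10)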
def _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs=False, conjugate_rhs=False, name=None): """Helper function used after the input has been cast to compact form.""" diags_rank, rhs_rank = len(diagonals.shape), len(rhs.shape) if diags_rank < 2: raise ValueError( 'Expected diagonals to have rank at least 2, got {}'.format(diags_rank)) if rhs_rank != diags_rank and rhs_rank != diags_rank - 1: raise ValueError('Expected the rank of rhs to be {} or {}, got {}'.format( diags_rank - 1, diags_rank, rhs_rank)) if diagonals.shape[-2] != 3: raise ValueError('Expected 3 diagonals got {}'.format(diagonals.shape[-2])) if not diagonals.shape[:-2].is_compatible_with(rhs.shape[:diags_rank - 2]): raise ValueError('Batch shapes {} and {} are incompatible'.format( diagonals.shape[:-2], rhs.shape[:diags_rank - 2])) def check_num_lhs_matches_num_rhs(): if diagonals.shape[-1] != rhs.shape[-2]: raise ValueError('Expected number of left-hand sided and right-hand ' 'sides to be equal, got {} and {}'.format( diagonals.shape[-1], rhs.shape[-2])) if rhs_rank == diags_rank - 1: # Rhs provided as a vector, ignoring transpose_rhs if conjugate_rhs: rhs = math_ops.conj(rhs) rhs = array_ops.expand_dims(rhs, -1) check_num_lhs_matches_num_rhs() return array_ops.squeeze( linalg_ops.tridiagonal_solve(diagonals, rhs, name), -1) if transpose_rhs: rhs = array_ops.matrix_transpose(rhs, conjugate=conjugate_rhs) elif conjugate_rhs: rhs = math_ops.conj(rhs) check_num_lhs_matches_num_rhs() result = linalg_ops.tridiagonal_solve(diagonals, rhs, name) return array_ops.matrix_transpose(result) if transpose_rhs else result
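# Hedged sketch of the "compact" layout this helper validates (my reading of
# the shape checks above, not an official reference): `diagonals` has shape
# [..., 3, N] holding the superdiagonal, main diagonal and subdiagonal as rows,
# and `rhs` carries the right-hand sides in its trailing dimensions.
import numpy as np

n = 4
superdiag = np.array([2., 2., 2., 0.])   # last entry unused
maindiag = np.array([5., 5., 5., 5.])
subdiag = np.array([0., 1., 1., 1.])     # first entry unused
diagonals = np.stack([superdiag, maindiag, subdiag])    # shape (3, n)

# Dense equivalent of the compact stack, for reference.
dense = (np.diag(maindiag) + np.diag(superdiag[:-1], k=1)
         + np.diag(subdiag[1:], k=-1))
rhs = np.random.randn(n, 2)              # n equations, 2 right-hand sides
x = np.linalg.solve(dense, rhs)
np.testing.assert_allclose(dense @ x, rhs, atol=1e-10)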
def testBatchMatrix(self): matrix_0 = [[1, 2, 3], [4, 5, 6]] matrix_0_t = [[1, 4], [2, 5], [3, 6]] matrix_1 = [[11, 22, 33], [44, 55, 66]] matrix_1_t = [[11, 44], [22, 55], [33, 66]] batch_matrix = [matrix_0, matrix_1] # Shape (2, 2, 3) expected_transposed = [matrix_0_t, matrix_1_t] # Shape (2, 3, 2) with self.test_session(): transposed = array_ops.matrix_transpose(batch_matrix) self.assertEqual((2, 3, 2), transposed.get_shape()) self.assertAllEqual(expected_transposed, transposed.eval())
def testBatchMatrixDynamicallyDefined(self): matrix_0 = [[1, 2, 3], [4, 5, 6]] matrix_0_t = [[1, 4], [2, 5], [3, 6]] matrix_1 = [[11, 22, 33], [44, 55, 66]] matrix_1_t = [[11, 44], [22, 55], [33, 66]] batch_matrix = [matrix_0, matrix_1] # Shape (2, 2, 3) expected_transposed = [matrix_0_t, matrix_1_t] # Shape (2, 3, 2) with self.test_session(): batch_matrix_ph = array_ops.placeholder(dtypes.int32) transposed = array_ops.matrix_transpose(batch_matrix_ph) self.assertAllEqual( expected_transposed, transposed.eval(feed_dict={batch_matrix_ph: batch_matrix}))
def _stddev(self): if (isinstance(self.scale, linalg.LinearOperatorIdentity) or isinstance(self.scale, linalg.LinearOperatorScaledIdentity) or isinstance(self.scale, linalg.LinearOperatorDiag)): return math_ops.abs(self.scale.diag_part()) elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate) and self.scale.is_self_adjoint): return math_ops.sqrt(array_ops.matrix_diag_part( self.scale.apply(self.scale.to_dense()))) else: # TODO(b/35040238): Remove transpose once LinOp supports `transpose`. return math_ops.sqrt(array_ops.matrix_diag_part( self.scale.apply(array_ops.matrix_transpose(self.scale.to_dense()))))
def test_real_hermitian_spectrum_gives_real_symmetric_operator(self): with self.cached_session() as sess: # This is a real and hermitian spectrum. spectrum = [[1., 2., 2.], [3., 4., 4.], [3., 4., 4.]] operator = linalg.LinearOperatorCirculant(spectrum) matrix_tensor = operator.to_dense() self.assertEqual(matrix_tensor.dtype, dtypes.complex64) matrix_t = array_ops.matrix_transpose(matrix_tensor) imag_matrix = math_ops.imag(matrix_tensor) matrix, matrix_transpose, imag_matrix = sess.run( [matrix_tensor, matrix_t, imag_matrix]) np.testing.assert_allclose(0, imag_matrix, atol=1e-6) self.assertAllClose(matrix, matrix_transpose, atol=0)
def _symmetric_projection(self, n):
  """Compute an n x n symmetric projection matrix.

  Args:
    n: dimension.
  Returns:
    An n x n symmetric projection matrix, i.e. a matrix P s.t. P=P*P, P=P^T.
  """
  q = self._orthogonal_matrix(n)
  # randomly zeroing out some columns
  mask = math_ops.cast(random_ops.random_normal([n], seed=self.seed) > 0,
                       self.dtype)
  if self.seed:
    self.seed += 1
  c = math_ops.multiply(q, mask)
  return math_ops.matmul(c, array_ops.matrix_transpose(c))
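# Quick NumPy check of the property documented above (illustration only): with
# Q orthogonal and a 0/1 column mask, P = C C^T for C = Q * mask is a symmetric
# projection, i.e. P = P^T and P = P @ P.
import numpy as np

n = 5
q, _ = np.linalg.qr(np.random.randn(n, n))     # a random orthogonal matrix
mask = (np.random.randn(n) > 0).astype(float)  # randomly zero out some columns
c = q * mask
p = c @ c.T
np.testing.assert_allclose(p, p.T, atol=1e-10)
np.testing.assert_allclose(p, p @ p, atol=1e-10)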
def _updated_mat(self, mat, v, diag): # Get dense matrix defined by its square root, which is an update of `mat`: # A = (mat + v D v^T) (mat + v D v^T)^T # D is the diagonal matrix with `diag` on the diagonal. # If diag is None, then it defaults to the identity matrix, so DV^T = V^T if diag is None: diag_vt = array_ops.matrix_transpose(v) else: diag_mat = array_ops.matrix_diag(diag) diag_vt = math_ops.matmul(diag_mat, v, adjoint_b=True) v_diag_vt = math_ops.matmul(v, diag_vt) sqrt = mat + v_diag_vt a = math_ops.matmul(sqrt, sqrt, adjoint_b=True) return a.eval()
def matrix_adjoint(a, name="matrix_adjoint"): """Transposes last two dimensions of tensor `a`, and takes complex conjugate. If `a` is real valued, the result is equivalent to `matrix_transpose`. For example: ```python # Matrix with no batch dimension. # 'x' is [[1 2 3j] # [4 5 -6j]] tf.matrix_adjoint(x) ==> [[1 4] [2 5] [-3j 6j]] # Matrix with two batch dimensions. # x.shape is [1, 2, 3, 4] # tf.matrix_adjoint(x) is shape [1, 2, 4, 3] ``` Note that `tf.matmul` provides kwargs allowing for adjoint of arguments. This is done with minimal cost, and is preferable to using this function. E.g. ``` # Good! Adjoint is taken at minimal additional cost. tf.matmul(matrix, b, adjoint_b=True) # Inefficient! tf.matmul(matrix, tf.matrix_adjoint(b)) ``` Args: a: A `Tensor` with `rank >= 2`. name: A name for the operation (optional). Returns: A batch matrix `Tensor` with same `dtype` as `a`. Raises: ValueError: If `a` is determined statically to have `rank < 2`. """ with ops.name_scope(name, values=[a]): a = ops.convert_to_tensor(a, name="a") a_transpose = array_ops.matrix_transpose(a) return math_ops.conj(a_transpose)
def _CholeskyGrad(op, grad): """Gradient for Cholesky.""" # Gradient is l^{-H} @ ((l^{H} @ grad) * (tril(ones)-1/2*eye)) @ l^{-1} l = op.outputs[0] num_rows = array_ops.shape(l)[-1] batch_shape = array_ops.shape(l)[:-2] l_inverse = linalg_ops.matrix_triangular_solve( l, linalg_ops.eye(num_rows, batch_shape=batch_shape, dtype=l.dtype)) middle = math_ops.matmul(l, grad, adjoint_a=True) middle = array_ops.matrix_set_diag(middle, 0.5 * array_ops.matrix_diag_part(middle)) middle = array_ops.matrix_band_part(middle, -1, 0) grad_a = math_ops.matmul( math_ops.matmul(l_inverse, middle, adjoint_a=True), l_inverse) grad_a += math_ops.conj(array_ops.matrix_transpose(grad_a)) return grad_a * 0.5
def _SelfAdjointEigV2Grad(op, grad_e, grad_v): """Gradient for SelfAdjointEigV2.""" e = op.outputs[0] compute_v = op.get_attr("compute_v") # a = op.inputs[0], which satisfies # a[...,:,:] * v[...,:,i] = e[...,i] * v[...,i] with ops.control_dependencies([grad_e, grad_v]): if compute_v: v = op.outputs[1] # Construct the matrix f(i,j) = (i != j ? 1 / (e_i - e_j) : 0). # Notice that because of the term involving f, the gradient becomes # infinite (or NaN in practice) when eigenvalues are not unique. # Mathematically this should not be surprising, since for (k-fold) # degenerate eigenvalues, the corresponding eigenvectors are only defined # up to arbitrary rotation in a (k-dimensional) subspace. f = array_ops.matrix_set_diag( math_ops.reciprocal( array_ops.expand_dims(e, -2) - array_ops.expand_dims(e, -1)), array_ops.zeros_like(e)) grad_a = math_ops.matmul( v, math_ops.matmul( array_ops.matrix_diag(grad_e) + f * math_ops.matmul(v, grad_v, adjoint_a=True), v, adjoint_b=True)) else: _, v = linalg_ops.self_adjoint_eig(op.inputs[0]) grad_a = math_ops.matmul(v, math_ops.matmul( array_ops.matrix_diag(grad_e), v, adjoint_b=True)) # The forward op only depends on the lower triangular part of a, so here we # symmetrize and take the lower triangle grad_a = array_ops.matrix_band_part( grad_a + math_ops.conj(array_ops.matrix_transpose(grad_a)), -1, 0) grad_a = array_ops.matrix_set_diag(grad_a, 0.5 * array_ops.matrix_diag_part(grad_a)) return grad_a
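# Small NumPy illustration (a sketch, not the registered gradient) of the f
# matrix built above: reciprocal differences of eigenvalues off the diagonal,
# zeros on it. With repeated eigenvalues these entries blow up, which is the
# instability the comment warns about.
import numpy as np

e = np.array([1., 2., 4.])
with np.errstate(divide='ignore'):
  f = 1. / (e[None, :] - e[:, None])   # mirrors expand_dims(e, -2) - expand_dims(e, -1)
np.fill_diagonal(f, 0.)                # matrix_set_diag(..., zeros_like(e))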
def _Overdetermined(op, grad): """Gradients for the overdetermined case of MatrixSolveLs. This is the backprop for the solution to the normal equations of the first kind: X = F(A, B) = (A^T * A + lambda * I)^{-1} * A^T * B which solve the least squares problem min ||A * X - B||_F^2 + lambda ||X||_F^2. """ a = op.inputs[0] b = op.inputs[1] x = op.outputs[0] l2_regularizer = math_ops.cast(op.inputs[2], a.dtype.base_dtype) # pylint: disable=protected-access chol = linalg_ops._RegularizedGramianCholesky( a, l2_regularizer=l2_regularizer, first_kind=True) # pylint: enable=protected-access # Temporary z = (A^T * A + lambda * I)^{-1} * grad. z = linalg_ops.cholesky_solve(chol, grad) xzt = math_ops.matmul(x, z, adjoint_b=True) zx_sym = xzt + array_ops.matrix_transpose(xzt) grad_a = -math_ops.matmul(a, zx_sym) + math_ops.matmul(b, z, adjoint_b=True) grad_b = math_ops.matmul(a, z) return (grad_a, grad_b, None)
def _batch_matmul(self, x, transpose_x=False): if transpose_x: x = array_ops.matrix_transpose(x) self._check_x(x) return x
def _Adjoint(x): return math_ops.conj(array_ops.matrix_transpose(x))
def _batch_matmul(self, x, transpose_x=False): if transpose_x: x = array_ops.matrix_transpose(x) diag_mat = array_ops.expand_dims(self._diag, -1) return math_ops.square(diag_mat) * x
def _SvdGrad(op, grad_s, grad_u, grad_v): """Gradient for Svd based on Giles' algorithm. Reference at top of file.""" if op.get_attr("compute_uv") and not op.get_attr("full_matrices"): raise NotImplementedError( "SVD gradient is not implemented for compute_uv=True and " "full_matrices=False.") a = op.inputs[0] a_shape = a.get_shape().with_rank_at_least(2) if op.get_attr("compute_uv"): # TODO(rmlarsen): Make this work with complex types. if a.dtype.is_complex: raise NotImplementedError( "SVD gradient is not implemented for complex types and " "compute_uv=True.") grad_u_shape = grad_u.get_shape().with_rank_at_least(2) grad_v_shape = grad_v.get_shape().with_rank_at_least(2) m = a_shape[-2].merge_with(grad_u_shape[-2]) n = a_shape[-1].merge_with(grad_v_shape[-2]) batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with( grad_v_shape[:-2]) a_shape = batch_shape.concatenate([m, n]) m = a_shape[-2].value n = a_shape[-1].value # TODO(rmlarsen): Make this work with placeholders. if m is None or n is None: raise NotImplementedError( "SVD gradient has not been implemented for input with unknown " "inner matrix shape.") if not op.get_attr("full_matrices") or not op.get_attr("compute_uv"): s, u, v = linalg_ops.svd(a, compute_uv=True, full_matrices=True) else: s = op.outputs[0] u = op.outputs[1] v = op.outputs[2] use_adjoint = False if m > n: # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the # Hermitian transpose of the gradient at the end. use_adjoint = True m, n = n, m u, v = v, u grad_u, grad_v = grad_v, grad_u with ops.control_dependencies([grad_s, grad_u, grad_v]): grad_s_mat = array_ops.matrix_diag(grad_s) if not op.get_attr("compute_uv"): if use_adjoint: grad_a = math_ops.matmul(v[..., :, :m], math_ops.matmul(u, grad_s_mat), adjoint_b=True) else: grad_a = math_ops.matmul( u, math_ops.matmul(grad_s_mat, v[..., :, :m], adjoint_b=True)) grad_a.set_shape(a_shape) return grad_a # TODO(rmlarsen): Define a gradient that is numerically stable for # abs(m-n) > 1. Currently this does not work because there are effectively # multiple singular values with value zero. I am not sure if this is a true # instability or if it simply throws off the finite difference gradient # checker. if abs(m - n) > 1: raise NotImplementedError( "svd gradient is not implemented for abs(m - n) > 1") s_mat = array_ops.matrix_diag(s) s2 = math_ops.square(s) # NOTICE: Because of the term involving f, the gradient becomes # infinite (or NaN in practice) when singular values are not unique. # Mathematically this should not be surprising, since for (k-fold) # degenerate singular values, the corresponding singular vectors are # only defined up a (k-dimensional) subspace. In practice, this can # lead to numerical instability when singular values are close but not # exactly equal. 
f = array_ops.matrix_set_diag( math_ops.reciprocal( array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)), array_ops.zeros_like(s)) s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s)) u_gu = math_ops.matmul(u, grad_u, adjoint_a=True) v_gv = math_ops.matmul(v, grad_v, adjoint_a=True) if m == n: f_u = f * u_gu f_v = f * v_gv else: dv2 = array_ops.matrix_transpose( v_gv[..., m:n, :m]) - v_gv[..., :m, m:n] f_u = f * u_gu f_v = f * v_gv[..., :m, :m] grad_a_nouv = (grad_s_mat + math_ops.matmul(f_u + _linalg.adjoint(f_u), s_mat) + math_ops.matmul(s_mat, f_v + _linalg.adjoint(f_v))) if m != n: grad_a_nouv = array_ops.concat( [grad_a_nouv, math_ops.matmul(s_inv_mat, dv2)], -1) if use_adjoint: # Use (U X V^H)^H = V (U X)^H. grad_a = math_ops.matmul(v, math_ops.matmul(u, grad_a_nouv), adjoint_b=True) else: grad_a = math_ops.matmul( u, math_ops.matmul(grad_a_nouv, v, adjoint_b=True)) grad_a.set_shape(a_shape) return grad_a
def _batch_sqrt_matmul(self, x, transpose_x=False): if transpose_x: x = array_ops.matrix_transpose(x) diag_mat = array_ops.expand_dims(self._diag, -1) return diag_mat * x
def do_filter(self, estimated_state, estimated_state_covariance, predicted_observation, predicted_observation_covariance, observation, observation_model, observation_noise): """Convenience function for scoring predictions. Scores a prediction against an observation, and computes the updated posterior over states. Shapes given below for arguments are for single-model Kalman filtering (e.g. KalmanFilter). For ensembles, prior_state and prior_state_var are same-length tuples of values corresponding to each model. Args: estimated_state: A prior mean over states [batch size x state dimension] estimated_state_covariance: Covariance of state prior [batch size x D x D], with D depending on the Kalman filter implementation (typically the state dimension). predicted_observation: A prediction for the observed value, such as that returned by observed_from_state. A [batch size x num features] Tensor. predicted_observation_covariance: A covariance matrix corresponding to `predicted_observation`, a [batch size x num features x num features] Tensor. observation: The observed value corresponding to the predictions given [batch size x observation dimension] observation_model: The [batch size x observation dimension x model state dimension] Tensor indicating how a particular state is mapped to (pre-noise) observations for each part of the batch. observation_noise: A [batch size x observation dimension x observation dimension] Tensor or [observation dimension x observation dimension] Tensor with covariance matrices to use for each part of the batch (a two-dimensional input will be broadcast). Returns: posterior_state, posterior_state_var: Posterior mean and covariance, updated versions of prior_state and prior_state_var. log_prediction_prob: Log probability of the observations under the priors, suitable for optimization (should be maximized). """ symmetrized_observation_covariance = 0.5 * ( predicted_observation_covariance + array_ops.matrix_transpose(predicted_observation_covariance)) instability_message = ( "This may occur due to numerically unstable filtering when there is " "a large difference in posterior variances, or when inferences are " "near-deterministic. Considering tuning the " "'filtering_maximum_posterior_variance_ratio' or " "'filtering_minimum_posterior_variance' parameters in your " "StateSpaceModelConfiguration, or tuning the transition matrix.") symmetrized_observation_covariance = numerics.verify_tensor_all_finite( symmetrized_observation_covariance, "Predicted observation covariance was not finite. {}".format( instability_message)) diag = array_ops.matrix_diag_part(symmetrized_observation_covariance) min_diag = math_ops.reduce_min(diag) non_negative_assert = control_flow_ops.Assert( min_diag >= 0., [("The predicted observation covariance " "has a negative diagonal entry. {}").format(instability_message), min_diag]) with ops.control_dependencies([non_negative_assert]): observation_covariance_cholesky = linalg_ops.cholesky( symmetrized_observation_covariance) log_prediction_prob = distributions.MultivariateNormalTriL( predicted_observation, observation_covariance_cholesky).log_prob(observation) (posterior_state, posterior_state_var) = self.posterior_from_prior_state( prior_state=estimated_state, prior_state_var=estimated_state_covariance, observation=observation, observation_model=observation_model, predicted_observations=(predicted_observation, predicted_observation_covariance), observation_noise=observation_noise) return (posterior_state, posterior_state_var, log_prediction_prob)
def _matmul(self, x, adjoint=False, adjoint_arg=False): # Here we heavily rely on Roth's column Lemma [1]: # (A x B) * vec X = vec BXA^T, # where vec stacks all the columns of the matrix under each other. In our # case, x represents a batch of vec X (i.e. we think of x as a batch of # column vectors, rather than a matrix). Each member of the batch can be # reshaped to a matrix (hence we get a batch of matrices). # We can iteratively apply this lemma by noting that if B is a Kronecker # product, then we can apply the lemma again. # [1] W. E. Roth, "On direct product matrices," # Bulletin of the American Mathematical Society, vol. 40, pp. 461-468, # 1934 # Efficiency # Naively doing the Kronecker product, by calculating the dense matrix and # applying it will can take cubic time in the size of domain_dimension # (assuming a square matrix). The other issue is that calculating the dense # matrix can be prohibitively expensive, in that it can take a large amount # of memory. # # This implementation avoids this memory blow up by only computing matmuls # with the factors. In this way, we don't have to realize the dense matrix. # In terms of complexity, if we have Kronecker Factors of size: # (n1, n1), (n2, n2), (n3, n3), ... (nJ, nJ), with N = \prod n_i, and we # have as input a [N, M] matrix, the naive approach would take O(N^2 M). # With this approach (ignoring reshaping of tensors and transposes for now), # the time complexity can be O(M * (\sum n_i) * N). There is also the # benefit of batched multiplication (In this example, the batch size is # roughly M * N) so this can be much faster. However, not factored in are # the costs of the several transposing of tensors, which can affect cache # behavior. # Below we document the shape manipulation for adjoint=False, # adjoint_arg=False, but the general case of different adjoints is still # handled. if adjoint_arg: x = linalg.adjoint(x) # Always add a batch dimension to enable broadcasting to work. batch_shape = array_ops.concat( [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0) x += array_ops.zeros(batch_shape, dtype=x.dtype.base_dtype) # x has shape [B, R, C], where B represent some number of batch dimensions, # R represents the number of rows, and C represents the number of columns. # In order to apply Roth's column lemma, we need to operate on a batch of # column vectors, so we reshape into a batch of column vectors. We put it # at the front to ensure that broadcasting between operators to the batch # dimensions B still works. output = _rotate_last_dim(x, rotate_right=True) # Also expand the shape to be [A, C, B, R]. The first dimension will be # used to accumulate dimensions from each operator matmul. output = output[array_ops.newaxis, ...] # In this loop, A is going to refer to the value of the accumulated # dimension. A = 1 at the start, and will end up being self.range_dimension. # V will refer to the last dimension. V = R at the start, and will end up # being 1 in the end. for operator in self.operators[:-1]: # Reshape output from [A, C, B, V] to be # [A, C, B, V / op.domain_dimension, op.domain_dimension] if adjoint: operator_dimension = operator.range_dimension_tensor() else: operator_dimension = operator.domain_dimension_tensor() output = _unvec_by(output, operator_dimension) # We are computing (XA^T) = (AX^T)^T. 
# output has [A, C, B, V / op.domain_dimension, op.domain_dimension], # which is being converted to: # [A, C, B, V / op.domain_dimension, op.range_dimension] output = array_ops.matrix_transpose(output) output = operator.matmul(output, adjoint=adjoint, adjoint_arg=False) output = array_ops.matrix_transpose(output) # Rearrange it to [A * op.range_dimension, C, B, V / op.domain_dimension] output = _rotate_last_dim(output, rotate_right=False) output = _vec(output) output = _rotate_last_dim(output, rotate_right=True) # After the loop, we will have # A = self.range_dimension / op[-1].range_dimension # V = op[-1].domain_dimension # We convert that using matvec to get: # [A, C, B, op[-1].range_dimension] output = self.operators[-1].matvec(output, adjoint=adjoint) # Rearrange shape to be [B1, ... Bn, self.range_dimension, C] output = _rotate_last_dim(output, rotate_right=False) output = _vec(output) output = _rotate_last_dim(output, rotate_right=False) if x.shape.is_fully_defined(): column_dim = x.shape[-1] broadcast_batch_shape = common_shapes.broadcast_shape( x.shape[:-2], self.batch_shape) if adjoint: matrix_dimensions = [self.domain_dimension, column_dim] else: matrix_dimensions = [self.range_dimension, column_dim] output.set_shape( broadcast_batch_shape.concatenate(matrix_dimensions)) return output
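# NumPy check of Roth's column lemma, the identity the matmul above builds on
# (illustrative only, using the column-stacking "vec" convention of the _vec
# helper below): (A kron B) @ vec(X) == vec(B @ X @ A^T).
import numpy as np

a = np.random.randn(3, 3)
b = np.random.randn(2, 2)
x = np.random.randn(2, 3)
vec = lambda m: m.T.reshape(-1)                 # stack columns
lhs = np.kron(a, b) @ vec(x)
rhs = vec(b @ x @ a.T)
np.testing.assert_allclose(lhs, rhs, atol=1e-10)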
def _vec(x):
  """Stacks the columns of a matrix to form a single column."""
  return array_ops.reshape(
      array_ops.matrix_transpose(x),
      array_ops.concat([array_ops.shape(x)[:-2], [-1]], axis=0))
def _solve(self, rhs, adjoint=False, adjoint_arg=False): # Here we follow the same use of Roth's column lemma as in `matmul`, with # the key difference that we replace all `matmul` instances with `solve`. # This follows from the property that inv(A x B) = inv(A) x inv(B). # Below we document the shape manipulation for adjoint=False, # adjoint_arg=False, but the general case of different adjoints is still # handled. if adjoint_arg: rhs = linalg.adjoint(rhs) # Always add a batch dimension to enable broadcasting to work. batch_shape = array_ops.concat( [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0) rhs += array_ops.zeros(batch_shape, dtype=rhs.dtype.base_dtype) # rhs has shape [B, R, C], where B represent some number of batch # dimensions, # R represents the number of rows, and C represents the number of columns. # In order to apply Roth's column lemma, we need to operate on a batch of # column vectors, so we reshape into a batch of column vectors. We put it # at the front to ensure that broadcasting between operators to the batch # dimensions B still works. output = _rotate_last_dim(rhs, rotate_right=True) # Also expand the shape to be [A, C, B, R]. The first dimension will be # used to accumulate dimensions from each operator matmul. output = output[array_ops.newaxis, ...] # In this loop, A is going to refer to the value of the accumulated # dimension. A = 1 at the start, and will end up being self.range_dimension. # V will refer to the last dimension. V = R at the start, and will end up # being 1 in the end. for operator in self.operators[:-1]: # Reshape output from [A, C, B, V] to be # [A, C, B, V / op.domain_dimension, op.domain_dimension] if adjoint: operator_dimension = operator.range_dimension_tensor() else: operator_dimension = operator.domain_dimension_tensor() output = _unvec_by(output, operator_dimension) # We are computing (XA^-1^T) = (A^-1 X^T)^T. # output has [A, C, B, V / op.domain_dimension, op.domain_dimension], # which is being converted to: # [A, C, B, V / op.domain_dimension, op.range_dimension] output = array_ops.matrix_transpose(output) output = operator.solve(output, adjoint=adjoint, adjoint_arg=False) output = array_ops.matrix_transpose(output) # Rearrange it to [A * op.range_dimension, C, B, V / op.domain_dimension] output = _rotate_last_dim(output, rotate_right=False) output = _vec(output) output = _rotate_last_dim(output, rotate_right=True) # After the loop, we will have # A = self.range_dimension / op[-1].range_dimension # V = op[-1].domain_dimension # We convert that using matvec to get: # [A, C, B, op[-1].range_dimension] output = self.operators[-1].solvevec(output, adjoint=adjoint) # Rearrange shape to be [B1, ... Bn, self.range_dimension, C] output = _rotate_last_dim(output, rotate_right=False) output = _vec(output) output = _rotate_last_dim(output, rotate_right=False) if rhs.shape.is_fully_defined(): column_dim = rhs.shape[-1] broadcast_batch_shape = common_shapes.broadcast_shape( rhs.shape[:-2], self.batch_shape) if adjoint: matrix_dimensions = [self.domain_dimension, column_dim] else: matrix_dimensions = [self.range_dimension, column_dim] output.set_shape( broadcast_batch_shape.concatenate(matrix_dimensions)) return output
def testTensorWithStaticRankLessThanTwoRaisesBecauseNotAMatrix(self): vector = [1, 2, 3] with self.test_session(): with self.assertRaisesRegexp(ValueError, "should be a "): array_ops.matrix_transpose(vector)
def _reshape_for_efficiency(a,
                            b,
                            transpose_a=False,
                            transpose_b=False,
                            adjoint_a=False,
                            adjoint_b=False):
  """Maybe reshape a, b, and return an inverse map.  For matmul/solve."""
  def identity(x):
    return x

  # At this point, we have not taken transpose/adjoint of a/b.
  still_need_to_transpose = True

  if a.shape.ndims is None or b.shape.ndims is None:
    return a, b, identity, still_need_to_transpose

  # This could be handled in the future, but seems less common.
  if a.shape.ndims >= b.shape.ndims:
    return a, b, identity, still_need_to_transpose

  # From now on, we might modify b, but will not modify a.

  # Suppose:
  #   a.shape =     C + [m, n],
  #   b.shape = S + C + [n, r]
  b_extra_ndims = b.shape.ndims - a.shape.ndims

  # b_extra_sh = S, b_main_sh = C + [n, r]
  b_extra_sh = array_ops.shape(b)[:b_extra_ndims]
  b_main_sh = array_ops.shape(b)[b_extra_ndims:]

  # No reason to flip unless the extra dims of b are big enough.  Why?
  # Assume adjoint/transpose = False.  Then...
  # By not flipping, we have to replicate a to shape
  #   b_extra_sh + a.shape,
  # which could use extra memory.  But in all cases, the final output has shape
  #   b_extra_sh + a.shape[:-1] + [b.shape[-1]]
  # So we only end up creating a larger object if the end dim of b is smaller
  # than the end dim of a.  This often happens, e.g. if b was a vector that was
  # expanded to a matrix (by appending a singleton).

  # Since adjoint/transpose may not be False, we must make adjustments here.
  # The dim of b that holds the multiple equations.
  a_domain_sz_ = a.shape[-2 if adjoint_a or transpose_a else -1]
  b_eq_sz_ = b.shape[-2 if adjoint_b or transpose_b else -1]
  b_extra_sz_ = (
      np.prod(b.shape[:b_extra_ndims].as_list())
      if b.shape[:b_extra_ndims].is_fully_defined() else None)
  if (a_domain_sz_ is not None and b_eq_sz_ is not None and
      b_extra_sz_ is not None):
    if b_extra_sz_ < 2 or a_domain_sz_ <= b_eq_sz_:
      return a, b, identity, still_need_to_transpose

  # At this point, we're flipping for sure!
  # Any transposes/adjoints will happen here explicitly, rather than in calling
  # code.  Why?  To avoid having to write separate complex code for each case.
  if adjoint_a:
    a = array_ops.matrix_transpose(a, conjugate=True)
  elif transpose_a:
    a = array_ops.matrix_transpose(a, conjugate=False)
  if adjoint_b:
    b = array_ops.matrix_transpose(b, conjugate=True)
  elif transpose_b:
    b = array_ops.matrix_transpose(b, conjugate=False)
  still_need_to_transpose = False

  # Recompute shapes, since the transpose/adjoint may have changed them.
  b_extra_sh = array_ops.shape(b)[:b_extra_ndims]
  b_main_sh = array_ops.shape(b)[b_extra_ndims:]

  # Permutation to put the extra dims at the end.
  perm = (
      np.concatenate(
          (np.arange(b_extra_ndims, b.shape.ndims),
           np.arange(0, b_extra_ndims)), 0))
  b_extra_on_end = array_ops.transpose(b, perm=perm)

  # Now squash this end into one long dim.
  b_squashed_end = array_ops.reshape(
      b_extra_on_end, array_ops.concat((b_main_sh[:-1], [-1]), 0))

  def reshape_inv(y):
    # Expand the extra dims hanging off the end, "b_extra_sh".
    # Note we use y_sh[:-1] + [b_main_sh[-1]] rather than b_main_sh, because y
    # could have different batch dims than a and b, because of broadcasting.
    y_extra_shape = array_ops.concat(
        (array_ops.shape(y)[:-1], [b_main_sh[-1]], b_extra_sh), 0)
    y_extra_on_end = array_ops.reshape(y, y_extra_shape)
    inverse_perm = np.argsort(perm)
    return array_ops.transpose(y_extra_on_end, perm=inverse_perm)

  return a, b_squashed_end, reshape_inv, still_need_to_transpose
def __init__(self, loc=None, covariance_matrix=None, validate_args=False, allow_nan_stats=True, name="MultivariateNormalFullCovariance"): """Construct Multivariate Normal distribution on `R^k`. The `batch_shape` is the broadcast shape between `loc` and `covariance_matrix` arguments. The `event_shape` is given by last dimension of the matrix implied by `covariance_matrix`. The last dimension of `loc` (if provided) must broadcast with this. A non-batch `covariance_matrix` matrix is a `k x k` symmetric positive definite matrix. In other words it is (real) symmetric with all eigenvalues strictly positive. Additional leading dimensions (if any) will index batches. Args: loc: Floating-point `Tensor`. If this is set to `None`, `loc` is implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where `b >= 0` and `k` is the event size. covariance_matrix: Floating-point, symmetric positive definite `Tensor` of same `dtype` as `loc`. The strict upper triangle of `covariance_matrix` is ignored, so if `covariance_matrix` is not symmetric no error will be raised (unless `validate_args is True`). `covariance_matrix` has shape `[B1, ..., Bb, k, k]` where `b >= 0` and `k` is the event size. validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. Raises: ValueError: if neither `loc` nor `covariance_matrix` are specified. """ parameters = dict(locals()) # Convert the covariance_matrix up to a scale_tril and call MVNTriL. with ops.name_scope(name) as name: with ops.name_scope("init", values=[loc, covariance_matrix]): if covariance_matrix is None: scale_tril = None else: covariance_matrix = ops.convert_to_tensor( covariance_matrix, name="covariance_matrix") if validate_args: covariance_matrix = control_flow_ops.with_dependencies( [ check_ops.assert_near( covariance_matrix, array_ops.matrix_transpose( covariance_matrix), message="Matrix was not symmetric") ], covariance_matrix) # No need to validate that covariance_matrix is non-singular. # LinearOperatorLowerTriangular has an assert_non_singular method that # is called by the Bijector. # However, cholesky() ignores the upper triangular part, so we do need # to separately assert symmetric. scale_tril = linalg_ops.cholesky(covariance_matrix) super(MultivariateNormalFullCovariance, self).__init__(loc=loc, scale_tril=scale_tril, validate_args=validate_args, allow_nan_stats=allow_nan_stats, name=name) self._parameters = parameters
def random_normal_correlated_columns(shape, mean=0.0, stddev=1.0, dtype=dtypes.float32, eps=1e-4, seed=None): """Batch matrix with (possibly complex) Gaussian entries and correlated cols. Returns random batch matrix `A` with specified element-wise `mean`, `stddev`, living close to an embedded hyperplane. Suppose `shape[-2:] = (M, N)`. If `M < N`, `A` is a random `M x N` [batch] matrix with iid Gaussian entries. If `M >= N`, then the colums of `A` will be made almost dependent as follows: ``` L = random normal N x N-1 matrix, mean = 0, stddev = 1 / sqrt(N - 1) B = random normal M x N-1 matrix, mean = 0, stddev = stddev. G = (L B^H)^H, a random normal M x N matrix, living on N-1 dim hyperplane E = a random normal M x N matrix, mean = 0, stddev = eps mu = a constant M x N matrix, equal to the argument "mean" A = G + E + mu ``` Args: shape: Python list of integers. Shape of the returned tensor. Must be at least length two. mean: `Tensor` giving mean of normal to sample from. stddev: `Tensor` giving stdev of normal to sample from. dtype: `TensorFlow` `dtype` or numpy dtype eps: Distance each column is perturbed from the low-dimensional subspace. seed: Python integer seed for the RNG. Returns: `Tensor` with desired shape and dtype. Raises: ValueError: If `shape` is not at least length 2. """ dtype = dtypes.as_dtype(dtype) if len(shape) < 2: raise ValueError( "Argument shape must be at least length 2. Found: %s" % shape) # Shape is the final shape, e.g. [..., M, N] shape = list(shape) batch_shape = shape[:-2] m, n = shape[-2:] # If there is only one column, "they" are by definition correlated. if n < 2 or n < m: return random_normal(shape, mean=mean, stddev=stddev, dtype=dtype, seed=seed) # Shape of the matrix with only n - 1 columns that we will embed in higher # dimensional space. smaller_shape = batch_shape + [m, n - 1] # Shape of the embedding matrix, mapping batch matrices # from [..., N-1, M] to [..., N, M] embedding_mat_shape = batch_shape + [n, n - 1] # This stddev for the embedding_mat ensures final result has correct stddev. stddev_mat = 1 / np.sqrt(n - 1) with ops.name_scope("random_normal_correlated_columns"): smaller_mat = random_normal(smaller_shape, mean=0.0, stddev=stddev_mat, dtype=dtype, seed=seed) if seed is not None: seed += 1287 embedding_mat = random_normal(embedding_mat_shape, dtype=dtype, seed=seed) embedded_t = math_ops.matmul(embedding_mat, smaller_mat, transpose_b=True) embedded = array_ops.matrix_transpose(embedded_t) mean_mat = array_ops.ones_like(embedded) * mean return embedded + random_normal(shape, stddev=eps, dtype=dtype) + mean_mat
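# Hedged NumPy sketch of the construction in the docstring above (names are
# mine, real case only): the columns of G = (L B^H)^H = B L^H live in an
# (N-1)-dimensional subspace, so adding a small eps perturbation gives nearly
# dependent columns.
import numpy as np

m, n, eps = 5, 4, 1e-4
l = np.random.randn(n, n - 1) / np.sqrt(n - 1)
b = np.random.randn(m, n - 1)
g = b @ l.T                                   # (L B^H)^H for real matrices
a = g + eps * np.random.randn(m, n)
# Rank of G is at most N - 1, so its smallest singular value is zero and A is
# close to rank deficient (smallest singular value on the order of eps).
np.testing.assert_allclose(np.linalg.svd(g, compute_uv=False)[-1], 0., atol=1e-10)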
def _SvdGrad(op, grad_s, grad_u, grad_v): """Gradient for the singular value decomposition.""" # The derivation for the compute_uv=False case, and most of # the derivation for the full_matrices=True case, are in # Giles' paper (see reference at top of file). A derivation for # the full_matrices=False case is available at # https://j-towns.github.io/papers/svd-derivative.pdf # The derivation for complex valued SVD can be found in # https://re-ra.xyz/misc/complexsvd.pdf or # https://giggleliu.github.io/2019/04/02/einsumbp.html a = op.inputs[0] a_shape = a.get_shape().with_rank_at_least(2) grad_s = math_ops.cast(grad_s, a.dtype) grad_s_mat = array_ops.matrix_diag(grad_s) if not op.get_attr("compute_uv"): s, u, v = linalg_ops.svd(a, compute_uv=True) grad_a = math_ops.matmul( u, math_ops.matmul(grad_s_mat, v, adjoint_b=True)) grad_a.set_shape(a_shape) return grad_a full_matrices = op.get_attr("full_matrices") grad_u_shape = grad_u.get_shape().with_rank_at_least(2) grad_v_shape = grad_v.get_shape().with_rank_at_least(2) m = a_shape.dims[-2].merge_with(grad_u_shape[-2]) n = a_shape.dims[-1].merge_with(grad_v_shape[-2]) batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with( grad_v_shape[:-2]) a_shape = batch_shape.concatenate([m, n]) m = a_shape.dims[-2].value n = a_shape.dims[-1].value # TODO(rmlarsen): Make this work with placeholders. if m is None or n is None: raise NotImplementedError( "SVD gradient has not been implemented for input with unknown " "inner matrix shape.") s = op.outputs[0] u = op.outputs[1] v = op.outputs[2] s = math_ops.cast(s, a.dtype) use_adjoint = False if m > n: # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the # Hermitian transpose of the gradient at the end. use_adjoint = True m, n = n, m u, v = v, u grad_u, grad_v = grad_v, grad_u with ops.control_dependencies([grad_s, grad_u, grad_v]): if full_matrices and abs(m - n) > 1: raise NotImplementedError( "svd gradient is not implemented for abs(m - n) > 1 " "when full_matrices is True") s_mat = array_ops.matrix_diag(s) s2 = math_ops.square(s) # NOTICE: Because of the term involving f, the gradient becomes # infinite (or NaN in practice) when singular values are not unique. # Mathematically this should not be surprising, since for (k-fold) # degenerate singular values, the corresponding singular vectors are # only defined up a (k-dimensional) subspace. In practice, this can # lead to numerical instability when singular values are close but not # exactly equal. 
s_shape = array_ops.shape(s) f = array_ops.matrix_set_diag( _SafeReciprocal( array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)), array_ops.zeros_like(s)) s_inv_mat = array_ops.matrix_diag(_SafeReciprocal(s)) v1 = v[..., :, :m] grad_v1 = grad_v[..., :, :m] u_gu = math_ops.matmul(u, grad_u, adjoint_a=True) v_gv = math_ops.matmul(v1, grad_v1, adjoint_a=True) f_u = f * u_gu f_v = f * v_gv term1_nouv = (grad_s_mat + math_ops.matmul(f_u + _linalg.adjoint(f_u), s_mat) + math_ops.matmul(s_mat, f_v + _linalg.adjoint(f_v))) term1 = math_ops.matmul( u, math_ops.matmul(term1_nouv, v1, adjoint_b=True)) if m == n: grad_a_before_transpose = term1 else: gv1t = array_ops.matrix_transpose(grad_v1, conjugate=True) gv1t_v1 = math_ops.matmul(gv1t, v1) term2_nous = gv1t - math_ops.matmul(gv1t_v1, v1, adjoint_b=True) if full_matrices: v2 = v[..., :, m:n] grad_v2 = grad_v[..., :, m:n] v1t_gv2 = math_ops.matmul(v1, grad_v2, adjoint_a=True) term2_nous -= math_ops.matmul(v1t_gv2, v2, adjoint_b=True) u_s_inv = math_ops.matmul(u, s_inv_mat) term2 = math_ops.matmul(u_s_inv, term2_nous) grad_a_before_transpose = term1 + term2 if a.dtype.is_complex: eye = _linalg.eye(s_shape[-1], batch_shape=s_shape[:-1], dtype=a.dtype) l = eye * v_gv term3_nouv = math_ops.matmul(s_inv_mat, _linalg.adjoint(l) - l) term3 = 1 / 2. * math_ops.matmul( u, math_ops.matmul(term3_nouv, v1, adjoint_b=True)) grad_a_before_transpose += term3 if use_adjoint: grad_a = array_ops.matrix_transpose(grad_a_before_transpose, conjugate=True) else: grad_a = grad_a_before_transpose grad_a.set_shape(a_shape) return grad_a
def _MatrixSquareRootGrad(op, grad): """Gradient for MatrixSquareRoot.""" # Let A be an m x m square matrix (or batch of matrices) # Let R = sqrtm(A) # By definition, A = RR # Take the differential: dA = d(RR) = RdR + dRR # Solve the resulting Sylvester equation for dR # Used to find Kronecker products within the Sylvester equation def _KroneckerProduct(b1, b2): """Computes the Kronecker product of two batches of square matrices.""" b1_shape = array_ops.shape(b1) b2_shape = array_ops.shape(b2) b1_order = b1_shape[-1] b2_order = b2_shape[-1] shape_slice_size = [math_ops.subtract(array_ops.size(b1_shape), 2)] shape_slice = array_ops.slice( b1_shape, [0], shape_slice_size) # Same for both batches b1_reshape_shape = array_ops.concat( [shape_slice, [b1_order], [1], [b1_order], [1]], 0) b2_reshape_shape = array_ops.concat( [shape_slice, [1], [b2_order], [1], [b2_order]], 0) b1_reshape = array_ops.reshape(b1, b1_reshape_shape) b2_reshape = array_ops.reshape(b2, b2_reshape_shape) order_prod = b1_order * b2_order kprod_shape = array_ops.concat( [shape_slice, [order_prod], [order_prod]], 0) return array_ops.reshape(b1_reshape * b2_reshape, kprod_shape) sqrtm = op.outputs[0] # R shape = array_ops.shape(sqrtm) order = shape[-1] # m matrix_count = math_ops.reduce_prod(shape[0:-2]) # Get batch of m x m identity matrices eye = linalg_ops.eye(order, dtype=sqrtm.dtype) # m x m identity matrix eye_flat = array_ops.reshape(eye, [-1]) eye_tiled = array_ops.tile(eye_flat, [matrix_count]) eye_batch = array_ops.reshape(eye_tiled, shape) # The transpose of R is taken in the k1 term instead of k2 in # order to prevent redundant transposition of R (i.e. (R')' = R) sqrtm_transpose = array_ops.matrix_transpose(sqrtm) k1 = _KroneckerProduct(eye_batch, sqrtm_transpose) k2 = _KroneckerProduct(sqrtm, eye_batch) ksum = math_ops.add(k1, k2) # Vectorize dA shape_slice_size = [math_ops.subtract(array_ops.size(shape), 2)] shape_slice = array_ops.slice(shape, [0], shape_slice_size) shape_vec_da = array_ops.concat([shape_slice, [order * order], [1]], 0) vec_da = array_ops.reshape(array_ops.matrix_transpose(grad), shape_vec_da) # Solve for vec(dR) vec_dsqrtm = linalg_ops.matrix_solve(ksum, vec_da) # Solve for dR by inverse vectorizing vec(dR) dsqrtm_transpose = array_ops.reshape(vec_dsqrtm, shape) return array_ops.matrix_transpose(dsqrtm_transpose)
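# NumPy sanity check of the Sylvester-equation step above (an illustration of
# the identity under the column-stacking vec convention; the kernel's own
# vectorization and transpose conventions differ slightly): with A = R @ R, the
# differential satisfies dA = R dR + dR R, i.e.
# (I kron R + R^T kron I) vec(dR) = vec(dA).
import numpy as np

n = 3
z = np.random.randn(n, n)
a = z @ z.T + n * np.eye(n)        # SPD, so a real square root exists
w, v = np.linalg.eigh(a)
r = (v * np.sqrt(w)) @ v.T         # principal square root, R @ R == a
dr = np.random.randn(n, n)
da = r @ dr + dr @ r
vec = lambda m: m.T.reshape(-1)
ksum = np.kron(np.eye(n), r) + np.kron(r.T, np.eye(n))
np.testing.assert_allclose(ksum @ vec(dr), vec(da), atol=1e-8)
# Solving ksum @ vec(dR) = vec(dA) recovers dR, which is what the gradient does.
np.testing.assert_allclose(
    np.linalg.solve(ksum, vec(da)).reshape(n, n).T, dr, atol=1e-8)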
def lanczos_bidiag(operator, k, orthogonalize=True, starting_vector=None, name="lanczos_bidiag"): """Computes a Lanczos bidiagonalization for a linear operator. Computes matrices `U` of shape `[m, k+1]`, `V` of shape `[n, k]` and lower bidiagonal matrix `B` of shape `[k+1, k]`, that satisfy the equations `A * V = U * B` and `A' * U[:, :-1] = V * B[:-1, :]'`. The columns of `U` are orthonormal and form a basis for the Krylov subspace `K(A*A', U[:,0])`. The columns of `V` are orthonormal and form a basis for the Krylov subspace `K(A'*A, A' U[:,0])`. Args: operator: An object representing a linear operator with attributes: - shape: Either a list of integers or a 1-D `Tensor` of type `int32` of length 2. `shape[0]` is the dimension on the domain of the operator, `shape[1]` is the dimension of the co-domain of the operator. On other words, if operator represents an M x N matrix A, `shape` must contain `[M, N]`. - dtype: The datatype of input to and output from `apply` and `apply_adjoint`. - apply: Callable object taking a vector `x` as input and returning a vector with the result of applying the operator to `x`, i.e. if `operator` represents matrix `A`, `apply` should return `A * x`. - apply_adjoint: Callable object taking a vector `x` as input and returning a vector with the result of applying the adjoint operator to `x`, i.e. if `operator` represents matrix `A`, `apply_adjoint` should return `conj(transpose(A)) * x`. k: An integer or a scalar Tensor of type `int32`. Determines the maximum number of steps to run. If an invariant subspace is found, the algorithm may terminate before `k` steps have been run. orthogonalize: If `True`, perform full orthogonalization. If `False` no orthogonalization is performed. starting_vector: If not null, must be a `Tensor` of shape `[n]`. name: A name scope for the operation. Returns: output: A namedtuple representing a Lanczos bidiagonalization of `operator` with attributes: u: A rank-2 `Tensor` of type `operator.dtype` and shape `[operator.shape[0], k_actual+1]`, where `k_actual` is the number of steps run. v: A rank-2 `Tensor` of type `operator.dtype` and shape `[operator.shape[1], k_actual]`, where `k_actual` is the number of steps run. alpha: A rank-1 `Tensor` of type `operator.dtype` and shape `[k]`. beta: A rank-1 `Tensor` of type `operator.dtype` and shape `[k]`. """ def tarray(size, dtype, name): return tensor_array_ops.TensorArray(dtype=dtype, size=size, tensor_array_name=name, clear_after_read=False) # Reads a row-vector at location i in tarray and returns it as a # column-vector. def read_colvec(tarray, i): return array_ops.expand_dims(tarray.read(i), -1) # Writes an column-vector as a row-vecor at location i in tarray. def write_colvec(tarray, colvec, i): return tarray.write(i, array_ops.squeeze(colvec)) # Ephemeral class holding Lanczos bidiagonalization state: # u = left Lanczos vectors # v = right Lanczos vectors # alpha = diagonal of B_k. # beta = subdiagonal of B_k. # Notice that we store the left and right Lanczos vectors as the _rows_ # of u and v. This is done because tensors are stored row-major and # TensorArray only supports packing along dimension 0. 
lanzcos_bidiag_state = collections.namedtuple("LanczosBidiagState", ["u", "v", "alpha", "beta"]) def update_state(old, i, u, v, alpha, beta): return lanzcos_bidiag_state(write_colvec(old.u, u, i + 1), write_colvec(old.v, v, i), old.alpha.write(i, alpha), old.beta.write(i, beta)) def gram_schmidt_step(j, basis, v): """Makes v orthogonal to the j'th vector in basis.""" v_shape = v.get_shape() basis_vec = read_colvec(basis, j) v -= math_ops.matmul(basis_vec, v, adjoint_a=True) * basis_vec v.set_shape(v_shape) return j + 1, basis, v def orthogonalize_once(i, basis, v): j = constant_op.constant(0, dtype=dtypes.int32) _, _, v = control_flow_ops.while_loop(lambda j, basis, v: j < i, gram_schmidt_step, [j, basis, v]) return util.l2normalize(v) # Iterated modified Gram-Schmidt orthogonalization adapted from PROPACK. # TODO(rmlarsen): This is possibly the slowest implementation of # iterated Gram-Schmidt orthogonalization since the abacus. Move to C++. def orthogonalize_(i, basis, v): v_norm = util.l2norm(v) v_new, v_new_norm = orthogonalize_once(i, basis, v) # If the norm decreases more than 1/sqrt(2), run a second # round of MGS. See proof in: # B. N. Parlett, ``The Symmetric Eigenvalue Problem'', # Prentice-Hall, Englewood Cliffs, NJ, 1980. pp. 105-109 return control_flow_ops.cond(v_new_norm < 0.7071 * v_norm, lambda: orthogonalize_once(i, basis, v), lambda: (v_new, v_new_norm)) def stopping_criterion(i, _): # TODO(rmlarsen): Stop if an invariant subspace is detected. return i < k def lanczos_bidiag_step(i, ls): """Extends the Lanczos bidiagonalization ls by one step.""" u = read_colvec(ls.u, i) r = operator.apply_adjoint(u) # The shape inference doesn't work across cond, save and reapply the shape. r_shape = r.get_shape() r = control_flow_ops.cond( i > 0, lambda: r - ls.beta.read(i - 1) * read_colvec(ls.v, i - 1), lambda: r) r.set_shape(r_shape) if orthogonalize: v, alpha = orthogonalize_(i - 1, ls.v, r) else: v, alpha = util.l2normalize(r) p = operator.apply(v) - alpha * u if orthogonalize: u, beta = orthogonalize_(i, ls.u, p) else: u, beta = util.l2normalize(p) return i + 1, update_state(ls, i, u, v, alpha, beta) with ops.name_scope(name): dtype = operator.dtype if starting_vector is None: starting_vector = random_ops.random_uniform(operator.shape[:1], -1, 1, dtype=dtype) u0, _ = util.l2normalize(starting_vector) ls = lanzcos_bidiag_state(u=write_colvec(tarray(k + 1, dtype, "u"), u0, 0), v=tarray(k, dtype, "v"), alpha=tarray(k, dtype, "alpha"), beta=tarray(k, dtype, "beta")) i = constant_op.constant(0, dtype=dtypes.int32) _, ls = control_flow_ops.while_loop(stopping_criterion, lanczos_bidiag_step, [i, ls]) return lanzcos_bidiag_state(array_ops.matrix_transpose(ls.u.stack()), array_ops.matrix_transpose(ls.v.stack()), ls.alpha.stack(), ls.beta.stack())
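# Hedged sketch (names mine): assembling the (k+1) x k lower-bidiagonal B from
# the alpha/beta arrays this routine returns. B is the matrix in the identities
# A @ V = U @ B and A' @ U[:, :-1] = V @ B[:-1, :]' quoted in the docstring.
import numpy as np

k = 4
alpha = np.random.rand(k) + 1.   # diagonal of B
beta = np.random.rand(k)         # subdiagonal of B
b = np.zeros((k + 1, k))
b[np.arange(k), np.arange(k)] = alpha
b[np.arange(1, k + 1), np.arange(k)] = beta
print(b)                         # lower bidiagonal, shape (k+1, k)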
def assert_symmetric(matrix): matrix_t = array_ops.matrix_transpose(matrix) return control_flow_ops.with_dependencies( [check_ops.assert_equal(matrix, matrix_t)], matrix)
def _batch_sqrt_matmul(self, x, transpose_x=False): if transpose_x: x = array_ops.matrix_transpose(x) self._check_x(x) return math_ops.sqrt(self._scale) * x
def test_defining_spd_operator_by_taking_real_part(self): with self.cached_session(): # Necessary for fft_kernel_label_map # S is real and positive. s = linear_operator_test_util.random_uniform(shape=(10, 2, 3, 4), dtype=dtypes.float32, minval=1., maxval=2.) # Let S = S1 + S2, the Hermitian and anti-hermitian parts. # S1 = 0.5 * (S + S^H), S2 = 0.5 * (S - S^H), # where ^H is the Hermitian transpose of the function: # f(n0, n1, n2)^H := ComplexConjugate[f(N0-n0, N1-n1, N2-n2)]. # We want to isolate S1, since # S1 is Hermitian by construction # S1 is real since S is # S1 is positive since it is the sum of two positive kernels # IDFT[S] = IDFT[S1] + IDFT[S2] # = H1 + H2 # where H1 is real since it is Hermitian, # and H2 is imaginary since it is anti-Hermitian. ifft_s = fft_ops.ifft3d(math_ops.cast(s, dtypes.complex64)) # Throw away H2, keep H1. real_ifft_s = math_ops.real(ifft_s) # This is the perfect spectrum! # spectrum = DFT[H1] # = S1, fft_real_ifft_s = fft_ops.fft3d( math_ops.cast(real_ifft_s, dtypes.complex64)) # S1 is Hermitian ==> operator is real. # S1 is real ==> operator is self-adjoint. # S1 is positive ==> operator is positive-definite. operator = linalg.LinearOperatorCirculant3D(fft_real_ifft_s) # Allow for complex output so we can check operator has zero imag part. self.assertEqual(operator.dtype, dtypes.complex64) matrix, matrix_t = self.evaluate([ operator.to_dense(), array_ops.matrix_transpose(operator.to_dense()) ]) self.evaluate( operator.assert_positive_definite()) # Should not fail. np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6) self.assertAllClose(matrix, matrix_t) # Just to test the theory, get S2 as well. # This should create an imaginary operator. # S2 is anti-Hermitian ==> operator is imaginary. # S2 is real ==> operator is self-adjoint. imag_ifft_s = math_ops.imag(ifft_s) fft_imag_ifft_s = fft_ops.fft3d( 1j * math_ops.cast(imag_ifft_s, dtypes.complex64)) operator_imag = linalg.LinearOperatorCirculant3D(fft_imag_ifft_s) matrix, matrix_h = self.evaluate([ operator_imag.to_dense(), array_ops.matrix_transpose( math_ops.conj(operator_imag.to_dense())) ]) self.assertAllClose(matrix, matrix_h) np.testing.assert_allclose(0, np.real(matrix), atol=1e-7)