def get_observation_model(self, times):
  """Construct observation model matrix from VARMA parameters.

  Args:
    times: A [batch size] vector indicating the times observation models are
        requested for. Unused.
  Returns:
    the observation model matrix. It has shape
      [self.num_features, self.state_dimension].
  """
  del times  # StateSpaceModel will broadcast along the batch dimension
  if self.ar_order > self.ma_order or self.state_num_blocks < 2:
    return array_ops.pad(
        linalg_ops.eye(self.num_features, dtype=self.dtype),
        [[0, 0], [0, self.num_features * (self.state_num_blocks - 1)]],
        name="observation_model")
  else:
    # Add a second observed component which "catches" the accumulated moving
    # average errors as they reach the end of the state. If ar_order >
    # ma_order, this is unnecessary, since accumulated errors cycle naturally.
    return array_ops.concat(
        [
            array_ops.pad(
                linalg_ops.eye(self.num_features, dtype=self.dtype),
                [[0, 0],
                 [0, self.num_features * (self.state_num_blocks - 2)]]),
            linalg_ops.eye(self.num_features, dtype=self.dtype)
        ],
        axis=1,
        name="observation_model")
def Test(self):
  eye_np = np.eye(num_rows, M=num_columns, dtype=dtype.as_numpy_dtype)
  if batch_shape is not None:
    eye_np = np.tile(eye_np, batch_shape + [1, 1])
  for use_placeholder in False, True:
    if use_placeholder and (num_columns is None or batch_shape is None):
      return
    with self.test_session(use_gpu=True) as sess:
      if use_placeholder:
        num_rows_placeholder = array_ops.placeholder(
            dtypes.int32, name="num_rows")
        num_columns_placeholder = array_ops.placeholder(
            dtypes.int32, name="num_columns")
        batch_shape_placeholder = array_ops.placeholder(
            dtypes.int32, name="batch_shape")
        eye = linalg_ops.eye(
            num_rows_placeholder,
            num_columns=num_columns_placeholder,
            batch_shape=batch_shape_placeholder,
            dtype=dtype)
        eye_tf = sess.run(
            eye,
            feed_dict={
                num_rows_placeholder: num_rows,
                num_columns_placeholder: num_columns,
                batch_shape_placeholder: batch_shape
            })
      else:
        eye_tf = linalg_ops.eye(
            num_rows,
            num_columns=num_columns,
            batch_shape=batch_shape,
            dtype=dtype).eval()
      self.assertAllEqual(eye_np, eye_tf)
def testShapeInferenceStaticBatch(self):
  batch_shape = (2, 3)
  self.assertEqual(
      (2, 3, 2, 2),
      linalg_ops.eye(num_rows=2, batch_shape=batch_shape).shape)
  self.assertEqual(
      (2, 3, 2, 3),
      linalg_ops.eye(
          num_rows=2, num_columns=3, batch_shape=batch_shape).shape)
def transition_power_test_template(test_case, model, num_steps):
  """Tests the transition_to_powers function of a state space model."""
  transition_matrix = ops.convert_to_tensor(
      model.get_state_transition(), dtype=model.dtype)
  step_number = array_ops.placeholder(shape=[], dtype=dtypes.int64)
  state_dimension = transition_matrix.get_shape()[0].value
  previous_matrix = array_ops.placeholder(
      shape=[state_dimension, state_dimension], dtype=transition_matrix.dtype)
  true_single_step_update = math_ops.matmul(previous_matrix,
                                            transition_matrix)
  model_output_tensor = model.transition_to_powers(powers=array_ops.stack(
      [step_number, step_number]))
  with test_case.test_session():
    starting_matrix = linalg_ops.eye(
        state_dimension, batch_shape=array_ops.shape(num_steps)).eval()
    evaled_current_matrix = starting_matrix
    for iteration_number in range(num_steps):
      model_output = model_output_tensor.eval(
          feed_dict={step_number: iteration_number})
      test_case.assertAllClose(
          evaled_current_matrix,
          model_output[0],
          rtol=1e-8 if evaled_current_matrix.dtype == numpy.float64 else 1e-4)
      evaled_current_matrix = true_single_step_update.eval(
          feed_dict={previous_matrix: evaled_current_matrix})
def _underdetermined(op, grad):
  """Gradients for the underdetermined case of MatrixSolveLs.

  This is the backprop for the solution to the normal equations of the second
  kind:
    X = F(A, B) = A * (A*A^T + lambda*I)^{-1} * B
  that (for lambda=0) solve the least squares problem
    min ||X||_F subject to A*X = B.
  """
  a = op.inputs[0]
  b = op.inputs[1]
  l2_regularizer = math_ops.cast(op.inputs[2], a.dtype.base_dtype)

  a_shape = array_ops.shape(a)
  batch_shape = a_shape[:-2]
  m = a_shape[-2]

  identity = linalg_ops.eye(m, batch_shape=batch_shape, dtype=a.dtype)
  gramian = math_ops.matmul(a, a, adjoint_b=True) + l2_regularizer * identity
  chol = linalg_ops.cholesky(gramian)
  grad_b = linalg_ops.cholesky_solve(chol, math_ops.matmul(a, grad))
  # Temporary tmp = (A * A^T + lambda * I)^{-1} * B.
  tmp = linalg_ops.cholesky_solve(chol, b)
  a1 = math_ops.matmul(tmp, a, adjoint_a=True)
  a1 = -math_ops.matmul(grad_b, a1)
  a2 = grad - math_ops.matmul(a, grad_b, adjoint_a=True)
  a2 = math_ops.matmul(tmp, a2, adjoint_b=True)
  grad_a = a1 + a2
  return (grad_a, grad_b, None)
def _compute_power_svd(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name):
  """Computes mat_h = mat_g^alpha using svd. mat_g is a symmetric PSD matrix.

  Args:
    var: the variable we are updating.
    mat_g: the symmetric PSD matrix whose power is to be computed.
    mat_g_size: size of mat_g.
    alpha: a real number.
    mat_h_slot_name: name of slot to store the power, if needed.

  Returns:
    mat_h = mat_g^alpha

  Stores mat_h in the appropriate slot, if it exists.
  Note that mat_g is PSD. So we could use linalg_ops.self_adjoint_eig.
  """
  if mat_g_size == 1:
    mat_h = math_ops.pow(mat_g + self._epsilon, alpha)
  else:
    damping = self._epsilon * linalg_ops.eye(math_ops.to_int32(mat_g_size))
    diag_d, mat_u, mat_v = linalg_ops.svd(mat_g + damping, full_matrices=True)
    mat_h = math_ops.matmul(
        mat_v * math_ops.pow(math_ops.maximum(diag_d, self._epsilon), alpha),
        array_ops.transpose(mat_u))
  if mat_h_slot_name is not None:
    return state_ops.assign(self.get_slot(var, mat_h_slot_name), mat_h)
  return mat_h
def _operator_and_matrix(self, build_info, dtype, use_placeholder):
  shape = list(build_info.shape)
  assert shape[-1] == shape[-2]
  batch_shape = shape[:-2]
  num_rows = shape[-1]

  # Uniform values that are at least length 1 from the origin. Allows the
  # operator to be well conditioned.
  # Shape batch_shape
  multiplier = linear_operator_test_util.random_sign_uniform(
      shape=batch_shape, minval=1., maxval=2., dtype=dtype)

  # Nothing to feed since LinearOperatorScaledIdentity takes no Tensor args.
  lin_op_multiplier = multiplier

  if use_placeholder:
    lin_op_multiplier = array_ops.placeholder_with_default(
        multiplier, shape=None)

  operator = linalg_lib.LinearOperatorScaledIdentity(
      num_rows, lin_op_multiplier)

  multiplier_matrix = array_ops.expand_dims(
      array_ops.expand_dims(multiplier, -1), -1)
  matrix = multiplier_matrix * linalg_ops.eye(
      num_rows, batch_shape=batch_shape, dtype=dtype)

  return operator, matrix
def test_inv_update_thunks(self):
  """Ensures inverse update ops run once per global_step."""
  with self._graph.as_default(), self.test_session() as sess:
    fisher_estimator = estimator.FisherEstimator(
        damping_fn=lambda: 0.2,
        variables=[self.weights],
        layer_collection=self.layer_collection,
        cov_ema_decay=0.0)

    # Construct op that updates one inverse per global step.
    global_step = training_util.get_or_create_global_step()
    inv_matrices = [
        matrix
        for fisher_factor in self.layer_collection.get_factors()
        for matrix in fisher_factor._inverses_by_damping.values()
    ]
    inv_update_op_thunks = fisher_estimator.inv_update_thunks
    inv_update_op = control_flow_ops.case(
        [(math_ops.equal(global_step, i), thunk)
         for i, thunk in enumerate(inv_update_op_thunks)])
    increment_global_step = global_step.assign_add(1)

    sess.run(variables.global_variables_initializer())
    initial_inv_values = sess.run(inv_matrices)

    # Ensure there's one update per inverse matrix. This is true as long as
    # there's no fan-in/fan-out or parameter re-use.
    self.assertEqual(len(inv_matrices), len(inv_update_op_thunks))

    # Test is a no-op if there is only one inverse matrix.
    assert len(inv_matrices) > 1

    # Assign each covariance matrix a value other than the identity. This
    # ensures that the inverse matrices are updated to something different as
    # well.
    cov_matrices = [
        fisher_factor.get_cov()
        for fisher_factor in self.layer_collection.get_factors()
    ]
    sess.run([
        cov_matrix.assign(2 * linalg_ops.eye(int(cov_matrix.shape[0])))
        for cov_matrix in cov_matrices
    ])

    for i in range(len(inv_matrices)):
      # Compare new and old inverse values.
      new_inv_values = sess.run(inv_matrices)
      is_inv_equal = [
          np.allclose(initial_inv_value, new_inv_value)
          for (initial_inv_value,
               new_inv_value) in zip(initial_inv_values, new_inv_values)
      ]
      num_inv_equal = sum(is_inv_equal)

      # Ensure exactly one inverse matrix changes per step.
      self.assertEqual(num_inv_equal, len(inv_matrices) - i)

      # Run all inverse update ops.
      sess.run(inv_update_op)
      sess.run(increment_global_step)
def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
  shape = list(shape)
  assert shape[-1] == shape[-2]
  batch_shape = shape[:-2]
  num_rows = shape[-1]

  # Uniform values that are at least length 1 from the origin. Allows the
  # operator to be well conditioned.
  # Shape batch_shape
  multiplier = linear_operator_test_util.random_sign_uniform(
      shape=batch_shape, minval=1., maxval=2., dtype=dtype)
  operator = linalg_lib.LinearOperatorScaledIdentity(num_rows, multiplier)

  # Nothing to feed since LinearOperatorScaledIdentity takes no Tensor args.
  if use_placeholder:
    multiplier_ph = array_ops.placeholder(dtype=dtype)
    multiplier = multiplier.eval()
    operator = linalg_lib.LinearOperatorScaledIdentity(
        num_rows, multiplier_ph)
    feed_dict = {multiplier_ph: multiplier}
  else:
    feed_dict = None

  multiplier_matrix = array_ops.expand_dims(
      array_ops.expand_dims(multiplier, -1), -1)
  mat = multiplier_matrix * linalg_ops.eye(
      num_rows, batch_shape=batch_shape, dtype=dtype)

  return operator, mat, feed_dict
def power_sums_tensor(array_size, power_matrix, multiplier):
  r"""Computes \sum_{i=0}^{N-1} A^i B (A^i)^T for N=0..(array_size + 1).

  Args:
    array_size: The number of non-trivial sums to pre-compute.
    power_matrix: The "A" matrix above.
    multiplier: The "B" matrix above.
  Returns:
    A Tensor with S[N] = \sum_{i=0}^{N-1} A^i B (A^i)^T
      S[0] is the zero matrix
      S[1] is B
      S[2] is A B A^T + B
      ...and so on
  """
  array_size = math_ops.cast(array_size, dtypes.int32)
  power_matrix = ops.convert_to_tensor(power_matrix)
  identity_like_power_matrix = linalg_ops.eye(
      array_ops.shape(power_matrix)[0], dtype=power_matrix.dtype)
  identity_like_power_matrix.set_shape(
      ops.convert_to_tensor(power_matrix).get_shape())
  transition_powers = functional_ops.scan(
      lambda previous_power, _: math_ops.matmul(previous_power, power_matrix),
      math_ops.range(array_size - 1),
      initializer=identity_like_power_matrix)
  summed = math_ops.cumsum(
      array_ops.concat([
          array_ops.expand_dims(multiplier, 0),
          math_ops.matmul(
              batch_times_matrix(transition_powers, multiplier),
              transition_powers,
              adjoint_b=True)
      ], 0))
  return array_ops.concat(
      [array_ops.expand_dims(array_ops.zeros_like(multiplier), 0), summed], 0)
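# A hedged NumPy sketch (illustration only, not taken from the library) of the
# recurrence behind power_sums_tensor's docstring: S[0] = 0 and
# S[N] = A S[N-1] A^T + B, which unrolls to \sum_{i=0}^{N-1} A^i B (A^i)^T.
# The function name below is hypothetical.
import numpy as np


def power_sums_reference(num_sums, power_matrix, multiplier):
  """Returns [S[0], S[1], ..., S[num_sums]] via the recurrence above."""
  sums = [np.zeros_like(multiplier)]
  for _ in range(num_sums):
    # S[N] = A S[N-1] A^T + B
    sums.append(power_matrix @ sums[-1] @ power_matrix.T + multiplier)
  return np.stack(sums)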
def _overdetermined(op, grad):
  """Gradients for the overdetermined case of MatrixSolveLs.

  This is the backprop for the solution to the normal equations of the first
  kind:
    X = F(A, B) = (A^T * A + lambda * I)^{-1} * A^T * B
  which solve the least squares problem
    min ||A * X - B||_F^2 + lambda ||X||_F^2.
  """
  a = op.inputs[0]
  b = op.inputs[1]
  l2_regularizer = math_ops.cast(op.inputs[2], a.dtype.base_dtype)
  x = op.outputs[0]
  a_shape = array_ops.shape(a)
  batch_shape = a_shape[:-2]
  n = a_shape[-1]

  identity = linalg_ops.eye(n, batch_shape=batch_shape, dtype=a.dtype)
  gramian = math_ops.matmul(a, a, adjoint_a=True) + l2_regularizer * identity
  chol = linalg_ops.cholesky(gramian)
  # Temporary z = (A^T * A + lambda * I)^{-1} * grad.
  z = linalg_ops.cholesky_solve(chol, grad)
  xzt = math_ops.matmul(x, z, adjoint_b=True)
  zx_sym = xzt + array_ops.matrix_transpose(xzt)
  grad_a = -math_ops.matmul(a, zx_sym) + math_ops.matmul(b, z, adjoint_b=True)
  grad_b = math_ops.matmul(a, z)
  return (grad_a, grad_b, None)
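# A hedged NumPy sketch (illustrative only) of the forward computation that
# the gradient above differentiates: the regularized normal equations of the
# first kind. With lambda = 0 it should agree with np.linalg.lstsq.
import numpy as np

np.random.seed(0)
a = np.random.randn(5, 3)   # overdetermined: more rows than columns
b = np.random.randn(5, 2)
l2_regularizer = 0.0

gramian = a.T @ a + l2_regularizer * np.eye(3)
x_normal_eqs = np.linalg.solve(gramian, a.T @ b)
x_lstsq, _, _, _ = np.linalg.lstsq(a, b, rcond=None)
assert np.allclose(x_normal_eqs, x_lstsq)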
def test_non_batch_2x2(self):
  num_rows = 2
  dtype = np.float32
  np_eye = np.eye(num_rows).astype(dtype)
  with self.test_session():
    eye = linalg_ops.eye(num_rows, dtype=dtype)
    self.assertAllEqual((num_rows, num_rows), eye.get_shape())
    self.assertAllEqual(np_eye, eye.eval())
def _create_slots(self, var_list):
  for v in var_list:
    with ops.colocate_with(v):
      _ = self._zeros_slot(v, "gbar", self._name)
      shape = np.array(v.get_shape())
      for i, d in enumerate(shape):
        d_tensor = ops.convert_to_tensor(d)
        if d <= self._max_matrix_size:
          mat_g_init = array_ops.zeros_like(linalg_ops.eye(d_tensor))
          if self._svd_interval > 1:
            _ = self._get_or_make_slot(v, linalg_ops.eye(d_tensor),
                                       "H_" + str(i), self._name)
        else:
          mat_g_init = array_ops.zeros([d_tensor])

        _ = self._get_or_make_slot(v, mat_g_init, "Gbar_" + str(i),
                                   self._name)
def test_non_batch_0x0(self):
  num_rows = 0
  dtype = np.int64
  np_eye = np.eye(num_rows).astype(dtype)
  with self.test_session(use_gpu=True):
    eye = linalg_ops.eye(num_rows, dtype=dtype)
    self.assertAllEqual((num_rows, num_rows), eye.get_shape())
    self.assertAllEqual(np_eye, eye.eval())
def _verifyLu(self, x, output_idx_type=dtypes.int64):
  # Verify that Px = LU.
  lu, perm = linalg_ops.lu(x, output_idx_type=output_idx_type)

  # Prepare the lower factor of shape num_rows x num_rows.
  lu_shape = np.array(lu.shape.as_list())
  batch_shape = lu_shape[:-2]
  num_rows = lu_shape[-2]
  num_cols = lu_shape[-1]

  lower = array_ops.matrix_band_part(lu, -1, 0)

  if num_rows > num_cols:
    eye = linalg_ops.eye(
        num_rows, batch_shape=batch_shape, dtype=lower.dtype)
    lower = array_ops.concat([lower, eye[..., num_cols:]], axis=-1)
  elif num_rows < num_cols:
    lower = lower[..., :num_rows]

  # Fill the diagonal with ones.
  ones_diag = array_ops.ones(
      np.append(batch_shape, num_rows), dtype=lower.dtype)
  lower = array_ops.matrix_set_diag(lower, ones_diag)

  # Prepare the upper factor.
  upper = array_ops.matrix_band_part(lu, 0, -1)

  verification = math_ops.matmul(lower, upper)

  # Permute the rows of the product of the triangular factors.
  if num_rows > 0:
    # Reshape the product of the triangular factors and permutation indices
    # to a single batch dimension. This makes it easy to apply
    # invert_permutation and gather_nd ops.
    perm_reshaped = array_ops.reshape(perm, [-1, num_rows])
    verification_reshaped = array_ops.reshape(verification,
                                              [-1, num_rows, num_cols])
    # Invert the permutation in each batch.
    inv_perm_reshaped = map_fn.map_fn(array_ops.invert_permutation,
                                      perm_reshaped)
    batch_size = perm_reshaped.shape.as_list()[0]
    # Prepare the batch indices with the same shape as the permutation.
    # The corresponding batch index is paired with each of the `num_rows`
    # permutation indices.
    batch_indices = math_ops.cast(
        array_ops.broadcast_to(
            math_ops.range(batch_size)[:, None], perm_reshaped.shape),
        dtype=output_idx_type)
    permuted_verification_reshaped = array_ops.gather_nd(
        verification_reshaped,
        array_ops.stack([batch_indices, inv_perm_reshaped], axis=-1))

    # Reshape the verification matrix back to the original shape.
    verification = array_ops.reshape(permuted_verification_reshaped,
                                     lu_shape)

  self._verifyLuBase(x, lower, upper, perm, verification,
                     output_idx_type)
def test_cholesky(self):
  z = random_ops.random_normal([2, 3, 3])
  x = (math_ops.matmul(z, array_ops.matrix_transpose(z))  # Ensure pos. def.
       + linalg_ops.eye(3))  # Ensure well-conditioned.

  def loop_fn(i):
    return linalg_ops.cholesky(array_ops.gather(x, i))

  self._test_loop_fn(loop_fn, 2)
def test_non_batch_0x2(self):
  num_rows = 0
  num_columns = 2
  dtype = np.int64
  np_eye = np.eye(num_rows, num_columns).astype(dtype)
  with self.test_session():
    eye = linalg_ops.eye(num_rows, num_columns=num_columns, dtype=dtype)
    self.assertAllEqual((num_rows, num_columns), eye.get_shape())
    self.assertAllEqual(np_eye, eye.eval())
def TriAngInvCompositeGrad(l, grad):
  num_rows = array_ops.shape(l)[-1]
  batch_shape = array_ops.shape(l)[:-2]
  l_inverse = linalg_ops.matrix_triangular_solve(
      l, linalg_ops.eye(num_rows, batch_shape=batch_shape, dtype=l.dtype))
  return _GradWithInverseL(l, l_inverse, grad)
def test_non_batch_2x3(self):
  num_rows = 2
  num_columns = 3
  dtype = np.float32
  np_eye = np.eye(num_rows, num_columns).astype(dtype)
  with self.test_session(use_gpu=True):
    eye = linalg_ops.eye(num_rows, num_columns=num_columns, dtype=dtype)
    self.assertAllEqual((num_rows, num_columns), eye.get_shape())
    self.assertAllEqual(np_eye, eye.eval())
def test_eye_no_placeholder(self, num_rows, num_columns, batch_shape, dtype):
  eye_np = np.eye(num_rows, M=num_columns, dtype=dtype.as_numpy_dtype)
  if batch_shape is not None:
    eye_np = np.tile(eye_np, batch_shape + [1, 1])
  eye_tf = self.evaluate(linalg_ops.eye(
      num_rows,
      num_columns=num_columns,
      batch_shape=batch_shape,
      dtype=dtype))
  self.assertAllEqual(eye_np, eye_tf)
def testLossFunctionWithoutName(self):
  """Ensure loss functions get unique names if 'name' not specified."""
  with ops.Graph().as_default():
    logits = linalg_ops.eye(2)
    lc = layer_collection.LayerCollection()

    # Create a new loss function with default names.
    lc.register_categorical_predictive_distribution(logits)
    lc.register_categorical_predictive_distribution(logits)
    self.assertEqual(2, len(lc.losses))
def _ctc_state_trans(label_seq):
  """Compute CTC alignment model transition matrix.

  Args:
    label_seq: tensor of shape [batch_size, max_seq_length]

  Returns:
    tensor of shape [batch_size, states, states] with a state transition
    matrix computed for each sequence of the batch.
  """
  with ops.name_scope("ctc_state_trans"):
    label_seq = ops.convert_to_tensor(label_seq, name="label_seq")
    batch_size = _get_dim(label_seq, 0)
    num_labels = _get_dim(label_seq, 1)

    num_label_states = num_labels + 1
    num_states = 2 * num_label_states

    label_states = math_ops.range(num_label_states)
    blank_states = label_states + num_label_states

    # Start state to first label.
    start_to_label = [[1, 0]]

    # Blank to label transitions.
    blank_to_label = array_ops.stack([label_states[1:], blank_states[:-1]], 1)

    # Label to blank transitions.
    label_to_blank = array_ops.stack([blank_states, label_states], 1)

    # Scatter transitions that don't depend on sequence.
    indices = array_ops.concat(
        [start_to_label, blank_to_label, label_to_blank], 0)
    values = array_ops.ones([_get_dim(indices, 0)])
    trans = array_ops.scatter_nd(
        indices, values, shape=[num_states, num_states])
    trans += linalg_ops.eye(num_states)  # Self-loops.

    # Label to label transitions. Disallow transitions between repeated labels
    # with no blank state in between.
    batch_idx = array_ops.zeros_like(label_states[2:])
    indices = array_ops.stack(
        [batch_idx, label_states[2:], label_states[1:-1]], 1)
    indices = array_ops.tile(
        array_ops.expand_dims(indices, 0), [batch_size, 1, 1])
    batch_idx = array_ops.expand_dims(math_ops.range(batch_size), 1) * [1, 0, 0]
    indices += array_ops.expand_dims(batch_idx, 1)
    repeats = math_ops.equal(label_seq[:, :-1], label_seq[:, 1:])
    values = 1.0 - math_ops.cast(repeats, dtypes.float32)
    batched_shape = [batch_size, num_states, num_states]
    label_to_label = array_ops.scatter_nd(indices, values, batched_shape)

    return array_ops.expand_dims(trans, 0) + label_to_label
def __call__(self, shape, dtype=None, partition_info=None):
  full_shape = shape if partition_info is None else partition_info.full_shape
  if len(full_shape) != 2:
    raise ValueError(
        "Identity matrix initializer can only be used for 2D matrices.")
  if dtype is None:
    dtype = self.dtype
  initializer = linalg_ops.eye(*full_shape, dtype=dtype)
  if partition_info is not None:
    initializer = array_ops.slice(initializer, partition_info.var_offset,
                                  shape)
  return self.gain * initializer
def _operator_and_matrix(self, build_info, dtype, use_placeholder):
  shape = list(build_info.shape)
  assert shape[-1] == shape[-2]
  batch_shape = shape[:-2]
  num_rows = shape[-1]

  operator = linalg_lib.LinearOperatorIdentity(
      num_rows, batch_shape=batch_shape, dtype=dtype)
  mat = linalg_ops.eye(num_rows, batch_shape=batch_shape, dtype=dtype)

  return operator, mat
def test_1x3_batch_4x4(self):
  num_rows = 4
  batch_shape = [1, 3]
  dtype = np.float32
  np_eye = np.eye(num_rows).astype(dtype)
  with self.test_session():
    eye = linalg_ops.eye(num_rows, batch_shape=batch_shape, dtype=dtype)
    self.assertAllEqual(batch_shape + [num_rows, num_rows], eye.get_shape())
    eye_v = eye.eval()
    for i in range(batch_shape[0]):
      for j in range(batch_shape[1]):
        self.assertAllEqual(np_eye, eye_v[i, j, :, :])
def _matrix_exp_pade3(matrix):
  """3rd-order Pade approximant for matrix exponential."""
  b = [120.0, 60.0, 12.0]
  b = [constant_op.constant(x, matrix.dtype) for x in b]
  ident = linalg_ops.eye(array_ops.shape(matrix)[-2],
                         batch_shape=array_ops.shape(matrix)[:-2],
                         dtype=matrix.dtype)
  matrix_2 = math_ops.matmul(matrix, matrix)
  tmp = matrix_2 + b[1] * ident
  matrix_u = math_ops.matmul(matrix, tmp)
  matrix_v = b[2] * matrix_2 + b[0] * ident
  return matrix_u, matrix_v
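# A hedged sketch (an assumption about usage, not part of the snippet above):
# for a [p/p] Pade approximant, the U and V terms returned by helpers like
# _matrix_exp_pade3 are typically combined as expm(A) ~= (V - U)^{-1} (V + U).
# The helper name below is hypothetical.
def _matrix_exp_from_pade(matrix_u, matrix_v):
  """Combines the Pade numerator/denominator terms into the approximant."""
  return linalg_ops.matrix_solve(matrix_v - matrix_u, matrix_v + matrix_u)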
def test_1x3_batch_0x0(self):
  num_rows = 0
  batch_shape = [1, 3]
  dtype = np.float32
  np_eye = np.eye(num_rows).astype(dtype)
  with self.test_session(use_gpu=True):
    eye = linalg_ops.eye(num_rows, batch_shape=batch_shape, dtype=dtype)
    self.assertAllEqual((1, 3, 0, 0), eye.get_shape())
    eye_v = eye.eval()
    for i in range(batch_shape[0]):
      for j in range(batch_shape[1]):
        self.assertAllEqual(np_eye, eye_v[i, j, :, :])
def test_posterior_from_prior_state_multivariate_3d(self):
  self._posterior_from_prior_state_test_template(
      state=constant_op.constant([[1.9, 1., 5.]]),
      state_var=constant_op.constant(
          [[[200., 0., 1.], [0., 2000., 0.], [1., 0., 40000.]]]),
      observation=constant_op.constant([[1., 1., 3.]]),
      observation_model=constant_op.constant(
          [[[0.5, 0., 0.], [0., 10., 0.], [0., 0., 100.]]]),
      observation_noise=linalg_ops.eye(3) / 10000.,
      expected_state=numpy.array([[2., .1, .03]]),
      expected_state_var=numpy.zeros([1, 3, 3]))
def _matrix_exp_pade5(matrix):
  """5th-order Pade approximant for matrix exponential."""
  b = [30240.0, 15120.0, 3360.0, 420.0, 30.0]
  b = [constant_op.constant(x, matrix.dtype) for x in b]
  ident = linalg_ops.eye(array_ops.shape(matrix)[-2],
                         batch_shape=array_ops.shape(matrix)[:-2],
                         dtype=matrix.dtype)
  matrix_2 = math_ops.matmul(matrix, matrix)
  matrix_4 = math_ops.matmul(matrix_2, matrix_2)
  tmp = matrix_4 + b[3] * matrix_2 + b[1] * ident
  matrix_u = math_ops.matmul(matrix, tmp)
  matrix_v = b[4] * matrix_4 + b[2] * matrix_2 + b[0] * ident
  return matrix_u, matrix_v
def testRegisterCategoricalPredictiveDistribution(self):
  with ops.Graph().as_default(), self.test_session() as sess:
    random_seed.set_random_seed(200)
    logits = linalg_ops.eye(2)

    lc = layer_collection.LayerCollection()
    lc.register_categorical_predictive_distribution(logits, seed=200)
    single_loss = sess.run(lc.total_sampled_loss())

    lc2 = layer_collection.LayerCollection()
    lc2.register_categorical_predictive_distribution(logits, seed=200)
    lc2.register_categorical_predictive_distribution(logits, seed=200)
    double_loss = sess.run(lc2.total_sampled_loss())

    self.assertAlmostEqual(2 * single_loss, double_loss)
def _to_dense(self):
  return linalg_ops.eye(num_rows=self.domain_dimension_dynamic(),
                        batch_shape=self.batch_shape_dynamic(),
                        dtype=self.dtype)
def testShapeInferenceNoBatch(self):
  self.assertEqual((2, 2), linalg_ops.eye(num_rows=2).shape)
  self.assertEqual((2, 3), linalg_ops.eye(num_rows=2, num_columns=3).shape)
def _compute_power_iter(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name,
                        iter_count=100, epsilon=1e-6):
  """Computes mat_g^alpha, where alpha = -1/p, p a positive integer.

  We use an iterative Schur-Newton method from equation 3.2 on page 9 of:

  A Schur-Newton Method for the Matrix p-th Root and its Inverse
  by Chun-Hua Guo and Nicholas J. Higham
  SIAM Journal on Matrix Analysis and Applications,
  2006, Vol. 28, No. 3 : pp. 788-804
  https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf

  Args:
    var: the variable we are updating.
    mat_g: the symmetric PSD matrix whose power is to be computed.
    mat_g_size: size of mat_g.
    alpha: exponent, must be -1/p for p a positive integer.
    mat_h_slot_name: name of slot to store the power, if needed.
    iter_count: Maximum number of iterations.
    epsilon: accuracy indicator, useful for early termination.

  Returns:
    mat_g^alpha
  """
  identity = linalg_ops.eye(math_ops.to_int32(mat_g_size))

  def MatPower(mat_m, p):
    """Computes mat_m^p, for p a positive integer.

    Power p is known at graph compile time, so no need for loop and cond.

    Args:
      mat_m: a square matrix
      p: a positive integer

    Returns:
      mat_m^p
    """
    assert p == int(p) and p > 0
    power = None
    while p > 0:
      if p % 2 == 1:
        power = math_ops.matmul(mat_m, power) if power is not None else mat_m
      p //= 2
      mat_m = math_ops.matmul(mat_m, mat_m)
    return power

  def IterCondition(i, mat_m, _):
    return math_ops.logical_and(
        i < iter_count,
        math_ops.reduce_max(math_ops.abs(mat_m - identity)) > epsilon)

  def IterBody(i, mat_m, mat_x):
    mat_m_i = (1 - alpha) * identity + alpha * mat_m
    return (i + 1, math_ops.matmul(MatPower(mat_m_i, -1.0 / alpha), mat_m),
            math_ops.matmul(mat_x, mat_m_i))

  if mat_g_size == 1:
    mat_h = math_ops.pow(mat_g + self._epsilon, alpha)
  else:
    damped_mat_g = mat_g + self._epsilon * identity
    z = (1 - 1 / alpha) / (2 * linalg_ops.norm(damped_mat_g))
    # The best value for z is
    # (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
    #                 (c_max^{1-alpha} - c_min^{1-alpha})
    # where c_max and c_min are the largest and smallest singular values of
    # damped_mat_g.
    # The above estimate assumes that c_max > c_min * 2^p. (p = -1/alpha)
    # Can replace the above line by the one below, but it is less accurate,
    # hence needs more iterations to converge.
    # z = (1 - 1/alpha) / math_ops.trace(damped_mat_g)
    # If we want the method to always converge, use z = 1 / norm(damped_mat_g)
    # or z = 1 / math_ops.trace(damped_mat_g), but these can result in many
    # extra iterations.
    _, _, mat_h = control_flow_ops.while_loop(
        IterCondition, IterBody,
        [0, damped_mat_g * z, identity * math_ops.pow(z, -alpha)])
  if mat_h_slot_name is not None:
    return state_ops.assign(self.get_slot(var, mat_h_slot_name), mat_h)
  return mat_h
def extract_convolution_patches(inputs,
                                filter_shape,
                                padding,
                                strides=None,
                                dilation_rate=None,
                                name=None,
                                data_format=None):
  """Extracts inputs to each output coordinate in tf.nn.convolution.

  This is a generalization of tf.extract_image_patches() to
  tf.nn.convolution(), where the number of spatial dimensions may be something
  other than 2.

  Assumes,
  - First dimension of inputs is batch_size
  - Convolution filter is applied to all input channels.

  Args:
    inputs: Tensor of shape [batch_size, ..spatial_image_shape..,
      ..spatial_filter_shape.., in_channels]. Inputs to tf.nn.convolution().
    filter_shape: List of ints. Shape of filter passed to tf.nn.convolution().
    padding: string. Padding method. One of "VALID", "SAME".
    strides: None or list of ints. Strides along spatial dimensions.
    dilation_rate: None or list of ints. Dilation along spatial dimensions.
    name: None or str. Name of Op.
    data_format: None or str. Format of data.

  Returns:
    Tensor of shape [batch_size, ..spatial_image_shape..,
      ..spatial_filter_shape.., in_channels]

  Raises:
    ValueError: If data_format does not put channel last.
    ValueError: If inputs and filter disagree on in_channels.
  """
  if not is_data_format_channel_last(data_format):
    raise ValueError("Channel must be last dimension.")
  with ops.name_scope(name, "extract_convolution_patches",
                      [inputs, filter_shape, padding, strides, dilation_rate]):
    batch_size = inputs.shape.as_list()[0]
    in_channels = inputs.shape.as_list()[-1]

    # filter_shape = spatial_filter_shape + [in_channels, out_channels]
    spatial_filter_shape = filter_shape[:-2]
    if in_channels != filter_shape[-2]:
      raise ValueError("inputs and filter_shape must agree on in_channels.")

    # Map each input feature to a location in the output.
    out_channels = np.prod(spatial_filter_shape) * in_channels
    filters = linalg_ops.eye(out_channels)
    filters = array_ops.reshape(
        filters,
        list(spatial_filter_shape) + [in_channels, out_channels])

    result = nn_ops.convolution(
        inputs,
        filters,
        padding=padding,
        strides=strides,
        dilation_rate=dilation_rate)
    spatial_output_shape = result.shape.as_list()[1:-1]
    result = array_ops.reshape(result,
                               [batch_size or -1] + spatial_output_shape +
                               list(spatial_filter_shape) + [in_channels])

    return result
def posterior_from_prior_state(self, prior_state, prior_state_var,
                               observation, observation_model,
                               predicted_observations, observation_noise):
  """Compute a posterior over states given an observation.

  Args:
    prior_state: Prior state mean [batch size x state dimension]
    prior_state_var: Prior state covariance [batch size x state dimension x
        state dimension]
    observation: The observed value corresponding to the predictions given
        [batch size x observation dimension]
    observation_model: The [batch size x observation dimension x model state
        dimension] Tensor indicating how a particular state is mapped to
        (pre-noise) observations for each part of the batch.
    predicted_observations: An (observation mean, observation variance) tuple
        computed based on the current state, usually the output of
        observed_from_state.
    observation_noise: A [batch size x observation dimension x observation
        dimension] or [observation dimension x observation dimension] Tensor
        with covariance matrices to use for each part of the batch (a
        two-dimensional input will be broadcast).
  Returns:
    Posterior mean and covariance (dimensions matching the first two
    arguments).
  """
  observed_mean, observed_var = predicted_observations
  residual = observation - observed_mean
  # TODO(allenl): Can more of this be done using matrix_solve_ls?
  kalman_solve_rhs = math_ops.matmul(
      observation_model, prior_state_var, adjoint_b=True)
  # This matrix_solve adjoint doesn't make a difference symbolically (since
  # observed_var is a covariance matrix, and should be symmetric), but
  # filtering on multivariate series is unstable without it. See
  # test_multivariate_symmetric_covariance_float64 in kalman_filter_test.py
  # for an example of the instability (fails with adjoint=False).
  kalman_gain_transposed = linalg_ops.matrix_solve(
      matrix=observed_var, rhs=kalman_solve_rhs, adjoint=True)
  posterior_state = prior_state + array_ops.squeeze(
      math_ops.matmul(
          kalman_gain_transposed,
          array_ops.expand_dims(residual, -1),
          adjoint_a=True),
      axis=[-1])
  gain_obs = math_ops.matmul(
      kalman_gain_transposed, observation_model, adjoint_a=True)
  identity_extradim = linalg_ops.eye(
      array_ops.shape(gain_obs)[1], dtype=gain_obs.dtype)[None]
  identity_minus_factor = identity_extradim - gain_obs
  if self._simplified_posterior_covariance_computation:
    # posterior covariance =
    #   (I - kalman_gain * observation_model) * prior_state_var
    posterior_state_var = math_ops.matmul(identity_minus_factor,
                                          prior_state_var)
  else:
    observation_noise = ops.convert_to_tensor(observation_noise)
    # A Joseph form update, which provides better numeric stability than the
    # simplified optimal Kalman gain update, at the cost of a few extra
    # operations. Joseph form updates are valid for any gain (not just the
    # optimal Kalman gain), and so are more forgiving of numerical errors in
    # computing the optimal Kalman gain.
    #
    # posterior covariance =
    #   (I - kalman_gain * observation_model) * prior_state_var
    #     * (I - kalman_gain * observation_model)^T
    #   + kalman_gain * observation_noise * kalman_gain^T
    left_multiplied_state_var = math_ops.matmul(identity_minus_factor,
                                                prior_state_var)
    multiplied_state_var = math_ops.matmul(
        identity_minus_factor, left_multiplied_state_var, adjoint_b=True)

    def _batch_observation_noise_update():
      return (multiplied_state_var + math_ops.matmul(
          math_ops.matmul(
              kalman_gain_transposed, observation_noise, adjoint_a=True),
          kalman_gain_transposed))

    def _matrix_observation_noise_update():
      return (multiplied_state_var + math_ops.matmul(
          math_utils.batch_times_matrix(
              kalman_gain_transposed, observation_noise, adj_x=True),
          kalman_gain_transposed))

    if observation_noise.get_shape().ndims is None:
      posterior_state_var = control_flow_ops.cond(
          math_ops.equal(array_ops.rank(observation_noise), 2),
          _matrix_observation_noise_update, _batch_observation_noise_update)
    else:
      # If static shape information exists, it gets checked in each cond()
      # branch, so we need a special case to avoid graph-build-time
      # exceptions.
      if observation_noise.get_shape().ndims == 2:
        posterior_state_var = _matrix_observation_noise_update()
      else:
        posterior_state_var = _batch_observation_noise_update()
  return posterior_state, posterior_state_var
def _forward(self, x):
  with ops.control_dependencies(self._assertions(x)):
    shape = array_ops.shape(x)
    return linalg_ops.matrix_triangular_solve(
        x, linalg_ops.eye(shape[-1], batch_shape=shape[:-2]), lower=True)
def _finish(self, state):
  var_dtype = self._variables[0].dtype.base_dtype
  # Update global step.
  global_step = self._get_global_step(state)
  update_global_step = state_ops.assign_add(global_step, 1.)

  # Update the first moment estimate.
  beta1 = state.get_hyper("beta1", dtype=var_dtype)
  moment1 = self._get_moment1(state)
  flat_grad = self._get_flat_grad(state)
  # moment1_t := beta1 * moment1_{t-1} + (1 - beta1) * flat_grad_t
  update_moment1 = moment1.assign(beta1 * moment1 + (1. - beta1) * flat_grad)

  # Update the gradient buffer.
  window = state.get_hyper("window")
  grad_buffer = self._get_grad_buffer(state)
  next_grad_index = math_ops.floormod(
      math_ops.to_int32(update_global_step - 1.), window)
  # grad_buffer[(t-1) % window] := moment1_t
  update_grad_buffer = state_ops.scatter_update(grad_buffer, next_grad_index,
                                                update_moment1)

  # Compute the update step.
  eps = state.get_hyper("eps", dtype=var_dtype)
  svd_eps = state.get_hyper("svd_eps", dtype=var_dtype)
  sigma_eps = state.get_hyper("sigma_eps", dtype=var_dtype)
  lr = state.get_hyper("lr", dtype=var_dtype)
  denom = math_ops.sqrt(
      math_ops.minimum(
          ops.convert_to_tensor(update_global_step),
          ops.convert_to_tensor(math_ops.cast(window, dtype=var_dtype))))
  moment1_2d = array_ops.expand_dims(update_moment1, -1)

  # m = grad_buffer^T / sqrt(min(t, window))
  # m has shape [model dimension, window], where model dimension is the sum
  # of the dimensions of the flattened variables.
  m = array_ops.transpose(math_ops.divide(update_grad_buffer, denom))

  # sigma, u, _ = SVD(m^Tm + I * svd_eps)
  mm = math_ops.matmul(m, m, transpose_a=True)
  damping = math_ops.cast(linalg_ops.eye(window), dtype=var_dtype) * svd_eps
  sigma, u, _ = linalg_ops.svd(mm + damping)
  sigma_sqrt = math_ops.sqrt(sigma)
  sigma_sqrt_min = math_ops.reduce_min(sigma_sqrt)

  # sigma_sqrt_inv = 1 / (\sqrt{sigma} + sigma_eps) ^ 3
  # We add sigma_eps to alleviate numerical instability.
  # Note that (m^Tm)^(-3/2) = u diag(sigma_sqrt_inv) u^T.
  sigma_sqrt_inv = math_ops.divide(
      math_ops.cast(1.0, dtype=var_dtype),
      math_ops.pow(sigma_sqrt + sigma_eps, 3))

  # In full matrix AdaGrad, the update step computes (mm^T)^(-1/2)g, where the
  # inversion of a model dimension by model dimension matrix is needed. To
  # speed up this computation we calculate the following instead:
  # m(m^Tm)^(-3/2)m^T moment1 = m u diag(sigma_sqrt_inv) u^T m^T moment1.
  new_step = array_ops.expand_dims(
      array_ops.zeros(flat_grad.get_shape(), dtype=var_dtype), -1)
  head = math_ops.matmul(
      m,
      math_ops.matmul(
          u,
          math_ops.matmul(
              array_ops.diag(sigma_sqrt_inv),
              math_ops.matmul(
                  u,
                  math_ops.matmul(m, moment1_2d, transpose_a=True),
                  transpose_a=True))))

  # When inverting (mm^t)^(1/2), we also add epsilon * I regularization for
  # degenerate cases. We expand ((mm^t)^(1/2) + epsilon * I)^(-1) using
  # Woodbury's identity.
  # For full derivation please see paper at
  # https://arxiv.org/pdf/1806.02958.pdf
  tail = moment1_2d - math_ops.matmul(
      m,
      math_ops.matmul(
          u,
          math_ops.matmul(
              array_ops.diag(
                  math_ops.divide(math_ops.cast(1.0, dtype=var_dtype),
                                  sigma)),
              math_ops.matmul(
                  u,
                  math_ops.matmul(m, moment1_2d, transpose_a=True),
                  transpose_a=True))))
  scaled_tail = math_ops.divide(tail, sigma_sqrt_min)

  update_new_step = control_flow_ops.cond(
      sigma_sqrt_min > eps, lambda: math_ops.add(head, scaled_tail),
      lambda: math_ops.add(new_step, head))

  # Update each variable.
  update_step = []
  for var in self._variables:
    dim = self.shape_dict[var.name]
    start_index = self.index_dict[var.name]
    end_index = start_index + dim
    var_update_correct_shape = array_ops.reshape(
        update_new_step[start_index:end_index], var.get_shape())
    var_updated = state_ops.assign_sub(var, lr * var_update_correct_shape)
    update_step.append(var_updated)

  return control_flow_ops.group(update_step)
def get_noise_transform(self):
  return linalg_ops.eye(1, dtype=self.dtype)
def __call__(self, shape, dtype=None, partition_info=None):
  if dtype is None:
    dtype = self.dtype
  return linalg_ops.eye(shape[0], shape[1], dtype=dtype)
def tridiag(d, diag_value, offdiag_value):
  """d x d matrix with given value on diag, and one super/sub diag."""
  diag_mat = linalg_ops.eye(d) * (diag_value - offdiag_value)
  three_bands = array_ops.matrix_band_part(
      array_ops.fill([d, d], offdiag_value), 1, 1)
  return diag_mat + three_bands
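# Illustrative NumPy sketch (not part of the snippet above) of what tridiag
# builds: for d=3, diag_value=2., offdiag_value=1. the result is
#   [[2., 1., 0.],
#    [1., 2., 1.],
#    [0., 1., 2.]]
# matrix_band_part(x, 1, 1) keeps one sub- and one super-diagonal, which is
# mirrored here with tril/triu.
import numpy as np

d, diag_value, offdiag_value = 3, 2., 1.
expected = (np.eye(d) * (diag_value - offdiag_value)
            + np.tril(np.triu(np.full((d, d), offdiag_value), -1), 1))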
def posdef_inv(tensor, damping):
  """Computes the inverse of tensor + damping * identity."""
  identity = linalg_ops.eye(tensor.shape.as_list()[0], dtype=tensor.dtype)
  damping = math_ops.cast(damping, dtype=tensor.dtype)
  return posdef_inv_functions[POSDEF_INV_METHOD](tensor, identity, damping)
def get_state_transition(self):
  return linalg_ops.eye(1, dtype=self.dtype)
def _MatrixSquareRootGrad(op, grad):
  """Gradient for MatrixSquareRoot."""

  # Let A be an m x m square matrix (or batch of matrices)
  # Let R = sqrtm(A)
  # By definition, A = RR
  # Take the differential: dA = d(RR) = RdR + dRR
  # Solve the resulting Sylvester equation for dR

  # Used to find Kronecker products within the Sylvester equation
  def _KroneckerProduct(b1, b2):
    """Computes the Kronecker product of two batches of square matrices."""
    b1_shape = array_ops.shape(b1)
    b2_shape = array_ops.shape(b2)
    b1_order = b1_shape[-1]
    b2_order = b2_shape[-1]

    shape_slice_size = [math_ops.subtract(array_ops.size(b1_shape), 2)]
    shape_slice = array_ops.slice(b1_shape, [0],
                                  shape_slice_size)  # Same for both batches
    b1_reshape_shape = array_ops.concat(
        [shape_slice, [b1_order], [1], [b1_order], [1]], 0)
    b2_reshape_shape = array_ops.concat(
        [shape_slice, [1], [b2_order], [1], [b2_order]], 0)

    b1_reshape = array_ops.reshape(b1, b1_reshape_shape)
    b2_reshape = array_ops.reshape(b2, b2_reshape_shape)

    order_prod = b1_order * b2_order
    kprod_shape = array_ops.concat(
        [shape_slice, [order_prod], [order_prod]], 0)
    return array_ops.reshape(b1_reshape * b2_reshape, kprod_shape)

  sqrtm = op.outputs[0]  # R
  shape = array_ops.shape(sqrtm)
  order = shape[-1]  # m
  matrix_count = math_ops.reduce_prod(shape[0:-2])

  # Get batch of m x m identity matrices
  eye = linalg_ops.eye(order, dtype=sqrtm.dtype)  # m x m identity matrix
  eye_flat = array_ops.reshape(eye, [-1])
  eye_tiled = array_ops.tile(eye_flat, [matrix_count])
  eye_batch = array_ops.reshape(eye_tiled, shape)

  # The transpose of R is taken in the k1 term instead of k2 in
  # order to prevent redundant transposition of R (i.e. (R')' = R)
  sqrtm_transpose = array_ops.matrix_transpose(sqrtm)
  k1 = _KroneckerProduct(eye_batch, sqrtm_transpose)
  k2 = _KroneckerProduct(sqrtm, eye_batch)
  ksum = math_ops.add(k1, k2)

  # Vectorize dA
  shape_slice_size = [math_ops.subtract(array_ops.size(shape), 2)]
  shape_slice = array_ops.slice(shape, [0], shape_slice_size)
  shape_vec_da = array_ops.concat([shape_slice, [order * order], [1]], 0)
  vec_da = array_ops.reshape(array_ops.matrix_transpose(grad), shape_vec_da)

  # Solve for vec(dR)
  vec_dsqrtm = linalg_ops.matrix_solve(ksum, vec_da)

  # Solve for dR by inverse vectorizing vec(dR)
  dsqrtm_transpose = array_ops.reshape(vec_dsqrtm, shape)
  return array_ops.matrix_transpose(dsqrtm_transpose)
def _verifyLu(self, x, output_idx_type=dtypes.int64):
  # Verify that Px = LU.
  with test_util.use_gpu():

    lu, perm = linalg_ops.lu(x, output_idx_type=output_idx_type)

    # Prepare the lower factor of shape num_rows x num_rows.
    lu_shape = np.array(lu.shape.as_list())
    batch_shape = lu_shape[:-2]
    num_rows = lu_shape[-2]
    num_cols = lu_shape[-1]

    lower = array_ops.matrix_band_part(lu, -1, 0)

    if num_rows > num_cols:
      eye = linalg_ops.eye(
          num_rows, batch_shape=batch_shape, dtype=lower.dtype)
      lower = array_ops.concat([lower, eye[..., num_cols:]], axis=-1)
    elif num_rows < num_cols:
      lower = lower[..., :num_rows]

    # Fill the diagonal with ones.
    ones_diag = array_ops.ones(
        np.append(batch_shape, num_rows), dtype=lower.dtype)
    lower = array_ops.matrix_set_diag(lower, ones_diag)

    # Prepare the upper factor.
    upper = array_ops.matrix_band_part(lu, 0, -1)

    verification = math_ops.matmul(lower, upper)

    # Permute the rows of the product of the triangular factors.
    if num_rows > 0:
      # Reshape the product of the triangular factors and permutation indices
      # to a single batch dimension. This makes it easy to apply
      # invert_permutation and gather_nd ops.
      perm_reshaped = array_ops.reshape(perm, [-1, num_rows])
      verification_reshaped = array_ops.reshape(verification,
                                                [-1, num_rows, num_cols])
      # Invert the permutation in each batch.
      inv_perm_reshaped = map_fn.map_fn(array_ops.invert_permutation,
                                        perm_reshaped)
      batch_size = perm_reshaped.shape.as_list()[0]
      # Prepare the batch indices with the same shape as the permutation.
      # The corresponding batch index is paired with each of the `num_rows`
      # permutation indices.
      batch_indices = math_ops.cast(
          array_ops.broadcast_to(
              math_ops.range(batch_size)[:, None], perm_reshaped.shape),
          dtype=output_idx_type)
      permuted_verification_reshaped = array_ops.gather_nd(
          verification_reshaped,
          array_ops.stack([batch_indices, inv_perm_reshaped], axis=-1))

      # Reshape the verification matrix back to the original shape.
      verification = array_ops.reshape(permuted_verification_reshaped,
                                       lu_shape)

    self._verifyLuBase(x, lower, upper, perm, verification,
                       output_idx_type)
def _solve_interpolation(train_points, train_values, order,
                         regularization_weight):
  """Solve for interpolation coefficients.

  Computes the coefficients of the polyharmonic interpolant for the 'training'
  data defined by (train_points, train_values) using the kernel phi.

  Args:
    train_points: `[b, n, d]` interpolation centers
    train_values: `[b, n, k]` function values
    order: order of the interpolation
    regularization_weight: weight to place on smoothness regularization term

  Returns:
    w: `[b, n, k]` weights on each interpolation center
    v: `[b, d, k]` weights on each input dimension
  Raises:
    ValueError: if d or k is not fully specified.
  """

  # These dimensions are set dynamically at runtime.
  b, n, _ = array_ops.unstack(array_ops.shape(train_points), num=3)

  d = train_points.shape[-1]
  if d.value is None:
    raise ValueError('The dimensionality of the input points (d) must be '
                     'statically-inferrable.')

  k = train_values.shape[-1]
  if k.value is None:
    raise ValueError('The dimensionality of the output values (k) must be '
                     'statically-inferrable.')

  # First, rename variables so that the notation (c, f, w, v, A, B, etc.)
  # follows https://en.wikipedia.org/wiki/Polyharmonic_spline.
  # To account for python style guidelines we use
  # matrix_a for A and matrix_b for B.
  c = train_points
  f = train_values

  # Next, construct the linear system.
  with ops.name_scope('construct_linear_system'):

    matrix_a = _phi(_pairwise_squared_distance_matrix(c), order)  # [b, n, n]
    if regularization_weight > 0:
      batch_identity_matrix = array_ops.expand_dims(
          linalg_ops.eye(n, dtype=c.dtype), 0)
      matrix_a += regularization_weight * batch_identity_matrix

    # Append ones to the feature values for the bias term in the linear model.
    ones = array_ops.ones_like(c[..., :1], dtype=c.dtype)
    matrix_b = array_ops.concat([c, ones], 2)  # [b, n, d + 1]

    # [b, n + d + 1, n]
    left_block = array_ops.concat(
        [matrix_a, array_ops.transpose(matrix_b, [0, 2, 1])], 1)

    num_b_cols = matrix_b.get_shape()[2]  # d + 1
    lhs_zeros = array_ops.zeros([b, num_b_cols, num_b_cols],
                                train_points.dtype)
    right_block = array_ops.concat([matrix_b, lhs_zeros],
                                   1)  # [b, n + d + 1, d + 1]
    lhs = array_ops.concat([left_block, right_block],
                           2)  # [b, n + d + 1, n + d + 1]

    rhs_zeros = array_ops.zeros([b, d + 1, k], train_points.dtype)
    rhs = array_ops.concat([f, rhs_zeros], 1)  # [b, n + d + 1, k]

  # Then, solve the linear system and unpack the results.
  with ops.name_scope('solve_linear_system'):
    w_v = linalg_ops.matrix_solve(lhs, rhs)
    w = w_v[:, :n, :]
    v = w_v[:, n:, :]

  return w, v
def test_inv_update_thunks(self):
  """Ensures inverse update ops run once per global_step."""
  with self._graph.as_default(), self.test_session() as sess:
    fisher_estimator = estimator.FisherEstimatorRoundRobin(
        variables=[self.weights],
        layer_collection=self.layer_collection,
        damping=0.2,
        cov_ema_decay=0.0)

    # Construct op that updates one inverse per global step.
    global_step = training_util.get_or_create_global_step()
    (cov_variable_thunks, _, inv_variable_thunks,
     inv_update_op_thunks) = fisher_estimator.create_ops_and_vars_thunks()
    for thunk in cov_variable_thunks:
      thunk()
    for thunk in inv_variable_thunks:
      thunk()
    inv_matrices = [
        matrix
        for fisher_factor in self.layer_collection.get_factors()
        for matrix in fisher_factor._matpower_by_exp_and_damping.values()
    ]
    inv_update_op = control_flow_ops.case([
        (math_ops.equal(global_step, i), thunk)
        for i, thunk in enumerate(inv_update_op_thunks)
    ])
    increment_global_step = global_step.assign_add(1)

    sess.run(variables.global_variables_initializer())
    initial_inv_values = sess.run(inv_matrices)

    # Ensure there's one update per inverse matrix. This is true as long as
    # there's no fan-in/fan-out or parameter re-use.
    self.assertEqual(len(inv_matrices), len(inv_update_op_thunks))

    # Test is a no-op if there is only one inverse matrix.
    assert len(inv_matrices) > 1

    # Assign each covariance matrix a value other than the identity. This
    # ensures that the inverse matrices are updated to something different as
    # well.
    cov_matrices = [
        fisher_factor.get_cov()
        for fisher_factor in self.layer_collection.get_factors()
    ]
    sess.run([
        cov_matrix.assign(2 * linalg_ops.eye(int(cov_matrix.shape[0])))
        for cov_matrix in cov_matrices
    ])

    for i in range(len(inv_matrices)):
      # Compare new and old inverse values.
      new_inv_values = sess.run(inv_matrices)
      is_inv_equal = [
          np.allclose(initial_inv_value, new_inv_value)
          for (initial_inv_value,
               new_inv_value) in zip(initial_inv_values, new_inv_values)
      ]
      num_inv_equal = sum(is_inv_equal)

      # Ensure exactly one inverse matrix changes per step.
      self.assertEqual(num_inv_equal, len(inv_matrices) - i)

      # Run all inverse update ops.
      sess.run(inv_update_op)
      sess.run(increment_global_step)
def __init__(self,
             input_rows,
             input_cols,
             n_components,
             unobserved_weight=0.1,
             regularization=None,
             row_init="random",
             col_init="random",
             num_row_shards=1,
             num_col_shards=1,
             row_weights=1,
             col_weights=1,
             use_factors_weights_cache=True,
             use_gramian_cache=True,
             use_scoped_vars=False):
  """Creates model for WALS matrix factorization.

  Args:
    input_rows: total number of rows for input matrix.
    input_cols: total number of cols for input matrix.
    n_components: number of dimensions to use for the factors.
    unobserved_weight: weight given to unobserved entries of matrix.
    regularization: weight of L2 regularization term. If None, no
      regularization is done.
    row_init: initializer for row factor. Can be a tensor or numpy constant.
      If set to "random", the value is initialized randomly.
    col_init: initializer for column factor. See row_init for details.
    num_row_shards: number of shards to use for row factors.
    num_col_shards: number of shards to use for column factors.
    row_weights: Must be in one of the following three formats: None, a list
      of lists of non-negative real numbers (or equivalent iterables) or a
      single non-negative real number.
      - When set to None, w_ij = unobserved_weight, which simplifies to ALS.
        Note that col_weights must also be set to "None" in this case.
      - If it is a list of lists of non-negative real numbers, it needs to be
        in the form of [[w_0, w_1, ...], [w_k, ... ], [...]], with the number
        of inner lists matching the number of row factor shards and the
        elements in each inner list being the weights for the rows of the
        corresponding row factor shard. In this case,
        w_ij = unobserved_weight + row_weights[i] * col_weights[j].
      - If this is a single non-negative real number, this value is used for
        all row weights and \\(w_ij\\) = unobserved_weight + row_weights *
        col_weights[j].
      Note that it is allowed to have row_weights as a list while col_weights
      a single number or vice versa.
    col_weights: See row_weights.
    use_factors_weights_cache: When True, the factors and weights will be
      cached on the workers before the updates start. Defaults to True. Note
      that the weights cache is initialized through `worker_init`, and the
      row/col factors cache is initialized through
      `initialize_{col/row}_update_op`. In the case where the weights are
      computed outside and set before the training iterations start, it is
      important to ensure the `worker_init` op is run afterwards for the
      weights cache to take effect.
    use_gramian_cache: When True, the Gramians will be cached on the workers
      before the updates start. Defaults to True.
    use_scoped_vars: When True, the factor and weight vars will also be
      nested in a tf.name_scope.
  """
  self._input_rows = input_rows
  self._input_cols = input_cols
  self._num_row_shards = num_row_shards
  self._num_col_shards = num_col_shards
  self._n_components = n_components
  self._unobserved_weight = unobserved_weight
  self._regularization = regularization
  self._regularization_matrix = (
      regularization * linalg_ops.eye(self._n_components)
      if regularization is not None else None)
  assert (row_weights is None) == (col_weights is None)
  self._use_factors_weights_cache = use_factors_weights_cache
  self._use_gramian_cache = use_gramian_cache

  if use_scoped_vars:
    with ops.name_scope("row_weights"):
      self._row_weights = WALSModel._create_weights(
          row_weights, self._input_rows, self._num_row_shards, "row_weights")
    with ops.name_scope("col_weights"):
      self._col_weights = WALSModel._create_weights(
          col_weights, self._input_cols, self._num_col_shards, "col_weights")
    with ops.name_scope("row_factors"):
      self._row_factors = self._create_factors(
          self._input_rows, self._n_components, self._num_row_shards,
          row_init, "row_factors")
    with ops.name_scope("col_factors"):
      self._col_factors = self._create_factors(
          self._input_cols, self._n_components, self._num_col_shards,
          col_init, "col_factors")
  else:
    self._row_weights = WALSModel._create_weights(
        row_weights, self._input_rows, self._num_row_shards, "row_weights")
    self._col_weights = WALSModel._create_weights(
        col_weights, self._input_cols, self._num_col_shards, "col_weights")
    self._row_factors = self._create_factors(
        self._input_rows, self._n_components, self._num_row_shards, row_init,
        "row_factors")
    self._col_factors = self._create_factors(
        self._input_cols, self._n_components, self._num_col_shards, col_init,
        "col_factors")

  self._row_gramian = self._create_gramian(self._n_components, "row_gramian")
  self._col_gramian = self._create_gramian(self._n_components, "col_gramian")
  with ops.name_scope("row_prepare_gramian"):
    self._row_update_prep_gramian = self._prepare_gramian(
        self._col_factors, self._col_gramian)
  with ops.name_scope("col_prepare_gramian"):
    self._col_update_prep_gramian = self._prepare_gramian(
        self._row_factors, self._row_gramian)
  with ops.name_scope("transient_vars"):
    self._create_transient_vars()
def posdef_inv_cholesky(tensor, reg_mat, damping):
  """Computes inverse(tensor + damping * reg_mat) with Cholesky."""
  chol = linalg_ops.cholesky(tensor + damping * reg_mat)
  identity = linalg_ops.eye(array_ops.shape(tensor)[0], dtype=tensor.dtype)
  return linalg_ops.cholesky_solve(chol, identity)
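# Hedged NumPy check (illustration only, not part of the library code) of what
# posdef_inv_cholesky computes: inverse(tensor + damping * reg_mat), here with
# reg_mat set to the identity.
import numpy as np

np.random.seed(0)
a = np.random.randn(4, 4)
tensor = a @ a.T          # symmetric positive definite
reg_mat = np.eye(4)
damping = 0.1

chol = np.linalg.cholesky(tensor + damping * reg_mat)
# Solve (L L^T) X = I, i.e. the Cholesky-based inverse.
inv_via_chol = np.linalg.solve(chol.T, np.linalg.solve(chol, np.eye(4)))
assert np.allclose(inv_via_chol, np.linalg.inv(tensor + damping * reg_mat))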