def _do_update(x_update_diff_norm_sq, x_update, hess_matmul_x_update):  # pylint: disable=missing-docstring
  # Compute the `coord` column of the regularized Hessian:
  # H[:, coord] = Outer^T @ (middle * (Outer @ onehot(coord))).
  hessian_column_with_l2 = sparse_or_dense_matvecmul(
      hessian_unregularized_loss_outer,
      hessian_unregularized_loss_middle * _sparse_or_dense_matmul_onehot(
          hessian_unregularized_loss_outer, coord),
      adjoint_a=True)

  if l2_regularizer is not None:
    hessian_column_with_l2 += _one_hot_like(
        hessian_column_with_l2, coord, on_value=2. * l2_regularizer)

  # Move the batch dimensions of `hessian_column_with_l2` to rightmost in
  # order to conform to `hess_matmul_x_update`.
  n = tf.rank(hessian_column_with_l2)
  perm = tf.roll(tf.range(n), shift=1, axis=0)
  hessian_column_with_l2 = tf.transpose(a=hessian_column_with_l2, perm=perm)

  # Update the entire batch at `coord` even if `delta` may be 0 at some
  # batch coordinates. In those cases, adding `delta` is a no-op.
  x_update = tf.tensor_scatter_nd_add(x_update, [[coord]], [delta])

  with tf.control_dependencies([x_update]):
    x_update_diff_norm_sq_ = x_update_diff_norm_sq + delta**2
    hess_matmul_x_update_ = (
        hess_matmul_x_update + delta * hessian_column_with_l2)

    # Hint that loop vars retain the same shape.
    x_update_diff_norm_sq_.set_shape(
        x_update_diff_norm_sq_.shape.merge_with(x_update_diff_norm_sq.shape))
    hess_matmul_x_update_.set_shape(
        hess_matmul_x_update_.shape.merge_with(hess_matmul_x_update.shape))

    return [x_update_diff_norm_sq_, x_update, hess_matmul_x_update_]
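
# Standalone sketch (hypothetical helper, not library code) of the axis
# rotation performed in `_do_update` above: rolling `tf.range(rank)` by one
# sends the last axis to the front, so a tensor shaped `[batch..., k]`
# becomes `[k, batch...]`, leaving the batch dimensions rightmost. The
# shapes below are arbitrary illustrative choices.
def _demo_roll_batch_dims():
  t = tf.zeros([2, 3, 5])                                # [batch, batch, k]
  perm = tf.roll(tf.range(tf.rank(t)), shift=1, axis=0)  # [2, 0, 1]
  return tf.transpose(a=t, perm=perm).shape              # [5, 2, 3]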
def _grad_neg_log_likelihood_and_fim(model_matrix, linear_response, response,
                                     model):
  """Computes the neg-log-likelihood gradient and Fisher information for a GLM.

  Note that Fisher information is related to the Hessian of the log-likelihood
  by the equation

  ```none
  FisherInfo = E[Hessian with respect to model_coefficients of -LogLikelihood(
      Y | model_matrix, model_coefficients)]
  ```

  where `LogLikelihood` is the log-likelihood of a generalized linear model
  parameterized by `model_matrix` and `model_coefficients`, and the
  expectation is taken over `Y`, distributed according to the same GLM with
  the same parameter values.

  Args:
    model_matrix: (Batch of) matrix-shaped, `float` `Tensor` or `SparseTensor`
      where each row represents a sample's features.  Has shape `[N, n]` where
      `N` is the number of data samples and `n` is the number of features per
      sample.
    linear_response: (Batch of) vector-shaped `Tensor` with the same dtype as
      `model_matrix`, equal to `model_matrix @ model_coefficients` where
      `model_coefficients` are the coefficients of the linear component of the
      GLM.
    response: (Batch of) vector-shaped `Tensor` with the same dtype as
      `model_matrix` where each element represents a sample's observed
      response (to the corresponding row of features).
    model: `tfp.glm.ExponentialFamily`-like instance, which specifies the link
      function and distribution of the GLM, and thus characterizes the
      negative log-likelihood.  Must have sufficient statistic equal to the
      response, that is, `T(y) = y`.

  Returns:
    grad_neg_log_likelihood: (Batch of) vector-shaped `Tensor` with the same
      shape and dtype as a single row of `model_matrix`, representing the
      gradient of the negative log-likelihood of `response` given the linear
      response `linear_response`.
    fim_middle: (Batch of) vector-shaped `Tensor` with the same shape and
      dtype as a single column of `model_matrix`, satisfying the equation
      `Fisher information = Transpose(model_matrix) @ diag(fim_middle) @
      model_matrix`.
  """
  # TODO(b/111926503): Determine whether there are some practical cases where
  # it is computationally favorable to compute the full FIM.
  mean, variance, grad_mean = model(linear_response)

  # Entries where the mean function is flat (`grad_mean == 0`) or where the
  # variance is non-finite or non-positive carry no usable curvature
  # information; mask them rather than propagate NaN/Inf.
  is_valid = (
      tf.math.is_finite(grad_mean) & tf.not_equal(grad_mean, 0.)
      & tf.math.is_finite(variance) & (variance > 0.))

  def _mask_if_invalid(x, mask):
    return tf.where(
        is_valid, x, np.array(mask, dtype_util.as_numpy_dtype(x.dtype)))

  # TODO(b/111923449): Link to derivation once it's available.
  v = (response - mean) * _mask_if_invalid(grad_mean, 1) / _mask_if_invalid(
      variance, np.inf)
  grad_log_likelihood = sparse_or_dense_matvecmul(
      model_matrix, v, adjoint_a=True)
  fim_middle = _mask_if_invalid(grad_mean, 0.)**2 / _mask_if_invalid(
      variance, np.inf)
  return -grad_log_likelihood, fim_middle
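
# Hedged usage sketch for `_grad_neg_log_likelihood_and_fim`. The helper name
# `_demo_grad_and_fim`, the synthetic data, and the choice of a Bernoulli GLM
# are illustrative assumptions, not part of the library code above.
def _demo_grad_and_fim():
  """Evaluates the gradient and FIM factor of a Bernoulli GLM at zero."""
  x = np.random.randn(100, 5).astype(np.float32)  # model_matrix, shape [N, n]
  y = np.random.binomial(1, 0.5, 100).astype(np.float32)  # observed response
  linear_response = tf.linalg.matvec(x, tf.zeros([5]))  # at coefficients = 0
  g, fim_middle = _grad_neg_log_likelihood_and_fim(
      x, linear_response, y, tfp.glm.Bernoulli())
  # If the full Fisher information were ever needed, it would be
  # tf.transpose(x) @ tf.linalg.diag(fim_middle) @ x.
  return g, fim_middle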
def _neg_log_likelihood(x):
  """Returns the negative log-likelihood of `response` at coefficients `x`."""
  predicted_linear_response = sparse_or_dense_matvecmul(model_matrix, x)
  log_probs = model.log_prob(response, predicted_linear_response)
  return -log_probs
def _grad_neg_log_likelihood_and_fim_fn(x):
  """Returns the loss gradient and FIM factors at coefficients `x`."""
  predicted_linear_response = sparse_or_dense_matvecmul(model_matrix, x)
  g, h_middle = _grad_neg_log_likelihood_and_fim(
      model_matrix, predicted_linear_response, response, model)
  return g, model_matrix, h_middle
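
# Hedged sketch of how closures like the two above are consumed: the triple
# returned by `_grad_neg_log_likelihood_and_fim_fn` matches the
# `grad_and_hessian_loss_fn` contract of
# `tfp.optimizer.proximal_hessian_sparse_minimize`. The helper name, the
# synthetic data, the Poisson GLM, and every constant below are illustrative
# assumptions, not the verbatim wiring inside the library.
def _demo_minimize_sparse():
  """Minimizes an L1-regularized Poisson GLM loss on synthetic data."""
  model_matrix = np.random.randn(50, 4).astype(np.float32)
  response = np.random.poisson(1., size=50).astype(np.float32)
  model = tfp.glm.Poisson()

  def grad_and_hessian_loss_fn(x):
    linear = tf.linalg.matvec(model_matrix, x)
    g, h_middle = _grad_neg_log_likelihood_and_fim(
        model_matrix, linear, response, model)
    return g, model_matrix, h_middle

  return tfp.optimizer.proximal_hessian_sparse_minimize(
      grad_and_hessian_loss_fn,
      x_start=tf.zeros([4]),
      tolerance=1e-6,
      l1_regularizer=0.1,
      maximum_iterations=3)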
def fit_sparse_one_step(model_matrix,
                        response,
                        model,
                        model_coefficients_start,
                        tolerance,
                        l1_regularizer,
                        l2_regularizer=None,
                        maximum_full_sweeps=None,
                        learning_rate=None,
                        name=None):
  """One step of (the outer loop of) the GLM fitting algorithm.

  This function returns a new value of `model_coefficients`, equal to
  `model_coefficients_start + model_coefficients_update`.  The increment
  `model_coefficients_update in R^n` is computed by a coordinate descent
  method, that is, by a loop in which each iteration updates exactly one
  coordinate of `model_coefficients_update`.  (Some updates may leave the
  value of the coordinate unchanged.)

  The particular update method used is to apply an L1-based proximity
  operator, "soft threshold", whose fixed point `model_coefficients_update^*`
  is the desired minimum

  ```none
  model_coefficients_update^* = argmin{
      -LogLikelihood(model_coefficients_start + model_coefficients_update')
        + l1_regularizer *
            ||model_coefficients_start + model_coefficients_update'||_1
        + l2_regularizer *
            ||model_coefficients_start + model_coefficients_update'||_2**2
      : model_coefficients_update' }
  ```

  where in each iteration `model_coefficients_update'` has at most one nonzero
  coordinate.

  This update method preserves sparsity, i.e., tends to find sparse solutions
  if `model_coefficients_start` is sparse.  Additionally, the choice of step
  size is based on curvature (Fisher information matrix), which significantly
  speeds up convergence.

  Args:
    model_matrix: (Batch of) matrix-shaped, `float` `Tensor` or `SparseTensor`
      where each row represents a sample's features.  Has shape `[N, n]` where
      `N` is the number of data samples and `n` is the number of features per
      sample.
    response: (Batch of) vector-shaped `Tensor` with the same dtype as
      `model_matrix` where each element represents a sample's observed
      response (to the corresponding row of features).
    model: `tfp.glm.ExponentialFamily`-like instance, which specifies the link
      function and distribution of the GLM, and thus characterizes the
      negative log-likelihood which will be minimized.  Must have sufficient
      statistic equal to the response, that is, `T(y) = y`.
    model_coefficients_start: (Batch of) vector-shaped, `float` `Tensor` with
      the same dtype as `model_matrix`, representing the initial values of the
      coefficients for the GLM regression.  Has shape `[n]` where
      `model_matrix` has shape `[N, n]`.
    tolerance: scalar, `float` `Tensor` representing the convergence
      threshold.  The optimization step will terminate early, returning its
      current value of
      `model_coefficients_start + model_coefficients_update`, once the
      following condition is met:
      `||model_coefficients_update_end - model_coefficients_update_start||_2
         / (1 + ||model_coefficients_start||_2)
       < sqrt(tolerance)`,
      where `model_coefficients_update_end` is the value of
      `model_coefficients_update` at the end of a sweep and
      `model_coefficients_update_start` is the value of
      `model_coefficients_update` at the beginning of that sweep.
    l1_regularizer: scalar, `float` `Tensor` representing the weight of the L1
      regularization term (see equation above).
    l2_regularizer: scalar, `float` `Tensor` representing the weight of the L2
      regularization term (see equation above).
      Default value: `None` (i.e., no L2 regularization).
    maximum_full_sweeps: Python integer specifying maximum number of sweeps to
      run.  A "sweep" consists of an iteration of coordinate descent on each
      coordinate.  After this many sweeps, the algorithm will terminate even
      if convergence has not been reached.
      Default value: `1`.
    learning_rate: scalar, `float` `Tensor` representing a multiplicative
      factor used to dampen the proximal gradient descent steps.
      Default value: `None` (i.e., factor is conceptually `1`).
    name: Python string representing the name of the TensorFlow operation.
      The default name is `"fit_sparse_one_step"`.

  Returns:
    model_coefficients: (Batch of) `Tensor` having the same shape and dtype as
      `model_coefficients_start`, representing the updated value of
      `model_coefficients`, that is,
      `model_coefficients_start + model_coefficients_update`.
    is_converged: scalar, `bool` `Tensor` indicating whether convergence
      occurred across all batches within the specified number of sweeps.
    iter: scalar, `int` `Tensor` representing the actual number of coordinate
      updates made (before achieving convergence).  Since each sweep consists
      of `tf.size(model_coefficients_start)` iterations, the maximum number of
      updates is `maximum_full_sweeps * tf.size(model_coefficients_start)`.
  """
  graph_deps = [
      model_matrix,
      response,
      model_coefficients_start,
      l1_regularizer,
      l2_regularizer,
      maximum_full_sweeps,
      tolerance,
      learning_rate,
  ]
  with tf.compat.v1.name_scope(name, 'fit_sparse_one_step', graph_deps):
    predicted_linear_response = sparse_or_dense_matvecmul(
        model_matrix, model_coefficients_start)
    g, h_middle = _grad_neg_log_likelihood_and_fim(
        model_matrix, predicted_linear_response, response, model)
    return tfp.optimizer.proximal_hessian_sparse_one_step(
        gradient_unregularized_loss=g,
        hessian_unregularized_loss_outer=model_matrix,
        hessian_unregularized_loss_middle=h_middle,
        x_start=model_coefficients_start,
        l1_regularizer=l1_regularizer,
        l2_regularizer=l2_regularizer,
        maximum_full_sweeps=maximum_full_sweeps,
        tolerance=tolerance,
        learning_rate=learning_rate,
        name=name)
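
# Hedged usage sketch for `fit_sparse_one_step`. The helper name, the
# synthetic sparse-regression data, the Normal GLM, and every constant below
# are illustrative assumptions. One call performs up to `maximum_full_sweeps`
# sweeps of coordinate descent starting from the given coefficients.
def _demo_fit_sparse_one_step():
  """Runs one outer GLM fitting step on synthetic sparse-regression data."""
  x = np.random.randn(200, 10).astype(np.float32)
  w_true = np.zeros([10], np.float32)
  w_true[[1, 4]] = [2., -3.]  # sparse ground-truth coefficients
  y = tf.linalg.matvec(x, w_true) + np.random.randn(200).astype(np.float32)
  w, is_converged, num_iter = fit_sparse_one_step(
      model_matrix=x,
      response=y,
      model=tfp.glm.Normal(),
      model_coefficients_start=tf.zeros([10]),
      tolerance=1e-6,
      l1_regularizer=0.5,
      maximum_full_sweeps=1)
  return w, is_converged, num_iter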