def _measure_valued_normal_scale_grad(function,
                                      dist_samples,
                                      dist,
                                      coupling=True):
    """Computes the measure valued gradient wrt the `scale of the Normal `dist`.

  For details, see Section 6 of
    "Monte Carlo Gradient Estimation in Machine learning".

  Args:
    function: The function for which to compute the stochastic gradient.
    dist_samples: a tf.Tensor of samples from `dist`.
    dist: A tfp.distributions.Distribution instance.
      The code here assumes this distribution is from the Normal family.
    coupling: A boolean. Whether or not to use coupling for the positive and
      negative samples. Recommended: True, as this reduces variance.

  Returns:
    A tf.Tensor with the same shape as `dist_samples` (the first dimension is
      the number of samples): the gradient of function(dist_samples) wrt the
      scale of `dist`.
  """
    mean = dist.loc
    # We will rely on backprop to compute the right gradient with respect
    # to the log scale.
    scale = dist.stddev()
    utils.assert_rank(mean, 1)
    utils.assert_rank(scale, 1)

    # Duplicate the D dimension - N x D x D
    base_dist_samples = utils.tile_second_to_last_dim(dist_samples)

    shape = dist_samples.shape
    # N x D
    pos_sample = dist_utils.sample_ds_maxwell(shape, loc=0., scale=1.0)
    if coupling:
        neg_sample = dist_utils.std_gaussian_from_std_dsmaxwell(pos_sample)
    else:
        neg_sample = tf.random.normal(shape)

    # N x D
    positive_diag = mean + scale * pos_sample
    positive_diag.shape.assert_is_compatible_with(shape)
    # N x D
    negative_diag = mean + scale * neg_sample
    negative_diag.shape.assert_is_compatible_with(shape)
    # Set the positive and negative values - N x D x D.
    positive = tf.linalg.set_diag(base_dist_samples, positive_diag)
    negative = tf.linalg.set_diag(base_dist_samples, negative_diag)

    c = scale  # D
    f = function
    # Broadcast the division.
    grads = (_apply_f(f, positive) - _apply_f(f, negative)) / c
    # grads - N x D
    grads.shape.assert_is_compatible_with(shape)

    return grads
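
# A minimal usage sketch for the measure valued estimator above. Illustrative
# only: the diagonal Normal, the dimensions and the quadratic test function
# are assumptions for the sketch, not part of this module.
def _example_measure_valued_scale_grad():
    import tensorflow_probability as tfp  # Assumed available; `dist` is a tfp distribution.

    dist = tfp.distributions.MultivariateNormalDiag(
        loc=tf.zeros([2]), scale_diag=tf.ones([2]))
    dist_samples = dist.sample(100)  # N x D
    # Per-sample gradient estimates of E[f(x)] wrt the scale of `dist`.
    per_sample_grads = _measure_valued_normal_scale_grad(
        lambda x: tf.reduce_sum(x ** 2, axis=-1), dist_samples, dist)
    # Averaging over the sample dimension gives the gradient estimate.
    return tf.reduce_mean(per_sample_grads, axis=0)
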
def score_function_loss(function, dist_samples, dist):
    """Computes the score_function surrogate loss."""
    log_probs = dist.log_prob(tf.stop_gradient(dist_samples))

    # log_probs has one entry per sample.
    utils.assert_rank(log_probs, 1)

    # Weight each stopped-gradient loss value by its log prob; differentiating
    # this surrogate wrt the distribution parameters gives the score function
    # estimator.
    loss = tf.stop_gradient(function(dist_samples)) * log_probs
    return loss
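
# A small sketch (illustrative only; the diagonal Normal, the quadratic test
# function and eager/TF2 execution are assumptions) of how the surrogate above
# is used: differentiating it wrt the distribution parameters yields the score
# function (REINFORCE) estimator E[f(x) d log q(x) / d theta].
def _example_score_function_grad():
    import tensorflow_probability as tfp  # Assumed available.

    loc = tf.Variable(tf.zeros([2]))
    with tf.GradientTape() as tape:
        dist = tfp.distributions.MultivariateNormalDiag(
            loc=loc, scale_diag=tf.ones([2]))
        dist_samples = dist.sample(100)  # N x D
        loss = tf.reduce_mean(
            score_function_loss(
                lambda x: tf.reduce_sum(x ** 2, axis=-1), dist_samples, dist))
    # An unbiased (if high variance) estimate of d E[f(x)] / d loc.
    return tape.gradient(loss, loc)
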
def control_variates_surrogate_loss(dist,
                                    dist_samples,
                                    dist_vars,
                                    model_loss_fn,
                                    grad_loss_fn,
                                    control_variate_fn,
                                    estimate_cv_coeff=True,
                                    num_posterior_samples_cv_coeff=20):
    r"""Computes a surrogate loss by computing the gradients manually.

  The loss function returned is:
     \sum_i stop_grad(grad_i) * var_i,
    where grad_i is computed from the stochastic loss and the control variate.

  This function uses `compute_control_variate_coeff` to compute the control
  variate coefficients and should be used only in conjunction with control
  variates.

  Args:
    dist: a tfp.distributions.Distribution instance.
    dist_samples: samples from dist.
    dist_vars: the variables for which we are interested in computing gradients.
      The distribution samples should depend on these variables.
    model_loss_fn: A function with signature: lambda samples: f(samples).
      The model loss function.
    grad_loss_fn: The gradient estimator function.
      Needs to return both a surrogate loss and a dictionary of jacobians.
    control_variate_fn: The surrogate control variate function. Its gradient
      will be used as a control variate.
    estimate_cv_coeff: Boolean. Whether or not to use a coefficient
       for the control variate to minimize variance of the surrogate loss
       estimate. If False, the control variate coefficient is set to 1.
       If True, uses `compute_control_variate_coeff` to compute the coefficient.
    num_posterior_samples_cv_coeff: The number of posterior samples used
      to compute the cv coeff. Only used if `estimate_cv_coeff`
      is True.

  Returns:
    A tuple containing three elements:
      * the surrogate loss - a tf.Tensor [num_samples].
      * the jacobians wrt dist_vars.
      * a dict of debug information.
  """
    _, expected_control_variate, _, cv_jacobians = control_variate_fn(
        dist, dist_samples, model_loss_fn, grad_loss_fn=grad_loss_fn)

    _, loss_jacobians = grad_loss_fn(model_loss_fn, dist_samples, dist)
    jacobians = {}

    for dist_var in dist_vars:
        if estimate_cv_coeff:
            cv_coeff = compute_control_variate_coeff(
                dist,
                dist_var,
                model_loss_fn=model_loss_fn,
                grad_loss_fn=grad_loss_fn,
                control_variate_fn=control_variate_fn,
                num_samples=num_posterior_samples_cv_coeff)
        else:
            cv_coeff = 1.

        var_jacobians = loss_jacobians[
            dist_var] - cv_coeff * cv_jacobians[dist_var]
        # Num samples x num_variables
        utils.assert_rank(var_jacobians, 2)

        jacobians[dist_var] = var_jacobians

        utils.add_grads_to_jacobians(jacobians,
                                     expected_control_variate * cv_coeff,
                                     [dist_var])

    surrogate_loss = 0.0
    for dist_var in dist_vars:
        surrogate_loss += tf.stop_gradient(jacobians[dist_var]) * dist_var

    # Sum over variable dimensions.
    surrogate_loss = tf.reduce_sum(surrogate_loss, axis=1)

    return surrogate_loss, jacobians
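
# A tiny sketch (illustrative only; assumes eager/TF2 execution) of why the
# surrogate construction above works: for a surrogate of the form
# sum_d stop_gradient(jacobian) * var, backprop returns exactly the
# hand-computed jacobian as the gradient wrt the variable.
def _example_surrogate_from_jacobians():
    var = tf.Variable([0.5, -1.0])
    jacobians = tf.constant([[0.1, 0.2],
                             [0.3, 0.4]])  # num_samples x num_variables
    with tf.GradientTape() as tape:
        surrogate = tf.reduce_sum(tf.stop_gradient(jacobians) * var, axis=1)
        loss = tf.reduce_mean(surrogate)
    # Equals the per-dimension mean of `jacobians` over samples: [0.2, 0.3].
    return tape.gradient(loss, var)
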
def compute_control_variate_coeff(dist,
                                  dist_var,
                                  model_loss_fn,
                                  grad_loss_fn,
                                  control_variate_fn,
                                  num_samples,
                                  moving_averages=False,
                                  eps=1e-3):
    r"""Computes the control variate coefficients for the given variable.

  The coefficient is given by:
    \sum_k cov(df/d var_k, dcv/d var_k) / (\sum_k var(dcv/d var_k) + eps)

  Where var_k is the k'th element of the variable dist_var.
  The covariance and variance calculations are done from samples obtained
  from the distribution `dist`.

  Args:
    dist: a tfp.distributions.Distribution instance.
    dist_var: the variable for which we are interested in computing the
      coefficient.
      The distribution samples should depend on this variable.
    model_loss_fn: A function with signature: lambda samples: f(samples).
      The model loss function.
    grad_loss_fn: The gradient estimator function.
      Needs to return both a surrogate loss and a dictionary of jacobians.
    control_variate_fn: The surrogate control variate function. Its gradient
      will be used as a control variate.
    num_samples: Int. The number of samples to use for the cov/var calculation.
    moving_averages: Bool. Whether or not to use moving averages for the
      calculation.
    eps: Float. Used to stabilize division.

  Returns:
    a tf.Tensor of rank 0. The coefficient for the input variable.
  """
    # Resample to avoid biased gradients.

    cv_dist_samples = dist.sample(num_samples)
    cv_jacobians = control_variate_fn(dist,
                                      cv_dist_samples,
                                      model_loss_fn,
                                      grad_loss_fn=grad_loss_fn)[-1]
    loss_jacobians = grad_loss_fn(model_loss_fn, cv_dist_samples, dist)[-1]

    cv_jacobians = cv_jacobians[dist_var]
    loss_jacobians = loss_jacobians[dist_var]
    # Num samples x num_variables
    utils.assert_rank(loss_jacobians, 2)
    # Num samples x num_variables
    utils.assert_rank(cv_jacobians, 2)

    mean_f = tf.reduce_mean(loss_jacobians, axis=0)
    mean_cv, var_cv = tf.nn.moments(cv_jacobians, axes=[0])

    cov = tf.reduce_mean((loss_jacobians - mean_f) * (cv_jacobians - mean_cv),
                         axis=0)

    utils.assert_rank(var_cv, 1)
    utils.assert_rank(cov, 1)

    # Compute the coefficients which minimize variance.
    # Since we want to minimize the variances across parameter dimensions,
    # the optimal coefficients are given by the sum of covariances per
    # dimension over the sum of variances per dimension.
    cv_coeff = tf.reduce_sum(cov) / (tf.reduce_sum(var_cv) + eps)
    cv_coeff = tf.stop_gradient(cv_coeff)
    utils.assert_rank(cv_coeff, 0)
    if moving_averages:
        cv_coeff = tf.stop_gradient(snt.MovingAverage(decay=0.9)(cv_coeff))

    return cv_coeff
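
# A toy sketch (illustrative only; the synthetic jacobians below are
# assumptions) of the coefficient formula implemented above:
# eta = sum_k cov(g_k, c_k) / (sum_k var(c_k) + eps), computed from
# per-sample loss jacobians g and control variate jacobians c.
def _example_cv_coeff(eps=1e-3):
    g = tf.random.normal([100, 3])                    # loss jacobians, N x D
    c = 0.8 * g + 0.1 * tf.random.normal([100, 3])    # correlated cv jacobians
    mean_g = tf.reduce_mean(g, axis=0)
    mean_c, var_c = tf.nn.moments(c, axes=[0])
    cov = tf.reduce_mean((g - mean_g) * (c - mean_c), axis=0)
    # Roughly 0.8 / 0.65 ~= 1.2 for these synthetic jacobians, since
    # cov(g, c) ~= 0.8 * var(g) and var(c) ~= 0.64 * var(g) + 0.01.
    return tf.reduce_sum(cov) / (tf.reduce_sum(var_c) + eps)
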