def testApproximateLossIsAccurate(self, float_dtype):
  # Check that the approximate loss (lossfun() with epsilon=1e-6) reasonably
  # approximates the true loss (lossfun() with epsilon=0.) for a range of
  # values of alpha (skipping alpha=0, where the approximation is poor).
  x = np.arange(-10, 10, 0.1, float_dtype)
  scale = float_dtype(1.7)
  for alpha in [-4, -2, -0.2, -0.01, 0.01, 0.2, 1, 1.99, 2, 2.01, 4]:
    alpha = float_dtype(alpha)
    loss = general.lossfun(x, alpha, scale)
    loss_approx = general.lossfun(x, alpha, scale, approximate=True)
    self.assertAllClose(
        loss, loss_approx, rtol=1e-5, atol=1e-4, msg='alpha=%g' % (alpha))
def testGradientMatchesFiniteDifferences(self, float_dtype):
  # Test that the loss and its approximation both return gradients that are
  # close to the numerical gradient from finite differences, with forward
  # differencing. Returning correct gradients is TensorFlow's job, so this is
  # just an aggressive sanity check in case some implementation detail causes
  # gradients to incorrectly go to zero due to quantization or stop_gradients
  # in some op that is used by the loss.
  for approximate in [False, True]:
    num_samples = 100000

    # Normally distributed inputs.
    x = float_dtype(np.random.normal(size=num_samples))

    # Uniformly distributed values in (-16, 3), quantized to the nearest
    # 0.1 and then shifted by 0.05 so that we avoid the special cases at
    # 0 and 2 where the analytical gradient won't match finite differences.
    alpha = float_dtype(
        np.round(np.random.uniform(-16, 3, num_samples) * 10) / 10. + 0.05)

    # Random uniformly distributed values in [0.5, 1.5].
    scale = float_dtype(np.random.uniform(0.5, 1.5, num_samples))

    # Compute the loss and its derivative with respect to all three inputs.
    x, alpha, scale = [tf.convert_to_tensor(z) for z in (x, alpha, scale)]
    with tf.GradientTape(persistent=True) as tape:
      for z in (x, alpha, scale):
        tape.watch(z)
      loss = general.lossfun(x, alpha, scale, approximate=approximate)
    d_x, d_alpha, d_scale = [
        tape.gradient(tf.reduce_sum(loss), z) for z in (x, alpha, scale)
    ]

    # Assert that the 95th percentile of errors is <= 1e-2.
    def assert_percentile_close(v1, v2):
      self.assertLessEqual(np.percentile(np.abs(v1 - v2), 95), 1e-2)

    step_size = float_dtype(1e-3)
    n_x = (general.lossfun(x + step_size, alpha, scale,
                           approximate=approximate) - loss) / step_size
    n_alpha = (general.lossfun(x, alpha + step_size, scale,
                               approximate=approximate) - loss) / step_size
    n_scale = (general.lossfun(x, alpha, scale + step_size,
                               approximate=approximate) - loss) / step_size
    assert_percentile_close(n_x, d_x)
    assert_percentile_close(n_alpha, d_alpha)
    assert_percentile_close(n_scale, d_scale)
def while_body(samples, accepted):
  """Generate N proposal samples, and then perform rejection sampling."""
  # Draw N samples from a Cauchy, our proposal distribution.
  cauchy_sample = tf.cast(cauchy.sample(shape), float_dtype)

  # Compute the likelihood of each sample under its target distribution.
  nll = self.nllfun(cauchy_sample, alpha, tf.cast(1, float_dtype))

  # Bound the NLL. We don't use the approximate loss as it may cause
  # unpredictable behavior in the context of sampling.
  nll_bound = general.lossfun(
      cauchy_sample,
      tf.cast(0, float_dtype),
      tf.cast(1, float_dtype),
      approximate=False) + self.log_base_partition_function(alpha)

  # Draw N samples from a uniform distribution, and use each uniform
  # sample to decide whether or not to accept each proposal sample.
  uniform_sample = tf.cast(uniform.sample(shape), float_dtype)
  accept = uniform_sample <= tf.math.exp(nll_bound - nll)

  # If a sample is accepted, replace its element in `samples` with the
  # proposal sample, and set its bit in `accepted` to True.
  samples = tf.where(accept, cauchy_sample, samples)
  accepted = accept | accepted
  return (samples, accepted)
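# A minimal sketch (not from the original source) of how a body like
# `while_body` is typically driven with tf.while_loop: keep drawing proposals
# until every element has been accepted at least once. The initial values and
# the stopping condition below are illustrative assumptions.
#
#   samples = tf.zeros(shape, float_dtype)
#   accepted = tf.zeros(shape, dtype=bool)
#   samples, _ = tf.while_loop(
#       cond=lambda _, accepted: ~tf.reduce_all(accepted),
#       body=while_body,
#       loop_vars=(samples, accepted))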
def testLossAndGradientsAreFinite(self, float_dtype):
  # Test that the loss and its approximation both give finite losses and
  # derivatives everywhere that they should for a wide range of values.
  for approximate in [False, True]:
    num_samples = 100000

    # Normally distributed inputs.
    x = float_dtype(np.random.normal(size=num_samples))

    # Uniformly distributed values in (-16, 3), quantized to the nearest
    # 0.1 to ensure that we hit the special cases at 0, 2.
    alpha = float_dtype(
        np.round(np.random.uniform(-16, 3, num_samples) * 10) / 10.)

    # Random log-normally distributed values in approx (1e-5, 100000):
    scale = float_dtype(
        np.exp(np.random.normal(size=num_samples) * 4.) + 1e-5)

    # Compute the loss and its derivative with respect to all three inputs.
    x, alpha, scale = [tf.convert_to_tensor(z) for z in (x, alpha, scale)]
    with tf.GradientTape(persistent=True) as tape:
      for z in (x, alpha, scale):
        tape.watch(z)
      loss = general.lossfun(x, alpha, scale, approximate=approximate)
    d_x, d_alpha, d_scale = [
        tape.gradient(tf.reduce_sum(loss), z) for z in (x, alpha, scale)
    ]

    for v in [loss, d_x, d_alpha, d_scale]:
      self.assertTrue(np.all(np.isfinite(v)))
def _precompute_lossfun_inputs(self, float_dtype):
  """Precompute a loss and its derivatives for random inputs and parameters.

  Generates a large number of random inputs to the loss, and random
  shape/scale parameters for the loss function at each sample, and
  computes the loss and its derivative with respect to all inputs and
  parameters, returning everything to be used to assert various properties
  in our unit tests.

  Args:
    float_dtype: The float precision to be used (np.float32 or np.float64).

  Returns:
    A tuple containing:
     (the number (int) of samples, and the length of all following arrays,
      A np.array (float_dtype) of losses for each sample,
      A np.array (float_dtype) of residuals of each sample (the loss inputs),
      A np.array (float_dtype) of shape parameters of each loss,
      A np.array (float_dtype) of scale parameters of each loss,
      A np.array (float_dtype) of derivatives of each loss wrt each x,
      A np.array (float_dtype) of derivatives of each loss wrt each alpha,
      A np.array (float_dtype) of derivatives of each loss wrt each scale)

  Typical usage example:
  (num_samples, loss, x, alpha, scale, d_x, d_alpha, d_scale)
      = self._precompute_lossfun_inputs(np.float32)
  """
  with self.session() as sess:
    num_samples = 100000

    # Normally distributed inputs.
    x = float_dtype(np.random.normal(size=num_samples))

    # Uniformly distributed values in (-16, 3), quantized to the nearest 0.1
    # to ensure that we hit the special cases at 0, 2.
    alpha = float_dtype(
        np.round(np.random.uniform(-16, 3, num_samples) * 10) / 10.)
    # Push the sampled alphas at the extents of the range to +/- infinity, so
    # that we probe those cases too.
    alpha[alpha == 3.] = float_dtype(float('inf'))
    alpha[alpha == -16.] = -float_dtype(float('inf'))

    # Random log-normally distributed values in approx (1e-5, 100000):
    scale = float_dtype(
        np.exp(np.random.normal(size=num_samples) * 4.) + 1e-5)

    # Compute the loss and its derivative with respect to all three inputs.
    x_ph = tf.placeholder(x.dtype, num_samples)
    alpha_ph = tf.placeholder(alpha.dtype, num_samples)
    scale_ph = tf.placeholder(scale.dtype, num_samples)
    lossfun_ph = general.lossfun(x_ph, alpha_ph, scale_ph)
    loss, (d_x, d_alpha, d_scale) = sess.run(
        (lossfun_ph,
         tf.gradients(tf.reduce_sum(lossfun_ph),
                      (x_ph, alpha_ph, scale_ph))), {
                          x_ph: x,
                          alpha_ph: alpha,
                          scale_ph: scale,
                      })
    return (num_samples, loss, x, alpha, scale, d_x, d_alpha, d_scale)
def testLossfunPreservesDtype(self, float_dtype):
  """Check the loss's output has the same precision as its input."""
  n = 16
  x = float_dtype(np.random.normal(size=n))
  alpha = float_dtype(np.random.normal(size=n))
  scale = float_dtype(np.exp(np.random.normal(size=n)))
  y = general.lossfun(x, alpha, scale)
  self.assertDTypeEqual(y, float_dtype)
def _lossfun_preserves_dtype(self, float_dtype):
  """Check the loss's output has the same precision as its input."""
  n = 16
  x = float_dtype(np.random.normal(size=n))
  alpha = float_dtype(np.random.normal(size=n))
  scale = float_dtype(np.exp(np.random.normal(size=n)))
  with self.session():
    y = general.lossfun(x, alpha, scale).eval()
  self.assertDTypeEqual(y, float_dtype)
def testLossIsScaleInvariant(self, float_dtype):
  # Check that loss(mult * x, alpha, mult * scale) == loss(x, alpha, scale)
  (num_samples, loss, x, alpha, scale, _, _, _) = (
      self._precompute_lossfun_inputs(float_dtype))
  # Random log-normally distributed scalings in ~(0.2, 20)
  mult = float_dtype(
      np.maximum(0.2, np.exp(np.random.normal(size=num_samples))))
  # Compute the scaled loss.
  loss_scaled = general.lossfun(mult * x, alpha, mult * scale)
  self.assertAllClose(loss, loss_scaled, atol=1e-4, rtol=1e-4)
def testAlphaEqualsZero(self, float_dtype):
  # Check that alpha == 0 reproduces Cauchy aka Lorentzian loss.
  x = np.arange(-20, 20, 0.1, float_dtype)
  alpha = float_dtype(0.)
  scale = float_dtype(1.7)
  # Our loss.
  loss = general.lossfun(x, alpha, scale)
  # Cauchy/Lorentzian loss.
  loss_true = tf.math.log(0.5 * tf.square(x / scale) + 1.)
  self._assert_all_close_according_to_type(loss, loss_true)
def testAlphaEqualsNegativeInfinity(self, float_dtype):
  # Check that alpha == -Infinity reproduces Welsch aka Leclerc loss.
  x = np.arange(-20, 20, 0.1, float_dtype)
  alpha = float_dtype(-float('inf'))
  scale = float_dtype(1.7)
  # Our loss.
  loss = general.lossfun(x, alpha, scale)
  # Welsch/Leclerc loss.
  loss_true = 1. - tf.math.exp(-0.5 * tf.square(x / scale))
  self._assert_all_close_according_to_type(loss, loss_true)
def testAlphaEqualsInfinity(self, float_dtype):
  # Check that alpha == Infinity takes the correct form.
  x = np.arange(-20, 20, 0.1, float_dtype)
  alpha = float_dtype(float('inf'))
  scale = float_dtype(1.7)
  # Our loss.
  loss = general.lossfun(x, alpha, scale)
  # The true loss.
  loss_true = tf.math.exp(0.5 * tf.square(x / scale)) - 1.
  self._assert_all_close_according_to_type(loss, loss_true)
def testAlphaEqualsTwo(self, float_dtype):
  # Check that alpha == 2 reproduces L2 loss.
  x = np.arange(-20, 20, 0.1, float_dtype)
  alpha = float_dtype(2.)
  scale = float_dtype(1.7)
  # Our loss.
  loss = general.lossfun(x, alpha, scale)
  # L2 Loss.
  loss_true = 0.5 * tf.square(x / scale)
  self._assert_all_close_according_to_type(loss, loss_true)
def testAlphaEqualsOne(self, float_dtype):
  # Check that alpha == 1 reproduces Charbonnier aka pseudo-Huber loss.
  x = np.arange(-20, 20, 0.1, float_dtype)
  alpha = float_dtype(1.)
  scale = float_dtype(1.7)
  # Our loss.
  loss = general.lossfun(x, alpha, scale)
  # Charbonnier loss.
  loss_true = tf.sqrt(tf.square(x / scale) + 1.) - 1.
  self._assert_all_close_according_to_type(loss, loss_true)
def testAlphaEqualsNegativeTwo(self, float_dtype):
  # Check that alpha == -2 reproduces Geman-McClure loss.
  x = np.arange(-20, 20, 0.1, float_dtype)
  alpha = float_dtype(-2.)
  scale = float_dtype(1.7)
  # Our loss.
  loss = general.lossfun(x, alpha, scale)
  # Geman-McClure loss.
  loss_true = 2. * tf.square(x / scale) / (tf.square(x / scale) + 4.)
  self._assert_all_close_according_to_type(loss, loss_true)
def _alpha_equals_two(self, float_dtype):
  # Check that alpha == 2 reproduces L2 loss.
  with self.session():
    x = np.arange(-20, 20, 0.1, float_dtype)
    alpha = float_dtype(2.)
    scale = float_dtype(1.7)
    # Our loss.
    loss = general.lossfun(x, alpha, scale).eval()
    # L2 Loss.
    loss_true = (0.5 * tf.square(x / scale)).eval()
    self._assert_all_close_according_to_type(loss, loss_true)
def _alpha_equals_zero(self, float_dtype):
  # Check that alpha == 0 reproduces Cauchy aka Lorentzian loss.
  with self.session():
    x = np.arange(-20, 20, 0.1, float_dtype)
    alpha = float_dtype(0.)
    scale = float_dtype(1.7)
    # Our loss.
    loss = general.lossfun(x, alpha, scale).eval()
    # Cauchy/Lorentzian loss.
    loss_true = (tf.log(0.5 * tf.square(x / scale) + 1.)).eval()
    self._assert_all_close_according_to_type(loss, loss_true)
def testAlphaEqualsFour(self, float_dtype):
  # Check that alpha == 4 reproduces a quartic.
  x = np.arange(-20, 20, 0.1, float_dtype)
  alpha = float_dtype(4.)
  scale = float_dtype(1.7)
  # Our loss.
  loss = general.lossfun(x, alpha, scale)
  # The true loss.
  loss_true = (
      tf.square(tf.square(x / scale)) / 8. + tf.square(x / scale) / 2.)
  self._assert_all_close_according_to_type(loss, loss_true)
def _alpha_equals_four(self, float_dtype):
  # Check that alpha == 4 reproduces a quartic.
  with self.session():
    x = np.arange(-20, 20, 0.1, float_dtype)
    alpha = float_dtype(4.)
    scale = float_dtype(1.7)
    # Our loss.
    loss = general.lossfun(x, alpha, scale).eval()
    # The true loss.
    loss_true = (tf.square(tf.square(x / scale)) / 8. +
                 tf.square(x / scale) / 2.).eval()
    self._assert_all_close_according_to_type(loss, loss_true)
def _alpha_equals_negative_two(self, float_dtype):
  # Check that alpha == -2 reproduces Geman-McClure loss.
  with self.session():
    x = np.arange(-20, 20, 0.1, float_dtype)
    alpha = float_dtype(-2.)
    scale = float_dtype(1.7)
    # Our loss.
    loss = general.lossfun(x, alpha, scale).eval()
    # Geman-McClure loss.
    loss_true = (2. * tf.square(x / scale) /
                 (tf.square(x / scale) + 4.)).eval()
    self._assert_all_close_according_to_type(loss, loss_true)
def numerical_base_partition_function(alpha):
  """Numerically approximate the partition function Z(alpha)."""
  # Generate `num_samples` values in [-x_max, x_max], with more samples near
  # the origin as `power` is set to larger values.
  num_samples = 2**24 + 1  # We want an odd value so that 0 gets sampled.
  x_max = 10**10
  power = 6
  t = tf.linspace(
      tf.constant(-1, tf.float64), tf.constant(1, tf.float64), num_samples)
  t = tf.sign(t) * tf.abs(t)**power
  x = t * x_max

  # Compute losses for the values, then exponentiate the negative losses and
  # integrate with the trapezoid rule to get the partition function.
  losses = general.lossfun(x, alpha, np.float64(1))
  y = tf.math.exp(-losses)
  partition = tf.reduce_sum((y[1:] + y[:-1]) * (x[1:] - x[:-1])) / 2.
  return partition
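# An illustrative sanity check (not part of the original module): at alpha = 2
# the density is a unit Gaussian, so Z(2) should be close to sqrt(2 * pi)
# ~= 2.5066, and at alpha = 0 (Cauchy) Z(0) should be close to pi * sqrt(2)
# ~= 4.4429.
#
#   np.testing.assert_allclose(
#       numerical_base_partition_function(np.float64(2.)),
#       np.sqrt(2. * np.pi), rtol=1e-3)
#   np.testing.assert_allclose(
#       numerical_base_partition_function(np.float64(0.)),
#       np.pi * np.sqrt(2.), rtol=1e-3)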
def nllfun(self, x, alpha, scale):
  r"""Implements the negative log-likelihood (NLL).

  Specifically, we implement -log(p(x | 0, \alpha, c)) of Equation 16 in the
  paper as nllfun(x, alpha, scale).

  Args:
    x: The residual for which the NLL is being computed. x can have any
      shape, and alpha and scale will be broadcasted to match x's shape if
      necessary. Must be a tensorflow tensor or numpy array of floats.
    alpha: The shape parameter of the NLL (\alpha in the paper). Smaller
      values produce a distribution with heavier tails, so outliers "cost"
      less and inliers "cost" more; larger values do the opposite. Alpha can
      be any non-negative value, but the gradient of the NLL with respect to
      alpha has singularities at 0 and 2, so you may want to limit usage to
      (0, 2) during gradient descent. Must be a tensorflow tensor or numpy
      array of floats. Varying alpha in that range allows for smooth
      interpolation between a Cauchy distribution (alpha = 0) and a Normal
      distribution (alpha = 2), similar to a Student's t-distribution.
    scale: The scale parameter of the loss. When |x| < scale, the NLL is like
      that of a (possibly unnormalized) normal distribution, and when
      |x| > scale the NLL takes on a different shape according to alpha. Must
      be a tensorflow tensor or numpy array of floats.

  Returns:
    The NLLs for each element of x, in the same shape as x. This is returned
    as a TensorFlow graph node of floats with the same precision as x.
  """
  # `scale` and `alpha` must have the same type as `x`.
  tf.debugging.assert_type(scale, x.dtype)
  tf.debugging.assert_type(alpha, x.dtype)
  assert_ops = [
      # `scale` must be > 0.
      tf.Assert(tf.reduce_all(scale > 0.), [scale]),
      # `alpha` must be >= 0.
      tf.Assert(tf.reduce_all(alpha >= 0.), [alpha]),
  ]
  with tf.control_dependencies(assert_ops):
    loss = general.lossfun(x, alpha, scale, approximate=False)
    log_partition = (
        tf.math.log(scale) + self.log_base_partition_function(alpha))
    nll = loss + log_partition
    return nll
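# A minimal usage sketch (illustrative; everything other than nllfun() itself
# is an assumption): computing per-residual NLLs for a batch of residuals
# under a fixed shape and scale.
#
#   dist = Distribution()  # Hypothetical: whatever class defines nllfun().
#   x = tf.constant([-2.5, 0., 0.3, 4.], tf.float32)
#   alpha = tf.constant(1., tf.float32)  # Must be >= 0 and match x's dtype.
#   scale = tf.constant(0.5, tf.float32)  # Must be > 0 and match x's dtype.
#   nll = dist.nllfun(x, alpha, scale)  # Same shape and dtype as x.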
def _loss_is_scale_invariant(self, float_dtype):
  # Check that loss(mult * x, alpha, mult * scale) == loss(x, alpha, scale)
  (num_samples, loss, x, alpha, scale, _, _,
   _) = self._precompute_lossfun_inputs(float_dtype)
  with self.session() as sess:
    # Random log-normally distributed scalings in ~(0.2, 20)
    mult = float_dtype(
        np.maximum(0.2, np.exp(np.random.normal(size=num_samples))))

    # Compute the scaled loss.
    x_ph = tf.placeholder(x.dtype, num_samples)
    alpha_ph = tf.placeholder(alpha.dtype, num_samples)
    scale_ph = tf.placeholder(scale.dtype, num_samples)
    lossfun_ph = general.lossfun(x_ph, alpha_ph, scale_ph)
    loss_scaled = sess.run(lossfun_ph, {
        x_ph: mult * x,
        scale_ph: mult * scale,
        alpha_ph: alpha
    })
    self.assertAllClose(loss, loss_scaled, atol=1e-4, rtol=1e-4)
def _loss_and_gradients_are_finite(self, float_dtype):
  # Test that the loss and its approximation both give finite losses and
  # derivatives everywhere that they should for a wide range of values.
  for approximate in [False, True]:
    with self.session() as sess:
      num_samples = 100000

      # Normally distributed inputs.
      x = float_dtype(np.random.normal(size=num_samples))

      # Uniformly distributed values in (-16, 3), quantized to the nearest
      # 0.1 to ensure that we hit the special cases at 0, 2.
      alpha = float_dtype(
          np.round(np.random.uniform(-16, 3, num_samples) * 10) / 10.)

      # Random log-normally distributed values in approx (1e-5, 100000):
      scale = float_dtype(
          np.exp(np.random.normal(size=num_samples) * 4.) + 1e-5)

      # Compute the loss and its derivative with respect to all three inputs.
      x_ph = tf.placeholder(x.dtype, num_samples)
      alpha_ph = tf.placeholder(alpha.dtype, num_samples)
      scale_ph = tf.placeholder(scale.dtype, num_samples)
      lossfun_ph = general.lossfun(
          x_ph, alpha_ph, scale_ph, approximate=approximate)
      loss, (d_x, d_alpha, d_scale) = sess.run(
          (lossfun_ph,
           tf.gradients(tf.reduce_sum(lossfun_ph),
                        (x_ph, alpha_ph, scale_ph))), {
                            x_ph: x,
                            scale_ph: scale,
                            alpha_ph: alpha
                        })

      for v in [loss, d_x, d_alpha, d_scale]:
        self.assertTrue(np.all(np.isfinite(v)))
def _gradient_matches_finite_differences(self, float_dtype):
  # Test that the loss and its approximation both return gradients that are
  # close to the numerical gradient from finite differences, with forward
  # differencing. Returning correct gradients is TensorFlow's job, so this is
  # just an aggressive sanity check in case some implementation detail causes
  # gradients to incorrectly go to zero due to quantization or stop_gradients
  # in some op that is used by the loss.
  for approximate in [False, True]:
    with self.session() as sess:
      num_samples = 100000

      # Normally distributed inputs.
      x = float_dtype(np.random.normal(size=num_samples))

      # Uniformly distributed values in (-16, 3), quantized to the nearest
      # 0.1 and then shifted by 0.05 so that we avoid the special cases at
      # 0 and 2 where the analytical gradient won't match finite differences.
      alpha = float_dtype(
          np.round(np.random.uniform(-16, 3, num_samples) * 10) / 10. + 0.05)

      # Random uniformly distributed values in [0.5, 1.5].
      scale = float_dtype(np.random.uniform(0.5, 1.5, num_samples))

      # Compute the loss and its derivative with respect to all three inputs.
      x_ph = tf.placeholder(x.dtype, num_samples)
      alpha_ph = tf.placeholder(alpha.dtype, num_samples)
      scale_ph = tf.placeholder(scale.dtype, num_samples)
      lossfun_ph = general.lossfun(
          x_ph, alpha_ph, scale_ph, approximate=approximate)
      loss, (d_x, d_alpha, d_scale) = sess.run(
          (lossfun_ph,
           tf.gradients(tf.reduce_sum(lossfun_ph),
                        (x_ph, alpha_ph, scale_ph))), {
                            x_ph: x,
                            alpha_ph: alpha,
                            scale_ph: scale
                        })

      step_size = float_dtype(1e-3)

      # Assert that the 95th percentile of errors is <= 1e-2.
      def assert_percentile_close(v1, v2):
        self.assertLessEqual(np.percentile(np.abs(v1 - v2), 95), 1e-2)

      n_x = (sess.run(lossfun_ph, {
          x_ph: x + step_size,
          alpha_ph: alpha,
          scale_ph: scale
      }) - loss) / step_size
      assert_percentile_close(n_x, d_x)

      n_alpha = (sess.run(lossfun_ph, {
          x_ph: x,
          alpha_ph: alpha + step_size,
          scale_ph: scale
      }) - loss) / step_size
      assert_percentile_close(n_alpha, d_alpha)

      n_scale = (sess.run(lossfun_ph, {
          x_ph: x,
          alpha_ph: alpha,
          scale_ph: scale + step_size
      }) - loss) / step_size
      assert_percentile_close(n_scale, d_scale)