Example No. 1
 def testInvalidAxis(self):
   matrix = [[0., 1.], [2., 3.]]
   for axis_ in [], [1, 2, 3], [[1]], [[1], [2]], [3.1415], [1, 1]:
     error_prefix = ("'axis' must be None, an integer, or a tuple of 2 unique "
                     "integers")
     with self.assertRaisesRegexp(ValueError, error_prefix):
       linalg_ops.norm(matrix, axis=axis_)
Example No. 2
  def testShapesValues(self):
    def circular_pad(input_, width, kernel_size):
      """Padding input_ for computing circular convolution.

      Args:
        input_: the input tensor
        width: the width of the tensor.
        kernel_size: the kernel size of the filter.

      Returns:
        a tensor whose width is (width + kernel_size - 1).
      """

      beginning = kernel_size // 2
      end = kernel_size - 1 - beginning

      tmp_up = array_ops.slice(input_, [0, width - beginning, 0, 0, 0],
                               [-1, beginning, -1, -1, -1])
      tmp_down = array_ops.slice(input_, [0, 0, 0, 0, 0],
                                 [-1, end, -1, -1, -1])
      tmp = array_ops.concat([tmp_up, input_, tmp_down], 1)

      tmp_left = array_ops.slice(tmp, [0, 0, width - beginning, 0, 0],
                                 [-1, -1, beginning, -1, -1])
      tmp_right = array_ops.slice(tmp, [0, 0, 0, 0, 0],
                                  [-1, -1, end, -1, -1])
      tmp = array_ops.concat([tmp_left, tmp, tmp_right], 2)

      tmp_front = array_ops.slice(tmp, [0, 0, 0, width - beginning, 0],
                                  [-1, -1, -1, beginning, -1])
      tmp_back = array_ops.slice(tmp, [0, 0, 0, 0, 0], [-1, -1, -1, end, -1])
      return array_ops.concat([tmp_front, tmp, tmp_back], 3)

    cout = 32
    shape = [1, 7, 7, 7, 16]
    outputs_shape = shape[0:-1] + [cout]
    dtype = dtypes.float32
    tol = 1e-3
    gain = 3.14
    # Check orthogonality/isometry by computing the ratio between
    # the 2-norms of the inputs and outputs.
    for kernel_size in [[1, 1, 1], [2, 2, 2], [3, 3, 3]]:
      convolution = convolutional.conv3d
      inputs = random_ops.random_normal(shape, dtype=dtype)
      inputs_2norm = linalg_ops.norm(inputs)
      input_with_circular_pad = circular_pad(inputs, shape[1], kernel_size[0])
      outputs = convolution(
          input_with_circular_pad, padding="valid", filters=cout,
          kernel_size=kernel_size[0], use_bias=False,
          kernel_initializer=init_ops.convolutional_orthogonal_3d(gain=gain))
      outputs_2norm = linalg_ops.norm(outputs)
      ratio = outputs_2norm / inputs_2norm
      my_ops = variables.global_variables_initializer()
      with self.test_session(use_gpu=True) as sess:
        sess.run(my_ops)
        # Check the shape of the outputs
        t = outputs.eval()
        self.assertAllEqual(t.shape, outputs_shape)
        # Check isometry of the orthogonal kernel.
        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
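The circular_pad helper above realizes wraparound padding with slices and concats; the same effect can be sketched in NumPy with modular indexing. A minimal sketch along one axis (circular_pad_1axis is a hypothetical name, not part of the test):

import numpy as np

def circular_pad_1axis(x, axis, kernel_size):
  """Pad one axis circularly so a 'valid' conv matches circular convolution."""
  width = x.shape[axis]
  beginning = kernel_size // 2
  end = kernel_size - 1 - beginning
  idx = np.arange(-beginning, width + end) % width  # wraparound indices
  return np.take(x, idx, axis=axis)

x = np.arange(5.0)
print(circular_pad_1axis(x, axis=0, kernel_size=3))  # [4. 0. 1. 2. 3. 4. 0.]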
Example No. 3
 def testShapesValues(self):
   gain = 3.14
   for dtype in [dtypes.float32]:
     for kernel_size in [[3], [8], [3, 5], [2, 4], [3, 3, 3], [2, 2, 2]]:
       tol = 1e-2
        # Check orthogonality by computing the ratio between
        # the 2-norms of the inputs and outputs.
       if len(kernel_size) == 1:
         shape = [4, 32, 64]
         convolution = convolutional.conv1d
       elif len(kernel_size) == 2:
         convolution = convolutional.conv2d
         shape = [4, 32, 32, 64]
       else:
         shape = [4, 16, 16, 16, 64]
         convolution = convolutional.conv3d
       inputs = random_ops.random_normal(shape, dtype=dtype)
       inputs_2norm = linalg_ops.norm(inputs)
       outputs = convolution(
           inputs, padding="same", filters=128,
           kernel_size=kernel_size, use_bias=False,
           kernel_initializer=init_ops.convolutional_delta_orthogonal(
               gain=gain))
       outputs_shape = shape[0:-1] + [128]
       outputs_2norm = linalg_ops.norm(outputs)
       ratio = outputs_2norm / inputs_2norm
       my_ops = variables.global_variables_initializer()
       with self.test_session(use_gpu=True) as sess:
         sess.run(my_ops)
         # Check the shape of the outputs
         t = outputs.eval()
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the delta-orthogonal kernel.
         self.assertAllClose(sess.run(ratio), np.sqrt(gain),
                             rtol=tol, atol=tol)
Example No. 4
 def _CompareNorm(self, matrix):
   np_norm = np.linalg.norm(matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
   with self.cached_session(use_gpu=True) as sess:
     if use_static_shape_:
       tf_matrix = constant_op.constant(matrix)
       tf_norm = linalg_ops.norm(
           tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
       tf_norm_val = self.evaluate(tf_norm)
     else:
       tf_matrix = array_ops.placeholder(dtype_)
       tf_norm = linalg_ops.norm(
           tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
       tf_norm_val = sess.run(tf_norm, feed_dict={tf_matrix: matrix})
   self.assertAllClose(np_norm, tf_norm_val, rtol=1e-5, atol=1e-5)
Example No. 5
 def compute_lr(self, grad, var):
   scaled_lr = self._learning_rate
   if self._skip_list is None or not any(v in var.name
                                         for v in self._skip_list):
     w_norm = linalg_ops.norm(var, ord=2)
     g_norm = linalg_ops.norm(grad, ord=2)
     trust_ratio = array_ops.where(
         math_ops.greater(w_norm, 0),
         array_ops.where(
             math_ops.greater(g_norm, 0),
             (self._eeta * w_norm /
              (g_norm + self._weight_decay * w_norm + self._epsilon)), 1.0),
         1.0)
     scaled_lr = self._learning_rate * trust_ratio
   return scaled_lr
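compute_lr implements the LARS-style trust ratio: the base learning rate is scaled by eeta * ||w|| / (||g|| + weight_decay * ||w|| + epsilon) whenever both norms are positive, and left unscaled otherwise. A minimal NumPy sketch of that rule (the constants are illustrative assumptions):

import numpy as np

def lars_scaled_lr(w, g, lr=0.1, eeta=0.001, weight_decay=1e-4, eps=1e-9):
  """Scale lr by the LARS trust ratio; fall back to 1.0 on zero norms."""
  w_norm = np.linalg.norm(w)
  g_norm = np.linalg.norm(g)
  if w_norm > 0 and g_norm > 0:
    trust_ratio = eeta * w_norm / (g_norm + weight_decay * w_norm + eps)
  else:
    trust_ratio = 1.0
  return lr * trust_ratio

print(lars_scaled_lr(np.ones(10), 0.01 * np.ones(10)))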
Example No. 6
 def testTransform(self):
   # This tests all combinations of:
   #   - ids rank 0, 1, >1
   #   - params sharded/unsharded
   # It always applies max_norm.
   np.random.seed(8)
   l2_norm = 2.
   with self.test_session():
     # Param values are in [l2_norm, l2_norm+1) so it will always clip.
     params = np.random.rand(6, 3) + l2_norm
     params_norm = l2_norm * params / np.sqrt(
         np.sum(params * params, axis=1, keepdims=True))
     # Compute the norm of each embedding. This will change the embedding
     # rank to 0.
     params_norm = np.linalg.norm(params_norm, axis=1)
     transform = lambda x: linalg_ops.norm(x, axis=1)
     for ids_shape in (), (3), (4, 3), (2, 3, 4):
       # Test ids rank 0, 1, 2, 3.
       ids = np.random.randint(
           params.shape[0], size=np.prod(ids_shape,
                                         dtype=np.int64)).reshape(ids_shape)
       # Compare nonsharded to gather.
       simple = embedding_ops._embedding_lookup_and_transform(
           params, ids, max_norm=l2_norm, transform_fn=transform).eval()
       self.assertAllClose(simple, array_ops.gather(params_norm, ids).eval())
       # Run a few different sharded versions.
       for procs in 1, 2, 3:
         stride = procs * math_ops.range(params.shape[0] // procs)
         split_params = [
             array_ops.gather(params, stride + p) for p in xrange(procs)
         ]
         sharded = embedding_ops._embedding_lookup_and_transform(
             split_params, ids, max_norm=l2_norm,
             transform_fn=transform).eval()
         self.assertAllEqual(simple, sharded)
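The expected values in this test follow from max_norm clipping: any embedding row whose L2 norm exceeds l2_norm is rescaled onto the l2_norm sphere before the transform runs. A NumPy sketch of that clipping (clip_rows_to_max_norm is a hypothetical helper):

import numpy as np

def clip_rows_to_max_norm(params, max_norm):
  """Rescale any row whose L2 norm exceeds max_norm down to max_norm."""
  norms = np.linalg.norm(params, axis=1, keepdims=True)
  scale = np.minimum(1.0, max_norm / norms)
  return params * scale

params = np.random.rand(6, 3) + 2.0   # every row norm is > 2, so all rows clip
clipped = clip_rows_to_max_norm(params, 2.0)
print(np.linalg.norm(clipped, axis=1))  # ~[2. 2. 2. 2. 2. 2.]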
Example No. 7
    def body(i, prev_c, prev_h, actions, log_probs):
      # pylint: disable=g-long-lambda
      signal = control_flow_ops.cond(
          math_ops.equal(i, 0),
          lambda: array_ops.tile(device_go_embedding,
                                 [self.hparams.num_children, 1]),
          lambda: embedding_ops.embedding_lookup(device_embeddings,
                                                 actions.read(i - 1))
      )
      if self.hparams.keep_prob is not None:
        signal = nn_ops.dropout(signal, self.hparams.keep_prob)
      next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)
      query = math_ops.matmul(next_h, attn_w_2)
      query = array_ops.reshape(
          query, [self.hparams.num_children, 1, self.hparams.hidden_size])
      query = math_ops.tanh(query + attn_mem)
      query = array_ops.reshape(query, [
          self.hparams.num_children * self.num_groups, self.hparams.hidden_size
      ])
      query = math_ops.matmul(query, attn_v)
      query = array_ops.reshape(query,
                                [self.hparams.num_children, self.num_groups])
      query = nn_ops.softmax(query)
      query = array_ops.reshape(query,
                                [self.hparams.num_children, self.num_groups, 1])
      query = math_ops.reduce_sum(attn_mem * query, axis=1)
      query = array_ops.concat([next_h, query], axis=1)
      logits = math_ops.matmul(query, device_softmax)
      logits /= self.hparams.temperature
      if self.hparams.tanh_constant > 0:
        logits = math_ops.tanh(logits) * self.hparams.tanh_constant
      if self.hparams.logits_std_noise > 0:
        num_in_logits = math_ops.cast(
            array_ops.size(logits), dtype=dtypes.float32)
        avg_norm = math_ops.divide(
            linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
        logits_noise = random_ops.random_normal(
            array_ops.shape(logits),
            stddev=self.hparams.logits_std_noise * avg_norm)
        logits = control_flow_ops.cond(
            self.global_step > self.hparams.stop_noise_step, lambda: logits,
            lambda: logits + logits_noise)

      if mode == "sample":
        next_y = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
      elif mode == "greedy":
        next_y = math_ops.argmax(logits, 1)
      elif mode == "target":
        next_y = array_ops.slice(y, [0, i], [-1, 1])
      else:
        raise NotImplementedError
      next_y = math_ops.to_int32(next_y)
      next_y = array_ops.reshape(next_y, [self.hparams.num_children])
      actions = actions.write(i, next_y)
      log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=next_y)
      return i + 1, next_c, next_h, actions, log_probs
Example No. 8
  def _verifySolve(self,
                   x,
                   y,
                   dtype,
                   use_placeholder,
                   fast,
                   l2_regularizer,
                   batch_shape=()):
    if not fast and l2_regularizer != 0:
      # The slow path does not support regularization.
      return
    maxdim = np.max(x.shape)
    if dtype == np.float32 or dtype == np.complex64:
      tol = maxdim * 5e-4
    else:
      tol = maxdim * 5e-7
    a = x.astype(dtype)
    b = y.astype(dtype)
    if dtype in [np.complex64, np.complex128]:
      a.imag = a.real
      b.imag = b.real
    # numpy.linalg.lstsq does not support batching, so we just solve a single
    # system and replicate the solution (and residual norm).
    np_ans = _SolveWithNumpy(x, y, l2_regularizer=l2_regularizer)
    np_r = np.dot(np.conj(a.T), b - np.dot(a, np_ans))
    np_r_norm = np.sqrt(np.sum(np.conj(np_r) * np_r))
    if batch_shape != ():
      a = np.tile(a, batch_shape + (1, 1))
      b = np.tile(b, batch_shape + (1, 1))
      np_ans = np.tile(np_ans, batch_shape + (1, 1))
      np_r_norm = np.tile(np_r_norm, batch_shape)
    with self.cached_session(use_gpu=fast) as sess:
      if use_placeholder:
        a_ph = array_ops.placeholder(dtypes.as_dtype(dtype))
        b_ph = array_ops.placeholder(dtypes.as_dtype(dtype))
        feed_dict = {a_ph: a, b_ph: b}
        tf_ans = linalg_ops.matrix_solve_ls(
            a_ph, b_ph, fast=fast, l2_regularizer=l2_regularizer)
      else:
        tf_ans = linalg_ops.matrix_solve_ls(
            a, b, fast=fast, l2_regularizer=l2_regularizer)
        feed_dict = {}
        self.assertEqual(np_ans.shape, tf_ans.get_shape())
      if l2_regularizer == 0:
        # The least squares solution should satisfy A^H * (b - A*x) = 0.
        tf_r = b - math_ops.matmul(a, tf_ans)
        tf_r = math_ops.matmul(a, tf_r, adjoint_a=True)
        tf_r_norm = linalg_ops.norm(tf_r, ord="fro", axis=[-2, -1])
        tf_ans_val, tf_r_norm_val = sess.run(
            [tf_ans, tf_r_norm], feed_dict=feed_dict)
        self.assertAllClose(np_r_norm, tf_r_norm_val, atol=tol, rtol=tol)
      else:
        tf_ans_val = sess.run(tf_ans, feed_dict=feed_dict)

    self.assertEqual(np_ans.shape, tf_ans_val.shape)
    self.assertAllClose(np_ans, tf_ans_val, atol=2 * tol, rtol=2 * tol)
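The unregularized branch of this test relies on the normal equations: at a least-squares optimum, A^H (b - A x) = 0. A quick NumPy check of that identity:

import numpy as np

a = np.random.default_rng(0).normal(size=(6, 3))
b = np.random.default_rng(1).normal(size=(6, 1))
x, *_ = np.linalg.lstsq(a, b, rcond=None)
residual = a.conj().T @ (b - a @ x)            # normal-equations residual
print(np.allclose(residual, 0.0, atol=1e-10))  # True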
Example No. 9
def mean_only_frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model from activations.

  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

                                |m - m_w|^2

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  the Frechet distance is biased. It is more biased for small sample sizes
  (e.g. even if the two distributions are the same, for a small sample size
  the expected Frechet distance is large). It is important to use the same
  sample size to compute the Frechet classifier distance when comparing two
  generative models.

  In this variant, we only compute the difference between the means of the
  fitted Gaussians. The computation leads to O(n) vs. O(n^2) memory usage, yet
  still retains much of the same information as FID.

  Args:
    real_activations: 2D array of activations of real images of size
      [num_images, num_dims] to use to compute Frechet Inception distance.
    generated_activations: 2D array of activations of generated images of size
      [num_images, num_dims] to use to compute Frechet Inception distance.

  Returns:
    The mean-only Frechet Inception distance. A floating-point scalar of the
    same type as the output of the activations.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  activations_dtype = real_activations.dtype
  if activations_dtype != dtypes.float64:
    real_activations = math_ops.to_double(real_activations)
    generated_activations = math_ops.to_double(generated_activations)

  # Compute means of activations.
  m = math_ops.reduce_mean(real_activations, 0)
  m_w = math_ops.reduce_mean(generated_activations, 0)

  # Next the distance between means.
  mean = math_ops.square(linalg_ops.norm(m - m_w))  # This uses the L2 norm.
  mofid = mean
  if activations_dtype != dtypes.float64:
    mofid = math_ops.cast(mofid, activations_dtype)

  return mofid
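Because only the means enter, the whole computation collapses to a squared Euclidean distance between per-feature mean vectors. A NumPy sketch of the same quantity:

import numpy as np

def mean_only_fid(real_acts, gen_acts):
  """|m - m_w|^2 over [num_images, num_dims] activation arrays."""
  m = real_acts.mean(axis=0)
  m_w = gen_acts.mean(axis=0)
  return float(np.sum((m - m_w) ** 2))

rng = np.random.default_rng(0)
print(mean_only_fid(rng.normal(size=(128, 16)), rng.normal(size=(128, 16))))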
Example No. 10
 def Test(self):
   np.random.seed(1)
   n = shape_[-1]
   batch_shape = shape_[:-2]
   np_dtype = dtype_.as_numpy_dtype
   a = np.random.uniform(
       low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
   if dtype_.is_complex:
     a += 1j * np.random.uniform(
         low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
   a += np.conj(a.T)
   a = np.tile(a, batch_shape + (1, 1))
   # Optimal stepsize for central difference is O(epsilon^{1/3}).
   epsilon = np.finfo(np_dtype).eps
   delta = 0.1 * epsilon**(1.0 / 3.0)
   # tolerance obtained by looking at actual differences using
   # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build
   if dtype_ in (dtypes_lib.float32, dtypes_lib.complex64):
     tol = 1e-2
   else:
     tol = 1e-7
   with self.test_session():
     tf_a = constant_op.constant(a)
     if compute_v_:
       tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a)
       # (complex) Eigenvectors are only unique up to an arbitrary phase
       # We normalize the vectors such that the first component has phase 0.
       reference = tf_v / linalg_ops.norm(
           tf_v[..., 0:1, :], axis=-1, keep_dims=True)
       tf_v *= math_ops.conj(reference)
       outputs = [tf_e, tf_v]
     else:
       tf_e = linalg_ops.self_adjoint_eigvals(tf_a)
       outputs = [tf_e,]
     for b in outputs:
       x_init = np.random.uniform(
           low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
       if dtype_.is_complex:
         x_init += 1j * np.random.uniform(
             low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
       x_init += np.conj(x_init.T)
       x_init = np.tile(x_init, batch_shape + (1, 1))
       theoretical, numerical = gradient_checker.compute_gradient(
           tf_a,
           tf_a.get_shape().as_list(),
           b,
           b.get_shape().as_list(),
           x_init_value=x_init,
           delta=delta)
       self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
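The phase normalization above exists because eigenvectors of a Hermitian matrix are unique only up to a complex phase. One way to pin the phase down is to rotate each vector so its first component is real; a NumPy sketch of that idea (not the exact normalization used in the test):

import numpy as np

rng = np.random.default_rng(1)
a = rng.normal(size=(4, 4)) + 1j * rng.normal(size=(4, 4))
a += np.conj(a.T)                       # make the matrix Hermitian
e, v = np.linalg.eigh(a)
phase = v[0:1, :] / np.abs(v[0:1, :])   # unit phase of each first component
v = v * np.conj(phase)                  # rotate the phase away, per vector
print(np.allclose(v[0, :].imag, 0.0))   # True: first components are now real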
Example No. 11
  def make_grouping_predictions(self, input_layer, reuse=None):
    """model that predicts grouping (grouping_actions).

    Args:
      input_layer: group_input_layer
      reuse: reuse

    Returns:
       grouping_actions: actions
       grouping_log_probs: log probabilities corresponding to actions
    """
    with variable_scope.variable_scope(self.hparams.name, reuse=True):
      # input_layer: tensor of size [1, num_ops, hidden_size]
      w_grouping_ff = variable_scope.get_variable("w_grouping_ff")
      w_grouping_softmax = variable_scope.get_variable("w_grouping_softmax")

    batch_size = array_ops.shape(input_layer)[0]
    embedding_dim = array_ops.shape(input_layer)[2]

    reshaped = array_ops.reshape(input_layer,
                                 [batch_size * self.num_ops, embedding_dim])
    ff_output = math_ops.matmul(reshaped, w_grouping_ff)
    logits = math_ops.matmul(ff_output, w_grouping_softmax)
    if self.hparams.logits_std_noise > 0:
      num_in_logits = math_ops.cast(
          array_ops.size(logits), dtype=dtypes.float32)
      avg_norm = math_ops.divide(
          linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
      logits_noise = random_ops.random_normal(
          array_ops.shape(logits),
          stddev=self.hparams.logits_std_noise * avg_norm)
      logits = control_flow_ops.cond(
          self.global_step > self.hparams.stop_noise_step, lambda: logits,
          lambda: logits + logits_noise)
    logits = array_ops.reshape(logits,
                               [batch_size * self.num_ops, self.num_groups])
    actions = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
    actions = math_ops.to_int32(actions)
    actions = array_ops.reshape(actions, [batch_size, self.num_ops])
    action_label = array_ops.reshape(actions, [-1])
    log_probs = nn_ops.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=action_label)
    log_probs = array_ops.reshape(log_probs, [batch_size, -1])
    log_probs = math_ops.reduce_sum(log_probs, 1)
    grouping_actions = actions
    grouping_log_probs = log_probs
    return grouping_actions, grouping_log_probs
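The logits-noise block divides the norm of the logits by the square root of their element count to get an RMS magnitude, then adds Gaussian noise at a fraction of that scale. A NumPy sketch (add_scaled_logit_noise is a hypothetical helper; values are illustrative):

import numpy as np

def add_scaled_logit_noise(logits, std_fraction, seed=0):
  """Add Gaussian noise whose stddev is a fraction of the RMS logit size."""
  rng = np.random.default_rng(seed)
  avg_norm = np.linalg.norm(logits) / np.sqrt(logits.size)  # RMS magnitude
  noise = rng.normal(scale=std_fraction * avg_norm, size=logits.shape)
  return logits + noise

logits = np.array([[2.0, -1.0, 0.5], [0.0, 1.5, -2.5]])
print(add_scaled_logit_noise(logits, std_fraction=0.1))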
Example No. 12
 def testTransform(self):
   # This tests all combinations of:
   #   - ids rank 0, 1, >1
   #   - params sharded/unsharded
   # It always applies max_norm.
   np.random.seed(8)
   l2_norm = 2.
   with self.cached_session():
     # Param values are in [l2_norm, l2_norm+1) so it will always clip.
     params = np.random.rand(6, 3) + l2_norm
     params_norm = l2_norm * params / np.sqrt(
         np.sum(params * params, axis=1, keepdims=True))
     # Compute the norm of each embedding. This will change the embedding
     # rank to 0.
     params_norm = np.linalg.norm(params_norm, axis=1)
     transform = lambda x: linalg_ops.norm(x, axis=1)
     for ids_shape in (), (3), (4, 3), (2, 3, 4):
       # Test ids rank 0, 1, 2, 3.
       ids = np.random.randint(
           params.shape[0], size=np.prod(ids_shape,
                                         dtype=np.int64)).reshape(ids_shape)
       # Compare nonsharded to gather.
       simple = embedding_ops._embedding_lookup_and_transform(
           params, ids, max_norm=l2_norm, transform_fn=transform).eval()
       self.assertAllClose(simple, array_ops.gather(params_norm, ids).eval())
       # Run a few different sharded versions.
       for procs in 1, 2, 3:
         stride = procs * math_ops.range(params.shape[0] // procs)
         split_params = [
             array_ops.gather(params, stride + p) for p in xrange(procs)
         ]
         sharded = embedding_ops._embedding_lookup_and_transform(
             split_params, ids, max_norm=l2_norm,
             transform_fn=transform).eval()
         # assertAllClose is used here as different implementations of sqrt may
         # be used to compute each of the values being compared.  For example,
         # on AVX512 builds the embedding operation makes use of Eigen's fast
         # vectorized square root algorithm for doubles.  These different
         # implementations of sqrt are not guaranteed to produce exactly the
         # same results. Therefore, an exact comparison cannot be made.
         self.assertAllClose(simple, sharded)
Example No. 13
  def testBadOrder(self):
    matrix = [[0., 1.], [2., 3.]]
    for ord_ in "fro", -7, -1.1, 0:
      with self.assertRaisesRegexp(ValueError,
                                   "'ord' must be a supported vector norm"):
        linalg_ops.norm(matrix, ord=ord_)

    for ord_ in "fro", -7, -1.1, 0:
      with self.assertRaisesRegexp(ValueError,
                                   "'ord' must be a supported vector norm"):
        linalg_ops.norm(matrix, ord=ord_, axis=-1)

    for ord_ in "foo", -7, -1.1, 1.1:
      with self.assertRaisesRegexp(ValueError,
                                   "'ord' must be a supported matrix norm"):
        linalg_ops.norm(matrix, ord=ord_, axis=[-2, -1])
Example No. 14
  def operator_and_matrix(
      self, build_info, dtype, use_placeholder,
      ensure_self_adjoint_and_pd=False):
    shape = list(build_info.shape)
    reflection_axis = linear_operator_test_util.random_sign_uniform(
        shape[:-1], minval=1., maxval=2., dtype=dtype)
    # Make sure unit norm.
    reflection_axis = reflection_axis / linalg_ops.norm(
        reflection_axis, axis=-1, keepdims=True)

    lin_op_reflection_axis = reflection_axis

    if use_placeholder:
      lin_op_reflection_axis = array_ops.placeholder_with_default(
          reflection_axis, shape=None)

    operator = householder.LinearOperatorHouseholder(lin_op_reflection_axis)

    mat = reflection_axis[..., array_ops.newaxis]
    matrix = -2 * math_ops.matmul(mat, mat, adjoint_b=True)
    matrix = array_ops.matrix_set_diag(
        matrix, 1. + array_ops.matrix_diag_part(matrix))

    return operator, matrix
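The matrix assembled here is the Householder reflection I - 2 v v^H for a unit vector v, which is both orthogonal and its own inverse. A small NumPy check of those properties:

import numpy as np

v = np.random.default_rng(0).normal(size=4)
v /= np.linalg.norm(v)                  # unit reflection axis
h = np.eye(4) - 2.0 * np.outer(v, v)    # Householder matrix
print(np.allclose(h @ h, np.eye(4)))    # involution: H @ H = I
print(np.allclose(h @ h.T, np.eye(4)))  # orthogonal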
Example No. 15
 def squared_frobenius_norm(x):
   """Helper to make KL calculation slightly more readable."""
   # http://mathworld.wolfram.com/FrobeniusNorm.html
   return math_ops.square(linalg_ops.norm(x, ord="fro", axis=[-2, -1]))
Example No. 16
def frechet_classifier_distance(real_images,
                                generated_images,
                                classifier_fn,
                                num_batches=1):
    """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  the Frechet distance is biased. It is more biased for small sample sizes
  (e.g. even if the two distributions are the same, for a small sample size
  the expected Frechet distance is large). It is important to use the same
  sample size to compute the Frechet classifier distance when comparing two
  generative models.

  Args:
    real_images: Real images to use to compute Frechet Inception distance.
    generated_images: Generated images to use to compute Frechet Inception
      distance.
    classifier_fn: A function that takes images and produces activations
      based on a classifier.
    num_batches: Number of batches to split the images into in order to
      efficiently run them through the classifier network.

  Returns:
    The Frechet Inception distance. A floating-point scalar.
  """

    real_images_list = array_ops.split(real_images,
                                       num_or_size_splits=num_batches)
    generated_images_list = array_ops.split(generated_images,
                                            num_or_size_splits=num_batches)

    imgs = array_ops.stack(real_images_list + generated_images_list)

    # Compute the activations using the memory-efficient `map_fn`.
    activations = functional_ops.map_fn(fn=classifier_fn,
                                        elems=imgs,
                                        parallel_iterations=1,
                                        back_prop=False,
                                        swap_memory=True,
                                        name='RunClassifier')

    # Split the activations by the real and generated images.
    real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)

    # Ensure the activations have the right shapes.
    real_a = array_ops.concat(array_ops.unstack(real_a), 0)
    gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
    real_a.shape.assert_has_rank(2)
    gen_a.shape.assert_has_rank(2)

    # Compute mean and covariance matrices of activations.
    m = math_ops.reduce_mean(real_a, 0)
    m_v = math_ops.reduce_mean(gen_a, 0)
    num_examples = math_ops.to_float(array_ops.shape(real_a)[0])

    # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
    sigma = math_ops.matmul(real_a - m, real_a - m,
                            transpose_a=True) / (num_examples - 1)

    sigma_v = math_ops.matmul(gen_a - m_v, gen_a - m_v,
                              transpose_a=True) / (num_examples - 1)

    # Find the Tr(sqrt(sigma sigma_v)) component of FID
    sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)

    # Compute the two components of FID.

    # First the covariance component.
    # Here, note that trace(A + B) = trace(A) + trace(B)
    trace = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component

    # Next the distance between means.
    mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
    fid = trace + mean

    return fid
Example No. 17
def monotone_linear_layer(input_tensor,
                          input_dim,
                          output_dim,
                          is_monotone=None,
                          add_bias=True,
                          normalization_order=None,
                          init_weight_mean=2.0,
                          init_weight_stddev=0.5,
                          init_bias=None,
                          l1_reg=None,
                          l2_reg=None):
  """Creates a partially monotonic linear embedding layer.

  Returns an output of partially monotonic linear embedding layer, weights in
  the linear embedding layer, projection ops and regularizers.

    output = input * weight' + bias

  and the kth row is constrained to be non-negative, if is_monotone[k] == True.
  weight is initialized entrywise from a Normal(init_weight_mean,
  init_weight_stddev) random variable. If init_bias is not provided, the
  initial bias is set to -1/2 * init_weight_mean * input_dim. This offset is
  used to make the initial mean 0, assuming each input tensor is drawn from
  the uniform distribution on [0, 1]:
    E[output] = E[input * weight' + bias] = E[input] * E[weight] + bias
      = 1/2 * init_weight_mean * input_dim + bias
      = 0.

  Args:
    input_tensor: [batch_size, input_dim] tensor.
    input_dim: (int) input dimension.
    output_dim: (int) output dimension.
    is_monotone:  A list of input_dim booleans, a single boolean, or None. If
      None or False, linear layer will not have monotonicity constraints. If
      True, all of inputs are set to be monotonic. In the case of boolean list,
      input_tensor[:, k] is set to be monotonic if is_monotone[k] == True.
    add_bias: (bool) If a bias term should be added.
    normalization_order: If specified, the returned projection will normalize
      the weight vector across each output dimension to have norm 1. The norm
      order can be 1, 2 or np.inf. Norm is lower bounded by 1e-12.
    init_weight_mean: (float) A mean for Normal random weight initializer.
    init_weight_stddev: (float) A standard deviation for Normal random weight
      initializer.
    init_bias: (float) initial bias. If not provided, -1/2 * init_weight_mean *
      input_dim is used.
    l1_reg: (float) amount of l1 regularization.
    l2_reg: (float) amount of l2 regularization.

  Returns:
    A tuple of:
    * output tensor of shape [batch_size, output_dim]
    * weight tensor of shape [output_dim, input_dim]
    * None or projection ops, that must be applied at each
      step (or every so many steps) to project the model to a feasible space:
      used for bounding the outputs or for imposing monotonicity.
    * None or a regularization loss, if regularization is configured.

  Raises:
    ValueError: If is_monotone is not None, but its length != input_dim.
  """
  with variable_scope.variable_scope('monotone_linear'):
    # We use [output_dim, input_dim] convention to use broadcasting in
    # projection.
    init_weights = random_ops.random_normal(
        [output_dim, input_dim],
        mean=init_weight_mean,
        stddev=init_weight_stddev)
    if init_bias is None:
      init_biases = [-init_weight_mean * 0.5 * input_dim] * output_dim
    else:
      init_biases = [init_bias] * output_dim

    w = variable_scope.get_variable(
        name='weight', initializer=init_weights, dtype=input_tensor.dtype)
    output_tensor = math_ops.matmul(input_tensor, w, transpose_b=True)
    if add_bias:
      b = variable_scope.get_variable(
          name='bias', initializer=init_biases, dtype=input_tensor.dtype)
      output_tensor = output_tensor + b

    # Constructing a projection op.
    projection = None
    if is_monotone or normalization_order:
      with ops.name_scope('monotonic_projection'):
        diff = None
        if is_monotone:
          if isinstance(is_monotone, list):
            # is_monotone is given as a list. We should only apply positivity
            # constraints to a masked version of the weights.
            if input_dim != len(is_monotone):
              raise ValueError('input_dim (%d) != is_monotone length (%d)' %
                               (input_dim, len(is_monotone)))
            # Construct a multiplicative mask for monotonic dimension
            # selection.
            monotone_mask = array_ops.constant(
                [1.0 if monotone else 0.0 for monotone in is_monotone],
                dtype=w.dtype)
            # Since input_dim is the last dimension of the weight, we can use
            # broadcasting.
            masked_w = math_ops.multiply(w, monotone_mask)
          else:
            # is_monotone is set to True.
            masked_w = w

          projected_w = math_ops.maximum(masked_w, 0.0)
          diff = projected_w - masked_w

        if normalization_order:
          unnormalized_w = w if diff is None else w + diff
          normalized_w = unnormalized_w / math_ops.maximum(
              linalg_ops.norm(
                  unnormalized_w,
                  ord=normalization_order,
                  axis=1,
                  keepdims=True), 1e-12)
          diff = normalized_w - w

        projection = w.assign_add(diff)

    # Constructing a regularization op.
    regularizer = None
    if l1_reg is not None or l2_reg is not None:
      with ops.name_scope('linear_regularization'):
        regularizer = regularizers.linear_regularization(w, l1_reg, l2_reg)

    return (output_tensor, w, projection, regularizer)
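The projection op combines two steps: clip the masked weights at zero for monotone inputs, then optionally renormalize each output row, flooring the norm at 1e-12. A NumPy sketch of the combined projection (project_weights is a hypothetical helper):

import numpy as np

def project_weights(w, is_monotone, normalization_order=None):
  """Project [output_dim, input_dim] weights onto the feasible set."""
  mask = np.asarray(is_monotone, dtype=w.dtype)   # 1.0 where monotone
  diff = np.maximum(w * mask, 0.0) - w * mask     # positivity correction
  w = w + diff
  if normalization_order is not None:
    norms = np.linalg.norm(w, ord=normalization_order, axis=1, keepdims=True)
    w = w / np.maximum(norms, 1e-12)
  return w

w = np.array([[-0.5, 2.0], [1.0, -3.0]])
print(project_weights(w, is_monotone=[True, False], normalization_order=2))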
Example No. 18
def frechet_classifier_distance(real_images,
                                generated_images,
                                classifier_fn,
                                num_batches=1):
  """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  the Frechet distance is biased. It is more biased for small sample sizes
  (e.g. even if the two distributions are the same, for a small sample size
  the expected Frechet distance is large). It is important to use the same
  sample size to compute the Frechet classifier distance when comparing two
  generative models.

  Args:
    real_images: Real images to use to compute Frechet Inception distance.
    generated_images: Generated images to use to compute Frechet Inception
      distance.
    classifier_fn: A function that takes images and produces activations
      based on a classifier.
    num_batches: Number of batches to split the images into in order to
      efficiently run them through the classifier network.

  Returns:
    The Frechet Inception distance. A floating-point scalar.
  """

  real_images_list = array_ops.split(
      real_images, num_or_size_splits=num_batches)
  generated_images_list = array_ops.split(
      generated_images, num_or_size_splits=num_batches)

  imgs = array_ops.stack(real_images_list + generated_images_list)

  # Compute the activations using the memory-efficient `map_fn`.
  activations = functional_ops.map_fn(
      fn=classifier_fn,
      elems=imgs,
      parallel_iterations=1,
      back_prop=False,
      swap_memory=True,
      name='RunClassifier')

  # Split the activations by the real and generated images.
  real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)

  # Ensure the activations have the right shapes.
  real_a = array_ops.concat(array_ops.unstack(real_a), 0)
  gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
  real_a.shape.assert_has_rank(2)
  gen_a.shape.assert_has_rank(2)

  # Compute mean and covariance matrices of activations.
  m = math_ops.reduce_mean(real_a, 0)
  m_v = math_ops.reduce_mean(gen_a, 0)
  num_examples = math_ops.to_float(array_ops.shape(real_a)[0])

  # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
  sigma = math_ops.matmul(
      real_a - m, real_a - m, transpose_a=True) / (num_examples - 1)

  sigma_v = math_ops.matmul(
      gen_a - m_v, gen_a - m_v, transpose_a=True) / (num_examples - 1)

  # Find the Tr(sqrt(sigma sigma_v)) component of FID
  sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B)
  trace = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component

  # Next the distance between means.
  mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
  fid = trace + mean

  return fid
Example No. 19
def diagonal_only_frechet_classifier_distance_from_activations(
        real_activations, generated_activations):
    """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

          |m - m_w|^2 + (sigma + sigma_w - 2(sigma x sigma_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images. In this variant, we compute diagonal-only covariance matrices.
  As a result, instead of computing an expensive matrix square root, we can do
  something much simpler that has O(n) vs. O(n^2) space complexity.

  Note that when computed using sample means and sample covariance matrices,
  the Frechet distance is biased. It is more biased for small sample sizes
  (e.g. even if the two distributions are the same, for a small sample size
  the expected Frechet distance is large). It is important to use the same
  sample size to compute the Frechet classifier distance when comparing two
  generative models.

  Args:
    real_activations: 2D Tensor of activations of real images to use to
      compute the Frechet Inception distance.
    generated_activations: 2D Tensor of activations of generated images to
      use to compute the Frechet Inception distance.

  Returns:
    The diagonal-only Frechet Inception distance. A floating-point scalar of
    the same type as the output of the activations.

  Raises:
    ValueError: If the shape of the variance and mean vectors are not equal.
  """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    activations_dtype = real_activations.dtype
    if activations_dtype != dtypes.float64:
        real_activations = math_ops.to_double(real_activations)
        generated_activations = math_ops.to_double(generated_activations)

    # Compute mean and covariance matrices of activations.
    m, var = nn_impl.moments(real_activations, axes=[0])
    m_w, var_w = nn_impl.moments(generated_activations, axes=[0])

    actual_shape = var.get_shape()
    expected_shape = m.get_shape()

    if actual_shape != expected_shape:
        raise ValueError('shape: {} must match expected shape: {}'.format(
            actual_shape, expected_shape))

    # Compute the two components of FID.

    # First the covariance component.
    # Here, note that trace(A + B) = trace(A) + trace(B)
    trace = math_ops.reduce_sum((var + var_w) - 2.0 *
                                math_ops.sqrt(math_ops.multiply(var, var_w)))

    # Next the distance between means.
    mean = math_ops.square(linalg_ops.norm(m - m_w))  # This uses the L2 norm.
    dofid = trace + mean
    if activations_dtype != dtypes.float64:
        dofid = math_ops.cast(dofid, activations_dtype)

    return dofid
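With diagonal covariances the trace term becomes elementwise, so the whole distance needs only per-feature means and variances. A NumPy sketch:

import numpy as np

def diagonal_only_fid(real_acts, gen_acts):
  """FID with diagonal covariances: O(n) memory in the feature count."""
  m, var = real_acts.mean(axis=0), real_acts.var(axis=0)
  m_w, var_w = gen_acts.mean(axis=0), gen_acts.var(axis=0)
  trace = np.sum(var + var_w - 2.0 * np.sqrt(var * var_w))
  return float(trace + np.sum((m - m_w) ** 2))

rng = np.random.default_rng(0)
print(diagonal_only_fid(rng.normal(size=(256, 8)), rng.normal(size=(256, 8))))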
Example No. 20
def process_quadrature_grid_and_probs(quadrature_grid_and_probs,
                                      dtype,
                                      validate_args,
                                      name=None):
    """Validates quadrature grid, probs or computes them as necessary.

  Args:
    quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s
      representing the sample points and the corresponding (possibly
      normalized) weight.  When `None`, defaults to:
      `np.polynomial.hermite.hermgauss(deg=8)`.
    dtype: The expected `dtype` of `grid` and `probs`.
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
    name: Python `str` name prefixed to Ops created by this class.

  Returns:
     quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s
      representing the sample points and the corresponding (possibly
      normalized) weight.

  Raises:
    ValueError: if `quadrature_grid_and_probs is not None` and
      `len(quadrature_grid_and_probs[0]) != len(quadrature_grid_and_probs[1])`
  """
    with ops.name_scope(name, "process_quadrature_grid_and_probs",
                        [quadrature_grid_and_probs]):
        if quadrature_grid_and_probs is None:
            grid, probs = np.polynomial.hermite.hermgauss(deg=8)
            grid = grid.astype(dtype.as_numpy_dtype)
            probs = probs.astype(dtype.as_numpy_dtype)
            probs /= np.linalg.norm(probs, ord=1, keepdims=True)
            grid = ops.convert_to_tensor(grid, name="grid", dtype=dtype)
            probs = ops.convert_to_tensor(probs, name="probs", dtype=dtype)
            return grid, probs

        grid, probs = tuple(quadrature_grid_and_probs)
        grid = ops.convert_to_tensor(grid, name="grid", dtype=dtype)
        probs = ops.convert_to_tensor(probs,
                                      name="unnormalized_probs",
                                      dtype=dtype)
        probs /= linalg_ops.norm(probs,
                                 ord=1,
                                 axis=-1,
                                 keep_dims=True,
                                 name="probs")

        def _static_dim_size(x, axis):
            """Returns the static size of a specific dimension or `None`."""
            return x.shape.with_rank_at_least(axis + 1)[axis].value

        m, n = _static_dim_size(probs, axis=0), _static_dim_size(grid, axis=0)
        if m is not None and n is not None:
            if m != n:
                raise ValueError(
                    "`quadrature_grid_and_probs` must be a `tuple` of "
                    "same-length zero-th-dimension `Tensor`s "
                    "(saw lengths {}, {})".format(m, n))
        elif validate_args:
            grid = control_flow_ops.with_dependencies([
                check_ops.assert_equal(
                    dimension_size(probs, axis=0),
                    dimension_size(grid, axis=0),
                    message=(
                        "`quadrature_grid_and_probs` must be a `tuple` of "
                        "same-length zero-th-dimension `Tensor`s")),
            ], grid)

        return grid, probs
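The default path builds the degree-8 Gauss-Hermite rule and L1-normalizes the weights so they sum to one. The same two lines in plain NumPy:

import numpy as np

grid, probs = np.polynomial.hermite.hermgauss(deg=8)
probs = probs / np.linalg.norm(probs, ord=1)  # L1-normalize: weights sum to 1
print(grid.shape, float(probs.sum()))         # (8,) 1.0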
Example No. 21
 def is_in_ball(x, radius, center):
   return math_ops.cast(
       linalg_ops.norm(x - center, axis=-1) <= radius, dtype=x.dtype)
Example No. 22
 def _show_norm(tensor):
   tensor = math_ops.cast(tensor, dtypes.float64)
   output_tensor = linalg_ops.norm(tensor)
   return _print_tensor(tensor_name, -1, tensor, output_tensor)
Example No. 23
def frechet_classifier_distance(real_images,
                                generated_images,
                                classifier_fn,
                                num_batches=1):
  """Classifier distance for evaluating a conditional generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Args:
    real_images: Real images to use to compute Frechet Inception distance.
    generated_images: Generated images to use to compute Frechet Inception
      distance.
    classifier_fn: A function that takes images and produces activations
      based on a classifier.
    num_batches: Number of batches to split the images into in order to
      efficiently run them through the classifier network.

  Returns:
    The Frechet Inception distance. A floating-point scalar.
  """

  real_images_list = array_ops.split(
      real_images, num_or_size_splits=num_batches)
  generated_images_list = array_ops.split(
      generated_images, num_or_size_splits=num_batches)

  imgs = array_ops.stack(real_images_list + generated_images_list)

  # Compute the activations using the memory-efficient `map_fn`.
  activations = functional_ops.map_fn(
      fn=classifier_fn,
      elems=imgs,
      parallel_iterations=1,
      back_prop=False,
      swap_memory=True,
      name='RunClassifier')

  # Split the activations by the real and generated images.
  real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)

  # Ensure the activations have the right shapes.
  real_a = array_ops.concat(array_ops.unstack(real_a), 0)
  gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
  real_a.shape.assert_has_rank(2)
  gen_a.shape.assert_has_rank(2)

  # Compute mean and covariance matrices of activations.
  m = math_ops.reduce_mean(real_a, 0)
  m_v = math_ops.reduce_mean(gen_a, 0)
  dim = math_ops.to_float(array_ops.shape(m)[0])
  sigma = math_ops.matmul(real_a - m, real_a - m, transpose_b=True) / dim
  sigma_v = math_ops.matmul(gen_a - m_v, gen_a - m_v, transpose_b=True) / dim

  # Take matrix square root of the product of covariance matrices.
  sqcc = _matrix_square_root(math_ops.matmul(sigma, sigma_v))

  # Compute the two components of FID.
  trace = math_ops.trace(sigma + sigma_v - 2.0 * sqcc)
  mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
  fid = trace + mean

  return fid
Example No. 24
def frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  the Frechet distance is biased. It is more biased for small sample sizes
  (e.g. even if the two distributions are the same, for a small sample size
  the expected Frechet distance is large). It is important to use the same
  sample size to compute the Frechet classifier distance when comparing two
  generative models.

  Args:
    real_activations: 2D Tensor of activations of real images to use to
      compute the Frechet Inception distance.
    generated_activations: 2D Tensor of activations of generated images to
      use to compute the Frechet Inception distance.

  Returns:
    The Frechet Inception distance. A floating-point scalar of the same type
    as the output of the activations.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  activations_dtype = real_activations.dtype
  if activations_dtype != dtypes.float64:
    real_activations = math_ops.to_double(real_activations)
    generated_activations = math_ops.to_double(generated_activations)

  # Compute mean and covariance matrices of activations.
  m = math_ops.reduce_mean(real_activations, 0)
  m_v = math_ops.reduce_mean(generated_activations, 0)
  num_examples = math_ops.to_double(array_ops.shape(real_activations)[0])

  # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
  real_centered = real_activations - m
  sigma = math_ops.matmul(
      real_centered, real_centered, transpose_a=True) / (num_examples - 1)

  gen_centered = generated_activations - m_v
  sigma_v = math_ops.matmul(
      gen_centered, gen_centered, transpose_a=True) / (num_examples - 1)

  # Find the Tr(sqrt(sigma sigma_v)) component of FID
  sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B)
  trace = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component

  # Next the distance between means.
  mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
  fid = trace + mean
  if activations_dtype != dtypes.float64:
    fid = math_ops.cast(fid, activations_dtype)

  return fid
Example No. 25
def frechet_classifier_distance_from_activations(real_activations,
                                                 generated_activations):
    """Classifier distance for evaluating a generative model.

  This method computes the Frechet classifier distance from activations of
  real images and generated images. This can be used independently of the
  frechet_classifier_distance() method, especially in the case of using large
  batches during evaluation, where we would like to precompute all of the
  activations before computing the classifier distance.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

                |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  the Frechet distance is biased. It is more biased for small sample sizes
  (e.g. even if the two distributions are the same, for a small sample size
  the expected Frechet distance is large). It is important to use the same
  sample size to compute the Frechet classifier distance when comparing two
  generative models.

  Args:
    real_activations: 2D Tensor containing activations of real data. Shape is
      [batch_size, activation_size].
    generated_activations: 2D Tensor containing activations of generated data.
      Shape is [batch_size, activation_size].

  Returns:
   The Frechet Inception distance. A floating-point scalar of the same type
   as the output of the activations.

  """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    activations_dtype = real_activations.dtype
    if activations_dtype != dtypes.float64:
        real_activations = math_ops.to_double(real_activations)
        generated_activations = math_ops.to_double(generated_activations)

    # Compute mean and covariance matrices of activations.
    m = math_ops.reduce_mean(real_activations, 0)
    m_w = math_ops.reduce_mean(generated_activations, 0)
    num_examples = math_ops.to_double(array_ops.shape(real_activations)[0])

    # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
    real_centered = real_activations - m
    sigma = math_ops.matmul(real_centered, real_centered,
                            transpose_a=True) / (num_examples - 1)

    gen_centered = generated_activations - m_w
    sigma_w = math_ops.matmul(gen_centered, gen_centered,
                              transpose_a=True) / (num_examples - 1)

    # Find the Tr(sqrt(sigma sigma_w)) component of FID
    sqrt_trace_component = trace_sqrt_product(sigma, sigma_w)

    # Compute the two components of FID.

    # First the covariance component.
    # Here, note that trace(A + B) = trace(A) + trace(B)
    trace = math_ops.trace(sigma + sigma_w) - 2.0 * sqrt_trace_component

    # Next the distance between means.
    mean = math_ops.square(linalg_ops.norm(m - m_w))  # This uses the L2 norm.
    fid = trace + mean
    if activations_dtype != dtypes.float64:
        fid = math_ops.cast(fid, activations_dtype)

    return fid
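For reference, the same formula can be evaluated eagerly with NumPy and SciPy, using scipy.linalg.sqrtm in place of trace_sqrt_product (a mathematical stand-in, not the implementation above):

import numpy as np
import scipy.linalg

def fid_from_activations(real_acts, gen_acts):
  """|m - m_w|^2 + Tr(C + C_w - 2 (C C_w)^(1/2)) with sample covariances."""
  m, m_w = real_acts.mean(axis=0), gen_acts.mean(axis=0)
  c = np.cov(real_acts, rowvar=False)    # (n - 1)-normalized, as above
  c_w = np.cov(gen_acts, rowvar=False)
  covmean = scipy.linalg.sqrtm(c @ c_w).real
  return float(np.sum((m - m_w) ** 2) + np.trace(c + c_w - 2.0 * covmean))

rng = np.random.default_rng(0)
print(fid_from_activations(rng.normal(size=(512, 4)), rng.normal(size=(512, 4))))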
Example No. 26
 def _init_norm(self, weights):
     """Set the norm of the weight vector"""
     from tensorflow.python.ops.linalg_ops import norm
     with name_scope('init_norm'):
         flat = array_ops.reshape(weights, [-1, self.layer_depth])
         return array_ops.reshape(norm(flat, axis=0), (self.layer_depth, ))
Example No. 27
 def _show_norm(tensor):
   tensor = math_ops.cast(tensor, dtypes.float32)
   output_tensor = linalg_ops.norm(tensor)
   # The shape has to be 1. Set it if it does not have the information.
   output_tensor = array_ops.reshape(output_tensor, [1])
   return output_tensor
Example No. 28
def conjugate_gradient(operator,
                       rhs,
                       preconditioner=None,
                       x=None,
                       tol=1e-4,
                       max_iter=20,
                       name="conjugate_gradient"):
    r"""Conjugate gradient solver.

  Solves a linear system of equations `A*x = rhs` for selfadjoint, positive
  definite matrix `A` and right-hand side vector `rhs`, using an iterative,
  matrix-free algorithm where the action of the matrix A is represented by
  `operator`. The iteration terminates when either the number of iterations
  exceeds `max_iter` or when the residual norm has been reduced to `tol`
  times its initial value, i.e. \\(||rhs - A x_k|| <= tol ||rhs||\\).

  Args:
    operator: An object representing a linear operator with attributes:
      - shape: Either a list of integers or a 1-D `Tensor` of type `int32` of
        length 2. `shape[0]` is the dimension of the domain of the operator,
        `shape[1]` is the dimension of the co-domain of the operator. In other
        words, if operator represents an N x N matrix A, `shape` must contain
        `[N, N]`.
      - dtype: The datatype of input to and output from `apply`.
      - apply: Callable object taking a vector `x` as input and returning a
        vector with the result of applying the operator to `x`, i.e. if
       `operator` represents matrix `A`, `apply` should return `A * x`.
    rhs: A rank-1 `Tensor` of shape `[N]` containing the right-hand side vector.
    preconditioner: An object representing a linear operator, see `operator`
      for detail. The preconditioner should approximate the inverse of `A`.
      An efficient preconditioner could dramatically improve the rate of
      convergence. If `preconditioner` represents matrix `M` (`M` approximates
      `A^{-1}`), the algorithm uses `preconditioner.apply(x)` to estimate
      `A^{-1}x`. For this to be useful, the cost of applying `M` should be
      much lower than computing `A^{-1}` directly.
    x: A rank-1 `Tensor` of shape `[N]` containing the initial guess for the
      solution.
    tol: A float scalar convergence tolerance.
    max_iter: An integer giving the maximum number of iterations.
    name: A name scope for the operation.

  Returns:
    output: A namedtuple representing the final state with fields:
      - i: A scalar `int32` `Tensor`. Number of iterations executed.
      - x: A rank-1 `Tensor` of shape `[N]` containing the computed solution.
      - r: A rank-1 `Tensor` of shape `[M]` containing the residual vector.
      - p: A rank-1 `Tensor` of shape `[N]`. `A`-conjugate basis vector.
      - gamma: \\(r \dot M \dot r\\), equivalent to  \\(||r||_2^2\\) when
        `preconditioner=None`.
  """
  # ephemeral class holding CG state.
  cg_state = collections.namedtuple("CGState", ["i", "x", "r", "p", "gamma"])

  def stopping_criterion(i, state):
    return math_ops.logical_and(i < max_iter, linalg_ops.norm(state.r) > tol)

  def cg_step(i, state):  # pylint: disable=missing-docstring
    z = operator.apply(state.p)
    alpha = state.gamma / util.dot(state.p, z)
    x = state.x + alpha * state.p
    r = state.r - alpha * z
    if preconditioner is None:
      gamma = util.dot(r, r)
      beta = gamma / state.gamma
      p = r + beta * state.p
    else:
      q = preconditioner.apply(r)
      gamma = util.dot(r, q)
      beta = gamma / state.gamma
      p = q + beta * state.p
    return i + 1, cg_state(i + 1, x, r, p, gamma)

  with ops.name_scope(name):
    n = operator.shape[1:]
    rhs = array_ops.expand_dims(rhs, -1)
    if x is None:
      x = array_ops.expand_dims(
          array_ops.zeros(n, dtype=rhs.dtype.base_dtype), -1)
      r0 = rhs
    else:
      x = array_ops.expand_dims(x, -1)
      r0 = rhs - operator.apply(x)
    if preconditioner is None:
      p0 = r0
    else:
      p0 = preconditioner.apply(r0)
    gamma0 = util.dot(r0, p0)
    tol *= linalg_ops.norm(r0)
    i = constant_op.constant(0, dtype=dtypes.int32)
    state = cg_state(i=i, x=x, r=r0, p=p0, gamma=gamma0)
    _, state = control_flow_ops.while_loop(stopping_criterion, cg_step,
                                           [i, state])
    return cg_state(state.i,
                    x=array_ops.squeeze(state.x),
                    r=array_ops.squeeze(state.r),
                    p=array_ops.squeeze(state.p),
                    gamma=state.gamma)
Exemplo n.º 29
0
def frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model from activations.

  This method computes the Frechet classifier distance from activations of
  real images and generated images. It can be used independently of the
  frechet_classifier_distance() method, especially when using large batches
  during evaluation, where we would like to precompute all of the activations
  before computing the classifier distance.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Args:
    real_activations: 2D Tensor containing activations of real data. Shape is
      [batch_size, activation_size].
    generated_activations: 2D Tensor containing activations of generated data.
      Shape is [batch_size, activation_size].

  Returns:
   The Frechet Inception distance. A floating-point scalar of the same type
   as the input activations.

  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  activations_dtype = real_activations.dtype
  if activations_dtype != dtypes.float64:
    real_activations = math_ops.to_double(real_activations)
    generated_activations = math_ops.to_double(generated_activations)

  # Compute mean and covariance matrices of activations.
  m = math_ops.reduce_mean(real_activations, 0)
  m_v = math_ops.reduce_mean(generated_activations, 0)
  num_examples_real = math_ops.to_double(array_ops.shape(real_activations)[0])
  num_examples_generated = math_ops.to_double(
      array_ops.shape(generated_activations)[0])

  # sigma = (1 / (n - 1)) * (X - mu)^T (X - mu)
  real_centered = real_activations - m
  sigma = math_ops.matmul(
      real_centered, real_centered, transpose_a=True) / (
          num_examples_real - 1)

  gen_centered = generated_activations - m_v
  sigma_v = math_ops.matmul(
      gen_centered, gen_centered, transpose_a=True) / (
          num_examples_generated - 1)

  # Find the Tr(sqrt(sigma sigma_v)) component of FID
  sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B)
  trace = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component

  # Next the distance between means.
  mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
  fid = trace + mean
  if activations_dtype != dtypes.float64:
    fid = math_ops.cast(fid, activations_dtype)

  return fid
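
As a quick sanity check, the FID formula in the docstring above can be
evaluated directly in NumPy/SciPy outside the graph. This is a minimal
sketch, not library code; fid_numpy, real_acts and gen_acts are hypothetical
names for small activation arrays of shape [batch_size, activation_size].

import numpy as np
from scipy import linalg

def fid_numpy(real_acts, gen_acts):
  # Means and (n - 1)-normalized covariances, matching the graph code above.
  m, m_w = real_acts.mean(axis=0), gen_acts.mean(axis=0)
  c = np.cov(real_acts, rowvar=False)
  c_w = np.cov(gen_acts, rowvar=False)
  # sqrtm may return tiny imaginary parts for near-singular products.
  covmean = linalg.sqrtm(c.dot(c_w)).real
  return np.sum((m - m_w) ** 2) + np.trace(c + c_w - 2.0 * covmean)
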
Exemplo n.º 30
0
def conjugate_gradient(operator,
                       rhs,
                       preconditioner=None,
                       x=None,
                       tol=1e-4,
                       max_iter=20,
                       name="conjugate_gradient"):
  r"""Conjugate gradient solver.

  Solves a linear system of equations `A*x = rhs` for self-adjoint, positive
  definite matrix `A` and right-hand side vector `rhs`, using an iterative,
  matrix-free algorithm where the action of the matrix A is represented by
  `operator`. The iteration terminates when either the number of iterations
  exceeds `max_iter` or when the residual norm has been reduced to `tol`
  times its initial value, i.e. \\(||rhs - A x_k|| <= tol ||rhs||\\).

  Args:
    operator: An object representing a linear operator with attributes:
      - shape: Either a list of integers or a 1-D `Tensor` of type `int32` of
        length 2. `shape[0]` is the dimension of the domain of the operator,
        `shape[1]` is the dimension of the co-domain of the operator. In other
        words, if `operator` represents an N x N matrix `A`, `shape` must
        contain `[N, N]`.
      - dtype: The datatype of input to and output from `apply`.
      - apply: Callable object taking a vector `x` as input and returning a
        vector with the result of applying the operator to `x`, i.e. if
        `operator` represents matrix `A`, `apply` should return `A * x`.
    rhs: A rank-1 `Tensor` of shape `[N]` containing the right-hand side vector.
    preconditioner: An object representing a linear operator, see `operator`
      for detail. The preconditioner should approximate the inverse of `A`.
      An efficient preconditioner could dramatically improve the rate of
      convergence. If `preconditioner` represents matrix `M` (`M` approximates
      `A^{-1}`), the algorithm uses `preconditioner.apply(x)` to estimate
      `A^{-1}x`. For this to be useful, the cost of applying `M` should be
      much lower than computing `A^{-1}` directly.
    x: A rank-1 `Tensor` of shape `[N]` containing the initial guess for the
      solution.
    tol: A float scalar convergence tolerance.
    max_iter: An integer giving the maximum number of iterations.
    name: A name scope for the operation.

  Returns:
    output: A namedtuple representing the final state with fields:
      - i: A scalar `int32` `Tensor`. Number of iterations executed.
      - x: A rank-1 `Tensor` of shape `[N]` containing the computed solution.
      - r: A rank-1 `Tensor` of shape `[N]` containing the residual vector.
      - p: A rank-1 `Tensor` of shape `[N]`. `A`-conjugate basis vector.
      - gamma: \\(r^T M r\\), equivalent to \\(||r||_2^2\\) when
        `preconditioner=None`.
  """
  # ephemeral class holding CG state.
  cg_state = collections.namedtuple("CGState", ["i", "x", "r", "p", "gamma"])

  def stopping_criterion(i, state):
    return math_ops.logical_and(i < max_iter, linalg_ops.norm(state.r) > tol)

  def cg_step(i, state):  # pylint: disable=missing-docstring
    z = operator.apply(state.p)
    alpha = state.gamma / util.dot(state.p, z)
    x = state.x + alpha * state.p
    r = state.r - alpha * z
    if preconditioner is None:
      gamma = util.dot(r, r)
      beta = gamma / state.gamma
      p = r + beta * state.p
    else:
      q = preconditioner.apply(r)
      gamma = util.dot(r, q)
      beta = gamma / state.gamma
      p = q + beta * state.p
    return i + 1, cg_state(i + 1, x, r, p, gamma)

  with ops.name_scope(name):
    n = operator.shape[1:]
    rhs = array_ops.expand_dims(rhs, -1)
    if x is None:
      x = array_ops.expand_dims(
          array_ops.zeros(n, dtype=rhs.dtype.base_dtype), -1)
      r0 = rhs
    else:
      x = array_ops.expand_dims(x, -1)
      r0 = rhs - operator.apply(x)
    if preconditioner is None:
      p0 = r0
    else:
      p0 = preconditioner.apply(r0)
    gamma0 = util.dot(r0, p0)
    tol *= linalg_ops.norm(r0)
    i = constant_op.constant(0, dtype=dtypes.int32)
    state = cg_state(i=i, x=x, r=r0, p=p0, gamma=gamma0)
    _, state = control_flow_ops.while_loop(stopping_criterion, cg_step,
                                           [i, state])
    return cg_state(
        state.i,
        x=array_ops.squeeze(state.x),
        r=array_ops.squeeze(state.r),
        p=array_ops.squeeze(state.p),
        gamma=state.gamma)
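
A minimal usage sketch for conjugate_gradient, assuming the same module
imports as the function above; SimpleOperator is a hypothetical adapter, not
a library class, and the matrix values are made up.

import collections

SimpleOperator = collections.namedtuple("SimpleOperator",
                                        ["shape", "dtype", "apply"])

a = constant_op.constant([[4.0, 1.0], [1.0, 3.0]])  # symmetric positive definite
operator = SimpleOperator(shape=[2, 2], dtype=dtypes.float32,
                          apply=lambda v: math_ops.matmul(a, v))
rhs = constant_op.constant([1.0, 2.0])
result = conjugate_gradient(operator, rhs, tol=1e-6, max_iter=50)
# result.x approximates A^{-1} * rhs; result.i is the iteration count.
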
Exemplo n.º 31
0
def stopping_criterion(i, state):
  return math_ops.logical_and(i < max_iter, linalg_ops.norm(state.r) > tol)
Exemplo n.º 32
0
def stopping_criterion(i, state):
    return math_ops.logical_and(i < max_iter,
                                linalg_ops.norm(state.r) > tol)
Exemplo n.º 33
0
def diagonal_only_frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and diagonal
  covariances sigma and sigma_w, this function calculates

          |m - m_w|^2 + sum(sigma + sigma_w - 2 * sqrt(sigma * sigma_w))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images. In this variant, we compute diagonal-only covariance matrices.
  As a result, instead of computing an expensive matrix square root, we can
  take an elementwise square root, which has O(n) rather than O(n^2) space
  complexity.

  Note that when computed using sample means and sample covariance matrices,
  the Frechet distance is biased, and more so for small sample sizes (e.g.
  even if the two distributions are identical, the expected Frechet distance
  for a small sample is large). It is therefore important to use the same
  sample size when comparing two generative models.

  Args:
    real_activations: 2D Tensor containing activations of real data. Shape is
      [batch_size, activation_size].
    generated_activations: 2D Tensor containing activations of generated data.
      Shape is [batch_size, activation_size].

  Returns:
    The diagonal-only Frechet Inception distance. A floating-point scalar of
    the same type as the output of the activations.

  Raises:
    ValueError: If the shape of the variance and mean vectors are not equal.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  activations_dtype = real_activations.dtype
  if activations_dtype != dtypes.float64:
    real_activations = math_ops.to_double(real_activations)
    generated_activations = math_ops.to_double(generated_activations)

  # Compute means and (diagonal) variances of the activations.
  m, var = nn_impl.moments(real_activations, axes=[0])
  m_w, var_w = nn_impl.moments(generated_activations, axes=[0])

  actual_shape = var.get_shape()
  expected_shape = m.get_shape()

  if actual_shape != expected_shape:
    raise ValueError('shape: {} must match expected shape: {}'.format(
        actual_shape, expected_shape))

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B)
  trace = math_ops.reduce_sum(
      (var + var_w) - 2.0 * math_ops.sqrt(math_ops.multiply(var, var_w)))

  # Next the distance between means.
  mean = math_ops.square(linalg_ops.norm(m - m_w))  # This uses the L2 norm.
  dofid = trace + mean
  if activations_dtype != dtypes.float64:
    dofid = math_ops.cast(dofid, activations_dtype)

  return dofid
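
The same diagonal-only quantity, sketched in plain NumPy for reference;
dofid_numpy, real_acts and gen_acts are hypothetical names, not part of the
library.

import numpy as np

def dofid_numpy(real_acts, gen_acts):
  # nn_impl.moments above returns population (ddof=0) variances, as np.var does.
  m, var = real_acts.mean(axis=0), real_acts.var(axis=0)
  m_w, var_w = gen_acts.mean(axis=0), gen_acts.var(axis=0)
  trace = np.sum(var + var_w - 2.0 * np.sqrt(var * var_w))
  return trace + np.sum((m - m_w) ** 2)
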
Exemplo n.º 34
0
def process_quadrature_grid_and_probs(
    quadrature_grid_and_probs, dtype, validate_args, name=None):
  """Validates quadrature grid, probs or computes them as necessary.

  Args:
    quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s
      representing the sample points and the corresponding (possibly
      normalized) weights. When `None`, defaults to
      `np.polynomial.hermite.hermgauss(deg=8)`.
    dtype: The expected `dtype` of `grid` and `probs`.
    validate_args: Python `bool`, default `False`. When `True`, distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False`, invalid inputs may silently render incorrect
      outputs.
    name: Python `str` name prefixed to Ops created by this class.

  Returns:
    quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s
      representing the sample points and the corresponding (possibly
      normalized) weights.

  Raises:
    ValueError: if `quadrature_grid_and_probs is not None` and
      `len(quadrature_grid_and_probs[0]) != len(quadrature_grid_and_probs[1])`
  """
  with ops.name_scope(name, "process_quadrature_grid_and_probs",
                      [quadrature_grid_and_probs]):
    if quadrature_grid_and_probs is None:
      grid, probs = np.polynomial.hermite.hermgauss(deg=8)
      grid = grid.astype(dtype.as_numpy_dtype)
      probs = probs.astype(dtype.as_numpy_dtype)
      probs /= np.linalg.norm(probs, ord=1, keepdims=True)
      grid = ops.convert_to_tensor(grid, name="grid", dtype=dtype)
      probs = ops.convert_to_tensor(probs, name="probs", dtype=dtype)
      return grid, probs

    grid, probs = tuple(quadrature_grid_and_probs)
    grid = ops.convert_to_tensor(grid, name="grid", dtype=dtype)
    probs = ops.convert_to_tensor(probs, name="unnormalized_probs",
                                  dtype=dtype)
    probs /= linalg_ops.norm(probs, ord=1, axis=-1, keep_dims=True,
                             name="probs")

    def _static_dim_size(x, axis):
      """Returns the static size of a specific dimension or `None`."""
      return x.shape.with_rank_at_least(axis + 1)[axis].value

    m, n = _static_dim_size(probs, axis=0), _static_dim_size(grid, axis=0)
    if m is not None and n is not None:
      if m != n:
        raise ValueError("`quadrature_grid_and_probs` must be a `tuple` of "
                         "same-length zero-th-dimension `Tensor`s "
                         "(saw lengths {}, {})".format(m, n))
    elif validate_args:
      grid = control_flow_ops.with_dependencies([
          check_ops.assert_equal(
              dimension_size(probs, axis=0),
              dimension_size(grid, axis=0),
              message=("`quadrature_grid_and_probs` must be a `tuple` of "
                       "same-length zero-th-dimension `Tensor`s")),
      ], grid)

    return grid, probs
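
For reference, a plain-NumPy sketch of what the `None` default above
produces (the same hermgauss call followed by the L1 normalization):

import numpy as np

grid, probs = np.polynomial.hermite.hermgauss(deg=8)
probs = probs / np.linalg.norm(probs, ord=1)  # weights now sum to 1
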
Exemplo n.º 35
0
  def _compute_power_iter(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name,
                          iter_count=100, epsilon=1e-6):
    """Computes mat_g^alpha, where alpha = -1/p, p a positive integer.

    We use an iterative Schur-Newton method from equation 3.2 on page 9 of:

    A Schur-Newton Method for the Matrix p-th Root and its Inverse
    by Chun-Hua Guo and Nicholas J. Higham
    SIAM Journal on Matrix Analysis and Applications,
    2006, Vol. 28, No. 3 : pp. 788-804
    https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf

    Args:
      var: the variable we are updating.
      mat_g: the symmetric PSD matrix whose power is to be computed
      mat_g_size: size of mat_g.
      alpha: exponent, must be -1/p for p a positive integer.
      mat_h_slot_name: name of slot to store the power, if needed.
      iter_count: Maximum number of iterations.
      epsilon: accuracy indicator, useful for early termination.

    Returns:
      mat_g^alpha
    """

    identity = linalg_ops.eye(math_ops.to_int32(mat_g_size))

    def MatPower(mat_m, p):
      """Computes mat_m^p, for p a positive integer.

      Power p is known at graph compile time, so no need for loop and cond.

      Args:
        mat_m: a square matrix
        p: a positive integer

      Returns:
        mat_m^p
      """
      assert p == int(p) and p > 0
      power = None
      while p > 0:
        if p % 2 == 1:
          power = math_ops.matmul(mat_m, power) if power is not None else mat_m
        p //= 2
        mat_m = math_ops.matmul(mat_m, mat_m)
      return power

    def IterCondition(i, mat_m, _):
      return math_ops.logical_and(
          i < iter_count,
          math_ops.reduce_max(math_ops.abs(mat_m - identity)) > epsilon)

    def IterBody(i, mat_m, mat_x):
      mat_m_i = (1 - alpha) * identity + alpha * mat_m
      return (i + 1, math_ops.matmul(MatPower(mat_m_i, -1.0/alpha), mat_m),
              math_ops.matmul(mat_x, mat_m_i))

    if mat_g_size == 1:
      mat_h = math_ops.pow(mat_g + self._epsilon, alpha)
    else:
      damped_mat_g = mat_g + self._epsilon * identity
      z = (1 - 1 / alpha) / (2 * linalg_ops.norm(damped_mat_g))
      # The best value for z is
      # (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
      #                 (c_max^{1-alpha} - c_min^{1-alpha})
      # where c_max and c_min are the largest and smallest singular values of
      # damped_mat_g.
      # The above estimate assumes that c_max > c_min * 2^p. (p = -1/alpha)
      # Can replace above line by the one below, but it is less accurate,
      # hence needs more iterations to converge.
      # z = (1 - 1/alpha) / math_ops.trace(damped_mat_g)
      # If we want the method to always converge, use z = 1 / norm(damped_mat_g)
      # or z = 1 / math_ops.trace(damped_mat_g), but these can result in many
      # extra iterations.
      _, _, mat_h = control_flow_ops.while_loop(
          IterCondition, IterBody,
          [0, damped_mat_g * z, identity * math_ops.pow(z, -alpha)])
    if mat_h_slot_name is not None:
      return state_ops.assign(self.get_slot(var, mat_h_slot_name), mat_h)
    return mat_h
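
A hypothetical NumPy transcription of the coupled Schur-Newton iteration
above, computing mat_g ** alpha for alpha = -1/p; mat_g is assumed symmetric
PSD, and the epsilon damping of mat_g is omitted for brevity.

import numpy as np

def inverse_pth_root(mat_g, p, iter_count=100, epsilon=1e-6):
  alpha = -1.0 / p
  identity = np.eye(mat_g.shape[0])
  # Same scaling as above: z = (1 - 1/alpha) / (2 * ||mat_g||_F).
  z = (1.0 - 1.0 / alpha) / (2.0 * np.linalg.norm(mat_g))
  mat_m = mat_g * z
  mat_x = identity * (z ** -alpha)
  for _ in range(iter_count):
    if np.max(np.abs(mat_m - identity)) <= epsilon:
      break
    mat_m_i = (1.0 - alpha) * identity + alpha * mat_m
    mat_m = np.linalg.matrix_power(mat_m_i, p).dot(mat_m)  # mat_m_i^p * mat_m
    mat_x = mat_x.dot(mat_m_i)
  return mat_x
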
Exemplo n.º 36
0
    def apply_gradients(self,
                        grads_and_vars,
                        global_step,
                        name=None,
                        manual_fp16=False):
        """See base class."""
        assignments = []
        steps = tf.cast(global_step, tf.float32)
        for (grad, param) in grads_and_vars:
            if grad is None or param is None:
                continue

            param_name = self._get_variable_name(param.name)
            has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
            if has_shadow:
                # create shadow fp32 weights for fp16 variable
                param_fp32 = tf.get_variable(name=param_name + "/shadow",
                                             dtype=tf.float32,
                                             trainable=False,
                                             initializer=tf.cast(
                                                 param.initialized_value(),
                                                 tf.float32))
            else:
                param_fp32 = param

            m = tf.get_variable(name=param_name + "/adam_m",
                                shape=param.shape.as_list(),
                                dtype=tf.float32,
                                trainable=False,
                                initializer=tf.zeros_initializer())
            v = tf.get_variable(name=param_name + "/adam_v",
                                shape=param.shape.as_list(),
                                dtype=tf.float32,
                                trainable=False,
                                initializer=tf.zeros_initializer())

            # LAMB update
            next_m = (tf.multiply(self.beta_1, m) +
                      tf.multiply(1.0 - self.beta_1, grad))
            next_v = (tf.multiply(self.beta_2, v) +
                      tf.multiply(1.0 - self.beta_2, tf.square(grad)))

            beta1_correction = (1 - self.beta_1**steps)
            beta2_correction = (1 - self.beta_2**steps)

            next_m_unbiased = next_m / beta1_correction
            next_v_unbiased = next_v / beta2_correction

            update = next_m_unbiased / (tf.sqrt(next_v_unbiased) +
                                        self.epsilon)

            # Just adding the square of the weights to the loss function is *not*
            # the correct way of using L2 regularization/weight decay with Adam,
            # since that will interact with the m and v parameters in strange ways.
            #
            # Instead we want to decay the weights in a manner that doesn't interact
            # with the m/v parameters. This is equivalent to adding the square
            # of the weights to the loss with plain (non-momentum) SGD.
            if self._do_use_weight_decay(param_name):
                update += self.weight_decay_rate * param_fp32

            w_norm = linalg_ops.norm(param, ord=2)
            g_norm = linalg_ops.norm(update, ord=2)
            ratio = array_ops.where(
                math_ops.greater(w_norm, 0),
                array_ops.where(math_ops.greater(g_norm, 0), (w_norm / g_norm),
                                1.0), 1.0)

            update_with_lr = ratio * self.learning_rate * update

            next_param = param_fp32 - update_with_lr

            if has_shadow:
                # cast shadow fp32 weights to fp16 and assign to the trainable
                # variable; keep the assign op so it runs with the others
                assignments.append(
                    param.assign(tf.cast(next_param, param.dtype.base_dtype)))
            assignments.extend([
                param_fp32.assign(next_param),
                m.assign(next_m),
                v.assign(next_v)
            ])
        return tf.group(*assignments, name=name)
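
A hypothetical NumPy sketch of one LAMB step as implemented above; the
beta/lr/epsilon names mirror the optimizer attributes, and all values are
illustrative, not library defaults.

import numpy as np

def lamb_step(param, grad, m, v, step, lr=1e-3, beta_1=0.9, beta_2=0.999,
              epsilon=1e-6, weight_decay_rate=0.01):
  m = beta_1 * m + (1.0 - beta_1) * grad
  v = beta_2 * v + (1.0 - beta_2) * grad ** 2
  update = (m / (1.0 - beta_1 ** step)) / (
      np.sqrt(v / (1.0 - beta_2 ** step)) + epsilon)
  update += weight_decay_rate * param  # decoupled weight decay
  # Per-layer trust ratio: scale the step by ||w|| / ||update|| when both
  # norms are positive, otherwise fall back to 1.0, as in apply_gradients.
  w_norm, g_norm = np.linalg.norm(param), np.linalg.norm(update)
  ratio = w_norm / g_norm if w_norm > 0 and g_norm > 0 else 1.0
  return param - ratio * lr * update, m, v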