Exemplo n.º 1
0
 def multiply_inverse(self, vector):
   """Multiply `vector` by the damped inverses of both factors.

   The vector is converted to a 2D matrix, left-multiplied by the damped
   inverse of the input factor and right-multiplied by the damped inverse of
   the output factor. When `self._renorm_coeff` is not 1.0 the product is
   additionally divided by that coefficient.

   Args:
     vector: Layer parameters in the form accepted by
       `utils.layer_params_to_mat2d`.

   Returns:
     The result converted back with `utils.mat2d_to_layer_params`, using
     `vector` as the template.
   """
   input_inv = self._input_factor.get_damped_inverse(self._input_damping)
   output_inv = self._output_factor.get_damped_inverse(self._output_damping)
   mat = utils.layer_params_to_mat2d(vector)

   # Apply the output-side inverse first (on the right), then the input-side
   # inverse (on the left).
   right_applied = math_ops.matmul(mat, output_inv)
   product = math_ops.matmul(input_inv, right_applied)

   if self._renorm_coeff != 1.0:
     # Cast so the coefficient matches the dtype of the product.
     coeff = math_ops.cast(self._renorm_coeff, dtype=product.dtype)
     product /= coeff

   return utils.mat2d_to_layer_params(vector, product)
Exemplo n.º 2
0
 def multiply(self, vector):
     """Multiply `vector` by the damped covariances of both factors.

     With M the 2D form of `vector`, A the input covariance and G the output
     covariance, this computes A * R + input_damping * R, where
     R = M * G + output_damping * M. When `self._renorm_coeff` is not 1.0
     the result is additionally multiplied by that coefficient.

     Args:
       vector: Layer parameters in the form accepted by
         `utils.layer_params_to_mat2d`.

     Returns:
       The result converted back with `utils.mat2d_to_layer_params`, using
       `vector` as the template.
     """
     input_cov = self._input_factor.get_cov()
     output_cov = self._output_factor.get_cov()
     mat = utils.layer_params_to_mat2d(vector)

     # Output side: right-multiply by the covariance and add the damped term.
     right_product = math_ops.matmul(mat, output_cov)
     right_damped = right_product + self._output_damping * mat

     # Input side: left-multiply by the covariance and add the damped term.
     left_product = math_ops.matmul(input_cov, right_damped)
     result = left_product + self._input_damping * right_damped

     if self._renorm_coeff != 1.0:
         # Cast so the coefficient matches the dtype of the result.
         coeff = math_ops.cast(self._renorm_coeff, dtype=result.dtype)
         result *= coeff

     return utils.mat2d_to_layer_params(vector, result)
Exemplo n.º 3
0
 def multiply(self, vector):
   """Compute the approximate damped Fisher-vector product.

   The 2D form of `vector` is multiplied with `*` by the factor's covariance
   plus `self._damping`.

   Args:
     vector: Tensor or 2-tuple of Tensors. Either a Tensor of shape
       [input_size, output_size] holding the layer's weights, or a 2-tuple of
       that weights Tensor and a Tensor of shape [output_size] holding the
       layer's bias. NOTE(review): presumably the 2-tuple form is the one
       used when self._has_bias is set -- confirm against the caller.

   Returns:
     Tensor (or tuple) of the same shape, corresponding to the Fisher-vector
     product.
   """
   mat = utils.layer_params_to_mat2d(vector)
   damped_cov = self._factor.get_cov() + self._damping
   return utils.mat2d_to_layer_params(vector, mat * damped_cov)
Exemplo n.º 4
0
  def multiply_inverse(self, vector):
    """Apply the inverse-multiply computation selected by `self._option`.

    `vector` is converted to a 2D matrix Z, transposed, transformed by the
    branch matching `self._option`, transposed back and converted back to
    the layout of `vector`.

    * `SeriesFBApproximation.option1` uses the quantities returned by
      `get_option1quants` of the input and output factors and rescales Z
      element-wise by `gamma(psi_G * psi_A^T)`.
    * `SeriesFBApproximation.option2` uses the quantities returned by
      `get_option2quants`, divides element-wise by `1 - mu_G * mu_A^T` (with
      exact zeros replaced by 1.0) and finally divides by
      `self._num_timesteps`.

    NOTE(review): there is no `else` branch. If `self._option` matches
    neither value, Z only goes through the two transposes and is returned
    otherwise untouched -- confirm that the option is validated elsewhere.

    Args:
      vector: Layer parameters in the form accepted by
        `utils.layer_params_to_mat2d`.

    Returns:
      The result of `utils.mat2d_to_layer_params(vector, Z)`; presumably the
      same structure as `vector`.
    """
    # pylint: disable=invalid-name

    Z = utils.layer_params_to_mat2d(vector)

    # Derivations were done for "batch_dim==1" case so we need to convert to
    # that orientation:
    Z = array_ops.transpose(Z)

    if self._option == SeriesFBApproximation.option1:

      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
      L_A, psi_A = self._input_factor.get_option1quants(self._damping_input)
      L_G, psi_G = self._output_factor.get_option1quants(self._damping_output)

      def gamma(x):
        """Return (1 - x)^2 / (T * (1 - x^2) - 2 * x * (1 - x^T))."""
        # We are assuming that each case has the same number of time-steps.
        # If this stops being the case one shouldn't simply replace this T
        # with its average value.  Instead, one needs to go back to the
        # definition of the gamma function from the paper.
        T = self._num_timesteps
        return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))

      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
      # Even though Y is Z-independent we are recomputing it from the psi's
      # each time, since Y depends on both A and G quantities and it is
      # relatively cheap to compute.
      # psi_G is reshaped to [len(psi_G), -1] so that the `*` below forms an
      # outer product via broadcasting (assumes psi_G and psi_A are 1-D --
      # TODO confirm).
      Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)

      # Z = L_G^T * Z * L_A
      # This is equivalent to the following computation from the original
      # pseudo-code:
      # Z = G0^(-1/2) * Z * A0^(-1/2)
      # Z = U_G^T * Z * U_A
      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)

      # Z = Z .* Y
      Z *= Y

      # Z = L_G * Z * L_A^T
      # This is equivalent to the following computation from the original
      # pseudo-code:
      # Z = U_G * Z * U_A^T
      # Z = G0^(-1/2) * Z * A0^(-1/2)
      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))

    elif self._option == SeriesFBApproximation.option2:

      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
      P_A, K_A, mu_A = self._input_factor.get_option2quants(self._damping_input)
      P_G, K_G, mu_G = self._output_factor.get_option2quants(
          self._damping_output)

      # Our approach differs superficially from the pseudo-code in the paper
      # in order to reduce the total number of matrix-matrix multiplies.
      # In particular, the first three computations in the pseudo code are
      # Z = G0^(-1/2) * Z * A0^(-1/2)
      # Z = Z - hPsi_G^T * Z * hPsi_A
      # Z = E_G^T * Z * E_A
      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
      # the entire computation can be written as
      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
      # This final expression is computed by the following two lines:
      # Z = Z - P_G * Z * P_A^T
      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
      # Z = K_G^T * Z * K_A
      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)

      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
      # Be careful with the outer product.  We don't want to accidentally
      # make it an inner-product instead.
      tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
      # Prevent some numerical issues by setting any 0.0 eigs to 1.0
      # (adds 1.0 exactly where tmp == 0.0, so the division below never
      # divides by an exact zero).
      tmp += 1.0 * math_ops.cast(math_ops.equal(tmp, 0.0), dtype=tmp.dtype)
      Z /= tmp

      # We now perform the transpose/reverse version of the operations
      # derived above, whose derivation from the original pseudo-code is
      # analogous.
      # Z = K_G * Z * K_A^T
      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))

      # Z = Z - P_G^T * Z * P_A
      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)

      # Z = normalize (1/E[T]) * Z
      # Note that this normalization is done because we compute the statistics
      # by averaging, not summing, over time. (And the gradient is presumably
      # summed over time, not averaged, and thus their scales are different.)
      Z /= math_ops.cast(self._num_timesteps, Z.dtype)

    # Convert back to the "batch_dim==0" orientation.
    Z = array_ops.transpose(Z)

    return utils.mat2d_to_layer_params(vector, Z)
Exemplo n.º 5
0
 def multiply(self, vector):
   """Multiply `vector` by the factor's covariance plus damping.

   The 2D form of `vector` is multiplied with `*` by
   `self._factor.get_cov() + self._damping`.

   Args:
     vector: Layer parameters in the form accepted by
       `utils.layer_params_to_mat2d`.

   Returns:
     The product converted back with `utils.mat2d_to_layer_params`, using
     `vector` as the template.
   """
   mat = utils.layer_params_to_mat2d(vector)
   damped_cov = self._factor.get_cov() + self._damping
   return utils.mat2d_to_layer_params(vector, mat * damped_cov)