Example #1
 def __init__(self,
              axis=-1,
              momentum=0.99,
              epsilon=1e-3,
              center=True,
              scale=True,
              beta_initializer=init_ops.zeros_initializer(),
              gamma_initializer=init_ops.ones_initializer(),
              moving_mean_initializer=init_ops.zeros_initializer(),
              moving_variance_initializer=init_ops.ones_initializer(),
              beta_regularizer=None,
              gamma_regularizer=None,
              trainable=True,
              name=None,
              **kwargs):
   super(BatchNormalization, self).__init__(
       name=name, trainable=trainable, **kwargs)
   self.axis = axis
   self.momentum = momentum
   self.epsilon = epsilon
   self.center = center
   self.scale = scale
   self.beta_initializer = beta_initializer
   self.gamma_initializer = gamma_initializer
   self.moving_mean_initializer = moving_mean_initializer
   self.moving_variance_initializer = moving_variance_initializer
   self.beta_regularizer = beta_regularizer
   self.gamma_regularizer = gamma_regularizer
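
In this constructor (and the variants that follow), `init_ops.ones_initializer()` only appears as the default value of `gamma_initializer` and `moving_variance_initializer`, so the scale parameter starts at 1 and the moving variance starts as a unit variance. A minimal TF 1.x sketch of overriding those defaults through the public wrapper (the tensor shape is illustrative, not taken from the example):

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 64])

# tf.layers.batch_normalization forwards these keyword arguments to a
# BatchNormalization constructor like the one above.
y = tf.layers.batch_normalization(
    x,
    gamma_initializer=tf.ones_initializer(),            # scale starts at 1
    moving_variance_initializer=tf.ones_initializer(),  # unit variance at start
    training=True)
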
Example #2
  def __init__(self,
               axis=-1,
               momentum=0.99,
               epsilon=1e-3,
               center=True,
               scale=True,
               beta_initializer=init_ops.zeros_initializer(),
               gamma_initializer=init_ops.ones_initializer(),
               moving_mean_initializer=init_ops.zeros_initializer(),
               moving_variance_initializer=init_ops.ones_initializer(),
               beta_regularizer=None,
               gamma_regularizer=None,
               beta_constraint=None,
               gamma_constraint=None,
               renorm=False,
               renorm_clipping=None,
               renorm_momentum=0.99,
               fused=None,
               trainable=True,
               virtual_batch_size=None,
               adjustment=None,
               name=None,
               **kwargs):
    super(BatchNormalization, self).__init__(
        name=name, trainable=trainable, **kwargs)
    if isinstance(axis, list):
      self.axis = axis[:]
    else:
      self.axis = axis
    self.momentum = momentum
    self.epsilon = epsilon
    self.center = center
    self.scale = scale
    self.beta_initializer = beta_initializer
    self.gamma_initializer = gamma_initializer
    self.moving_mean_initializer = moving_mean_initializer
    self.moving_variance_initializer = moving_variance_initializer
    self.beta_regularizer = beta_regularizer
    self.gamma_regularizer = gamma_regularizer
    self.beta_constraint = beta_constraint
    self.gamma_constraint = gamma_constraint
    self.renorm = renorm
    self.virtual_batch_size = virtual_batch_size
    self.adjustment = adjustment
    if fused is None:
      fused = True

    self.fused = fused
    self._bessels_correction_test_only = True

    if renorm:
      renorm_clipping = renorm_clipping or {}
      keys = ['rmax', 'rmin', 'dmax']
      if set(renorm_clipping) - set(keys):
        raise ValueError('renorm_clipping %s contains keys not in %s' %
                         (renorm_clipping, keys))
      self.renorm_clipping = renorm_clipping
      self.renorm_momentum = renorm_momentum
Example #3
  def __init__(self,
               axis=-1,
               momentum=0.99,
               epsilon=1e-3,
               center=True,
               scale=True,
               beta_initializer=init_ops.zeros_initializer(),
               gamma_initializer=init_ops.ones_initializer(),
               moving_mean_initializer=init_ops.zeros_initializer(),
               moving_variance_initializer=init_ops.ones_initializer(),
               beta_regularizer=None,
               gamma_regularizer=None,
               beta_constraint=None,
               gamma_constraint=None,
               renorm=False,
               renorm_clipping=None,
               renorm_momentum=0.99,
               fused=None,
               trainable=True,
               name=None,
               **kwargs):
    super(BatchNormalization, self).__init__(
        name=name, trainable=trainable, **kwargs)
    self.axis = axis
    self.momentum = momentum
    self.epsilon = epsilon
    self.center = center
    self.scale = scale
    self.beta_initializer = beta_initializer
    self.gamma_initializer = gamma_initializer
    self.moving_mean_initializer = moving_mean_initializer
    self.moving_variance_initializer = moving_variance_initializer
    self.beta_regularizer = beta_regularizer
    self.gamma_regularizer = gamma_regularizer
    self.beta_constraint = beta_constraint
    self.gamma_constraint = gamma_constraint
    self.renorm = renorm
    # This environment variable is only used during the testing period of fused
    # batch norm and will be removed after that.
    if fused is None:
      fused = _FUSED_DEFAULT

    self.fused = fused
    self._bessels_correction_test_only = True
    if renorm:
      renorm_clipping = renorm_clipping or {}
      keys = ['rmax', 'rmin', 'dmax']
      if set(renorm_clipping) - set(keys):
        raise ValueError('renorm_clipping %s contains keys not in %s' %
                         (renorm_clipping, keys))
      self.renorm_clipping = renorm_clipping
      self.renorm_momentum = renorm_momentum
Example #4
 def __init__(self,
              axis=-1,
              momentum=0.99,
              epsilon=1e-3,
              center=True,
              scale=True,
              beta_initializer=init_ops.zeros_initializer(),
              gamma_initializer=init_ops.ones_initializer(),
              moving_mean_initializer=init_ops.zeros_initializer(),
              moving_variance_initializer=init_ops.ones_initializer(),
              beta_regularizer=None,
              gamma_regularizer=None,
              renorm=False,
              renorm_clipping=None,
              renorm_momentum=0.99,
              fused=False,
              trainable=True,
              name=None,
              **kwargs):
   super(BatchNormalization, self).__init__(
       name=name, trainable=trainable, **kwargs)
   self.axis = axis
   self.momentum = momentum
   self.epsilon = epsilon
   self.center = center
   self.scale = scale
   self.beta_initializer = beta_initializer
   self.gamma_initializer = gamma_initializer
   self.moving_mean_initializer = moving_mean_initializer
   self.moving_variance_initializer = moving_variance_initializer
   self.beta_regularizer = beta_regularizer
   self.gamma_regularizer = gamma_regularizer
   self.renorm = renorm
   self.fused = fused
   if self.fused and renorm:
     raise ValueError(
         'Batch renorm is currently not supported with fused batch norm.')
   if self.fused and (beta_regularizer is not None or
                      gamma_regularizer is not None):
     raise ValueError('Regularizers are not currently '
                      'supported for fused batch norm.')
   if renorm:
     renorm_clipping = renorm_clipping or {}
     keys = ['rmax', 'rmin', 'dmax']
     if set(renorm_clipping) - set(keys):
       raise ValueError('renorm_clipping %s contains keys not in %s' %
                        (renorm_clipping, keys))
     self.renorm_clipping = renorm_clipping
     self.renorm_momentum = renorm_momentum
Example #5
  def testControlDepsNone(self):
    with self.test_session() as session:
      c = constant_op.constant(1.0)
      with ops.control_dependencies([c]):
        # d gets the control dependency.
        d = constant_op.constant(2.0)
        # Partitioned variables do not.
        var_x = variable_scope.get_variable(
            "x",
            shape=[2],
            initializer=init_ops.ones_initializer(),
            partitioner=partitioned_variables.variable_axis_size_partitioner(4))

        ops_before_read = session.graph.get_operations()
        var_x.as_tensor()  # Caches the ops for subsequent reads.
        reading_ops = [
            op for op in session.graph.get_operations()
            if op not in ops_before_read
        ]

      self.assertEqual([c.op], d.op.control_inputs)
      # Tests that no control dependencies are added when reading a partitioned
      # variable, which is similar to reading a regular variable.
      for op in reading_ops:
        self.assertEqual([], op.control_inputs)
Example #6
 def testEagerExecution(self):
   with context.eager_mode():
     container = variable_scope.EagerVariableStore()
     x = constant_op.constant([[2.0]])
     with container.as_default():
       y = core_layers.dense(
           x, 1, name='my_dense',
           kernel_initializer=init_ops.ones_initializer())
     self.assertAllEqual(y, [[2.0]])
     self.assertEqual(len(container.variables()), 2)
     # Recreate the layer to test reuse.
     with container.as_default():
       core_layers.dense(
           x, 1, name='my_dense',
           kernel_initializer=init_ops.ones_initializer())
     self.assertEqual(len(container.variables()), 2)
Example #7
 def testOnesInitializer(self):
   with self.test_session(use_gpu=True):
     shape = [2, 3]
     x = variable_scope.get_variable(
         "x", shape=shape, initializer=init_ops.ones_initializer())
     x.initializer.run()
     self.assertAllEqual(x.eval(), np.ones(shape))
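
The assertion holds because calling the initializer produces an all-ones tensor of the requested shape, which is what the variable is filled with when `x.initializer.run()` executes. A minimal TF 1.x sketch of that direct call (shape and dtype are illustrative):

import numpy as np
import tensorflow as tf

init = tf.ones_initializer()
# Calling the initializer object directly returns an all-ones tensor.
ones_tensor = init(shape=[2, 3], dtype=tf.float32)

with tf.Session() as sess:
    np.testing.assert_array_equal(sess.run(ones_tensor), np.ones([2, 3]))
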
Example #8
  def testVariableCreationInALoop(self):
    """Tests the variable created inside a loop can be used outside the loop."""
    with self.test_session():
      with variable_scope.variable_scope("ascope") as scope:
        def Body(i, _):
          var_x = variable_scope.get_variable(
              "x",
              shape=[2],
              initializer=init_ops.ones_initializer(),
              partitioner=partitioned_variables.variable_axis_size_partitioner(
                  4))
          return (i + 1, var_x.as_tensor())

        cond = lambda i, _: i < 2
        _, x = control_flow_ops.while_loop(
            cond, Body, (0, constant_op.constant([7, 8], dtypes.float32)))
        variables.global_variables_initializer().run()
        self.assertAllClose([1.0, 1.0], x.eval())

        scope.reuse_variables()
        var_x = variable_scope.get_variable(
            "x",
            shape=[2],
            initializer=init_ops.ones_initializer(),
            partitioner=partitioned_variables.variable_axis_size_partitioner(4))

        self.assertAllClose([1.0, 1.0], var_x.as_tensor().eval())
Example #9
 def Foo(inputs):
   var = variable_scope.get_variable(
       "var",
       shape=[10],
       dtype=dtypes.float32,
       initializer=init_ops.ones_initializer())
   return inputs + var
Example #10
  def build(self, inputs_shape):
    # Call the build method of the parent class.
    super(MaskedBasicLSTMCell, self).build(inputs_shape)

    self.built = False

    input_depth = inputs_shape[1].value
    h_depth = self._num_units
    self._mask = self.add_variable(
        name="mask",
        shape=[input_depth + h_depth, 4 * h_depth],
        initializer=init_ops.ones_initializer(),
        trainable=False,
        dtype=self.dtype)
    self._threshold = self.add_variable(
        name="threshold",
        shape=[],
        initializer=init_ops.zeros_initializer(),
        trainable=False,
        dtype=self.dtype)
    # Add masked_weights in the weights namescope so as to make it easier
    # for the quantization library to add quant ops.
    self._masked_kernel = math_ops.multiply(self._mask, self._kernel,
                                            core_layers.MASKED_WEIGHT_NAME)
    if self._mask not in ops.get_collection_ref(core_layers.MASK_COLLECTION):
      ops.add_to_collection(core_layers.MASK_COLLECTION, self._mask)
      ops.add_to_collection(core_layers.MASKED_WEIGHT_COLLECTION,
                            self._masked_kernel)
      ops.add_to_collection(core_layers.THRESHOLD_COLLECTION, self._threshold)
      ops.add_to_collection(core_layers.WEIGHT_COLLECTION, self._kernel)

    self.built = True
Example #11
 def Body(i, _):
   var_x = variable_scope.get_variable(
       "x",
       shape=[2],
       initializer=init_ops.ones_initializer(),
       partitioner=partitioned_variables.variable_axis_size_partitioner(
           4))
   return (i + 1, var_x.as_tensor())
Example #12
 def _create_slots(self, var_list):
   for v in var_list:
     init_rms = init_ops.ones_initializer(dtype=v.dtype)
     self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(),
                                             v.dtype, "rms", self._name)
     if self._centered:
       self._zeros_slot(v, "mg", self._name)
     self._zeros_slot(v, "momentum", self._name)
Example #13
 def __init__(self,
              axis=-1,
              momentum=0.99,
              epsilon=1e-3,
              center=True,
              scale=True,
              beta_initializer=init_ops.zeros_initializer(),
              gamma_initializer=init_ops.ones_initializer(),
              moving_mean_initializer=init_ops.zeros_initializer(),
              moving_variance_initializer=init_ops.ones_initializer(),
              beta_regularizer=None,
              gamma_regularizer=None,
              beta_constraint=None,
              gamma_constraint=None,
              renorm=False,
              renorm_clipping=None,
              renorm_momentum=0.99,
              fused=None,
              trainable=True,
              virtual_batch_size=None,
              adjustment=None,
              name=None,
              **kwargs):
   super(BatchNormalization, self).__init__(
       axis=axis,
       momentum=momentum,
       epsilon=epsilon,
       center=center,
       scale=scale,
       beta_initializer=beta_initializer,
       gamma_initializer=gamma_initializer,
       moving_mean_initializer=moving_mean_initializer,
       moving_variance_initializer=moving_variance_initializer,
       beta_regularizer=beta_regularizer,
       gamma_regularizer=gamma_regularizer,
       beta_constraint=beta_constraint,
       gamma_constraint=gamma_constraint,
       renorm=renorm,
       renorm_clipping=renorm_clipping,
       renorm_momentum=renorm_momentum,
       fused=fused,
       trainable=trainable,
       virtual_batch_size=virtual_batch_size,
       adjustment=adjustment,
       name=name,
       **kwargs)
Example #14
  def testAddVariable(self):
    obj = NonLayerCheckpointable()
    with self.assertRaisesRegexp(ValueError, "do not specify shape"):
      checkpointable_utils.add_variable(
          obj, name="shape_specified_twice", shape=[], initializer=1)
    constant_initializer = checkpointable_utils.add_variable(
        obj, name="constant_initializer", initializer=1)
    with variable_scope.variable_scope("some_variable_scope"):
      ones_initializer = checkpointable_utils.add_variable(
          obj,
          name="ones_initializer",
          shape=[2],
          initializer=init_ops.ones_initializer(dtype=dtypes.float32))
    bare_initializer = checkpointable_utils.add_variable(
        obj,
        name="bare_initializer",
        shape=[2, 2],
        dtype=dtypes.float64,
        initializer=init_ops.zeros_initializer)

    # Even in graph mode, there are no naming conflicts between objects, only
    # naming conflicts within an object.
    other_duplicate = resource_variable_ops.ResourceVariable(
        name="duplicate", initial_value=1.)
    duplicate = checkpointable_utils.add_variable(
        obj, name="duplicate", shape=[])
    with self.assertRaisesRegexp(ValueError, "'duplicate' already exists"):
      checkpointable_utils.add_variable(obj, name="duplicate", shape=[])

    if context.in_graph_mode():
      self.evaluate(variables.global_variables_initializer())
    self.assertEqual("constant_initializer:0", constant_initializer.name)
    self.assertEqual(1, self.evaluate(constant_initializer))
    self.assertEqual("some_variable_scope/ones_initializer:0",
                     ones_initializer.name)
    self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
    self.assertAllEqual([[0., 0.],
                         [0., 0.]], self.evaluate(bare_initializer))
    self.assertEqual("a_variable:0", obj.a_variable.name)
    self.assertEqual("duplicate:0", other_duplicate.name)
    if context.in_graph_mode():
      # The .name attribute may be globally influenced, but the checkpoint name
      # won't be (tested below).
      self.assertEqual("duplicate_1:0", duplicate.name)
    else:
      # When executing eagerly, there's no uniquification of variable names. The
      # checkpoint name will be the same.
      self.assertEqual("duplicate:0", duplicate.name)
    named_variables, _ = checkpointable_utils._serialize_object_graph(obj)
    expected_checkpoint_names = (
        "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
        "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
        "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE",
        "duplicate/.ATTRIBUTES/VARIABLE_VALUE",
        "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
    )
    six.assertCountEqual(
        self, expected_checkpoint_names, named_variables.keys())
Example #15
 def _create_variable_statistics_object(self):
   """Creates non-trainable variables representing input statistics."""
   series_start_moments = Moments(
       mean=variable_scope.get_variable(
           name="series_start_mean",
           shape=[self._num_features],
           dtype=self._dtype,
           initializer=init_ops.zeros_initializer(),
           trainable=False),
       variance=variable_scope.get_variable(
           name="series_start_variance",
           shape=[self._num_features],
           dtype=self._dtype,
           initializer=init_ops.ones_initializer(),
           trainable=False))
   overall_feature_moments = Moments(
       mean=variable_scope.get_variable(
           name="overall_feature_mean",
           shape=[self._num_features],
           dtype=self._dtype,
           initializer=init_ops.zeros_initializer(),
           trainable=False),
       variance=variable_scope.get_variable(
           name="overall_feature_var",
           shape=[self._num_features],
           dtype=self._dtype,
           initializer=init_ops.ones_initializer(),
           trainable=False))
   start_time = variable_scope.get_variable(
       name="start_time",
       dtype=dtypes.int64,
       initializer=init_ops.zeros_initializer(),
       shape=[],
       trainable=False)
   total_observation_count = variable_scope.get_variable(
       name="total_observation_count",
       shape=[],
       dtype=dtypes.int64,
       initializer=init_ops.ones_initializer(),
       trainable=False)
   return InputStatistics(
       series_start_moments=series_start_moments,
       overall_feature_moments=overall_feature_moments,
       start_time=start_time,
       total_observation_count=total_observation_count)
Example #16
  def testLSTMLayer(self):
    # Run with all-0 weights, no padding.
    o = self._RunLSTMLayer('zeros', init_ops.zeros_initializer(), 0., 0., 0.)
    self.assertAllClose(o, [[[0.]] * self._batch_size] * 3)
    o = self._RunLSTMLayer('zeros', init_ops.zeros_initializer(), 0., 1., 0.)
    self.assertAllClose(o, [[[.25]] * self._batch_size,
                            [[.125]] * self._batch_size,
                            [[.0625]] * self._batch_size])
    o = self._RunLSTMLayer('zeros', init_ops.zeros_initializer(), 1., 0., 0.)
    self.assertAllClose(o, [[[0.]] * self._batch_size] * 3)
    o = self._RunLSTMLayer('zeros', init_ops.zeros_initializer(), 1., 1., 0.)
    self.assertAllClose(o, [[[.25]] * self._batch_size,
                            [[.125]] * self._batch_size,
                            [[.0625]] * self._batch_size])

    # Run with all-1 weights, no padding.
    weight1 = 1.
    for m_init in [0., 1.]:
      for c_init in [0., 1.]:
        o = self._RunLSTMLayer('ones',
                               init_ops.ones_initializer(), m_init, c_init, 0.)
        m0 = self._NextM(self._inputs, weight1, m_init, c_init)
        c0 = self._NextC(self._inputs, weight1, m_init, c_init)
        self.assertAllClose(o[0], m0)
        m1 = self._NextM(self._inputs, weight1, m0, c0)
        c1 = self._NextC(self._inputs, weight1, m0, c0)
        self.assertAllClose(o[1], m1)
        m2 = self._NextM(self._inputs, weight1, m1, c1)
        self.assertAllClose(o[2], m2)

    # Run with random weights.
    for weight in np.random.rand(3):
      weight_tf = constant_op.constant(weight, dtypes.float32)
      random_weight = lambda shape, w=weight_tf: array_ops.fill(shape, w)

      # No padding.
      for m_init in [0., 1.]:
        for c_init in [0., 1.]:
          o = self._RunLSTMLayer('random', random_weight, m_init, c_init, 0.)
          m0 = self._NextM(self._inputs, weight, m_init, c_init)
          c0 = self._NextC(self._inputs, weight, m_init, c_init)
          self.assertAllClose(o[0], m0)
          m1 = self._NextM(self._inputs, weight, m0, c0)
          c1 = self._NextC(self._inputs, weight, m0, c0)
          self.assertAllClose(o[1], m1)
          m2 = self._NextM(self._inputs, weight, m1, c1)
          self.assertAllClose(o[2], m2)

      # Set padding.
      o = self._RunLSTMLayer('random', random_weight, 0., 0., 1.)
      self.assertAllClose(o, [[[0.]] * self._batch_size] * 3)
      o = self._RunLSTMLayer('random', random_weight, 0., 1., 1.)
      self.assertAllClose(o, [[[0.]] * self._batch_size] * 3)
      o = self._RunLSTMLayer('random', random_weight, 1., 0., 1.)
      self.assertAllClose(o, [[[1.]] * self._batch_size] * 3)
      o = self._RunLSTMLayer('random', random_weight, 1., 1., 1.)
      self.assertAllClose(o, [[[1.]] * self._batch_size] * 3)
Example #17
  def testGPU(self):
    with self.test_session(use_gpu=True) as sess:
      abc = variable_scope.get_variable(
          "abc",
          shape=[1],
          initializer=init_ops.ones_initializer(),
          use_resource=True)

      sess.run(variables.global_variables_initializer())
      print(sess.run(abc))
Example #18
def l2_normalization(
        inputs,
        scaling=False,
        scale_initializer=init_ops.ones_initializer(),
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope=None):
    """Implement L2 normalization on every feature (i.e. spatial normalization).

    Should be extended in the near future to other dimensions, providing a more
    flexible normalization framework.

    inputs: a 4-D tensor with dimensions [batch_size, height, width, channels].
    scaling: whether or not to add a post scaling operation along the dimensions
      which have been normalized.
    scale_initializer: An initializer for the weights.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer, its scope must be given.
    variables_collections: optional list of collections for all the variables or
      a dictionary containing a different list of collection per variable.
    outputs_collections: collection to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for `variable_scope`.
    Returns:
      A `Tensor` representing the output of the operation.
    """

    with variable_scope.variable_scope(
            scope, 'L2Normalization', [inputs], reuse=reuse) as sc:

        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        params_shape = inputs_shape[-1:]
        dtype = inputs.dtype.base_dtype

        # Normalize along spatial dimensions.
        norm_dim = tf.range(1, inputs_rank-1)
        outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)
        # Additional scaling.
        if scaling:
            scale_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            scale = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=scale_initializer,
                                             collections=scale_collections,
                                             trainable=trainable)
            outputs = tf.multiply(outputs, scale)
        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
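
A hedged usage sketch of the helper above, assuming `tf`, `init_ops`, and `l2_normalization` itself are in scope and that the input is a 4-D NHWC feature map (the shape and scope name are illustrative):

feature_map = tf.placeholder(tf.float32, shape=[None, 38, 38, 512])

# Normalize over the spatial axes and learn a per-channel scale ('gamma')
# that starts out as the identity thanks to ones_initializer().
normalized = l2_normalization(
    feature_map,
    scaling=True,
    scale_initializer=init_ops.ones_initializer(),
    scope='l2_normalization')
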
Example #19
 def _create_vars(self, var_list, state):
   for v in var_list:
     if v.get_shape().is_fully_defined():
       init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
     else:
       init_rms = array_ops.ones_like(v)
     state.create_slot_with_initializer(v, init_rms, v.get_shape(),
                                        v.dtype.base_dtype, "rms")
     if self._centered:
       state.zeros_slot(v, "mg")
     state.zeros_slot(v, "momentum")
Example #20
  def build(self, input_shape):
    input_shape = tensor_shape.TensorShape(input_shape)
    if tensor_shape.dimension_value(input_shape[-1]) is None:
      raise ValueError('The last dimension of the inputs to `Dense` '
                       'should be defined. Found `None`.')
    self.input_spec = base.InputSpec(
        min_ndim=2, axes={-1: tensor_shape.dimension_value(input_shape[-1])})

    self.kernel = self.add_variable(
        'kernel',
        shape=[tensor_shape.dimension_value(input_shape[-1]), self.units],
        initializer=self.kernel_initializer,
        regularizer=self.kernel_regularizer,
        dtype=self.dtype,
        trainable=True)

    self.mask = self.add_variable(
        name='mask',
        shape=[tensor_shape.dimension_value(input_shape[-1]), self.units],
        initializer=init_ops.ones_initializer(),
        trainable=False,
        dtype=self.dtype)

    self.threshold = self.add_variable(
        name='threshold',
        shape=[],
        initializer=init_ops.zeros_initializer(),
        trainable=False,
        dtype=self.dtype)

    # Add masked_weights in the weights namescope so as to make it easier
    # for the quantization library to add quant ops.
    self.masked_kernel = math_ops.multiply(self.mask, self.kernel,
                                           MASKED_WEIGHT_NAME)

    ops.add_to_collection(MASK_COLLECTION, self.mask)
    ops.add_to_collection(MASKED_WEIGHT_COLLECTION, self.masked_kernel)
    ops.add_to_collection(THRESHOLD_COLLECTION, self.threshold)
    ops.add_to_collection(WEIGHT_COLLECTION, self.kernel)

    if self.use_bias:
      self.bias = self.add_variable(
          'bias',
          shape=[
              self.units,
          ],
          initializer=self.bias_initializer,
          regularizer=self.bias_regularizer,
          dtype=self.dtype,
          trainable=True)
    else:
      self.bias = None
    self.built = True
Example #21
 def build(self, input_shape):
   """Creates scale variable if use_scale==True."""
   if self.use_scale:
     self.scale = self.add_weight(
         name='scale',
         shape=(),
         initializer=init_ops.ones_initializer(),
         dtype=self.dtype,
         trainable=True)
   else:
     self.scale = None
   super(Attention, self).build(input_shape)
Example #22
  def testGPU(self):
    with self.test_session(use_gpu=True) as sess:
      abc = variable_scope.get_variable(
          "abc",
          shape=[1],
          initializer=init_ops.ones_initializer(),
          use_resource=True)

      sess.run(variables.global_variables_initializer())
      self.assertEqual(
          resource_variable_ops.var_is_initialized_op(abc.handle).eval(), True)
      print(sess.run(abc))
Example #23
  def build(self, input_shape):
    input_shape = tensor_shape.TensorShape(input_shape)
    channel_axis = 1 if self.data_format == 'channels_first' else -1
    if tensor_shape.dimension_value(input_shape[channel_axis]) is None:
      raise ValueError('The channel dimension of the inputs '
                       'should be defined. Found `None`.')
    input_dim = tensor_shape.dimension_value(input_shape[channel_axis])
    kernel_shape = self.kernel_size + (input_dim, self.filters)
    self.mask = self.add_variable(
        name='mask',
        shape=kernel_shape,
        initializer=init_ops.ones_initializer(),
        trainable=False,
        dtype=self.dtype)

    self.kernel = self.add_variable(
        name='kernel',
        shape=kernel_shape,
        initializer=self.kernel_initializer,
        regularizer=self.kernel_regularizer,
        trainable=True,
        dtype=self.dtype)

    self.threshold = self.add_variable(
        name='threshold',
        shape=[],
        initializer=init_ops.zeros_initializer(),
        trainable=False,
        dtype=self.dtype)

    # Add masked_weights in the weights namescope so as to make it easier
    # for the quantization library to add quant ops.
    self.masked_kernel = math_ops.multiply(self.mask, self.kernel,
                                           MASKED_WEIGHT_NAME)

    ops.add_to_collection(MASK_COLLECTION, self.mask)
    ops.add_to_collection(MASKED_WEIGHT_COLLECTION, self.masked_kernel)
    ops.add_to_collection(THRESHOLD_COLLECTION, self.threshold)
    ops.add_to_collection(WEIGHT_COLLECTION, self.kernel)

    if self.use_bias:
      self.bias = self.add_variable(
          name='bias',
          shape=(self.filters,),
          initializer=self.bias_initializer,
          regularizer=self.bias_regularizer,
          trainable=True,
          dtype=self.dtype)
    else:
      self.bias = None
    self.input_spec = base.InputSpec(
        ndim=self.rank + 2, axes={channel_axis: input_dim})
    self.built = True
Example #24
 def _create_slots(self, var_list):
   for v in var_list:
     if v.get_shape().is_fully_defined():
       init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
     else:
       init_rms = array_ops.ones_like(v)
     self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(),
                                             v.dtype.base_dtype, "rms",
                                             self._name)
     if self._centered:
       self._zeros_slot(v, "mg", self._name)
     self._zeros_slot(v, "momentum", self._name)
Example #25
  def testGPU(self):
    with test_util.use_gpu():
      abc = variable_scope.get_variable(
          "abc",
          shape=[1],
          initializer=init_ops.ones_initializer(),
          use_resource=True)

      self.evaluate(variables.global_variables_initializer())
      self.assertEqual(
          self.evaluate(
              resource_variable_ops.var_is_initialized_op(abc.handle)),
          True)
Example #26
 def testFunctionalDenseInitializerFromScope(self):
   with self.test_session() as sess:
     with variable_scope.variable_scope(
         'scope', initializer=init_ops.ones_initializer()):
       inputs = random_ops.random_uniform((5, 3), seed=1)
       core_layers.dense(inputs, 2)
       sess.run(variables.global_variables_initializer())
       weights = sess.run(variables.trainable_variables())
       self.assertEqual(len(weights), 2)
       # Check that the matrix weights got initialized to ones (from scope).
       self.assertAllClose(weights[0], np.ones((3, 2)))
       # Check that the bias still got initialized to zeros.
       self.assertAllClose(weights[1], np.zeros((2)))
Example #27
 def testFunctionalDenseInitializerFromScope(self):
   with variable_scope.variable_scope(
       'scope', initializer=init_ops.ones_initializer()):
     inputs = random_ops.random_uniform((5, 3), seed=1)
     core_layers.dense(inputs, 2)
     if context.in_graph_mode():
       self.evaluate(variables.global_variables_initializer())
     weights = variables.trainable_variables()
     self.assertEqual(len(weights), 2)
     # Check that the matrix weights got initialized to ones (from scope).
     self.assertAllClose(
         self.evaluate(weights[0].read_value()), np.ones((3, 2)))
     # Check that the bias still got initialized to zeros.
     self.assertAllClose(self.evaluate(weights[1].read_value()), np.zeros((2)))
Example #28
 def testFunctionalDenseInitializerFromScope(self):
   with variable_scope.variable_scope(
       'scope', initializer=init_ops.ones_initializer()), self.test_session():
     inputs = random_ops.random_uniform((5, 3), seed=1)
     core_layers.dense(inputs, 2)
     variables.global_variables_initializer().run()
     weights = _get_variable_dict_from_varstore()
     self.assertEqual(len(weights), 2)
     # Check that the matrix weights got initialized to ones (from scope).
     self.assertAllClose(weights['scope/dense/kernel'].read_value().eval(),
                         np.ones((3, 2)))
     # Check that the bias still got initialized to zeros.
     self.assertAllClose(weights['scope/dense/bias'].read_value().eval(),
                         np.zeros((2)))
Example #29
  def testLayerInDefun(self):
    conv = convolutional.Conv2D(
        filters=1,
        kernel_size=2,
        kernel_initializer=init_ops.ones_initializer(),
        bias_initializer=init_ops.zeros_initializer())

    @function.defun
    def model(x):
      return conv(x)

    x = array_ops.ones([1, 2, 2, 1])
    y = model(x)
    self.assertAllEqual([[[[4.0]]]], y.numpy())
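
The expected value of 4.0 follows from the initializers: a 2x2 kernel of ones convolved (VALID padding) over the 2x2 all-ones input sums four ones, and the bias starts at zero. A quick NumPy check of that arithmetic:

import numpy as np

x = np.ones((2, 2))        # the 1x2x2x1 input, squeezed to height x width
kernel = np.ones((2, 2))   # kernel filled by ones_initializer()
bias = 0.0                 # bias filled by zeros_initializer()

# A 2x2 VALID convolution over a 2x2 input yields a single output value.
output = np.sum(x * kernel) + bias
assert output == 4.0
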
Example #30
  def testPrepareInputsForRnnSparseAndDense(self):
    num_unroll = 2
    embedding_dimension = 8
    dense_dimension = 2

    expected = [
        np.array([[1., 1., 1., 1., 1., 1., 1., 1., 111., 112.],
                  [1., 1., 1., 1., 1., 1., 1., 1., 211., 212.],
                  [1., 1., 1., 1., 1., 1., 1., 1., 311., 312.]]),
        np.array([[1., 1., 1., 1., 1., 1., 1., 1., 121., 122.],
                  [2., 2., 2., 2., 2., 2., 2., 2., 221., 222.],
                  [1., 1., 1., 1., 1., 1., 1., 1., 321., 322.]])
    ]

    sequence_features = {
        'wire_cast':
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
        'seq_feature0':
            constant_op.constant([[[111., 112.], [121., 122.]],
                                  [[211., 212.], [221., 222.]],
                                  [[311., 312.], [321., 322.]]])
    }

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    wire_cast_embedded = feature_column.embedding_column(
        wire_cast,
        dimension=embedding_dimension,
        combiner='sum',
        initializer=init_ops.ones_initializer())
    seq_feature0_column = feature_column.real_valued_column(
        'seq_feature0', dimension=dense_dimension)

    sequence_feature_columns = [seq_feature0_column, wire_cast_embedded]

    context_features = None

    self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                      sequence_feature_columns, num_unroll,
                                      expected)
Example #31
def batch_normalization(inputs,
                        axis=-1,
                        momentum=0.99,
                        epsilon=1e-3,
                        center=True,
                        scale=True,
                        beta_initializer=init_ops.zeros_initializer(),
                        gamma_initializer=init_ops.ones_initializer(),
                        moving_mean_initializer=init_ops.zeros_initializer(),
                        moving_variance_initializer=init_ops.ones_initializer(),
                        beta_regularizer=None,
                        gamma_regularizer=None,
                        training=False,
                        trainable=True,
                        name=None,
                        reuse=None,
                        renorm=False,
                        renorm_clipping=None,
                        renorm_momentum=0.99):
  """Functional interface for the batch normalization layer.

  Reference: http://arxiv.org/abs/1502.03167

  "Batch Normalization: Accelerating Deep Network Training by Reducing
  Internal Covariate Shift"

  Sergey Ioffe, Christian Szegedy

  Note: the operations which update the `moving_mean` and `moving_variance`
  variables will not be added as dependencies of your training operation and so
  must be run separately. For example:

  ```
  extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  sess.run([train_op, extra_update_ops], ...)
  ```
  Alternatively, add the operations as a dependency to your training operation
  manually, and then just run your training operation as normal:

  ```
  extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(extra_update_ops):
    train_op = optimizer.minimize(loss)
  ...
  sess.run([train_op], ...)
  ```

  Arguments:
    inputs: Tensor input.
    axis: Integer, the axis that should be normalized (typically the features
      axis). For instance, after a `Convolution2D` layer with
      `data_format="channels_first"`, set `axis=1` in `BatchNormalization`.
    momentum: Momentum for the moving average.
    epsilon: Small float added to variance to avoid dividing by zero.
    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
      is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is
      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
      disabled since the scaling can be done by the next layer.
    beta_initializer: Initializer for the beta weight.
    gamma_initializer: Initializer for the gamma weight.
    moving_mean_initializer: Initializer for the moving mean.
    moving_variance_initializer: Initializer for the moving variance.
    beta_regularizer: Optional regularizer for the beta weight.
    gamma_regularizer: Optional regularizer for the gamma weight.
    training: Either a Python boolean, or a TensorFlow boolean scalar tensor
      (e.g. a placeholder). Whether to return the output in training mode
      (normalized with statistics of the current batch) or in inference mode
      (normalized with moving statistics). **NOTE**: make sure to set this
      parameter correctly, or else your training/inference will not work
      properly.
    trainable: Boolean, if `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    name: String, the name of the layer.
    reuse: Boolean, whether to reuse the weights of a previous layer
      by the same name.
    renorm: Whether to use Batch Renormalization
      (https://arxiv.org/abs/1702.03275). This adds extra variables during
      training. The inference is the same for either value of this parameter.
    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
      scalar `Tensors` used to clip the renorm correction. The correction
      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
      dmax are set to inf, 0, inf, respectively.
    renorm_momentum: Momentum used to update the moving means and standard
      deviations with renorm. Unlike `momentum`, this affects training
      and should be neither too small (which would add noise) nor too large
      (which would give stale estimates). Note that `momentum` is still applied
      to get the means and variances for inference.

  Returns:
    Output tensor.
  """
  layer = BatchNormalization(
      axis=axis,
      momentum=momentum,
      epsilon=epsilon,
      center=center,
      scale=scale,
      beta_initializer=beta_initializer,
      gamma_initializer=gamma_initializer,
      moving_mean_initializer=moving_mean_initializer,
      moving_variance_initializer=moving_variance_initializer,
      beta_regularizer=beta_regularizer,
      gamma_regularizer=gamma_regularizer,
      trainable=trainable,
      renorm=renorm,
      renorm_clipping=renorm_clipping,
      renorm_momentum=renorm_momentum,
      name=name,
      _reuse=reuse,
      _scope=name)
  return layer.apply(inputs, training=training)
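
A minimal end-to-end sketch of the second update pattern described in the docstring (running the `UPDATE_OPS` as a control dependency of the train op); the model, loss, and shapes are illustrative, not part of the example:

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 32])
labels = tf.placeholder(tf.float32, shape=[None, 1])

h = tf.layers.batch_normalization(x, training=True)
logits = tf.layers.dense(h, 1)
loss = tf.losses.sigmoid_cross_entropy(labels, logits)

# Ensure the moving_mean/moving_variance updates run on every training step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
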
Example #32
    def testBatchNormsMatchFwdBwdSomeOnShard0SomeOnShard1(self):
        with ops.device("/device:IPU:0"):
            x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

            with variable_scope.variable_scope("vs", use_resource=True):
                with tu.ipu_shard(0):
                    y = convolutional.conv2d(
                        x,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv1')
                    y = layers_norm.batch_normalization(y,
                                                        fused=True,
                                                        training=True)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv2')
                    y = layers_norm.batch_normalization(y,
                                                        fused=True,
                                                        training=True)

                with tu.ipu_shard(1):
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv3')
                    y = layers_norm.batch_normalization(y,
                                                        fused=True,
                                                        training=True)

            loss = math_ops.reduce_sum(y)
            optimizer = gradient_descent.GradientDescentOptimizer(0.1)
            train = optimizer.minimize(loss)

            with ops.device('cpu'):
                report = gen_ipu_ops.ipu_event_trace()

        tu.configure_ipu_system(True, True, True, sharded=True)

        with tu.ipu_session() as sess:
            sess.run(variables.global_variables_initializer())

            sess.run(report)

            sess.run([train, loss], {x: np.zeros([1, 4, 4, 2])})

            result = sess.run(report)

            s = tu.extract_all_strings_from_event_trace(result)
            cs_list = tu.get_compute_sets_from_report(s)
            # Two BN for forwards (on shards 0 and 1) and two BN for grad
            # (note that we don't cache gradient application)
            ok = [
                '__seed*', '*OnTileCopy*', 'Copy_',
                'vs/conv1/Conv2D/convolution.*/Conv_1x1',
                'vs/conv3/Conv2D/convolution.*/Conv_1x1',
                'vs/batch_normalization/FusedBatchNorm/batch-norm-training.*/',
                'vs/batch_normalization_2/FusedBatchNorm/batch-norm-training.*/',
                'Sum/reduce.*/ReduceFinalStage/IntermediateToOutput/Reduce',
                'gradients/vs/batch_normalization_2/FusedBatchNorm_grad/FusedBatchNormGrad/batch-norm-grad.*/',
                'gradients/vs/batch_normalization_1/FusedBatchNorm_grad/FusedBatchNormGrad/batch-norm-grad.*/',
                'GradientDescent/update_vs/batch_normalization/',
                'GradientDescent/update_vs/batch_normalization_1/',
                'GradientDescent/update_vs/batch_normalization_2/',
                'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Conv_4x4',
                'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo',
                'gradients/vs/conv1/Conv2D_grad/Conv2DBackpropFilter/fusion*/Conv_4x4',
                'gradients/vs/conv1/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo'
            ]

            self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
Example #33
def conv_layer_norm(inputs,
                    bptr,
                    nclass=None,
                    center=True,
                    scale=True,
                    activation_fn=None,
                    reuse=None,
                    variables_collections=None,
                    outputs_collections=None,
                    trainable=True,
                    scope=None):
    """Adds a Layer Normalization layer from https://arxiv.org/abs/1607.06450.
    "Layer Normalization"
    Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
    Can be used as a normalizer function for conv2d and fully_connected.
    Args:
    inputs: a tensor with 2 or more dimensions. The normalization
              occurs over all but the first dimension.
    center: If True, subtract `beta`. If False, `beta` is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is
      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
      disabled since the scaling can be done by the next layer.
    activation_fn: activation function, default set to None to skip it and
      maintain a linear activation.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer, its scope must be given.
    variables_collections: optional collections for the variables.
    outputs_collections: collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for `variable_scope`.
    Returns:
    A `Tensor` representing the output of the operation.
    Raises:
    ValueError: if rank or last dimension of `inputs` is undefined.
    """
    with variable_scope.variable_scope(scope,
                                       'LayerNorm', [inputs],
                                       reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        axis = list(range(1, inputs_rank))
        params_shape = inputs_shape[-1:]
        if not params_shape.is_fully_defined():
            raise ValueError('Inputs %s has undefined last dimension %s.' %
                             (inputs.name, params_shape))
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.zeros_initializer,
                collections=beta_collections,
                trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.ones_initializer(),
                collections=gamma_collections,
                trainable=trainable)
        # Calculate the moments on the last axis (layer activations).
        if nclass is not None:
            x = bptr / nclass
            fm = tf.exp(x - x**2)
        else:
            fm = tf.exp(bptr)
        fm_sum = tf.reduce_sum(fm, [1, 2])
        fm_sum = tf.expand_dims(tf.expand_dims(fm_sum, 1), 2)
        norm_weight = tf.divide(fm, fm_sum)
        mean = tf.multiply(norm_weight, inputs)
        mean = tf.reduce_sum(mean, [1, 2])
        mean = tf.expand_dims(tf.expand_dims(mean, 1), 2)
        #print (mean.get_shape().as_list())
        variance = tf.subtract(inputs, mean)
        variance = tf.square(variance)
        variance = tf.multiply(norm_weight, variance)
        variance = nn.moments(variance, [1, 2], keep_dims=True)[1]
        #print (variance.get_shape().as_list())
        #mean, variance = nn.moments(inputs, axis, keep_dims=True)
        # Compute layer normalization using the batch_normalization function.
        variance_epsilon = 1E-12
        outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma,
                                         variance_epsilon)
        outputs.set_shape(inputs_shape)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
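
A hedged usage sketch of `conv_layer_norm`, assuming the function and its imports are in scope. What can be read off the code above: `inputs` is a 4-D feature map, and `bptr` must be broadcastable against it (it is turned into per-position weights via an exponential and a sum over the spatial axes). Shapes and the scope name below are illustrative:

inputs = tf.placeholder(tf.float32, shape=[None, 16, 16, 64])
# Per-position weighting tensor; here one weight per spatial location.
bptr = tf.placeholder(tf.float32, shape=[None, 16, 16, 1])

normalized = conv_layer_norm(
    inputs,
    bptr,
    center=True,
    scale=True,            # gamma starts at 1 via ones_initializer()
    scope='conv_layer_norm')
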
Example #34
    def __init__(self,
                 reference_batch,
                 axis=-1,
                 epsilon=1e-3,
                 center=True,
                 scale=True,
                 beta_initializer=init_ops.zeros_initializer(),
                 gamma_initializer=init_ops.ones_initializer(),
                 beta_regularizer=None,
                 gamma_regularizer=None,
                 trainable=True,
                 name=None,
                 batch_axis=0):
        """Initialize virtual batch normalization object.

    We precompute the 'mean' and 'mean squared' of the reference batch, so that
    `__call__` is efficient. This means that the axis must be supplied when the
    object is created, not when it is called.

    We precompute 'square mean' instead of 'variance', because the square mean
    can be easily adjusted on a per-example basis.

    Args:
      reference_batch: A minibatch tensor. This will form the reference data
        from which the normalization statistics are calculated. See
        https://arxiv.org/abs/1606.03498 for more details.
      axis: Integer, the axis that should be normalized (typically the features
        axis). For instance, after a `Convolution2D` layer with
        `data_format="channels_first"`, set `axis=1` in `BatchNormalization`.
      epsilon: Small float added to variance to avoid dividing by zero.
      center: If True, add offset of `beta` to normalized tensor. If False,
        `beta` is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is
        not used. When the next layer is linear (also e.g. `nn.relu`), this can
        be disabled since the scaling can be done by the next layer.
      beta_initializer: Initializer for the beta weight.
      gamma_initializer: Initializer for the gamma weight.
      beta_regularizer: Optional regularizer for the beta weight.
      gamma_regularizer: Optional regularizer for the gamma weight.
      trainable: Boolean, if `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      name: String, the name of the ops.
      batch_axis: The axis of the batch dimension. This dimension is treated
        differently in `virtual batch normalization` vs `batch normalization`.

    Raises:
      ValueError: If `reference_batch` has unknown dimensions at graph
        construction.
      ValueError: If `batch_axis` is the same as `axis`.
    """
        axis = _validate_init_input_and_get_axis(reference_batch, axis)
        self._epsilon = epsilon
        self._beta = 0
        self._gamma = 1
        self._batch_axis = _validate_init_input_and_get_axis(
            reference_batch, batch_axis)

        if axis == self._batch_axis:
            raise ValueError('`axis` and `batch_axis` cannot be the same.')

        with variable_scope.variable_scope(name,
                                           'VBN',
                                           values=[reference_batch
                                                   ]) as self._vs:
            self._reference_batch = reference_batch

            # Calculate important shapes:
            #  1) Reduction axes for the reference batch
            #  2) Broadcast shape, if necessary
            #  3) Reduction axes for the virtual batchnormed batch
            #  4) Shape for optional parameters
            input_shape = self._reference_batch.shape
            ndims = input_shape.ndims
            reduction_axes = list(range(ndims))
            del reduction_axes[axis]

            self._broadcast_shape = [1] * len(input_shape)
            self._broadcast_shape[axis] = input_shape.dims[axis]

            self._example_reduction_axes = list(range(ndims))
            del self._example_reduction_axes[max(axis, self._batch_axis)]
            del self._example_reduction_axes[min(axis, self._batch_axis)]

            params_shape = self._reference_batch.shape[axis]

            # Determines whether broadcasting is needed. This is slightly different
            # than in the `nn.batch_normalization` case, due to `batch_dim`.
            self._needs_broadcasting = (sorted(self._example_reduction_axes) !=
                                        list(range(ndims))[:-2])

            # Calculate the sufficient statistics for the reference batch in a way
            # that can be easily modified by additional examples.
            self._ref_mean, self._ref_mean_squares = _statistics(
                self._reference_batch, reduction_axes)
            self._ref_variance = (self._ref_mean_squares -
                                  math_ops.square(self._ref_mean))

            # Virtual batch normalization uses a weighted average between example
            # statistics and the reference batch statistics.
            ref_batch_size = _static_or_dynamic_batch_size(
                self._reference_batch, self._batch_axis)
            self._example_weight = 1. / (math_ops.to_float(ref_batch_size) +
                                         1.)
            self._ref_weight = 1. - self._example_weight

            # Make the variables, if necessary.
            if center:
                self._beta = variable_scope.get_variable(
                    name='beta',
                    shape=(params_shape, ),
                    initializer=beta_initializer,
                    regularizer=beta_regularizer,
                    trainable=trainable)
            if scale:
                self._gamma = variable_scope.get_variable(
                    name='gamma',
                    shape=(params_shape, ),
                    initializer=gamma_initializer,
                    regularizer=gamma_regularizer,
                    trainable=trainable)
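
A small NumPy illustration of the weighting set up at the end of this constructor: per the comments above, example statistics are mixed with the reference-batch statistics using `example_weight = 1 / (ref_batch_size + 1)` and `ref_weight = 1 - example_weight` (the numbers below are made up):

import numpy as np

ref_batch_size = 64
example_weight = 1.0 / (ref_batch_size + 1.0)   # self._example_weight
ref_weight = 1.0 - example_weight               # self._ref_weight

# Weighted average of a reference-batch mean and one example's mean.
ref_mean, example_mean = 0.2, 1.0
mixed_mean = ref_weight * ref_mean + example_weight * example_mean
print(round(mixed_mean, 4))  # stays close to the reference mean: 0.2123
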
Example #35
def fused_layer_norm(inputs,
                     center=True,
                     scale=True,
                     activation_fn=None,
                     reuse=None,
                     variables_collections=None,
                     outputs_collections=None,
                     trainable=True,
                     begin_norm_axis=1,
                     begin_params_axis=-1,
                     scope=None,
                     use_fused_batch_norm=False):
    with tf.variable_scope(scope, 'LayerNorm', [inputs], reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        inputs_shape = inputs.shape
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        if begin_norm_axis < 0:
            begin_norm_axis = inputs_rank + begin_norm_axis
        if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
            raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
                             'must be < rank(inputs) (%d)' %
                             (begin_params_axis, begin_norm_axis, inputs_rank))
        params_shape = inputs_shape[begin_params_axis:]
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
                (inputs.name, begin_params_axis, inputs_shape))
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.zeros_initializer(),
                collections=beta_collections,
                trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.ones_initializer(),
                collections=gamma_collections,
                trainable=trainable)
        if use_fused_batch_norm:
            # get static TensorShape if fully defined,
            # otherwise retrieve shape tensor
            norm_shape = inputs.shape[begin_norm_axis:]
            if norm_shape.is_fully_defined():
                bn_shape = [1, -1, 1, numpy.prod(norm_shape.as_list())]
            else:
                norm_shape = tf.shape(inputs)[begin_norm_axis:]
                bn_shape = [1, -1, 1, tf.reduce_prod(norm_shape)]
            if inputs.get_shape().is_fully_defined():
                outputs_shape = inputs.get_shape()
            else:
                outputs_shape = tf.shape(inputs)
            inputs = array_ops.reshape(inputs, bn_shape)
            if inputs.get_shape().is_fully_defined():
                # static inputs TensorShape fully defined after reshape.
                ones = array_ops.ones(inputs.get_shape()[1],
                                      dtype=dtypes.float32)
                zeros = array_ops.zeros(inputs.get_shape()[1],
                                        dtype=dtypes.float32)
            else:
                # static inputs TensorShape NOT fully defined after reshape.
                # must use dynamic shape, which means these input tensors
                # have to be created at runtime, which causes a slowdown.
                scale_shape = tf.shape(inputs)[1]
                ones = array_ops.ones(scale_shape, dtype=dtypes.float32)
                zeros = array_ops.zeros(scale_shape, dtype=dtypes.float32)
            outputs, mean, variance = nn.fused_batch_norm(inputs,
                                                          ones,
                                                          zeros,
                                                          epsilon=1e-4,
                                                          data_format="NCHW")
            outputs = array_ops.reshape(outputs, outputs_shape)
            if center and scale:
                outputs = outputs * gamma + beta
            elif center:
                outputs = outputs + beta
            elif scale:
                outputs = outputs * gamma
        else:
            # Calculate the moments on the last axis (layer activations).
            norm_axes = list(range(begin_norm_axis, inputs_rank))
            mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
            # Compute layer normalization using the batch_normalization function.
            variance_epsilon = 1e-4
            outputs = nn.batch_normalization(inputs,
                                             mean,
                                             variance,
                                             offset=beta,
                                             scale=gamma,
                                             variance_epsilon=variance_epsilon)
            outputs.set_shape(inputs_shape)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           outputs)
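A hedged usage sketch for `fused_layer_norm` above, assuming TensorFlow 1.x graph mode and that the module-level imports the function relies on are available; the tensor shape and scope name are illustrative:

```python
import tensorflow as tf  # TF 1.x assumed

# [batch, sequence, hidden]; normalize and parameterize over the last axis.
x = tf.placeholder(tf.float32, shape=[None, 128, 768])
y = fused_layer_norm(x,
                     begin_norm_axis=-1,
                     begin_params_axis=-1,
                     scope='encoder_layer_norm')
```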
Example #36
0
 def Foo(inputs):
   var = variable_scope.get_variable("var", shape=[10], dtype=dtypes.float32,
                                     initializer=init_ops.ones_initializer())
   return inputs + var
Example #37
0
def batch_normalization(
        inputs,
        axis=-1,
        momentum=0.99,
        epsilon=1e-3,
        center=True,
        scale=True,
        beta_initializer=init_ops.zeros_initializer(),
        gamma_initializer=init_ops.ones_initializer(),
        moving_mean_initializer=init_ops.zeros_initializer(),
        moving_variance_initializer=init_ops.ones_initializer(),
        beta_regularizer=None,
        gamma_regularizer=None,
        beta_constraint=None,
        gamma_constraint=None,
        training=False,
        trainable=True,
        name=None,
        reuse=None,
        renorm=False,
        renorm_clipping=None,
        renorm_momentum=0.99,
        fused=None,
        virtual_batch_size=None,
        adjustment=None):
    """Functional interface for the batch normalization layer.

  Reference: http://arxiv.org/abs/1502.03167

  "Batch Normalization: Accelerating Deep Network Training by Reducing
  Internal Covariate Shift"

  Sergey Ioffe, Christian Szegedy

  Note: when training, the moving_mean and moving_variance need to be updated.
  By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
  need to be added as a dependency to the `train_op`. For example:

  ```python
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss)
  ```

  Arguments:
    inputs: Tensor input.
    axis: An `int`, the axis that should be normalized (typically the features
      axis). For instance, after a `Convolution2D` layer with
      `data_format="channels_first"`, set `axis=1` in `BatchNormalization`.
    momentum: Momentum for the moving average.
    epsilon: Small float added to variance to avoid dividing by zero.
    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
      is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is
      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
      disabled since the scaling can be done by the next layer.
    beta_initializer: Initializer for the beta weight.
    gamma_initializer: Initializer for the gamma weight.
    moving_mean_initializer: Initializer for the moving mean.
    moving_variance_initializer: Initializer for the moving variance.
    beta_regularizer: Optional regularizer for the beta weight.
    gamma_regularizer: Optional regularizer for the gamma weight.
    beta_constraint: An optional projection function to be applied to the `beta`
        weight after being updated by an `Optimizer` (e.g. used to implement
        norm constraints or value constraints for layer weights). The function
        must take as input the unprojected variable and must return the
        projected variable (which must have the same shape). Constraints are
        not safe to use when doing asynchronous distributed training.
    gamma_constraint: An optional projection function to be applied to the
        `gamma` weight after being updated by an `Optimizer`.
    training: Either a Python boolean, or a TensorFlow boolean scalar tensor
      (e.g. a placeholder). Whether to return the output in training mode
      (normalized with statistics of the current batch) or in inference mode
      (normalized with moving statistics). **NOTE**: make sure to set this
      parameter correctly, or else your training/inference will not work
      properly.
    trainable: Boolean, if `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    name: String, the name of the layer.
    reuse: Boolean, whether to reuse the weights of a previous layer
      by the same name.
    renorm: Whether to use Batch Renormalization
      (https://arxiv.org/abs/1702.03275). This adds extra variables during
      training. The inference is the same for either value of this parameter.
    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
      scalar `Tensors` used to clip the renorm correction. The correction
      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
      dmax are set to inf, 0, inf, respectively.
    renorm_momentum: Momentum used to update the moving means and standard
      deviations with renorm. Unlike `momentum`, this affects training
      and should be neither too small (which would add noise) nor too large
      (which would give stale estimates). Note that `momentum` is still applied
      to get the means and variances for inference.
    fused: if `True`, use a faster, fused implementation if possible.
      If `None`, use the system recommended implementation.
    virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
      which means batch normalization is performed across the whole batch. When
      `virtual_batch_size` is not `None`, instead perform "Ghost Batch
      Normalization", which creates virtual sub-batches which are each
      normalized separately (with shared gamma, beta, and moving statistics).
      Must divide the actual batch size during execution.
    adjustment: A function taking the `Tensor` containing the (dynamic) shape of
      the input tensor and returning a pair (scale, bias) to apply to the
      normalized values (before gamma and beta), only during training. For
      example, if axis==-1,
        `adjustment = lambda shape: (
          tf.random_uniform(shape[-1:], 0.93, 1.07),
          tf.random_uniform(shape[-1:], -0.1, 0.1))`
      will scale the normalized value by up to 7% up or down, then shift the
      result by up to 0.1 (with independent scaling and bias for each feature
      but shared across all examples), and finally apply gamma and/or beta. If
      `None`, no adjustment is applied. Cannot be specified if
      virtual_batch_size is specified.

  Returns:
    Output tensor.

  Raises:
    ValueError: if eager execution is enabled.
  """
    layer = BatchNormalization(
        axis=axis,
        momentum=momentum,
        epsilon=epsilon,
        center=center,
        scale=scale,
        beta_initializer=beta_initializer,
        gamma_initializer=gamma_initializer,
        moving_mean_initializer=moving_mean_initializer,
        moving_variance_initializer=moving_variance_initializer,
        beta_regularizer=beta_regularizer,
        gamma_regularizer=gamma_regularizer,
        beta_constraint=beta_constraint,
        gamma_constraint=gamma_constraint,
        renorm=renorm,
        renorm_clipping=renorm_clipping,
        renorm_momentum=renorm_momentum,
        fused=fused,
        trainable=trainable,
        virtual_batch_size=virtual_batch_size,
        adjustment=adjustment,
        name=name,
        _reuse=reuse,
        _scope=name)
    return layer.apply(inputs, training=training)
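A minimal sketch of calling the functional interface above in TF 1.x graph mode, wiring the `UPDATE_OPS` dependency the docstring describes; the network, loss, and optimizer here are placeholders chosen only for illustration:

```python
import tensorflow as tf  # TF 1.x assumed

x = tf.placeholder(tf.float32, [None, 32, 32, 3])
is_training = tf.placeholder(tf.bool, name='is_training')

h = tf.layers.conv2d(x, filters=16, kernel_size=3, padding='same')
h = batch_normalization(h, axis=-1, training=is_training)
h = tf.nn.relu(h)
loss = tf.reduce_mean(tf.square(h))  # stand-in loss for illustration

# The moving statistics are updated through tf.GraphKeys.UPDATE_OPS.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
```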
Example #38
0
    def testAddVariable(self):
        obj = NonLayerTrackable()
        with self.assertRaisesRegex(ValueError, "do not specify shape"):
            trackable_utils.add_variable(obj,
                                         name="shape_specified_twice",
                                         shape=[],
                                         initializer=1)
        constant_initializer = trackable_utils.add_variable(
            obj, name="constant_initializer", initializer=1)
        with variable_scope.variable_scope("some_variable_scope"):
            ones_initializer = trackable_utils.add_variable(
                obj,
                name="ones_initializer",
                shape=[2],
                initializer=init_ops.ones_initializer(dtype=dtypes.float32))
        bare_initializer = trackable_utils.add_variable(
            obj,
            name="bare_initializer",
            shape=[2, 2],
            dtype=dtypes.float64,
            initializer=init_ops.zeros_initializer)

        # Even in graph mode, there are no naming conflicts between objects, only
        # naming conflicts within an object.
        other_duplicate = resource_variable_ops.ResourceVariable(
            name="duplicate", initial_value=1.)
        duplicate = trackable_utils.add_variable(obj,
                                                 name="duplicate",
                                                 shape=[])
        with self.assertRaisesRegex(ValueError,
                                    "'duplicate'.*already declared"):
            trackable_utils.add_variable(obj, name="duplicate", shape=[])

        self.evaluate(trackable_utils.gather_initializers(obj))
        self.assertEqual("constant_initializer:0", constant_initializer.name)
        self.assertEqual(1, self.evaluate(constant_initializer))
        self.assertEqual("some_variable_scope/ones_initializer:0",
                         ones_initializer.name)
        self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
        self.assertAllEqual([[0., 0.], [0., 0.]],
                            self.evaluate(bare_initializer))
        self.assertEqual("a_variable:0", obj.a_variable.name)
        self.assertEqual("duplicate:0", other_duplicate.name)
        if context.executing_eagerly():
            # When executing eagerly, there's no uniquification of variable names. The
            # checkpoint name will be the same.
            self.assertEqual("duplicate:0", duplicate.name)
        else:
            # The .name attribute may be globally influenced, but the checkpoint name
            # won't be (tested below).
            self.assertEqual("duplicate_1:0", duplicate.name)
        named_variables, _, _ = (
            graph_view.ObjectGraphView(obj).serialize_object_graph())
        expected_checkpoint_names = (
            "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
            "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
            "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE",
            "duplicate/.ATTRIBUTES/VARIABLE_VALUE",
            "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
        )
        six.assertCountEqual(self, expected_checkpoint_names,
                             [v.name for v in named_variables])
Example #39
0
 def _create_slots(self, var_list):
   for v in var_list:
     init_rms = init_ops.ones_initializer(dtype=v.dtype)
     self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(),
                                             v.dtype, 'rms', self._name)
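For context on the ones-initialized `rms` slot above, a small NumPy sketch (values illustrative) of the RMSProp-style update it feeds: starting the accumulator at one keeps the first steps from dividing by a near-zero root mean square:

```python
import numpy as np

decay, lr, eps = 0.9, 0.01, 1e-10
grad = np.array([0.5, -0.2], dtype=np.float32)

rms = np.ones_like(grad)                       # slot created with ones_initializer
rms = decay * rms + (1.0 - decay) * grad ** 2  # running average of squared gradients
update = lr * grad / (np.sqrt(rms) + eps)      # well-scaled even on the first step
```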
Example #40
0
def custom_layer_norm(inputs,
                      center=True,
                      scale=True,
                      activation_fn=None,
                      reuse=None,
                      outputs_collections=None,
                      trainable=True,
                      begin_norm_axis=1,
                      begin_params_axis=-1,
                      scope=None):
    """Adds a Layer Normalization layer.
  Based on the paper:
    "Layer Normalization"
    Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
    https://arxiv.org/abs/1607.06450.
  Can be used as a normalizer function for conv2d and fully_connected.
  Given a tensor `inputs` of rank `R`, moments are calculated and normalization
  is performed over axes `begin_norm_axis ... R - 1`.  Scaling and centering,
  if requested, is performed over axes `begin_params_axis .. R - 1`.
  By default, `begin_norm_axis = 1` and `begin_params_axis = -1`,
  meaning that normalization is performed over all but the first axis
  (the `HWC` if `inputs` is `NHWC`), while the `beta` and `gamma` trainable
  parameters are calculated for the rightmost axis (the `C` if `inputs` is
  `NHWC`).  Scaling and recentering is performed via broadcast of the
  `beta` and `gamma` parameters with the normalized tensor.
  The shapes of `beta` and `gamma` are `inputs.shape[begin_params_axis:]`,
  and this part of the inputs' shape must be fully defined.
  Args:
    inputs: A tensor having rank `R`. The normalization is performed over axes
      `begin_norm_axis ... R - 1` and centering and scaling parameters are
      calculated over `begin_params_axis ... R - 1`.
    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
      is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the
      next layer is linear (also e.g. `nn.relu`), this can be disabled since the
      scaling can be done by the next layer.
    activation_fn: Activation function, default set to None to skip it and
      maintain a linear activation.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    outputs_collections: Collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    begin_norm_axis: The first normalization dimension: normalization will be
      performed along dimensions `begin_norm_axis : rank(inputs)`
    begin_params_axis: The first parameter (beta, gamma) dimension: scale and
      centering parameters will have dimensions
      `begin_params_axis : rank(inputs)` and will be broadcast with the
        normalized inputs accordingly.
    scope: Optional scope for `variable_scope`.
  Returns:
    A `Tensor` representing the output of the operation, having the same
    shape and dtype as `inputs`.
  Raises:
    ValueError: If the rank of `inputs` is not known at graph build time,
      or if `inputs.shape[begin_params_axis:]` is not fully defined at
      graph build time.
  """
    with tf.compat.v1.variable_scope(scope, 'LayerNorm', [inputs],
                                     reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        inputs_shape = inputs.shape
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        if begin_norm_axis < 0:
            begin_norm_axis = inputs_rank + begin_norm_axis
        if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
            raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
                             'must be < rank(inputs) (%d)' %
                             (begin_params_axis, begin_norm_axis, inputs_rank))
        params_shape = inputs_shape[begin_params_axis:]
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
                (inputs.name, begin_params_axis, inputs_shape))
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta = model_variable('beta',
                                  shape=params_shape,
                                  dtype=dtype,
                                  initializer=init_ops.zeros_initializer(),
                                  collections=None,
                                  trainable=trainable)
        if scale:
            gamma = model_variable('gamma',
                                   shape=params_shape,
                                   dtype=dtype,
                                   initializer=init_ops.ones_initializer(),
                                   collections=None,
                                   trainable=trainable)
        # By default, compute the moments across all the dimensions except the one with index 0.
        norm_axes = list(range(begin_norm_axis, inputs_rank))
        mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
        # Compute layer normalization using the batch_normalization function.
        # Note that epsilon must be increased for float16 due to the limited
        # representable range.
        variance_epsilon = 1e-12 if dtype != dtypes.float16 else 1e-3

        devices = device_lib.list_local_devices()
        use_fused_layer_norm = any(dev.device_type == "HPU" for dev in devices)

        if use_fused_layer_norm:
            outputs, _, _ = habana_ops.habana_layer_norm(
                x=inputs,
                beta=beta,
                gamma=gamma,
                axes=tensor_util.make_tensor_proto(len(inputs.shape) - 1),
                epsilon=tensor_util.make_tensor_proto(variance_epsilon))

        else:
            outputs = nn.batch_normalization(inputs,
                                             mean,
                                             variance,
                                             offset=beta,
                                             scale=gamma,
                                             variance_epsilon=variance_epsilon)

        outputs.set_shape(inputs_shape)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return outputs
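As a rough equivalence check (TensorFlow 1.x assumed, variable names illustrative), the non-HPU branch of `custom_layer_norm` reduces to plain moments over the normalization axes followed by `nn.batch_normalization`:

```python
import tensorflow as tf  # TF 1.x assumed

x = tf.placeholder(tf.float32, [None, 512])
mean, variance = tf.nn.moments(x, axes=[1], keep_dims=True)
beta = tf.get_variable('ln_beta', shape=[512], initializer=tf.zeros_initializer())
gamma = tf.get_variable('ln_gamma', shape=[512], initializer=tf.ones_initializer())
y = tf.nn.batch_normalization(x, mean, variance,
                              offset=beta, scale=gamma, variance_epsilon=1e-12)
```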
Example #41
0
def instance_norm(inputs,
                  center=True,
                  scale=True,
                  epsilon=1e-6,
                  activation_fn=None,
                  param_initializers=None,
                  reuse=None,
                  variables_collections=None,
                  outputs_collections=None,
                  trainable=True,
                  data_format=DATA_FORMAT_NHWC,
                  scope=None):
    """Functional interface for the instance normalization layer.

  Reference: https://arxiv.org/abs/1607.08022.

    "Instance Normalization: The Missing Ingredient for Fast Stylization"
    Dmitry Ulyanov, Andrea Vedaldi, Victor Lempitsky

  Args:
    inputs: A tensor with 2 or more dimensions, where the first dimension has
      `batch_size`. The normalization is over all but the last dimension if
      `data_format` is `NHWC` and the second dimension if `data_format` is
      `NCHW`.
    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
      is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is
      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
      disabled since the scaling can be done by the next layer.
    epsilon: Small float added to variance to avoid dividing by zero.
    activation_fn: Activation function, default set to None to skip it and
      maintain a linear activation.
    param_initializers: Optional initializers for beta, gamma, moving mean and
      moving variance.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
    scope: Optional scope for `variable_scope`.

  Returns:
    A `Tensor` representing the output of the operation.

  Raises:
    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If rank or channels dimension of `inputs` is undefined.
  """
    inputs = ops.convert_to_tensor(inputs)
    inputs_shape = inputs.shape
    inputs_rank = inputs.shape.ndims

    if inputs_rank is None:
        raise ValueError('Inputs %s has undefined rank.' % inputs.name)
    if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
        raise ValueError('data_format has to be either NCHW or NHWC.')

    with variable_scope.variable_scope(scope,
                                       'InstanceNorm', [inputs],
                                       reuse=reuse) as sc:
        if data_format == DATA_FORMAT_NCHW:
            reduction_axis = 1
            # For NCHW format, rather than relying on implicit broadcasting, we
            # explicitly reshape the params to params_shape_broadcast when computing
            # the moments and the batch normalization.
            params_shape_broadcast = list([1, inputs_shape[1].value] +
                                          [1 for _ in range(2, inputs_rank)])
        else:
            reduction_axis = inputs_rank - 1
            params_shape_broadcast = None
        moments_axes = list(range(inputs_rank))
        del moments_axes[reduction_axis]
        del moments_axes[0]
        params_shape = inputs_shape[reduction_axis:reduction_axis + 1]
        if not params_shape.is_fully_defined():
            raise ValueError('Inputs %s has undefined channels dimension %s.' %
                             (inputs.name, params_shape))

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        dtype = inputs.dtype.base_dtype
        if param_initializers is None:
            param_initializers = {}
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            beta = variables.model_variable('beta',
                                            shape=params_shape,
                                            dtype=dtype,
                                            initializer=beta_initializer,
                                            collections=beta_collections,
                                            trainable=trainable)
            if params_shape_broadcast:
                beta = array_ops.reshape(beta, params_shape_broadcast)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            gamma = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=gamma_initializer,
                                             collections=gamma_collections,
                                             trainable=trainable)
            if params_shape_broadcast:
                gamma = array_ops.reshape(gamma, params_shape_broadcast)

        # Calculate the moments (instance activations).
        mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)

        # Compute instance normalization.
        outputs = nn.batch_normalization(inputs,
                                         mean,
                                         variance,
                                         beta,
                                         gamma,
                                         epsilon,
                                         name='instancenorm')
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           outputs)
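A minimal sketch (TensorFlow 1.x, NHWC input assumed) of the statistics `instance_norm` computes: moments are taken per example and per channel, i.e. over the spatial axes only:

```python
import tensorflow as tf  # TF 1.x assumed

x = tf.placeholder(tf.float32, [None, 64, 64, 32])            # NHWC
mean, variance = tf.nn.moments(x, axes=[1, 2], keep_dims=True)
y = tf.nn.batch_normalization(x, mean, variance,
                              offset=None, scale=None, variance_epsilon=1e-6)
```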
Example #42
0
def l2_normalization(inputs,
                     scaling=False,
                     scale_initializer=init_ops.ones_initializer(),
                     reuse=None,
                     variables_collections=None,
                     outputs_collections=None,
                     data_format='NHWC',
                     trainable=True,
                     scope=None):
    """Implement L2 normalization on every feature (i.e. spatial normalization).

    Should be extended in some near future to other dimensions, providing a more
    flexible normalization framework.

    Args:
      inputs: a 4-D tensor with dimensions [batch_size, height, width, channels].
      scaling: whether or not to add a post scaling operation along the dimensions
        which have been normalized.
      scale_initializer: An initializer for the weights.
      reuse: whether or not the layer and its variables should be reused. To be
        able to reuse the layer scope must be given.
      variables_collections: optional list of collections for all the variables or
        a dictionary containing a different list of collection per variable.
      outputs_collections: collection to add the outputs.
      data_format:  NHWC or NCHW data format.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_scope`.
    Returns:
      A `Tensor` representing the output of the operation.
    """

    with variable_scope.variable_scope(scope,
                                       'L2Normalization', [inputs],
                                       reuse=reuse) as sc:

        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        dtype = inputs.dtype.base_dtype
        if data_format == 'NHWC':
            params_shape = inputs_shape[-1:]
        elif data_format == 'NCHW':
            params_shape = (inputs_shape[1])

        # Normalize along spatial dimensions.
        norm_dim = tf.range(1, inputs_rank - 1)
        outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)
        # Additional scaling.
        if scaling:
            scale_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            scale = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=scale_initializer,
                                             collections=scale_collections,
                                             trainable=trainable)
            if data_format == 'NCHW':
                scale = tf.expand_dims(scale, axis=-1)
                scale = tf.expand_dims(scale, axis=-1)

            outputs = tf.multiply(outputs, scale)
        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
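A short sketch (TensorFlow 1.x assumed, shapes illustrative) of the core of `l2_normalization` above for NHWC input: unit-normalize over the spatial axes, then apply an optional per-channel scale that broadcasts over batch, height, and width:

```python
import tensorflow as tf  # TF 1.x assumed

x = tf.placeholder(tf.float32, [None, 38, 38, 512])   # e.g. a conv feature map
y = tf.nn.l2_normalize(x, [1, 2], epsilon=1e-12)       # normalize spatial dims
scale = tf.get_variable('l2_gamma', shape=[512],
                        initializer=tf.ones_initializer())
y = y * scale                                          # broadcast over N, H, W
```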
Example #43
0
def _zero_debias(unbiased_var, value, decay):
  """Compute the delta required for a debiased Variable.

  All exponential moving averages initialized with Tensors are initialized to 0,
  and therefore are biased to 0. Variables initialized to 0 and used as EMAs are
  similarly biased. This function creates the debiased update amount according
  to a scale factor, as in https://arxiv.org/abs/1412.6980.

  To demonstrate the bias that results from 0-initialization, take an EMA that
  was initialized to `0` with decay `b`. After `t` timesteps of seeing the
  constant `c`, the variable has the following value:

  ```
    EMA = 0*b^(t) + c*(1 - b)*b^(t-1) + c*(1 - b)*b^(t-2) + ...
        = c*(1 - b^t)
  ```

  To have the true value `c`, we would divide by the scale factor `1 - b^t`.

  In order to perform debiasing, we use two shadow variables. One keeps track of
  the biased estimate, and the other keeps track of the number of updates that
  have occurred.

  Args:
    unbiased_var: A Variable representing the current value of the unbiased EMA.
    value: A Tensor representing the most recent value.
    decay: A Tensor representing `1-decay` for the EMA.

  Returns:
    The amount that the unbiased variable should be updated. Computing this
    tensor will also update the shadow variables appropriately.
  """
  with variable_scope.variable_scope(
      unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope:
    with ops.colocate_with(unbiased_var):
      biased_var = variable_scope.get_variable(
          "biased",
          initializer=init_ops.zeros_initializer(
              unbiased_var.get_shape(), dtype=unbiased_var.dtype),
          trainable=False)
      # Initializing the local_step to `0` would cause problems with the
      # debiasing equation, so we instead initialize to `1`.
      local_step = variable_scope.get_variable(
          "local_step",
          shape=[], dtype=unbiased_var.dtype,
          initializer=init_ops.ones_initializer(),
          trainable=False)

      # Get an update ops for both shadow variables.
      update_biased = state_ops.assign_sub(biased_var,
                                           (biased_var - value) * decay,
                                           name=scope.name)
      update_local_step = local_step.assign_add(1)

      # Compute the value of the delta to update the unbiased EMA. Make sure to
      # use the new values of the biased variable and the local step.
      with ops.control_dependencies([update_biased, update_local_step]):
        # This function gets `1 - decay`, so use `1.0 - decay` in the exponent.
        unbiased_ema_delta = (unbiased_var - biased_var.read_value() /
                              (1 - math_ops.pow(
                                  1.0 - decay, local_step.read_value())))

      return unbiased_ema_delta
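A tiny NumPy check (illustrative values) of the identity in the docstring above: an EMA of a constant `c`, started at `0` with decay `b`, equals `c * (1 - b**t)` after `t` steps, so dividing by the scale factor `1 - b**t` recovers `c`:

```python
import numpy as np

b, c, t = 0.99, 5.0, 50
ema = 0.0
for _ in range(t):
    ema = b * ema + (1.0 - b) * c   # standard EMA update

scale_factor = 1.0 - b ** t
print(ema)                  # biased estimate: c * (1 - b**t), roughly 1.98
print(ema / scale_factor)   # debiased estimate, roughly 5.0
```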
Example #44
0
    def testPrepareFeaturesForSQSS(self):
        mode = model_fn_lib.ModeKeys.TRAIN
        seq_feature_name = 'seq_feature'
        sparse_seq_feature_name = 'wire_cast'
        ctx_feature_name = 'ctx_feature'
        input_key_column_name = 'input_key_column'
        sequence_length = 4
        embedding_dimension = 8

        features = {
            input_key_column_name:
            constant_op.constant('input0'),
            sparse_seq_feature_name:
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0],
                         [1, 1, 1], [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
            seq_feature_name:
            constant_op.constant(1.0, shape=[sequence_length]),
            ctx_feature_name:
            constant_op.constant(2.0)
        }

        labels = constant_op.constant(5.0, shape=[sequence_length])

        wire_cast = feature_column.sparse_column_with_keys(
            'wire_cast', ['marlo', 'omar', 'stringer'])
        sequence_feature_columns = [
            feature_column.real_valued_column(seq_feature_name, dimension=1),
            feature_column.embedding_column(
                wire_cast,
                dimension=embedding_dimension,
                initializer=init_ops.ones_initializer())
        ]

        context_feature_columns = [
            feature_column.real_valued_column(ctx_feature_name, dimension=1)
        ]

        expected_input_key = b'input0'

        expected_sequence = {
            ssre.RNNKeys.LABELS_KEY:
            np.array([5., 5., 5., 5.]),
            seq_feature_name:
            np.array([1., 1., 1., 1.]),
            sparse_seq_feature_name:
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0],
                         [1, 1, 1], [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
        }

        expected_context = {ctx_feature_name: 2.}

        input_key, sequence, context = ssre._prepare_features_for_sqss(
            features, labels, mode, input_key_column_name,
            sequence_feature_columns, context_feature_columns)

        def assert_equal(expected, got):
            self.assertEqual(sorted(expected), sorted(got))
            for k, v in expected.items():
                if isinstance(v, sparse_tensor.SparseTensor):
                    self.assertAllEqual(v.values.eval(), got[k].values)
                    self.assertAllEqual(v.indices.eval(), got[k].indices)
                    self.assertAllEqual(v.dense_shape.eval(),
                                        got[k].dense_shape)
                else:
                    self.assertAllEqual(v, got[k])

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            sess.run(data_flow_ops.initialize_all_tables())
            actual_input_key, actual_sequence, actual_context = sess.run(
                [input_key, sequence, context])
            self.assertEqual(expected_input_key, actual_input_key)
            assert_equal(expected_sequence, actual_sequence)
            assert_equal(expected_context, actual_context)
Example #45
0
def l2_normalization(inputs,
                     scaling=False,
                     scale_initializer=init_ops.ones_initializer(),
                     reuse=None,
                     variables_collections=None,
                     outputs_collections=None,
                     data_format='NHWC',
                     trainable=True,
                     scope=None):
    """Implement L2 normalization on every feature (i.e. spatial normalization).
        实现在每个特征图上的L2正则化
    Should be extended in some near future to other dimensions, providing a more
    flexible normalization framework.
    应该在不久的将来会被扩展到其他维度,会提供更多的正则化框架
    Args:
      inputs: a 4-D tensor with dimensions [batch_size, height, width, channels].
      scaling: whether or not to add a post scaling operation along the dimensions
        which have been normalized.
      输入:一个4D的张量带有的维度[batch_size, height, width, channels]
      规模:是否要添加一个后缩放操作在需要正则化的维度之间
      scale_initializer: An initializer for the weights.
      reuse: whether or not the layer and its variables should be reused. To be
        able to reuse the layer scope must be given.
      规模初始化:一个对于权重的初始化器
      variables_collections: optional list of collections for all the variables or
        a dictionary containing a different list of collection per variable.
      变量集合:对于所有变量或者一个字典(包含每个变量的集合的不同列表)可选择的集合列表
      outputs_collections: collection to add the outputs.
      输出集合:添加输出的集合
      data_format:  NHWC or NCHW data format.
      数据格式:NHWC 或者 NCHW 数据格式
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_scope`.
      可训练的:如果是true也可以添加变量到图集合中(GraphKeys.TRAINABLE_VARIABLES)
      作用域:对于"变量的作用域"的可选择的作用域
    Returns:
      A `Tensor` representing the output of the operation.
      一个"张量"代表操作的输出
    """

    with variable_scope.variable_scope(scope,
                                       'L2Normalization', [inputs],
                                       reuse=reuse) as sc:
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        dtype = inputs.dtype.base_dtype
        if data_format == 'NHWC':
            # norm_dim = tf.range(1, inputs_rank-1)
            norm_dim = tf.range(inputs_rank - 1, inputs_rank)
            params_shape = inputs_shape[-1:]
        elif data_format == 'NCHW':
            # norm_dim = tf.range(2, inputs_rank)
            norm_dim = tf.range(1, 2)
            params_shape = (inputs_shape[1])

        # Normalize along the channel dimension.
        outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)
        # Additional scaling.
        if scaling:
            scale_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            scale = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=scale_initializer,
                                             collections=scale_collections,
                                             trainable=trainable)
            if data_format == 'NHWC':
                outputs = tf.multiply(outputs, scale)
            elif data_format == 'NCHW':
                scale = tf.expand_dims(scale, axis=-1)
                scale = tf.expand_dims(scale, axis=-1)
                outputs = tf.multiply(outputs, scale)
                # outputs = tf.transpose(outputs, perm=(0, 2, 3, 1))

        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
Example #46
0
def group_norm(inputs,
               groups=32,
               channels_axis=-1,
               reduction_axes=(-3, -2),
               center=True,
               scale=True,
               epsilon=1e-6,
               activation_fn=None,
               param_initializers=None,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               scope=None,
               mean_close_to_zero=False):
    """Functional interface for the group normalization layer.

  Reference: https://arxiv.org/abs/1803.08494.

    "Group Normalization", Yuxin Wu, Kaiming He

  Args:
    inputs: A Tensor with at least 2 dimensions, one of which is channels. All
      shape dimensions must be fully defined.
    groups: Integer. Divide the channels into this number of groups over which
      normalization statistics are computed. This number must be commensurate
      with the number of channels in `inputs`.
    channels_axis: An integer. Specifies the index of the channels axis, which
      will be broken into `groups`, each of which has its statistics computed
      across the reduction axes. Must be mutually exclusive with
      `reduction_axes`. Preferred usage is to specify negative integers to be
      agnostic as to whether a batch dimension is included.
    reduction_axes: Tuple of integers. Specifies dimensions over which
       statistics will be accumulated. Must be mutually exclusive with
       `channels_axis`. Statistics will not be accumulated across axes not
       specified in `reduction_axes` nor `channel_axis`. Preferred usage is to
       specify negative integers to be agnostic to whether a batch dimension is
       included.

      Some sample usage cases:
        NHWC format: channels_axis=-1, reduction_axes=[-3, -2]
        NCHW format: channels_axis=-3, reduction_axes=[-2, -1]

    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
      is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is
      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
      disabled since the scaling can be done by the next layer.
    epsilon: Small float added to variance to avoid dividing by zero.
    activation_fn: Activation function, default set to None to skip it and
      maintain a linear activation.
    param_initializers: Optional initializers for beta, gamma, moving mean and
      moving variance.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    scope: Optional scope for `variable_scope`.
    mean_close_to_zero: The mean of `input` before ReLU will be close to zero
      when batch size >= 4k for Resnet-50 on TPU. If `True`, use
      `nn.sufficient_statistics` and `nn.normalize_moments` to calculate the
      variance. This is the same behavior as `fused` equals `True` in batch
      normalization. If `False`, use `nn.moments` to calculate the variance.
      When `mean` is close to zero, like 1e-4, using `mean` to calculate the
      variance may give poor results due to repeated roundoff error and
      denormalization in `mean`. When `mean` is large, like 1e2,
      sum(`input`^2) is so large that only the high-order digits of the elements
      are being accumulated. Thus, using sum((`input` - `mean`)^2)/n to calculate
      the variance gives better accuracy than (sum(`input`^2)/n - `mean`^2)
      when `mean` is large.


  Returns:
    A `Tensor` representing the output of the operation.

  Raises:
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If rank or channels dimension of `inputs` is undefined.
    ValueError: If number of groups is not commensurate with number of channels.
    ValueError: If reduction_axes or channels_axis are out of bounds.
    ValueError: If reduction_axes are not mutually exclusive with channels_axis.
  """
    # TODO(shlens): Support partially defined shapes for the inputs.
    inputs = ops.convert_to_tensor(inputs)
    original_shape = inputs.shape

    if inputs.shape.ndims is None:
        raise ValueError('Inputs %s has undefined rank.' % inputs.name)
    if channels_axis > (inputs.shape.ndims - 1):
        raise ValueError('Axis is out of bounds.')

    # Standardize the channels_axis to be positive and identify # of channels.
    if channels_axis < 0:
        channels_axis = inputs.shape.ndims + channels_axis
    channels = inputs.shape[channels_axis].value

    if channels is None:
        raise ValueError('Inputs %s has undefined channel dimension: %d.' %
                         (inputs.name, channels_axis))

    # Standardize the reduction_axes to be positive.
    reduction_axes = list(reduction_axes)
    for i in range(len(reduction_axes)):
        if reduction_axes[i] < 0:
            reduction_axes[i] += inputs.shape.ndims

    for a in reduction_axes:
        if a > inputs.shape.ndims:
            raise ValueError('Axis is out of bounds.')
        if inputs.shape[a].value is None:
            raise ValueError('Inputs %s has undefined dimensions %d.' %
                             (inputs.name, a))
        if channels_axis == a:
            raise ValueError('reduction_axis must be mutually exclusive '
                             'with channels_axis')
    if groups > channels:
        raise ValueError('Invalid groups %d for %d channels.' %
                         (groups, channels))
    if channels % groups != 0:
        raise ValueError('%d channels is not commensurate with %d groups.' %
                         (channels, groups))

    # Determine axes before channels. Some examples of common image formats:
    #  'NCHW': before = [N], after = [HW]
    #  'NHWC': before = [NHW], after = []
    axes_before_channels = inputs.shape.as_list()[:channels_axis]
    axes_after_channels = inputs.shape.as_list()[channels_axis + 1:]

    # Manually broadcast the parameters to conform to the number of groups.
    params_shape_broadcast = ([1] * len(axes_before_channels) +
                              [groups, channels // groups] +
                              [1] * len(axes_after_channels))

    # Reshape the input by the group within the channel dimension.
    inputs_shape = (axes_before_channels + [groups, channels // groups] +
                    axes_after_channels)
    inputs = array_ops.reshape(inputs, inputs_shape)

    # Determine the dimensions across which moments are calculated.
    moments_axes = [channels_axis + 1]
    for a in reduction_axes:
        if a > channels_axis:
            moments_axes.append(a + 1)
        else:
            moments_axes.append(a)

    with variable_scope.variable_scope(scope,
                                       'GroupNorm', [inputs],
                                       reuse=reuse) as sc:
        # Note that the params_shape is the number of channels always.
        params_shape = [channels]

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        dtype = inputs.dtype.base_dtype
        if param_initializers is None:
            param_initializers = {}
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            beta = variables.model_variable('beta',
                                            shape=params_shape,
                                            dtype=dtype,
                                            initializer=beta_initializer,
                                            collections=beta_collections,
                                            trainable=trainable)
            beta = array_ops.reshape(beta, params_shape_broadcast)

        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            gamma = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=gamma_initializer,
                                             collections=gamma_collections,
                                             trainable=trainable)
            gamma = array_ops.reshape(gamma, params_shape_broadcast)

        # Calculate the moments.
        if mean_close_to_zero:
            # One pass algorithm returns better result when mean is close to zero.
            counts, means_ss, variance_ss, _ = nn.sufficient_statistics(
                inputs, moments_axes, keep_dims=True)
            mean, variance = nn.normalize_moments(counts,
                                                  means_ss,
                                                  variance_ss,
                                                  shift=None)
        else:
            mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)

        # Compute normalization.
        # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor
        # appropriately so that this operation may be faster.
        gain = math_ops.rsqrt(variance + epsilon)
        offset = -mean * gain
        if gamma is not None:
            gain *= gamma
            offset *= gamma
        if beta is not None:
            offset += beta
        outputs = inputs * gain + offset

        # Collapse the groups into the channel dimension.
        outputs = array_ops.reshape(outputs, original_shape)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           outputs)
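A condensed sketch (TensorFlow 1.x, a fully defined NHWC shape assumed; names illustrative) of the reshape-and-normalize idea `group_norm` implements above: split the channels into groups, take moments over the spatial axes and the within-group channels, normalize, and reshape back:

```python
import tensorflow as tf  # TF 1.x assumed

groups = 4
x = tf.placeholder(tf.float32, [8, 16, 16, 32])              # N, H, W, C
n, h, w, c = x.shape.as_list()

g = tf.reshape(x, [n, h, w, groups, c // groups])            # split C into groups
mean, variance = tf.nn.moments(g, axes=[1, 2, 4], keep_dims=True)
g = (g - mean) * tf.rsqrt(variance + 1e-6)                   # normalize each group
y = tf.reshape(g, [n, h, w, c])                              # collapse groups back
```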
Example #47
0
def custom_batch_norm(inputs,
                      decay=0.999,
                      center=True,
                      scale=False,
                      epsilon=0.001,
                      activation_fn=None,
                      param_initializers=None,
                      param_regularizers=None,
                      updates_collections=ops.GraphKeys.UPDATE_OPS,
                      is_training=True,
                      reuse=None,
                      variables_collections=None,
                      outputs_collections=None,
                      trainable=True,
                      batch_weights=None,
                      data_format='NHWC',
                      zero_debias_moving_mean=False,
                      scope=None,
                      renorm=False,
                      renorm_clipping=None,
                      renorm_decay=0.99,
                      noise_std=None):
    """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.

      "Batch Normalization: Accelerating Deep Network Training by Reducing
      Internal Covariate Shift"

      Sergey Ioffe, Christian Szegedy

    Can be used as a normalizer function for conv2d and fully_connected.

    Note: when training, the moving_mean and moving_variance need to be updated.
    By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
    need to be added as a dependency to the `train_op`. For example:

    ```python
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss)
    ```

    One can set updates_collections=None to force the updates in place, but that
    can have a speed penalty, especially in distributed settings.

    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension has
        `batch_size`. The normalization is over all but the last dimension if
        `data_format` is `NHWC` and the second dimension if `data_format` is
        `NCHW`.
      decay: Decay for the moving average. Reasonable values for `decay` are close
        to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc.
        Lower `decay` value (recommend trying `decay`=0.9) if model experiences
        reasonably good training performance but poor validation and/or test
        performance. Try zero_debias_moving_mean=True for improved stability.
      center: If True, add offset of `beta` to normalized tensor. If False, `beta`
        is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is
        not used. When the next layer is linear (also e.g. `nn.relu`), this can be
        disabled since the scaling can be done by the next layer.
      epsilon: Small float added to variance to avoid dividing by zero.
      activation_fn: Activation function, default set to None to skip it and
        maintain a linear activation.
      param_initializers: Optional initializers for beta, gamma, moving mean and
        moving variance.
      param_regularizers: Optional regularizer for beta and gamma.
      updates_collections: Collections to collect the update ops for computation.
        The updates_ops need to be executed with the train_op.
        If None, a control dependency would be added to make sure the updates are
        computed in place.
      is_training: Whether or not the layer is in training mode. In training mode
        it would accumulate the statistics of the moments into `moving_mean` and
        `moving_variance` using an exponential moving average with the given
        `decay`. When it is not in training mode then it would use the values of
        the `moving_mean` and the `moving_variance`.
      reuse: Whether or not the layer and its variables should be reused. To be
        able to reuse the layer scope must be given.
      variables_collections: Optional collections for the variables.
      outputs_collections: Collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      batch_weights: An optional tensor of shape `[batch_size]`,
        containing a frequency weight for each batch item. If present,
        then the batch normalization uses weighted mean and
        variance. (This can be used to correct for bias in training
        example selection.)
      data_format: A string. `NHWC` (default) and `NCHW` are supported.
      zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
        pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
      scope: Optional scope for `variable_scope`.
      renorm: Whether to use Batch Renormalization
        (https://arxiv.org/abs/1702.03275). This adds extra variables during
        training. The inference is the same for either value of this parameter.
      renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
        scalar `Tensors` used to clip the renorm correction. The correction
        `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
        `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
        dmax are set to inf, 0, inf, respectively.
      renorm_decay: Momentum used to update the moving means and standard
        deviations with renorm. Unlike `momentum`, this affects training
        and should be neither too small (which would add noise) nor too large
        (which would give stale estimates). Note that `decay` is still applied
        to get the means and variances for inference.

    Returns:
      A `Tensor` representing the output of the operation.

    Raises:
      ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
      ValueError: If the rank of `inputs` is undefined.
      ValueError: If rank or channels dimension of `inputs` is undefined.
    """

    layer_variable_getter = slim.layers._build_variable_getter()
    with variable_scope.variable_scope(
            scope,
            'BatchNorm', [inputs],
            reuse=reuse,
            custom_getter=layer_variable_getter) as sc:
        inputs = ops.convert_to_tensor(inputs)

        # Determine whether we can use the core layer class.
        if (batch_weights is None
                and updates_collections is ops.GraphKeys.UPDATE_OPS
                and not zero_debias_moving_mean):
            # Use the core layer class.
            axis = 1 if data_format == 'NCHW' else -1
            if not param_initializers:
                param_initializers = {}
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            moving_mean_initializer = param_initializers.get(
                'moving_mean', init_ops.zeros_initializer())
            moving_variance_initializer = param_initializers.get(
                'moving_variance', init_ops.ones_initializer())
            if not param_regularizers:
                param_regularizers = {}
            beta_regularizer = param_regularizers.get('beta')
            gamma_regularizer = param_regularizers.get('gamma')
            layer = normalization_layers.BatchNormalization(
                axis=axis,
                momentum=decay,
                epsilon=epsilon,
                center=center,
                scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                moving_mean_initializer=moving_mean_initializer,
                moving_variance_initializer=moving_variance_initializer,
                beta_regularizer=beta_regularizer,
                gamma_regularizer=gamma_regularizer,
                trainable=trainable,
                renorm=renorm,
                renorm_clipping=renorm_clipping,
                renorm_momentum=renorm_decay,
                name=sc.name,
                _scope=sc,
                _reuse=reuse)
            outputs = layer.apply(inputs, training=is_training)

            # Add variables to collections.
            slim.layers._add_variable_to_collections(layer.moving_mean,
                                                     variables_collections,
                                                     'moving_mean')
            slim.layers._add_variable_to_collections(layer.moving_variance,
                                                     variables_collections,
                                                     'moving_variance')
            if layer.beta is not None:
                slim.layers._add_variable_to_collections(
                    layer.beta, variables_collections, 'beta')
            if layer.gamma is not None:
                slim.layers._add_variable_to_collections(
                    layer.gamma, variables_collections, 'gamma')

            if activation_fn is not None:
                outputs = activation_fn(outputs)
            return utils.collect_named_outputs(outputs_collections,
                                               sc.original_name_scope, outputs)

        # Not supported by layer class: batch_weights argument,
        # and custom updates_collections. In that case, use the legacy BN
        # implementation.
        # Custom updates collections are not supported because the update logic
        # is different in this case, in particular w.r.t. "forced updates" and
        # update op reuse.
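        # For illustration (a rough sketch, not an exhaustive list): calls such
        # as batch_norm(..., updates_collections=None) or
        # batch_norm(..., batch_weights=w) fall through to this legacy path.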
        if renorm:
            raise ValueError('renorm is not supported with batch_weights, '
                             'updates_collections or zero_debias_moving_mean')
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        if batch_weights is not None:
            batch_weights = ops.convert_to_tensor(batch_weights)
            inputs_shape[0:1].assert_is_compatible_with(
                batch_weights.get_shape())
            # Reshape batch weight values so they broadcast across inputs.
            nshape = [-1] + [1 for _ in range(inputs_rank - 1)]
            batch_weights = array_ops.reshape(batch_weights, nshape)

        if data_format == 'NCHW':
            moments_axes = [0] + list(range(2, inputs_rank))
            params_shape = inputs_shape[1:2]
            # For NCHW format, rather than relying on implicit broadcasting, we
            # explicitly reshape the params to params_shape_broadcast when computing
            # the moments and the batch normalization.
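            # For example, NCHW inputs of shape [N, C, H, W] give
            # params_shape_broadcast = [1, C, 1, 1].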
            params_shape_broadcast = list([1, inputs_shape[1].value] +
                                          [1 for _ in range(2, inputs_rank)])
        else:
            moments_axes = list(range(inputs_rank - 1))
            params_shape = inputs_shape[-1:]
            params_shape_broadcast = None
        if not params_shape.is_fully_defined():
            raise ValueError('Inputs %s has undefined channels dimension %s.' %
                             (inputs.name, params_shape))

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if not param_initializers:
            param_initializers = {}
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            beta = variables.model_variable('beta',
                                            shape=params_shape,
                                            dtype=dtype,
                                            initializer=beta_initializer,
                                            collections=beta_collections,
                                            trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            gamma = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=gamma_initializer,
                                             collections=gamma_collections,
                                             trainable=trainable)

        # Create moving_mean and moving_variance variables and add them to the
        # appropriate collections. We disable variable partitioning while creating
        # them, because assign_moving_average is not yet supported for partitioned
        # variables.
        partitioner = variable_scope.get_variable_scope().partitioner
        try:
            variable_scope.get_variable_scope().set_partitioner(None)
            moving_mean_collections = utils.get_variable_collections(
                variables_collections, 'moving_mean')
            moving_mean_initializer = param_initializers.get(
                'moving_mean', init_ops.zeros_initializer())
            moving_mean = variables.model_variable(
                'moving_mean',
                shape=params_shape,
                dtype=dtype,
                initializer=moving_mean_initializer,
                trainable=False,
                collections=moving_mean_collections)
            moving_variance_collections = utils.get_variable_collections(
                variables_collections, 'moving_variance')
            moving_variance_initializer = param_initializers.get(
                'moving_variance', init_ops.ones_initializer())
            moving_variance = variables.model_variable(
                'moving_variance',
                shape=params_shape,
                dtype=dtype,
                initializer=moving_variance_initializer,
                trainable=False,
                collections=moving_variance_collections)
        finally:
            variable_scope.get_variable_scope().set_partitioner(partitioner)

        # If `is_training` doesn't have a constant value (because it is a
        # `Tensor`, a `Variable` or a `Placeholder`) then `is_training_value`
        # will be None and `need_moments` will be True.
        is_training_value = utils.constant_value(is_training)
        need_moments = is_training_value is None or is_training_value
        if need_moments:
            # Calculate the moments based on the individual batch.
            if batch_weights is None:
                if data_format == 'NCHW':
                    mean, variance = nn.moments(inputs,
                                                moments_axes,
                                                keep_dims=True)
                    mean = array_ops.reshape(mean, [-1])
                    variance = array_ops.reshape(variance, [-1])
                else:
                    mean, variance = nn.moments(inputs, moments_axes)
            else:
                if data_format == 'NCHW':
                    mean, variance = nn.weighted_moments(inputs,
                                                         moments_axes,
                                                         batch_weights,
                                                         keep_dims=True)
                    mean = array_ops.reshape(mean, [-1])
                    variance = array_ops.reshape(variance, [-1])
                else:
                    mean, variance = nn.weighted_moments(
                        inputs, moments_axes, batch_weights)

            moving_vars_fn = lambda: (moving_mean, moving_variance)
            if updates_collections is None:

                def _force_updates():
                    """Internal function forces updates moving_vars if is_training."""
                    update_moving_mean = moving_averages.assign_moving_average(
                        moving_mean,
                        mean,
                        decay,
                        zero_debias=zero_debias_moving_mean)
                    update_moving_variance = moving_averages.assign_moving_average(
                        moving_variance, variance, decay, zero_debias=False)
                    with ops.control_dependencies(
                        [update_moving_mean, update_moving_variance]):
                        return array_ops.identity(mean), array_ops.identity(
                            variance)

                mean, variance = utils.smart_cond(is_training, _force_updates,
                                                  moving_vars_fn)
            else:

                def _delay_updates():
                    """Internal function that delay updates moving_vars if is_training."""
                    update_moving_mean = moving_averages.assign_moving_average(
                        moving_mean,
                        mean,
                        decay,
                        zero_debias=zero_debias_moving_mean)
                    update_moving_variance = moving_averages.assign_moving_average(
                        moving_variance, variance, decay, zero_debias=False)
                    return update_moving_mean, update_moving_variance

                update_mean, update_variance = utils.smart_cond(
                    is_training, _delay_updates, moving_vars_fn)
                ops.add_to_collections(updates_collections, update_mean)
                ops.add_to_collections(updates_collections, update_variance)
                # Use computed moments during training and moving_vars otherwise.
                vars_fn = lambda: (mean, variance)
                mean, variance = utils.smart_cond(is_training, vars_fn,
                                                  moving_vars_fn)
        else:
            mean, variance = moving_mean, moving_variance
        if data_format == 'NCHW':
            mean = array_ops.reshape(mean, params_shape_broadcast)
            variance = array_ops.reshape(variance, params_shape_broadcast)
            if beta is not None:
                beta = array_ops.reshape(beta, params_shape_broadcast)
            if gamma is not None:
                gamma = array_ops.reshape(gamma, params_shape_broadcast)

        # Compute batch_normalization.
        outputs = batch_normalization(inputs,
                                      mean,
                                      variance,
                                      beta,
                                      gamma,
                                      epsilon,
                                      noise_std=noise_std)
        outputs.set_shape(inputs_shape)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
Example #48
0
class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase):

  def _assert_all_close(self, expected, actual, atol=0.001):
    if not context.executing_eagerly():
      with self.cached_session() as sess:
        keras_backend._initialize_variables(sess)
        self.assertAllClose(expected, actual, atol=atol)
    else:
      self.assertAllClose(expected, actual, atol=atol)

  @test_util.run_in_graph_and_eager_modes()
  def test_invalid_output_dim(self):
    with self.assertRaisesRegexp(
        ValueError, r'`output_dim` should be a positive integer. Given: -3.'):
      _ = kernel_layers.RandomFourierFeatures(output_dim=-3, scale=2.0)

  @test_util.run_in_graph_and_eager_modes()
  def test_unsupported_kernel_type(self):
    with self.assertRaisesRegexp(
        ValueError, r'Unsupported kernel type: \'unsupported_kernel\'.'):
      _ = kernel_layers.RandomFourierFeatures(
          3, 'unsupported_kernel', stddev=2.0)

  @test_util.run_in_graph_and_eager_modes()
  def test_invalid_scale(self):
    with self.assertRaisesRegexp(
        ValueError,
        r'When provided, `scale` should be a positive float. Given: 0.0.'):
      _ = kernel_layers.RandomFourierFeatures(output_dim=10, scale=0.0)

  @test_util.run_in_graph_and_eager_modes()
  def test_invalid_input_shape(self):
    inputs = random_ops.random_uniform((3, 2, 4), seed=1)
    rff_layer = kernel_layers.RandomFourierFeatures(output_dim=10, scale=3.0)
    with self.assertRaisesRegexp(
        ValueError,
        r'The rank of the input tensor should be 2. Got 3 instead.'):
      _ = rff_layer.apply(inputs)

  @parameterized.named_parameters(
      ('gaussian', 'gaussian', 10.0, False),
      ('random', init_ops.random_uniform_initializer, 1.0, True))
  @test_util.run_in_graph_and_eager_modes()
  def test_random_features_properties(self, initializer, scale, trainable):
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=10,
        kernel_initializer=initializer,
        scale=scale,
        trainable=trainable)
    self.assertEqual(rff_layer.output_dim, 10)
    self.assertEqual(rff_layer.kernel_initializer, initializer)
    self.assertEqual(rff_layer.scale, scale)
    self.assertEqual(rff_layer.trainable, trainable)

  @parameterized.named_parameters(('gaussian', 'gaussian', False),
                                  ('laplacian', 'laplacian', True),
                                  ('other', init_ops.ones_initializer, True))
  @test_util.run_in_graph_and_eager_modes()
  def test_call(self, initializer, trainable):
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=10,
        kernel_initializer=initializer,
        scale=1.0,
        trainable=trainable,
        name='random_fourier_features')
    inputs = random_ops.random_uniform((3, 2), seed=1)
    outputs = rff_layer(inputs)
    self.assertListEqual([3, 10], outputs.shape.as_list())
    num_trainable_vars = 1 if trainable else 0
    self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)
    if not context.executing_eagerly():
      self.assertLen(
          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
          num_trainable_vars)

  @test_util.assert_no_new_pyobjects_executing_eagerly
  def test_no_eager_leak(self):
    # Tests that repeatedly constructing and building a Layer does not leak
    # Python objects.
    inputs = random_ops.random_uniform((5, 4), seed=1)
    kernel_layers.RandomFourierFeatures(output_dim=4, name='rff')(inputs)
    kernel_layers.RandomFourierFeatures(output_dim=10, scale=2.0)(inputs)

  @test_util.run_in_graph_and_eager_modes()
  def test_output_shape(self):
    inputs = random_ops.random_uniform((3, 2), seed=1)
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=7, name='random_fourier_features', trainable=True)
    outputs = rff_layer(inputs)
    self.assertEqual([3, 7], outputs.shape.as_list())

  @parameterized.named_parameters(
      ('gaussian', 'gaussian'), ('laplacian', 'laplacian'),
      ('other', init_ops.random_uniform_initializer))
  @test_util.run_deprecated_v1
  def test_call_on_placeholder(self, initializer):
    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, None])
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=5,
        kernel_initializer=initializer,
        name='random_fourier_features')
    with self.assertRaisesRegexp(
        ValueError, r'The last dimension of the inputs to '
        '`RandomFourierFeatures` should be defined. Found `None`.'):
      rff_layer(inputs)

    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None])
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=5,
        kernel_initializer=initializer,
        name='random_fourier_features')
    with self.assertRaisesRegexp(
        ValueError, r'The last dimension of the inputs to '
        '`RandomFourierFeatures` should be defined. Found `None`.'):
      rff_layer(inputs)

    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, 3])
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=5, name='random_fourier_features')
    rff_layer(inputs)

  @parameterized.named_parameters(('gaussian', 10, 'gaussian', 2.0),
                                  ('laplacian', 5, 'laplacian', None),
                                  ('other', 10, init_ops.ones_initializer, 1.0))
  @test_util.run_in_graph_and_eager_modes()
  def test_compute_output_shape(self, output_dim, initializer, scale):
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim, initializer, scale=scale, name='rff')
    with self.assertRaises(ValueError):
      rff_layer.compute_output_shape(tensor_shape.TensorShape(None))
    with self.assertRaises(ValueError):
      rff_layer.compute_output_shape(tensor_shape.TensorShape([]))
    with self.assertRaises(ValueError):
      rff_layer.compute_output_shape(tensor_shape.TensorShape([3]))
    with self.assertRaises(ValueError):
      rff_layer.compute_output_shape(tensor_shape.TensorShape([3, 2, 3]))

    with self.assertRaisesRegexp(
        ValueError, r'The innermost dimension of input shape must be defined.'):
      rff_layer.compute_output_shape(tensor_shape.TensorShape([3, None]))

    self.assertEqual([None, output_dim],
                     rff_layer.compute_output_shape((None, 3)).as_list())
    self.assertEqual([None, output_dim],
                     rff_layer.compute_output_shape(
                         tensor_shape.TensorShape([None, 2])).as_list())
    self.assertEqual([4, output_dim],
                     rff_layer.compute_output_shape((4, 1)).as_list())

  @parameterized.named_parameters(
      ('gaussian', 10, 'gaussian', 3.0, False),
      ('laplacian', 5, 'laplacian', 5.5, True),
      ('other', 7, init_ops.random_uniform_initializer(), None, True))
  @test_util.run_in_graph_and_eager_modes()
  def test_get_config(self, output_dim, initializer, scale, trainable):
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim,
        initializer,
        scale=scale,
        trainable=trainable,
        name='random_fourier_features',
    )
    expected_initializer = initializer
    if isinstance(initializer, init_ops.Initializer):
      expected_initializer = initializers.serialize(initializer)

    expected_config = {
        'output_dim': output_dim,
        'kernel_initializer': expected_initializer,
        'scale': scale,
        'name': 'random_fourier_features',
        'trainable': trainable,
        'dtype': None,
    }
    self.assertLen(expected_config, len(rff_layer.get_config()))
    self.assertSameElements(
        list(expected_config.items()), list(rff_layer.get_config().items()))

  @parameterized.named_parameters(
      ('gaussian', 5, 'gaussian', None, True),
      ('laplacian', 5, 'laplacian', 5.5, False),
      ('other', 7, init_ops.ones_initializer(), 2.0, True))
  @test_util.run_in_graph_and_eager_modes()
  def test_from_config(self, output_dim, initializer, scale, trainable):
    model_config = {
        'output_dim': output_dim,
        'kernel_initializer': initializer,
        'scale': scale,
        'trainable': trainable,
        'name': 'random_fourier_features',
    }
    rff_layer = kernel_layers.RandomFourierFeatures.from_config(model_config)
    self.assertEqual(rff_layer.output_dim, output_dim)
    self.assertEqual(rff_layer.kernel_initializer, initializer)
    self.assertEqual(rff_layer.scale, scale)
    self.assertEqual(rff_layer.trainable, trainable)

    inputs = random_ops.random_uniform((3, 2), seed=1)
    outputs = rff_layer(inputs)
    self.assertListEqual([3, output_dim], outputs.shape.as_list())
    num_trainable_vars = 1 if trainable else 0
    self.assertLen(rff_layer.trainable_variables, num_trainable_vars)
    if trainable:
      self.assertEqual('random_fourier_features/random_features_scale:0',
                       rff_layer.trainable_variables[0].name)
    self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)
    if not context.executing_eagerly():
      self.assertLen(
          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
          num_trainable_vars)

  @parameterized.named_parameters(
      ('gaussian', 10, 'gaussian', 3.0, True),
      ('laplacian', 5, 'laplacian', 5.5, False),
      ('other', 10, init_ops.random_uniform_initializer(), None, True))
  @test_util.run_in_graph_and_eager_modes()
  def test_same_random_features_params_reused(self, output_dim, initializer,
                                              scale, trainable):
    """Applying the layer on the same input twice gives the same output."""
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=output_dim,
        kernel_initializer=initializer,
        scale=scale,
        trainable=trainable,
        name='random_fourier_features')
    inputs = constant_op.constant(
        np.random.uniform(low=-1.0, high=1.0, size=(2, 4)))
    output1 = rff_layer.apply(inputs)
    output2 = rff_layer.apply(inputs)
    self._assert_all_close(output1, output2)

  @parameterized.named_parameters(
      ('gaussian', 'gaussian', 5.0), ('laplacian', 'laplacian', 3.0),
      ('other', init_ops.random_uniform_initializer(), 5.0))
  @test_util.run_in_graph_and_eager_modes()
  def test_different_params_similar_approximation(self, initializer, scale):
    random_seed.set_random_seed(12345)
    rff_layer1 = kernel_layers.RandomFourierFeatures(
        output_dim=3000,
        kernel_initializer=initializer,
        scale=scale,
        name='rff1')
    rff_layer2 = kernel_layers.RandomFourierFeatures(
        output_dim=2000,
        kernel_initializer=initializer,
        scale=scale,
        name='rff2')
    # Two distinct inputs.
    x = constant_op.constant([[1.0, -1.0, 0.5]])
    y = constant_op.constant([[-1.0, 1.0, 1.0]])

    # Apply both layers to both inputs.
    output_x1 = math.sqrt(2.0 / 3000.0) * rff_layer1.apply(x)
    output_y1 = math.sqrt(2.0 / 3000.0) * rff_layer1.apply(y)
    output_x2 = math.sqrt(2.0 / 2000.0) * rff_layer2.apply(x)
    output_y2 = math.sqrt(2.0 / 2000.0) * rff_layer2.apply(y)

    # Compute the inner products of the outputs (on inputs x and y) for both
    # layers. For any fixed random features layer rff_layer, and inputs x, y,
    # rff_layer(x)^T * rff_layer(y) ~= K(x,y) up to a normalization factor.
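    # As a rough sketch of the standard random Fourier features construction
    # (assumed here, not read off this layer's internals): z(x) = cos(W x + b),
    # with the rows of W drawn i.i.d. from the kernel's spectral distribution
    # and b ~ Uniform[0, 2*pi], so that (2 / output_dim) * <z(x), z(y)> is an
    # unbiased Monte Carlo estimate of K(x, y).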
    approx_kernel1 = kernelized_utils.inner_product(output_x1, output_y1)
    approx_kernel2 = kernelized_utils.inner_product(output_x2, output_y2)
    self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.08)

  @parameterized.named_parameters(
      ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)),
      ('laplacian', 'laplacian', 20.0, _exact_laplacian(stddev=20.0)))
  @test_util.run_in_graph_and_eager_modes()
  def test_bad_kernel_approximation(self, initializer, scale, exact_kernel_fn):
    """Approximation is bad when output dimension is small."""
    # Two distinct inputs.
    x = constant_op.constant([[1.0, -1.0, 0.5]])
    y = constant_op.constant([[-1.0, 1.0, 1.0]])

    small_output_dim = 10
    random_seed.set_random_seed(1234)
    # Initialize layer.
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=small_output_dim,
        kernel_initializer=initializer,
        scale=scale,
        name='random_fourier_features')

    # Apply layer to both inputs.
    output_x = math.sqrt(2.0 / small_output_dim) * rff_layer.apply(x)
    output_y = math.sqrt(2.0 / small_output_dim) * rff_layer.apply(y)

    # The inner products of the outputs (on inputs x and y) approximates the
    # real value of the RBF kernel but poorly since the output dimension of the
    # layer is small.
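    # (As a rule of thumb, the Monte Carlo error of the approximation decays
    # roughly as O(1 / sqrt(output_dim)), so a small output dimension is
    # expected to give a noticeably inaccurate estimate.)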
    exact_kernel_value = exact_kernel_fn(x, y)
    approx_kernel_value = kernelized_utils.inner_product(output_x, output_y)
    abs_error = math_ops.abs(exact_kernel_value - approx_kernel_value)
    if not context.executing_eagerly():
      with self.cached_session() as sess:
        keras_backend._initialize_variables(sess)
        abs_error_eval = sess.run([abs_error])
        self.assertGreater(abs_error_eval[0][0], 0.05)
        self.assertLess(abs_error_eval[0][0], 0.5)
    else:
      self.assertGreater(abs_error, 0.05)
      self.assertLess(abs_error, 0.5)

  @parameterized.named_parameters(
      ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)),
      ('laplacian', 'laplacian', 10.0, _exact_laplacian(stddev=10.0)))
  @test_util.run_in_graph_and_eager_modes()
  def test_good_kernel_approximation_multiple_inputs(self, initializer, scale,
                                                     exact_kernel_fn):
    # Parameters.
    input_dim = 5
    output_dim = 2000
    x_rows = 20
    y_rows = 30

    x = constant_op.constant(
        np.random.uniform(size=(x_rows, input_dim)), dtype=dtypes.float32)
    y = constant_op.constant(
        np.random.uniform(size=(y_rows, input_dim)), dtype=dtypes.float32)

    random_seed.set_random_seed(1234)
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=output_dim,
        kernel_initializer=initializer,
        scale=scale,
        name='random_fourier_features')

    # The shapes of output_x and output_y are (x_rows, output_dim) and
    # (y_rows, output_dim) respectively.
    output_x = math.sqrt(2.0 / output_dim) * rff_layer.apply(x)
    output_y = math.sqrt(2.0 / output_dim) * rff_layer.apply(y)

    approx_kernel_matrix = kernelized_utils.inner_product(output_x, output_y)
    exact_kernel_matrix = exact_kernel_fn(x, y)
    self._assert_all_close(approx_kernel_matrix, exact_kernel_matrix, atol=0.05)