def __init__(self, axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True, beta_initializer=init_ops.zeros_initializer(), gamma_initializer=init_ops.ones_initializer(), moving_mean_initializer=init_ops.zeros_initializer(), moving_variance_initializer=init_ops.ones_initializer(), beta_regularizer=None, gamma_regularizer=None, trainable=True, name=None, **kwargs): super(BatchNormalization, self).__init__( name=name, trainable=trainable, **kwargs) self.axis = axis self.momentum = momentum self.epsilon = epsilon self.center = center self.scale = scale self.beta_initializer = beta_initializer self.gamma_initializer = gamma_initializer self.moving_mean_initializer = moving_mean_initializer self.moving_variance_initializer = moving_variance_initializer self.beta_regularizer = beta_regularizer self.gamma_regularizer = gamma_regularizer
def __init__(self, axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True, beta_initializer=init_ops.zeros_initializer(), gamma_initializer=init_ops.ones_initializer(), moving_mean_initializer=init_ops.zeros_initializer(), moving_variance_initializer=init_ops.ones_initializer(), beta_regularizer=None, gamma_regularizer=None, beta_constraint=None, gamma_constraint=None, renorm=False, renorm_clipping=None, renorm_momentum=0.99, fused=None, trainable=True, virtual_batch_size=None, adjustment=None, name=None, **kwargs): super(BatchNormalization, self).__init__( name=name, trainable=trainable, **kwargs) if isinstance(axis, list): self.axis = axis[:] else: self.axis = axis self.momentum = momentum self.epsilon = epsilon self.center = center self.scale = scale self.beta_initializer = beta_initializer self.gamma_initializer = gamma_initializer self.moving_mean_initializer = moving_mean_initializer self.moving_variance_initializer = moving_variance_initializer self.beta_regularizer = beta_regularizer self.gamma_regularizer = gamma_regularizer self.beta_constraint = beta_constraint self.gamma_constraint = gamma_constraint self.renorm = renorm self.virtual_batch_size = virtual_batch_size self.adjustment = adjustment if fused is None: fused = True self.fused = fused self._bessels_correction_test_only = True if renorm: renorm_clipping = renorm_clipping or {} keys = ['rmax', 'rmin', 'dmax'] if set(renorm_clipping) - set(keys): raise ValueError('renorm_clipping %s contains keys not in %s' % (renorm_clipping, keys)) self.renorm_clipping = renorm_clipping self.renorm_momentum = renorm_momentum
def __init__(self, axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True, beta_initializer=init_ops.zeros_initializer(), gamma_initializer=init_ops.ones_initializer(), moving_mean_initializer=init_ops.zeros_initializer(), moving_variance_initializer=init_ops.ones_initializer(), beta_regularizer=None, gamma_regularizer=None, beta_constraint=None, gamma_constraint=None, renorm=False, renorm_clipping=None, renorm_momentum=0.99, fused=None, trainable=True, name=None, **kwargs): super(BatchNormalization, self).__init__( name=name, trainable=trainable, **kwargs) self.axis = axis self.momentum = momentum self.epsilon = epsilon self.center = center self.scale = scale self.beta_initializer = beta_initializer self.gamma_initializer = gamma_initializer self.moving_mean_initializer = moving_mean_initializer self.moving_variance_initializer = moving_variance_initializer self.beta_regularizer = beta_regularizer self.gamma_regularizer = gamma_regularizer self.beta_constraint = beta_constraint self.gamma_constraint = gamma_constraint self.renorm = renorm # This environment variable is only used during the testing period of fused # batch norm and will be removed after that. if fused is None: fused = _FUSED_DEFAULT self.fused = fused self._bessels_correction_test_only = True if renorm: renorm_clipping = renorm_clipping or {} keys = ['rmax', 'rmin', 'dmax'] if set(renorm_clipping) - set(keys): raise ValueError('renorm_clipping %s contains keys not in %s' % (renorm_clipping, keys)) self.renorm_clipping = renorm_clipping self.renorm_momentum = renorm_momentum
def __init__(self, axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True, beta_initializer=init_ops.zeros_initializer(), gamma_initializer=init_ops.ones_initializer(), moving_mean_initializer=init_ops.zeros_initializer(), moving_variance_initializer=init_ops.ones_initializer(), beta_regularizer=None, gamma_regularizer=None, renorm=False, renorm_clipping=None, renorm_momentum=0.99, fused=False, trainable=True, name=None, **kwargs): super(BatchNormalization, self).__init__( name=name, trainable=trainable, **kwargs) self.axis = axis self.momentum = momentum self.epsilon = epsilon self.center = center self.scale = scale self.beta_initializer = beta_initializer self.gamma_initializer = gamma_initializer self.moving_mean_initializer = moving_mean_initializer self.moving_variance_initializer = moving_variance_initializer self.beta_regularizer = beta_regularizer self.gamma_regularizer = gamma_regularizer self.renorm = renorm self.fused = fused if self.fused and renorm: raise ValueError( 'Batch renorm is currently not supported with fused batch norm.') if self.fused and (beta_regularizer is not None or gamma_regularizer is not None): raise ValueError('Regularizers are not currently ' 'supported for fused batch norm.') if renorm: renorm_clipping = renorm_clipping or {} keys = ['rmax', 'rmin', 'dmax'] if set(renorm_clipping) - set(keys): raise ValueError('renorm_clipping %s contains keys not in %s' % (renorm_clipping, keys)) self.renorm_clipping = renorm_clipping self.renorm_momentum = renorm_momentum
def testControlDepsNone(self): with self.test_session() as session: c = constant_op.constant(1.0) with ops.control_dependencies([c]): # d get the control dependency. d = constant_op.constant(2.0) # Partitioned variables do not. var_x = variable_scope.get_variable( "x", shape=[2], initializer=init_ops.ones_initializer(), partitioner=partitioned_variables.variable_axis_size_partitioner(4)) ops_before_read = session.graph.get_operations() var_x.as_tensor() # Caches the ops for subsequent reads. reading_ops = [ op for op in session.graph.get_operations() if op not in ops_before_read ] self.assertEqual([c.op], d.op.control_inputs) # Tests that no control dependencies are added to reading a partitioned # variable which is similar to reading a variable. for op in reading_ops: self.assertEqual([], op.control_inputs)
def testEagerExecution(self): with context.eager_mode(): container = variable_scope.EagerVariableStore() x = constant_op.constant([[2.0]]) with container.as_default(): y = core_layers.dense( x, 1, name='my_dense', kernel_initializer=init_ops.ones_initializer()) self.assertAllEqual(y, [[2.0]]) self.assertEqual(len(container.variables()), 2) # Recreate the layer to test reuse. with container.as_default(): core_layers.dense( x, 1, name='my_dense', kernel_initializer=init_ops.ones_initializer()) self.assertEqual(len(container.variables()), 2)
def testOnesInitializer(self): with self.test_session(use_gpu=True): shape = [2, 3] x = variable_scope.get_variable( "x", shape=shape, initializer=init_ops.ones_initializer()) x.initializer.run() self.assertAllEqual(x.eval(), np.ones(shape))
def testVariableCreationInALoop(self): """Tests the variable created inside a loop can be used outside the loop.""" with self.test_session(): with variable_scope.variable_scope("ascope") as scope: def Body(i, _): var_x = variable_scope.get_variable( "x", shape=[2], initializer=init_ops.ones_initializer(), partitioner=partitioned_variables.variable_axis_size_partitioner( 4)) return (i + 1, var_x.as_tensor()) cond = lambda i, _: i < 2 _, x = control_flow_ops.while_loop( cond, Body, (0, constant_op.constant([7, 8], dtypes.float32))) variables.global_variables_initializer().run() self.assertAllClose([1.0, 1.0], x.eval()) scope.reuse_variables() var_x = variable_scope.get_variable( "x", shape=[2], initializer=init_ops.ones_initializer(), partitioner=partitioned_variables.variable_axis_size_partitioner(4)) self.assertAllClose([1.0, 1.0], var_x.as_tensor().eval())
def Foo(inputs): var = variable_scope.get_variable( "var", shape=[10], dtype=dtypes.float32, initializer=init_ops.ones_initializer()) return inputs + var
def build(self, inputs_shape): # Call the build method of the parent class. super(MaskedBasicLSTMCell, self).build(inputs_shape) self.built = False input_depth = inputs_shape[1].value h_depth = self._num_units self._mask = self.add_variable( name="mask", shape=[input_depth + h_depth, 4 * h_depth], initializer=init_ops.ones_initializer(), trainable=False, dtype=self.dtype) self._threshold = self.add_variable( name="threshold", shape=[], initializer=init_ops.zeros_initializer(), trainable=False, dtype=self.dtype) # Add masked_weights in the weights namescope so as to make it easier # for the quantization library to add quant ops. self._masked_kernel = math_ops.multiply(self._mask, self._kernel, core_layers.MASKED_WEIGHT_NAME) if self._mask not in ops.get_collection_ref(core_layers.MASK_COLLECTION): ops.add_to_collection(core_layers.MASK_COLLECTION, self._mask) ops.add_to_collection(core_layers.MASKED_WEIGHT_COLLECTION, self._masked_kernel) ops.add_to_collection(core_layers.THRESHOLD_COLLECTION, self._threshold) ops.add_to_collection(core_layers.WEIGHT_COLLECTION, self._kernel) self.built = True
def Body(i, _): var_x = variable_scope.get_variable( "x", shape=[2], initializer=init_ops.ones_initializer(), partitioner=partitioned_variables.variable_axis_size_partitioner( 4)) return (i + 1, var_x.as_tensor())
def _create_slots(self, var_list): for v in var_list: init_rms = init_ops.ones_initializer(dtype=v.dtype) self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(), v.dtype, "rms", self._name) if self._centered: self._zeros_slot(v, "mg", self._name) self._zeros_slot(v, "momentum", self._name)
def __init__(self, axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True, beta_initializer=init_ops.zeros_initializer(), gamma_initializer=init_ops.ones_initializer(), moving_mean_initializer=init_ops.zeros_initializer(), moving_variance_initializer=init_ops.ones_initializer(), beta_regularizer=None, gamma_regularizer=None, beta_constraint=None, gamma_constraint=None, renorm=False, renorm_clipping=None, renorm_momentum=0.99, fused=None, trainable=True, virtual_batch_size=None, adjustment=None, name=None, **kwargs): super(BatchNormalization, self).__init__( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, moving_mean_initializer=moving_mean_initializer, moving_variance_initializer=moving_variance_initializer, beta_regularizer=beta_regularizer, gamma_regularizer=gamma_regularizer, beta_constraint=beta_constraint, gamma_constraint=gamma_constraint, renorm=renorm, renorm_clipping=renorm_clipping, renorm_momentum=renorm_momentum, fused=fused, trainable=trainable, virtual_batch_size=virtual_batch_size, adjustment=adjustment, name=name, **kwargs)
def testAddVariable(self): obj = NonLayerCheckpointable() with self.assertRaisesRegexp(ValueError, "do not specify shape"): checkpointable_utils.add_variable( obj, name="shape_specified_twice", shape=[], initializer=1) constant_initializer = checkpointable_utils.add_variable( obj, name="constant_initializer", initializer=1) with variable_scope.variable_scope("some_variable_scope"): ones_initializer = checkpointable_utils.add_variable( obj, name="ones_initializer", shape=[2], initializer=init_ops.ones_initializer(dtype=dtypes.float32)) bare_initializer = checkpointable_utils.add_variable( obj, name="bare_initializer", shape=[2, 2], dtype=dtypes.float64, initializer=init_ops.zeros_initializer) # Even in graph mode, there are no naming conflicts between objects, only # naming conflicts within an object. other_duplicate = resource_variable_ops.ResourceVariable( name="duplicate", initial_value=1.) duplicate = checkpointable_utils.add_variable( obj, name="duplicate", shape=[]) with self.assertRaisesRegexp(ValueError, "'duplicate' already exists"): checkpointable_utils.add_variable(obj, name="duplicate", shape=[]) if context.in_graph_mode(): self.evaluate(variables.global_variables_initializer()) self.assertEqual("constant_initializer:0", constant_initializer.name) self.assertEqual(1, self.evaluate(constant_initializer)) self.assertEqual("some_variable_scope/ones_initializer:0", ones_initializer.name) self.assertAllEqual([1, 1], self.evaluate(ones_initializer)) self.assertAllEqual([[0., 0.], [0., 0.]], self.evaluate(bare_initializer)) self.assertEqual("a_variable:0", obj.a_variable.name) self.assertEqual("duplicate:0", other_duplicate.name) if context.in_graph_mode(): # The .name attribute may be globally influenced, but the checkpoint name # won't be (tested below). self.assertEqual("duplicate_1:0", duplicate.name) else: # When executing eagerly, there's no uniquification of variable names. The # checkpoint name will be the same. self.assertEqual("duplicate:0", duplicate.name) named_variables, _ = checkpointable_utils._serialize_object_graph(obj) expected_checkpoint_names = ( "a_variable/.ATTRIBUTES/VARIABLE_VALUE", "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE", "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE", "duplicate/.ATTRIBUTES/VARIABLE_VALUE", "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE", ) six.assertCountEqual( self, expected_checkpoint_names, named_variables.keys())
def _create_variable_statistics_object(self): """Creates non-trainable variables representing input statistics.""" series_start_moments = Moments( mean=variable_scope.get_variable( name="series_start_mean", shape=[self._num_features], dtype=self._dtype, initializer=init_ops.zeros_initializer(), trainable=False), variance=variable_scope.get_variable( name="series_start_variance", shape=[self._num_features], dtype=self._dtype, initializer=init_ops.ones_initializer(), trainable=False)) overall_feature_moments = Moments( mean=variable_scope.get_variable( name="overall_feature_mean", shape=[self._num_features], dtype=self._dtype, initializer=init_ops.zeros_initializer(), trainable=False), variance=variable_scope.get_variable( name="overall_feature_var", shape=[self._num_features], dtype=self._dtype, initializer=init_ops.ones_initializer(), trainable=False)) start_time = variable_scope.get_variable( name="start_time", dtype=dtypes.int64, initializer=init_ops.zeros_initializer(), shape=[], trainable=False) total_observation_count = variable_scope.get_variable( name="total_observation_count", shape=[], dtype=dtypes.int64, initializer=init_ops.ones_initializer(), trainable=False) return InputStatistics( series_start_moments=series_start_moments, overall_feature_moments=overall_feature_moments, start_time=start_time, total_observation_count=total_observation_count)
def testLSTMLayer(self): # Run with all-0 weights, no padding. o = self._RunLSTMLayer('zeros', init_ops.zeros_initializer(), 0., 0., 0.) self.assertAllClose(o, [[[0.]] * self._batch_size] * 3) o = self._RunLSTMLayer('zeros', init_ops.zeros_initializer(), 0., 1., 0.) self.assertAllClose(o, [[[.25]] * self._batch_size, [[.125]] * self._batch_size, [[.0625]] * self._batch_size]) o = self._RunLSTMLayer('zeros', init_ops.zeros_initializer(), 1., 0., 0.) self.assertAllClose(o, [[[0.]] * self._batch_size] * 3) o = self._RunLSTMLayer('zeros', init_ops.zeros_initializer(), 1., 1., 0.) self.assertAllClose(o, [[[.25]] * self._batch_size, [[.125]] * self._batch_size, [[.0625]] * self._batch_size]) # Run with all-1 weights, no padding. weight1 = 1. for m_init in [0., 1.]: for c_init in [0., 1.]: o = self._RunLSTMLayer('ones', init_ops.ones_initializer(), m_init, c_init, 0.) m0 = self._NextM(self._inputs, weight1, m_init, c_init) c0 = self._NextC(self._inputs, weight1, m_init, c_init) self.assertAllClose(o[0], m0) m1 = self._NextM(self._inputs, weight1, m0, c0) c1 = self._NextC(self._inputs, weight1, m0, c0) self.assertAllClose(o[1], m1) m2 = self._NextM(self._inputs, weight1, m1, c1) self.assertAllClose(o[2], m2) # Run with random weights. for weight in np.random.rand(3): weight_tf = constant_op.constant(weight, dtypes.float32) random_weight = lambda shape, w=weight_tf: array_ops.fill(shape, w) # No padding. for m_init in [0., 1.]: for c_init in [0., 1.]: o = self._RunLSTMLayer('random', random_weight, m_init, c_init, 0.) m0 = self._NextM(self._inputs, weight, m_init, c_init) c0 = self._NextC(self._inputs, weight, m_init, c_init) self.assertAllClose(o[0], m0) m1 = self._NextM(self._inputs, weight, m0, c0) c1 = self._NextC(self._inputs, weight, m0, c0) self.assertAllClose(o[1], m1) m2 = self._NextM(self._inputs, weight, m1, c1) self.assertAllClose(o[2], m2) # Set padding. o = self._RunLSTMLayer('random', random_weight, 0., 0., 1.) self.assertAllClose(o, [[[0.]] * self._batch_size] * 3) o = self._RunLSTMLayer('random', random_weight, 0., 1., 1.) self.assertAllClose(o, [[[0.]] * self._batch_size] * 3) o = self._RunLSTMLayer('random', random_weight, 1., 0., 1.) self.assertAllClose(o, [[[1.]] * self._batch_size] * 3) o = self._RunLSTMLayer('random', random_weight, 1., 1., 1.) self.assertAllClose(o, [[[1.]] * self._batch_size] * 3)
def testGPU(self): with self.test_session(use_gpu=True) as sess: abc = variable_scope.get_variable( "abc", shape=[1], initializer=init_ops.ones_initializer(), use_resource=True) sess.run(variables.global_variables_initializer()) print(sess.run(abc))
def l2_normalization( inputs, scaling=False, scale_initializer=init_ops.ones_initializer(), reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None): """Implement L2 normalization on every feature (i.e. spatial normalization). Should be extended in some near future to other dimensions, providing a more flexible normalization framework. inputs: a 4-D tensor with dimensions [batch_size, height, width, channels]. scaling: whether or not to add a post scaling operation along the dimensions which have been normalized. scale_initializer: An initializer for the weights. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: optional list of collections for all the variables or a dictionary containing a different list of collection per variable. outputs_collections: collection to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for `variable_scope`. Returns: A `Tensor` representing the output of the operation. """ with variable_scope.variable_scope( scope, 'L2Normalization', [inputs], reuse=reuse) as sc: inputs_shape = inputs.get_shape() inputs_rank = inputs_shape.ndims params_shape = inputs_shape[-1:] dtype = inputs.dtype.base_dtype # Normalize along spatial dimensions. norm_dim = tf.range(1, inputs_rank-1) outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12) # Additional scaling. if scaling: scale_collections = utils.get_variable_collections( variables_collections, 'scale') scale = variables.model_variable('gamma', shape=params_shape, dtype=dtype, initializer=scale_initializer, collections=scale_collections, trainable=trainable) outputs = tf.multiply(outputs, scale) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs)
def _create_vars(self, var_list, state): for v in var_list: if v.get_shape().is_fully_defined(): init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype) else: init_rms = array_ops.ones_like(v) state.create_slot_with_initializer(v, init_rms, v.get_shape(), v.dtype.base_dtype, "rms") if self._centered: state.zeros_slot(v, "mg") state.zeros_slot(v, "momentum")
def build(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) if tensor_shape.dimension_value(input_shape[-1]) is None: raise ValueError('The last dimension of the inputs to `Dense` ' 'should be defined. Found `None`.') self.input_spec = base.InputSpec( min_ndim=2, axes={-1: tensor_shape.dimension_value(input_shape[-1])}) self.kernel = self.add_variable( 'kernel', shape=[tensor_shape.dimension_value(input_shape[-1]), self.units], initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, dtype=self.dtype, trainable=True) self.mask = self.add_variable( name='mask', shape=[tensor_shape.dimension_value(input_shape[-1]), self.units], initializer=init_ops.ones_initializer(), trainable=False, dtype=self.dtype) self.threshold = self.add_variable( name='threshold', shape=[], initializer=init_ops.zeros_initializer(), trainable=False, dtype=self.dtype) # Add masked_weights in the weights namescope so as to make it easier # for the quantization library to add quant ops. self.masked_kernel = math_ops.multiply(self.mask, self.kernel, MASKED_WEIGHT_NAME) ops.add_to_collection(MASK_COLLECTION, self.mask) ops.add_to_collection(MASKED_WEIGHT_COLLECTION, self.masked_kernel) ops.add_to_collection(THRESHOLD_COLLECTION, self.threshold) ops.add_to_collection(WEIGHT_COLLECTION, self.kernel) if self.use_bias: self.bias = self.add_variable( 'bias', shape=[ self.units, ], initializer=self.bias_initializer, regularizer=self.bias_regularizer, dtype=self.dtype, trainable=True) else: self.bias = None self.built = True
def build(self, input_shape): """Creates scale variable if use_scale==True.""" if self.use_scale: self.scale = self.add_weight( name='scale', shape=(), initializer=init_ops.ones_initializer(), dtype=self.dtype, trainable=True) else: self.scale = None super(Attention, self).build(input_shape)
def testGPU(self): with self.test_session(use_gpu=True) as sess: abc = variable_scope.get_variable( "abc", shape=[1], initializer=init_ops.ones_initializer(), use_resource=True) sess.run(variables.global_variables_initializer()) self.assertEqual( resource_variable_ops.var_is_initialized_op(abc.handle).eval(), True) print(sess.run(abc))
def build(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) channel_axis = 1 if self.data_format == 'channels_first' else -1 if tensor_shape.dimension_value(input_shape[channel_axis]) is None: raise ValueError('The channel dimension of the inputs ' 'should be defined. Found `None`.') input_dim = tensor_shape.dimension_value(input_shape[channel_axis]) kernel_shape = self.kernel_size + (input_dim, self.filters) self.mask = self.add_variable( name='mask', shape=kernel_shape, initializer=init_ops.ones_initializer(), trainable=False, dtype=self.dtype) self.kernel = self.add_variable( name='kernel', shape=kernel_shape, initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, trainable=True, dtype=self.dtype) self.threshold = self.add_variable( name='threshold', shape=[], initializer=init_ops.zeros_initializer(), trainable=False, dtype=self.dtype) # Add masked_weights in the weights namescope so as to make it easier # for the quantization library to add quant ops. self.masked_kernel = math_ops.multiply(self.mask, self.kernel, MASKED_WEIGHT_NAME) ops.add_to_collection(MASK_COLLECTION, self.mask) ops.add_to_collection(MASKED_WEIGHT_COLLECTION, self.masked_kernel) ops.add_to_collection(THRESHOLD_COLLECTION, self.threshold) ops.add_to_collection(WEIGHT_COLLECTION, self.kernel) if self.use_bias: self.bias = self.add_variable( name='bias', shape=(self.filters,), initializer=self.bias_initializer, regularizer=self.bias_regularizer, trainable=True, dtype=self.dtype) else: self.bias = None self.input_spec = base.InputSpec( ndim=self.rank + 2, axes={channel_axis: input_dim}) self.built = True
def _create_slots(self, var_list): for v in var_list: if v.get_shape().is_fully_defined(): init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype) else: init_rms = array_ops.ones_like(v) self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(), v.dtype.base_dtype, "rms", self._name) if self._centered: self._zeros_slot(v, "mg", self._name) self._zeros_slot(v, "momentum", self._name)
def testGPU(self): with test_util.use_gpu(): abc = variable_scope.get_variable( "abc", shape=[1], initializer=init_ops.ones_initializer(), use_resource=True) self.evaluate(variables.global_variables_initializer()) self.assertEqual( self.evaluate( resource_variable_ops.var_is_initialized_op(abc.handle)), True)
def testFunctionalDenseInitializerFromScope(self): with self.test_session() as sess: with variable_scope.variable_scope( 'scope', initializer=init_ops.ones_initializer()): inputs = random_ops.random_uniform((5, 3), seed=1) core_layers.dense(inputs, 2) sess.run(variables.global_variables_initializer()) weights = sess.run(variables.trainable_variables()) self.assertEqual(len(weights), 2) # Check that the matrix weights got initialized to ones (from scope). self.assertAllClose(weights[0], np.ones((3, 2))) # Check that the bias still got initialized to zeros. self.assertAllClose(weights[1], np.zeros((2)))
def testFunctionalDenseInitializerFromScope(self): with variable_scope.variable_scope( 'scope', initializer=init_ops.ones_initializer()): inputs = random_ops.random_uniform((5, 3), seed=1) core_layers.dense(inputs, 2) if context.in_graph_mode(): self.evaluate(variables.global_variables_initializer()) weights = variables.trainable_variables() self.assertEqual(len(weights), 2) # Check that the matrix weights got initialized to ones (from scope). self.assertAllClose( self.evaluate(weights[0].read_value()), np.ones((3, 2))) # Check that the bias still got initialized to zeros. self.assertAllClose(self.evaluate(weights[1].read_value()), np.zeros((2)))
def testFunctionalDenseInitializerFromScope(self): with variable_scope.variable_scope( 'scope', initializer=init_ops.ones_initializer()), self.test_session(): inputs = random_ops.random_uniform((5, 3), seed=1) core_layers.dense(inputs, 2) variables.global_variables_initializer().run() weights = _get_variable_dict_from_varstore() self.assertEqual(len(weights), 2) # Check that the matrix weights got initialized to ones (from scope). self.assertAllClose(weights['scope/dense/kernel'].read_value().eval(), np.ones((3, 2))) # Check that the bias still got initialized to zeros. self.assertAllClose(weights['scope/dense/bias'].read_value().eval(), np.zeros((2)))
def testLayerInDefun(self): conv = convolutional.Conv2D( filters=1, kernel_size=2, kernel_initializer=init_ops.ones_initializer(), bias_initializer=init_ops.zeros_initializer()) @function.defun def model(x): return conv(x) x = array_ops.ones([1, 2, 2, 1]) y = model(x) self.assertAllEqual([[[[4.0]]]], y.numpy())
def testPrepareInputsForRnnSparseAndDense(self): num_unroll = 2 embedding_dimension = 8 dense_dimension = 2 expected = [ np.array([[1., 1., 1., 1., 1., 1., 1., 1., 111., 112.], [1., 1., 1., 1., 1., 1., 1., 1., 211., 212.], [1., 1., 1., 1., 1., 1., 1., 1., 311., 312.]]), np.array([[1., 1., 1., 1., 1., 1., 1., 1., 121., 122.], [2., 2., 2., 2., 2., 2., 2., 2., 221., 222.], [1., 1., 1., 1., 1., 1., 1., 1., 321., 322.]]) ] sequence_features = { 'wire_cast': sparse_tensor.SparseTensor( indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1], [2, 0, 0], [2, 1, 1]], values=[ b'marlo', b'stringer', b'omar', b'stringer', b'marlo', b'marlo', b'omar' ], dense_shape=[3, 2, 2]), 'seq_feature0': constant_op.constant([[[111., 112.], [121., 122.]], [[211., 212.], [221., 222.]], [[311., 312.], [321., 322.]]]) } wire_cast = feature_column.sparse_column_with_keys( 'wire_cast', ['marlo', 'omar', 'stringer']) wire_cast_embedded = feature_column.embedding_column( wire_cast, dimension=embedding_dimension, combiner='sum', initializer=init_ops.ones_initializer()) seq_feature0_column = feature_column.real_valued_column( 'seq_feature0', dimension=dense_dimension) sequence_feature_columns = [seq_feature0_column, wire_cast_embedded] context_features = None self._test_prepare_inputs_for_rnn(sequence_features, context_features, sequence_feature_columns, num_unroll, expected)
def batch_normalization(inputs, axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True, beta_initializer=init_ops.zeros_initializer(), gamma_initializer=init_ops.ones_initializer(), moving_mean_initializer=init_ops.zeros_initializer(), moving_variance_initializer=init_ops.ones_initializer(), beta_regularizer=None, gamma_regularizer=None, training=False, trainable=True, name=None, reuse=None, renorm=False, renorm_clipping=None, renorm_momentum=0.99): """Functional interface for the batch normalization layer. Reference: http://arxiv.org/abs/1502.03167 "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" Sergey Ioffe, Christian Szegedy Note: the operations which update the `moving_mean` and `moving_variance` variables will not be added as dependencies of your training operation and so must be run separately. For example: ``` extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) sess.run([train_op, extra_update_ops], ...) ``` Alternatively, add the operations as a dependency to your training operation manually, and then just run your training operation as normal: ``` extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(extra_update_ops): train_op = optimizer.minimize(loss) ... sess.run([train_op], ...) ``` Arguments: inputs: Tensor input. axis: Integer, the axis that should be normalized (typically the features axis). For instance, after a `Convolution2D` layer with `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. momentum: Momentum for the moving average. epsilon: Small float added to variance to avoid dividing by zero. center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. beta_initializer: Initializer for the beta weight. gamma_initializer: Initializer for the gamma weight. moving_mean_initializer: Initializer for the moving mean. moving_variance_initializer: Initializer for the moving variance. beta_regularizer: Optional regularizer for the beta weight. gamma_regularizer: Optional regularizer for the gamma weight. training: Either a Python boolean, or a TensorFlow boolean scalar tensor (e.g. a placeholder). Whether to return the output in training mode (normalized with statistics of the current batch) or in inference mode (normalized with moving statistics). **NOTE**: make sure to set this parameter correctly, or else your training/inference will not work properly. trainable: Boolean, if `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). name: String, the name of the layer. reuse: Boolean, whether to reuse the weights of a previous layer by the same name. renorm: Whether to use Batch Renormalization (https://arxiv.org/abs/1702.03275). This adds extra variables during training. The inference is the same for either value of this parameter. renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to scalar `Tensors` used to clip the renorm correction. The correction `(r, d)` is used as `corrected_value = normalized_value * r + d`, with `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin, dmax are set to inf, 0, inf, respectively. renorm_momentum: Momentum used to update the moving means and standard deviations with renorm. Unlike `momentum`, this affects training and should be neither too small (which would add noise) nor too large (which would give stale estimates). Note that `momentum` is still applied to get the means and variances for inference. Returns: Output tensor. """ layer = BatchNormalization( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, moving_mean_initializer=moving_mean_initializer, moving_variance_initializer=moving_variance_initializer, beta_regularizer=beta_regularizer, gamma_regularizer=gamma_regularizer, trainable=trainable, renorm=renorm, renorm_clipping=renorm_clipping, renorm_momentum=renorm_momentum, name=name, _reuse=reuse, _scope=name) return layer.apply(inputs, training=training)
def testBatchNormsMatchFwdBwdSomeOnShard0SomeOnShard1(self): with ops.device("/device:IPU:0"): x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2]) with variable_scope.variable_scope("vs", use_resource=True): with tu.ipu_shard(0): y = convolutional.conv2d( x, 2, 1, use_bias=False, kernel_initializer=init_ops.ones_initializer(), name='conv1') y = layers_norm.batch_normalization(y, fused=True, training=True) y = convolutional.conv2d( y, 2, 1, use_bias=False, kernel_initializer=init_ops.ones_initializer(), name='conv2') y = layers_norm.batch_normalization(y, fused=True, training=True) with tu.ipu_shard(1): y = convolutional.conv2d( y, 2, 1, use_bias=False, kernel_initializer=init_ops.ones_initializer(), name='conv3') y = layers_norm.batch_normalization(y, fused=True, training=True) loss = math_ops.reduce_sum(y) optimizer = gradient_descent.GradientDescentOptimizer(0.1) train = optimizer.minimize(loss) with ops.device('cpu'): report = gen_ipu_ops.ipu_event_trace() tu.configure_ipu_system(True, True, True, sharded=True) with tu.ipu_session() as sess: sess.run(variables.global_variables_initializer()) sess.run(report) sess.run([train, loss], {x: np.zeros([1, 4, 4, 2])}) result = sess.run(report) s = tu.extract_all_strings_from_event_trace(result) cs_list = tu.get_compute_sets_from_report(s) # Two BN for forwards (on shards 0 and 1) and two BN for grad # (note that we don't cache gradient application) ok = [ '__seed*', '*OnTileCopy*', 'Copy_', 'vs/conv1/Conv2D/convolution.*/Conv_1x1', 'vs/conv3/Conv2D/convolution.*/Conv_1x1', 'vs/batch_normalization/FusedBatchNorm/batch-norm-training.*/', 'vs/batch_normalization_2/FusedBatchNorm/batch-norm-training.*/', 'Sum/reduce.*/ReduceFinalStage/IntermediateToOutput/Reduce', 'gradients/vs/batch_normalization_2/FusedBatchNorm_grad/FusedBatchNormGrad/batch-norm-grad.*/', 'gradients/vs/batch_normalization_1/FusedBatchNorm_grad/FusedBatchNormGrad/batch-norm-grad.*/', 'GradientDescent/update_vs/batch_normalization/', 'GradientDescent/update_vs/batch_normalization_1/', 'GradientDescent/update_vs/batch_normalization_2/', 'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Conv_4x4', 'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo', 'gradients/vs/conv1/Conv2D_grad/Conv2DBackpropFilter/fusion*/Conv_4x4', 'gradients/vs/conv1/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo' ] self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def conv_layer_norm(inputs, bptr, nclass=None, center=True, scale=True, activation_fn=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None): """Adds a Layer Normalization layer from https://arxiv.org/abs/1607.06450. "Layer Normalization" Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton Can be used as a normalizer function for conv2d and fully_connected. Args: inputs: a tensor with 2 or more dimensions. The normalization occurs over all but the first dimension. center: If True, subtract `beta`. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. activation_fn: activation function, default set to None to skip it and maintain a linear activation. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: optional collections for the variables. outputs_collections: collections to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for `variable_scope`. Returns: A `Tensor` representing the output of the operation. Raises: ValueError: if rank or last dimension of `inputs` is undefined. """ with variable_scope.variable_scope(scope, 'LayerNorm', [inputs], reuse=reuse) as sc: inputs = ops.convert_to_tensor(inputs) inputs_shape = inputs.get_shape() inputs_rank = inputs_shape.ndims if inputs_rank is None: raise ValueError('Inputs %s has undefined rank.' % inputs.name) dtype = inputs.dtype.base_dtype axis = list(range(1, inputs_rank)) params_shape = inputs_shape[-1:] if not params_shape.is_fully_defined(): raise ValueError('Inputs %s has undefined last dimension %s.' % (inputs.name, params_shape)) # Allocate parameters for the beta and gamma of the normalization. beta, gamma = None, None if center: beta_collections = utils.get_variable_collections( variables_collections, 'beta') beta = variables.model_variable( 'beta', shape=params_shape, dtype=dtype, initializer=init_ops.zeros_initializer, collections=beta_collections, trainable=trainable) if scale: gamma_collections = utils.get_variable_collections( variables_collections, 'gamma') gamma = variables.model_variable( 'gamma', shape=params_shape, dtype=dtype, initializer=init_ops.ones_initializer(), collections=gamma_collections, trainable=trainable) # Calculate the moments on the last axis (layer activations). if nclass is not None: x = bptr / nclass fm = tf.exp(x - x**2) else: fm = tf.exp(bptr) fm_sum = tf.reduce_sum(fm, [1, 2]) fm_sum = tf.expand_dims(tf.expand_dims(fm_sum, 1), 2) norm_weight = tf.div(fm, fm_sum) mean = tf.mul(norm_weight, inputs) mean = tf.reduce_sum(mean, [1, 2]) mean = tf.expand_dims(tf.expand_dims(mean, 1), 2) #print (mean.get_shape().as_list()) variance = tf.sub(inputs, mean) variance = tf.square(variance) variance = tf.mul(norm_weight, variance) variance = nn.moments(variance, [1, 2], keep_dims=True)[1] #print (variance.get_shape().as_list()) #mean, variance = nn.moments(inputs, axis, keep_dims=True) # Compute layer normalization using the batch_normalization function. variance_epsilon = 1E-12 outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma, variance_epsilon) outputs.set_shape(inputs_shape) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs)
def __init__(self, reference_batch, axis=-1, epsilon=1e-3, center=True, scale=True, beta_initializer=init_ops.zeros_initializer(), gamma_initializer=init_ops.ones_initializer(), beta_regularizer=None, gamma_regularizer=None, trainable=True, name=None, batch_axis=0): """Initialize virtual batch normalization object. We precompute the 'mean' and 'mean squared' of the reference batch, so that `__call__` is efficient. This means that the axis must be supplied when the object is created, not when it is called. We precompute 'square mean' instead of 'variance', because the square mean can be easily adjusted on a per-example basis. Args: reference_batch: A minibatch tensors. This will form the reference data from which the normalization statistics are calculated. See https://arxiv.org/abs/1606.03498 for more details. axis: Integer, the axis that should be normalized (typically the features axis). For instance, after a `Convolution2D` layer with `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. epsilon: Small float added to variance to avoid dividing by zero. center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. beta_initializer: Initializer for the beta weight. gamma_initializer: Initializer for the gamma weight. beta_regularizer: Optional regularizer for the beta weight. gamma_regularizer: Optional regularizer for the gamma weight. trainable: Boolean, if `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). name: String, the name of the ops. batch_axis: The axis of the batch dimension. This dimension is treated differently in `virtual batch normalization` vs `batch normalization`. Raises: ValueError: If `reference_batch` has unknown dimensions at graph construction. ValueError: If `batch_axis` is the same as `axis`. """ axis = _validate_init_input_and_get_axis(reference_batch, axis) self._epsilon = epsilon self._beta = 0 self._gamma = 1 self._batch_axis = _validate_init_input_and_get_axis( reference_batch, batch_axis) if axis == self._batch_axis: raise ValueError('`axis` and `batch_axis` cannot be the same.') with variable_scope.variable_scope(name, 'VBN', values=[reference_batch ]) as self._vs: self._reference_batch = reference_batch # Calculate important shapes: # 1) Reduction axes for the reference batch # 2) Broadcast shape, if necessary # 3) Reduction axes for the virtual batchnormed batch # 4) Shape for optional parameters input_shape = self._reference_batch.shape ndims = input_shape.ndims reduction_axes = list(range(ndims)) del reduction_axes[axis] self._broadcast_shape = [1] * len(input_shape) self._broadcast_shape[axis] = input_shape.dims[axis] self._example_reduction_axes = list(range(ndims)) del self._example_reduction_axes[max(axis, self._batch_axis)] del self._example_reduction_axes[min(axis, self._batch_axis)] params_shape = self._reference_batch.shape[axis] # Determines whether broadcasting is needed. This is slightly different # than in the `nn.batch_normalization` case, due to `batch_dim`. self._needs_broadcasting = (sorted(self._example_reduction_axes) != list(range(ndims))[:-2]) # Calculate the sufficient statistics for the reference batch in a way # that can be easily modified by additional examples. self._ref_mean, self._ref_mean_squares = _statistics( self._reference_batch, reduction_axes) self._ref_variance = (self._ref_mean_squares - math_ops.square(self._ref_mean)) # Virtual batch normalization uses a weighted average between example # statistics and the reference batch statistics. ref_batch_size = _static_or_dynamic_batch_size( self._reference_batch, self._batch_axis) self._example_weight = 1. / (math_ops.to_float(ref_batch_size) + 1.) self._ref_weight = 1. - self._example_weight # Make the variables, if necessary. if center: self._beta = variable_scope.get_variable( name='beta', shape=(params_shape, ), initializer=beta_initializer, regularizer=beta_regularizer, trainable=trainable) if scale: self._gamma = variable_scope.get_variable( name='gamma', shape=(params_shape, ), initializer=gamma_initializer, regularizer=gamma_regularizer, trainable=trainable)
def fused_layer_norm(inputs, center=True, scale=True, activation_fn=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, begin_norm_axis=1, begin_params_axis=-1, scope=None, use_fused_batch_norm=False): with tf.variable_scope(scope, 'LayerNorm', [inputs], reuse=reuse) as sc: inputs = ops.convert_to_tensor(inputs) inputs_shape = inputs.shape inputs_rank = inputs_shape.ndims if inputs_rank is None: raise ValueError('Inputs %s has undefined rank.' % inputs.name) dtype = inputs.dtype.base_dtype if begin_norm_axis < 0: begin_norm_axis = inputs_rank + begin_norm_axis if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank: raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) ' 'must be < rank(inputs) (%d)' % (begin_params_axis, begin_norm_axis, inputs_rank)) params_shape = inputs_shape[begin_params_axis:] if not params_shape.is_fully_defined(): raise ValueError( 'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' % (inputs.name, begin_params_axis, inputs_shape)) # Allocate parameters for the beta and gamma of the normalization. beta, gamma = None, None if center: beta_collections = utils.get_variable_collections( variables_collections, 'beta') beta = variables.model_variable( 'beta', shape=params_shape, dtype=dtype, initializer=init_ops.zeros_initializer(), collections=beta_collections, trainable=trainable) if scale: gamma_collections = utils.get_variable_collections( variables_collections, 'gamma') gamma = variables.model_variable( 'gamma', shape=params_shape, dtype=dtype, initializer=init_ops.ones_initializer(), collections=gamma_collections, trainable=trainable) if use_fused_batch_norm: # get static TensorShape if fully defined, # otherwise retrieve shape tensor norm_shape = inputs.shape[begin_norm_axis:] if norm_shape.is_fully_defined(): bn_shape = [1, -1, 1, numpy.prod(norm_shape.as_list())] else: norm_shape = tf.shape(inputs)[begin_norm_axis:] bn_shape = [1, -1, 1, tf.reduce_prod(norm_shape)] if inputs.get_shape().is_fully_defined(): outputs_shape = inputs.get_shape() else: outputs_shape = tf.shape(inputs) inputs = array_ops.reshape(inputs, bn_shape) if inputs.get_shape().is_fully_defined(): # static inputs TensorShape fully defined after reshape. ones = array_ops.ones(inputs.get_shape()[1], dtype=dtypes.float32) zeros = array_ops.zeros(inputs.get_shape()[1], dtype=dtypes.float32) else: # static inputs TensorShape NOT fully defined after reshape. # must use dynamic shape, which means these input tensors # have to be created at runtime, which causes a slowdown. scale_shape = tf.shape(inputs)[1] ones = array_ops.ones(scale_shape, dtype=dtypes.float32) zeros = array_ops.zeros(scale_shape, dtype=dtypes.float32) outputs, mean, variance = nn.fused_batch_norm(inputs, ones, zeros, epsilon=1e-4, data_format="NCHW") outputs = array_ops.reshape(outputs, outputs_shape) if center and scale: outputs = outputs * gamma + beta elif center: outputs = outputs + beta elif scale: outputs = outputs * gamma else: # Calculate the moments on the last axis (layer activations). norm_axes = list(range(begin_norm_axis, inputs_rank)) mean, variance = nn.moments(inputs, norm_axes, keep_dims=True) # Compute layer normalization using the batch_normalization function. variance_epsilon = 1e-4 outputs = nn.batch_normalization(inputs, mean, variance, offset=beta, scale=gamma, variance_epsilon=variance_epsilon) outputs.set_shape(inputs_shape) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def Foo(inputs): var = variable_scope.get_variable("var", shape=[10], dtype=dtypes.float32, initializer=init_ops.ones_initializer()) return inputs + var
def batch_normalization( inputs, axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True, beta_initializer=init_ops.zeros_initializer(), gamma_initializer=init_ops.ones_initializer(), moving_mean_initializer=init_ops.zeros_initializer(), moving_variance_initializer=init_ops.ones_initializer(), beta_regularizer=None, gamma_regularizer=None, beta_constraint=None, gamma_constraint=None, training=False, trainable=True, name=None, reuse=None, renorm=False, renorm_clipping=None, renorm_momentum=0.99, fused=None, virtual_batch_size=None, adjustment=None): """Functional interface for the batch normalization layer. Reference: http://arxiv.org/abs/1502.03167 "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" Sergey Ioffe, Christian Szegedy Note: when training, the moving_mean and moving_variance need to be updated. By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they need to be added as a dependency to the `train_op`. For example: ```python update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss) ``` Arguments: inputs: Tensor input. axis: An `int`, the axis that should be normalized (typically the features axis). For instance, after a `Convolution2D` layer with `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. momentum: Momentum for the moving average. epsilon: Small float added to variance to avoid dividing by zero. center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. beta_initializer: Initializer for the beta weight. gamma_initializer: Initializer for the gamma weight. moving_mean_initializer: Initializer for the moving mean. moving_variance_initializer: Initializer for the moving variance. beta_regularizer: Optional regularizer for the beta weight. gamma_regularizer: Optional regularizer for the gamma weight. beta_constraint: An optional projection function to be applied to the `beta` weight after being updated by an `Optimizer` (e.g. used to implement norm constraints or value constraints for layer weights). The function must take as input the unprojected variable and must return the projected variable (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. gamma_constraint: An optional projection function to be applied to the `gamma` weight after being updated by an `Optimizer`. training: Either a Python boolean, or a TensorFlow boolean scalar tensor (e.g. a placeholder). Whether to return the output in training mode (normalized with statistics of the current batch) or in inference mode (normalized with moving statistics). **NOTE**: make sure to set this parameter correctly, or else your training/inference will not work properly. trainable: Boolean, if `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). name: String, the name of the layer. reuse: Boolean, whether to reuse the weights of a previous layer by the same name. renorm: Whether to use Batch Renormalization (https://arxiv.org/abs/1702.03275). This adds extra variables during training. The inference is the same for either value of this parameter. renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to scalar `Tensors` used to clip the renorm correction. The correction `(r, d)` is used as `corrected_value = normalized_value * r + d`, with `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin, dmax are set to inf, 0, inf, respectively. renorm_momentum: Momentum used to update the moving means and standard deviations with renorm. Unlike `momentum`, this affects training and should be neither too small (which would add noise) nor too large (which would give stale estimates). Note that `momentum` is still applied to get the means and variances for inference. fused: if `True`, use a faster, fused implementation if possible. If `None`, use the system recommended implementation. virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`, which means batch normalization is performed across the whole batch. When `virtual_batch_size` is not `None`, instead perform "Ghost Batch Normalization", which creates virtual sub-batches which are each normalized separately (with shared gamma, beta, and moving statistics). Must divide the actual batch size during execution. adjustment: A function taking the `Tensor` containing the (dynamic) shape of the input tensor and returning a pair (scale, bias) to apply to the normalized values (before gamma and beta), only during training. For example, if axis==-1, `adjustment = lambda shape: ( tf.random_uniform(shape[-1:], 0.93, 1.07), tf.random_uniform(shape[-1:], -0.1, 0.1))` will scale the normalized value by up to 7% up or down, then shift the result by up to 0.1 (with independent scaling and bias for each feature but shared across all examples), and finally apply gamma and/or beta. If `None`, no adjustment is applied. Cannot be specified if virtual_batch_size is specified. Returns: Output tensor. Raises: ValueError: if eager execution is enabled. """ layer = BatchNormalization( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, moving_mean_initializer=moving_mean_initializer, moving_variance_initializer=moving_variance_initializer, beta_regularizer=beta_regularizer, gamma_regularizer=gamma_regularizer, beta_constraint=beta_constraint, gamma_constraint=gamma_constraint, renorm=renorm, renorm_clipping=renorm_clipping, renorm_momentum=renorm_momentum, fused=fused, trainable=trainable, virtual_batch_size=virtual_batch_size, adjustment=adjustment, name=name, _reuse=reuse, _scope=name) return layer.apply(inputs, training=training)
def testAddVariable(self): obj = NonLayerTrackable() with self.assertRaisesRegex(ValueError, "do not specify shape"): trackable_utils.add_variable(obj, name="shape_specified_twice", shape=[], initializer=1) constant_initializer = trackable_utils.add_variable( obj, name="constant_initializer", initializer=1) with variable_scope.variable_scope("some_variable_scope"): ones_initializer = trackable_utils.add_variable( obj, name="ones_initializer", shape=[2], initializer=init_ops.ones_initializer(dtype=dtypes.float32)) bare_initializer = trackable_utils.add_variable( obj, name="bare_initializer", shape=[2, 2], dtype=dtypes.float64, initializer=init_ops.zeros_initializer) # Even in graph mode, there are no naming conflicts between objects, only # naming conflicts within an object. other_duplicate = resource_variable_ops.ResourceVariable( name="duplicate", initial_value=1.) duplicate = trackable_utils.add_variable(obj, name="duplicate", shape=[]) with self.assertRaisesRegex(ValueError, "'duplicate'.*already declared"): trackable_utils.add_variable(obj, name="duplicate", shape=[]) self.evaluate(trackable_utils.gather_initializers(obj)) self.assertEqual("constant_initializer:0", constant_initializer.name) self.assertEqual(1, self.evaluate(constant_initializer)) self.assertEqual("some_variable_scope/ones_initializer:0", ones_initializer.name) self.assertAllEqual([1, 1], self.evaluate(ones_initializer)) self.assertAllEqual([[0., 0.], [0., 0.]], self.evaluate(bare_initializer)) self.assertEqual("a_variable:0", obj.a_variable.name) self.assertEqual("duplicate:0", other_duplicate.name) if context.executing_eagerly(): # When executing eagerly, there's no uniquification of variable names. The # checkpoint name will be the same. self.assertEqual("duplicate:0", duplicate.name) else: # The .name attribute may be globally influenced, but the checkpoint name # won't be (tested below). self.assertEqual("duplicate_1:0", duplicate.name) named_variables, _, _ = ( graph_view.ObjectGraphView(obj).serialize_object_graph()) expected_checkpoint_names = ( "a_variable/.ATTRIBUTES/VARIABLE_VALUE", "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE", "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE", "duplicate/.ATTRIBUTES/VARIABLE_VALUE", "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE", ) six.assertCountEqual(self, expected_checkpoint_names, [v.name for v in named_variables])
def _create_slots(self, var_list): for v in var_list: init_rms = init_ops.ones_initializer(dtype=v.dtype) self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(), v.dtype, 'rms', self._name)
def custom_layer_norm(inputs, center=True, scale=True, activation_fn=None, reuse=None, outputs_collections=None, trainable=True, begin_norm_axis=1, begin_params_axis=-1, scope=None): """Adds a Layer Normalization layer. Based on the paper: "Layer Normalization" Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton https://arxiv.org/abs/1607.06450. Can be used as a normalizer function for conv2d and fully_connected. Given a tensor `inputs` of rank `R`, moments are calculated and normalization is performed over axes `begin_norm_axis ... R - 1`. Scaling and centering, if requested, is performed over axes `begin_params_axis .. R - 1`. By default, `begin_norm_axis = 1` and `begin_params_axis = -1`, meaning that normalization is performed over all but the first axis (the `HWC` if `inputs` is `NHWC`), while the `beta` and `gamma` trainable parameters are calculated for the rightmost axis (the `C` if `inputs` is `NHWC`). Scaling and recentering is performed via broadcast of the `beta` and `gamma` parameters with the normalized tensor. The shapes of `beta` and `gamma` are `inputs.shape[begin_params_axis:]`, and this part of the inputs' shape must be fully defined. Args: inputs: A tensor having rank `R`. The normalization is performed over axes `begin_norm_axis ... R - 1` and centering and scaling parameters are calculated over `begin_params_axis ... R - 1`. center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. activation_fn: Activation function, default set to None to skip it and maintain a linear activation. reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional collections for the variables. outputs_collections: Collections to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). begin_norm_axis: The first normalization dimension: normalization will be performed along dimensions `begin_norm_axis : rank(inputs)` begin_params_axis: The first parameter (beta, gamma) dimension: scale and centering parameters will have dimensions `begin_params_axis : rank(inputs)` and will be broadcast with the normalized inputs accordingly. scope: Optional scope for `variable_scope`. Returns: A `Tensor` representing the output of the operation, having the same shape and dtype as `inputs`. Raises: ValueError: If the rank of `inputs` is not known at graph build time, or if `inputs.shape[begin_params_axis:]` is not fully defined at graph build time. """ with tf.compat.v1.variable_scope(scope, 'LayerNorm', [inputs], reuse=reuse) as sc: inputs = ops.convert_to_tensor(inputs) inputs_shape = inputs.shape inputs_rank = inputs_shape.ndims if inputs_rank is None: raise ValueError('Inputs %s has undefined rank.' % inputs.name) dtype = inputs.dtype.base_dtype if begin_norm_axis < 0: begin_norm_axis = inputs_rank + begin_norm_axis if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank: raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) ' 'must be < rank(inputs) (%d)' % (begin_params_axis, begin_norm_axis, inputs_rank)) params_shape = inputs_shape[begin_params_axis:] if not params_shape.is_fully_defined(): raise ValueError( 'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' % (inputs.name, begin_params_axis, inputs_shape)) # Allocate parameters for the beta and gamma of the normalization. beta, gamma = None, None if center: beta = model_variable('beta', shape=params_shape, dtype=dtype, initializer=init_ops.zeros_initializer(), collections=None, trainable=trainable) if scale: gamma = model_variable('gamma', shape=params_shape, dtype=dtype, initializer=init_ops.ones_initializer(), collections=None, trainable=trainable) # By default, compute the moments across all the dimensions except the one with index 0. norm_axes = list(range(begin_norm_axis, inputs_rank)) mean, variance = nn.moments(inputs, norm_axes, keep_dims=True) # Compute layer normalization using the batch_normalization function. # Note that epsilon must be increased for float16 due to the limited # representable range. variance_epsilon = 1e-12 if dtype != dtypes.float16 else 1e-3 devices = device_lib.list_local_devices() use_fused_layer_norm = any(dev.device_type == "HPU" for dev in devices) if use_fused_layer_norm: outputs, _, _ = habana_ops.habana_layer_norm( x=inputs, beta=beta, gamma=gamma, axes=tensor_util.make_tensor_proto(len(inputs.shape) - 1), epsilon=tensor_util.make_tensor_proto(variance_epsilon)) else: outputs = nn.batch_normalization(inputs, mean, variance, offset=beta, scale=gamma, variance_epsilon=variance_epsilon) outputs.set_shape(inputs_shape) if activation_fn is not None: outputs = activation_fn(outputs) return outputs
def instance_norm(inputs, center=True, scale=True, epsilon=1e-6, activation_fn=None, param_initializers=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, data_format=DATA_FORMAT_NHWC, scope=None): """Functional interface for the instance normalization layer. Reference: https://arxiv.org/abs/1607.08022. "Instance Normalization: The Missing Ingredient for Fast Stylization" Dmitry Ulyanov, Andrea Vedaldi, Victor Lempitsky Args: inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`. The normalization is over all but the last dimension if `data_format` is `NHWC` and the second dimension if `data_format` is `NCHW`. center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. epsilon: Small float added to variance to avoid dividing by zero. activation_fn: Activation function, default set to None to skip it and maintain a linear activation. param_initializers: Optional initializers for beta, gamma, moving mean and moving variance. reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional collections for the variables. outputs_collections: Collections to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). data_format: A string. `NHWC` (default) and `NCHW` are supported. scope: Optional scope for `variable_scope`. Returns: A `Tensor` representing the output of the operation. Raises: ValueError: If `data_format` is neither `NHWC` nor `NCHW`. ValueError: If the rank of `inputs` is undefined. ValueError: If rank or channels dimension of `inputs` is undefined. """ inputs = ops.convert_to_tensor(inputs) inputs_shape = inputs.shape inputs_rank = inputs.shape.ndims if inputs_rank is None: raise ValueError('Inputs %s has undefined rank.' % inputs.name) if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC): raise ValueError('data_format has to be either NCHW or NHWC.') with variable_scope.variable_scope(scope, 'InstanceNorm', [inputs], reuse=reuse) as sc: if data_format == DATA_FORMAT_NCHW: reduction_axis = 1 # For NCHW format, rather than relying on implicit broadcasting, we # explicitly reshape the params to params_shape_broadcast when computing # the moments and the batch normalization. params_shape_broadcast = list([1, inputs_shape[1].value] + [1 for _ in range(2, inputs_rank)]) else: reduction_axis = inputs_rank - 1 params_shape_broadcast = None moments_axes = list(range(inputs_rank)) del moments_axes[reduction_axis] del moments_axes[0] params_shape = inputs_shape[reduction_axis:reduction_axis + 1] if not params_shape.is_fully_defined(): raise ValueError('Inputs %s has undefined channels dimension %s.' % (inputs.name, params_shape)) # Allocate parameters for the beta and gamma of the normalization. beta, gamma = None, None dtype = inputs.dtype.base_dtype if param_initializers is None: param_initializers = {} if center: beta_collections = utils.get_variable_collections( variables_collections, 'beta') beta_initializer = param_initializers.get( 'beta', init_ops.zeros_initializer()) beta = variables.model_variable('beta', shape=params_shape, dtype=dtype, initializer=beta_initializer, collections=beta_collections, trainable=trainable) if params_shape_broadcast: beta = array_ops.reshape(beta, params_shape_broadcast) if scale: gamma_collections = utils.get_variable_collections( variables_collections, 'gamma') gamma_initializer = param_initializers.get( 'gamma', init_ops.ones_initializer()) gamma = variables.model_variable('gamma', shape=params_shape, dtype=dtype, initializer=gamma_initializer, collections=gamma_collections, trainable=trainable) if params_shape_broadcast: gamma = array_ops.reshape(gamma, params_shape_broadcast) # Calculate the moments (instance activations). mean, variance = nn.moments(inputs, moments_axes, keep_dims=True) # Compute instance normalization. outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon, name='instancenorm') if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def l2_normalization(inputs, scaling=False, scale_initializer=init_ops.ones_initializer(), reuse=None, variables_collections=None, outputs_collections=None, data_format='NHWC', trainable=True, scope=None): """Implement L2 normalization on every feature (i.e. spatial normalization). Should be extended in some near future to other dimensions, providing a more flexible normalization framework. Args: inputs: a 4-D tensor with dimensions [batch_size, height, width, channels]. scaling: whether or not to add a post scaling operation along the dimensions which have been normalized. scale_initializer: An initializer for the weights. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: optional list of collections for all the variables or a dictionary containing a different list of collection per variable. outputs_collections: collection to add the outputs. data_format: NHWC or NCHW data format. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for `variable_scope`. Returns: A `Tensor` representing the output of the operation. """ with variable_scope.variable_scope(scope, 'L2Normalization', [inputs], reuse=reuse) as sc: inputs_shape = inputs.get_shape() inputs_rank = inputs_shape.ndims dtype = inputs.dtype.base_dtype if data_format == 'NHWC': params_shape = inputs_shape[-1:] elif data_format == 'NCHW': params_shape = (inputs_shape[1]) # Normalize along spatial dimensions. norm_dim = tf.range(1, inputs_rank - 1) outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12) # Additional scaling. if scaling: scale_collections = utils.get_variable_collections( variables_collections, 'scale') scale = variables.model_variable('gamma', shape=params_shape, dtype=dtype, initializer=scale_initializer, collections=scale_collections, trainable=trainable) if data_format == 'NCHW': scale = tf.expand_dims(scale, axis=-1) scale = tf.expand_dims(scale, axis=-1) outputs = tf.multiply(outputs, scale) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs)
def _zero_debias(unbiased_var, value, decay): """Compute the delta required for a debiased Variable. All exponential moving averages initialized with Tensors are initialized to 0, and therefore are biased to 0. Variables initialized to 0 and used as EMAs are similarly biased. This function creates the debias updated amount according to a scale factor, as in https://arxiv.org/abs/1412.6980. To demonstrate the bias the results from 0-initialization, take an EMA that was initialized to `0` with decay `b`. After `t` timesteps of seeing the constant `c`, the variable have the following value: ``` EMA = 0*b^(t) + c*(1 - b)*b^(t-1) + c*(1 - b)*b^(t-2) + ... = c*(1 - b^t) ``` To have the true value `c`, we would divide by the scale factor `1 - b^t`. In order to perform debiasing, we use two shadow variables. One keeps track of the biased estimate, and the other keeps track of the number of updates that have occurred. Args: unbiased_var: A Variable representing the current value of the unbiased EMA. value: A Tensor representing the most recent value. decay: A Tensor representing `1-decay` for the EMA. Returns: The amount that the unbiased variable should be updated. Computing this tensor will also update the shadow variables appropriately. """ with variable_scope.variable_scope( unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope: with ops.colocate_with(unbiased_var): biased_var = variable_scope.get_variable( "biased", initializer=init_ops.zeros_initializer( unbiased_var.get_shape(), dtype=unbiased_var.dtype), trainable=False) # Initializing the local_step to `0` would cause problems with the # debiasing equation, so we instead initialize to `1`. local_step = variable_scope.get_variable( "local_step", shape=[], dtype=unbiased_var.dtype, initializer=init_ops.ones_initializer(), trainable=False) # Get an update ops for both shadow variables. update_biased = state_ops.assign_sub(biased_var, (biased_var - value) * decay, name=scope.name) update_local_step = local_step.assign_add(1) # Compute the value of the delta to update the unbiased EMA. Make sure to # use the new values of the biased variable and the local step. with ops.control_dependencies([update_biased, update_local_step]): # This function gets `1 - decay`, so use `1.0 - decay` in the exponent. unbiased_ema_delta = (unbiased_var - biased_var.read_value() / (1 - math_ops.pow( 1.0 - decay, local_step.read_value()))) return unbiased_ema_delta
def testPrepareFeaturesForSQSS(self): mode = model_fn_lib.ModeKeys.TRAIN seq_feature_name = 'seq_feature' sparse_seq_feature_name = 'wire_cast' ctx_feature_name = 'ctx_feature' input_key_column_name = 'input_key_column' sequence_length = 4 embedding_dimension = 8 features = { input_key_column_name: constant_op.constant('input0'), sparse_seq_feature_name: sparse_tensor.SparseTensor(indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1], [2, 0, 0], [2, 1, 1]], values=[ b'marlo', b'stringer', b'omar', b'stringer', b'marlo', b'marlo', b'omar' ], dense_shape=[3, 2, 2]), seq_feature_name: constant_op.constant(1.0, shape=[sequence_length]), ctx_feature_name: constant_op.constant(2.0) } labels = constant_op.constant(5.0, shape=[sequence_length]) wire_cast = feature_column.sparse_column_with_keys( 'wire_cast', ['marlo', 'omar', 'stringer']) sequence_feature_columns = [ feature_column.real_valued_column(seq_feature_name, dimension=1), feature_column.embedding_column( wire_cast, dimension=embedding_dimension, initializer=init_ops.ones_initializer()) ] context_feature_columns = [ feature_column.real_valued_column(ctx_feature_name, dimension=1) ] expected_input_key = b'input0' expected_sequence = { ssre.RNNKeys.LABELS_KEY: np.array([5., 5., 5., 5.]), seq_feature_name: np.array([1., 1., 1., 1.]), sparse_seq_feature_name: sparse_tensor.SparseTensor(indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1], [2, 0, 0], [2, 1, 1]], values=[ b'marlo', b'stringer', b'omar', b'stringer', b'marlo', b'marlo', b'omar' ], dense_shape=[3, 2, 2]), } expected_context = {ctx_feature_name: 2.} input_key, sequence, context = ssre._prepare_features_for_sqss( features, labels, mode, input_key_column_name, sequence_feature_columns, context_feature_columns) def assert_equal(expected, got): self.assertEqual(sorted(expected), sorted(got)) for k, v in expected.items(): if isinstance(v, sparse_tensor.SparseTensor): self.assertAllEqual(v.values.eval(), got[k].values) self.assertAllEqual(v.indices.eval(), got[k].indices) self.assertAllEqual(v.dense_shape.eval(), got[k].dense_shape) else: self.assertAllEqual(v, got[k]) with self.test_session() as sess: sess.run(variables.global_variables_initializer()) sess.run(data_flow_ops.initialize_all_tables()) actual_input_key, actual_sequence, actual_context = sess.run( [input_key, sequence, context]) self.assertEqual(expected_input_key, actual_input_key) assert_equal(expected_sequence, actual_sequence) assert_equal(expected_context, actual_context)
def l2_normalization(inputs, scaling=False, scale_initializer=init_ops.ones_initializer(), reuse=None, variables_collections=None, outputs_collections=None, data_format='NHWC', trainable=True, scope=None): """Implement L2 normalization on every feature (i.e. spatial normalization). 实现在每个特征图上的L2正则化 Should be extended in some near future to other dimensions, providing a more flexible normalization framework. 应该在不久的将来会被扩展到其他维度,会提供更多的正则化框架 Args: inputs: a 4-D tensor with dimensions [batch_size, height, width, channels]. scaling: whether or not to add a post scaling operation along the dimensions which have been normalized. 输入:一个4D的张量带有的维度[batch_size, height, width, channels] 规模:是否要添加一个后缩放操作在需要正则化的维度之间 scale_initializer: An initializer for the weights. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. 规模初始化:一个对于权重的初始化器 variables_collections: optional list of collections for all the variables or a dictionary containing a different list of collection per variable. 变量集合:对于所有变量或者一个字典(包含每个变量的集合的不同列表)可选择的集合列表 outputs_collections: collection to add the outputs. 输出集合:添加输出的集合 data_format: NHWC or NCHW data format. 数据格式:NHWC 或者 NCHW 数据格式 trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for `variable_scope`. 可训练的:如果是true也可以添加变量到图集合中(GraphKeys.TRAINABLE_VARIABLES) 作用域:对于"变量的作用域"的可选择的作用域 Returns: A `Tensor` representing the output of the operation. 一个"张量"代表操作的输出 """ with variable_scope.variable_scope(scope, 'L2Normalization', [inputs], reuse=reuse) as sc: inputs_shape = inputs.get_shape() inputs_rank = inputs_shape.ndims dtype = inputs.dtype.base_dtype if data_format == 'NHWC': # norm_dim = tf.range(1, inputs_rank-1) norm_dim = tf.range(inputs_rank - 1, inputs_rank) params_shape = inputs_shape[-1:] elif data_format == 'NCHW': # norm_dim = tf.range(2, inputs_rank) norm_dim = tf.range(1, 2) params_shape = (inputs_shape[1]) # Normalize along spatial dimensions. outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12) # Additional scaling. if scaling: scale_collections = utils.get_variable_collections( variables_collections, 'scale') scale = variables.model_variable('gamma', shape=params_shape, dtype=dtype, initializer=scale_initializer, collections=scale_collections, trainable=trainable) if data_format == 'NHWC': outputs = tf.multiply(outputs, scale) elif data_format == 'NCHW': scale = tf.expand_dims(scale, axis=-1) scale = tf.expand_dims(scale, axis=-1) outputs = tf.multiply(outputs, scale) # outputs = tf.transpose(outputs, perm=(0, 2, 3, 1)) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs)
def group_norm(inputs, groups=32, channels_axis=-1, reduction_axes=(-3, -2), center=True, scale=True, epsilon=1e-6, activation_fn=None, param_initializers=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None, mean_close_to_zero=False): """Functional interface for the group normalization layer. Reference: https://arxiv.org/abs/1803.08494. "Group Normalization", Yuxin Wu, Kaiming He Args: inputs: A Tensor with at least 2 dimensions one which is channels. All shape dimensions must be fully defined. groups: Integer. Divide the channels into this number of groups over which normalization statistics are computed. This number must be commensurate with the number of channels in `inputs`. channels_axis: An integer. Specifies index of channels axis which will be broken into `groups`, each of which whose statistics will be computed across. Must be mutually exclusive with `reduction_axes`. Preferred usage is to specify negative integers to be agnostic as to whether a batch dimension is included. reduction_axes: Tuple of integers. Specifies dimensions over which statistics will be accumulated. Must be mutually exclusive with `channels_axis`. Statistics will not be accumulated across axes not specified in `reduction_axes` nor `channel_axis`. Preferred usage is to specify negative integers to be agnostic to whether a batch dimension is included. Some sample usage cases: NHWC format: channels_axis=-1, reduction_axes=[-3, -2] NCHW format: channels_axis=-3, reduction_axes=[-2, -1] center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. epsilon: Small float added to variance to avoid dividing by zero. activation_fn: Activation function, default set to None to skip it and maintain a linear activation. param_initializers: Optional initializers for beta, gamma, moving mean and moving variance. reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional collections for the variables. outputs_collections: Collections to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). scope: Optional scope for `variable_scope`. mean_close_to_zero: The mean of `input` before ReLU will be close to zero when batch size >= 4k for Resnet-50 on TPU. If `True`, use `nn.sufficient_statistics` and `nn.normalize_moments` to calculate the variance. This is the same behavior as `fused` equals `True` in batch normalization. If `False`, use `nn.moments` to calculate the variance. When `mean` is close to zero, like 1e-4, use `mean` to calculate the variance may have poor result due to repeated roundoff error and denormalization in `mean`. When `mean` is large, like 1e2, sum(`input`^2) is so large that only the high-order digits of the elements are being accumulated. Thus, use sum(`input` - `mean`)^2/n to calcualte the variance has better accuracy compared to (sum(`input`^2)/n - `mean`^2) when `mean` is large. Returns: A `Tensor` representing the output of the operation. Raises: ValueError: If the rank of `inputs` is undefined. ValueError: If rank or channels dimension of `inputs` is undefined. ValueError: If number of groups is not commensurate with number of channels. ValueError: If reduction_axes or channels_axis are out of bounds. ValueError: If reduction_axes are not mutually exclusive with channels_axis. """ # TODO(shlens): Support partially defined shapes for the inputs. inputs = ops.convert_to_tensor(inputs) original_shape = inputs.shape if inputs.shape.ndims is None: raise ValueError('Inputs %s has undefined rank.' % inputs.name) if channels_axis > (inputs.shape.ndims - 1): raise ValueError('Axis is out of bounds.') # Standardize the channels_axis to be positive and identify # of channels. if channels_axis < 0: channels_axis = inputs.shape.ndims + channels_axis channels = inputs.shape[channels_axis].value if channels is None: raise ValueError('Inputs %s has undefined channel dimension: %d.' % (inputs.name, channels_axis)) # Standardize the reduction_axes to be positive. reduction_axes = list(reduction_axes) for i in range(len(reduction_axes)): if reduction_axes[i] < 0: reduction_axes[i] += inputs.shape.ndims for a in reduction_axes: if a > inputs.shape.ndims: raise ValueError('Axis is out of bounds.') if inputs.shape[a].value is None: raise ValueError('Inputs %s has undefined dimensions %d.' % (inputs.name, a)) if channels_axis == a: raise ValueError('reduction_axis must be mutually exclusive ' 'with channels_axis') if groups > channels: raise ValueError('Invalid groups %d for %d channels.' % (groups, channels)) if channels % groups != 0: raise ValueError('%d channels is not commensurate with %d groups.' % (channels, groups)) # Determine axes before channels. Some examples of common image formats: # 'NCHW': before = [N], after = [HW] # 'NHWC': before = [NHW], after = [] axes_before_channels = inputs.shape.as_list()[:channels_axis] axes_after_channels = inputs.shape.as_list()[channels_axis + 1:] # Manually broadcast the parameters to conform to the number of groups. params_shape_broadcast = ([1] * len(axes_before_channels) + [groups, channels // groups] + [1] * len(axes_after_channels)) # Reshape the input by the group within the channel dimension. inputs_shape = (axes_before_channels + [groups, channels // groups] + axes_after_channels) inputs = array_ops.reshape(inputs, inputs_shape) # Determine the dimensions across which moments are calculated. moments_axes = [channels_axis + 1] for a in reduction_axes: if a > channels_axis: moments_axes.append(a + 1) else: moments_axes.append(a) with variable_scope.variable_scope(scope, 'GroupNorm', [inputs], reuse=reuse) as sc: # Note that the params_shape is the number of channels always. params_shape = [channels] # Allocate parameters for the beta and gamma of the normalization. beta, gamma = None, None dtype = inputs.dtype.base_dtype if param_initializers is None: param_initializers = {} if center: beta_collections = utils.get_variable_collections( variables_collections, 'beta') beta_initializer = param_initializers.get( 'beta', init_ops.zeros_initializer()) beta = variables.model_variable('beta', shape=params_shape, dtype=dtype, initializer=beta_initializer, collections=beta_collections, trainable=trainable) beta = array_ops.reshape(beta, params_shape_broadcast) if scale: gamma_collections = utils.get_variable_collections( variables_collections, 'gamma') gamma_initializer = param_initializers.get( 'gamma', init_ops.ones_initializer()) gamma = variables.model_variable('gamma', shape=params_shape, dtype=dtype, initializer=gamma_initializer, collections=gamma_collections, trainable=trainable) gamma = array_ops.reshape(gamma, params_shape_broadcast) # Calculate the moments. if mean_close_to_zero: # One pass algorithm returns better result when mean is close to zero. counts, means_ss, variance_ss, _ = nn.sufficient_statistics( inputs, moments_axes, keep_dims=True) mean, variance = nn.normalize_moments(counts, means_ss, variance_ss, shift=None) else: mean, variance = nn.moments(inputs, moments_axes, keep_dims=True) # Compute normalization. # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor # appropriately so that this operation may be faster. gain = math_ops.rsqrt(variance + epsilon) offset = -mean * gain if gamma is not None: gain *= gamma offset *= gamma if beta is not None: offset += beta outputs = inputs * gain + offset # Collapse the groups into the channel dimension. outputs = array_ops.reshape(outputs, original_shape) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def custom_batch_norm(inputs, decay=0.999, center=True, scale=False, epsilon=0.001, activation_fn=None, param_initializers=None, param_regularizers=None, updates_collections=ops.GraphKeys.UPDATE_OPS, is_training=True, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, batch_weights=None, data_format='NHWC', zero_debias_moving_mean=False, scope=None, renorm=False, renorm_clipping=None, renorm_decay=0.99, noise_std=None): """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167. "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" Sergey Ioffe, Christian Szegedy Can be used as a normalizer function for conv2d and fully_connected. Note: when training, the moving_mean and moving_variance need to be updated. By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they need to be added as a dependency to the `train_op`. For example: ```python update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss) ``` One can set updates_collections=None to force the updates in place, but that can have a speed penalty, especially in distributed settings. Args: inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`. The normalization is over all but the last dimension if `data_format` is `NHWC` and the second dimension if `data_format` is `NCHW`. decay: Decay for the moving average. Reasonable values for `decay` are close to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. Lower `decay` value (recommend trying `decay`=0.9) if model experiences reasonably good training performance but poor validation and/or test performance. Try zero_debias_moving_mean=True for improved stability. center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. epsilon: Small float added to variance to avoid dividing by zero. activation_fn: Activation function, default set to None to skip it and maintain a linear activation. param_initializers: Optional initializers for beta, gamma, moving mean and moving variance. param_regularizers: Optional regularizer for beta and gamma. updates_collections: Collections to collect the update ops for computation. The updates_ops need to be executed with the train_op. If None, a control dependency would be added to make sure the updates are computed in place. is_training: Whether or not the layer is in training mode. In training mode it would accumulate the statistics of the moments into `moving_mean` and `moving_variance` using an exponential moving average with the given `decay`. When it is not in training mode then it would use the values of the `moving_mean` and the `moving_variance`. reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional collections for the variables. outputs_collections: Collections to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). batch_weights: An optional tensor of shape `[batch_size]`, containing a frequency weight for each batch item. If present, then the batch normalization uses weighted mean and variance. (This can be used to correct for bias in training example selection.) fused: Use nn.fused_batch_norm if True, nn.batch_normalization otherwise. data_format: A string. `NHWC` (default) and `NCHW` are supported. zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new pair of variables 'moving_mean/biased' and 'moving_mean/local_step'. scope: Optional scope for `variable_scope`. renorm: Whether to use Batch Renormalization (https://arxiv.org/abs/1702.03275). This adds extra variables during training. The inference is the same for either value of this parameter. renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to scalar `Tensors` used to clip the renorm correction. The correction `(r, d)` is used as `corrected_value = normalized_value * r + d`, with `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin, dmax are set to inf, 0, inf, respectively. renorm_decay: Momentum used to update the moving means and standard deviations with renorm. Unlike `momentum`, this affects training and should be neither too small (which would add noise) nor too large (which would give stale estimates). Note that `decay` is still applied to get the means and variances for inference. Returns: A `Tensor` representing the output of the operation. Raises: ValueError: If `batch_weights` is not None and `fused` is True. ValueError: If `param_regularizers` is not None and `fused` is True. ValueError: If `data_format` is neither `NHWC` nor `NCHW`. ValueError: If the rank of `inputs` is undefined. ValueError: If rank or channels dimension of `inputs` is undefined. """ layer_variable_getter = slim.layers._build_variable_getter() with variable_scope.variable_scope( scope, 'BatchNorm', [inputs], reuse=reuse, custom_getter=layer_variable_getter) as sc: inputs = ops.convert_to_tensor(inputs) # Determine whether we can use the core layer class. if (batch_weights is None and updates_collections is ops.GraphKeys.UPDATE_OPS and not zero_debias_moving_mean): # Use the core layer class. axis = 1 if data_format == 'NCHW' else -1 if not param_initializers: param_initializers = {} beta_initializer = param_initializers.get( 'beta', init_ops.zeros_initializer()) gamma_initializer = param_initializers.get( 'gamma', init_ops.ones_initializer()) moving_mean_initializer = param_initializers.get( 'moving_mean', init_ops.zeros_initializer()) moving_variance_initializer = param_initializers.get( 'moving_variance', init_ops.ones_initializer()) if not param_regularizers: param_regularizers = {} beta_regularizer = param_regularizers.get('beta') gamma_regularizer = param_regularizers.get('gamma') layer = normalization_layers.BatchNormalization( axis=axis, momentum=decay, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, moving_mean_initializer=moving_mean_initializer, moving_variance_initializer=moving_variance_initializer, beta_regularizer=beta_regularizer, gamma_regularizer=gamma_regularizer, trainable=trainable, renorm=renorm, renorm_clipping=renorm_clipping, renorm_momentum=renorm_decay, name=sc.name, _scope=sc, _reuse=reuse) outputs = layer.apply(inputs, training=is_training) # Add variables to collections. slim.layers._add_variable_to_collections(layer.moving_mean, variables_collections, 'moving_mean') slim.layers._add_variable_to_collections(layer.moving_variance, variables_collections, 'moving_variance') if layer.beta: slim.layers._add_variable_to_collections( layer.beta, variables_collections, 'beta') if layer.gamma: slim.layers._add_variable_to_collections( layer.gamma, variables_collections, 'gamma') if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs) # Not supported by layer class: batch_weights argument, # and custom updates_collections. In that case, use the legacy BN # implementation. # Custom updates collections are not supported because the update logic # is different in this case, in particular w.r.t. "forced updates" and # update op reuse. if renorm: raise ValueError('renorm is not supported with batch_weights, ' 'updates_collections or zero_debias_moving_mean') inputs_shape = inputs.get_shape() inputs_rank = inputs_shape.ndims if inputs_rank is None: raise ValueError('Inputs %s has undefined rank.' % inputs.name) dtype = inputs.dtype.base_dtype if batch_weights is not None: batch_weights = ops.convert_to_tensor(batch_weights) inputs_shape[0:1].assert_is_compatible_with( batch_weights.get_shape()) # Reshape batch weight values so they broadcast across inputs. nshape = [-1] + [1 for _ in range(inputs_rank - 1)] batch_weights = array_ops.reshape(batch_weights, nshape) if data_format == 'NCHW': moments_axes = [0] + list(range(2, inputs_rank)) params_shape = inputs_shape[1:2] # For NCHW format, rather than relying on implicit broadcasting, we # explicitly reshape the params to params_shape_broadcast when computing # the moments and the batch normalization. params_shape_broadcast = list([1, inputs_shape[1].value] + [1 for _ in range(2, inputs_rank)]) else: moments_axes = list(range(inputs_rank - 1)) params_shape = inputs_shape[-1:] params_shape_broadcast = None if not params_shape.is_fully_defined(): raise ValueError('Inputs %s has undefined channels dimension %s.' % (inputs.name, params_shape)) # Allocate parameters for the beta and gamma of the normalization. beta, gamma = None, None if not param_initializers: param_initializers = {} if center: beta_collections = utils.get_variable_collections( variables_collections, 'beta') beta_initializer = param_initializers.get( 'beta', init_ops.zeros_initializer()) beta = variables.model_variable('beta', shape=params_shape, dtype=dtype, initializer=beta_initializer, collections=beta_collections, trainable=trainable) if scale: gamma_collections = utils.get_variable_collections( variables_collections, 'gamma') gamma_initializer = param_initializers.get( 'gamma', init_ops.ones_initializer()) gamma = variables.model_variable('gamma', shape=params_shape, dtype=dtype, initializer=gamma_initializer, collections=gamma_collections, trainable=trainable) # Create moving_mean and moving_variance variables and add them to the # appropriate collections. We disable variable partitioning while creating # them, because assign_moving_average is not yet supported for partitioned # variables. partitioner = variable_scope.get_variable_scope().partitioner try: variable_scope.get_variable_scope().set_partitioner(None) moving_mean_collections = utils.get_variable_collections( variables_collections, 'moving_mean') moving_mean_initializer = param_initializers.get( 'moving_mean', init_ops.zeros_initializer()) moving_mean = variables.model_variable( 'moving_mean', shape=params_shape, dtype=dtype, initializer=moving_mean_initializer, trainable=False, collections=moving_mean_collections) moving_variance_collections = utils.get_variable_collections( variables_collections, 'moving_variance') moving_variance_initializer = param_initializers.get( 'moving_variance', init_ops.ones_initializer()) moving_variance = variables.model_variable( 'moving_variance', shape=params_shape, dtype=dtype, initializer=moving_variance_initializer, trainable=False, collections=moving_variance_collections) finally: variable_scope.get_variable_scope().set_partitioner(partitioner) # If `is_training` doesn't have a constant value, because it is a `Tensor`, # a `Variable` or `Placeholder` then is_training_value will be None and # `needs_moments` will be true. is_training_value = utils.constant_value(is_training) need_moments = is_training_value is None or is_training_value if need_moments: # Calculate the moments based on the individual batch. if batch_weights is None: if data_format == 'NCHW': mean, variance = nn.moments(inputs, moments_axes, keep_dims=True) mean = array_ops.reshape(mean, [-1]) variance = array_ops.reshape(variance, [-1]) else: mean, variance = nn.moments(inputs, moments_axes) else: if data_format == 'NCHW': mean, variance = nn.weighted_moments(inputs, moments_axes, batch_weights, keep_dims=True) mean = array_ops.reshape(mean, [-1]) variance = array_ops.reshape(variance, [-1]) else: mean, variance = nn.weighted_moments( inputs, moments_axes, batch_weights) moving_vars_fn = lambda: (moving_mean, moving_variance) if updates_collections is None: def _force_updates(): """Internal function forces updates moving_vars if is_training.""" update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay, zero_debias=zero_debias_moving_mean) update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay, zero_debias=False) with ops.control_dependencies( [update_moving_mean, update_moving_variance]): return array_ops.identity(mean), array_ops.identity( variance) mean, variance = utils.smart_cond(is_training, _force_updates, moving_vars_fn) else: def _delay_updates(): """Internal function that delay updates moving_vars if is_training.""" update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay, zero_debias=zero_debias_moving_mean) update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay, zero_debias=False) return update_moving_mean, update_moving_variance update_mean, update_variance = utils.smart_cond( is_training, _delay_updates, moving_vars_fn) ops.add_to_collections(updates_collections, update_mean) ops.add_to_collections(updates_collections, update_variance) # Use computed moments during training and moving_vars otherwise. vars_fn = lambda: (mean, variance) mean, variance = utils.smart_cond(is_training, vars_fn, moving_vars_fn) else: mean, variance = moving_mean, moving_variance if data_format == 'NCHW': mean = array_ops.reshape(mean, params_shape_broadcast) variance = array_ops.reshape(variance, params_shape_broadcast) beta = array_ops.reshape(beta, params_shape_broadcast) if gamma is not None: gamma = array_ops.reshape(gamma, params_shape_broadcast) # Compute batch_normalization. outputs = batch_normalization(inputs, mean, variance, beta, gamma, epsilon, noise_std=noise_std) outputs.set_shape(inputs_shape) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs)
class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase): def _assert_all_close(self, expected, actual, atol=0.001): if not context.executing_eagerly(): with self.cached_session() as sess: keras_backend._initialize_variables(sess) self.assertAllClose(expected, actual, atol=atol) else: self.assertAllClose(expected, actual, atol=atol) @test_util.run_in_graph_and_eager_modes() def test_invalid_output_dim(self): with self.assertRaisesRegexp( ValueError, r'`output_dim` should be a positive integer. Given: -3.'): _ = kernel_layers.RandomFourierFeatures(output_dim=-3, scale=2.0) @test_util.run_in_graph_and_eager_modes() def test_unsupported_kernel_type(self): with self.assertRaisesRegexp( ValueError, r'Unsupported kernel type: \'unsupported_kernel\'.'): _ = kernel_layers.RandomFourierFeatures( 3, 'unsupported_kernel', stddev=2.0) @test_util.run_in_graph_and_eager_modes() def test_invalid_scale(self): with self.assertRaisesRegexp( ValueError, r'When provided, `scale` should be a positive float. Given: 0.0.'): _ = kernel_layers.RandomFourierFeatures(output_dim=10, scale=0.0) @test_util.run_in_graph_and_eager_modes() def test_invalid_input_shape(self): inputs = random_ops.random_uniform((3, 2, 4), seed=1) rff_layer = kernel_layers.RandomFourierFeatures(output_dim=10, scale=3.0) with self.assertRaisesRegexp( ValueError, r'The rank of the input tensor should be 2. Got 3 instead.'): _ = rff_layer.apply(inputs) @parameterized.named_parameters( ('gaussian', 'gaussian', 10.0, False), ('random', init_ops.random_uniform_initializer, 1.0, True)) @test_util.run_in_graph_and_eager_modes() def test_random_features_properties(self, initializer, scale, trainable): rff_layer = kernel_layers.RandomFourierFeatures( output_dim=10, kernel_initializer=initializer, scale=scale, trainable=trainable) self.assertEqual(rff_layer.output_dim, 10) self.assertEqual(rff_layer.kernel_initializer, initializer) self.assertEqual(rff_layer.scale, scale) self.assertEqual(rff_layer.trainable, trainable) @parameterized.named_parameters(('gaussian', 'gaussian', False), ('laplacian', 'laplacian', True), ('other', init_ops.ones_initializer, True)) @test_util.run_in_graph_and_eager_modes() def test_call(self, initializer, trainable): rff_layer = kernel_layers.RandomFourierFeatures( output_dim=10, kernel_initializer=initializer, scale=1.0, trainable=trainable, name='random_fourier_features') inputs = random_ops.random_uniform((3, 2), seed=1) outputs = rff_layer(inputs) self.assertListEqual([3, 10], outputs.shape.as_list()) num_trainable_vars = 1 if trainable else 0 self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars) if not context.executing_eagerly(): self.assertLen( ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES), num_trainable_vars) @test_util.assert_no_new_pyobjects_executing_eagerly def test_no_eager_Leak(self): # Tests that repeatedly constructing and building a Layer does not leak # Python objects. inputs = random_ops.random_uniform((5, 4), seed=1) kernel_layers.RandomFourierFeatures(output_dim=4, name='rff')(inputs) kernel_layers.RandomFourierFeatures(output_dim=10, scale=2.0)(inputs) @test_util.run_in_graph_and_eager_modes() def test_output_shape(self): inputs = random_ops.random_uniform((3, 2), seed=1) rff_layer = kernel_layers.RandomFourierFeatures( output_dim=7, name='random_fourier_features', trainable=True) outputs = rff_layer(inputs) self.assertEqual([3, 7], outputs.shape.as_list()) @parameterized.named_parameters( ('gaussian', 'gaussian'), ('laplacian', 'laplacian'), ('other', init_ops.random_uniform_initializer)) @test_util.run_deprecated_v1 def test_call_on_placeholder(self, initializer): inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, None]) rff_layer = kernel_layers.RandomFourierFeatures( output_dim=5, kernel_initializer=initializer, name='random_fourier_features') with self.assertRaisesRegexp( ValueError, r'The last dimension of the inputs to ' '`RandomFourierFeatures` should be defined. Found `None`.'): rff_layer(inputs) inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None]) rff_layer = kernel_layers.RandomFourierFeatures( output_dim=5, kernel_initializer=initializer, name='random_fourier_features') with self.assertRaisesRegexp( ValueError, r'The last dimension of the inputs to ' '`RandomFourierFeatures` should be defined. Found `None`.'): rff_layer(inputs) inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, 3]) rff_layer = kernel_layers.RandomFourierFeatures( output_dim=5, name='random_fourier_features') rff_layer(inputs) @parameterized.named_parameters(('gaussian', 10, 'gaussian', 2.0), ('laplacian', 5, 'laplacian', None), ('other', 10, init_ops.ones_initializer, 1.0)) @test_util.run_in_graph_and_eager_modes() def test_compute_output_shape(self, output_dim, initializer, scale): rff_layer = kernel_layers.RandomFourierFeatures( output_dim, initializer, scale=scale, name='rff') with self.assertRaises(ValueError): rff_layer.compute_output_shape(tensor_shape.TensorShape(None)) with self.assertRaises(ValueError): rff_layer.compute_output_shape(tensor_shape.TensorShape([])) with self.assertRaises(ValueError): rff_layer.compute_output_shape(tensor_shape.TensorShape([3])) with self.assertRaises(ValueError): rff_layer.compute_output_shape(tensor_shape.TensorShape([3, 2, 3])) with self.assertRaisesRegexp( ValueError, r'The innermost dimension of input shape must be defined.'): rff_layer.compute_output_shape(tensor_shape.TensorShape([3, None])) self.assertEqual([None, output_dim], rff_layer.compute_output_shape((None, 3)).as_list()) self.assertEqual([None, output_dim], rff_layer.compute_output_shape( tensor_shape.TensorShape([None, 2])).as_list()) self.assertEqual([4, output_dim], rff_layer.compute_output_shape((4, 1)).as_list()) @parameterized.named_parameters( ('gaussian', 10, 'gaussian', 3.0, False), ('laplacian', 5, 'laplacian', 5.5, True), ('other', 7, init_ops.random_uniform_initializer(), None, True)) @test_util.run_in_graph_and_eager_modes() def test_get_config(self, output_dim, initializer, scale, trainable): rff_layer = kernel_layers.RandomFourierFeatures( output_dim, initializer, scale=scale, trainable=trainable, name='random_fourier_features', ) expected_initializer = initializer if isinstance(initializer, init_ops.Initializer): expected_initializer = initializers.serialize(initializer) expected_config = { 'output_dim': output_dim, 'kernel_initializer': expected_initializer, 'scale': scale, 'name': 'random_fourier_features', 'trainable': trainable, 'dtype': None, } self.assertLen(expected_config, len(rff_layer.get_config())) self.assertSameElements( list(expected_config.items()), list(rff_layer.get_config().items())) @parameterized.named_parameters( ('gaussian', 5, 'gaussian', None, True), ('laplacian', 5, 'laplacian', 5.5, False), ('other', 7, init_ops.ones_initializer(), 2.0, True)) @test_util.run_in_graph_and_eager_modes() def test_from_config(self, output_dim, initializer, scale, trainable): model_config = { 'output_dim': output_dim, 'kernel_initializer': initializer, 'scale': scale, 'trainable': trainable, 'name': 'random_fourier_features', } rff_layer = kernel_layers.RandomFourierFeatures.from_config(model_config) self.assertEqual(rff_layer.output_dim, output_dim) self.assertEqual(rff_layer.kernel_initializer, initializer) self.assertEqual(rff_layer.scale, scale) self.assertEqual(rff_layer.trainable, trainable) inputs = random_ops.random_uniform((3, 2), seed=1) outputs = rff_layer(inputs) self.assertListEqual([3, output_dim], outputs.shape.as_list()) num_trainable_vars = 1 if trainable else 0 self.assertLen(rff_layer.trainable_variables, num_trainable_vars) if trainable: self.assertEqual('random_fourier_features/random_features_scale:0', rff_layer.trainable_variables[0].name) self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars) if not context.executing_eagerly(): self.assertLen( ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES), num_trainable_vars) @parameterized.named_parameters( ('gaussian', 10, 'gaussian', 3.0, True), ('laplacian', 5, 'laplacian', 5.5, False), ('other', 10, init_ops.random_uniform_initializer(), None, True)) @test_util.run_in_graph_and_eager_modes() def test_same_random_features_params_reused(self, output_dim, initializer, scale, trainable): """Applying the layer on the same input twice gives the same output.""" rff_layer = kernel_layers.RandomFourierFeatures( output_dim=output_dim, kernel_initializer=initializer, scale=scale, trainable=trainable, name='random_fourier_features') inputs = constant_op.constant( np.random.uniform(low=-1.0, high=1.0, size=(2, 4))) output1 = rff_layer.apply(inputs) output2 = rff_layer.apply(inputs) self._assert_all_close(output1, output2) @parameterized.named_parameters( ('gaussian', 'gaussian', 5.0), ('laplacian', 'laplacian', 3.0), ('other', init_ops.random_uniform_initializer(), 5.0)) @test_util.run_in_graph_and_eager_modes() def test_different_params_similar_approximation(self, initializer, scale): random_seed.set_random_seed(12345) rff_layer1 = kernel_layers.RandomFourierFeatures( output_dim=3000, kernel_initializer=initializer, scale=scale, name='rff1') rff_layer2 = kernel_layers.RandomFourierFeatures( output_dim=2000, kernel_initializer=initializer, scale=scale, name='rff2') # Two distinct inputs. x = constant_op.constant([[1.0, -1.0, 0.5]]) y = constant_op.constant([[-1.0, 1.0, 1.0]]) # Apply both layers to both inputs. output_x1 = math.sqrt(2.0 / 3000.0) * rff_layer1.apply(x) output_y1 = math.sqrt(2.0 / 3000.0) * rff_layer1.apply(y) output_x2 = math.sqrt(2.0 / 2000.0) * rff_layer2.apply(x) output_y2 = math.sqrt(2.0 / 2000.0) * rff_layer2.apply(y) # Compute the inner products of the outputs (on inputs x and y) for both # layers. For any fixed random features layer rff_layer, and inputs x, y, # rff_layer(x)^T * rff_layer(y) ~= K(x,y) up to a normalization factor. approx_kernel1 = kernelized_utils.inner_product(output_x1, output_y1) approx_kernel2 = kernelized_utils.inner_product(output_x2, output_y2) self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.08) @parameterized.named_parameters( ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)), ('laplacian', 'laplacian', 20.0, _exact_laplacian(stddev=20.0))) @test_util.run_in_graph_and_eager_modes() def test_bad_kernel_approximation(self, initializer, scale, exact_kernel_fn): """Approximation is bad when output dimension is small.""" # Two distinct inputs. x = constant_op.constant([[1.0, -1.0, 0.5]]) y = constant_op.constant([[-1.0, 1.0, 1.0]]) small_output_dim = 10 random_seed.set_random_seed(1234) # Initialize layer. rff_layer = kernel_layers.RandomFourierFeatures( output_dim=small_output_dim, kernel_initializer=initializer, scale=scale, name='random_fourier_features') # Apply layer to both inputs. output_x = math.sqrt(2.0 / small_output_dim) * rff_layer.apply(x) output_y = math.sqrt(2.0 / small_output_dim) * rff_layer.apply(y) # The inner products of the outputs (on inputs x and y) approximates the # real value of the RBF kernel but poorly since the output dimension of the # layer is small. exact_kernel_value = exact_kernel_fn(x, y) approx_kernel_value = kernelized_utils.inner_product(output_x, output_y) abs_error = math_ops.abs(exact_kernel_value - approx_kernel_value) if not context.executing_eagerly(): with self.cached_session() as sess: keras_backend._initialize_variables(sess) abs_error_eval = sess.run([abs_error]) self.assertGreater(abs_error_eval[0][0], 0.05) self.assertLess(abs_error_eval[0][0], 0.5) else: self.assertGreater(abs_error, 0.05) self.assertLess(abs_error, 0.5) @parameterized.named_parameters( ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)), ('laplacian', 'laplacian', 10.0, _exact_laplacian(stddev=10.0))) @test_util.run_in_graph_and_eager_modes() def test_good_kernel_approximation_multiple_inputs(self, initializer, scale, exact_kernel_fn): # Parameters. input_dim = 5 output_dim = 2000 x_rows = 20 y_rows = 30 x = constant_op.constant( np.random.uniform(size=(x_rows, input_dim)), dtype=dtypes.float32) y = constant_op.constant( np.random.uniform(size=(y_rows, input_dim)), dtype=dtypes.float32) random_seed.set_random_seed(1234) rff_layer = kernel_layers.RandomFourierFeatures( output_dim=output_dim, kernel_initializer=initializer, scale=scale, name='random_fourier_features') # The shapes of output_x and output_y are (x_rows, output_dim) and # (y_rows, output_dim) respectively. output_x = math.sqrt(2.0 / output_dim) * rff_layer.apply(x) output_y = math.sqrt(2.0 / output_dim) * rff_layer.apply(y) approx_kernel_matrix = kernelized_utils.inner_product(output_x, output_y) exact_kernel_matrix = exact_kernel_fn(x, y) self._assert_all_close(approx_kernel_matrix, exact_kernel_matrix, atol=0.05)