Example #1
    def __init__(self,
                 inner_layer: Optional[tf.keras.layers.Layer] = None,
                 normalization_layer: Optional[
                     Union[tf.keras.layers.Layer,
                           Sequence[tf.keras.layers.Layer]]] = None,
                 dropout_probability: float = 0.0,
                 use_pre_activation_order: bool = False,
                 inner_intermediate_size: Optional[int] = None,
                 inner_activation='relu',
                 inner_kernel_initializer=None,
                 name: Text = 'residual_block',
                 **kwargs):
        """Init.

    Args:
      inner_layer: Keras layer to apply as the inner layer in the residual
        block. The output of the layer must have the same shape as the input. By
        default, a 2-layer fully-connected network (via `DenseLayers`) is
        created based on the `inner_...` arguments below.
      normalization_layer: Normalization layer to apply. If `inner_layer`
        expects multiple inputs/outputs, then this should be a sequence of
        layers, one for each input. By default this is initialized to a single
        `tf.keras.layers.LayerNormalization` layer, so it must be given when
        expecting multiple `inner_layer` inputs.
      dropout_probability: The probability of dropping out a value when applying
        dropout for the block.
      use_pre_activation_order: If True, use "pre-activation" order (see class
        docstring for details).
      inner_intermediate_size: Size of intermediate fully-connected layer.
        Defaults to the input layer size. Ignored if `inner_layer` is not None.
      inner_activation: Activation function for the intermediate layer. Ignored
        if `inner_layer` is not None.
      inner_kernel_initializer: Initializer to use for fully-connected kernel
        weights. Bias weights are always initialized to 0. Ignored if
        `inner_layer` is not None.
      name: Name of the layer.
      **kwargs: Forwarded to super.
    """
        super(ResidualBlock, self).__init__(name=name, **kwargs)

        if normalization_layer is None:
            normalization_layer = tf.keras.layers.LayerNormalization(
                axis=-1, epsilon=1e-12, name='layer_norm')
        if isinstance(normalization_layer, Sequence):
            normalization_layers = normalization_layer
        else:
            normalization_layers = [normalization_layer]
        # Inner layer may be created later. Assign `normalization_layers` attribute
        # first, so that the variable order remains the same regardless.
        self.normalization_layers = normalization_layers
        self.inner_layer = inner_layer
        self.dropout_probability = dropout_probability
        self.use_pre_activation_order = use_pre_activation_order
        self.inner_intermediate_size = inner_intermediate_size
        self.inner_activation = inner_activation
        self.inner_kernel_initializer = inner_kernel_initializer
        self.dropout_layers = [
            recomputing_dropout.RecomputingDropout(rate=dropout_probability)
            for _ in self.normalization_layers
        ]
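
A minimal usage sketch for this constructor (hypothetical: it assumes `ResidualBlock` is imported from the module that defines it and that, like a standard Keras layer, it accepts a `training` flag when called):

import tensorflow as tf

# Hypothetical import; use whichever module actually defines ResidualBlock.
# from some_module import ResidualBlock

# With inner_layer=None, a 2-layer fully-connected network is built from the
# inner_... arguments and a single LayerNormalization layer is used by default.
block = ResidualBlock(
    dropout_probability=0.1,
    inner_intermediate_size=128,
    inner_activation='relu')

x = tf.random.normal([2, 16, 64])
y = block(x, training=True)  # Residual output; same shape as the input.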
Example #2
    def test_force_recomputation(self):
        """Tests that an error is thrown when there is no recompute context."""
        dropout = recomputing_dropout.RecomputingDropout(
            0.4, force_recomputation=True)
        with self.assertRaises(ValueError) as assert_raises_context:
            dropout(np.random.normal(size=(2, 8)), training=True)
        self.assertContainsExactSubsequence(
            str(assert_raises_context.exception),
            'RecomputeContext is required')
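
For contrast, a minimal sketch of the call that does not raise (it mirrors the pattern in Example #4 below; `apply_dropout` is an illustrative name and the `np`/`tf` imports are the same as in the tests): with `force_recomputation=True`, the layer must run inside a function wrapped by `recompute_grad.recompute_grad`.

dropout = recomputing_dropout.RecomputingDropout(0.4, force_recomputation=True)

@recompute_grad.recompute_grad
def apply_dropout(x):
    # Runs under a RecomputeContext, so force_recomputation is satisfied.
    return dropout(x, training=True)

x = tf.convert_to_tensor(np.random.normal(size=(2, 8)), tf.float32)
with tf.GradientTape():
    y = apply_dropout(x)  # No ValueError, unlike the bare call above.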
Example #3
def make_head():
    inputs = tf.keras.Input(shape=(5,))
    x = tf.keras.layers.Dense(3,
                              activation='tanh',
                              name='dense',
                              bias_initializer='glorot_normal')(inputs)
    x = recomputing_dropout.RecomputingDropout(0.45)(x)
    outputs = {
        'head_mask': tf.cast(tf.math.not_equal(x, 0), tf.float32),
        'y': tf.reduce_sum(x),
    }
    return tf.keras.Model(inputs, outputs, name='head')
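
A short usage sketch for the model built above (nothing beyond the standard Keras functional API is assumed; eager execution, the TF2 default, makes the print meaningful):

head = make_head()
batch = tf.random.normal([4, 5])
outputs = head(batch, training=True)  # Dropout is active under training=True.
# 'y' is the scalar sum of the dropped-out activations; 'head_mask' marks the
# entries that survived dropout.
print(outputs['y'], outputs['head_mask'].shape)  # head_mask shape: (4, 3)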
Example #4
    def test_recompute_grad(self):
        """Tests that the gradient is computed correctly with recompute_grad."""
        dense = tf.keras.layers.Dense(10, input_shape=(8, ))
        dropout = recomputing_dropout.RecomputingDropout(
            0.4, force_recomputation=True)

        @recompute_grad.recompute_grad
        def recompute_dense_dropout(x):
            return dropout(dense(x), training=True)

        # Define the model using dropout.
        def f(x):
            with tf.GradientTape() as tape:
                h1 = recompute_dense_dropout(x)
                h2 = recompute_dense_dropout(x)
                y = tf.math.reduce_sum(h1 + h2)
            return (tf.cast(tf.math.not_equal(h1, 0), tf.float32),
                    tf.cast(tf.math.not_equal(h2, 0), tf.float32),
                    tape.gradient(y, dense.trainable_variables))

        x = tf.convert_to_tensor(np.random.normal(size=(4, 8)), tf.float32)
        mask1, mask2, gradients = f(x)
        self.evaluate(tf.compat.v1.initializers.global_variables())

        mask1, mask2, gradients = self.evaluate([mask1, mask2, gradients])
        # Make sure entries were masked and there is randomness.
        self.assertGreaterEqual(np.sum(mask1 == 0), 2)
        self.assertGreaterEqual(np.sum(mask2 == 0), 2)
        self.assertNotAllEqual(mask1, mask2)

        # Use the masks to compute exact gradients.
        def g(x):
            with tf.GradientTape() as tape:
                # Rescale surviving values by 1 / keep_prob,
                # i.e. 1 / (1 - 0.4) = 1 / 0.6 (inverted dropout).
                h1 = (dense(x) * mask1) / 0.6
                h2 = (dense(x) * mask2) / 0.6
                y = tf.math.reduce_sum(h1 + h2)
            return tape.gradient(y, dense.trainable_variables)

        expected_gradients = self.evaluate(g(x))
        self.assertAllClose(gradients, expected_gradients)
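
The exact match asserted above only works if `RecomputingDropout` reproduces the same mask when the forward pass is recomputed for the backward pass, which is presumably the point of the layer. A plain `tf.keras.layers.Dropout` draws a fresh mask on every call, as this small illustration of the standard layer (not of `RecomputingDropout`) shows:

plain = tf.keras.layers.Dropout(0.4)
x = tf.ones([4, 10])
a = plain(x, training=True)
b = plain(x, training=True)
# a and b will almost certainly zero out different entries, so recomputing a
# graph built on plain Dropout would not reproduce the original activations.
print(tf.reduce_all(tf.equal(a, b)))  # Usually False.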
Example #5
    def test_nested_recompute_grad(self):
        """Tests nested usage of recompute_grad."""
        dense = tf.keras.layers.Dense(5,
                                      input_shape=(8, ),
                                      bias_initializer='glorot_normal')
        dropout = recomputing_dropout.RecomputingDropout(
            0.4, force_recomputation=True)

        @recompute_grad.recompute_grad
        def recompute_dense_dropout_tower(x):
            return dropout(dense(x), training=True)

        def make_head():
            inputs = tf.keras.Input(shape=(5, ))
            x = tf.keras.layers.Dense(3,
                                      activation='tanh',
                                      name='dense',
                                      bias_initializer='glorot_normal')(inputs)
            x = recomputing_dropout.RecomputingDropout(0.45)(x)
            outputs = {
                'head_mask': tf.cast(tf.math.not_equal(x, 0), tf.float32),
                'y': tf.reduce_sum(x),
            }
            return tf.keras.Model(inputs, outputs, name='head')

        head = make_head()

        # Nest recompute_grad inside another recompute_grad function.
        @recompute_grad.recompute_grad
        def recompute_model(x):
            y1 = recompute_dense_dropout_tower(x)
            y2 = recompute_dense_dropout_tower(x)
            outputs = head(y1 + y2, training=True)
            outputs.update({
                'tower1_mask':
                tf.cast(tf.math.not_equal(y1, 0), tf.float32),
                'tower2_mask':
                tf.cast(tf.math.not_equal(y2, 0), tf.float32),
            })
            return outputs

        def f(x):
            with tf.GradientTape() as tape:
                outputs = recompute_model(x)
            outputs['gradients'] = tape.gradient(
                outputs.pop('y'),
                dense.trainable_variables + head.trainable_variables)
            return outputs

        x = tf.convert_to_tensor(np.random.normal(size=(4, 8)), tf.float32)
        outputs = f(x)
        self.evaluate(tf.compat.v1.initializers.global_variables())
        outputs = self.evaluate(outputs)

        # Verify gradients are correct.
        def g(x):
            with tf.GradientTape() as tape:
                y1 = dense(x) * outputs['tower1_mask'] / 0.6
                y2 = dense(x) * outputs['tower2_mask'] / 0.6
                y = tf.reduce_sum(
                    head.get_layer('dense')(y1 + y2) * outputs['head_mask'] /
                    0.55)
            return tape.gradient(
                y, dense.trainable_variables + head.trainable_variables)

        # Increase tolerance from default of 1e-6 to reduce flakiness.
        self.assertAllClose(outputs['gradients'],
                            self.evaluate(g(x)),
                            rtol=2e-5,
                            atol=2e-5)
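
As in Example #4, the divisors in `g` are keep probabilities rather than magic numbers: 0.6 = 1 - 0.4 for the tower dropout and 0.55 = 1 - 0.45 for the head dropout.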