Python Attention 예제들, keras.layers.dense_attention.Attention Python 예제들

예제 #1

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

    def test_multi_dim_with_key(self):
        """Checks the masked attention output when query, value and key differ."""
        # Shapes: query [1, 1, 1], value [1, 3, 1], key [1, 3, 1].
        query = np.array([[[1.1]]], dtype=np.float32)
        value = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32)
        key = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
        # Value mask of shape [1, 3]; its last entry is False.
        value_mask = np.array([[True, True, False]], dtype=np.bool_)

        layer = dense_attention.Attention()
        result = layer([query, value, key], mask=[None, value_mask])

        # Scores, shape [1, 1, 3]:
        #   [[[1.1*1.6, 1.1*0.7, -1.1*0.8]]] = [[[1.76, 0.77, -0.88]]]
        # The attention distribution is softmax(scores) with zeros wherever
        # value_mask is False:
        #   distribution[0, 0, 0] = exp(1.76) / (exp(1.76) + exp(0.77))
        #                         = 0.72908792234
        #   distribution[0, 0, 1] = exp(0.77) / (exp(1.76) + exp(0.77))
        #                         = 0.27091207765
        #   distribution[0, 0, 2] = 0
        #
        # Output, shape [1, 1, 1]:
        #   0.72908792234 * 0.5 + 0.27091207765 * 0.8 - 0 * 0.3
        #     = 0.58127362329
        want = np.array([[[0.58127362329]]], dtype=np.float32)
        self.assertAllClose(want, result)

예제 #2

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

 def test_inputs_not_list(self):
     """Calling the layer on a bare array (not a list) must raise ValueError."""
     layer = dense_attention.Attention()
     query = np.array([[[1.1]]], dtype=np.float32)
     message = 'Attention layer must be called on a list of inputs'
     with self.assertRaisesRegex(ValueError, message):
         layer(query)

예제 #3

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

 def test_inputs_too_long(self):
     """Calling the layer with four inputs must raise ValueError."""
     layer = dense_attention.Attention()
     query = np.array([[[1.1]]], dtype=np.float32)
     message = 'Attention layer accepts inputs list of length 2 or 3'
     with self.assertRaisesRegex(ValueError, message):
         layer([query, query, query, query])

예제 #4

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

 def test_mask_not_list(self):
     """Passing a bare array as `mask` (not a list) must raise ValueError."""
     layer = dense_attention.Attention()
     query = np.array([[[1.1]]], dtype=np.float32)
     bare_mask = np.array([[True]], dtype=np.bool_)
     message = 'Attention layer mask must be a list'
     with self.assertRaisesRegex(ValueError, message):
         layer([query, query], mask=bare_mask)

예제 #5

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

 def test_mask_too_long(self):
     """Passing a three-element mask list must raise ValueError."""
     layer = dense_attention.Attention()
     query = np.array([[[1.1]]], dtype=np.float32)
     single_mask = np.array([[True]], dtype=np.bool_)
     message = 'Attention layer mask must be a list of length 2'
     with self.assertRaisesRegex(ValueError, message):
         layer([query, query], mask=[single_mask, single_mask, single_mask])

예제 #6

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

 def test_scale_init_eager(self):
     """Checks that `scale` is 1 after build when use_scale=True (eager only)."""
     if not tf.executing_eagerly():
         # Outside eager mode this test is reported as skipped.
         self.skipTest('Only run in eager mode')
     layer = dense_attention.Attention(use_scale=True)
     build_shapes = ([1, 1, 1], [1, 1, 1])
     layer.build(input_shape=build_shapes)
     scale_value = layer.scale.value()
     self.assertAllClose(1., scale_value)

예제 #7

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

 def test_scale_init_graph(self):
     """Checks that `scale` is 1 after running its initializer in a session."""
     with self.cached_session() as session:
         layer = dense_attention.Attention(use_scale=True)
         build_shapes = ([1, 1, 1], [1, 1, 1])
         layer.build(input_shape=build_shapes)
         # The variable's initializer is run explicitly before it is read.
         session.run(layer.scale.initializer)
         scale_value = layer.scale.value()
         self.assertAllClose(1., scale_value)

예제 #8

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

    def test_serialization(self, use_scale):
        """Checks that `use_scale` survives both serialization round trips.

        Args:
          use_scale: Value passed to the Attention constructor and expected
            on each restored layer.
        """
        original = dense_attention.Attention(use_scale=use_scale)

        # Round trip through keras.layers.serialize / deserialize.
        serialized = keras.layers.serialize(original)
        restored = keras.layers.deserialize(serialized)
        self.assertEqual(restored.use_scale, use_scale)

        # Round trip through get_config / from_config.
        layer_config = original.get_config()
        rebuilt = dense_attention.Attention.from_config(layer_config)
        self.assertEqual(rebuilt.use_scale, use_scale)

예제 #9

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

    def test_calculate_scores_one_dim(self):
        """Checks `_calculate_scores` on single-element query and key tensors."""
        # Query and key both have shape [1, 1, 1].
        query = np.array([[[1.1]]], dtype=np.float32)
        key = np.array([[[1.6]]], dtype=np.float32)
        layer = dense_attention.Attention()
        layer.build(input_shape=([1, 1, 1], [1, 1, 1]))
        scores = layer._calculate_scores(query=query, key=key)

        # Shape [1, 1, 1]; the single entry is 1.1 * 1.6 = 1.76.
        want = np.array([[[1.76]]], dtype=np.float32)
        self.assertAllClose(want, scores)

예제 #10

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

    def test_calculate_scores_one_dim_with_scale(self):
        """Checks that `_calculate_scores` multiplies the scores by `scale`."""
        # Query and key both have shape [1, 1, 1].
        query = np.array([[[1.1]]], dtype=np.float32)
        key = np.array([[[1.6]]], dtype=np.float32)
        layer = dense_attention.Attention(use_scale=True)
        layer.build(input_shape=([1, 1, 1], [1, 1, 1]))
        # Replace the built scale with a known constant for the check below.
        layer.scale = -2.
        scores = layer._calculate_scores(query=query, key=key)

        # Shape [1, 1, 1]; the single entry is -2 * 1.1 * 1.6 = -3.52.
        want = np.array([[[-3.52]]], dtype=np.float32)
        self.assertAllClose(want, scores)

예제 #11

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

    def test_shape(self):
        """Checks that the output shape is [1, 2, 4] for the inputs below."""
        # Query of shape [1, 2, 4].
        query = np.array(
            [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
        # Value of shape [1, 3, 4].
        value = np.array(
            [[[1.5, 1.6, 1.7, 1.8],
              [2.5, 2.6, 2.7, 2.8],
              [3.5, 3.6, 3.7, 3.8]]],
            dtype=np.float32)
        # Value mask of shape [1, 3].
        value_mask = np.array([[True, True, False]], dtype=np.bool_)

        layer = dense_attention.Attention()
        result = layer([query, value], mask=[None, value_mask])

        self.assertAllEqual([1, 2, 4], tf.compat.v1.shape(result))

예제 #12

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

    def test_multi_dim_with_query_mask(self, return_attention_scores):
        """Checks attention output (and optionally scores) with both masks set.

        Args:
          return_attention_scores: Forwarded to the layer call. When truthy
            the layer result is unpacked into (output, scores) and the scores
            are checked as well.
        """
        # Query of shape [1, 2, 1] and value of shape [1, 3, 1].
        query = np.array([[[1.1], [-0.5]]], dtype=np.float32)
        value = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
        # Query mask of shape [1, 2] and value mask of shape [1, 3].
        query_mask = np.array([[True, False]], dtype=np.bool_)
        value_mask = np.array([[True, True, False]], dtype=np.bool_)

        layer = dense_attention.Attention()
        outputs = layer(
            [query, value],
            mask=[query_mask, value_mask],
            return_attention_scores=return_attention_scores)

        # Scores, shape [1, 2, 3]:
        #   [[[1.1*1.6, 1.1*0.7, -1.1*0.8], [-0.5*1.6, -0.5*0.7, 0.5*0.8]]]
        #     = [[[1.76, 0.77, -0.88], [-0.8, -0.35, 0.4]]]
        # The attention distribution is softmax(scores) with zeros wherever
        # value_mask is False:
        #   distribution[0, 0, 0] = exp(1.76) / (exp(1.76) + exp(0.77))
        #                         = 0.72908792234
        #   distribution[0, 0, 1] = exp(0.77) / (exp(1.76) + exp(0.77))
        #                         = 0.27091207765
        #   distribution[0, 0, 2] = 0
        #   distribution[0, 1, 0] = exp(-0.8) / (exp(-0.8) + exp(-0.35))
        #                         = 0.38936076605
        #   distribution[0, 1, 1] = exp(-0.35) / (exp(-0.8) + exp(-0.35))
        #                         = 0.61063923394
        #   distribution[0, 1, 2] = 0
        if return_attention_scores:
            actual, actual_scores = outputs
            want_scores = np.array(
                [[[0.72908792234, 0.27091207765, 0.],
                  [0.38936076605, 0.61063923394, 0.]]],
                dtype=np.float32)
            self.assertAllClose(want_scores, actual_scores)
        else:
            actual = outputs

        # Output, shape [1, 2, 1], with zeros where query_mask is False:
        #   expected[0, 0, 0] = 0.72908792234 * 1.6 + 0.27091207765 * 0.7
        #                       - 0 * 0.8
        #                     = 1.3561791301
        #   expected[0, 1, 0] = 0
        want = np.array([[[1.3561791301], [0.]]], dtype=np.float32)
        self.assertAllClose(want, actual)

예제 #13

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

    def test_calculate_scores_multi_dim(self):
        """Checks `_calculate_scores` on a [1, 2, 4] query and [1, 3, 4] key."""
        # Query of shape [1, 2, 4].
        query = np.array(
            [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
        # Key of shape [1, 3, 4].
        key = np.array(
            [[[1.5, 1.6, 1.7, 1.8],
              [2.5, 2.6, 2.7, 2.8],
              [3.5, 3.6, 3.7, 3.8]]],
            dtype=np.float32)
        layer = dense_attention.Attention()
        layer.build(input_shape=([1, 2, 4], [1, 3, 4]))
        scores = layer._calculate_scores(query=query, key=key)

        # Expected scores, shape [1, 2, 3]:
        #   [0, 0, 0] = 1.*1.5 + 1.1*1.6 + 1.2*1.7 + 1.3*1.8 = 7.64
        #   [0, 0, 1] = 1.*2.5 + 1.1*2.6 + 1.2*2.7 + 1.3*2.8 = 12.24
        #   [0, 0, 2] = 1.*3.5 + 1.1*3.6 + 1.2*3.7 + 1.3*3.8 = 16.84
        #   [0, 1, 0] = 2.*1.5 + 2.1*1.6 + 2.2*1.7 + 2.3*1.8 = 14.24
        #   [0, 1, 1] = 2.*2.5 + 2.1*2.6 + 2.2*2.7 + 2.3*2.8 = 22.84
        #   [0, 1, 2] = 2.*3.5 + 2.1*3.6 + 2.2*3.7 + 2.3*3.8 = 31.44
        want = np.array(
            [[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]], dtype=np.float32)
        self.assertAllClose(want, scores)

예제 #14

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

    def test_self_attention_causal(self, return_attention_scores):
        """Checks causal self-attention output (and optionally its scores).

        Args:
          return_attention_scores: Forwarded to the layer call. When truthy
            the layer result is unpacked into (output, scores) and the scores
            are checked as well.
        """
        # One tensor of shape [1, 3, 1] serves as both query and value.
        query_value = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32)
        layer = dense_attention.Attention(causal=True)
        outputs = layer(
            [query_value, query_value],
            return_attention_scores=return_attention_scores)

        # Scores, shape [1, 3, 3]:
        #   [[0.25, 0.4, -0.15], [0.4, 0.64, -0.24], [-0.15, -0.24, 0.09]]
        # The attention distribution is softmax(scores), lower triangular:
        #   row 0 = [1., 0., 0.]
        #   row 1 = [exp(0.4), exp(0.64), 0.] / (exp(0.4) + exp(0.64))
        #         = [0.44028635073, 0.55971364926, 0.]
        #   row 2 = [exp(-0.15), exp(-0.24), exp(0.09)]
        #           / (exp(-0.15) + exp(-0.24) + exp(0.09))
        #         = [0.31395396638, 0.28693232061, 0.399113713]
        if return_attention_scores:
            actual, actual_scores = outputs
            want_scores = np.array(
                [[[1., 0., 0.],
                  [0.44028635073, 0.55971364926, 0.],
                  [0.31395396638, 0.28693232061, 0.399113713]]],
                dtype=np.float32)
            self.assertAllClose(want_scores, actual_scores)
        else:
            actual = outputs

        # Output, shape [1, 3, 1]:
        #   expected[0, 0, 0] = 0.5
        #   expected[0, 1, 0] = 0.44028635073 * 0.5 + 0.55971364926 * 0.8
        #                     = 0.66791409477
        #   expected[0, 2, 0] = 0.31395396638 * 0.5 + 0.28693232061 * 0.8
        #                       - 0.399113713 * 0.3
        #                     = 0.26678872577
        want = np.array(
            [[[0.5], [0.66791409477], [0.26678872577]]], dtype=np.float32)
        self.assertAllClose(want, actual)

예제 #15

0

파일 보기

파일: layer_correctness_test.py 프로젝트: zy009197/keras

class LayerCorrectnessTest(keras_parameterized.TestCase):
    """Compares float32, mixed precision and distributed runs of many layers.

    Each parameterized case in `test_layer` supplies a layer factory and input
    shape(s); the float32 model's predictions and trained weights are the
    reference the other two models are compared against.
    """

    def setUp(self):
        """Splits the first physical CPU into two logical devices."""
        super(LayerCorrectnessTest, self).setUp()
        # Set two virtual CPUs to test MirroredStrategy with multiple devices
        cpus = tf.config.list_physical_devices('CPU')
        tf.config.set_logical_device_configuration(cpus[0], [
            tf.config.LogicalDeviceConfiguration(),
            tf.config.LogicalDeviceConfiguration(),
        ])

    def _create_model_from_layer(self, layer, input_shapes):
        """Wraps `layer` in a compiled Model fed by Input placeholders.

        Args:
          layer: The layer to call on the inputs.
          input_shapes: List of shapes, each passed as `batch_input_shape`
            to one Input.

        Returns:
          A Model compiled with the 'sgd' optimizer and 'mse' loss.
        """
        inputs = [layers.Input(batch_input_shape=s) for s in input_shapes]
        # A single input is passed to the layer bare, not as a 1-element list.
        if len(inputs) == 1:
            inputs = inputs[0]
        y = layer(inputs)
        model = models.Model(inputs, y)
        model.compile('sgd', 'mse')
        return model

    # Each entry is (test name, layer factory, input shape or list of shapes),
    # optionally followed by rtol, atol and explicit input data; these map
    # positionally onto the parameters of test_layer.
    @parameterized.named_parameters(
        ('LeakyReLU', advanced_activations.LeakyReLU, (2, 2)),
        ('PReLU', advanced_activations.PReLU, (2, 2)),
        ('ELU', advanced_activations.ELU, (2, 2)),
        ('ThresholdedReLU', advanced_activations.ThresholdedReLU, (2, 2)),
        ('Softmax', advanced_activations.Softmax, (2, 2)),
        ('ReLU', advanced_activations.ReLU, (2, 2)),
        ('Conv1D', lambda: convolutional.Conv1D(2, 2), (2, 2, 1)),
        ('Conv2D', lambda: convolutional.Conv2D(2, 2), (2, 2, 2, 1)),
        ('Conv3D', lambda: convolutional.Conv3D(2, 2), (2, 2, 2, 2, 1)),
        ('Conv2DTranspose', lambda: convolutional.Conv2DTranspose(2, 2),
         (2, 2, 2, 2)),
        ('SeparableConv2D', lambda: convolutional.SeparableConv2D(2, 2),
         (2, 2, 2, 1)),
        ('DepthwiseConv2D', lambda: convolutional.DepthwiseConv2D(2, 2),
         (2, 2, 2, 1)),
        ('UpSampling2D', convolutional.UpSampling2D, (2, 2, 2, 1)),
        ('ZeroPadding2D', convolutional.ZeroPadding2D, (2, 2, 2, 1)),
        ('Cropping2D', convolutional.Cropping2D, (2, 3, 3, 1)),
        ('ConvLSTM2D',
         lambda: convolutional_recurrent.ConvLSTM2D(4, kernel_size=(2, 2)),
         (4, 4, 4, 4, 4)),
        ('Dense', lambda: core.Dense(2), (2, 2)),
        ('Dropout', lambda: core.Dropout(0.5), (2, 2)),
        ('SpatialDropout2D', lambda: core.SpatialDropout2D(0.5), (2, 2, 2, 2)),
        ('Activation', lambda: core.Activation('sigmoid'), (2, 2)),
        ('Reshape', lambda: core.Reshape((1, 4, 1)), (2, 2, 2)),
        ('Permute', lambda: core.Permute((2, 1)), (2, 2, 2)),
        ('Attention', dense_attention.Attention, [(2, 2, 3), (2, 3, 3),
                                                  (2, 3, 3)]),
        ('AdditiveAttention', dense_attention.AdditiveAttention, [(2, 2, 3),
                                                                  (2, 3, 3),
                                                                  (2, 3, 3)]),
        ('Embedding', lambda: embeddings.Embedding(4, 4),
         (2, 4), 2e-3, 2e-3, np.random.randint(4, size=(2, 4))),
        ('LocallyConnected1D', lambda: local.LocallyConnected1D(2, 2),
         (2, 2, 1)),
        ('LocallyConnected2D', lambda: local.LocallyConnected2D(2, 2),
         (2, 2, 2, 1)),
        ('Add', merge.Add, [(2, 2), (2, 2)]),
        ('Subtract', merge.Subtract, [(2, 2), (2, 2)]),
        ('Multiply', merge.Multiply, [(2, 2), (2, 2)]),
        ('Average', merge.Average, [(2, 2), (2, 2)]),
        ('Maximum', merge.Maximum, [(2, 2), (2, 2)]),
        ('Minimum', merge.Minimum, [(2, 2), (2, 2)]),
        ('Concatenate', merge.Concatenate, [(2, 2), (2, 2)]),
        ('Dot', lambda: merge.Dot(1), [(2, 2), (2, 2)]),
        ('GaussianNoise', lambda: noise.GaussianNoise(0.5), (2, 2)),
        ('GaussianDropout', lambda: noise.GaussianDropout(0.5), (2, 2)),
        ('AlphaDropout', lambda: noise.AlphaDropout(0.5), (2, 2)),
        ('BatchNormalization', normalization_v2.BatchNormalization,
         (2, 2), 1e-2, 1e-2),
        ('LayerNormalization', normalization.LayerNormalization, (2, 2)),
        ('LayerNormalizationUnfused',
         lambda: normalization.LayerNormalization(axis=1), (2, 2, 2)),
        ('MaxPooling2D', pooling.MaxPooling2D, (2, 2, 2, 1)),
        ('AveragePooling2D', pooling.AveragePooling2D, (2, 2, 2, 1)),
        ('GlobalMaxPooling2D', pooling.GlobalMaxPooling2D, (2, 2, 2, 1)),
        ('GlobalAveragePooling2D', pooling.GlobalAveragePooling2D,
         (2, 2, 2, 1)),
        ('SimpleRNN', lambda: recurrent.SimpleRNN(units=4),
         (4, 4, 4), 1e-2, 1e-2),
        ('GRU', lambda: recurrent.GRU(units=4), (4, 4, 4)),
        ('LSTM', lambda: recurrent.LSTM(units=4), (4, 4, 4)),
        ('GRUV2', lambda: recurrent_v2.GRU(units=4), (4, 4, 4)),
        ('LSTMV2', lambda: recurrent_v2.LSTM(units=4), (4, 4, 4)),
        ('TimeDistributed', lambda: wrappers.TimeDistributed(core.Dense(2)),
         (2, 2, 2)),
        ('Bidirectional',
         lambda: wrappers.Bidirectional(recurrent.SimpleRNN(units=4)),
         (2, 2, 2)),
        ('AttentionLayerCausal',
         lambda: dense_attention.Attention(causal=True), [(2, 2, 3), (2, 3, 3),
                                                          (2, 3, 3)]),
        ('AdditiveAttentionLayerCausal',
         lambda: dense_attention.AdditiveAttention(causal=True), [(2, 3, 4),
                                                                  (2, 3, 4),
                                                                  (2, 3, 4)]),
    )
    def test_layer(self,
                   f32_layer_fn,
                   input_shape,
                   rtol=2e-3,
                   atol=2e-3,
                   input_data=None):
        """Tests a layer by comparing the float32 and mixed precision weights.

        A float32 layer, a mixed precision layer, and a distributed mixed
        precision layer are run. The three layers are identical other than
        their dtypes and distribution strategies. The outputs after predict()
        and weights after fit() are asserted to be close.

        Args:
          f32_layer_fn: A function returning a float32 layer. The other two
            layers will automatically be created from this
          input_shape: The shape of the input to the layer, including the
            batch dimension. Or a list of shapes if the layer takes multiple
            inputs.
          rtol: The relative tolerance to be asserted.
          atol: The absolute tolerance to be asserted.
          input_data: A Numpy array with the data of the input. If None, input
            data will be randomly generated
        """

        # ZeroPadding2D is not exercised on ROCm builds: the test returns
        # early (and so passes) without running anything. The reason is not
        # stated here.
        if f32_layer_fn == convolutional.ZeroPadding2D and \
           tf.test.is_built_with_rocm():
            return
        # Normalize to a list of shapes: a shape whose first element is an
        # int is a single shape rather than a list of shapes.
        if isinstance(input_shape[0], int):
            input_shapes = [input_shape]
        else:
            input_shapes = input_shape
        # create_mirrored_strategy is defined outside this class.
        strategy = create_mirrored_strategy()
        f32_layer = f32_layer_fn()

        # Create the layers
        assert f32_layer.dtype == f32_layer._compute_dtype == 'float32'
        # The two mixed precision layers are rebuilt from the float32 layer's
        # config with only the dtype entry replaced.
        config = f32_layer.get_config()
        config['dtype'] = policy.Policy('mixed_float16')
        mp_layer = f32_layer.__class__.from_config(config)
        distributed_mp_layer = f32_layer.__class__.from_config(config)

        # Compute per_replica_input_shapes for the distributed model
        global_batch_size = input_shapes[0][0]
        assert global_batch_size % strategy.num_replicas_in_sync == 0, (
            'The number of replicas, %d, does not divide the global batch size of '
            '%d' % (strategy.num_replicas_in_sync, global_batch_size))
        per_replica_batch_size = (global_batch_size //
                                  strategy.num_replicas_in_sync)
        # Only the leading (batch) dimension of each shape is replaced.
        per_replica_input_shapes = [(per_replica_batch_size, ) + s[1:]
                                    for s in input_shapes]

        # Create the models
        f32_model = self._create_model_from_layer(f32_layer, input_shapes)
        mp_model = self._create_model_from_layer(mp_layer, input_shapes)
        with strategy.scope():
            distributed_mp_model = self._create_model_from_layer(
                distributed_mp_layer, per_replica_input_shapes)

        # Set all model weights to the same values
        f32_weights = f32_model.get_weights()
        mp_model.set_weights(f32_weights)
        distributed_mp_model.set_weights(f32_weights)

        # Generate input data
        if input_data is None:
            # Cast inputs to float16 to avoid measuring error from having f16 layers
            # cast to float16.
            input_data = [
                np.random.normal(size=s).astype('float16')
                for s in input_shapes
            ]
            # Mirror _create_model_from_layer: a single input is passed bare.
            if len(input_data) == 1:
                input_data = input_data[0]

        # Assert all models have close outputs.
        f32_output = f32_model.predict(input_data)
        mp_output = mp_model.predict(input_data)
        self.assertAllClose(mp_output, f32_output, rtol=rtol, atol=atol)
        self.assertAllClose(distributed_mp_model.predict(input_data),
                            f32_output,
                            rtol=rtol,
                            atol=atol)

        # Run fit() on models
        # Random targets shaped like the float32 model's first output; the
        # same targets are used to fit all three models.
        output = np.random.normal(
            size=f32_model.outputs[0].shape).astype('float16')
        for model in f32_model, mp_model, distributed_mp_model:
            model.fit(input_data, output, batch_size=global_batch_size)

        # Assert all models have close weights
        f32_weights = f32_model.get_weights()
        self.assertAllClose(mp_model.get_weights(),
                            f32_weights,
                            rtol=rtol,
                            atol=atol)
        self.assertAllClose(distributed_mp_model.get_weights(),
                            f32_weights,
                            rtol=rtol,
                            atol=atol)

예제 #16

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

 def test_implicit_mask(self):
     """Runs attention on Masking-layer outputs without an explicit mask."""
     layer = dense_attention.Attention()
     raw_query = np.array([[[1.1], [1]]], dtype=np.float32)
     query = core.Masking(1.1)(raw_query)
     raw_value = np.array([[[1.2], [1]]], dtype=np.float32)
     value = core.Masking(1.2)(raw_value)
     result = layer([query, value])
     self.assertAllClose([[[0], [1]]], result)

예제 #17

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

 def test_override_mask(self):
     """Expects zero output when an explicit all-False mask is supplied."""
     layer = dense_attention.Attention()
     raw_query = np.array([[[1.1]]], dtype=np.float32)
     query = core.Masking()(raw_query)
     all_false = np.array([[False]], dtype=np.bool_)
     result = layer([query, query], mask=[all_false, all_false])
     self.assertAllClose([[[0]]], result)

예제 #18

0

파일 보기

파일: dense_attention_test.py 프로젝트: yule9527/keras

 def test_scale_None(self):
     """Checks that `scale` is None after build with default arguments."""
     layer = dense_attention.Attention()
     build_shapes = ([1, 1, 1], [1, 1, 1])
     layer.build(input_shape=build_shapes)
     self.assertIsNone(layer.scale)