Example #1
    def test_inference_no_head_long(self):
        model = TFLongformerModel.from_pretrained(
            "allenai/longformer-base-4096")

        # 'Hello world! ' repeated 1000 times
        input_ids = tf.convert_to_tensor(
            [[0] + [20920, 232, 328, 1437] * 1000 + [2]],
            dtype=tf.dtypes.int32)

        attention_mask = tf.ones(shape_list(input_ids), dtype=tf.dtypes.int32)
        global_attention_mask = tf.zeros(shape_list(input_ids),
                                         dtype=tf.dtypes.int32)
        # Set global attention on a few random positions
        global_attention_mask = tf.tensor_scatter_nd_update(
            global_attention_mask, tf.constant([[0, 1], [0, 4], [0, 21]]),
            tf.constant([1, 1, 1]))

        output = model(input_ids,
                       attention_mask=attention_mask,
                       global_attention_mask=global_attention_mask)[0]

        expected_output_sum = tf.constant(74585.875)
        expected_output_mean = tf.constant(0.024267)

        # assert close
        tf.debugging.assert_near(tf.reduce_sum(output),
                                 expected_output_sum,
                                 rtol=1e-4)
        tf.debugging.assert_near(tf.reduce_mean(output),
                                 expected_output_mean,
                                 rtol=1e-4)
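
These examples rely on the `shape_list` helper from `transformers.tf_utils`. As a rough sketch of its behavior (static dimensions where known, dynamic `tf.shape` entries otherwise), assuming the same `import tensorflow as tf` context as the snippets, not the verbatim implementation:

    def shape_list(tensor):
        # Prefer static dims; fall back to dynamic ones where the static
        # shape is unknown (None), e.g. for symbolic batch dimensions.
        static = tensor.shape.as_list()
        dynamic = tf.shape(tensor)
        return [dynamic[i] if dim is None else dim
                for i, dim in enumerate(static)]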
Example #2
    def test_diagonalize(self):
        hidden_states = self._get_hidden_states()
        hidden_states = tf.reshape(
            hidden_states, (1, 8, 4))  # set seq length = 8, hidden dim = 4
        chunked_hidden_states = TFLongformerSelfAttention._chunk(
            hidden_states, window_overlap=2)
        window_overlap_size = shape_list(chunked_hidden_states)[2]
        self.assertTrue(window_overlap_size == 4)

        padded_hidden_states = TFLongformerSelfAttention._pad_and_diagonalize(
            chunked_hidden_states)

        self.assertTrue(
            shape_list(padded_hidden_states)[-1] ==
            shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1)

        # first row => [0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000]
        tf.debugging.assert_near(padded_hidden_states[0, 0, 0, :4],
                                 chunked_hidden_states[0, 0, 0],
                                 rtol=1e-3)
        tf.debugging.assert_near(padded_hidden_states[0, 0, 0, 4:],
                                 tf.zeros((3, ), dtype=tf.dtypes.float32),
                                 rtol=1e-3)

        # last row => [0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629]
        tf.debugging.assert_near(padded_hidden_states[0, 0, -1, 3:],
                                 chunked_hidden_states[0, 0, -1],
                                 rtol=1e-3)
        tf.debugging.assert_near(padded_hidden_states[0, 0, -1, :3],
                                 tf.zeros((3, ), dtype=tf.dtypes.float32),
                                 rtol=1e-3)
Example #3
    def create_and_check_model_with_global_attention_mask(
            self, config, input_ids, token_type_ids, input_mask,
            sequence_labels, token_labels, choice_labels):
        config.return_dict = True
        model = TFLongformerModel(config=config)
        half_input_mask_length = shape_list(input_mask)[-1] // 2
        global_attention_mask = tf.concat(
            [
                tf.zeros_like(input_mask)[:, :half_input_mask_length],
                tf.ones_like(input_mask)[:, half_input_mask_length:],
            ],
            axis=-1,
        )

        result = model(
            input_ids,
            attention_mask=input_mask,
            global_attention_mask=global_attention_mask,
            token_type_ids=token_type_ids,
        )
        result = model(input_ids,
                       token_type_ids=token_type_ids,
                       global_attention_mask=global_attention_mask)
        result = model(input_ids, global_attention_mask=global_attention_mask)

        self.parent.assertListEqual(
            shape_list(result.last_hidden_state),
            [self.batch_size, self.seq_length, self.hidden_size])
        self.parent.assertListEqual(shape_list(result.pooler_output),
                                    [self.batch_size, self.hidden_size])
Example #4
    def create_and_check_gpt2_model_attention_mask_past(
            self, config, input_ids, input_mask, head_mask, token_type_ids,
            *args):
        model = TFGPT2Model(config=config)

        # create attention mask
        half_seq_length = self.seq_length // 2
        attn_mask_begin = tf.ones((self.batch_size, half_seq_length),
                                  dtype=tf.int32)
        attn_mask_end = tf.zeros(
            (self.batch_size, self.seq_length - half_seq_length),
            dtype=tf.int32)
        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)

        # first forward pass
        output, past = model(input_ids, attention_mask=attn_mask).to_tuple()

        # create a hypothetical next token and append it to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # change a random masked slice from input_ids
        random_seq_idx_to_change = ids_tensor(
            (1, ), half_seq_length).numpy() + 1
        random_other_next_tokens = ids_tensor(
            (self.batch_size, self.seq_length), config.vocab_size)
        vector_condition = tf.range(
            self.seq_length) == (self.seq_length - random_seq_idx_to_change)
        condition = tf.transpose(
            tf.broadcast_to(tf.expand_dims(vector_condition, -1),
                            (self.seq_length, self.batch_size)))
        input_ids = tf.where(condition, random_other_next_tokens, input_ids)

        # append to next input_ids and attn_mask
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        attn_mask = tf.concat([
            attn_mask,
            tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)
        ],
                              axis=1)

        # get two different outputs
        output_from_no_past = model(
            next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
        output_from_past = model(next_tokens,
                                 past=past,
                                 attention_mask=attn_mask)["last_hidden_state"]

        # select random slice
        random_slice_idx = int(
            ids_tensor((1, ),
                       shape_list(output_from_past)[-1]))
        output_from_no_past_slice = output_from_no_past[:, -1,
                                                        random_slice_idx]
        output_from_past_slice = output_from_past[:, 0, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice,
                                 output_from_no_past_slice,
                                 rtol=1e-12)
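
`ids_tensor` is a transformers test-suite helper. A minimal sketch, assuming it simply draws uniform random token ids (the real helper also supports seeding):

    def ids_tensor(shape, vocab_size):
        # Random int32 tensor of token ids in [0, vocab_size)
        return tf.random.uniform(shape,
                                 minval=0,
                                 maxval=vocab_size,
                                 dtype=tf.dtypes.int32)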
Example #5
    def create_and_check_model(self, config, input_ids, token_type_ids,
                               input_mask, sequence_labels, token_labels,
                               choice_labels):
        config.return_dict = True
        model = TFLongformerModel(config=config)
        result = model(input_ids,
                       attention_mask=input_mask,
                       token_type_ids=token_type_ids)
        result = model(input_ids, token_type_ids=token_type_ids)
        result = model(input_ids)

        self.parent.assertListEqual(
            shape_list(result.last_hidden_state),
            [self.batch_size, self.seq_length, self.hidden_size])
        self.parent.assertListEqual(shape_list(result.pooler_output),
                                    [self.batch_size, self.hidden_size])
Example #6
    def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFGPT2Model(config=config)

        # first forward pass
        outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
        outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids)
        outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False)

        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)

        output, past = outputs.to_tuple()

        # create a hypothetical next token and append it to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
        next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)

        # append to next input_ids and token_type_ids
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)

        output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
        output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]

        # select random slice
        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
        output_from_past_slice = output_from_past[:, 0, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
Example #7
    def call(
        self,
        hidden_states: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        relative_position_bias: Optional[
            "TFData2VecVisionRelativePositionBias"] = None,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        batch_size = shape_list(hidden_states)[0]
        mixed_query_layer = self.query(inputs=hidden_states)
        mixed_key_layer = self.key(inputs=hidden_states)
        mixed_value_layer = self.value(inputs=hidden_states)
        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        # (batch size, num_heads, seq_len_q, seq_len_k)
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        attention_scores = attention_scores / self.sqrt_att_head_size

        # Add relative position bias if present.
        if self.relative_position_bias is not None:
            # Passing `0.0` to the `relative_position_bias()` layer because otherwise Keras
            # might complain about `Layer.call()` not being invoked properly. In this case this input
            # i.e., 0.0 is not going to be used in any calculations so we're safe.
            attention_scores = attention_scores + self.relative_position_bias(
                0.0)[None, ...]

        # Add shared relative position bias if provided.
        if relative_position_bias is not None:
            attention_scores = attention_scores + relative_position_bias

        # Normalize the attention scores to probabilities.
        attention_probs = stable_softmax(logits=attention_scores, axis=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(inputs=attention_probs,
                                       training=training)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = tf.multiply(attention_probs, head_mask)

        attention_output = tf.matmul(attention_probs, value_layer)
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])

        # (batch_size, seq_len_q, all_head_size)
        attention_output = tf.reshape(tensor=attention_output,
                                      shape=(batch_size, -1,
                                             self.all_head_size))
        outputs = (attention_output,
                   attention_probs) if output_attentions else (
                       attention_output, )

        return outputs
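
`transpose_for_scores` above is the standard multi-head reshape used across transformers TF models. A sketch, assuming the conventional `num_attention_heads` and `attention_head_size` attributes:

    def transpose_for_scores(self, tensor: tf.Tensor,
                             batch_size: int) -> tf.Tensor:
        # (batch, seq_len, all_head_size) -> (batch, seq_len, num_heads, head_size)
        tensor = tf.reshape(tensor, (batch_size, -1, self.num_attention_heads,
                                     self.attention_head_size))
        # -> (batch, num_heads, seq_len, head_size) for the batched matmuls above
        return tf.transpose(tensor, perm=[0, 2, 1, 3])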
Example #8
    def create_and_check_for_question_answering(self, config, input_ids,
                                                token_type_ids, input_mask,
                                                sequence_labels, token_labels,
                                                choice_labels):
        config.return_dict = True
        model = TFLongformerForQuestionAnswering(config=config)
        result = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            start_positions=sequence_labels,
            end_positions=sequence_labels,
        )

        self.parent.assertListEqual(shape_list(result.start_logits),
                                    [self.batch_size, self.seq_length])
        self.parent.assertListEqual(shape_list(result.end_logits),
                                    [self.batch_size, self.seq_length])
Example #9
    def create_and_check_for_sequence_classification(
            self, config, input_ids, token_type_ids, input_mask,
            sequence_labels, token_labels, choice_labels):
        config.num_labels = self.num_labels
        model = TFLongformerForSequenceClassification(config=config)
        output = model(input_ids,
                       attention_mask=input_mask,
                       token_type_ids=token_type_ids,
                       labels=sequence_labels).logits
        self.parent.assertListEqual(shape_list(output),
                                    [self.batch_size, self.num_labels])
Example #10
    def create_and_check_for_masked_lm(self, config, input_ids,
                                       token_type_ids, input_mask,
                                       sequence_labels, token_labels,
                                       choice_labels):
        config.return_dict = True
        model = TFLongformerForMaskedLM(config=config)
        result = model(input_ids,
                       attention_mask=input_mask,
                       token_type_ids=token_type_ids,
                       labels=token_labels)
        self.parent.assertListEqual(
            shape_list(result.logits),
            [self.batch_size, self.seq_length, self.vocab_size])
Example #11
    def create_and_check_gpt2_model_past_large_inputs(self, config, input_ids,
                                                      input_mask, head_mask,
                                                      token_type_ids, *args):
        model = TFGPT2Model(config=config)

        input_ids = input_ids[:1, :]
        input_mask = input_mask[:1, :]
        token_type_ids = token_type_ids[:1, :]
        self.batch_size = 1

        # first forward pass
        outputs = model(input_ids,
                        attention_mask=input_mask,
                        token_type_ids=token_type_ids,
                        use_cache=True)

        output, past = outputs.to_tuple()

        # create hypothetical next tokens and append them to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
        next_token_types = ids_tensor((self.batch_size, 3),
                                      self.type_vocab_size)

        # append to next input_ids and token_type_ids
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
        next_token_type_ids = tf.concat([token_type_ids, next_token_types],
                                        axis=-1)

        output_from_no_past = model(
            next_input_ids,
            token_type_ids=next_token_type_ids,
            attention_mask=next_attention_mask)["last_hidden_state"]
        output_from_past = model(next_tokens,
                                 token_type_ids=next_token_types,
                                 attention_mask=next_attention_mask,
                                 past=past)["last_hidden_state"]
        self.parent.assertTrue(
            output_from_past.shape[1] == next_tokens.shape[1])

        # select random slice
        random_slice_idx = int(
            ids_tensor((1, ),
                       shape_list(output_from_past)[-1]))
        output_from_no_past_slice = output_from_no_past[:, -3:,
                                                        random_slice_idx]
        output_from_past_slice = output_from_past[:, :, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice,
                                 output_from_no_past_slice,
                                 rtol=1e-3)
Example #12
    def test_pad_and_transpose_last_two_dims(self):
        hidden_states = self._get_hidden_states()
        # input is (1, 4, 8), consistent with the (1, 1, 8, 5) shape asserted below
        self.assertListEqual(shape_list(hidden_states), [1, 4, 8])

        # pad along seq length dim
        paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]],
                               dtype=tf.dtypes.int32)

        hidden_states = TFLongformerSelfAttention._chunk(hidden_states,
                                                         window_overlap=2)
        padded_hidden_states = TFLongformerSelfAttention._pad_and_transpose_last_two_dims(
            hidden_states, paddings)
        self.assertTrue(shape_list(padded_hidden_states) == [1, 1, 8, 5])

        expected_added_dim = tf.zeros((5, ), dtype=tf.dtypes.float32)
        tf.debugging.assert_near(expected_added_dim,
                                 padded_hidden_states[0, 0, -1, :],
                                 rtol=1e-6)
        tf.debugging.assert_near(hidden_states[0, 0, -1, :],
                                 tf.reshape(padded_hidden_states,
                                            (1, -1))[0, 24:32],
                                 rtol=1e-6)
Example #13
    def test_inference_no_head(self):
        model = TFLongformerModel.from_pretrained(
            "allenai/longformer-base-4096")

        # 'Hello world!'
        input_ids = tf.convert_to_tensor([[0, 20920, 232, 328, 1437, 2]],
                                         dtype=tf.dtypes.int32)
        attention_mask = tf.ones(shape_list(input_ids), dtype=tf.dtypes.int32)

        output = model(input_ids, attention_mask=attention_mask)[0]
        output_without_mask = model(input_ids)[0]

        expected_output_slice = tf.convert_to_tensor(
            [0.0549, 0.1087, -0.1119, -0.0368, 0.0250],
            dtype=tf.dtypes.float32)

        tf.debugging.assert_near(output[0, 0, -5:],
                                 expected_output_slice,
                                 rtol=1e-3)
        tf.debugging.assert_near(output_without_mask[0, 0, -5:],
                                 expected_output_slice,
                                 rtol=1e-3)
Example #14
    def test_chunk(self):
        hidden_states = self._get_hidden_states()
        batch_size = 1
        seq_length = 8
        hidden_size = 4
        hidden_states = tf.reshape(hidden_states,
                                   (batch_size, seq_length, hidden_size))

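        # _chunk yields seq_length // window_overlap - 1 = 3 overlapping
        # chunks of size 2 * window_overlap = 4, hence shape (1, 3, 4, 4)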
        chunked_hidden_states = TFLongformerSelfAttention._chunk(
            hidden_states, window_overlap=2)

        # expected slices across chunk and seq length dim
        expected_slice_along_seq_length = tf.convert_to_tensor(
            [0.4983, -0.7584, -1.6944], dtype=tf.dtypes.float32)
        expected_slice_along_chunk = tf.convert_to_tensor(
            [0.4983, -1.8348, -0.7584, 2.0514], dtype=tf.dtypes.float32)

        self.assertTrue(shape_list(chunked_hidden_states) == [1, 3, 4, 4])
        tf.debugging.assert_near(chunked_hidden_states[0, :, 0, 0],
                                 expected_slice_along_seq_length,
                                 rtol=1e-3)
        tf.debugging.assert_near(chunked_hidden_states[0, 0, :, 0],
                                 expected_slice_along_chunk,
                                 rtol=1e-3)
Example #15
    def call(self,
             pixel_values: tf.Tensor,
             training: bool = False) -> tf.Tensor:
        batch_size, num_channels, height, width = shape_list(pixel_values)
        if getattr(height, "numpy", None) and getattr(width, "numpy", None):
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]}).")

        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
        # So change the input format from `NCHW` to `NHWC`.
        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        projection = self.projection(pixel_values)

        # Change the 2D spatial dimensions to a single temporal dimension.
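        # e.g. a 224x224 input with a 16x16 patch size yields
        # (224 // 16) * (224 // 16) = 196 patches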
        # shape = (batch_size, num_patches, out_channels=embed_dim)
        num_patches = (width // self.patch_size[1]) * (height //
                                                       self.patch_size[0])

        return tf.reshape(tensor=projection,
                          shape=(batch_size, num_patches, -1))
Example #16
    def call(self,
             pixel_values: tf.Tensor,
             bool_masked_pos: Optional[tf.Tensor] = None) -> tf.Tensor:

        embeddings = self.patch_embeddings(pixel_values)
        batch_size, seq_len, projection_dim = shape_list(embeddings)

        cls_tokens = tf.tile(self.cls_token, (batch_size, 1, 1))

        if bool_masked_pos is not None:
            mask_tokens = tf.broadcast_to(
                self.mask_token, (batch_size, seq_len, projection_dim))
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos[..., None]
            w = tf.cast(w, mask_tokens.dtype)
            # since TF doesn't support eager tensor assignment
            embeddings = embeddings * (1 - w) + mask_tokens * w

        embeddings = tf.concat([cls_tokens, embeddings], axis=1)
        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings
        embeddings = self.dropout(embeddings)

        return embeddings
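
The `embeddings * (1 - w) + mask_tokens * w` blend above is the usual TF substitute for in-place assignment. A self-contained illustration with made-up shapes:

    import tensorflow as tf

    embeddings = tf.ones((1, 4, 2))                # four patch embeddings
    mask_tokens = tf.zeros((1, 4, 2))              # broadcast mask token
    bool_masked_pos = tf.constant([[1, 1, 0, 0]])  # mask the first two patches

    w = tf.cast(bool_masked_pos[..., None], embeddings.dtype)  # (1, 4, 1)
    blended = embeddings * (1 - w) + mask_tokens * w
    # rows 0 and 1 take the mask token; rows 2 and 3 keep their embeddings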