def create_model():
    # Longformer encoder
    encoder = TFLongformerModel.from_pretrained('weights.h5')

    # QA Model - Reproducing HuggingFace like QA model architecture
    input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    #token_type_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    embedding = encoder(
        input_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(lr=LR)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model
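# Usage sketch for create_model() (illustrative, not from the original script).
# MAX_LEN and LR are module-level constants the function relies on, so the
# values below are assumptions; the 'weights.h5' checkpoint path it loads from
# is specific to the original project and would need to point at a valid
# Longformer checkpoint (or be swapped for a hub ID such as
# 'allenai/longformer-base-4096').
import tensorflow as tf

MAX_LEN = 1024   # assumed maximum sequence length
LR = 3e-5        # assumed learning rate

qa_model = create_model()
qa_model.summary()

# Dummy forward pass to check output shapes: each head returns (batch, MAX_LEN)
# softmax distributions over start/end positions.
dummy_ids = tf.zeros((2, MAX_LEN), dtype=tf.int32)
dummy_mask = tf.ones((2, MAX_LEN), dtype=tf.int32)
start_probs, end_probs = qa_model.predict([dummy_ids, dummy_mask])
print(start_probs.shape, end_probs.shape)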
def test_inference_no_head_long(self):
    model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")

    # 'Hello world! ' repeated 1000 times
    input_ids = tf.convert_to_tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=tf.dtypes.int32)

    attention_mask = tf.ones(shape_list(input_ids), dtype=tf.dtypes.int32)
    global_attention_mask = tf.zeros(shape_list(input_ids), dtype=tf.dtypes.int32)
    # Set global attention on a few random positions
    global_attention_mask = tf.tensor_scatter_nd_update(
        global_attention_mask,
        tf.constant([[0, 1], [0, 4], [0, 21]]),
        tf.constant([1, 1, 1]),
    )

    output = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)[0]

    expected_output_sum = tf.constant(74585.875)
    expected_output_mean = tf.constant(0.024267)

    # assert close
    tf.debugging.assert_near(tf.reduce_sum(output), expected_output_sum, rtol=1e-4)
    tf.debugging.assert_near(tf.reduce_mean(output), expected_output_mean, rtol=1e-4)
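# Stand-alone sketch of the same pattern the test exercises (not part of the
# test suite): run the public "allenai/longformer-base-4096" checkpoint with
# global attention on the <s> token only, building the global attention mask
# with tf.tensor_scatter_nd_update as above. Text and shapes are illustrative.
import tensorflow as tf
from transformers import LongformerTokenizer, TFLongformerModel

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")

inputs = tokenizer("Hello world! " * 100, return_tensors="tf")
global_attention_mask = tf.zeros_like(inputs["input_ids"])
# global attention on the first (<s>) token, local attention everywhere else
global_attention_mask = tf.tensor_scatter_nd_update(
    global_attention_mask, indices=[[0, 0]], updates=[1]
)

outputs = model(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    global_attention_mask=global_attention_mask,
)
print(outputs[0].shape)  # (1, seq_len, hidden_size)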
def test_layer_local_attn(self):
    model = TFLongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
    layer = model.longformer.encoder.layer[0].attention.self_attention
    hidden_states = self._get_hidden_states()

    batch_size, seq_length, hidden_size = hidden_states.shape

    # no token exceeds the global-attention threshold, so this is purely local attention
    attention_mask = tf.zeros((batch_size, seq_length), dtype=tf.dtypes.float32)
    is_index_global_attn = tf.math.greater(attention_mask, 1)
    is_global_attn = tf.math.reduce_any(is_index_global_attn)

    # mask out positions 2 and 3 with a large negative bias
    attention_mask = tf.where(
        tf.range(4)[None, :, None, None] > 1, -10000.0, attention_mask[:, :, None, None]
    )
    is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0)

    layer_head_mask = None

    output_hidden_states = layer(
        [hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn]
    )[0]

    expected_slice = tf.convert_to_tensor(
        [0.00188, 0.012196, -0.017051, -0.025571, -0.02996, 0.017297, -0.011521, 0.004848],
        dtype=tf.dtypes.float32,
    )

    self.assertEqual(output_hidden_states.shape, (1, 4, 8))
    tf.debugging.assert_near(output_hidden_states[0, 1], expected_slice, rtol=1e-3)
def __init__(
        self,
        pretrained_model_name_or_path='allenai/longformer-base-4096',
        reduce_output='cls_pooled',
        trainable=True,
        num_tokens=None,
        **kwargs
):
    super(LongformerEncoder, self).__init__()
    try:
        from transformers import TFLongformerModel
    except ModuleNotFoundError:
        logger.error(
            ' transformers is not installed. '
            'In order to install all text feature dependencies run '
            'pip install ludwig[text]'
        )
        sys.exit(-1)

    self.transformer = TFLongformerModel.from_pretrained(
        pretrained_model_name_or_path
    )
    self.reduce_output = reduce_output
    if self.reduce_output != 'cls_pooled':
        self.reduce_sequence = SequenceReducer(reduce_mode=reduce_output)
    self.transformer.trainable = trainable
    self.transformer.resize_token_embeddings(num_tokens)
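# Hedged instantiation sketch for the LongformerEncoder constructor above
# (not from the original source). The vocabulary size passed as num_tokens is
# an illustrative value; in practice it would come from the tokenizer used to
# preprocess the text feature.
encoder = LongformerEncoder(
    pretrained_model_name_or_path='allenai/longformer-base-4096',
    reduce_output='cls_pooled',
    trainable=False,      # freeze the pretrained transformer weights
    num_tokens=50265,     # assumed Longformer-base vocabulary size
)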
def test_layer_global_attn(self):
    model = TFLongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny", use_cdn=False)
    layer = model.longformer.encoder.layer[0].attention.self_attention

    # batch of two: the second example is the first shifted by -0.5
    hidden_states = tf.concat([self._get_hidden_states(), self._get_hidden_states() - 0.5], axis=0)
    batch_size, seq_length, hidden_size = hidden_states.shape

    # create attn mask
    attention_mask_1 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32)
    attention_mask_2 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32)

    attention_mask_1 = tf.where(tf.range(4)[None, None, None, :] > 1, 10000.0, attention_mask_1)
    attention_mask_1 = tf.where(tf.range(4)[None, None, None, :] > 2, -10000.0, attention_mask_1)
    attention_mask_2 = tf.where(tf.range(4)[None, None, None, :] > 0, 10000.0, attention_mask_2)
    attention_mask = tf.concat([attention_mask_1, attention_mask_2], axis=0)

    output_hidden_states = layer([hidden_states, attention_mask, None])[0]

    self.assertEqual(output_hidden_states.shape, (2, 4, 8))

    expected_slice_0 = tf.convert_to_tensor(
        [-0.06508, -0.039306, 0.030934, -0.03417, -0.00656, -0.01553, -0.02088, -0.04938],
        dtype=tf.dtypes.float32,
    )
    expected_slice_1 = tf.convert_to_tensor(
        [-0.04055, -0.038399, 0.0396, -0.03735, -0.03415, 0.01357, 0.00145, -0.05709],
        dtype=tf.dtypes.float32,
    )

    tf.debugging.assert_near(output_hidden_states[0, 2], expected_slice_0, rtol=1e-3)
    tf.debugging.assert_near(output_hidden_states[1, -2], expected_slice_1, rtol=1e-3)
def _test_TFLongformer(self, size, large=False):
    from transformers import LongformerTokenizer, TFLongformerModel
    tokenizer = LongformerTokenizer.from_pretrained(size)
    model = TFLongformerModel.from_pretrained(size)
    input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
    spec, input_dict = self.spec_and_pad(input_dict, max_length=512)
    outputs = ["last_hidden_state"]
    self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
def test_inference_no_head(self):
    model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")

    # 'Hello world!'
    input_ids = tf.convert_to_tensor([[0, 20920, 232, 328, 1437, 2]], dtype=tf.dtypes.int32)
    attention_mask = tf.ones(shape_list(input_ids), dtype=tf.dtypes.int32)

    output = model(input_ids, attention_mask=attention_mask)[0]
    output_without_mask = model(input_ids)[0]

    expected_output_slice = tf.convert_to_tensor(
        [0.0549, 0.1087, -0.1119, -0.0368, 0.0250], dtype=tf.dtypes.float32
    )

    tf.debugging.assert_near(output[0, 0, -5:], expected_output_slice, rtol=1e-3)
    tf.debugging.assert_near(output_without_mask[0, 0, -5:], expected_output_slice, rtol=1e-3)
def test_layer_local_attn(self):
    model = TFLongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny", use_cdn=False)
    layer = model.longformer.encoder.layer[0].attention.self_attention
    hidden_states = self._get_hidden_states()
    batch_size, seq_length, hidden_size = hidden_states.shape

    # mask out positions 2 and 3 with a large negative bias; no global attention
    attention_mask = tf.zeros((batch_size, 1, 1, seq_length), dtype=tf.dtypes.float32)
    attention_mask = tf.where(tf.range(4)[None, None, None, :] > 1, -10000.0, attention_mask)

    output_hidden_states = layer([hidden_states, attention_mask, None])[0]

    expected_slice = tf.convert_to_tensor(
        [0.00188, 0.012196, -0.017051, -0.025571, -0.02996, 0.017297, -0.011521, 0.004848],
        dtype=tf.dtypes.float32,
    )

    self.assertEqual(output_hidden_states.shape, (1, 4, 8))
    tf.debugging.assert_near(output_hidden_states[0, 1], expected_slice, rtol=1e-3)
def test_layer_attn_probs(self):
    model = TFLongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
    layer = model.longformer.encoder.layer[0].attention.self_attention
    hidden_states = tf.concat([self._get_hidden_states(), self._get_hidden_states() - 0.5], axis=0)
    batch_size, seq_length, hidden_size = hidden_states.shape

    # create attn mask
    attention_mask_1 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32)
    attention_mask_2 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32)

    attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 1, 10000.0, attention_mask_1)
    attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 2, -10000.0, attention_mask_1)
    attention_mask_2 = tf.where(tf.range(4)[None, :, None, None] > 0, 10000.0, attention_mask_2)
    attention_mask = tf.concat([attention_mask_1, attention_mask_2], axis=0)

    is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0)
    is_index_global_attn = tf.math.greater(attention_mask[:, :, 0, 0], 0)
    is_global_attn = tf.math.reduce_any(is_index_global_attn)

    layer_head_mask = None

    output_hidden_states, local_attentions, global_attentions = layer(
        [
            hidden_states,
            -tf.math.abs(attention_mask),
            layer_head_mask,
            is_index_masked,
            is_index_global_attn,
            is_global_attn,
        ]
    )

    self.assertEqual(local_attentions.shape, (2, 4, 2, 8))
    self.assertEqual(global_attentions.shape, (2, 2, 3, 4))

    self.assertTrue((local_attentions[0, 2:4, :, :] == 0).numpy().tolist())
    self.assertTrue((local_attentions[1, 1:4, :, :] == 0).numpy().tolist())

    # The weight of all tokens with local attention must sum to 1.
    self.assertTrue(
        (tf.math.abs(tf.math.reduce_sum(global_attentions[0, :, :2, :], axis=-1) - 1) < 1e-6).numpy().tolist()
    )
    self.assertTrue(
        (tf.math.abs(tf.math.reduce_sum(global_attentions[1, :, :1, :], axis=-1) - 1) < 1e-6).numpy().tolist()
    )

    tf.debugging.assert_near(
        local_attentions[0, 0, 0, :],
        tf.convert_to_tensor(
            [0.3328, 0.0000, 0.0000, 0.0000, 0.0000, 0.3355, 0.3318, 0.0000], dtype=tf.dtypes.float32
        ),
        rtol=1e-3,
    )
    tf.debugging.assert_near(
        local_attentions[1, 0, 0, :],
        tf.convert_to_tensor(
            [0.2492, 0.2502, 0.2502, 0.0000, 0.0000, 0.2505, 0.0000, 0.0000], dtype=tf.dtypes.float32
        ),
        rtol=1e-3,
    )

    # All the global attention weights must sum to 1.
    self.assertTrue(
        (tf.math.abs(tf.math.reduce_sum(global_attentions, axis=-1) - 1) < 1e-6).numpy().tolist()
    )

    tf.debugging.assert_near(
        global_attentions[0, 0, 1, :],
        tf.convert_to_tensor([0.2500, 0.2500, 0.2500, 0.2500], dtype=tf.dtypes.float32),
        rtol=1e-3,
    )
    tf.debugging.assert_near(
        global_attentions[1, 0, 0, :],
        tf.convert_to_tensor([0.2497, 0.2500, 0.2499, 0.2504], dtype=tf.dtypes.float32),
        rtol=1e-3,
    )
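# Related usage sketch (an assumption-labelled aside, not part of the test
# suite): comparable local and global attention probabilities can be requested
# from the full model by passing output_attentions=True at call time. The
# variable names below mirror the ones built in test_inference_no_head_long.
outputs = model(
    input_ids,
    attention_mask=attention_mask,
    global_attention_mask=global_attention_mask,
    output_attentions=True,
)
# outputs.attentions: per-layer local attention probabilities
# outputs.global_attentions: per-layer attention weights for the global tokens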