def test_embedding_lookup_1d_ids(self, use_one_hot_lookup):
  embedding_table = tf.constant([
      [1.0, -1.0],  #
      [1.1, -1.1],  #
      [1.2, -1.2],  #
      [1.3, -1.3],  #
      [1.4, -1.4],  #
  ])
  vocab_size, embedding_size = embedding_table.shape.as_list()
  input_ids = tf.constant([1, 0, 0, 3])
  input_mask = tf.constant([1, 1, 0, 1])

  layer = readtwice_layers.EmbeddingLookup(
      vocab_size=vocab_size,
      embedding_size=embedding_size,
      use_one_hot_lookup=use_one_hot_lookup)
  layer.build(None)  # Shapes are unused so we pass None.
  layer.embedding_table = embedding_table

  expected = [
      [1.1, -1.1],  #
      [1.0, -1.0],  #
      [0.0, 0.0],  #
      [1.3, -1.3],  #
  ]
  result = layer(input_ids, input_mask=input_mask)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(expected, result)
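# For reference only: a minimal sketch (hypothetical helper, not part of
# readtwice_layers) of the masked lookup behavior the test above asserts.
# Rows are gathered from the table and positions where the mask is 0 are
# zeroed out.
def _reference_masked_lookup(embedding_table, input_ids, input_mask):
  embeddings = tf.gather(embedding_table, input_ids)
  mask = tf.cast(input_mask, embeddings.dtype)[..., tf.newaxis]
  return embeddings * mask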
def test_embedding_lookup_with_projection(self):
  # Create an embedding table with width != projection_size.
  embedding_table = tf.constant([
      [1.0, -1.0, 0.5],  #
      [1.1, -1.1, -0.4],  #
      [1.2, -1.2, -0.5],  #
      [1.3, -1.3, 0.8],  #
      [1.4, -1.4, 0.9],  #
  ])
  projection_size = 2  # Different from the embedding dimension.
  vocab_size, embedding_size = embedding_table.shape.as_list()
  input_ids = tf.constant([
      [3, 2, 1],  #
      [4, 0, 4],  #
  ])
  input_mask = tf.constant([
      [1, 0, 0],  #
      [0, 0, 1],  #
  ])

  layer = readtwice_layers.EmbeddingLookup(
      vocab_size=vocab_size,
      embedding_size=embedding_size,
      projection_size=projection_size,
      use_one_hot_lookup=True)
  layer.build(None)  # Shapes are unused so we pass None.
  layer.embedding_table = embedding_table
  # Dense layer to use for projection. Note that we use a non-zero bias
  # initializer here to ensure that the bias term doesn't leak through to the
  # masked positions after projection.
  layer.embedding_projection = tf.keras.layers.Dense(
      units=projection_size,
      activation=None,
      use_bias=True,
      kernel_initializer='ones',
      bias_initializer='ones')

  expected = [
      [
          [1.8, 1.8],  # [1.3, -1.3, 0.8] * ones kernel + 1 (bias) = 1.8.
          [0., 0.],  #
          [0., 0.],  #
      ],  #
      [
          [0., 0.],  #
          [0., 0.],  #
          [1.9, 1.9],  # [1.4, -1.4, 0.9] * ones kernel + 1 (bias) = 1.9.
      ],  #
  ]
  result = layer(input_ids, input_mask)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(expected, result)
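# For reference only: a hypothetical sketch (not the library implementation)
# of the ordering the test above asserts. The gathered embeddings are
# projected to `projection_size` and the mask is applied to the projected
# output, so the projection bias cannot leak into masked positions.
def _reference_projected_lookup(embedding_table, projection, input_ids,
                                input_mask):
  embeddings = tf.gather(embedding_table, input_ids)  # [batch, seq, emb_size]
  projected = projection(embeddings)  # [batch, seq, projection_size]
  mask = tf.cast(input_mask, projected.dtype)[..., tf.newaxis]
  return projected * mask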
def test_embedding_lookup_random_init_no_mask(self, use_one_hot_lookup):
  vocab_size = 5
  embedding_size = 2
  input_ids = tf.constant([1, 0, 0, 3])
  input_size = input_ids.shape.as_list()[0]

  layer = readtwice_layers.EmbeddingLookup(
      vocab_size=vocab_size,
      embedding_size=embedding_size,
      use_one_hot_lookup=use_one_hot_lookup)

  result = layer(input_ids)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(result)
  self.assertAllEqual([input_size, embedding_size], result.shape)
def test_embedding_lookup_no_projection(self, projection_size):
  # Create an embedding table with width == projection_size.
  embedding_table = tf.constant([
      [1.0, -1.0, 0.5],  #
      [1.1, -1.1, -0.5],  #
      [1.2, -1.2, -0.2],  #
      [1.3, -1.3, 0.3],  #
      [1.4, -1.4, 0.4],  #
  ])
  vocab_size, embedding_size = embedding_table.shape.as_list()
  input_ids = tf.constant([
      [3, 2, 1],  #
      [4, 0, 4],  #
  ])

  layer = readtwice_layers.EmbeddingLookup(
      vocab_size=vocab_size,
      embedding_size=embedding_size,
      projection_size=projection_size,
      use_one_hot_lookup=True)
  layer.build(None)  # Shapes are unused so we pass None.
  layer.embedding_table = embedding_table

  expected = [
      [
          [1.3, -1.3, 0.3],  #
          [1.2, -1.2, -0.2],  #
          [1.1, -1.1, -0.5],  #
      ],  #
      [
          [1.4, -1.4, 0.4],  #
          [1.0, -1.0, 0.5],  #
          [1.4, -1.4, 0.4],  #
      ],  #
  ]
  result = layer(input_ids)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(expected, result)
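# Note: the test above passes `projection_size` as a parameter and still
# expects the raw table rows back, i.e. the output should equal
# `tf.gather(embedding_table, input_ids)` with no projection and no masking
# applied.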
def __init__(self,
             config,
             num_layers_override,
             num_cross_attention_heads,
             enable_default_side_input=False,
             use_one_hot_embeddings=False,
             name="read_it_twice_decoder",
             **kwargs):
  """Constructor for ReadItTwiceDecoderModel.

  Args:
    config: `model_config.ReadItTwiceBertConfig` instance.
    num_layers_override: int. Number of Transformer layers.
    num_cross_attention_heads: int. Number of cross-attention heads.
    enable_default_side_input: Add a default side input, which acts like a
      no-op attention, effectively allowing attention weights to sum up to
      something less than 1.
    use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
      embeddings or tf.nn.embedding_lookup() for the word embeddings.
    name: (Optional) name of the layer.
    **kwargs: Forwarded to super.

  Raises:
    ValueError: The config is invalid.
  """
  super(ReadItTwiceDecoderModel, self).__init__(name=name, **kwargs)

  self.use_one_hot_embeddings = use_one_hot_embeddings
  self.num_layers_override = num_layers_override
  self.num_cross_attention_heads = num_cross_attention_heads
  self.enable_default_side_input = enable_default_side_input

  if config.embedding_size is None:
    config = dataclasses.replace(config, embedding_size=config.hidden_size)
  self.config = config

  self.token_embedding = readtwice_layers.EmbeddingLookup(
      vocab_size=config.vocab_size,
      embedding_size=config.embedding_size,
      projection_size=config.hidden_size,
      initializer_range=config.initializer_range,
      use_one_hot_lookup=use_one_hot_embeddings,
      name="token_emb_lookup")

  self.token_embedding_norm = tf.keras.layers.LayerNormalization(
      axis=-1, epsilon=1e-12, name="emb_layer_norm")
  self.token_embedding_dropout = tf.keras.layers.Dropout(
      rate=config.hidden_dropout_prob)

  self.position_embedding = readtwice_layers.EmbeddingLookup(
      vocab_size=config.max_seq_length,
      embedding_size=config.hidden_size,
      initializer_range=config.initializer_range,
      use_one_hot_lookup=use_one_hot_embeddings,
      name="position_emb_lookup_long")
  # Call layers to force variable initialization.
  self.position_embedding(tf.ones([1, 1], tf.int32))

  self.transformer_with_side_inputs = (
      readtwice_layers.TransformerWithSideInputLayers(
          hidden_size=config.hidden_size,
          num_hidden_layers=self.num_layers_override,
          num_attention_heads=config.num_attention_heads,
          intermediate_size=config.intermediate_size,
          hidden_act=tensor_utils.get_activation(config.hidden_act),
          hidden_dropout_prob=config.hidden_dropout_prob,
          attention_probs_dropout_prob=config.attention_probs_dropout_prob,
          initializer_range=config.initializer_range,
          share_kv_projections=False,
          num_cross_attention_heads=self.num_cross_attention_heads,
          enable_default_side_input=self.enable_default_side_input))
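# A hedged usage sketch (the constructor signature is taken from above; the
# ReadItTwiceBertConfig constructor arguments are an assumption based on the
# fields referenced in __init__, not a verified API):
#
#   config = model_config.ReadItTwiceBertConfig(...)  # fields as used above
#   decoder = ReadItTwiceDecoderModel(
#       config=config,
#       num_layers_override=2,
#       num_cross_attention_heads=4,
#       enable_default_side_input=True,
#       use_one_hot_embeddings=False)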
def __init__(self,
             config,
             use_one_hot_embeddings=False,
             name="read_it_twice_bert",
             **kwargs):
  """Constructor for ReadItTwiceBertModel.

  Args:
    config: `model_config.ReadItTwiceBertConfig` instance.
    use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
      embeddings or tf.nn.embedding_lookup() for the word embeddings.
    name: (Optional) name of the layer.
    **kwargs: Forwarded to super.

  Raises:
    ValueError: The config is invalid.
  """
  super(ReadItTwiceBertModel, self).__init__(name=name, **kwargs)

  self.use_one_hot_embeddings = use_one_hot_embeddings

  if config.cross_attention_top_k is not None:
    assert config.second_read_type == "cross_attend_once"

  if config.embedding_size is None:
    config = dataclasses.replace(config, embedding_size=config.hidden_size)
  self.config = config

  self.token_embedding = readtwice_layers.EmbeddingLookup(
      vocab_size=config.vocab_size,
      embedding_size=config.embedding_size,
      projection_size=config.hidden_size,
      initializer_range=config.initializer_range,
      use_one_hot_lookup=use_one_hot_embeddings,
      name="token_emb_lookup")

  self.token_embedding_norm = tf.keras.layers.LayerNormalization(
      axis=-1, epsilon=1e-12, name="emb_layer_norm")
  self.token_embedding_dropout = tf.keras.layers.Dropout(
      rate=config.hidden_dropout_prob)

  self.position_embedding = readtwice_layers.EmbeddingLookup(
      vocab_size=config.max_seq_length,
      embedding_size=config.hidden_size,
      initializer_range=config.initializer_range,
      use_one_hot_lookup=use_one_hot_embeddings,
      name="position_emb_lookup_long")
  # Call layers to force variable initialization.
  self.position_embedding(tf.ones([1, 1], tf.int32))

  if config.cross_attention_pos_emb_mode is not None:
    # We would end up adding block position embeddings multiple times.
    assert config.summary_postprocessing_type not in ["pos", "transformer"]

  if config.second_read_type == "from_scratch":
    share_kv_projections_first_read = config.share_kv_projections
  else:
    # Summaries are not going to be used by the first read model anyway.
    share_kv_projections_first_read = True

  self.transformer_with_side_inputs = (
      readtwice_layers.TransformerWithSideInputLayers(
          hidden_size=config.hidden_size,
          num_hidden_layers=config.num_hidden_layers,
          num_attention_heads=config.num_attention_heads,
          intermediate_size=config.intermediate_size,
          hidden_act=tensor_utils.get_activation(config.hidden_act),
          hidden_dropout_prob=config.hidden_dropout_prob,
          attention_probs_dropout_prob=config.attention_probs_dropout_prob,
          initializer_range=config.initializer_range,
          share_kv_projections=share_kv_projections_first_read,
          name="transformer_layers"))
  # grad_checkpointing_period=config.grad_checkpointing_period)

  self.summary_extraction = SummaryExtraction(
      config=config, use_one_hot_embeddings=use_one_hot_embeddings)

  if config.second_read_type == "new_layers":
    if config.second_read_num_new_layers is None:
      raise ValueError("Must specify `second_read_num_new_layers` "
                       "when `second_read_type` is new_layers")
    self.second_read_transformer = (
        readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=config.hidden_size,
            num_hidden_layers=config.second_read_num_new_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_act=tensor_utils.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            share_kv_projections=config.share_kv_projections,
            name="transformer_layers"))
  elif config.second_read_type == "cross_attend_once":
    if config.second_read_num_new_layers is None:
      raise ValueError("Must specify `second_read_num_new_layers` "
                       "when `second_read_type` is cross_attend_once")
    if config.second_read_num_cross_attention_heads is None:
      raise ValueError("Must specify `second_read_num_cross_attention_heads` "
                       "when `second_read_type` is cross_attend_once")
    if config.second_read_enable_default_side_input is None:
      raise ValueError("Must specify `second_read_enable_default_side_input` "
                       "when `second_read_type` is cross_attend_once")

    self.cross_attention_layer = readtwice_layers.ResidualBlock(
        inner_layer=readtwice_layers.SideAttention(
            hidden_size=config.hidden_size,
            num_heads=config.second_read_num_cross_attention_heads,
            att_dropout_prob=0,
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=config.initializer_range),
            top_k_attention=config.cross_attention_top_k,
            pos_embed_mode=config.cross_attention_pos_emb_mode,
            pos_embed_size=config.max_num_blocks_per_document,
            use_one_hot_embeddings=use_one_hot_embeddings,
            enable_default_side_input=(
                config.second_read_enable_default_side_input)),
        dropout_probability=config.hidden_dropout_prob,
        use_pre_activation_order=False,
        name="cross_attention_layer")
    self.second_read_transformer = (
        readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=config.hidden_size,
            num_hidden_layers=config.second_read_num_new_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_act=tensor_utils.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            share_kv_projections=True,
            name="transformer_layers"))
  elif config.second_read_type == "new_layers_cross_attention":
    if config.second_read_num_new_layers is None:
      raise ValueError("Must specify `second_read_num_new_layers` "
                       "when `second_read_type` is new_layers_cross_attention")
    if config.second_read_num_cross_attention_heads is None:
      raise ValueError("Must specify `second_read_num_cross_attention_heads` "
                       "when `second_read_type` is new_layers_cross_attention")
    if config.second_read_enable_default_side_input is None:
      raise ValueError("Must specify `second_read_enable_default_side_input` "
                       "when `second_read_type` is new_layers_cross_attention")
    self.second_read_transformer = (
        readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=config.hidden_size,
            num_hidden_layers=config.second_read_num_new_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_act=tensor_utils.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            share_kv_projections=True,
            num_cross_attention_heads=(
                config.second_read_num_cross_attention_heads),
            enable_default_side_input=(
                config.second_read_enable_default_side_input),
            name="transformer_layers"))
  else:
    if config.second_read_type != "from_scratch":
      raise ValueError("Unknown `second_read_type`: '{}'".format(
          config.second_read_type))
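# Summary of the `second_read_type` values handled above (derived only from
# the checks in this constructor):
#   * "from_scratch": no separate second-read layers are created here;
#     `config.share_kv_projections` applies to the first-read transformer.
#   * "new_layers": requires `second_read_num_new_layers` and builds a new
#     transformer stack.
#   * "cross_attend_once": requires `second_read_num_new_layers`,
#     `second_read_num_cross_attention_heads`, and
#     `second_read_enable_default_side_input`; builds one SideAttention
#     cross-attention block plus a new transformer stack. This is also the
#     only mode compatible with `cross_attention_top_k`.
#   * "new_layers_cross_attention": requires the same three fields and builds
#     a new transformer stack configured with
#     `second_read_num_cross_attention_heads`.
#   Any other value raises a ValueError.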
def __init__(self,
             config,
             use_one_hot_embeddings,
             name="summary_extraction",
             **kwargs):
  """Constructor for SummaryExtraction.

  Args:
    config: `model_config.ReadItTwiceBertConfig` instance.
    use_one_hot_embeddings: bool. Whether to use one-hot word embeddings or
      tf.nn.embedding_lookup() for the word embeddings.
    name: (Optional) name of the layer.
    **kwargs: Forwarded to super.

  Raises:
    ValueError: The config is invalid.
  """
  super(SummaryExtraction, self).__init__(name=name, **kwargs)
  self.mode = config.summary_mode
  self.hidden_size = config.hidden_size
  self.postprocessing_type = config.summary_postprocessing_type
  self.use_sparse_memory_attention = config.use_sparse_memory_attention

  self.embedding_norm = None
  if self.mode == "cls":
    pass
  elif self.mode == "text_block":
    self.text_block_extract_every_x = config.text_block_extract_every_x
    assert self.text_block_extract_every_x is not None
    self.extraction_linear = tf.keras.layers.Dense(
        config.hidden_size,
        activation=None,
        kernel_initializer=tf.truncated_normal_initializer(
            stddev=config.initializer_range),
        name="entity_pool_linear")
  elif self.mode == "entity":
    self.extraction_linear = tf.keras.layers.Dense(
        config.hidden_size,
        activation=None,
        kernel_initializer=tf.truncated_normal_initializer(
            stddev=config.initializer_range),
        name="entity_pool_linear")
  else:
    raise ValueError("Unknown summary mode: {}".format(self.mode))

  if self.postprocessing_type == "none":
    self.postprocessing = None
  elif self.postprocessing_type == "linear":
    self.postprocessing = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.tanh,
        kernel_initializer=tf.truncated_normal_initializer(
            stddev=config.initializer_range),
        name="cls_pool")
  elif self.postprocessing_type in ["pos", "transformer"]:
    self.position_embedding = readtwice_layers.EmbeddingLookup(
        vocab_size=config.max_num_blocks_per_document,
        embedding_size=config.hidden_size,
        initializer_range=config.initializer_range,
        use_one_hot_lookup=use_one_hot_embeddings,
        name="block_position_emb_lookup")
    # Call layers to force variable initialization.
    self.position_embedding(tf.ones([1, 1], tf.int32))
    self.embedding_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name="summary_emb_layer_norm")
    self.embedding_dropout = tf.keras.layers.Dropout(
        rate=config.hidden_dropout_prob)

    if self.postprocessing_type == "transformer":
      if config.summary_postprocessing_num_layers is None:
        raise ValueError("Must specify `summary_postprocessing_num_layers` "
                         "when `summary_postprocessing_type` is "
                         "\"transformer\"")
      self.postprocessing = readtwice_layers.TransformerWithSideInputLayers(
          hidden_size=config.hidden_size,
          num_hidden_layers=config.summary_postprocessing_num_layers,
          num_attention_heads=config.num_attention_heads,
          intermediate_size=config.intermediate_size,
          hidden_act=tensor_utils.get_activation(config.hidden_act),
          hidden_dropout_prob=config.hidden_dropout_prob,
          attention_probs_dropout_prob=config.attention_probs_dropout_prob,
          initializer_range=config.initializer_range,
          share_kv_projections=True)
  else:
    raise ValueError("Unknown summary postprocessing type: {}".format(
        self.postprocessing_type))
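# Summary of the configuration handled above (derived only from this
# constructor):
#   * `config.summary_mode`: "cls" (no extra variables), "text_block"
#     (requires `text_block_extract_every_x` and adds a linear extraction
#     layer), or "entity" (adds a linear extraction layer).
#   * `config.summary_postprocessing_type`: "none", "linear" (tanh dense
#     layer), "pos" (block position embeddings with layer norm and dropout),
#     or "transformer" (the same plus a transformer stack, which requires
#     `summary_postprocessing_num_layers`).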