def _embed_hash_buckets(self, ids: tf.Tensor, embedding_size: int,
                        num_hashes: int, num_buckets: int,
                        initializer_range: float) -> tf.Tensor:
    """Converts IDs (e.g. codepoints) into embeddings via multiple hashing.

    For every tensor of bucket IDs returned by `self._hash_bucket_tensors`, a
    separate embedding table of shape
    `[num_buckets, embedding_size // num_hashes]` is created and looked up.
    The resulting shards are concatenated along the last axis.

    Args:
        ids: The codepoints or other IDs to be hashed.
        embedding_size: The dimensionality of the returned embeddings. Must
            be divisible by `num_hashes`.
        num_hashes: The number of hash functions to use. Must be positive.
        num_buckets: The number of hash buckets (i.e. embeddings in each
            table).
        initializer_range: Range handed to
            `bert_modeling.create_initializer` for each embedding table (a
            small float such as 0.02).

    Returns:
        The codepoint embeddings: the per-hash shard embeddings concatenated
        on the last axis.

    Raises:
        ValueError: If `num_hashes` is not positive or does not evenly divide
            `embedding_size`.
    """
    # Reject non-positive values explicitly; otherwise `num_hashes == 0`
    # would surface as an opaque ZeroDivisionError from the modulo below.
    if num_hashes <= 0:
        raise ValueError(
            f"Expected `num_hashes` ({num_hashes}) to be positive")
    if embedding_size % num_hashes != 0:
        raise ValueError(f"Expected `embedding_size` ({embedding_size}) % "
                         f"`num_hashes` ({num_hashes}) == 0")

    shard_embedding_size = embedding_size // num_hashes
    hash_bucket_tensors = self._hash_bucket_tensors(
        ids, num_hashes=num_hashes, num_buckets=num_buckets)

    embedding_shards = []
    for i, hash_bucket_ids in enumerate(hash_bucket_tensors):
        # One independent table per hash function; the index in the variable
        # name keeps the tables distinct.
        embedding_table = tf.get_variable(
            name=f"embeddings/HashBucketCodepointEmbedder_{i}",
            shape=[num_buckets, shard_embedding_size],
            initializer=bert_modeling.create_initializer(initializer_range))
        shard_embeddings = tf.nn.embedding_lookup(embedding_table,
                                                  hash_bucket_ids)
        embedding_shards.append(shard_embeddings)
    return tf.concat(embedding_shards, axis=-1)
def _chars_to_molecules(
        self, char_encoding: tf.Tensor,
        expected_molecule_seq_length: tf.Tensor) -> tf.Tensor:
    """Convert char seq to initial molecule seq.

    Downsamples the character encodings with a strided 1-D convolution
    (kernel size and stride both `config.downsampling_rate`, `config.hidden_size`
    filters), drops the last downsampled position, prepends the first
    character position ([CLS]) and applies layer normalization.

    Args:
        char_encoding: Character encodings, indexed here as
            [batch, char_seq, dim]. The last dimension is presumed to equal
            `config.hidden_size`, because the first position is concatenated
            directly with the convolution output -- TODO confirm with caller.
        expected_molecule_seq_length: Unused; accepted by contract only.

    Returns:
        The layer-normalized molecule sequence,
        [batch, molecule_seq, molecule_dim].
    """
    del expected_molecule_seq_length  # Used by contract only.

    with tf.variable_scope("initial_char_encoder/chars_to_molecules"):
        downsampled = tf.layers.conv1d(
            inputs=char_encoding,
            filters=self.config.hidden_size,
            kernel_size=self.config.downsampling_rate,
            strides=self.config.downsampling_rate,
            padding="valid",
            activation=bert_modeling.get_activation(self.config.hidden_act),
            name="conv")

        # `cls_position`: [batch, 1, molecule_dim]
        # The [CLS] position is used as-is. The previous code compared
        # `config.hidden_size` with itself before optionally projecting it,
        # so the projection branch could never run and has been removed.
        cls_position = char_encoding[:, 0:1, :]

        # Truncate the last molecule in order to reserve a position for [CLS].
        # Often, the last position is never used (unless we completely fill
        # the text buffer). This is important in order to maintain alignment
        # on TPUs (i.e. a multiple of 128).
        downsampled_truncated = downsampled[:, 0:-1, :]

        # We also keep [CLS] as a separate sequence position since we always
        # want to reserve a position (and the model capacity that goes along
        # with that) in the deep BERT stack.
        # `result`: [batch, molecule_seq, molecule_dim]
        result = tf.concat([cls_position, downsampled_truncated], axis=1)

        return bert_modeling.layer_norm(result)
def _pool(self, seq_to_pool: tf.Tensor) -> tf.Tensor:
    """Grab the [CLS] molecule for use in classification tasks.

    Reduces an encoded sequence of shape [batch_size, seq_length, hidden_size]
    to a fixed-size [batch_size, hidden_size] representation, as needed for
    segment-level (or segment-pair-level) classification.

    Args:
        seq_to_pool: The encoded sequence to pool.

    Returns:
        The first sequence position passed through a tanh-activated dense
        layer of width `config.hidden_size`.
    """
    with tf.variable_scope("pooler"):
        # "Pooling" here just means picking the hidden state of the first
        # token, which is assumed to have been pre-trained. This mirrors
        # vanilla BERT.
        cls_slice = seq_to_pool[:, 0:1, :]
        cls_vector = tf.squeeze(cls_slice, axis=1)
        kernel_init = bert_modeling.create_initializer(
            self.config.initializer_range)
        pooled = tf.layers.dense(
            cls_vector,
            self.config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=kernel_init)
        return pooled
def _molecules_to_chars(self, molecules: tf.Tensor,
                        molecule_seq_length: tf.Tensor,
                        expected_char_seq_length: tf.Tensor,
                        expected_char_dim: int) -> tf.Tensor:
    """Converts molecule seq back to a char seq.

    Delegates to `self._repeat_molecules` inside the `molecules_to_chars`
    variable scope and returns its result unchanged.

    The previous implementation guarded this return with a comparison of
    `config.hidden_size` against itself, which is always true, so the
    projection / dropout / residual / layer-norm code that followed it could
    never execute. That unreachable code has been removed; behavior is
    unchanged.

    Args:
        molecules: The molecule sequence to expand.
        molecule_seq_length: Forwarded to `_repeat_molecules` as
            `molecule_seq_length`.
        expected_char_seq_length: Forwarded to `_repeat_molecules` as
            `char_seq_length`.
        expected_char_dim: Unused; accepted by contract only.

    Returns:
        The output of `self._repeat_molecules`.
    """
    del expected_char_dim  # Used by contract only.

    with tf.variable_scope("molecules_to_chars"):
        return self._repeat_molecules(
            molecules,
            char_seq_length=expected_char_seq_length,
            molecule_seq_length=molecule_seq_length)
def local_transformer_model(input_tensor: tf.Tensor,
                            attention_mask: tf.Tensor,
                            input_kv_tensor: Optional[tf.Tensor] = None,
                            init_kv_attention_mask: Optional[tf.Tensor] = None,
                            hidden_size: int = 768,
                            num_hidden_layers: int = 12,
                            num_attention_heads: int = 12,
                            intermediate_size: int = 3072,
                            intermediate_act_fn: Optional[Text] = None,
                            hidden_dropout_prob: float = 0.1,
                            attention_probs_dropout_prob: float = 0.1,
                            initializer_range: float = 0.02,
                            do_return_all_layers: bool = False,
                            num_layers_to_update: Optional[int] = None,
                            always_attend_to_first_position: bool = True,
                            first_position_attends_to_all: bool = True,
                            attend_from_chunk_width: int = 128,
                            attend_from_chunk_stride: int = 128,
                            attend_to_chunk_width: int = 128,
                            attend_to_chunk_stride: int = 128,
                            init_attend_to_chunk_width: int = 128,
                            init_attend_to_chunk_stride: int = 128):
    """Fork of BERT's `transformer_model` that performs local attention.

    This attention is local in that attention happens only within each block
    (as defined by the length of the strides).

    Function parameters specific to local attention (i.e. added from BERT's
    `attention_layer`) are at the bottom of the argument list.

    IMPORTANT: Both `input_tensor` and `input_kv_tensor` must have a static
    sequence length dimension, such that it can be extracted as a python
    integer at graph-building time. Dynamic sequence lengths are not supported
    by `local_transformer_model` and `local_attention_layer` (because doing so
    would greatly limit XLA's ability to create a highly optimized program on
    TPU).

    Args:
        input_tensor: float Tensor of shape
            [batch_size, seq_length, hidden_size].
        attention_mask: int32 Tensor of shape
            [batch_size, seq_length, seq_length], with 1 for positions that
            can be attended to and 0 in positions that should not be.
        input_kv_tensor: float Tensor of shape
            [batch_size, seq_length_kv, seq_dim_kv]. If specified, this will
            be used for the initial layer of keys and values for
            self-attention. `input_tensor` will still be used for queries and
            resnet connections.
        init_kv_attention_mask: (optional) int32 Tensor of shape
            [batch_size, seq_length, seq_length_kv], with 1 for positions
            that can be attended to and 0 in positions that should not be.
            i.e. It indicates which items we can attend *from* in
            `input_tensor` (`seq_length`) and *to* in `input_kv_tensor`
            (`seq_length_kv`). Required whenever `input_kv_tensor` is given.
        hidden_size: int. Hidden size of the Transformer.
        num_hidden_layers: int. Number of layers (blocks) in the Transformer.
        num_attention_heads: int. Number of attention heads in the
            Transformer.
        intermediate_size: int. The size of the "intermediate" (a.k.a., feed
            forward) layer.
        intermediate_act_fn: The non-linear activation to apply to the output
            of the intermediate/feed-forward layer. Forwarded unchanged to
            `bert_modeling.dense_layer_2d`.
        hidden_dropout_prob: float. Dropout probability for the hidden layers.
        attention_probs_dropout_prob: float. Dropout probability of the
            attention probabilities.
        initializer_range: float. Range of the initializer (stddev of
            truncated normal).
        do_return_all_layers: Whether to also return all layers or just the
            final layer.
        num_layers_to_update: (optional) Number of layers to update during
            training (a `tf.stop_gradient` is applied beyond this point). This
            is useful for gradual layer unfreezing during fine-tuning to
            prevent catastrophic forgetting.
        always_attend_to_first_position: Should all blocks be able to attend
            to the `to_tensor`'s first position (e.g. a [CLS] position)?
        first_position_attends_to_all: Should the query ("from") tensor's
            first position be able to attend to all positions within the
            key-value tensor?
        attend_from_chunk_width: The width of each block-wise chunk in the
            query ("from") tensor.
        attend_from_chunk_stride: The number of elements to skip when moving
            to the next block in the query ("from") tensor.
        attend_to_chunk_width: The width of each block-wise chunk in the
            key-value ("to") tensor.
        attend_to_chunk_stride: The number of elements to skip when moving to
            the next block in the key-value ("to") tensor.
        init_attend_to_chunk_width: `attend_to_chunk_width` for first layer
            when `input_kv_tensor` is specified.
        init_attend_to_chunk_stride: `attend_to_chunk_stride` for first layer
            when `input_kv_tensor` is specified.

    Returns:
        float Tensor of shape [batch_size, seq_length, hidden_size], the
        final hidden layer of the Transformer. If `do_return_all_layers` is
        true, a list holding the output of every layer instead. When
        `num_hidden_layers` is 0, `input_tensor` itself is returned
        regardless of `do_return_all_layers`.

    Raises:
        ValueError: A Tensor shape or parameter is invalid.
    """
    # A negative layer count would otherwise fall through to an IndexError on
    # the empty output list at the very end.
    if num_hidden_layers < 0:
        raise ValueError(
            "`num_hidden_layers` (%d) must be non-negative" %
            num_hidden_layers)

    if num_hidden_layers == 0:
        return input_tensor

    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = hidden_size // num_attention_heads

    input_shape = bert_modeling.get_shape_list(input_tensor, expected_rank=3)
    # This is enforced as a static int in the contract above.
    from_seq_length: int = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError(
            "The width of the input tensor (%d) != hidden size (%d)" %
            (input_width, hidden_size))

    # Fail before building any layer: the first layer cannot mask its
    # separate key-value input without a dedicated mask.
    if input_kv_tensor is not None and init_kv_attention_mask is None:
        raise ValueError("`init_kv_attention_mask` must be specified when "
                         "`input_kv_tensor` is specified.")

    prev_output = input_tensor
    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output

            # Only the first layer may read keys/values from the separate
            # `input_kv_tensor`; every later layer is plain self-attention.
            if layer_idx == 0 and input_kv_tensor is not None:
                layer_kv_input = input_kv_tensor
                layer_attention_mask = init_kv_attention_mask
                layer_attend_to_chunk_width = init_attend_to_chunk_width
                layer_attend_to_chunk_stride = init_attend_to_chunk_stride
            else:
                layer_kv_input = layer_input
                layer_attention_mask = attention_mask
                layer_attend_to_chunk_width = attend_to_chunk_width
                layer_attend_to_chunk_stride = attend_to_chunk_stride

            to_shape = bert_modeling.get_shape_list(
                layer_kv_input, expected_rank=3)
            to_seq_length = to_shape[1]
            # Raised explicitly (rather than asserted) so the static-length
            # contract is still enforced when Python runs with `-O`.
            if not isinstance(to_seq_length, int):
                raise ValueError(
                    "The key-value sequence length must be static (a python "
                    "int), but got %r" % (to_seq_length,))

            with tf.variable_scope("attention", reuse=tf.AUTO_REUSE):
                with tf.variable_scope("self"):
                    attention_output = local_attention_layer(
                        from_tensor=layer_input,  # Queries.
                        to_tensor=layer_kv_input,
                        from_seq_length=from_seq_length,
                        to_seq_length=to_seq_length,
                        attention_mask=layer_attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=(
                            attention_probs_dropout_prob),
                        initializer_range=initializer_range,
                        # Parameters specific to local attention:
                        always_attend_to_first_position=(
                            always_attend_to_first_position),
                        first_position_attends_to_all=(
                            first_position_attends_to_all),
                        attend_from_chunk_width=attend_from_chunk_width,
                        attend_from_chunk_stride=attend_from_chunk_stride,
                        attend_to_chunk_width=layer_attend_to_chunk_width,
                        attend_to_chunk_stride=layer_attend_to_chunk_stride)

                # Run a linear projection of `hidden_size` then add a residual
                # with `layer_input`.
                with tf.variable_scope("output"):
                    attention_output = bert_modeling.dense_layer_3d_proj(
                        attention_output, hidden_size, num_attention_heads,
                        attention_head_size,
                        bert_modeling.create_initializer(initializer_range),
                        None, "dense")
                    attention_output = bert_modeling.dropout(
                        attention_output, hidden_dropout_prob)
                    attention_output = bert_modeling.layer_norm(
                        _safe_add(attention_output, layer_input))

            # The activation is only applied to the "intermediate" hidden
            # layer.
            with tf.variable_scope("intermediate"):
                intermediate_output = bert_modeling.dense_layer_2d(
                    attention_output, intermediate_size,
                    bert_modeling.create_initializer(initializer_range),
                    intermediate_act_fn, "dense")

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope("output"):
                layer_output = bert_modeling.dense_layer_2d(
                    intermediate_output, hidden_size,
                    bert_modeling.create_initializer(initializer_range),
                    None, "dense")
                layer_output = bert_modeling.dropout(layer_output,
                                                     hidden_dropout_prob)
                layer_output = bert_modeling.layer_norm(
                    _safe_add(layer_output, attention_output))

            if num_layers_to_update is not None:
                # Cut the gradient at the output of the last frozen layer so
                # that only the final `num_layers_to_update` layers train.
                num_layers_remaining = num_hidden_layers - layer_idx - 1
                if num_layers_remaining == num_layers_to_update:
                    layer_output = tf.stop_gradient(layer_output)

            prev_output = layer_output
            all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        return all_layer_outputs
    return all_layer_outputs[-1]