def embedding_lookup(self, input_ids, vocab_size, batch_size, max_seq_length,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name='word_embeddings',
                     dtype=tf.float32,
                     trainable=True,
                     tilda_embeddings=None):
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    if tilda_embeddings is not None:
        embedding_table = tilda_embeddings
    else:
        embedding_table = tf.get_variable(
            name=word_embedding_name,
            shape=[vocab_size, embedding_size],
            initializer=util.create_initializer(initializer_range),
            dtype=dtype,
            trainable=trainable)

    flat_input_ids = tf.reshape(input_ids, [-1])
    output = tf.gather(
        embedding_table, flat_input_ids, name='embedding_look_up')
    output = tf.reshape(
        output, [batch_size, max_seq_length, embedding_size])

    return (output, embedding_table)
def einsum_via_matmul(input_tensor, w, num_inner_dims):
    """Implements einsum via matmul and reshape ops.

    Args:
      input_tensor: float Tensor of shape [<batch_dims>, <inner_dims>].
      w: float Tensor of shape [<inner_dims>, <outer_dims>].
      num_inner_dims: int. number of dimensions to use for inner products.

    Returns:
      float Tensor of shape [<batch_dims>, <outer_dims>].
    """
    input_shape = util.get_shape_list(input_tensor)
    w_shape = util.get_shape_list(w)
    batch_dims = input_shape[:-num_inner_dims]
    inner_dims = input_shape[-num_inner_dims:]
    outer_dims = w_shape[num_inner_dims:]
    inner_dim = np.prod(inner_dims)
    outer_dim = np.prod(outer_dims)

    if num_inner_dims > 1:
        input_tensor = tf.reshape(input_tensor, batch_dims + [inner_dim])
    if len(w_shape) > 2:
        w = tf.reshape(w, [inner_dim, outer_dim])
    ret = tf.matmul(input_tensor, w)
    if len(outer_dims) > 1:
        ret = tf.reshape(ret, batch_dims + outer_dims)
    return ret
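# Usage sketch for einsum_via_matmul (hypothetical shapes, assuming the `util`
# helpers above are importable): projecting a pooled [batch, hidden] tensor
# through a 3D kernel with the matmul/reshape path reproduces
# tf.einsum('BH,HND->BND', pooled, w3d).
pooled = tf.ones([8, 768])           # [batch, hidden_size]
w3d = tf.ones([768, 12, 64])         # [hidden_size, num_heads, head_size]
per_head = einsum_via_matmul(pooled, w3d, num_inner_dims=1)
# per_head: [8, 12, 64]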
def gather_positions(sequence, positions):
    '''Gathers the vectors at the specific positions over a minibatch.

    Args:
      sequence: A [batch_size, seq_length] or
        [batch_size, seq_length, depth] tensor of values
      positions: A [batch_size, n_positions] tensor of indices

    Returns:
      A [batch_size, n_positions] or [batch_size, n_positions, depth]
      tensor of the values at the indices
    '''
    shape = util.get_shape_list(sequence, expected_rank=[2, 3])
    depth_dimension = (len(shape) == 3)
    if depth_dimension:
        B, L, D = shape
    else:
        B, L = shape
        D = 1
        sequence = tf.expand_dims(sequence, -1)

    position_shift = tf.expand_dims(L * tf.range(B), -1)
    flat_positions = tf.reshape(positions + position_shift, [-1])
    flat_sequence = tf.reshape(sequence, [B * L, D])
    gathered = tf.gather(flat_sequence, flat_positions)

    if depth_dimension:
        return tf.reshape(gathered, [B, -1, D])
    else:
        return tf.reshape(gathered, [B, -1])
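# Usage sketch for gather_positions (hypothetical shapes): pull the encoder
# outputs at the masked positions so that only those tokens are scored by an
# MLM-style head.
sequence_output = tf.ones([8, 128, 768])             # [batch, seq_len, depth]
masked_positions = tf.constant([[3, 17, 42]] * 8)     # [batch, n_positions]
masked_states = gather_positions(sequence_output, masked_positions)
# masked_states: [8, 3, 768]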
def crf_unary_score(tag_indices, sequence_lengths, inputs):
    '''Computes the unary scores of tag sequences.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials.

    Returns:
      unary_scores: A [batch_size] vector of unary scores.
    '''
    batch_size = tf.shape(inputs)[0]
    max_seq_len = tf.shape(inputs)[1]
    num_tags = tf.shape(inputs)[2]

    flattened_inputs = tf.reshape(inputs, [-1])

    offsets = tf.expand_dims(
        tf.range(batch_size) * max_seq_len * num_tags, 1)
    offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0)
    # Use int32 or int64 based on tag_indices' dtype.
    if tag_indices.dtype == tf.int64:
        offsets = tf.cast(offsets, tf.int64)
    flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1])

    unary_scores = tf.reshape(
        tf.gather(flattened_inputs, flattened_tag_indices),
        [batch_size, max_seq_len])

    masks = tf.sequence_mask(sequence_lengths,
                             maxlen=tf.shape(tag_indices)[1],
                             dtype=tf.float32)

    unary_scores = tf.reduce_sum(unary_scores * masks, 1)
    return unary_scores
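# Worked example for crf_unary_score (hypothetical values): with all emission
# potentials equal to 1.0, each example's unary score is just its true length.
emissions = tf.ones([2, 5, 4])                  # [batch, max_seq_len, num_tags]
gold_tags = tf.constant([[0, 1, 2, 0, 0],
                         [3, 3, 0, 0, 0]])       # [batch, max_seq_len]
lengths = tf.constant([4, 2])                    # true sequence lengths
unary = crf_unary_score(gold_tags, lengths, emissions)   # [4.0, 2.0]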
def __init__(self,
             is_training,
             input_tensor,
             label_ids,
             sample_weight=None,
             scope='mrc',
             name='',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[2, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[2],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, 2])
        logits = tf.transpose(logits, [0, 2, 1])
        probs = tf.nn.softmax(logits, axis=-1, name='probs')
        self.probs[name] = probs

        start_one_hot_labels = tf.one_hot(
            label_ids[:, 0], depth=seq_length, dtype=tf.float32)
        end_one_hot_labels = tf.one_hot(
            label_ids[:, 1], depth=seq_length, dtype=tf.float32)
        start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
        end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
        per_example_loss = (
            -0.5 * tf.reduce_sum(
                start_one_hot_labels * start_log_probs, axis=-1)
            - 0.5 * tf.reduce_sum(
                end_one_hot_labels * end_log_probs, axis=-1))
        if sample_weight is not None:
            per_example_loss *= sample_weight

        self.total_loss = tf.reduce_mean(per_example_loss)
        self.losses[name] = per_example_loss

        start_preds = tf.expand_dims(
            tf.argmax(logits[:, 0, :], axis=-1), axis=-1)
        end_preds = tf.expand_dims(
            tf.argmax(logits[:, 1, :], axis=-1), axis=-1)
        self.preds[name] = tf.concat([start_preds, end_preds], axis=-1)
def __init__(self,
             is_training,
             input_tensor,
             input_mask,
             label_ids,
             label_size=2,
             sample_weight=None,
             scope='cls/sequence',
             name='',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    batch_size = tf.shape(input_tensor)[0]
    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, label_size])

        self.preds[name] = tf.argmax(logits, axis=-1)
        self.probs[name] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(
            label_ids, depth=label_size, dtype=tf.float32)
        per_token_losses = -tf.reduce_mean(
            one_hot_labels * log_probs, axis=-1)

        input_mask = tf.concat([
            tf.zeros((batch_size, 1), dtype=tf.float32),
            tf.cast(input_mask[:, 2:], dtype=tf.float32),
            tf.zeros((batch_size, 1), dtype=tf.float32),
        ], axis=-1)
        per_token_losses *= input_mask
        per_example_loss = tf.reduce_mean(per_token_losses, axis=-1)
        if sample_weight is not None:
            per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

        self.losses[name] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def scatter_update(sequence, updates, positions):
    '''Scatter-update a sequence.

    Args:
      sequence: A [batch_size, seq_len] or [batch_size, seq_len, depth] tensor
      updates: A tensor of size batch_size*seq_len(*depth)
      positions: A [batch_size, n_positions] tensor

    Returns:
      A tuple of two tensors. First is a [batch_size, seq_len] or
      [batch_size, seq_len, depth] tensor of 'sequence' with elements at
      'positions' replaced by the values at 'updates'. Updates to index 0 are
      ignored. If there are duplicated positions the update is only applied
      once. Second is a [batch_size, seq_len] mask tensor of which inputs were
      updated.
    '''
    shape = util.get_shape_list(sequence, expected_rank=[2, 3])
    depth_dimension = (len(shape) == 3)
    if depth_dimension:
        B, L, D = shape
    else:
        B, L = shape
        D = 1
        sequence = tf.expand_dims(sequence, -1)
    N = util.get_shape_list(positions)[1]

    shift = tf.expand_dims(L * tf.range(B), -1)
    flat_positions = tf.reshape(positions + shift, [-1, 1])
    flat_updates = tf.reshape(updates, [-1, D])
    updates = tf.scatter_nd(flat_positions, flat_updates, [B * L, D])
    updates = tf.reshape(updates, [B, L, D])

    flat_updates_mask = tf.ones([B * N], tf.int32)
    updates_mask = tf.scatter_nd(flat_positions, flat_updates_mask, [B * L])
    updates_mask = tf.reshape(updates_mask, [B, L])
    not_first_token = tf.concat(
        [tf.zeros((B, 1), tf.int32), tf.ones((B, L - 1), tf.int32)], -1)
    updates_mask *= not_first_token
    updates_mask_3d = tf.expand_dims(updates_mask, -1)

    # account for duplicate positions
    if sequence.dtype == tf.float32:
        updates_mask_3d = tf.cast(updates_mask_3d, tf.float32)
        updates /= tf.maximum(1.0, updates_mask_3d)
    else:
        assert sequence.dtype == tf.int32
        updates = tf.divide(updates, tf.maximum(1, updates_mask_3d))
        updates = tf.cast(updates, tf.int32)
    updates_mask = tf.minimum(updates_mask, 1)
    updates_mask_3d = tf.minimum(updates_mask_3d, 1)

    updated_sequence = (((1 - updates_mask_3d) * sequence) +
                        (updates_mask_3d * updates))
    if not depth_dimension:
        updated_sequence = tf.squeeze(updated_sequence, -1)

    return updated_sequence, updates_mask
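# Usage sketch for scatter_update (hypothetical token ids): overwrite the
# sampled positions of an int32 id sequence with a [MASK] id, as in
# ELECTRA-style corruption. Position 0 is left untouched by design.
token_ids = tf.constant([[101, 7592, 2088, 102]])     # [batch=1, seq_len=4]
mask_positions = tf.constant([[1, 2]])                 # [batch, n_positions]
mask_token_ids = tf.fill([1, 2], 103)                  # updates for each position
masked_ids, updated = scatter_update(token_ids, mask_token_ids, mask_positions)
# masked_ids: [[101, 103, 103, 102]], updated: [[0, 1, 1, 0]]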
def embedding_postprocessor(self,
                            input_tensor,
                            position_ids,
                            batch_size,
                            max_seq_length,
                            hidden_size,
                            use_token_type=False,
                            segment_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name='token_type_embeddings',
                            use_position_embeddings=True,
                            position_embedding_name='position_embeddings',
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            dtype=tf.float32,
                            trainable=True):
    output = input_tensor

    if use_token_type:
        if segment_ids is None:
            raise ValueError(
                'segment_ids must be specified if use_token_type is True.')
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            dtype=dtype,
            trainable=trainable)

        # This vocab will be small so we always do one-hot here,
        # since it is always faster for a small vocabulary.
        flat_segment_ids = tf.reshape(segment_ids, [-1])
        one_hot_ids = tf.one_hot(
            flat_segment_ids, depth=token_type_vocab_size, dtype=dtype)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(
            token_type_embeddings,
            [batch_size, max_seq_length, hidden_size])
        output += token_type_embeddings

    if use_position_embeddings:
        full_position_embeddings = tf.get_variable(
            name=position_embedding_name,
            shape=[max_position_embeddings, hidden_size],
            initializer=util.create_initializer(initializer_range),
            dtype=dtype,
            trainable=trainable)
        output += tf.gather(full_position_embeddings, position_ids)

    output = util.layer_norm_and_dropout(
        output, dropout_prob, trainable=trainable)
    return output
def rel_shift(x, klen=-1):
    '''perform relative shift to form the relative attention score.'''
    x_size = tf.shape(x)

    x = tf.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]])
    x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
    x = tf.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]])
    x = tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1])

    return x
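# Shape sketch for rel_shift (hypothetical sizes): as in XLNet, attention
# scores computed against relative position embeddings are re-aligned so that
# each query row is indexed by key position rather than by distance, then cut
# back to `klen` keys.
rel_scores = tf.ones([16, 32, 8, 12])    # [qlen, rlen, batch, heads], rlen > klen
aligned = rel_shift(rel_scores, klen=16)
# aligned: [16, 16, 8, 12]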
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name='word_embeddings',
                     use_one_hot_embeddings=False):
    '''Looks up word embeddings for an id tensor.

    Args:
      input_ids: int32 Tensor of shape [batch_size, seq_length] containing
        word ids.
      vocab_size: int. Size of the embedding vocabulary.
      embedding_size: int. Width of the word embeddings.
      initializer_range: float. Embedding initialization range.
      word_embedding_name: string. Name of the embedding table.
      use_one_hot_embeddings: bool. If True, use one-hot method for word
        embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is
        better for TPUs.

    Returns:
      float Tensor of shape [batch_size, seq_length, embedding_size] and the
      embedding table.
    '''
    # This function assumes that the input is of shape
    # [batch_size, seq_length, num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    original_dims = input_ids.shape.ndims
    if original_dims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=util.create_initializer(initializer_range))

    if original_dims == 3:
        input_shape = util.get_shape_list(input_ids)
        input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        output = tf.matmul(input_ids, embedding_table)
        output = tf.reshape(
            output, [input_shape[0], input_shape[1], embedding_size])
    else:
        if use_one_hot_embeddings:
            flat_input_ids = tf.reshape(input_ids, [-1])
            one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
            output = tf.matmul(one_hot_input_ids, embedding_table)
        else:
            output = tf.nn.embedding_lookup(embedding_table, input_ids)

        input_shape = util.get_shape_list(input_ids)
        output = tf.reshape(
            output,
            input_shape[0:-1] + [input_shape[-1] * embedding_size])

    return output, embedding_table
def _forward(target_ids, target_mask, target_max_seq_length):

    with tf.variable_scope('decoder'):

        # shared embedding
        dec = tf.nn.embedding_lookup(embedding_table, target_ids)
        dec *= hidden_size ** 0.5   # scale
        dec += positional_encoding(dec, target_max_seq_length)
        dec = util.dropout(dec, dropout_rate)

        # blocks
        for i in range(num_blocks):
            with tf.variable_scope('block_%s' % i):

                # masked self-attention
                dec = multihead_attention(
                    queries=dec,
                    keys=dec,
                    values=dec,
                    key_masks=target_mask,
                    num_heads=num_attention_heads,
                    dropout_rate=dropout_rate,
                    training=is_training,
                    causality=True,
                    scope='masked_self_attention')

                # vanilla attention
                dec = multihead_attention(
                    queries=dec,
                    keys=memory,
                    values=memory,
                    key_masks=source_mask,
                    num_heads=num_attention_heads,
                    dropout_rate=dropout_rate,
                    training=is_training,
                    causality=False,
                    scope='vanilla_attention')

                # feed forward
                dec = ff(dec, num_units=[4 * hidden_size, hidden_size])

    # final linear projection (embedding weights are shared)
    with tf.variable_scope('cls'):
        output_bias = tf.get_variable(
            'output_bias',
            shape=[vocab_size],
            initializer=tf.zeros_initializer())
        dec = tf.reshape(dec, [-1, hidden_size])
        logits = tf.matmul(dec, embedding_table, transpose_b=True)
        logits = tf.reshape(
            logits, [-1, target_max_seq_length, vocab_size])
        logits = tf.nn.bias_add(logits, output_bias)

    return logits
def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    with tf.variable_scope(scope):
        *start, nx = shape_list(x)
        w = tf.get_variable(
            'w', [1, nx, nf],
            initializer=tf.random_normal_initializer(stddev=w_init_stdev))
        b = tf.get_variable(
            'b', [nf], initializer=tf.constant_initializer(0))
        c = tf.reshape(
            tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b,
            start + [nf])
        return c
def __init__(self,
             is_training,
             input_tensor,
             input_mask,
             label_ids,
             label_size=5,
             sample_weight=None,
             scope='cls/sequence',
             name='',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, label_size])

        with tf.variable_scope('crf'):
            input_length = tf.reduce_sum(input_mask, axis=-1)
            per_example_loss, transition_matrix = \
                contrib.crf.crf_log_likelihood(
                    inputs=logits,
                    tag_indices=label_ids,
                    sequence_lengths=input_length)
            per_example_loss = -per_example_loss
            if sample_weight is not None:
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

            self.total_loss = tf.reduce_mean(per_example_loss)
            self.losses[name] = per_example_loss
            self.preds[name] = tf.argmax(logits, axis=-1)
            self.probs['logits'] = logits
            self.probs['transition_matrix'] = transition_matrix
def gather_indexes(sequence_tensor, positions):
    sequence_shape = util.get_shape_list(sequence_tensor, 3)
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]

    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(
        sequence_tensor, [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
    return output_tensor
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   head_size,
                   initializer,
                   activation,
                   use_einsum,
                   name=None,
                   trainable=True):
    """A dense layer with 3D kernel.

    Args:
      input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
      num_attention_heads: Number of attention heads.
      head_size: The size per attention head.
      initializer: Kernel initializer.
      activation: Activation function.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.
      name: The name scope of this layer.

    Returns:
      float logits Tensor.
    """
    input_shape = util.get_shape_list(input_tensor)
    hidden_size = input_shape[2]

    with tf.variable_scope(name):
        w = tf.get_variable(
            name="kernel",
            shape=[hidden_size, num_attention_heads * head_size],
            initializer=initializer,
            trainable=trainable)
        w = tf.reshape(w, [hidden_size, num_attention_heads, head_size])
        b = tf.get_variable(
            name="bias",
            shape=[num_attention_heads * head_size],
            initializer=tf.zeros_initializer,
            trainable=trainable)
        b = tf.reshape(b, [num_attention_heads, head_size])

        if use_einsum:
            ret = tf.einsum("BFH,HND->BFND", input_tensor, w)
        else:
            ret = einsum_via_matmul(input_tensor, w, 1)
        ret += b

    if activation is not None:
        return activation(ret)
    else:
        return ret
def crf_binary_score(tag_indices, sequence_lengths, transition_params):
    '''Computes the binary scores of tag sequences.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: A [num_tags, num_tags] matrix of binary potentials.

    Returns:
      binary_scores: A [batch_size] vector of binary scores.
    '''
    # Get shape information.
    num_tags = transition_params.get_shape()[0]
    num_transitions = tf.shape(tag_indices)[1] - 1

    # Truncate by one on each side of the sequence to get the start and end
    # indices of each transition.
    start_tag_indices = tf.slice(
        tag_indices, [0, 0], [-1, num_transitions])
    end_tag_indices = tf.slice(
        tag_indices, [0, 1], [-1, num_transitions])

    # Encode the indices in a flattened representation.
    flattened_transition_indices = \
        start_tag_indices * num_tags + end_tag_indices
    flattened_transition_params = tf.reshape(transition_params, [-1])

    # Get the binary scores based on the flattened representation.
    binary_scores = tf.gather(
        flattened_transition_params, flattened_transition_indices)

    masks = tf.sequence_mask(sequence_lengths,
                             maxlen=tf.shape(tag_indices)[1],
                             dtype=tf.float32)
    truncated_masks = tf.slice(masks, [0, 1], [-1, -1])
    binary_scores = tf.reduce_sum(binary_scores * truncated_masks, 1)
    return binary_scores
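# Worked example for crf_binary_score (hypothetical values): for the tag path
# 0 -> 1 -> 2 the two transitions contribute
# transitions[0, 1] + transitions[1, 2] = 1.0 + 12.0 = 13.0.
transitions = tf.constant([[0., 1., 2.],
                           [10., 11., 12.],
                           [20., 21., 22.]])     # [num_tags, num_tags]
gold_tags = tf.constant([[0, 1, 2]])             # [batch=1, max_seq_len=3]
lengths = tf.constant([3])
binary = crf_binary_score(gold_tags, lengths, transitions)   # [13.0]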
def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                         max_seq_length, width):
    output_tensor = tf.reshape(
        input_tensor,
        [batch_size, max_seq_length, num_attention_heads, width])
    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor
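# Shape sketch for transpose_for_scores (hypothetical sizes): split the hidden
# axis into attention heads and move the head axis forward so per-head scores
# can be computed with a single batched matmul.
query_layer = tf.ones([8, 128, 768])      # [batch, seq_len, heads * head_width]
query_layer = transpose_for_scores(query_layer, 8, 12, 128, 64)
# query_layer: [8, 12, 128, 64] = [batch, heads, seq_len, head_width]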
def create_attention_mask_from_input_mask(from_tensor, to_mask):
    '''Create 3D attention mask from a 2D tensor mask.

    Args:
      from_tensor: 2D or 3D Tensor of shape
        [batch_size, from_seq_length, ...].
      to_mask: int32 Tensor of shape [batch_size, to_seq_length].

    Returns:
      float Tensor of shape [batch_size, from_seq_length, to_seq_length].
    '''
    from_shape = util.get_shape_list(from_tensor, expected_rank=[2, 3])
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]

    to_shape = util.get_shape_list(to_mask, expected_rank=2)
    to_seq_length = to_shape[1]

    to_mask = tf.cast(
        tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

    # We don't assume that `from_tensor` is a mask (although it could be). We
    # don't actually care if we attend *from* padding tokens (only *to*
    # padding tokens), so we create a tensor of all ones.
    #
    # `broadcast_ones` = [batch_size, from_seq_length, 1]
    broadcast_ones = tf.ones(
        shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

    # Here we broadcast along two dimensions to create the mask.
    mask = broadcast_ones * to_mask
    return mask
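# Usage sketch for create_attention_mask_from_input_mask (hypothetical values):
# one sequence with two real tokens and two padding tokens produces a mask in
# which every query position may attend to the first two key positions only.
from_tensor = tf.ones([1, 4, 768])         # [batch, from_seq_length, ...]
to_mask = tf.constant([[1, 1, 0, 0]])      # [batch, to_seq_length]
attention_mask = create_attention_mask_from_input_mask(from_tensor, to_mask)
# attention_mask: [1, 4, 4], each of the 4 rows equals [1., 1., 0., 0.]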
def mask_attn_weights(w):
    # w has shape [batch, heads, dst_sequence, src_sequence], where
    # information flows from src to dst.
    _, _, nd, ns = shape_list(w)
    b = attention_mask(nd, ns, dtype=w.dtype)
    b = tf.reshape(b, [1, 1, nd, ns])
    w = w * b - tf.cast(1e10, w.dtype) * (1 - b)
    return w
def sample_from_softmax(logits, disallow=None):
    if disallow is not None:
        logits -= 1000.0 * tf.reshape(disallow, [-1, logits.shape[-1]])
    uniform_noise = tf.random_uniform(
        util.get_shape_list(logits), minval=0, maxval=1)
    gumbel_noise = -tf.log(-tf.log(uniform_noise + 1e-9) + 1e-9)
    return tf.one_hot(
        tf.argmax(logits + gumbel_noise, -1, output_type=tf.int32),
        logits.shape[-1])
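# Usage sketch for sample_from_softmax (hypothetical shapes): draw one-hot
# samples from generator logits using Gumbel-max noise; `disallow` could be a
# one-hot tensor banning, e.g., the original token at each position.
generator_logits = tf.zeros([16, 30522])         # [n_positions, vocab_size]
sampled_one_hot = sample_from_softmax(generator_logits)
sampled_ids = tf.argmax(sampled_one_hot, axis=-1)    # [n_positions]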
def _get_pred_loss(self, teacher_logits, student_logits, weights):
    teacher_probs = tf.nn.softmax(teacher_logits, axis=-1)
    teacher_probs = tf.stop_gradient(teacher_probs)
    student_log_probs = tf.nn.log_softmax(student_logits, axis=-1)
    pred_loss = (
        -tf.reduce_sum(teacher_probs * student_log_probs, axis=-1) *
        tf.reshape(weights, [-1, 1]))
    pred_loss = tf.reduce_mean(pred_loss)
    return pred_loss
def _forward(dilated_ids, dilated_mask):

    logits = self._bert_forward(
        bert_config,
        dilated_ids,
        dilated_mask,
        batch_size,
        dilated_seq_length,
        tilda_embeddings=tilda_embeddings)
    output_ids = tf.argmax(logits, axis=-1)
    output_ids = tf.cast(output_ids, dtype=tf.int32)

    equal_zero = tf.cast(tf.equal(output_ids, 0), tf.int32)
    equal_zero = tf.reduce_sum(equal_zero, axis=-1)
    right_pad = spad_id * tf.sequence_mask(
        equal_zero, dilated_seq_length, dtype=tf.int32)
    paded = tf.concat([output_ids, right_pad], axis=-1)

    flattened_padded = tf.reshape(paded, [-1])
    is_valid = tf.greater(flattened_padded, 0)
    flattened_valid = tf.boolean_mask(flattened_padded, is_valid)
    valid = tf.reshape(flattened_valid, [batch_size, dilated_seq_length])
    cutted_valid = valid[:, :max_seq_length]

    nonpad_mask = tf.cast(
        tf.not_equal(cutted_valid, spad_id), dtype=tf.int32)
    output_ids = cutted_valid * nonpad_mask

    reshaped = tf.reshape(output_ids, [batch_size, max_seq_length, 1])
    concatenated = tf.concat([reshaped, tf.zeros_like(reshaped)], axis=-1)
    dilated_ids = tf.reshape(concatenated, [batch_size, max_seq_length * 2])

    input_mask = tf.reduce_sum(nonpad_mask, axis=-1)
    dilated_mask = tf.sequence_mask(
        input_mask, dilated_seq_length, dtype=tf.int32)

    return dilated_ids, dilated_mask
def create_attention_mask_from_input_mask(to_mask, batch_size, max_seq_length,
                                          dtype=tf.float32):
    to_mask = tf.cast(
        tf.reshape(to_mask, [batch_size, 1, max_seq_length]), dtype=dtype)
    broadcast_ones = tf.ones(
        shape=[batch_size, max_seq_length, 1], dtype=dtype)
    mask = broadcast_ones * to_mask
    return mask
def create_attention_mask_from_input_mask(self, input_mask, batch_size,
                                          max_seq_length, dtype=tf.float32):
    to_mask = tf.cast(
        tf.reshape(input_mask, [batch_size, 1, max_seq_length]), dtype=dtype)
    broadcast_ones = tf.ones(
        shape=[batch_size, max_seq_length, 1], dtype=dtype)
    mask = broadcast_ones * to_mask

    broadcast_eye = tf.tile(
        tf.reshape(tf.eye(max_seq_length),
                   [1, max_seq_length, max_seq_length]),
        [batch_size, 1, 1])
    mask += broadcast_eye
    mask = tf.cast(tf.greater(mask, 0), dtype)
    return mask
def _single_seq_fn():
    batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0]
    example_inds = tf.reshape(
        tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1])
    sequence_scores = tf.gather_nd(
        tf.squeeze(inputs, [1]),
        tf.concat([example_inds, tag_indices], axis=1))
    sequence_scores = tf.where(tf.less_equal(sequence_lengths, 0),
                               tf.zeros_like(sequence_scores),
                               sequence_scores)
    return sequence_scores
def reshape_to_matrix(input_tensor):
    '''Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).'''
    ndims = input_tensor.shape.ndims
    if ndims < 2:
        raise ValueError('Input tensor must have at least rank 2. Shape = %s'
                         % (input_tensor.shape))
    if ndims == 2:
        return input_tensor

    width = input_tensor.shape[-1]
    output_tensor = tf.reshape(input_tensor, [-1, width])
    return output_tensor
def reshape_from_matrix(output_tensor, orig_shape_list, original_shape=None):
    '''Reshapes a rank 2 tensor back to its original rank >= 2 tensor.'''
    if len(orig_shape_list) == 2:
        return output_tensor
    if not original_shape:
        original_shape = get_shape_list(output_tensor)

    orig_dims = orig_shape_list[0:-1]
    width = original_shape[-1]
    return tf.reshape(output_tensor, orig_dims + [width])
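# Usage sketch for reshape_to_matrix / reshape_from_matrix (hypothetical
# shapes, assuming get_shape_list from the same util module): collapse a
# [batch, seq_len, hidden] tensor into a matrix for a dense layer, then
# restore the leading dimensions afterwards.
hidden_states = tf.ones([8, 128, 768])
orig_shape = get_shape_list(hidden_states)
matrix = reshape_to_matrix(hidden_states)              # [8 * 128, 768]
restored = reshape_from_matrix(matrix, orig_shape)     # [8, 128, 768]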
def _forward(dilated_ids, dilated_mask):

    logits = self._bert_forward(
        bert_config,
        dilated_ids,
        dilated_mask,
        batch_size,
        dilated_seq_length,
        tilda_embeddings=tilda_embeddings)
    output_ids = tf.argmax(logits, axis=-1)
    output_ids = tf.cast(output_ids, dtype=tf.int32)

    # special padding (using `spad` token)
    equal_zero = tf.cast(tf.equal(output_ids, 0), tf.int32)
    equal_zero = tf.reduce_sum(equal_zero, axis=-1)
    right_pad = spad_id * tf.sequence_mask(
        equal_zero, dilated_seq_length, dtype=tf.int32)
    paded = tf.concat([output_ids, right_pad], axis=-1)

    # extract ids of length `max_seq_length`
    flattened_padded = tf.reshape(paded, [-1])
    is_valid = tf.greater(flattened_padded, 0)
    flattened_valid = tf.boolean_mask(flattened_padded, is_valid)
    valid = tf.reshape(flattened_valid, [batch_size, dilated_seq_length])
    cutted_valid = valid[:, :max_seq_length]

    # replace `spad` token with `pad`
    non_spad_mask = tf.cast(
        tf.not_equal(cutted_valid, spad_id), dtype=tf.int32)
    output_ids = cutted_valid * non_spad_mask
    output_length = tf.reduce_sum(non_spad_mask, axis=-1)

    # dilate
    reshaped_ids = tf.reshape(output_ids, [batch_size, max_seq_length, 1])
    reshaped_mask = tf.reshape(
        tf.sequence_mask(output_length, max_seq_length, dtype=tf.int32),
        [batch_size, max_seq_length, 1])
    concat_ids = tf.concat(
        [reshaped_ids, tf.zeros_like(reshaped_ids)], axis=-1)
    concat_mask = tf.concat(
        [reshaped_mask, tf.zeros_like(reshaped_mask, dtype=tf.int32)],
        axis=-1)
    dilated_ids = tf.reshape(concat_ids, [batch_size, max_seq_length * 2])
    dilated_mask = tf.reshape(concat_mask, [batch_size, max_seq_length * 2])

    return dilated_ids, dilated_mask
def _get_embedding_loss(self, teacher, student, bert_config, weights):
    teacher_embedding = teacher.get_embedding_output()
    teacher_embedding = tf.stop_gradient(teacher_embedding)
    student_embedding = student.get_embedding_output()
    with tf.variable_scope('embedding_loss'):
        linear_trans = tf.layers.dense(
            student_embedding,
            bert_config.hidden_size,
            kernel_initializer=util.create_initializer(
                bert_config.initializer_range))
        embedding_loss = tf.losses.mean_squared_error(
            linear_trans, teacher_embedding,
            weights=tf.reshape(weights, [-1, 1, 1]))
    return embedding_loss
def create_attention_mask_from_input_mask(self, input_mask, batch_size,
                                          max_seq_length, dtype=tf.float32):
    if self._mode == 'bi':
        to_mask = tf.cast(
            tf.reshape(input_mask, [batch_size, 1, max_seq_length]),
            dtype=dtype)
        broadcast_ones = tf.ones(
            shape=[batch_size, max_seq_length, 1], dtype=dtype)
        mask = broadcast_ones * to_mask

    elif self._mode == 'l2r':
        arange = tf.range(max_seq_length) + 1
        to_mask = tf.cast(tf.sequence_mask(arange, max_seq_length), dtype)
        to_mask = tf.reshape(
            to_mask, [1, max_seq_length, max_seq_length])
        mask = tf.tile(to_mask, [batch_size, 1, 1])

    elif self._mode == 'r2l':
        to_mask = tf.cast(
            tf.reshape(input_mask, [batch_size, 1, max_seq_length]),
            dtype=dtype)
        broadcast_ones = tf.ones(
            shape=[batch_size, max_seq_length, 1], dtype=dtype)
        cover_mask = broadcast_ones * to_mask

        arange = tf.range(max_seq_length)
        reverse = tf.cast(tf.sequence_mask(arange, max_seq_length), dtype)
        reverse = tf.reshape(
            reverse, [1, max_seq_length, max_seq_length])
        reverse_mask = tf.tile(reverse, [batch_size, 1, 1])

        mask = (1 - reverse_mask) * cover_mask

    elif self._mode == 's2s':
        mask = tf.cast(
            tf.sequence_mask(input_mask, max_seq_length), dtype)

    return mask