def embedding_lookup(self,
                     input_ids,
                     vocab_size,
                     batch_size,
                     max_seq_length,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name='word_embeddings',
                     dtype=tf.float32,
                     trainable=True,
                     tilda_embeddings=None):
    '''Looks up one embedding vector per id in `input_ids`.

    Args:
      input_ids: Tensor of ids. It is flattened before the lookup, so it
        must contain `batch_size * max_seq_length` ids in total.
      vocab_size: Number of rows of the embedding table created here.
      batch_size: First dimension of the returned tensor.
      max_seq_length: Second dimension of the returned tensor.
      embedding_size: Width of each embedding vector.
      initializer_range: Passed to `util.create_initializer` when the
        table is created here.
      word_embedding_name: Variable name of the table created here.
      dtype: Dtype of the table created here.
      trainable: Whether the table created here is trainable.
      tilda_embeddings: Optional ready-made table. When given, it is used
        as-is and no variable is created.

    Returns:
      A tuple `(output, embedding_table)` where `output` is reshaped to
      [batch_size, max_seq_length, embedding_size].
    '''
    # Rank-2 ids get a trailing unit axis; the flatten below makes this
    # shape-neutral, it is kept so the graph matches the original.
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    if tilda_embeddings is None:
        embedding_table = tf.get_variable(
            name=word_embedding_name,
            shape=[vocab_size, embedding_size],
            initializer=util.create_initializer(initializer_range),
            dtype=dtype,
            trainable=trainable)
    else:
        embedding_table = tilda_embeddings

    ids_1d = tf.reshape(input_ids, [-1])
    gathered = tf.gather(embedding_table, ids_1d, name='embedding_look_up')
    output = tf.reshape(
        gathered, [batch_size, max_seq_length, embedding_size])
    return (output, embedding_table)
def gather_positions(sequence, positions):
    '''Picks, for every batch row, the entries at the given positions.

    Args:
      sequence: A [batch_size, seq_length] or
        [batch_size, seq_length, depth] tensor of values
      positions: A [batch_size, n_positions] tensor of indices

    Returns:
      A [batch_size, n_positions] or [batch_size, n_positions, depth]
      tensor of the values at the indices
    '''
    dims = util.get_shape_list(sequence, expected_rank=[2, 3])
    has_depth = len(dims) == 3

    if not has_depth:
        # Treat a rank-2 input as depth-1 so both cases share one code path.
        batch, length = dims
        depth = 1
        sequence = tf.expand_dims(sequence, -1)
    else:
        batch, length, depth = dims

    # Row `b` of the flattened sequence starts at index `b * length`.
    row_offsets = tf.expand_dims(length * tf.range(batch), -1)
    flat_index = tf.reshape(positions + row_offsets, [-1])
    rows = tf.reshape(sequence, [batch * length, depth])
    picked = tf.gather(rows, flat_index)

    out_shape = [batch, -1, depth] if has_depth else [batch, -1]
    return tf.reshape(picked, out_shape)
def crf_binary_score(tag_indices, sequence_lengths, transition_params):
    '''Sums the transition (binary) potentials along each tag sequence.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: A [num_tags, num_tags] matrix of binary
        potentials.

    Returns:
      binary_scores: A [batch_size] vector of binary scores.
    '''
    tag_count = transition_params.get_shape()[0]
    step_count = tf.shape(tag_indices)[1] - 1

    # Each transition pairs the tag at step t with the tag at step t + 1.
    from_tags = tf.slice(tag_indices, [0, 0], [-1, step_count])
    to_tags = tf.slice(tag_indices, [0, 1], [-1, step_count])

    # Address transition_params[from, to] in its row-major flattening.
    pair_index = from_tags * tag_count + to_tags
    flat_params = tf.reshape(transition_params, [-1])
    pair_scores = tf.gather(flat_params, pair_index)

    # Drop the first mask column so the mask lines up with transitions.
    step_mask = tf.sequence_mask(sequence_lengths,
                                 maxlen=tf.shape(tag_indices)[1],
                                 dtype=tf.float32)
    pair_mask = tf.slice(step_mask, [0, 1], [-1, -1])

    return tf.reduce_sum(pair_scores * pair_mask, 1)
def crf_unary_score(tag_indices, sequence_lengths, inputs):
    '''Sums the per-step (unary) potentials of each tag sequence.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary
        potentials.

    Returns:
      unary_scores: A [batch_size] vector of unary scores.
    '''
    n_batch = tf.shape(inputs)[0]
    n_steps = tf.shape(inputs)[1]
    n_tags = tf.shape(inputs)[2]

    flat_inputs = tf.reshape(inputs, [-1])

    # Start of every (batch, step) slot inside the flattened inputs.
    batch_base = tf.expand_dims(tf.range(n_batch) * n_steps * n_tags, 1)
    step_base = tf.expand_dims(tf.range(n_steps) * n_tags, 0)
    offsets = batch_base + step_base

    # Offsets must share the dtype of tag_indices before they are added.
    if tag_indices.dtype == tf.int64:
        offsets = tf.cast(offsets, tf.int64)

    flat_index = tf.reshape(offsets + tag_indices, [-1])
    step_scores = tf.reshape(
        tf.gather(flat_inputs, flat_index), [n_batch, n_steps])

    step_mask = tf.sequence_mask(sequence_lengths,
                                 maxlen=tf.shape(tag_indices)[1],
                                 dtype=tf.float32)
    return tf.reduce_sum(step_scores * step_mask, 1)
def embedding_postprocessor(self,
                            input_tensor,
                            position_ids,
                            batch_size,
                            max_seq_length,
                            hidden_size,
                            use_token_type=False,
                            segment_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name='token_type_embeddings',
                            use_position_embeddings=True,
                            position_embedding_name='position_embeddings',
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            dtype=tf.float32,
                            trainable=True):
    '''Adds token-type and position embeddings, then normalizes.

    Args:
      input_tensor: Tensor the embeddings are added to.
      position_ids: Indices gathered from the position embedding table.
      batch_size: First dimension used to reshape token-type embeddings.
      max_seq_length: Second dimension used for that reshape.
      hidden_size: Embedding width of both tables.
      use_token_type: Whether to add token-type embeddings.
      segment_ids: Token-type ids; required when `use_token_type` is set.
      token_type_vocab_size: Number of rows of the token-type table.
      token_type_embedding_name: Variable name of the token-type table.
      use_position_embeddings: Whether to add position embeddings.
      position_embedding_name: Variable name of the position table.
      initializer_range: Passed to `util.create_initializer`.
      max_position_embeddings: Number of rows of the position table.
      dropout_prob: Forwarded to `util.layer_norm_and_dropout`.
      dtype: Dtype of both tables.
      trainable: Whether the created variables are trainable.

    Returns:
      The result of `util.layer_norm_and_dropout` on the summed tensor.

    Raises:
      ValueError: If `use_token_type` is set but `segment_ids` is None.
    '''
    if use_token_type and segment_ids is None:
        raise ValueError(
            'segment_ids must be specified if use_token_type is True.')

    output = input_tensor

    if use_token_type:
        type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            dtype=dtype,
            trainable=trainable)

        # The token-type vocabulary is small, so the lookup is done as a
        # one-hot matmul rather than a gather.
        type_ids_1d = tf.reshape(segment_ids, [-1])
        type_one_hot = tf.one_hot(
            type_ids_1d, depth=token_type_vocab_size, dtype=dtype)
        type_embeddings = tf.reshape(
            tf.matmul(type_one_hot, type_table),
            [batch_size, max_seq_length, hidden_size])
        output = output + type_embeddings

    if use_position_embeddings:
        position_table = tf.get_variable(
            name=position_embedding_name,
            shape=[max_position_embeddings, hidden_size],
            initializer=util.create_initializer(initializer_range),
            dtype=dtype,
            trainable=trainable)
        output = output + tf.gather(position_table, position_ids)

    return util.layer_norm_and_dropout(
        output, dropout_prob, trainable=trainable)
def gather_indexes(sequence_tensor, positions):
    '''Gathers the vectors at `positions` from a rank-3 tensor.

    Args:
      sequence_tensor: A rank-3 tensor; its three dimensions are read as
        (batch_size, seq_length, width).
      positions: Per-row indices into the `seq_length` axis.

    Returns:
      A rank-2 tensor with one `width`-sized row per entry of `positions`
      (the batch axis is not restored).
    '''
    dims = util.get_shape_list(sequence_tensor, 3)
    n_batch = dims[0]
    n_steps = dims[1]
    n_width = dims[2]

    # Row `b` of the flattened tensor starts at index `b * n_steps`.
    row_starts = tf.reshape(
        tf.range(0, n_batch, dtype=tf.int32) * n_steps, [-1, 1])
    flat_index = tf.reshape(positions + row_starts, [-1])
    flat_rows = tf.reshape(sequence_tensor, [n_batch * n_steps, n_width])

    return tf.gather(flat_rows, flat_index)
def _forward(input_ids, past=None):
    '''Runs the stacked transformer blocks and projects to vocabulary logits.

    Relies on `hparams` and `tilda_embeddings` from the enclosing scope.

    Args:
      input_ids: Rank-2 tensor of token ids, unpacked as (batch, sequence).
      past: Optional cached state from earlier steps. It is unstacked
        along axis 1 into one entry per layer; None means no cache.

    Returns:
      A tuple `(logits, present)`. `logits` is reshaped to
      [batch, sequence, hparams.n_vocab]; `present` stacks the per-layer
      states returned by `block` along axis 1.
    '''
    batch, sequence = shape_list(input_ids)

    if tilda_embeddings is not None:
        wte = tilda_embeddings
    else:
        wte = tf.get_variable(
            'word_embeddings', [hparams.n_vocab, hparams.n_embed],
            initializer=tf.random_normal_initializer(stddev=0.02))
    wpe = tf.get_variable(
        'wpe', [hparams.n_ctx, hparams.n_embed],
        initializer=tf.random_normal_initializer(stddev=0.01))

    past_length = tf.shape(past)[-2] if past is not None else 0
    token_embeddings = tf.gather(wte, input_ids)
    position_embeddings = tf.gather(
        wpe, positions_for(input_ids, past_length))
    h = token_embeddings + position_embeddings

    # One cached state per layer, or None placeholders without a cache.
    if past is not None:
        layer_pasts = tf.unstack(past, axis=1)
    else:
        layer_pasts = [None] * hparams.n_layer
    assert len(layer_pasts) == hparams.n_layer

    presents = []
    for layer_idx, layer_past in enumerate(layer_pasts):
        h, layer_present = block(
            h, 'h%d' % layer_idx, past=layer_past, hparams=hparams)
        presents.append(layer_present)
    present = tf.stack(presents, axis=1)
    h = norm(h, 'ln_f')

    # The output projection reuses the word embedding table (transposed).
    h_flat = tf.reshape(h, [batch * sequence, hparams.n_embed])
    flat_logits = tf.matmul(h_flat, wte, transpose_b=True)
    logits = tf.reshape(flat_logits, [batch, sequence, hparams.n_vocab])
    return logits, present
def __init__(self,
             is_training,
             input_tensor,
             n_wide_features,
             wide_features,
             label_ids,
             label_size=2,
             sample_weight=None,
             scope='cls/seq_relationship',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    '''Builds a classification head that attends over wide features.

    Embedded wide features are pooled by attention scored against
    `input_tensor`, added back to it, layer-normalized and classified.

    Args:
      is_training: Enables dropout before the output projection.
      input_tensor: Deep representation; its last dimension is used as
        the hidden size. The inline shape comments assume [B, H].
      n_wide_features: Per-example count of valid wide features, used to
        build the attention mask.
      wide_features: Wide feature ids; the last static dimension is used
        both as mask length and (plus one) as embedding table size.
      label_ids: Gold labels, one-hot encoded with depth `label_size`.
      label_size: Number of classes.
      sample_weight: Optional per-example loss weights.
      scope: Variable scope of the output layer.
      hidden_dropout_prob: Dropout probability used when training.
      initializer_range: Passed to `util.create_initializer`.
      trainable: Whether the created variables are trainable.
      **kwargs: Forwarded to the parent constructor. `tsa_thresh`, if
        present, must be a float and masks the loss by uncertainty.
    '''
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    feature_size = wide_features.shape.as_list()[-1]

    with tf.variable_scope('wide'):
        feature_embeddings = tf.get_variable(
            name='feature_embeddings',
            shape=[feature_size + 1, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        wide_output = tf.gather(
            feature_embeddings, wide_features)  # [B, N, H]

    # NOTE(review): SOURCE lost its indentation; the layer norm is assumed
    # to sit inside the 'wide_and_deep' scope -- confirm against checkpoints.
    with tf.variable_scope('wide_and_deep'):
        deep_column = tf.expand_dims(input_tensor, -1)  # [B, H, 1]
        scores = tf.matmul(wide_output, deep_column)  # [B, N, 1]
        scores = tf.transpose(scores, [0, 2, 1])  # [B, 1, N]
        scores = tf.multiply(scores, 1.0 / math.sqrt(hidden_size))

        valid = tf.cast(
            tf.sequence_mask(n_wide_features, feature_size),
            tf.float32)  # [B, N]
        valid = tf.expand_dims(valid, 1)  # [B, 1, N]
        # Push masked-out features to a large negative score before softmax.
        scores = scores + (1.0 - valid) * -10000.0

        weights = tf.nn.softmax(scores, axis=-1)
        pooled = tf.matmul(weights, wide_output)  # [B, 1, H]
        pooled = pooled[:, 0, :]  # [B, H]
        input_tensor = util.layer_norm(
            pooled + input_tensor, trainable=trainable)

    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        dropout_rate = hidden_dropout_prob if is_training else 0.0
        output_layer = util.dropout(input_tensor, dropout_rate)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        self.preds['preds'] = tf.argmax(logits, axis=-1)
        self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

        # Cross-entropy against the one-hot gold labels.
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(
            label_ids, depth=label_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(
            one_hot_labels * log_probs, axis=-1)

        if sample_weight is not None:
            per_example_loss = tf.cast(
                sample_weight, dtype=tf.float32) * per_example_loss

        thresh = kwargs.get('tsa_thresh')
        if thresh is not None:
            assert isinstance(
                thresh, float), ('`tsa_thresh` must be a float between 0 and 1.')
            # Entropy of the prediction divided by log(1 / label_size);
            # only examples above the threshold keep their loss.
            probs = self.probs['probs']
            uncertainty = tf.reduce_sum(probs * tf.log(probs), axis=-1)
            uncertainty = uncertainty / tf.log(1 / label_size)
            keep = tf.cast(tf.greater(uncertainty, thresh), dtype=tf.float32)
            per_example_loss = keep * per_example_loss

        self.losses['losses'] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def __init__(self,
             vocab_size,
             filter_sizes,
             num_channels,
             is_training,
             input_ids,
             scope='text_cnn',
             embedding_size=256,
             dropout_prob=0.1,
             trainable=True,
             **kwargs):
    '''Builds a text CNN encoder: embed, convolve, max-pool, concatenate.

    Args:
      vocab_size: Number of rows of the word embedding table.
      filter_sizes: Convolution heights, either a list of integers (or
        integer-like strings) or one comma-separated string.
      num_channels: Output channels of every convolution.
      is_training: Whether the graph is built for training; dropout is
        only applied when this is true.
      input_ids: Rank-2 tensor of token ids, [batch_size, max_seq_length].
      scope: Outer variable scope of the encoder.
      embedding_size: Width of each word embedding.
      dropout_prob: Dropout probability applied to the pooled features
        during training.
      trainable: Whether the created variables are trainable.
      **kwargs: `use_tilda_embedding`, if truthy, makes the encoder reuse
        the existing root-scope variable 'tilda_embeddings' as its
        embedding table.

    Sets:
      self.pooled_output: [batch_size, num_channels * len(filter_sizes)]
        tensor of pooled features.
    '''
    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    max_seq_length = input_shape[1]

    if isinstance(filter_sizes, str):
        filter_sizes = filter_sizes.split(',')
    assert isinstance(filter_sizes, list), (
        '`filter_sizes` should be a list of integers or a string '
        'seperated with commas.')
    # Convert once so shapes, scope names and pooling windows all see real
    # integers (string input previously leaked `str` sizes into shapes).
    filter_sizes = [int(filter_size) for filter_size in filter_sizes]

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    use_tilda_embedding = kwargs.get('use_tilda_embedding')
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    # NOTE(review): SOURCE lost its indentation; the conv scopes are assumed
    # to be children of `scope`, siblings of 'embeddings' -- confirm against
    # existing checkpoints.
    with tf.variable_scope(scope):
        with tf.variable_scope('embeddings'):
            if tilda_embeddings is not None:
                embedding_table = tilda_embeddings
            else:
                embedding_table = tf.get_variable(
                    name='word_embeddings',
                    shape=[vocab_size, embedding_size],
                    initializer=util.create_initializer(0.02),
                    dtype=tf.float32,
                    trainable=trainable)

            flat_input_ids = tf.reshape(input_ids, [-1])
            output = tf.gather(
                embedding_table, flat_input_ids, name='embedding_look_up')
            output = tf.reshape(
                output, [batch_size, max_seq_length, embedding_size])
            # Trailing unit axis acts as the single input channel of conv2d.
            output_expanded = tf.expand_dims(output, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for filter_size in filter_sizes:
            with tf.variable_scope('conv_%s' % filter_size):

                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_channels]
                # `stddev` must be passed by keyword: the first positional
                # argument of the initializer is `mean`, so the former
                # `truncated_normal_initializer(0.1)` drew weights from
                # mean=0.1, stddev=1.0.
                W = tf.get_variable(
                    name='W',
                    shape=filter_shape,
                    initializer=tf.truncated_normal_initializer(stddev=0.1),
                    dtype=tf.float32,
                    trainable=trainable)
                b = tf.get_variable(
                    name='b',
                    shape=[num_channels],
                    initializer=tf.constant_initializer(0.1),
                    dtype=tf.float32,
                    trainable=trainable)
                conv = tf.nn.conv2d(
                    output_expanded, W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='conv')

                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')

                # Maxpooling over the outputs: the window spans every valid
                # convolution position, leaving one value per channel.
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, max_seq_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='pool')
                pooled_outputs.append(pooled)

        num_channels_total = num_channels * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [batch_size, num_channels_total])

        with tf.name_scope('dropout'):
            # Dropout is a training-only regulariser; `is_training` was
            # previously ignored, so it was also applied at inference.
            self.pooled_output = util.dropout(
                h_pool_flat, dropout_prob if is_training else 0.0)
def __init__(self,
             bert_config,
             is_training,
             input_tensor,
             input_mask,
             sem_features,
             label_ids,
             max_seq_length,
             feature_size,
             label_size=2,
             sample_weight=None,
             scope='cls/seq_relationship',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    '''Builds a classification head fused with semantic feature attention.

    Semantic feature ids are embedded, passed through one self-attention
    layer, and the first position of the result is added to
    `input_tensor` before layer norm and classification.

    Args:
      bert_config: Config object; only `num_attention_heads` is read.
      is_training: Enables dropout in attention and before the output
        projection.
      input_tensor: Deep representation; its first and last dimensions
        are used as batch size and hidden size.
      input_mask: Mask used to build the attention mask.
      sem_features: Semantic feature ids gathered from the embedding
        table.
      label_ids: Gold labels, one-hot encoded with depth `label_size`.
      max_seq_length: Sequence length passed to the attention layer.
      feature_size: Number of distinct features; three extra rows are
        reserved for [PAD], [CLS] and [SEP].
      label_size: Number of classes.
      sample_weight: Optional per-example loss weights.
      scope: Variable scope of the output layer.
      hidden_dropout_prob: Dropout probability used when training.
      initializer_range: Passed to the variable initializers.
      trainable: Whether the created variables are trainable.
      **kwargs: Forwarded to the parent constructor. `tsa_thresh`, if
        present, must be a float and masks the loss by uncertainty.
    '''
    super().__init__(**kwargs)

    input_shape = util.get_shape_list(input_tensor)
    batch_size = input_shape[0]
    hidden_size = input_shape[-1]

    # Dropout is switched off entirely outside of training.
    dropout_rate = hidden_dropout_prob if is_training else 0.0

    # NOTE(review): SOURCE lost its indentation; 'self' and the layer norm
    # are assumed to be nested inside the 'sem' scope -- confirm against
    # existing checkpoints.
    with tf.variable_scope('sem'):
        feature_embeddings = tf.get_variable(
            name='feature_embeddings',
            shape=[feature_size + 3, hidden_size],  # for [PAD], [CLS], [SEP]
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        sem_output = tf.gather(
            feature_embeddings, sem_features)  # [B, N, H]

        with tf.variable_scope('self'):
            attention_mask = BERTEncoder.create_attention_mask_from_input_mask(
                input_mask, batch_size, max_seq_length)
            (attention_head, _) = BERTEncoder.attention_layer(
                from_tensor=sem_output,
                to_tensor=sem_output,
                attention_mask=attention_mask,
                num_attention_heads=bert_config.num_attention_heads,
                size_per_head=(
                    hidden_size // bert_config.num_attention_heads),
                attention_probs_dropout_prob=dropout_rate,
                initializer_range=initializer_range,
                do_return_2d_tensor=False,
                batch_size=batch_size,
                from_max_seq_length=max_seq_length,
                to_max_seq_length=max_seq_length,
                trainable=trainable)

        # Only one attention result is ever produced, so it is used
        # directly; its first position is the pooled vector.
        attention_output = attention_head[:, 0, :]  # [B, H]
        input_tensor = util.layer_norm(
            attention_output + input_tensor, trainable=trainable)

    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(input_tensor, dropout_rate)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        self.preds['preds'] = tf.argmax(logits, axis=-1)
        self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

        # Cross-entropy against the one-hot gold labels.
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(
            label_ids, depth=label_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(
            one_hot_labels * log_probs, axis=-1)

        if sample_weight is not None:
            per_example_loss = tf.cast(
                sample_weight, dtype=tf.float32) * per_example_loss

        thresh = kwargs.get('tsa_thresh')
        if thresh is not None:
            assert isinstance(
                thresh, float), ('`tsa_thresh` must be a float between 0 and 1.')
            # Entropy of the prediction divided by log(1 / label_size);
            # only examples above the threshold keep their loss.
            probs = self.probs['probs']
            uncertainty = tf.reduce_sum(probs * tf.log(probs), axis=-1)
            uncertainty = uncertainty / tf.log(1 / label_size)
            keep = tf.cast(tf.greater(uncertainty, thresh), dtype=tf.float32)
            per_example_loss = keep * per_example_loss

        self.losses['losses'] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)