def get_mlm_logits(model, albert_config, mlm_positions):
    """Build vocabulary logits for the masked-LM head.

    Adapted from run_pretraining.py.

    Args:
        model: Object exposing `get_sequence_output()` and
            `get_embedding_table()`.
        albert_config: Config supplying `embedding_size`, `hidden_act`,
            `initializer_range` and `vocab_size`.
        mlm_positions: Positions handed to `gather_indexes` to select
            entries of the sequence output.

    Returns:
        The gathered activations, after the transform layer, multiplied by
        the transposed embedding table with the per-token output bias added.
    """
    gathered = gather_indexes(model.get_sequence_output(), mlm_positions)

    with tf.variable_scope("cls/predictions"):
        # One extra non-linear projection precedes the output layer; its
        # weights are not used after pre-training.
        with tf.variable_scope("transform"):
            activation_fn = modeling.get_activation(albert_config.hidden_act)
            kernel_init = modeling.create_initializer(
                albert_config.initializer_range
            )
            projected = tf.layers.dense(
                gathered,
                units=albert_config.embedding_size,
                activation=activation_fn,
                kernel_initializer=kernel_init,
            )
            normalized = modeling.layer_norm(projected)

        # The output weights are tied to the input embeddings; only the
        # per-token bias is specific to the output layer.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer(),
        )
        embedding_table = model.get_embedding_table()
        scores = tf.matmul(normalized, embedding_table, transpose_b=True)
        return tf.nn.bias_add(scores, output_bias)
def get_masked_lm_output(
    albert_config,
    input_tensor,
    output_weights,
    positions,
    label_ids,
    label_weights,
):
    """Build the masked-LM head and return its loss and log-probabilities.

    Args:
        albert_config: Config supplying `embedding_size`, `hidden_act`,
            `initializer_range` and `vocab_size`.
        input_tensor: Activations handed to `gather_indexes` together with
            `positions`.
        output_weights: Matrix whose transpose is multiplied with the
            transformed activations (shared with the input embeddings).
        positions: Prediction positions passed to `gather_indexes`.
        label_ids: Target token ids; flattened to 1-D before use.
        label_weights: Per-prediction weights; flattened to 1-D before use.

    Returns:
        A tuple `(loss, per_example_loss, log_probs)` where `loss` is the
        `label_weights`-weighted mean of `per_example_loss`.
    """
    gathered = gather_indexes(input_tensor, positions)

    with tf.variable_scope('cls/predictions'):
        # One extra non-linear projection precedes the output layer; its
        # weights are not used after pre-training.
        with tf.variable_scope('transform'):
            activation_fn = modeling.get_activation(albert_config.hidden_act)
            kernel_init = modeling.create_initializer(
                albert_config.initializer_range
            )
            projected = tf.layers.dense(
                gathered,
                units=albert_config.embedding_size,
                activation=activation_fn,
                kernel_initializer=kernel_init,
            )
            normalized = modeling.layer_norm(projected)

        # Output weights are tied to the input embeddings; only the
        # per-token bias belongs exclusively to the output layer.
        output_bias = tf.get_variable(
            'output_bias',
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer(),
        )
        scores = tf.nn.bias_add(
            tf.matmul(normalized, output_weights, transpose_b=True),
            output_bias,
        )
        log_probs = tf.nn.log_softmax(scores, axis=-1)

        flat_ids = tf.reshape(label_ids, [-1])
        flat_weights = tf.reshape(label_weights, [-1])
        targets = tf.one_hot(
            flat_ids, depth=albert_config.vocab_size, dtype=tf.float32
        )

        # `positions` may be zero-padded when a sequence has fewer than the
        # maximum number of predictions. `label_weights` is 1.0 for real
        # predictions and 0.0 for the padding, so padded slots contribute
        # nothing to the weighted sum below.
        per_example_loss = -tf.reduce_sum(log_probs * targets, axis=[-1])
        weighted_total = tf.reduce_sum(flat_weights * per_example_loss)
        # The small constant keeps the division finite when all weights
        # are zero.
        weight_total = tf.reduce_sum(flat_weights) + 1e-5
        loss = weighted_total / weight_total

    return (loss, per_example_loss, log_probs)
def __init__(self, bert_config, tokenizer):
    """Build the ALBERT masked-LM sampling graph and open a session on it.

    Everything is created in a private `tf.Graph`: the feed placeholders,
    the `modeling.AlbertModel` encoder, the `cls/predictions` output head,
    the sampling op, an interactive session with all global variables
    initialized, a saver over the `bert` and `cls` trainable variables,
    and the attention tensors returned by `_extract_attention_weights`.

    Args:
        bert_config: Model config; `embedding_size`, `hidden_act`,
            `initializer_range`, `vocab_size` and `num_hidden_layers`
            are read here.
        tokenizer: Stored unchanged on `self._tokenizer`.
    """
    self._tokenizer = tokenizer

    graph = tf.Graph()
    with graph.as_default():
        # Feed placeholders. Creation order is kept stable so the
        # auto-generated op names do not change.
        self.X = tf.placeholder(tf.int32, [None, None])
        self.top_p = tf.placeholder(tf.float32, None)
        self.top_k = tf.placeholder(tf.int32, None)
        self.k = tf.placeholder(tf.int32, None)
        self.temperature = tf.placeholder(tf.float32, None)
        self.indices = tf.placeholder(tf.int32, [None, None])
        self.MASK = tf.placeholder(tf.int32, [None, None])

        self.model = modeling.AlbertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            input_mask=self.MASK,
            use_one_hot_embeddings=False,
        )
        # NOTE: `self.logits` holds the pooled output; the vocabulary
        # logits live on `self._logits`.
        self.logits = self.model.get_pooled_output()
        sequence_output = self.model.get_sequence_output()
        embedding_table = self.model.get_embedding_table()

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                activation_fn = modeling.get_activation(
                    bert_config.hidden_act
                )
                kernel_init = modeling.create_initializer(
                    bert_config.initializer_range
                )
                projected = tf.layers.dense(
                    sequence_output,
                    units=bert_config.embedding_size,
                    activation=activation_fn,
                    kernel_initializer=kernel_init,
                )
                normalized = modeling.layer_norm(projected)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            vocab_scores = tf.matmul(
                normalized, embedding_table, transpose_b=True
            )
            self._logits = tf.nn.bias_add(vocab_scores, output_bias)
            self._log_softmax = tf.nn.log_softmax(self._logits, axis=-1)

        # Pick the logits addressed by `indices`, then scale them by the
        # temperature before filtering.
        picked = tf.gather_nd(self._logits, self.indices)
        scaled = picked / self.temperature

        # A positive `top_p` selects top-p filtering; otherwise top-k
        # filtering is applied.
        filtered = tf.cond(
            self.top_p > 0,
            lambda: top_p_logits(scaled, self.top_p),
            lambda: top_k_logits(scaled, self.top_k),
        )
        self.samples = tf.multinomial(
            filtered, num_samples=self.k, output_dtype=tf.int32
        )

        self._sess = tf.InteractiveSession()
        self._sess.run(tf.global_variables_initializer())

        # The saver covers only trainable variables under the `bert`
        # and `cls` scopes.
        bert_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='bert'
        )
        cls_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='cls'
        )
        self._saver = tf.train.Saver(var_list=bert_vars + cls_vars)

        # `graph` is the default graph here, so this is the same object
        # `tf.get_default_graph()` would return.
        self.attns = _extract_attention_weights(
            bert_config.num_hidden_layers, graph
        )