def get_mlm_logits(model, albert_config, mlm_positions):
    """From run_pretraining.py."""
    input_tensor = gather_indexes(model.get_sequence_output(), mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range),
            )
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer(),
        )
        logits = tf.matmul(
            input_tensor, model.get_embedding_table(), transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
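# --- Usage sketch (not from the original file) ---
# A minimal, hedged example of wiring `get_mlm_logits` at export/inference time.
# It assumes `modeling` is the ALBERT modeling module and `gather_indexes` is in
# scope (both are used above), and that `albert_config` has already been loaded,
# e.g. via modeling.AlbertConfig.from_json_file(...). Placeholder shapes are
# illustrative only.
def _example_build_mlm_logits(albert_config):
    input_ids = tf.placeholder(tf.int32, [None, None], name="input_ids")
    input_mask = tf.placeholder(tf.int32, [None, None], name="input_mask")
    segment_ids = tf.placeholder(tf.int32, [None, None], name="segment_ids")
    mlm_positions = tf.placeholder(tf.int32, [None, None], name="mlm_positions")

    model = modeling.AlbertModel(
        config=albert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=False)

    # Shape: [batch_size * num_predictions, vocab_size].
    return get_mlm_logits(model, albert_config, mlm_positions)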
def get_sentence_order_output(albert_config, input_tensor, labels):
    """Get loss and log probs for the sentence order prediction."""
    # Simple binary classification. Note that 0 is "next sentence" and 1 is
    # "random sentence". This weight matrix is not used after pre-training.
    with tf.variable_scope('cls/seq_relationship'):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[2, albert_config.hidden_size],
            initializer=modeling.create_initializer(
                albert_config.initializer_range),
        )
        output_bias = tf.get_variable(
            'output_bias', shape=[2], initializer=tf.zeros_initializer())

        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, log_probs)
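# --- Usage sketch (not from the original file) ---
# Hedged example of attaching the sentence-order (SOP) loss during pre-training.
# It assumes an AlbertModel instance `model` already exists (as in the sketch
# above) and that SOP labels arrive as a [batch_size] int tensor; the
# placeholder name is illustrative.
def _example_sop_loss(albert_config, model):
    sentence_order_labels = tf.placeholder(
        tf.int32, [None], name="sentence_order_labels")
    # The SOP head consumes the pooled [CLS] vector, [batch_size, hidden_size].
    (sop_loss, sop_per_example_loss, sop_log_probs) = get_sentence_order_output(
        albert_config, model.get_pooled_output(), sentence_order_labels)
    return sop_loss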
def get_masked_lm_output(
    albert_config,
    input_tensor,
    output_weights,
    positions,
    label_ids,
    label_weights,
):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)
    with tf.variable_scope('cls/predictions'):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range),
            )
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            'output_bias',
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer(),
        )
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=albert_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

        return (loss, per_example_loss, log_probs)
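# --- Usage sketch (not from the original file) ---
# Hedged example of wiring the masked-LM loss. In run_pretraining.py the
# position/id/weight tensors come from the TFRecord input pipeline; the
# placeholders below merely stand in for them. Note that the tied output
# weights are the embedding table, shape [vocab_size, embedding_size].
def _example_masked_lm_loss(albert_config, model):
    masked_lm_positions = tf.placeholder(tf.int32, [None, None])
    masked_lm_ids = tf.placeholder(tf.int32, [None, None])
    masked_lm_weights = tf.placeholder(tf.float32, [None, None])

    (mlm_loss, mlm_per_example_loss, mlm_log_probs) = get_masked_lm_output(
        albert_config,
        model.get_sequence_output(),   # [batch, seq_len, hidden_size]
        model.get_embedding_table(),   # tied input/output embeddings
        masked_lm_positions,
        masked_lm_ids,
        masked_lm_weights)
    return mlm_loss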
def get_sentence_order_logits(input_tensor, albert_config):
    """Get logits for the sentence order prediction."""
    # Simple binary classification. Note that 0 is "next sentence" and 1 is
    # "random sentence". This weight matrix is not used after pre-training.
    with tf.variable_scope("cls/seq_relationship"):
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, albert_config.hidden_size],
            initializer=modeling.create_initializer(
                albert_config.initializer_range))
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        return logits
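# --- Usage sketch (not from the original file) ---
# Hedged example of building both export-time heads together, as an export
# script might: MLM logits over the masked positions plus SOP logits from the
# pooled output. `model`, `albert_config`, and `mlm_positions` are assumed to
# exist as in the sketches above.
def _example_export_heads(model, albert_config, mlm_positions):
    mlm_logits = get_mlm_logits(model, albert_config, mlm_positions)
    sop_logits = get_sentence_order_logits(
        model.get_pooled_output(), albert_config)
    return mlm_logits, sop_logits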
def create_model(albert_config, is_training, input_ids, input_mask, segment_ids,
                 input_cdc_ids, age, sex_ids, labels, num_labels,
                 use_one_hot_embeddings):
    """Creates a classification model."""
    if not FLAGS.cdc_only:
        model = modeling.AlbertModel(
            config=albert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        # In the demo, we are doing a simple classification task on the entire
        # segment.
        #
        # If you want to use the token-level output, use
        # model.get_sequence_output() instead.
        if FLAGS.use_pooled_output:
            tf.logging.info("using pooled output")
            output_albert_layer = model.get_pooled_output()
        else:
            tf.logging.info("using meaned output")
            output_albert_layer = tf.reduce_mean(
                model.get_sequence_output(), axis=1)

    with tf.variable_scope('cdc'):
        with tf.variable_scope("embedding"):
            embedding_table = tf.get_variable(
                name="embedding_table",
                shape=[FLAGS.cdc_vocab_size, FLAGS.cdc_embedding_size],
                initializer=modeling.create_initializer())
            embedded = tf.nn.embedding_lookup(embedding_table, input_cdc_ids)

        mask = tf.not_equal(input_cdc_ids, 0)
        embed_average = tf.keras.layers.GlobalAveragePooling1D()(embedded, mask)
        embed_max = tf.keras.layers.GlobalMaxPooling1D()(embedded)
        concat_max_average = tf.concat([embed_average, embed_max], axis=-1)

        # concat_sex_age = tf.concat([average, age, sex_ids], axis=-1)
        #
        # with tf.variable_scope("dense_1"):
        #     input_size = concat_sex_age.shape[-1].value
        #     output_size = 2 * FLAGS.cdc_embedding_size
        #
        #     W = tf.get_variable(name="kernel",
        #                         shape=[input_size, output_size],
        #                         initializer=modeling.create_initializer())
        #     b = tf.get_variable(name="bias",
        #                         shape=[output_size],
        #                         initializer=tf.zeros_initializer)
        #     dense_1 = tf.matmul(concat_sex_age, W)
        #     dense_1 = tf.nn.bias_add(dense_1, b)
        #     dense_1 = tf.nn.relu(dense_1)
        #
        # with tf.variable_scope("dense_2"):
        #     input_size = dense_1.shape[-1].value
        #     output_size = FLAGS.cdc_embedding_size
        #     W = tf.get_variable(name="kernel",
        #                         shape=[input_size, output_size],
        #                         initializer=modeling.create_initializer())
        #     b = tf.get_variable(name="bias",
        #                         shape=[output_size],
        #                         initializer=tf.zeros_initializer)
        #     dense_2 = tf.matmul(dense_1, W)
        #     dense_2 = tf.nn.bias_add(dense_2, b)
        #     dense_2 = tf.nn.relu(dense_2)

        output_cdc_layer = tf.concat([age, sex_ids, concat_max_average], axis=-1)

    # Concatenate the output_layer with other features
    if FLAGS.cdc_only:
        output_layer = output_cdc_layer
    else:
        output_layer = tf.concat([output_albert_layer, output_cdc_layer], axis=-1)

    hidden_size = output_layer.shape[-1].value

    with tf.variable_scope("output"):
        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable(
            "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, probabilities, predictions)
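# --- Usage sketch (not from the original file) ---
# Hedged example of calling create_model for fine-tuning. The placeholder
# shapes and the FLAGS.max_seq_length flag are assumptions (they are not
# defined in the code above); `age` and `sex_ids` are treated as already
# numeric feature tensors.
def _example_create_model(albert_config, num_labels=2):
    input_ids = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length])
    input_mask = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length])
    segment_ids = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length])
    input_cdc_ids = tf.placeholder(tf.int32, [None, None])
    age = tf.placeholder(tf.float32, [None, 1])
    sex_ids = tf.placeholder(tf.float32, [None, 2])
    labels = tf.placeholder(tf.int32, [None])

    (loss, per_example_loss, probabilities, predictions) = create_model(
        albert_config,
        is_training=True,
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        input_cdc_ids=input_cdc_ids,
        age=age,
        sex_ids=sex_ids,
        labels=labels,
        num_labels=num_labels,
        use_one_hot_embeddings=False)
    return loss, probabilities, predictions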
def __init__(self, bert_config, tokenizer):
    _graph = tf.Graph()
    with _graph.as_default():
        self.X = tf.placeholder(tf.int32, [None, None])
        self.top_p = tf.placeholder(tf.float32, None)
        self.top_k = tf.placeholder(tf.int32, None)
        self.k = tf.placeholder(tf.int32, None)
        self.temperature = tf.placeholder(tf.float32, None)
        self.indices = tf.placeholder(tf.int32, [None, None])
        self.MASK = tf.placeholder(tf.int32, [None, None])
        self._tokenizer = tokenizer

        self.model = modeling.AlbertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            input_mask=self.MASK,
            use_one_hot_embeddings=False,
        )
        self.logits = self.model.get_pooled_output()
        input_tensor = self.model.get_sequence_output()
        output_weights = self.model.get_embedding_table()

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    input_tensor,
                    units=bert_config.embedding_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range),
                )
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
            self._logits = tf.nn.bias_add(logits, output_bias)
            self._log_softmax = tf.nn.log_softmax(self._logits, axis=-1)

        # Gather the vocabulary logits at the masked positions and rescale.
        logits = tf.gather_nd(self._logits, self.indices)
        logits = logits / self.temperature

        def nucleus():
            return top_p_logits(logits, self.top_p)

        def select_k():
            return top_k_logits(logits, self.top_k)

        # Nucleus (top-p) sampling when top_p > 0, otherwise top-k sampling.
        logits = tf.cond(self.top_p > 0, nucleus, select_k)
        self.samples = tf.multinomial(
            logits, num_samples=self.k, output_dtype=tf.int32)

        self._sess = tf.InteractiveSession()
        self._sess.run(tf.global_variables_initializer())
        var_lists = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='bert')
        cls = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='cls')
        self._saver = tf.train.Saver(var_list=var_lists + cls)
        attns = _extract_attention_weights(
            bert_config.num_hidden_layers, tf.get_default_graph())
        self.attns = attns
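# --- Usage sketch (not from the original file) ---
# The enclosing class of the __init__ above is not shown in this excerpt, so
# `lm` below is a hypothetical instance of it. The sketch samples candidate
# tokens for the masked positions: nucleus (top-p) sampling when top_p > 0,
# top-k otherwise. Feed values and shapes are illustrative assumptions.
def _example_sample(lm, input_ids, attention_mask, mask_indices):
    return lm._sess.run(
        lm.samples,
        feed_dict={
            lm.X: input_ids,          # [batch, seq_len] token ids with [MASK]
            lm.MASK: attention_mask,  # [batch, seq_len] 1 for real tokens
            lm.indices: mask_indices, # [num_masked, 2] (batch, position) pairs
            lm.temperature: 1.0,
            lm.top_p: 0.9,
            lm.top_k: 0,              # unused when top_p > 0
            lm.k: 5,                  # candidates sampled per masked position
        },
    )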