def classify_domain(self, inputs):
    """Classify the input into one of `num_domains` domains."""
    output_prev = tf.layers.dense(
        inputs,
        self.hidden_size,
        activation=tf.nn.relu,
        name='domain_layer_prev',
        kernel_initializer=_mh.create_initializer(
            initializer_range=self.initializer_range))
    output = tf.layers.dense(
        output_prev,
        self.num_domains,
        activation=None,
        name='domain_layer_final',
        kernel_initializer=_mh.create_initializer(
            initializer_range=self.initializer_range))
    return output
def textCNN(embedding, seq_length, window_size, pool_size, filter_number,
            hidden_size, dropout_prob, initializer_range, scope=None):
    """Apply a textCNN over the embeddings.

    Adapted from:
    https://github.com/dennybritz/cnn-text-classification-tf/blob/master/text_cnn.py
    Double Salute!
    """
    embedding_shape = _mh.get_shape_list(embedding)
    seq_length = embedding_shape[1]
    embedding_size = embedding_shape[2]
    # [batch_size, seq_length, embedding_size, 1]
    embedded_expanded = tf.expand_dims(embedding, -1)

    pooled_outputs = []
    for i, ws in enumerate(window_size):
        with tf.variable_scope(scope, default_name='conv_{}'.format(i)):
            # Convolution
            filter_shape = [ws, embedding_size, 1, filter_number]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
            b = tf.Variable(tf.constant(0.1, shape=[filter_number]), name='b')
            conv = tf.nn.conv2d(embedded_expanded,
                                W,
                                strides=[1, 1, 1, 1],
                                padding='VALID',
                                name='conv')
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
            # Max pooling
            pooled = tf.nn.max_pool(h,
                                    ksize=[1, pool_size[i], 1, 1],
                                    strides=[1, 1, 1, 1],
                                    padding='VALID',
                                    name='pool')
            pooled_outputs.append(pooled)

    # Combine all the pooled features
    num_filters_total = filter_number * len(window_size)
    h_pool = tf.concat(pooled_outputs, 3)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    # Add dropout
    with tf.name_scope('dropout'):
        h_drop = tf.nn.dropout(h_pool_flat, keep_prob=(1 - dropout_prob))

    # Final output
    with tf.variable_scope('textCNN_output'):
        output = tf.layers.dense(h_drop,
                                 hidden_size,
                                 activation=tf.nn.relu,
                                 name='layer_output',
                                 kernel_initializer=_mh.create_initializer(
                                     initializer_range=initializer_range))
    return output
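# Hedged usage sketch for `textCNN` (the hyperparameter values below are
# illustrative assumptions, not the project's actual configuration): each window
# size yields `filter_number` feature maps that are max-pooled down to a single
# value, concatenated, and projected to `hidden_size`. With padding='VALID',
# pool_size[i] must equal seq_length - window_size[i] + 1 so that each pooled
# map collapses to [batch_size, 1, 1, filter_number].
#
#   embedding = tf.placeholder(tf.float32, [None, 64, 300])   # [batch, seq, emb]
#   text_output = textCNN(embedding,
#                         seq_length=64,
#                         window_size=[3, 4, 5],
#                         pool_size=[62, 61, 60],   # 64 - ws + 1
#                         filter_number=128,
#                         hidden_size=256,
#                         dropout_prob=0.1,
#                         initializer_range=0.02)
#   # text_output: [batch_size, 256]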
def classify_layer(self, inputs):
    """Classify the input into one of `num_classes` classes."""
    output = tf.layers.dense(inputs,
                             self.num_classes,
                             activation=None,
                             name='label_layer',
                             kernel_initializer=_mh.create_initializer(
                                 initializer_range=self.initializer_range))
    return output
def build(self, input_text, input_image, scope=None):
    """Build the whole graph."""
    with tf.variable_scope(scope, default_name='EANN'):
        # Embedding
        with tf.variable_scope('embeddings'):
            embedding_output, self.embedding_table = _mh.embedding_lookup(
                input_ids=input_text,
                vocab_size=self.vocab_size,
                embedding_size=self.embedding_size,
                initializer_range=self.initializer_range,
                word_embedding_name='word_embeddings')

        # textCNN -> [batch_size, hidden_size]
        with tf.variable_scope('textCNN'):
            text_output = textCNN(embedding_output, self.seq_length,
                                  self.window_size, self.pool_size,
                                  self.filter_number_text, self.hidden_size,
                                  self.dropout, self.initializer_range)

        # VGG-19
        with tf.variable_scope('vgg_19'):
            image_output = self.vgg(input_image)
            # image_output.pretrained()
            batch_size = _mh.get_shape_list(image_output)[0]
            # Flatten the feature map: the following dense layer needs a
            # fully specified last dimension (25088 = 7 * 7 * 512 for VGG-19).
            image_output = tf.reshape(image_output, (batch_size, 25088))
            image_output = tf.layers.dense(
                image_output,
                self.hidden_size,
                activation=None,
                name='image_output_layer',
                kernel_initializer=_mh.create_initializer(
                    initializer_range=self.initializer_range))

        # Concatenate the text output with the image output
        text_image_output = tf.concat((text_output, image_output), -1)

        # Label classification layer
        with tf.variable_scope('classify_label'):
            label_output = self.classify_layer(text_image_output)

        # Domain classification layer
        with tf.variable_scope('classify_domain'):
            # Apply gradient reversal here (see the flip_gradient sketch below)
            reverse_text_image_output = flip_gradient(text_image_output)
            domain_output = self.classify_domain(reverse_text_image_output)

    return label_output, domain_output, batch_size
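# `flip_gradient` above is imported from elsewhere in the project; the function
# below is only a minimal sketch of a gradient-reversal layer, assuming TF 1.x
# with `tf.custom_gradient` available, to illustrate what the adversarial domain
# branch relies on: identity in the forward pass, negated gradient backward.
def _flip_gradient_sketch(x, scale=1.0):
    """Forward: identity. Backward: incoming gradient multiplied by -scale."""
    @tf.custom_gradient
    def _identity_with_reversed_grad(x):
        def grad(dy):
            return -scale * dy
        return tf.identity(x), grad
    return _identity_with_reversed_grad(x)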
def __init__(self,
             config,
             is_training,
             input_ids,
             input_mask=None,
             token_type_ids=None,
             pre_positional_embeddings=None,
             use_one_hot_embeddings=False,
             scope=None):
    """Constructor for ALBert.

    Args:
        config: # TODO
        is_training: bool. If True, enable dropout; else disable dropout.
        input_ids: int32 Tensor of shape [batch_size, seq_length].
        input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]
            that marks the padding positions.
            ATTENTION: for the UniLM model, the input_mask has shape
            [seq_length, seq_length]; see `create_mask_for_lm` in load_data.py.
        token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]
            that marks which segment each word belongs to.
        use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings.
    """
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    input_shape = _mh.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
        # Every position is a real word, no padding.
        input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name='bert'):
        # Embedding
        with tf.variable_scope('embeddings'):
            # 1. obtain the factorized word embeddings
            self.embedding_output, self.embedding_table, self.projection_table = \
                _mh.embedding_lookup_factorized(
                    input_ids=input_ids,
                    vocab_size=config.vocab_size,
                    hidden_size=config.hidden_size,
                    embedding_size=config.embedding_size,
                    use_one_hot_embedding=use_one_hot_embeddings,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings')
            # 2. add positional embeddings
            self.embedding_output = _mh.embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=False,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.token_type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_positional_embeddings=True,
                positional_embedding_type=config.pre_positional_embedding_type,
                pre_positional_embeddings=pre_positional_embeddings,
                positional_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_positional_embeddings=config.max_positional_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        # Encoder
        with tf.variable_scope('encoder'):
            # Obtain the mask.
            # ATTENTION: do not use the original mask method (not for this LM task);
            # see more in the comments below this class.
            # attention_mask = _mh.create_attention_mask_from_input_mask(input_ids, input_mask)
            attention_mask = input_mask
            self.all_encoder_layers = tranformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=_mh.gelu,
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True,
                share_parameter_across_layers=False)

        self.sequence_output = self.all_encoder_layers[-1]

        # For the classification task
        with tf.variable_scope('pooler'):
            # [batch_size, seq_length, hidden_size] -> [batch_size, hidden_size]
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=_mh.create_initializer(config.initializer_range))
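# Hedged usage sketch for the constructor above (the class name and the values
# are assumptions for illustration; the config fields listed are the ones the
# constructor actually reads):
#
#   model = ALBert(config=albert_config,      # vocab_size, hidden_size, embedding_size,
#                                             # num_hidden_layers, num_attention_heads, ...
#                  is_training=True,
#                  input_ids=input_ids,       # [batch_size, seq_length]
#                  input_mask=input_mask)     # [seq_length, seq_length] for the UniLM-style mask
#   sequence_output = model.sequence_output   # [batch_size, seq_length, hidden_size]
#   pooled_output = model.pooled_output       # [batch_size, hidden_size]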
def model_fn(features, labels, mode, params):
    _info('*** Features ***')
    for name in sorted(features.keys()):
        tf.logging.info('  name = %s, shape = %s' % (name, features[name].shape))

    input_ids = features['input_ids']    # [batch_size, seq_length]

    # Build the model
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = BertModelOfficial(config=bert_config,
                              is_training=is_training,
                              input_ids=input_ids)

    # pooled output: [b, h]
    sequence_output = model.get_pooled_output()
    # sequence_output = tf.reshape(sequence_output,
    #                              [-1, bert_config.max_length * bert_config.hidden_size])
    _info(sequence_output.shape)

    with tf.variable_scope('prediction'):
        logits = tf.layers.dense(sequence_output,
                                 bert_config.classes,
                                 name='prediction',
                                 kernel_initializer=_mh.create_initializer(0.2))
        # logits = _mh.batch_norm(logits, is_training=is_training)
        prob = tf.nn.softmax(logits, axis=-1)     # [b, 2]
        predict_ids = tf.argmax(prob, axis=-1)    # [b, ]

    if mode == tf.estimator.ModeKeys.PREDICT:
        # 'class' is a customized key; without a dict the Estimator uses its
        # default output key, with a dict the given keys are used as-is.
        predictions = {'class': predict_ids}
        output_spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        if mode == tf.estimator.ModeKeys.TRAIN:
            tvars = tf.trainable_variables()
            initialized_variable_names = {}
            if init_checkpoint:
                (assignment_map, initialized_variable_names) = \
                    get_assignment_map_from_checkpoint(tvars, init_checkpoint)
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

            _info('*** Trainable Variables ***')
            for var in tvars:
                init_string = ''
                if var.name in initialized_variable_names:
                    init_string = ', *INIT_FROM_CKPT*'
                _info('name = {}, shape = {}{}'.format(var.name, var.shape, init_string))

            batch_size = tf.cast(bert_config.batch_size, tf.float32)
            labels = tf.reshape(labels, [-1])
            # logits = tf.expand_dims(logits, axis=1)
            seq_loss = tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits)) / batch_size
            loss = seq_loss

            """
            Tutorial on `polynomial_decay`:

                global_step = min(global_step, decay_steps)
                decayed_learning_rate = (learning_rate - end_learning_rate) *
                                        (1 - global_step / decay_steps) ^ (power) +
                                        end_learning_rate

            global_step: the current batch step.
            decay_steps: the total number of steps; the lr reaches end_learning_rate
                after decay_steps.
            TRAIN_STEPS: the number of passes over the whole dataset, so
                decay_steps = len(dataset) / batch_size * TRAIN_STEPS.
            (A small numeric sketch follows model_fn below.)
            """
            train_op, lr = optimization.create_optimizer(loss,
                                                         bert_config.learning_rate,
                                                         bert_config.num_train_steps * 100,
                                                         bert_config.lr_limit)

            """
            learning_rate = tf.train.polynomial_decay(config.learning_rate,
                                                      tf.train.get_or_create_global_step(),
                                                      _cg.TRIAN_STEPS,
                                                      end_learning_rate=0.0,
                                                      power=1.0,
                                                      cycle=False)
            lr = tf.maximum(tf.constant(config.lr_limit), learning_rate)
            optimizer = tf.train.AdamOptimizer(lr, name='optimizer')
            tvars = tf.trainable_variables()
            gradients = tf.gradients(loss, tvars,
                                     colocate_gradients_with_ops=config.colocate_gradients_with_ops)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars),
                                                 global_step=tf.train.get_global_step())
            """

            # The logging hook reports every 10 steps (each step is one batch),
            # whereas the output_spec only reports at each checkpoint save.
            logging_hook = tf.train.LoggingTensorHook({'loss': loss, 'lr': lr},
                                                      every_n_iter=10)
            output_spec = tf.estimator.EstimatorSpec(mode,
                                                     loss=loss,
                                                     train_op=train_op,
                                                     training_hooks=[logging_hook])
        elif mode == tf.estimator.ModeKeys.EVAL:
            # TODO
            raise NotImplementedError

    return output_spec
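# Small numeric sketch of the `polynomial_decay` schedule described in the
# tutorial string inside model_fn (pure Python, illustrative values only; the
# optimizer actually used above is optimization.create_optimizer).
def _polynomial_decay_sketch(learning_rate, global_step, decay_steps,
                             end_learning_rate=0.0, power=1.0):
    step = min(global_step, decay_steps)
    return ((learning_rate - end_learning_rate)
            * (1 - step / decay_steps) ** power
            + end_learning_rate)

# With learning_rate=1e-3, decay_steps=10000, power=1.0:
#   step 0     -> 0.001
#   step 5000  -> 0.0005
#   step 10000 -> 0.0 (and stays at end_learning_rate afterwards)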
def build(self, sent_A, sent_B, sent_length_A, sent_length_B, scope=None):
    # RNN Encoder
    encoder_outputs_A = RNNEncoder(sent_A, sent_length_A, self.vocab_size,
                                   self.embedding_size, self.num_layers,
                                   self.hidden_size, self.forget_bias,
                                   self.dropout, self.initializer_range)
    encoder_outputs_B = RNNEncoder(sent_B, sent_length_B, self.vocab_size,
                                   self.embedding_size, self.num_layers,
                                   self.hidden_size, self.forget_bias,
                                   self.dropout, self.initializer_range)

    # CNN
    cnn_output_A = CNNExtractor(encoder_outputs_A, self.kernel_size,
                                self.pool_size, self.dropout,
                                self.initializer_range)
    cnn_output_B = CNNExtractor(encoder_outputs_B, self.kernel_size,
                                self.pool_size, self.dropout,
                                self.initializer_range)

    # Attention
    attention_A = AttentionLayer(encoder_outputs_A, encoder_outputs_B)
    attention_B = AttentionLayer(encoder_outputs_B, encoder_outputs_A)

    # Max and mean over the concatenation of the encoder outputs and the attention outputs
    V_a = tf.concat((encoder_outputs_A, attention_A,
                     encoder_outputs_A - attention_A,
                     tf.multiply(encoder_outputs_A, attention_A)), axis=-1)
    V_b = tf.concat((encoder_outputs_B, attention_B,
                     encoder_outputs_B - attention_B,
                     tf.multiply(encoder_outputs_B, attention_B)), axis=-1)
    v_a_max = tf.reduce_max(V_a, axis=-1)
    v_a_avg = tf.reduce_mean(V_a, axis=-1)
    v_b_max = tf.reduce_max(V_b, axis=-1)
    v_b_avg = tf.reduce_mean(V_b, axis=-1)

    # Concatenate the final output
    # (8*s_a -8)
    output_a = tf.concat((v_a_max, cnn_output_A, v_a_avg), axis=-1)
    # (8*s_b -8)
    output_b = tf.concat((v_b_max, cnn_output_B, v_b_avg), axis=-1)

    output = self.similarity_model(output_a, output_b)

    with tf.variable_scope('prediction'):
        layer_size = _mh.get_shape_list(output)[1] // 2
        output = tf.layers.dense(output,
                                 layer_size,
                                 activation=tf.nn.tanh,
                                 name='layer_mid',
                                 kernel_initializer=_mh.create_initializer(
                                     initializer_range=self.initializer_range))
        output = tf.layers.dense(output,
                                 2,
                                 activation=tf.nn.tanh,
                                 name='layer_final',
                                 kernel_initializer=_mh.create_initializer(
                                     initializer_range=self.initializer_range))
    return output
def tranformer_model(input_tensor,
                     attention_mask=None,
                     hidden_size=1024,
                     num_hidden_layers=12,
                     num_attention_heads=12,
                     intermediate_size=3072,
                     intermediate_act_fn=_mh.gelu,
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     initializer_range=0.02,
                     do_return_all_layers=False,
                     share_parameter_across_layers=True):
    """Multi-head, multi-layer Transformer.

    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
        attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, seq_length],
            where 1 indicates a position that can be attended to and 0 a position that cannot.
        hidden_size: int. Hidden size of the Transformer.
        num_hidden_layers: int. Number of layers in the Transformer.
        num_attention_heads: int. Number of attention heads in the Transformer.
        intermediate_size: int. Size of the feed-forward layer.
        intermediate_act_fn: activation function applied after the feed-forward layer.
        hidden_dropout_prob: float.
        attention_probs_dropout_prob: float.
        initializer_range: float.
        do_return_all_layers: bool. Return the outputs from all hidden layers
            or just the final layer.
        share_parameter_across_layers: bool. Whether to share parameters across
            the attention layers.

    Returns:
        float Tensor of shape [batch_size, seq_length, hidden_size], or a list of
        'num_hidden_layers' such Tensors.
    """
    if hidden_size % num_attention_heads != 0:
        _error('The hidden size {} cannot be divided by the number of attention heads {}'
               .format(hidden_size, num_attention_heads))
        raise ValueError

    # the hidden size for each head
    attention_head_size = int(hidden_size / num_attention_heads)

    input_shape = _mh.get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The residual connections are applied to the outputs of every layer, so the
    # hidden size (the output width of each transformer block) must equal
    # input_width; at the beginning the "previous output" is the input tensor itself.
    # Differentiate hidden_size from intermediate_size: the intermediate layer
    # comes before the hidden (output) layer.
    if input_width != hidden_size:
        _error('The width of the input tensor {} is not equal to the hidden size {}'
               .format(input_width, hidden_size))
        raise ValueError

    # save the output from each transformer layer
    prev_output = input_tensor    # [batch_size, seq_length, width]
    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        if share_parameter_across_layers:
            name_variable_scope = 'layer_shared'
        else:
            name_variable_scope = 'layer_{}'.format(layer_idx)

        # Share the parameters across layers when share_parameter_across_layers
        # is True and this is not the first layer.
        with tf.variable_scope(
                name_variable_scope,
                reuse=True if (share_parameter_across_layers and layer_idx > 0) else False):
            layer_input = prev_output

            with tf.variable_scope('attention'):
                attention_heads = []
                with tf.variable_scope('self'):
                    attention_head = self_attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_output = attention_head

                # projection + residual connection to finish the self-attention block
                with tf.variable_scope('output'):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=_mh.create_initializer(initializer_range))
                    attention_output = _mh.dropout(attention_output, hidden_dropout_prob)
                    attention_output = _mh.layer_norm(attention_output + layer_input)

            # two linear projections to enhance the context representation
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=_mh.create_initializer(initializer_range))

            with tf.variable_scope('output'):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=_mh.create_initializer(initializer_range))
                layer_output = _mh.dropout(layer_output, hidden_dropout_prob)
                layer_output = _mh.layer_norm(layer_output + attention_output)

                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        return all_layer_outputs
    else:
        return all_layer_outputs[-1]
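# Hedged usage sketch for `tranformer_model` (the shapes and hyperparameters are
# illustrative assumptions): the output keeps the input shape
# [batch_size, seq_length, hidden_size].
#
#   sequence_output = tranformer_model(
#       input_tensor=embedding_output,      # [batch_size, seq_length, 768]
#       attention_mask=attention_mask,      # [batch_size, seq_length, seq_length]
#       hidden_size=768,
#       num_hidden_layers=12,
#       num_attention_heads=12,
#       intermediate_size=3072,
#       do_return_all_layers=False,
#       share_parameter_across_layers=True)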
def self_attention_layer(from_tensor,
                         to_tensor,
                         attention_mask=None,
                         num_attention_heads=1,
                         size_per_head=512,
                         query_act=None,
                         key_act=None,
                         value_act=None,
                         attention_probs_dropout_prob=0.0,
                         initializer_range=0.02,
                         batch_size=None,
                         from_seq_length=None,
                         to_seq_length=None):
    """Perform multi-head self-attention.

    Args:
        from_tensor: float Tensor of shape [batch_size, seq_length, width].
        to_tensor: float Tensor of shape [batch_size, seq_length, width].
        attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, seq_length],
            where 1 indicates a position that can be attended to and 0 a position that cannot.
        num_attention_heads: int. Number of attention heads in the Transformer.
        size_per_head: int. Size of each attention head.
        query_act: (optional) activation function for the query transform.
        key_act: (optional) activation function for the key transform.
        value_act: (optional) activation function for the value transform.
        attention_probs_dropout_prob: (optional) float.
        initializer_range: float.
        batch_size: (optional) int.
        from_seq_length: (optional) int.
        to_seq_length: (optional) int.

    Returns:
        float Tensor of shape [batch_size, from_seq_length, width].
    """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, size_per_head):
        """Change the order of the axes. width = num_attention_heads * size_per_head.

        Args:
            input_tensor: float Tensor of shape [batch_size, seq_length, width].

        Returns:
            float Tensor of shape [batch_size, num_attention_heads, seq_length, size_per_head].
        """
        output_tensor = tf.reshape(
            input_tensor,
            [batch_size, seq_length, num_attention_heads, size_per_head])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    # check the ranks
    from_shape = _mh.get_shape_list(from_tensor, expected_rank=3)
    to_shape = _mh.get_shape_list(to_tensor, expected_rank=3)
    if len(from_shape) != 3 or len(to_shape) != 3:
        _error('The rank of `from_tensor` should match the rank of `to_tensor`, and should be 3')
        raise ValueError

    # calculate the query, key, value
    # from_tensor: [batch_size, seq_length, width]
    #   -> query_layer: [batch_size, seq_length, num_attention_heads * size_per_head]
    # num_attention_heads * size_per_head == hidden_size == width
    query_layer = tf.layers.dense(
        from_tensor,
        num_attention_heads * size_per_head,
        activation=query_act,
        name='query',
        kernel_initializer=_mh.create_initializer(initializer_range))
    key_layer = tf.layers.dense(
        to_tensor,
        num_attention_heads * size_per_head,
        activation=key_act,
        name='key',
        kernel_initializer=_mh.create_initializer(initializer_range))
    value_layer = tf.layers.dense(
        to_tensor,
        num_attention_heads * size_per_head,
        activation=value_act,
        name='value',
        kernel_initializer=_mh.create_initializer(initializer_range))

    # [batch_size, seq_length, width] -> [batch_size, num_attention_heads, seq_length, size_per_head]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)
    key_layer = transpose_for_scores(key_layer, batch_size,
                                     num_attention_heads, to_seq_length,
                                     size_per_head)

    # calculate the attention scores
    # [batch_size, num_attention_heads, from_seq_length, to_seq_length]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # [batch_size, seq_length, seq_length] -> [batch_size, 1, seq_length, seq_length]
        attention_mask = tf.expand_dims(attention_mask, axis=1)
        # masked positions get a large negative bias so softmax assigns them ~0 probability
        adder = (1.0 - tf.cast(attention_mask, dtype=tf.float32)) * -10000.0
        attention_scores += adder

    attention_probs = tf.nn.softmax(attention_scores)
    attention_probs = _mh.dropout(attention_probs, attention_probs_dropout_prob)

    # calculate the context layer
    # [batch_size, num_attention_heads, to_seq_length, size_per_head]
    value_layer = transpose_for_scores(value_layer, batch_size,
                                       num_attention_heads, to_seq_length,
                                       size_per_head)
    # weight the values by the dropped-out attention probabilities (not the raw scores)
    context_layer = tf.matmul(attention_probs, value_layer)
    # [batch_size, from_seq_length, num_attention_heads, size_per_head]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    # [batch_size, from_seq_length, width]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

    return context_layer
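# Minimal numeric sketch of the additive mask used above (illustrative values
# only, plain NumPy): positions with mask == 0 receive a -10000 bias, so the
# softmax assigns them a probability of effectively zero.
import numpy as np

_scores = np.array([2.0, 1.0, 0.5])
_mask = np.array([1, 1, 0])
_adder = (1.0 - _mask) * -10000.0
_probs = np.exp(_scores + _adder) / np.sum(np.exp(_scores + _adder))
# _probs ~ [0.731, 0.269, 0.0]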