def build_encoder(self, input_lengths, input_mask, *args, **kargs):
    reuse = kargs["reuse"]
    word_emb, entity_emb = self.build_emebdding(*args, **kargs)
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    with tf.variable_scope(self.config.scope + "_encoder", reuse=reuse):
        input_dim = word_emb.get_shape()[-1]
        word_emb = match_utils.multi_highway_layer(word_emb, input_dim,
                                                   self.config.highway_layer_num)
        [sent_repres_fw, sent_repres_bw, sent_repres] = layer_utils.my_lstm_layer(
            word_emb,
            self.config.context_lstm_dim,
            input_lengths=input_lengths,
            scope_name=self.config.scope,
            reuse=reuse,
            is_training=self.is_training,
            dropout_rate=dropout_rate,
            use_cudnn=self.config.use_cudnn)
        # word_emb = tf.layers.dense(word_emb, self.emb_size)
        memory_tran = tf.transpose(self.memory, [1, 0])  # e * c
        word_emb_ = tf.expand_dims(sent_repres, 3)
        input_mask = tf.cast(input_mask, tf.float32)
        print(word_emb_.get_shape(), "======emb shape======")
        H_enc = leam_utils.att_emb_ngram_encoder_maxout(
            word_emb_,
            input_mask,
            self.memory,
            memory_tran,
            self.config)
        print("===H_enc shape===", H_enc.get_shape())
        return H_enc
def build_emebdding(self, *args, **kargs):
    reuse = kargs["reuse"]
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    word_emb = tf.nn.embedding_lookup(self.emb_mat, self.sent_token)
    if self.config.with_word_drop:
        word_drop_rate = tf.cond(self.is_training,
                                 lambda: self.config.word_drop_rate,
                                 lambda: 0.0)
        word_emb, word_drop_mask = common_utils.word_dropout(word_emb, word_drop_rate)
    else:
        word_drop_mask = self.sent_token_mask
    entity_emb = tf.nn.embedding_lookup(self.emb_mat, self.entity_token)
    [_, _, entity_emb] = layer_utils.my_lstm_layer(
        entity_emb,
        self.config.context_lstm_dim,
        input_lengths=self.entity_token_len,
        scope_name=self.config.scope,
        reuse=reuse,
        is_training=self.is_training,
        dropout_rate=dropout_rate,
        use_cudnn=self.config.use_cudnn)
    entity_mask = tf.expand_dims(self.entity_token_mask, axis=-1)  # batch x len x 1
    entity_emb = tf.reduce_max(qanet_layers.mask_logits(entity_emb, entity_mask), axis=1)
    entity_emb = tf.expand_dims(entity_emb, axis=1)
    seq_len = tf.reduce_max(self.sent_token_len)
    entity_emb = tf.tile(entity_emb, [1, seq_len, 1])
    mask = tf.expand_dims(self.sent_token_mask, -1)
    word_emb = tf.concat([word_emb, entity_emb], axis=-1)
    word_emb *= tf.cast(mask, tf.float32)
    print(word_emb.get_shape(), "=====word with entity========")
    if self.config.with_char:
        char_emb = self.build_char_embedding(self.sent_char,
                                             self.sent_char_len,
                                             self.char_mat,
                                             is_training=self.is_training,
                                             reuse=reuse)
        word_emb = tf.concat([word_emb, char_emb], axis=-1)
    return word_emb, word_drop_mask
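# A minimal, self-contained sketch of the masked max-pooling pattern used
# above (an assumption about what qanet_layers.mask_logits does, not the
# project's actual implementation): padded positions are pushed to a large
# negative value before tf.reduce_max so they can never win the pooling.
import tensorflow as tf

def mask_logits_sketch(inputs, mask, mask_value=-1e30):
    # inputs: batch x len x dim; mask: batch x len x 1 (1 = real token, 0 = pad)
    mask = tf.cast(mask, tf.float32)
    return inputs * mask + (1.0 - mask) * mask_value

# toy check: the padded last step never dominates the max
x = tf.constant([[[1.0], [5.0], [9.0]]])
m = tf.constant([[[1.0], [1.0], [0.0]]])
pooled = tf.reduce_max(mask_logits_sketch(x, m), axis=1)  # -> [[5.0]]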
def lstm_char_embedding(char_token, char_lengths, char_embedding, config,
                        is_training=True, reuse=None):
    dropout_rate = tf.cond(is_training,
                           lambda: config.dropout_rate,
                           lambda: 0.0)
    with tf.variable_scope(config.scope + "_lstm_char_embedding_layer", reuse=reuse):
        char_dim = char_embedding.get_shape()[-1]
        input_shape = tf.shape(char_token)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        char_len = input_shape[2]
        in_question_char_repres = tf.nn.embedding_lookup(char_embedding, char_token)
        in_question_char_repres = tf.reshape(in_question_char_repres,
                                             shape=[-1, char_len, char_dim])
        question_char_lengths = tf.reshape(char_lengths, [-1])
        question_char_mask = tf.sequence_mask(
            question_char_lengths, char_len,
            dtype=tf.float32)  # [batch_size*question_len, q_char_len]
        in_question_char_repres = tf.multiply(
            in_question_char_repres,
            tf.expand_dims(question_char_mask, axis=-1))
        (question_char_outputs_fw, question_char_outputs_bw,
         _) = layer_utils.my_lstm_layer(in_question_char_repres,
                                        config.char_lstm_dim,
                                        input_lengths=question_char_lengths,
                                        scope_name="char_lstm",
                                        reuse=reuse,
                                        is_training=is_training,
                                        dropout_rate=dropout_rate,
                                        use_cudnn=config.use_cudnn)
        question_char_outputs_fw = layer_utils.collect_final_step_of_lstm(
            question_char_outputs_fw, question_char_lengths - 1)
        question_char_outputs_bw = question_char_outputs_bw[:, 0, :]
        question_char_outputs = tf.concat(
            axis=1, values=[question_char_outputs_fw, question_char_outputs_bw])
        question_char_outputs = tf.reshape(
            question_char_outputs,
            [batch_size, question_len, 2 * config.char_lstm_dim])
        return question_char_outputs
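# A minimal sketch of the behavior assumed for
# layer_utils.collect_final_step_of_lstm (a hypothetical re-implementation,
# not the project's code): gather, for every sequence in the batch, the
# forward hidden state at its last valid timestep.
import tensorflow as tf

def collect_final_step_of_lstm_sketch(lstm_outputs, last_positions):
    # lstm_outputs: batch x len x dim; last_positions: batch (= lengths - 1)
    batch_size = tf.shape(lstm_outputs)[0]
    indices = tf.stack(
        [tf.range(batch_size), tf.cast(last_positions, tf.int32)], axis=1)
    return tf.gather_nd(lstm_outputs, indices)  # batch x dim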
def build_encoder(self, index, input_lengths, input_mask, *args, **kargs):
    reuse = kargs["reuse"]
    word_emb = self.build_emebdding(index, *args, **kargs)
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    word_emb = tf.nn.dropout(word_emb, 1 - dropout_rate)
    with tf.variable_scope(self.config.scope + "_input_highway", reuse=reuse):
        input_dim = word_emb.get_shape()[-1]
        sent_repres = match_utils.multi_highway_layer(word_emb, input_dim,
                                                      self.config.highway_layer_num)
        [_, _, sent_repres] = layer_utils.my_lstm_layer(
            sent_repres,
            self.config.context_lstm_dim,
            input_lengths=input_lengths,
            scope_name=self.config.scope,
            reuse=reuse,
            is_training=self.is_training,
            dropout_rate=dropout_rate,
            use_cudnn=self.config.use_cudnn)
        sent_repres = tf.layers.dense(sent_repres,
                                      self.config.context_lstm_dim * 2,
                                      activation=tf.nn.relu) + sent_repres
        ignore_padding = (1 - input_mask)
        ignore_padding = decathlon_utils.attention_bias_ignore_padding(ignore_padding)
        encoder_self_attention_bias = ignore_padding
        output = decathlon_utils.multihead_attention_texar(
            sent_repres,
            memory=None,
            memory_attention_bias=encoder_self_attention_bias,
            num_heads=self.config.num_heads,
            num_units=None,
            dropout_rate=dropout_rate,
            scope="multihead_attention")
        output = tf.layers.dense(output,
                                 self.config.context_lstm_dim * 2,
                                 activation=tf.nn.relu) + output
        output = qanet_layers.layer_norm(output, scope="layer_norm", reuse=reuse)
        return output
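# A small standalone illustration of the dropout convention repeated in these
# encoders (standard TF1 APIs only; the toy rate of 0.2 is an assumption):
# dropout_rate is switched to 0.0 at inference time via tf.cond, and
# tf.nn.dropout takes a keep probability, hence the 1 - dropout_rate argument.
import tensorflow as tf

is_training = tf.placeholder(tf.bool, [], name="is_training")
dropout_rate = tf.cond(is_training, lambda: 0.2, lambda: 0.0)
features = tf.random_normal([8, 16])
features = tf.nn.dropout(features, 1 - dropout_rate)  # keep_prob = 1 - rate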
def build_encoder(self, index, input_lengths, *args, **kargs):
    reuse = kargs["reuse"]
    word_emb = self.build_emebdding(index, *args, **kargs)
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    word_emb = tf.nn.dropout(word_emb, 1 - dropout_rate)
    with tf.variable_scope(self.config.scope + "_input_highway", reuse=reuse):
        input_dim = word_emb.get_shape()[-1]
        sent_repres = match_utils.multi_highway_layer(word_emb, input_dim,
                                                      self.config.highway_layer_num)
        if self.config.rnn == "lstm":
            [sent_repres_fw, sent_repres_bw, sent_repres] = layer_utils.my_lstm_layer(
                sent_repres,
                self.config.context_lstm_dim,
                input_lengths=input_lengths,
                scope_name=self.config.scope,
                reuse=reuse,
                is_training=self.is_training,
                dropout_rate=dropout_rate,
                use_cudnn=self.config.use_cudnn)
        elif self.config.rnn == "slstm":
            word_emb_proj = tf.layers.dense(word_emb, self.config.slstm_hidden_size)
            initial_hidden_states = word_emb_proj
            initial_cell_states = tf.identity(initial_hidden_states)
            [new_hidden_states, new_cell_states,
             dummynode_hidden_states] = slstm_utils.slstm_cell(
                self.config, self.config.scope,
                self.config.slstm_hidden_size, input_lengths,
                initial_hidden_states, initial_cell_states,
                self.config.slstm_layer_num, dropout_rate, reuse=reuse)
            sent_repres = new_hidden_states
        return sent_repres
def build_encoder(self, sent_repres, input_lengths, *args, **kargs):
    reuse = kargs["reuse"]
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    with tf.variable_scope(self.config.scope + "_input_highway", reuse=reuse):
        if self.config.rnn == "lstm":
            [sent_repres_fw, sent_repres_bw, sent_repres] = layer_utils.my_lstm_layer(
                sent_repres,
                self.config.context_lstm_dim,
                input_lengths=input_lengths,
                scope_name=self.config.scope,
                reuse=reuse,
                is_training=self.is_training,
                dropout_rate=dropout_rate,
                use_cudnn=self.config.use_cudnn,
                lstm_type=self.config.lstm_type)
            match_dim = self.config.context_lstm_dim * 2
        elif self.config.rnn == "slstm":
            # project the incoming representation for the slstm initial states
            word_emb_proj = tf.layers.dense(sent_repres, self.config.slstm_hidden_size)
            initial_hidden_states = word_emb_proj
            initial_cell_states = tf.identity(initial_hidden_states)
            [new_hidden_states, new_cell_states,
             dummynode_hidden_states] = slstm_utils.slstm_cell(
                self.config, self.config.scope,
                self.config.slstm_hidden_size, input_lengths,
                initial_hidden_states, initial_cell_states,
                self.config.slstm_layer_num, dropout_rate, reuse=reuse)
            match_dim = self.config.slstm_hidden_size * 2
            sent_repres = new_hidden_states
        return sent_repres, match_dim
def build_compression(self, context_fusion, context_mask, context_len,
                      scope_name, *args, **kargs):
    reuse = kargs["reuse"]
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    with tf.variable_scope(self.config.scope + "_compression_" + scope_name,
                           reuse=reuse):
        ignore_padding = (1 - context_mask)
        ignore_padding = decathlon_utils.attention_bias_ignore_padding(ignore_padding)
        encoder_self_attention_bias = ignore_padding
        context_repres = decathlon_utils.multihead_attention_texar(
            context_fusion,
            memory=None,
            memory_attention_bias=encoder_self_attention_bias,
            num_heads=self.config.num_heads,
            num_units=None,
            dropout_rate=dropout_rate,
            scope="context")
        context_repres = tf.layers.dense(context_repres,
                                         self.config.context_lstm_dim * 2,
                                         activation=tf.nn.relu) + context_repres
        context_repres = qanet_layers.layer_norm(context_repres,
                                                 scope="layer_norm",
                                                 reuse=reuse)
        [_, _, context_repres] = layer_utils.my_lstm_layer(
            context_repres,
            self.config.context_lstm_dim,
            input_lengths=context_len,
            scope_name=self.config.scope,
            reuse=reuse,
            is_training=self.is_training,
            dropout_rate=dropout_rate,
            use_cudnn=self.config.use_cudnn)
        return context_repres
def build_attention_aggregation(self, coattention, context, context_len,
                                context_mask, sent1_repres, sent2_repres,
                                sent1_len, sent2_len, sent1_mask, sent2_mask,
                                *args, **kargs):
    reuse = kargs["reuse"]
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    context_fusion = []
    with tf.variable_scope(self.config.scope + "_aggerate_attention", reuse=reuse):
        for i in range(len(self.config.attn_lst)):
            context_f = tf.concat([context, coattention[i]], axis=-1)
            with tf.variable_scope(self.config.scope + "_attn_fusion_{}".format(i),
                                   reuse=reuse):
                [_, _, context_f] = layer_utils.my_lstm_layer(
                    context_f,
                    self.config.context_lstm_dim,
                    input_lengths=context_len,
                    scope_name=self.config.scope + "_context",
                    reuse=None,
                    is_training=self.is_training,
                    dropout_rate=dropout_rate,
                    use_cudnn=self.config.use_cudnn)
                context_fusion.append(context_f)
        # batch x 4 x len x dim
        context_fusion = tf.stack(context_fusion, axis=1)
        with tf.variable_scope(self.config.scope + "_attention_fusion", reuse=reuse):
            context_fusion = decathlon_utils.attention_fusion(
                context_fusion, context_mask,
                self.config.scope + "_context_fusion", reuse=reuse)
    return context_fusion
def build_interactor(self, sent1_repres, sent2_repres, sent1_len, sent2_len,
                     sent1_mask, sent2_mask, *args, **kargs):
    reuse = kargs["reuse"]
    input_dim = sent1_repres.get_shape()[-1]
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    with tf.variable_scope(self.config.scope + "_interaction_module", reuse=reuse):
        if self.config.with_self_attention:
            v_1_attn = esim_utils.multihead_attention(
                sent1_repres,
                sent1_repres,
                num_units=None,
                num_heads=self.config.num_heads,
                dropout_rate=dropout_rate,
                is_training=True,
                causality=False,
                scope="multihead_attention",
                reuse=None)
            v_2_attn = esim_utils.multihead_attention(
                sent2_repres,
                sent2_repres,
                num_units=None,
                num_heads=self.config.num_heads,
                dropout_rate=dropout_rate,
                is_training=True,
                causality=False,
                scope="multihead_attention",
                reuse=True)
            sent1_repres = tf.concat([sent1_repres, v_1_attn], axis=-1)
            sent2_repres = tf.concat([sent2_repres, v_2_attn], axis=-1)

        [query_attention_outputs,
         context_attention_outputs] = esim_utils.query_context_attention(
            sent1_repres, sent2_repres, sent1_len, sent2_len,
            sent1_mask, sent2_mask, dropout_rate,
            self.config.scope, reuse=reuse)

        if self.config.rnn == "lstm":
            [sent1_repres_fw, sent1_repres_bw, sent1_repres] = layer_utils.my_lstm_layer(
                query_attention_outputs,
                self.config.context_lstm_dim,
                input_lengths=sent1_len,
                scope_name=self.config.scope,
                reuse=None,
                is_training=self.is_training,
                dropout_rate=dropout_rate,
                use_cudnn=self.config.use_cudnn)
            [sent2_repres_fw, sent2_repres_bw, sent2_repres] = layer_utils.my_lstm_layer(
                context_attention_outputs,
                self.config.context_lstm_dim,
                input_lengths=sent2_len,
                scope_name=self.config.scope,
                reuse=True,
                is_training=self.is_training,
                dropout_rate=dropout_rate,
                use_cudnn=self.config.use_cudnn)
            match_dim = self.config.context_lstm_dim * 8
        elif self.config.rnn == "slstm":
            sent1_initial_hidden_states = tf.layers.dense(
                query_attention_outputs, self.config.slstm_hidden_size)
            sent1_initial_cell_states = tf.identity(sent1_initial_hidden_states)
            [new_sent1_hidden_states, new_sent1_cell_states,
             dummynode_sent1_hidden_states] = slstm_utils.slstm_cell(
                self.config, self.config.scope,
                self.config.slstm_hidden_size, sent1_len,
                sent1_initial_hidden_states, sent1_initial_cell_states,
                self.config.slstm_layer_num, dropout_rate, reuse=None)
            sent1_repres = new_sent1_hidden_states

            sent2_initial_hidden_states = tf.layers.dense(
                context_attention_outputs, self.config.slstm_hidden_size)
            sent2_initial_cell_states = tf.identity(sent2_initial_hidden_states)
            [new_sent2_hidden_states, new_sent2_cell_states,
             dummynode_sent2_hidden_states] = slstm_utils.slstm_cell(
                self.config, self.config.scope,
                self.config.slstm_hidden_size, sent2_len,
                sent2_initial_hidden_states, sent2_initial_cell_states,
                self.config.slstm_layer_num, dropout_rate, reuse=True)
            sent2_repres = new_sent2_hidden_states
            match_dim = self.config.slstm_hidden_size * 4

        v_1_sum = tf.reduce_sum(sent1_repres, 1)
        v_1_ave = tf.div(v_1_sum,
                         tf.expand_dims(tf.cast(sent1_len, tf.float32) + EPSILON, -1))
        v_2_sum = tf.reduce_sum(sent2_repres, 1)
        v_2_ave = tf.div(v_2_sum,
                         tf.expand_dims(tf.cast(sent2_len, tf.float32) + EPSILON, -1))

        # v_1_max = tf.reduce_max(sent1_repres, 1)
        # v_2_max = tf.reduce_max(sent2_repres, 1)
        mask_q = tf.expand_dims(sent1_mask, -1)
        mask_c = tf.expand_dims(sent2_mask, -1)
        v_1_max = tf.reduce_max(qanet_layers.mask_logits(sent1_repres, mask_q), axis=1)
        v_2_max = tf.reduce_max(qanet_layers.mask_logits(sent2_repres, mask_c), axis=1)

        out1 = tf.concat([v_1_ave, v_1_max], axis=-1)
        out2 = tf.concat([v_2_ave, v_2_max], axis=-1)
        out = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

        return out1, out2, out, match_dim
def bilateral_match_func(in_question_repres, in_passage_repres,
                         question_lengths, passage_lengths,
                         question_mask, passage_mask, input_dim,
                         is_training, options=None):
    question_aware_representations = []
    question_aware_dim = 0
    passage_aware_representations = []
    passage_aware_dim = 0

    # ====word level matching======
    (match_reps, match_dim) = match_passage_with_question(
        in_passage_repres, in_question_repres, passage_mask, question_mask,
        passage_lengths, question_lengths, input_dim,
        scope="word_match_forward",
        with_full_match=False,
        with_maxpool_match=options.with_maxpool_match,
        with_attentive_match=options.with_attentive_match,
        with_max_attentive_match=options.with_max_attentive_match,
        is_training=is_training,
        options=options,
        dropout_rate=options.dropout_rate,
        forward=True)
    question_aware_representations.append(match_reps)
    question_aware_dim += match_dim

    (match_reps, match_dim) = match_passage_with_question(
        in_question_repres, in_passage_repres, question_mask, passage_mask,
        question_lengths, passage_lengths, input_dim,
        scope="word_match_backward",
        with_full_match=False,
        with_maxpool_match=options.with_maxpool_match,
        with_attentive_match=options.with_attentive_match,
        with_max_attentive_match=options.with_max_attentive_match,
        is_training=is_training,
        options=options,
        dropout_rate=options.dropout_rate,
        forward=False)
    passage_aware_representations.append(match_reps)
    passage_aware_dim += match_dim

    with tf.variable_scope('context_MP_matching'):
        for i in range(options.context_layer_num):  # support multiple context layers
            with tf.variable_scope('layer-{}'.format(i)):
                # contextual lstm for both passage and question
                in_question_repres = tf.multiply(
                    in_question_repres, tf.expand_dims(question_mask, axis=-1))
                in_passage_repres = tf.multiply(
                    in_passage_repres, tf.expand_dims(passage_mask, axis=-1))
                (question_context_representation_fw,
                 question_context_representation_bw,
                 in_question_repres) = layer_utils.my_lstm_layer(
                    in_question_repres,
                    options.context_lstm_dim,
                    input_lengths=question_lengths,
                    scope_name="context_represent",
                    reuse=False,
                    is_training=is_training,
                    dropout_rate=options.dropout_rate,
                    use_cudnn=options.use_cudnn)
                (passage_context_representation_fw,
                 passage_context_representation_bw,
                 in_passage_repres) = layer_utils.my_lstm_layer(
                    in_passage_repres,
                    options.context_lstm_dim,
                    input_lengths=passage_lengths,
                    scope_name="context_represent",
                    reuse=True,
                    is_training=is_training,
                    dropout_rate=options.dropout_rate,
                    use_cudnn=options.use_cudnn)

                # Multi-perspective matching
                with tf.variable_scope('left_MP_matching'):
                    (match_reps, match_dim) = match_passage_with_question(
                        passage_context_representation_fw,
                        question_context_representation_fw,
                        passage_mask, question_mask,
                        passage_lengths, question_lengths,
                        options.context_lstm_dim,
                        scope="forward_match",
                        with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training,
                        options=options,
                        dropout_rate=options.dropout_rate,
                        forward=True)
                    question_aware_representations.append(match_reps)
                    question_aware_dim += match_dim
                    (match_reps, match_dim) = match_passage_with_question(
                        passage_context_representation_bw,
                        question_context_representation_bw,
                        passage_mask, question_mask,
                        passage_lengths, question_lengths,
                        options.context_lstm_dim,
                        scope="backward_match",
                        with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training,
                        options=options,
                        dropout_rate=options.dropout_rate,
                        forward=False)
                    question_aware_representations.append(match_reps)
                    question_aware_dim += match_dim

                with tf.variable_scope('right_MP_matching'):
                    (match_reps, match_dim) = match_passage_with_question(
                        question_context_representation_fw,
                        passage_context_representation_fw,
                        question_mask, passage_mask,
                        question_lengths, passage_lengths,
                        options.context_lstm_dim,
                        scope="forward_match",
                        with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training,
                        options=options,
                        dropout_rate=options.dropout_rate,
                        forward=True)
                    passage_aware_representations.append(match_reps)
                    passage_aware_dim += match_dim
                    (match_reps, match_dim) = match_passage_with_question(
                        question_context_representation_bw,
                        passage_context_representation_bw,
                        question_mask, passage_mask,
                        question_lengths, passage_lengths,
                        options.context_lstm_dim,
                        scope="backward_match",
                        with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training,
                        options=options,
                        dropout_rate=options.dropout_rate,
                        forward=False)
                    passage_aware_representations.append(match_reps)
                    passage_aware_dim += match_dim

    question_aware_representations = tf.concat(
        axis=2, values=question_aware_representations
    )  # [batch_size, passage_len, question_aware_dim]
    passage_aware_representations = tf.concat(
        axis=2, values=passage_aware_representations
    )  # [batch_size, question_len, passage_aware_dim]

    dropout_rate = tf.cond(is_training,
                           lambda: options.dropout_rate,
                           lambda: 0.0)
    question_aware_representations = tf.nn.dropout(question_aware_representations,
                                                   (1 - dropout_rate))
    passage_aware_representations = tf.nn.dropout(passage_aware_representations,
                                                  (1 - dropout_rate))

    # ======Highway layer======
    if options.with_match_highway:
        with tf.variable_scope("left_matching_highway"):
            question_aware_representations = multi_highway_layer(
                question_aware_representations, question_aware_dim,
                options.highway_layer_num)
        with tf.variable_scope("right_matching_highway"):
            passage_aware_representations = multi_highway_layer(
                passage_aware_representations, passage_aware_dim,
                options.highway_layer_num)

    # ========Aggregation Layer======
    aggregation_representation = []
    aggregation_dim = 0
    qa_aggregation_input = question_aware_representations
    pa_aggregation_input = passage_aware_representations
    with tf.variable_scope('aggregation_layer'):
        for i in range(options.aggregation_layer_num):  # support multiple aggregation layers
            qa_aggregation_input = tf.multiply(
                qa_aggregation_input, tf.expand_dims(passage_mask, axis=-1))
            (fw_rep, bw_rep,
             cur_aggregation_representation) = layer_utils.my_lstm_layer(
                qa_aggregation_input,
                options.aggregation_lstm_dim,
                input_lengths=passage_lengths,
                scope_name='left_layer-{}'.format(i),
                reuse=False,
                is_training=is_training,
                dropout_rate=options.dropout_rate,
                use_cudnn=options.use_cudnn)
            fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, passage_lengths - 1)
            bw_rep = bw_rep[:, 0, :]
            aggregation_representation.append(fw_rep)
            aggregation_representation.append(bw_rep)
            aggregation_dim += 2 * options.aggregation_lstm_dim
            # [batch_size, passage_len, 2*aggregation_lstm_dim]
            qa_aggregation_input = cur_aggregation_representation

            pa_aggregation_input = tf.multiply(
                pa_aggregation_input, tf.expand_dims(question_mask, axis=-1))
            (fw_rep, bw_rep,
             cur_aggregation_representation) = layer_utils.my_lstm_layer(
                pa_aggregation_input,
                options.aggregation_lstm_dim,
                input_lengths=question_lengths,
                scope_name='right_layer-{}'.format(i),
                reuse=False,
                is_training=is_training,
                dropout_rate=options.dropout_rate,
                use_cudnn=options.use_cudnn)
            fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, question_lengths - 1)
            bw_rep = bw_rep[:, 0, :]
            aggregation_representation.append(fw_rep)
            aggregation_representation.append(bw_rep)
            aggregation_dim += 2 * options.aggregation_lstm_dim
            # [batch_size, question_len, 2*aggregation_lstm_dim]
            pa_aggregation_input = cur_aggregation_representation

    aggregation_representation = tf.concat(
        axis=1, values=aggregation_representation)  # [batch_size, aggregation_dim]

    # ======Highway layer======
    if options.with_aggregation_highway:
        with tf.variable_scope("aggregation_highway"):
            agg_shape = tf.shape(aggregation_representation)
            batch_size = agg_shape[0]
            aggregation_representation = tf.reshape(
                aggregation_representation, [1, batch_size, aggregation_dim])
            aggregation_representation = multi_highway_layer(
                aggregation_representation, aggregation_dim,
                options.highway_layer_num)
            aggregation_representation = tf.reshape(
                aggregation_representation, [batch_size, aggregation_dim])

    return (aggregation_representation, aggregation_dim)
def build_encoder(self, input_lengths, input_mask, *args, **kargs):
    reuse = kargs["reuse"]
    word_emb = self.build_emebdding(*args, **kargs)
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    word_emb = tf.nn.dropout(word_emb, 1 - dropout_rate)
    with tf.variable_scope(self.config.scope + "_input_highway", reuse=reuse):
        input_dim = word_emb.get_shape()[-1]
        sent_repres = match_utils.multi_highway_layer(word_emb, input_dim,
                                                      self.config.highway_layer_num)
        if self.config.rnn == "lstm":
            [sent_repres_fw, sent_repres_bw, sent_repres] = layer_utils.my_lstm_layer(
                sent_repres,
                self.config.context_lstm_dim,
                input_lengths=input_lengths,
                scope_name=self.config.scope,
                reuse=reuse,
                is_training=self.is_training,
                dropout_rate=dropout_rate,
                use_cudnn=self.config.use_cudnn)
            match_dim = self.config.context_lstm_dim * 6
        elif self.config.rnn == "slstm":
            word_emb_proj = tf.layers.dense(word_emb, self.config.slstm_hidden_size)
            initial_hidden_states = word_emb_proj
            initial_cell_states = tf.identity(initial_hidden_states)
            [new_hidden_states, new_cell_states,
             dummynode_hidden_states] = slstm_utils.slstm_cell(
                self.config, self.config.scope,
                self.config.slstm_hidden_size, input_lengths,
                initial_hidden_states, initial_cell_states,
                self.config.slstm_layer_num, dropout_rate, reuse=reuse)
            sent_repres = new_hidden_states
            match_dim = self.config.slstm_hidden_size * 3

        if self.config.multi_head:
            mask = tf.cast(input_mask, tf.float32)
            ignore_padding = (1 - mask)
            ignore_padding = label_network_utils.attention_bias_ignore_padding(
                ignore_padding)
            encoder_self_attention_bias = ignore_padding
            sent_repres = label_network_utils.multihead_attention_texar(
                sent_repres,
                memory=None,
                memory_attention_bias=encoder_self_attention_bias,
                num_heads=8,
                num_units=128,
                dropout_rate=dropout_rate,
                scope="multihead_attention")

        v_attn = self_attn.multi_dimensional_attention(
            sent_repres, input_mask,
            'multi_dim_attn_for_%s' % self.config.scope,
            1 - dropout_rate, self.is_training,
            self.config.weight_decay, "relu")

        mask = tf.expand_dims(input_mask, -1)
        v_sum = tf.reduce_sum(sent_repres * tf.cast(mask, tf.float32), 1)
        v_ave = tf.div(v_sum,
                       tf.expand_dims(tf.cast(input_lengths, tf.float32) + EPSILON, -1))
        v_max = tf.reduce_max(qanet_layers.mask_logits(sent_repres, mask), axis=1)
        v_last = esim_utils.last_relevant_output(sent_repres, input_lengths)
        out = tf.concat([v_ave, v_max, v_last, v_attn], axis=-1)
        return out, match_dim
def build_interactor(self, sent1_repres, sent2_repres, sent1_len, sent2_len,
                     sent1_mask, sent2_mask, *args, **kargs):
    reuse = kargs["reuse"]
    input_dim = sent1_repres.get_shape()[-1]
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    with tf.variable_scope(self.config.scope + "_interaction_module", reuse=reuse):
        [c2q_concat, q2c_concat] = man_utils.concat_attention(
            sent1_repres, sent2_repres, sent1_len, sent2_len,
            sent1_mask, sent2_mask, dropout_rate,
            self.config.scope, reuse=reuse)
        [c2q_bilinear, q2c_bilinear] = man_utils.bilinear_attention(
            sent1_repres, sent2_repres, sent1_len, sent2_len,
            sent1_mask, sent2_mask, dropout_rate,
            self.config.scope, reuse=reuse)
        [c2q_dot, q2c_dot] = man_utils.dot_attention(
            sent1_repres, sent2_repres, sent1_len, sent2_len,
            sent1_mask, sent2_mask, dropout_rate,
            self.config.scope, reuse=reuse)
        [c2q_minus, q2c_minus] = man_utils.minus_attention(
            sent1_repres, sent2_repres, sent1_len, sent2_len,
            sent1_mask, sent2_mask, dropout_rate,
            self.config.scope, reuse=reuse)

        sent1_agg = tf.concat(
            [sent1_repres, c2q_concat, c2q_bilinear, c2q_dot, c2q_minus], axis=-1)
        sent1_agg_dim = self.config.context_lstm_dim * 10
        sent2_agg = tf.concat(
            [sent2_repres, q2c_concat, q2c_bilinear, q2c_dot, q2c_minus], axis=-1)
        sent2_agg_dim = self.config.context_lstm_dim * 10

        with tf.variable_scope(self.config.scope + "_inner_highway", reuse=None):
            sent1_agg = match_utils.multi_highway_layer(
                sent1_agg, sent1_agg_dim, 1, scope="sent_attention_highway")
            tf.get_variable_scope().reuse_variables()
            sent2_agg = match_utils.multi_highway_layer(
                sent2_agg, sent2_agg_dim, 1, scope="sent_attention_highway")

        [_, _, sent1_agg] = layer_utils.my_lstm_layer(
            sent1_agg,
            self.config.context_lstm_dim,
            input_lengths=sent1_len,
            scope_name="inner_aggeration",
            reuse=False,
            is_training=self.is_training,
            dropout_rate=dropout_rate,
            use_cudnn=self.config.use_cudnn)
        [_, _, sent2_agg] = layer_utils.my_lstm_layer(
            sent2_agg,
            self.config.context_lstm_dim,
            input_lengths=sent2_len,
            scope_name="inner_aggeration",
            reuse=True,
            is_training=self.is_training,
            dropout_rate=dropout_rate,
            use_cudnn=self.config.use_cudnn)

        with tf.variable_scope(self.config.scope + "_predictor_self_attention",
                               reuse=None):
            context_attn = man_utils.self_attention(
                sent1_repres, sent2_agg, sent1_len, sent2_len,
                sent1_mask, sent2_mask, dropout_rate,
                self.config.scope, reuse=None)
            tf.get_variable_scope().reuse_variables()
            query_attn = man_utils.self_attention(
                sent2_repres, sent1_agg, sent2_len, sent1_len,
                sent2_mask, sent1_mask, dropout_rate,
                self.config.scope, reuse=None)

        aggre_output = tf.concat([
            context_attn, query_attn,
            tf.abs(context_attn - query_attn),
            context_attn * query_attn
        ], axis=-1)
        match_dim = self.config.context_lstm_dim * 2 * 4
        return context_attn, query_attn, aggre_output, match_dim
def build_encoder(self, index, input_lengths, input_mask, *args, **kargs):
    reuse = kargs["reuse"]
    word_emb = self.build_emebdding(index, *args, **kargs)
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    word_emb = tf.nn.dropout(word_emb, 1 - dropout_rate)
    with tf.variable_scope(self.config.scope + "_input_highway", reuse=reuse):
        input_dim = word_emb.get_shape()[-1]
        sent_repres = match_utils.multi_highway_layer(word_emb, input_dim,
                                                      self.config.highway_layer_num)
        if self.config.rnn == "lstm":
            [sent_repres_fw, sent_repres_bw, sent_repres] = layer_utils.my_lstm_layer(
                sent_repres,
                self.config.context_lstm_dim,
                input_lengths=input_lengths,
                scope_name=self.config.scope,
                reuse=reuse,
                is_training=self.is_training,
                dropout_rate=dropout_rate,
                use_cudnn=self.config.use_cudnn)
        elif self.config.rnn == "slstm":
            word_emb_proj = tf.layers.dense(word_emb, self.config.slstm_hidden_size)
            initial_hidden_states = word_emb_proj
            initial_cell_states = tf.identity(initial_hidden_states)
            [new_hidden_states, new_cell_states,
             dummynode_hidden_states] = slstm_utils.slstm_cell(
                self.config, self.config.scope,
                self.config.slstm_hidden_size, input_lengths,
                initial_hidden_states, initial_cell_states,
                self.config.slstm_layer_num, dropout_rate, reuse=reuse)
            sent_repres = new_hidden_states
        elif self.config.rnn == "base_transformer":
            sent_repres = base_transformer_utils.transformer_encoder(
                sent_repres,
                target_space=None,
                hparams=self.config,
                features=None,
                make_image_summary=False)
        elif self.config.rnn == "universal_transformer":
            sent_repres, act_loss = universal_transformer_utils.universal_transformer_encoder(
                sent_repres,
                target_space=None,
                hparams=self.config,
                features=None,
                make_image_summary=False)
        elif self.config.rnn == "highway":
            sent_repres = sent_repres

        input_mask = tf.expand_dims(tf.cast(input_mask, tf.float32), axis=-1)
        sent_repres_sum = tf.reduce_sum(sent_repres * input_mask, axis=1)
        sent_repres_avr = tf.div(
            sent_repres_sum,
            tf.expand_dims(tf.cast(input_lengths, tf.float32) + EPSILON, -1))

        if self.config.metric == "Hyperbolic":
            sent_repres = tf.clip_by_norm(sent_repres_sum, 1.0 - EPSILON, axes=1)
        else:
            sent_repres = sent_repres_avr

        if self.config.rnn == "universal_transformer":
            return sent_repres, act_loss
        else:
            return sent_repres
def build_encoder(self, input_lengths, input_mask, *args, **kargs):
    reuse = kargs["reuse"]
    word_emb, entity_emb = self.build_emebdding(*args, **kargs)
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    word_emb = tf.nn.dropout(word_emb, 1 - dropout_rate)
    with tf.variable_scope(self.config.scope + "_input_highway", reuse=reuse):
        input_dim = word_emb.get_shape()[-1]
        sent_repres = match_utils.multi_highway_layer(word_emb, input_dim,
                                                      self.config.highway_layer_num)
        mask = tf.expand_dims(input_mask, -1)
        # sent_repres = tf.layers.dense(sent_repres, self.emb_size)
        sent_repres *= tf.cast(mask, tf.float32)
        # sent_repres = label_network_utils.self_attn(
        #     enc=sent_repres,
        #     scope=self.config.scope,
        #     dropout=dropout_rate,
        #     reuse=None,
        #     config=self.config
        # )
        # sent_repres = label_network_utils.text_cnn(
        #     sent_repres,
        #     filter_sizes=[1, 3, 5],
        #     scope=self.config.scope,
        #     embed_size=self.emb_size,
        #     num_filters=self.config.num_filters)
        # output = sent_repres
        # print(sent_repres.get_shape(), "===text cnn encoder shape===")
        [sent_repres_fw, sent_repres_bw, sent_repres] = layer_utils.my_lstm_layer(
            sent_repres,
            self.config.context_lstm_dim,
            input_lengths=input_lengths,
            scope_name=self.config.scope,
            reuse=reuse,
            is_training=self.is_training,
            dropout_rate=dropout_rate,
            use_cudnn=self.config.use_cudnn)
        match_dim = self.config.context_lstm_dim * 8

    with tf.variable_scope(self.config.scope + "sent_label_attention", reuse=reuse):
        memory = tf.expand_dims(self.memory, axis=0)
        memory = tf.tile(memory, [tf.shape(sent_repres)[0], 1, 1])
        # entity_emb = tf.expand_dims(entity_emb, axis=1)
        # entity_emb = tf.tile(entity_emb, [1, tf.shape(memory)[1], 1])
        # print("===emb shape===", entity_emb.get_shape())
        # # batch x classes x dim
        # memory = tf.concat([memory, entity_emb], axis=-1)
        print("==memory shape==", memory.get_shape())
        # output = label_network_utils.memory_attention(sent_repres,
        #     memory, input_mask,
        #     scope=self.config.scope,
        #     memory_mask=None)
        print(sent_repres.get_shape(), memory.get_shape())
        output = label_network_utils.memory_attention_v1(
            sent_repres,
            memory,
            input_mask,
            "memory_attention",
            memory_mask=None,
            reuse=None,
            attention_output="multi_head",
            num_heads=4,
            dropout_rate=dropout_rate,
            threshold=1 / float(self.num_classes),
            apply_hard_attn=True)
        print("==output shape==", output.get_shape())
        return sent_repres, entity_emb, output