def _build_forward(self):
    """Build the BiDAF forward graph.

    Pipeline (following the BiDAF paper layers): char-CNN + word embeddings,
    optional highway network, a contextual bidirectional LSTM ("prepro"),
    the attention-flow layer, two modeling BiLSTM layers, and two answer
    pointers (start / end position softmaxes).

    Side effects: populates ``self.tensor_dict`` with intermediate tensors
    and sets ``self.logits``, ``self.logits2``, ``self.yp``, ``self.yp2``.
    """
    config = self.config
    # Static sizes from config: N=batch, M=#sentences, JX=sentence length,
    # JQ=question length, VW/VC=word/char vocab sizes, d=hidden size,
    # W=max word length in characters.
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # Replace M/JX/JQ by the dynamic dimensions of the fed batch, so the
    # graph handles batches shorter than the configured maxima.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    # dc=char emb size, dw=word emb size, dco=char-CNN output size.
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            # Character-level embeddings via a char-CNN.
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                # Collapse batch/sentence dims so conv runs per token sequence.
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Per-filter output channel counts; they must sum to the
                # configured char-CNN output size dco.
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                # Convolution window heights (in characters), one per filter.
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the context char-CNN weights (scope "xx")
                        # for the question.
                        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                              self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))  # emb_mat is glove
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
            if config.use_glove_for_unk:
                # Append the (non-trainable) extra GloVe rows fed at runtime
                # so OOV-of-train words still get pretrained vectors.
                word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Concatenate char-CNN and word embeddings along the feature axis.
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # Highway network over the combined embeddings; question reuses the
    # context's highway weights via scope reuse.
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                qq = highway_network(qq, config.highway_num_layers, True,
                                     wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(d, state_is_tuple=True)
    # Dropout wrapper that is active only when self.is_train is true.
    d_cell = SwitchableDropoutWrapper(cell, self.is_train,
                                      input_keep_prob=config.input_keep_prob)
    # True (unpadded) lengths derived from the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        # Contextual encoding of the question.
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat([fw_u, bw_u], 2)
        if config.share_lstm_weights:
            # Encode the context with the SAME LSTM weights (scope 'u1').
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Attend to the question inside the RNN cell instead of via a
            # separate attention layer; u/q_mask are tiled per sentence.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                                [N * M, JQ])
            first_cell = AttentionCell(
                cell, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            # Standard BiDAF attention-flow layer.
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # Two stacked modeling BiLSTM layers.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat([fw_g0, bw_g0], 3)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat([fw_g1, bw_g1], 3)

        # Output layer: start-position logits.
        logits = get_logits([g1, p0], d, True, wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train,
                            func=config.answer_func, scope='logits1')
        # Soft-selected summary of g1 at the predicted start, broadcast back
        # over every position as extra input for the end-pointer RNN.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

        # End-position pass conditioned on the start prediction.
        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell, d_cell, tf.concat([p0, g1, a1i, g1 * a1i], 3),
            x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat([fw_g2, bw_g2], 3)
        logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train,
                             func=config.answer_func, scope='logits2')

        # Flatten over (sentence, position) so softmax spans the whole context.
        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [-1, M, JX])
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2
        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
def _build_forward(self):
    """Build a BiDAF variant whose answer is produced by a pointer-network
    decoder instead of start/end softmaxes.

    Layers: char-CNN + word embeddings (optionally with exact-match
    features), highway network, contextual BiRNN (GRU or LSTM per
    ``config.GRU``), attention flow, a configurable stack of modeling
    layers, a self-match attention layer, then a pointer decoder
    (``ptr_decoder``) over the encoded context.

    Side effects: creates answer-string placeholders, sets
    ``self.decoder_train_logits`` / ``self.decoder_train_softmax`` /
    ``self.decoder_inference``; ``self.yp``/``self.yp2`` are set to
    dummy -1 tensors since span softmaxes are not used here.
    """
    config = self.config
    # Static sizes from config (see BiDAF notation): N=batch, M=#sentences,
    # JX=sentence length, JQ=question length, VW/VC=vocab sizes, d=hidden, W=word len.
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    JA = config.max_answer_length
    # Replace M/JX/JQ with dynamic dimensions of the fed batch.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        # Char-CNN Embedding
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Per-filter channel counts and window heights; channels must
                # sum to the char-CNN output size dco.
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Question reuses the context char-CNN weights ("xx").
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word Embedding
        if config.use_word_emb:
            with tf.variable_scope("emb_var") as scope, tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                tf.get_variable_scope().reuse_variables()
                # Kept so other graph parts can re-enter this scope later.
                self.word_emb_scope = scope
                if config.use_glove_for_unk:
                    # Append runtime-fed GloVe rows for words outside the
                    # trainable vocab.
                    word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq

            # Concat Char-CNN Embedding and Word Embedding
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

        # Exact match: append a binary feature marking tokens that also
        # appear in the other sequence (self.emx / self.emq).
        if config.use_exact_match:
            emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
            xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
            emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
            qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]

    # 2 layer highway network on the concatenated embeddings; question
    # reuses the context's highway weights.
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Bidirectional RNN encoder (3rd layer on paper); GRU or LSTM by config.
    cell = GRUCell(d) if config.GRU else BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(cell, self.is_train,
                                      input_keep_prob=config.input_keep_prob)
    # True (unpadded) lengths from the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), _ = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat([fw_u, bw_u], 2)
        if config.share_lstm_weights:
            # Context encoder shares the question encoder's weights ('u1').
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    # Attention Flow Layer (4th layer on paper)
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                                [N * M, JQ])
            first_cell = AttentionCell(
                cell, u, size=d, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # Modeling layer (5th layer on paper): LSTM_num_layers-1 extra
        # stacked BiRNN layers, then the final 'g1' layer.
        tp0 = p0  # keep the pre-modeling attention output for the decoder
        for layer_idx in range(config.LSTM_num_layers - 1):
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float',
                scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
            p0 = tf.concat([fw_g0, bw_g0], 3)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

    # Self match layer: context attends over itself.
    with tf.variable_scope("SelfMatch"):
        s0 = tf.reshape(g1, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
        x_mask = tf.reshape(self.x_mask, [N * M, JX])
        first_cell = AttentionCell(cell, s0, size=d, mask=x_mask,
                                   is_train=self.is_train)
        (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
            first_cell, first_cell, s0, x_len, dtype='float', scope='s')  # [N, M, JX, 2d]
        s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1

        # Prepare encoder outputs for the pointer network; zero out padding.
        encoder_output = tf.expand_dims(s1, 1)  # [N, M, JX, 2d]
        encoder_output = tf.expand_dims(
            tf.cast(self.x_mask, tf.float32), -1) * encoder_output  # [N, M, JX, 2d]

        # Build the decoder's initial state from the final fw/bw encoder
        # states; shape of the state depends on the cell type.
        if config.GRU:
            encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat')
        else:
            if isinstance(fw_s_f, LSTMStateTuple):
                encoder_state_c = tf.concat((fw_s_f.c, bw_s_f.c), 1,
                                            name='encoder_concat_c')
                encoder_state_h = tf.concat((fw_s_f.h, bw_s_f.h), 1,
                                            name='encoder_concat_h')
                encoder_state_final = LSTMStateTuple(c=encoder_state_c,
                                                     h=encoder_state_h)
            elif isinstance(fw_s_f, tf.Tensor):
                encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1,
                                                name='encoder_concat')
            else:
                encoder_state_final = None
                tf.logging.error("encoder_state_final not set")

    print("encoder_state_final:", encoder_state_final)

    with tf.variable_scope("output"):
        # eos_symbol = config.eos_symbol
        # next_symbol = config.next_symbol

        # NOTE(review): tf.assert_equal returns an op; it is not added as a
        # control dependency here, so the check may never execute — confirm.
        tf.assert_equal(M, 1)  # currently dynamic M is not supported, thus we assume M==1
        # Teacher-forcing targets for the pointer decoder (fed at runtime).
        answer_string = tf.placeholder(
            shape=(N, 1, JA + 1), dtype=tf.int32,
            name='answer_string')  # [N, M, JA + 1]
        answer_string_mask = tf.placeholder(
            shape=(N, 1, JA + 1), dtype=tf.bool,
            name='answer_string_mask')  # [N, M, JA + 1]
        answer_string_length = tf.placeholder(
            shape=(N, 1), dtype=tf.int32,
            name='answer_string_length',)  # [N, M]
        self.tensor_dict['answer_string'] = answer_string
        self.tensor_dict['answer_string_mask'] = answer_string_mask
        self.tensor_dict['answer_string_length'] = answer_string_length
        self.answer_string = answer_string
        self.answer_string_mask = answer_string_mask
        self.answer_string_length = answer_string_length

        answer_string_flattened = tf.reshape(answer_string, [N * M, JA + 1])
        self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
        print("answer_string_flattened:", answer_string_flattened)
        answer_string_length_flattened = tf.reshape(answer_string_length, [N * M])
        self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
        print("answer_string_length_flattened:", answer_string_length_flattened)

        # Decoder cell width 2d matches the bidirectional encoder output.
        decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(
            2 * d, state_is_tuple=True)
        with tf.variable_scope("Decoder"):
            decoder_train_logits = ptr_decoder(
                decoder_cell,
                tf.reshape(tp0, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                tf.reshape(encoder_output, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                encoder_final_state=encoder_state_final,
                max_encoder_length=config.sent_size_th,
                decoder_output_length=answer_string_length_flattened,  # [N * M]
                batch_size=N,  # N * M (M=1)
                attention_proj_dim=self.config.decoder_proj_dim,
                scope='ptr_decoder')  # [batch_size, dec_len*, enc_seq_len + 1]

            self.decoder_train_logits = decoder_train_logits
            print("decoder_train_logits:", decoder_train_logits)
            self.decoder_train_softmax = tf.nn.softmax(self.decoder_train_logits)
            # Greedy decode: pick the highest-scoring encoder position per step.
            self.decoder_inference = tf.argmax(
                decoder_train_logits, axis=2, name='decoder_inference')  # [N, JA + 1]

        # Span softmaxes are unused in this variant; keep placeholders of -1
        # so downstream code expecting yp/yp2 still finds tensors.
        self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
        self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
def _build_forward(self):
    """Build a sentence-pair classification graph (BiDAF-derived).

    Word embeddings + highway network feed a shared BiLSTM encoder; the
    attention layer returns two attended representations (p0 for the
    context, p1 for the question), each summarized by max-over-time then
    sum, and a 2-way linear classifier is applied over
    [g0, g1, |g0-g1|, g0*g1] (the usual symmetric matching features).

    Uses the TF<=1.0 ``tf.concat(dim, values)`` argument order throughout.
    Side effects: sets ``self.logits`` and ``self.yp`` (2-class softmax).
    """
    config = self.config
    # Static config sizes; N=batch, M=#sentences, JX/JQ=sequence lengths,
    # VW/VC=vocab sizes, d=hidden size, W=max word length.
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # Dynamic dimensions of the fed batch override the static ones.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Old concat signature: tf.concat(dim, values).
                    word_emb_mat = tf.concat(0, [word_emb_mat, self.new_emb_mat])
                    print(word_emb_mat.get_shape().as_list())
            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            # No char embeddings in this variant: word vectors only.
            xx = Ax
            qq = Aq

    # Highway network; question reuses the context's weights.
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    cell = BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(cell, self.is_train,
                                      input_keep_prob=config.input_keep_prob)
    # True (unpadded) lengths from the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(2, [fw_u, bw_u])
        if config.share_lstm_weights:
            # Context encoder shares the question encoder's weights ('u1').
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                                [N * M, JQ])
            first_cell = AttentionCell(
                cell, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            # This attention_layer variant returns BOTH directions of
            # attended features: p0 (context side) and p1 (question side).
            p0, p1 = attention_layer(config, self.is_train, h, u,
                                     h_mask=self.x_mask, u_mask=self.q_mask,
                                     scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # with tf.variable_scope("activate"):
        #     p0 = tf.nn.relu(_linear(tf.reshape(p0,[-1,1200]),300,bias=0.01,bias_start=0.0,scope='relu'))
        #     if config.share_lstm_weights:
        #         tf.get_variable_scope().reuse_variables()
        #         p1 = tf.nn.relu(_linear(tf.reshape(p1,[-1,1200]),300,bias=0.01,bias_start=0.0,scope='relu'))

        with tf.variable_scope('two_lstm'):
            # Feature width 300 is hard-coded here; presumably matches the
            # attention layer's output width — TODO confirm.
            p0 = tf.reshape(p0, [N, 1, -1, 300])
            p1 = tf.reshape(p1, [N, 1, -1, 300])
            # Encode both attended sequences with the same 'g0' LSTM.
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(3, [fw_g0, bw_g0])
            q_len_new = tf.tile(tf.expand_dims(q_len, 1), [1, M])
            if config.share_lstm_weights:
                # NOTE(review): the second pass reuses scope 'g0'; without
                # share_lstm_weights the reuse flag is never set, which would
                # fail on variable re-creation — confirm intended config use.
                tf.get_variable_scope().reuse_variables()
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p1, q_len_new, dtype='float', scope='g0')  # [N, M, JX, 2d]
            g1 = tf.concat(3, [fw_g1, bw_g1])

        # with tf.variable_scope('two_lstm_1'):
        #     (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(first_cell, first_cell, g0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        #     g2 = tf.concat(3, [fw_g2, bw_g2])
        #     q_len_new = tf.tile(tf.expand_dims(q_len,1),[1,M])
        #     if config.share_lstm_weights:
        #         tf.get_variable_scope().reuse_variables()
        #     (fw_g3, bw_g3), _ = bidirectional_dynamic_rnn(first_cell, first_cell, g1, q_len_new, dtype='float', scope='g0')  # [N, M, JX, 2d]
        #     g3 = tf.concat(3, [fw_g3, bw_g3])

        # Pool each sequence to a fixed vector: max over time, sum over sents.
        g0 = tf.reduce_sum(tf.reduce_max(g0, 2), 1)
        g1 = tf.reduce_sum(tf.reduce_max(g1, 2), 1)
        # 2-way classifier over symmetric matching features.
        logits = _linear([g0, g1, tf.abs(tf.subtract(g0, g1)), g0 * g1], 2,
                         bias=0.01, bias_start=0.0, scope='logits1')
        flat_logits2 = tf.reshape(logits, [N, 2])
        yp = tf.nn.softmax(flat_logits2)  # [N, 2] class probabilities
        self.tensor_dict['g0'] = g0
        self.tensor_dict['g1'] = g1
        self.logits = flat_logits2
        self.yp = yp
def _build_forward(self):
    """Build a BiDAF variant with separate fw/bw cells, an optional
    no-answer ("na") bias, and a question-generation (q_gen) head.

    QA part: char-CNN + word embeddings, highway network, contextual
    BiLSTM, attention flow, two modeling layers, start/end span logits
    (optionally prepended with a learned no-answer logit).
    q_gen part: picks the most answer-worthy sentence from the gold labels
    (self.y), encodes it together with the predicted span distributions,
    and decodes a question with an attention seq2seq decoder (teacher
    forcing in train mode, beam search otherwise).

    Side effects: sets self.logits/logits2/yp/yp2/wyp, optionally
    self.concat_logits/concat_logits2/na_prob, and self.gen_q/
    gen_q_prob/gen_q_states.
    """
    config = self.config
    # Static config sizes; N=batch, M=#sentences, JX/JQ=sequence lengths,
    # VW/VC=vocab sizes, d=hidden size, W=max word length.
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    beam_width = config.beam_width
    # Special decoder token ids for question generation.
    GO_TOKEN = 0
    EOS_TOKEN = 1
    # Dynamic dimensions of the fed batch override the static ones.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            # Character-level embeddings via a char-CNN.
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Per-filter channel counts must sum to char-CNN output size.
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Question reuses the context char-CNN weights ("xx").
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # trainable=True: word vectors are fine-tuned here.
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat),
                        trainable=True)
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append runtime-fed GloVe rows for out-of-vocab words.
                    word_emb_mat = tf.concat(
                        axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # Highway network; question reuses the context's weights.
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Separate fw/bw cell pairs per RNN layer (prepro, g0, g1, g2), each
    # wrapped in train-only input dropout.
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(
        cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(
        cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(
        cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(
        cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(
        cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(
        cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(
        cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(
        cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # True (unpadded) lengths from the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # Context encoder shares the question encoder's weights ('u1').
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Question attention performed inside the RNN cells.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                                [N * M, JQ])
            first_cell_fw = AttentionCell(
                cell2_fw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            first_cell_bw = AttentionCell(
                cell2_bw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_fw = AttentionCell(
                cell3_fw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_bw = AttentionCell(
                cell3_bw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            # Standard BiDAF attention-flow layer.
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw

        # Two stacked modeling layers.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

        # Start-position logits.
        logits = get_logits([g1, p0], d, True, wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train,
                            func=config.answer_func, scope='logits1')
        # Soft-selected g1 summary at the predicted start, tiled over all
        # positions as input to the end-pointer pass.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

        # End-position pass conditioned on the start prediction.
        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell4_fw, d_cell4_bw,
            tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
            x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train,
                             func=config.answer_func, scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)

        if config.na:
            # No-answer handling: prepend a learned scalar bias as an extra
            # "no answer" class before the softmax; its probability is
            # sliced off and the remaining mass re-used as the span dist.
            na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
            na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]), [N, 1])  # [N, 1]
            concat_flat_logits = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits])
            concat_flat_yp = tf.nn.softmax(concat_flat_logits)
            na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]), [1])
            flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])
            concat_flat_logits2 = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits2])
            concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
            na_prob2 = tf.squeeze(
                tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
            flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])
            self.concat_logits = concat_flat_logits
            self.concat_logits2 = concat_flat_logits2
            # Joint no-answer probability from the start and end heads.
            self.na_prob = na_prob * na_prob2

        yp = tf.reshape(flat_yp, [-1, M, JX])
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        # Per-position sigmoid over end logits (word-level probabilities).
        wyp = tf.nn.sigmoid(logits2)
        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2
        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
        self.wyp = wyp

    with tf.variable_scope("q_gen"):
        # Question Generation Using (Paragraph & Predicted Ans Pos)
        NM = config.max_num_sents * config.batch_size

        # Separated encoder
        #ss = tf.reshape(xx, (-1, JX, dw+dco))

        # Pick, per example, the sentence containing the most gold answer
        # tokens (from self.y), to use as the generation source.
        q_worthy = tf.reduce_sum(
            tf.to_int32(self.y),
            axis=2)  # so we get probability distribution of answer-likely. (N, M)
        q_worthy = tf.expand_dims(tf.to_int32(tf.argmax(q_worthy, axis=1)),
                                  axis=1)  # (N) -> (N, 1)
        # Pair each batch index with its chosen sentence index for gather_nd.
        q_worthy = tf.concat([
            tf.expand_dims(tf.range(0, N, dtype=tf.int32), axis=1), q_worthy
        ], axis=1)
        # example : [0, 9], [1, 11], [2, 8], [3, 5], [4, 0], [5, 1] ...
        ss = tf.gather_nd(xx, q_worthy)
        # Append the predicted start/end distributions as extra features.
        syp = tf.expand_dims(tf.gather_nd(yp, q_worthy), axis=-1)
        syp2 = tf.expand_dims(tf.gather_nd(yp2, q_worthy), axis=-1)
        ss_with_ans = tf.concat([ss, syp, syp2], axis=2)

        qg_dim = 600
        cell_fw, cell_bw = rnn.DropoutWrapper(rnn.GRUCell(qg_dim), input_keep_prob=config.input_keep_prob), \
                           rnn.DropoutWrapper(rnn.GRUCell(qg_dim), input_keep_prob=config.input_keep_prob)
        s_outputs, s_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, ss_with_ans, dtype=tf.float32)
        s_outputs = tf.concat(s_outputs, axis=2)
        s_states = tf.concat(s_states, axis=1)

        # GO-prefixed question tokens for teacher forcing.
        start_tokens = tf.zeros([N], dtype=tf.int32)
        self.inp_q_with_GO = tf.concat(
            [tf.expand_dims(start_tokens, axis=1), self.q], axis=1)
        # supervise if mode is train
        if config.mode == "train":
            emb_q = tf.nn.embedding_lookup(params=word_emb_mat,
                                           ids=self.inp_q_with_GO)
            #emb_q = tf.reshape(tf.tile(tf.expand_dims(emb_q, axis=1), [1, M, 1, 1]), (NM, JQ+1, dw))
            train_helper = seq2seq.TrainingHelper(emb_q, [JQ] * N)
        else:
            # Beam search requires encoder tensors tiled beam_width times.
            s_outputs = seq2seq.tile_batch(s_outputs, multiplier=beam_width)
            s_states = seq2seq.tile_batch(s_states, multiplier=beam_width)

        cell = rnn.DropoutWrapper(rnn.GRUCell(num_units=qg_dim * 2),
                                  input_keep_prob=config.input_keep_prob)
        attention_mechanism = seq2seq.BahdanauAttention(num_units=qg_dim * 2,
                                                        memory=s_outputs)
        attn_cell = seq2seq.AttentionWrapper(cell, attention_mechanism,
                                             attention_layer_size=qg_dim * 2,
                                             output_attention=True,
                                             alignment_history=False)
        # Hard-coded GloVe vocab size added to the projection width;
        # presumably matches the extra rows in new_emb_mat — TODO confirm.
        total_glove_vocab_size = 78878  #72686
        out_cell = rnn.OutputProjectionWrapper(attn_cell,
                                               VW + total_glove_vocab_size)
        if config.mode == "train":
            decoder_initial_states = out_cell.zero_state(
                batch_size=N, dtype=tf.float32).clone(cell_state=s_states)
            decoder = seq2seq.BasicDecoder(
                cell=out_cell, helper=train_helper,
                initial_state=decoder_initial_states)
        else:
            decoder_initial_states = out_cell.zero_state(
                batch_size=N * beam_width,
                dtype=tf.float32).clone(cell_state=s_states)
            decoder = seq2seq.BeamSearchDecoder(
                cell=out_cell, embedding=word_emb_mat,
                start_tokens=start_tokens, end_token=EOS_TOKEN,
                initial_state=decoder_initial_states,
                beam_width=beam_width, length_penalty_weight=0.0)
        outputs = seq2seq.dynamic_decode(decoder=decoder,
                                         maximum_iterations=JQ)
        if config.mode == "train":
            gen_q = outputs[0].sample_id
            gen_q_prob = outputs[0].rnn_output
            gen_q_states = outputs[1]
        else:
            # Take the top beam; embed the ids so gen_q_prob has a tensor
            # form comparable to the training branch.
            gen_q = outputs[0].predicted_ids[:, :, 0]
            gen_q_prob = tf.nn.embedding_lookup(
                params=word_emb_mat, ids=outputs[0].predicted_ids[:, :, 0])
            gen_q_states = outputs[1]

        self.gen_q = gen_q
        self.gen_q_prob = gen_q_prob
        self.gen_q_states = gen_q_states
def _build_forward(self):
    """Build the BiDAF-style forward graph.

    Pipeline: char-CNN + word embeddings -> highway network -> contextual
    BiLSTM encoders -> attention flow ("main") -> two modeling BiLSTMs ->
    start/end span logits, with an optional no-answer (config.na) bias.

    Reads placeholders/attributes set elsewhere on self (x, q, cx, cq,
    x_mask, q_mask, new_emb_mat, is_train, tensor_dict) and publishes
    results on self (logits, logits2, yp, yp2, wyp, tensor_dict entries).
    """
    config = self.config
    # Static sizes from config; JX/JQ/M are immediately replaced below by
    # dynamic (per-batch) sizes read from the input placeholders.
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    JX = tf.shape(self.x)[2]  # dynamic max sentence length
    JQ = tf.shape(self.q)[1]  # dynamic max question length
    M = tf.shape(self.x)[1]   # dynamic number of sentences
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        # Character-level embedding fed through a 1-D CNN (char-CNN).
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                # Conv output channels must add up to the configured
                # char-CNN output size.
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the context conv filters for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights,
                                          "VALID", self.is_train,
                                          config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights,
                                          "VALID", self.is_train,
                                          config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word-level embedding.
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # NOTE(review): unlike the sibling _build_forward
                    # variants in this file (which use
                    # get_initializer(config.emb_mat)), this one starts
                    # from random normal weights — confirm intended.
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=tf.random_normal_initializer)
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # Append pre-trained rows used for out-of-vocab words.
                    word_emb_mat = tf.concat(
                        axis=0, values=[word_emb_mat, self.new_emb_mat])
            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Concatenate char-CNN and word embeddings per token.
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            # Share highway weights between context and question.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Independent fw/bw LSTM cells (with train-time dropout) for each of
    # the four BiLSTM stages built below. All share the variable name
    # "basic_lstm_cell" within their respective RNN scopes.
    cell_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    cell_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    d_cell_fw = SwitchableDropoutWrapper(
        cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(
        cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    cell2_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    d_cell2_fw = SwitchableDropoutWrapper(
        cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(
        cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    cell3_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    d_cell3_fw = SwitchableDropoutWrapper(
        cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(
        cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    cell4_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    d_cell4_fw = SwitchableDropoutWrapper(
        cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(
        cell4_bw, self.is_train,
        input_keep_prob=config.input_keep_prob)

    # True sequence lengths recovered from the boolean masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    # Contextual encoding: u encodes the question, h the context.
    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw, d_cell_bw, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # Encode the context with the same 'u1' LSTM weights.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    # Attention flow + modeling + answer-pointer layers.
    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Question-attention computed inside the RNN cells rather
            # than as a standalone attention layer.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell_fw = AttentionCell(
                cell2_fw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            first_cell_bw = AttentionCell(
                cell2_bw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            second_cell_fw = AttentionCell(
                cell3_fw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            second_cell_bw = AttentionCell(
                cell3_bw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw

        # Two stacked modeling BiLSTMs over the attended representation.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell_fw, first_cell_bw, p0,
            x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            second_cell_fw, second_cell_bw, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

        # Start-position logits over all (sentence, token) positions.
        logits = get_logits([g1, p0], d, True, wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train,
                            func=config.answer_func, scope='logits1')
        # Soft-selected start representation, broadcast back over all
        # positions as extra input for the end-position pass.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                      [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell4_fw, d_cell4_bw,
            tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
            x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        # End-position logits.
        logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train,
                             func=config.answer_func, scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)

        if config.na:
            # No-answer option: prepend a learned bias logit so position 0
            # of the softmax becomes the "no answer" probability.
            na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
            na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]),
                                    [N, 1])  # [N, 1]
            concat_flat_logits = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits])
            concat_flat_yp = tf.nn.softmax(concat_flat_logits)
            na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]),
                                 [1])
            flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

            concat_flat_logits2 = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits2])
            concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
            na_prob2 = tf.squeeze(
                tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
            flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

            self.concat_logits = concat_flat_logits
            self.concat_logits2 = concat_flat_logits2
            # Joint no-answer probability = P(no start) * P(no end).
            self.na_prob = na_prob * na_prob2

        yp = tf.reshape(flat_yp, [-1, M, JX], name="yp")
        yp2 = tf.reshape(flat_yp2, [-1, M, JX], name="yp2")
        wyp = tf.nn.sigmoid(logits2, name="wyp")

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
        self.wyp = wyp
def apply(self, context, context_st, query, query_st, q_sub_st, q_len,
          c_mask, cdoc_mask, q_mask, candidates, cand_mask,
          greedy_read=False, reuse=False):
    """Run the multi-hop MAC-style reader over the documents and score
    the answer candidates.

    Iterates self.num_hops steps of HierarchicalAttnMACCell, accumulating
    control/memory/answer states and attention logits on self, then maps
    the final answer state through MACOutputUnit.

    Returns a [batch, 1, ...] candidate-score tensor (shape of the last
    axis depends on MACOutputUnit, which is defined elsewhere).
    Raises NotImplementedError unless self.prediction == 'candidates'.

    NOTE(review): this minified source lost indentation; the placement of
    the `oracle == 'extra'` extra hop (after the loop, inside the MACRnn
    scope) and the prediction block (after the scope) was reconstructed —
    confirm against the original repository.
    """
    batch_size = self.batch_size
    hidden_dim = self.hidden_dim
    query_dim = self.query_dim
    # Per-document token counts from the context mask.
    self.docs_len = tf.reduce_sum(tf.cast(c_mask, 'int32'), 2)
    candidates = tf.squeeze(candidates, axis=1)

    # Initial control / memory / answer states (memory is overwritten by
    # the q_sub projection just below).
    c_state = tf.zeros((batch_size, query_dim))
    m_state = tf.zeros((batch_size, hidden_dim))
    a_state = tf.zeros((batch_size, hidden_dim))

    with tf.variable_scope('MACRnn'):
        # Initialize the memory state from the sub-question summary.
        with tf.variable_scope('q_sub_proj'):
            W = tf.get_variable('W', [query_dim, hidden_dim])
            b = tf.get_variable('b', [hidden_dim])
            m_state = tf.matmul(q_sub_st, W) + b

        self.c_history = []
        self.m_history = []
        self.a_history = []
        self.doc_attn_logits_lst = []
        self.word_attn_logits_lst = []
        self.doc_attn_weights_lst = []
        cell = tf.contrib.rnn.GRUCell(hidden_dim)
        self.cell = cell

        for i in range(self.num_hops):
            # NOTE(review): '%d' % 0 pins every hop to the layer-0 scope
            # (weights shared across hops via reuse below) — presumably
            # intentional, but verify.
            scope_str = 'MACRnn-layer-%d' % 0
            if self.read_strategy == 'one_doc_per_it_and_repeat_2nd_step' and i > 1:
                # Restart from the first hop's states on later iterations.
                m_state = self.m_history[0]
                a_state = self.a_history[0]
            c_state, m_state, a_state, doc_attn_logits, doc_attn_weights, word_attn_logits = self.HierarchicalAttnMACCell(i, cell, query, query_st, q_sub_st, context, context_st, c_mask, cdoc_mask, \
                q_mask, self.c_history, self.m_history, c_state, m_state, a_state, scope_str, reuse=(reuse or i!=0), greedy_read=greedy_read)
            self.doc_attn_logits_lst.append(doc_attn_logits)
            self.word_attn_logits_lst.append(word_attn_logits)
            self.doc_attn_weights_lst.append(doc_attn_weights)
            self.c_history.append(c_state)
            self.m_history.append(m_state)

            if (self.reasoning_unit == 'concat_first_sent' or self.reasoning_unit == 'concat_full_doc') and i == self.num_hops - 1:
                # Final hop: re-read the concatenation of the selected
                # documents with a small BiLSTM + attention pooling and
                # use the pooled vector as the answer state.
                with tf.variable_scope("concat_read_lstm", reuse=False):
                    max_len = tf.reduce_max(self.concat_selected_doc_len)
                    self.concat_selected_doc_mask = []
                    for k in range(self.batch_size):
                        # 1s over real tokens, 0-padding to max_len.
                        concat_selected_doc_mask_k = tf.concat(
                            values=[tf.ones([self.concat_selected_doc_len[k]]),
                                    tf.zeros([max_len - self.concat_selected_doc_len[k]])],
                            axis=0)
                        self.concat_selected_doc_mask.append(concat_selected_doc_mask_k)
                    self.concat_selected_doc = tf.stack(self.concat_selected_doc, axis=0)
                    self.concat_selected_doc_mask = tf.cast(
                        tf.stack(self.concat_selected_doc_mask, axis=0), 'bool')

                    p0 = biattention_layer(self.is_train, self.concat_selected_doc, query,
                                           h_mask=self.concat_selected_doc_mask, u_mask=q_mask)
                    p0 = tf.squeeze(p0, axis=1)
                    cell_fw = BasicLSTMCell(40, state_is_tuple=True)
                    cell_bw = BasicLSTMCell(40, state_is_tuple=True)
                    cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train,
                                                       input_keep_prob=self.input_keep_prob)
                    cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train,
                                                       input_keep_prob=self.input_keep_prob)
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                        cell_fw, cell_bw, p0, self.concat_selected_doc_len, dtype='float')
                    x = tf.concat(axis=2, values=[fw_h, bw_h])
                    logits = linear_logits([x], True,
                                           input_keep_prob=self.input_keep_prob,
                                           mask=self.concat_selected_doc_mask,
                                           is_train=self.is_train, scope='logits1')
                    probs = tf.nn.softmax(logits)
                    # Attention-weighted pooling over the document tokens.
                    doc_rep = tf.einsum('ijk,ij->ik', self.concat_selected_doc, probs)
                    a_state = doc_rep
            self.a_history.append(a_state)

        if self.oracle == 'extra':
            # One extra (reused) hop beyond num_hops; only its answer
            # state is kept. `i` is the leaked last loop index here.
            scope_str = 'MACRnn-layer-%d' % 0
            if self.read_strategy == 'one_doc_per_it_and_repeat_2nd_step' and i > 1:
                m_state = self.m_history[0]
                a_state = self.a_history[0]
            _, _, a_state, _, _, _ = self.HierarchicalAttnMACCell(self.num_hops, cell, query, query_st, q_sub_st, context, context_st, c_mask, cdoc_mask, \
                q_mask, self.c_history, self.m_history, c_state, m_state, a_state, scope_str, reuse=True, greedy_read=greedy_read)

    if self.prediction == 'candidates':
        if self.output_unit_type == 'triplet-mlp':
            g1 = self.MACOutputUnit(a_state, context, candidates, query=query)
            if (self.reasoning_unit != 'concat_first_sent' and self.reasoning_unit != 'concat_full_doc') and (self.reasoning_unit != 'attention-lstm' or self.read_strategy != 'one_doc_per_it'):
                # Also score every intermediate hop's answer state.
                for i in range(self.num_hops):
                    gi = self.MACOutputUnit(self.a_history[i], context, candidates, query=query)
                    self.answer_list.append(tf.expand_dims(gi, axis=1))
        else:
            g1 = self.MACOutputUnit(a_state, context, candidates)
            if (self.reasoning_unit != 'concat_first_sent' and self.reasoning_unit != 'concat_full_doc') and (self.reasoning_unit != 'attention-lstm' or self.read_strategy != 'one_doc_per_it'):
                for i in range(self.num_hops):
                    gi = self.MACOutputUnit(self.a_history[i], context, candidates, reuse=True)
                    self.answer_list.append(tf.expand_dims(gi, axis=1))
        return tf.expand_dims(g1, axis=1)
    else:
        raise NotImplementedError
def _build_forward(self):
    """Build the forward graph for the pointer-network answer decoder.

    Variant of BiDAF: embeddings -> highway -> contextual encoder
    (optionally fused LSTM kernels) -> attention flow -> modeling layers
    -> optional (static or dynamic) self-match layer -> pointer-net
    decoder that emits answer token positions.

    Publishes on self: answer_string placeholders, decoder_train_logits /
    _softmax, decoder_inference, and dummy yp/yp2 (span outputs unused in
    this variant, filled with -1).
    """
    config = self.config
    # Static config sizes; JX/JQ/M are replaced by dynamic shapes below.
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    JA = config.max_answer_length
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        # Char-CNN Embedding
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(map(int, config.out_channel_dims.split(',')))  # [100]
                heights = list(map(int, config.filter_heights.split(',')))  # [5]
                # Make sure filter channels = char_cnn_out size
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Same conv filters for question as for context.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word Embedding
        if config.use_word_emb:
            with tf.variable_scope("emb_var") as scope, tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float',
                                                   shape=[VW, dw],
                                                   initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                tf.get_variable_scope().reuse_variables()
                # Saved so other graph parts can reopen this scope.
                self.word_emb_scope = scope
                if config.use_glove_for_unk:
                    # Append pre-trained rows for out-of-vocab words.
                    word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)
            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            # Concat Char-CNN Embedding and Word Embedding
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

        # exact match: append a 0/1 feature marking tokens that appear in
        # the other sequence (per self.emx / self.emq, built elsewhere).
        if config.use_exact_match:
            emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
            xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
            emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
            qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]

    # 2 layer highway network on Concat Embedding
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Bidirection-LSTM (3rd layer on paper)
    cell = GRUCell(d) if config.GRU else BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(cell, self.is_train,
                                      input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
    flat_x_len = flatten(x_len, 0)  # [N * M]

    with tf.variable_scope("prepro"):
        if config.use_fused_lstm:  # yes
            # Fused-kernel path: run time-major LSTMBlockFusedCell in each
            # direction manually (reverse_sequence emulates the backward
            # pass), first over the question then over the context.
            with tf.variable_scope("u1"):
                fw_inputs = tf.transpose(qq, [1, 0, 2])  # [time_len, batch_size, input_size]
                bw_inputs = tf.reverse_sequence(fw_inputs, q_len, batch_dim=1, seq_dim=0)
                fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32,
                                                    sequence_length=q_len, scope="fw")
                bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32,
                                                    sequence_length=q_len, scope="bw")
                bw_outputs = tf.reverse_sequence(bw_outputs, q_len, batch_dim=1, seq_dim = 0)
                current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                output = tf.transpose(current_inputs, [1, 0, 2])
                u = output
            flat_xx = flatten(xx, 2)  # [N * M, JX, d]
            if config.share_lstm_weights: # Yes
                # Reuse the question LSTM weights for the context.
                tf.get_variable_scope().reuse_variables()
                with tf.variable_scope("u1"):
                    fw_inputs = tf.transpose(flat_xx, [1, 0, 2])  # [time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                    # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                    # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                    fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32,
                                                        sequence_length=flat_x_len, scope="fw")
                    bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32,
                                                        sequence_length=flat_x_len, scope="bw")
                    bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len,
                                                     batch_dim=1, seq_dim=0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    output = tf.transpose(current_inputs, [1, 0, 2])
            else: # No
                with tf.variable_scope("h1"):
                    fw_inputs = tf.transpose(flat_xx, [1, 0, 2])  # [time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                    # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                    # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                    prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32,
                                                        sequence_length=flat_x_len, scope="fw")
                    bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32,
                                                        sequence_length=flat_x_len,
                                                        scope="bw")
                    bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len,
                                                     batch_dim=1, seq_dim=0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    output = tf.transpose(current_inputs, [1, 0, 2])
            h = tf.expand_dims(output, 1)  # [N, M, JX, 2d]
        else:
            (fw_u, bw_u), _ = bidirectional_dynamic_rnn(d_cell, d_cell, qq, q_len,
                                                        dtype='float', scope='u1')  # [N, J, d], [N, d]
            u = tf.concat([fw_u, bw_u], 2)
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len,
                                                            dtype='float', scope='u1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len,
                                                            dtype='float', scope='h1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u  # hidden state of Q = u
        self.tensor_dict['h'] = h  # hidden state of C = h

    # Attention Flow Layer (4th layer on paper)
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                                [N * M, JQ])
            first_cell = AttentionCell(cell, u, size=d, mask=q_mask, mapper='sim',
                                       input_keep_prob=self.config.input_keep_prob,
                                       is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell  # a GRU cell with dropout wrapper
        tp0 = p0  # Output of Attention layer

    # Modeling layer (5th layer on paper)
    with tf.variable_scope('modeling_layer'):
        if config.use_fused_lstm:
            g1, encoder_state_final = build_fused_bidirectional_rnn(
                inputs=p0,
                num_units=config.hidden_size,
                num_layers=config.num_modeling_layers,
                inputs_length=flat_x_len,
                input_keep_prob=config.input_keep_prob,
                scope='modeling_layer_g')
        else:
            # num_modeling_layers-1 stacked BiLSTMs, then a final one whose
            # final states seed the decoder.
            for layer_idx in range(config.num_modeling_layers-1):
                (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                    first_cell, first_cell,
                    p0, x_len, dtype='float',
                    scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
                p0 = tf.concat([fw_g0, bw_g0], 3)
            (fw_g1, bw_g1), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

    # Self match layer
    if config.use_self_match:
        s0 = tf.reshape(g1, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
        x_mask = tf.reshape(self.x_mask, [N * M, JX])  # [N * M, JX]
        if config.use_static_self_match:
            with tf.variable_scope("StaticSelfMatch"):  # implemented follow r-net section 3.3
                W_x_Vj = tf.contrib.layers.fully_connected(  # [N * M, JX, d]
                    s0, int(d / 2), scope='row_first',
                    activation_fn=None, biases_initializer=None)
                W_x_Vt = tf.contrib.layers.fully_connected(  # [N * M, JX, d]
                    s0, int(d / 2), scope='col_first',
                    activation_fn=None, biases_initializer=None)
                # Pairwise additive attention scores between all positions.
                sum_rc = tf.add(  # [N * M, JX, JX, d]
                    tf.expand_dims(W_x_Vj, 1),
                    tf.expand_dims(W_x_Vt, 2))
                v = tf.get_variable('second', shape=[1, 1, 1, int(d / 2)],
                                    dtype=tf.float32)
                Sj = tf.reduce_sum(tf.multiply(v, tf.tanh(sum_rc)), -1)  # [N * M, JX, JX]
                Ai = softmax(Sj, mask = tf.expand_dims(x_mask, 1))  # [N * M, JX, JX]
                Ai = tf.expand_dims(Ai, -1)  # [N * M, JX, JX, 1]
                Vi = tf.expand_dims(s0, 1)  # [N * M, 1, JX, 2d]
                Ct = tf.reduce_sum(  # [N * M, JX, 2d]
                    tf.multiply(Ai, Vi),
                    axis = 2)
                inputs_Vt_Ct = tf.concat([s0, Ct], 2)  # [N * M, JX, 4d]
                if config.use_fused_lstm:
                    fw_inputs = tf.transpose(inputs_Vt_Ct, [1, 0, 2])  # [time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len,
                                                    batch_dim=1, seq_dim=0)
                    fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                    bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                    prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    fw_outputs, fw_s_f = prep_fw_cell(fw_inputs, dtype=tf.float32,
                                                      sequence_length=flat_x_len, scope="fw")
                    bw_outputs, bw_s_f = prep_bw_cell(bw_inputs, dtype=tf.float32,
                                                      sequence_length=flat_x_len, scope="bw")
                    # Fused cells return (c, h) tuples; normalize to
                    # LSTMStateTuple for the decoder below.
                    fw_s_f = LSTMStateTuple(c=fw_s_f[0], h=fw_s_f[1])
                    bw_s_f = LSTMStateTuple(c=bw_s_f[0], h=bw_s_f[1])
                    bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len,
                                                     batch_dim=1, seq_dim=0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    s1 = tf.transpose(current_inputs, [1, 0, 2])
                else:
                    (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
                        first_cell, first_cell, inputs_Vt_Ct, flat_x_len,
                        dtype='float', scope='s')  # [N, M, JX, 2d]
                    s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1
        else:
            with tf.variable_scope("DynamicSelfMatch"):
                first_cell = AttentionCell(cell, s0, size=d, mask=x_mask,
                                           is_train=self.is_train)
                (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
                    first_cell, first_cell, s0, x_len, dtype='float',
                    scope='s')  # [N, M, JX, 2d]
                s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1
        g1 = tf.expand_dims(s1, 1)  # [N, M, JX, 2d]

    # prepare for PtrNet
    encoder_output = g1  # [N, M, JX, 2d]
    # Zero out padding positions before attending over them.
    encoder_output = tf.expand_dims(tf.cast(self.x_mask, tf.float32), -1) * encoder_output  # [N, M, JX, 2d]
    if config.use_self_match or not config.use_fused_lstm:
        # encoder_state_final comes from the last (self-match or modeling)
        # RNN; its type depends on the cell used, hence the dispatch.
        if config.GRU:
            encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat')
        else:
            if isinstance(fw_s_f, LSTMStateTuple):
                encoder_state_c = tf.concat(
                    (fw_s_f.c, bw_s_f.c), 1, name='encoder_concat_c')
                encoder_state_h = tf.concat(
                    (fw_s_f.h, bw_s_f.h), 1, name='encoder_concat_h')
                encoder_state_final = LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
            elif isinstance(fw_s_f, tf.Tensor):
                encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat')
            else:
                encoder_state_final = None
                tf.logging.error("encoder_state_final not set")
    print("encoder_state_final:", encoder_state_final)

    with tf.variable_scope("output"):
        # eos_symbol = config.eos_symbol
        # next_symbol = config.next_symbol

        tf.assert_equal(M, 1)  # currently dynamic M is not supported, thus we assume M==1
        # Gold answer token positions (plus stop symbol) fed at train time.
        answer_string = tf.placeholder(
            shape=(N, 1, JA + 1),
            dtype=tf.int32,
            name='answer_string'
        )  # [N, M, JA + 1]
        answer_string_mask = tf.placeholder(
            shape=(N, 1, JA + 1),
            dtype=tf.bool,
            name='answer_string_mask'
        )  # [N, M, JA + 1]
        answer_string_length = tf.placeholder(
            shape=(N, 1),
            dtype=tf.int32,
            name='answer_string_length',
        )  # [N, M]
        self.tensor_dict['answer_string'] = answer_string
        self.tensor_dict['answer_string_mask'] = answer_string_mask
        self.tensor_dict['answer_string_length'] = answer_string_length
        self.answer_string = answer_string
        self.answer_string_mask = answer_string_mask
        self.answer_string_length = answer_string_length

        answer_string_flattened = tf.reshape(answer_string, [N * M, JA + 1])
        self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
        print("answer_string_flattened:", answer_string_flattened)
        answer_string_length_flattened = tf.reshape(answer_string_length, [N * M])
        self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
        print("answer_string_length_flattened:", answer_string_length_flattened)

        decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(2 * d, state_is_tuple=True)
        with tf.variable_scope("Decoder"):
            decoder_train_logits = ptr_decoder(decoder_cell,
                                               tf.reshape(tp0, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                                               tf.reshape(encoder_output, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                                               flat_x_len,
                                               encoder_final_state=encoder_state_final,
                                               max_encoder_length=config.sent_size_th,
                                               decoder_output_length=answer_string_length_flattened,  # [N * M]
                                               batch_size=N,  # N * M (M=1)
                                               attention_proj_dim=self.config.decoder_proj_dim,
                                               scope='ptr_decoder')  # [batch_size, dec_len*, enc_seq_len + 1]
        self.decoder_train_logits = decoder_train_logits
        print("decoder_train_logits:", decoder_train_logits)
        self.decoder_train_softmax = tf.nn.softmax(self.decoder_train_logits)
        self.decoder_inference = tf.argmax(decoder_train_logits, axis=2,
                                           name='decoder_inference')  # [N, JA + 1]

        # Span outputs are unused in this pointer-decoder variant; keep
        # -1 sentinels so downstream span-based code has something to read.
        self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
        self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
def _build_forward(self):
    """Build the standard BiDAF forward graph (annotated study variant).

    Pipeline: char-CNN + word embeddings -> highway -> contextual BiLSTM
    -> attention flow -> two modeling BiLSTMs -> start/end span logits.
    Unlike the sibling variant above, each modeling stage constructs its
    own BasicLSTMCell inline instead of sharing pre-built cells.
    """
    config = self.config
    N = config.batch_size
    M = config.max_num_sents
    JX = config.max_sent_size
    JQ = config.max_ques_size
    VW = config.word_vocab_size
    VC = config.char_vocab_size
    W = config.max_word_size
    d = config.hidden_size

    JX = tf.shape(self.x)[2]  # JX max sentence size, length,
    JQ = tf.shape(self.q)[1]  # JQ max questions size, length, is the
    M = tf.shape(self.x)[1]  # m is the max number of sentences
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
    # dc = 8, each char will be map to 8-number vector, "char-level word embedding size [100]"

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')  # 330,8 a matrix for each char to its 8-number vector
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc] 60,None,None,16,8, batch-size,
                # N is the number of batch_size
                # M the max number of sentences
                # JX is the max sentence length
                # W is the max length of a word
                # dc is the vector for each char
                # map each char to a vector
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                # JQ the max length of question
                # W the max length of words
                # map each char in questions to vectors
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])
                # max questions size, length, max_word_size(16), char_emb_size(8)

                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                # so here, there are 100 filters and the size of each filter is 5
                # different heights and there are different number of these filter, but here just 100 5-long filters
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the context conv filters for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(
                        qq, [-1, JQ, dco
                             ])  # here, xx and qq are the output of cnn,

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Initialized from pre-trained (GloVe) matrix.
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # create a new word embedding or use the glove?
                    word_emb_mat = tf.concat(
                        [word_emb_mat, self.new_emb_mat], 0)
            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
    # here we used cnn and word embedding represented each word with a 200-unit vector
    # so for, xx, (batch_size, sentence#, word#, embedding), qq (batch_size, word#, embedding)

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            # Share highway weights between context and question.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq  # same shape with line 173

    cell = BasicLSTMCell(
        d, state_is_tuple=True)  # d = 100, hidden state number
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # reduce_sum over the masks recovers true sequence lengths.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M], [60,?]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N] [60]
    # masks are true and false, here, he sums up those truths,

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(
            [fw_u, bw_u],
            2)  # (60, ?, 200) | 200 because combined 2 100 hidden states
        if config.share_lstm_weights:
            # Encode the context with the same 'u1' LSTM weights.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u  # [60, ?, 200] for question
        self.tensor_dict['h'] = h  # [60, ?, ?, 200] for article

    with tf.variable_scope("main"):
        if config.dynamic_att:  # todo what is this dynamic attention.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            # NOTE(review): minified source lost indentation — cell2 /
            # first_cell are placed in this else-branch so the
            # dynamic_att AttentionCell above is not clobbered; confirm
            # against the original repository.
            cell2 = BasicLSTMCell(
                d, state_is_tuple=True)  # d = 100, hidden state number
            first_cell = SwitchableDropoutWrapper(
                cell2, self.is_train, input_keep_prob=config.input_keep_prob)

        # First modeling BiLSTM.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, inputs=p0, sequence_length=x_len,
            dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat([fw_g0, bw_g0], 3)

        # Second modeling BiLSTM (fresh cell).
        cell3 = BasicLSTMCell(
            d, state_is_tuple=True)  # d = 100, hidden state number
        first_cell3 = SwitchableDropoutWrapper(
            cell3, self.is_train, input_keep_prob=config.input_keep_prob)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell3, first_cell3, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat([fw_g1, bw_g1], 3)

        # Start-position logits.
        logits = get_logits([g1, p0], d, True, wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train,
                            func=config.answer_func, scope='logits1')
        # Soft-selected start representation, tiled over all positions as
        # extra input for the end-position pass.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                      [1, M, JX, 1])

        cell4 = BasicLSTMCell(
            d, state_is_tuple=True)  # d = 100, hidden state number
        first_cell4 = SwitchableDropoutWrapper(
            cell4, self.is_train, input_keep_prob=config.input_keep_prob)
        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            first_cell4, first_cell4,
            tf.concat([p0, g1, a1i, g1 * a1i], 3),
            x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat([fw_g2, bw_g2], 3)
        # End-position logits.
        logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train,
                             func=config.answer_func, scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [-1, M, JX])
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
def _build_forward(self):
    """Build the BiDAF forward graph: char/word embedding -> highway ->
    contextual BiLSTM -> attention flow -> modeling LSTMs -> span logits.

    Side effects: populates self.tensor_dict, self.logits, self.logits2,
    self.yp, self.yp2.

    FIX: the original used the deprecated pre-TF-1.0 ``tf.concat(dim, values)``
    argument order, which raises TypeError on TF >= 1.0 and was inconsistent
    with the other ``_build_forward`` variants in this file; all concats now
    use the modern ``tf.concat(values, axis)`` form.
    """
    # config holds all pre-configured hyperparameters.
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # Dynamic sizes override the static config maxima (actual batch shapes).
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    # ---- Embedding layer ----
    with tf.variable_scope("emb"):
        # Character embedding (char-CNN), kept on CPU like the word table.
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])
                # CNN filter parameters (comma-separated config strings).
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" conv variables for the question side.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word embedding (GloVe-initialized when training).
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))  # emb_mat is glove
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append the fixed glove rows for OOV words.
                    word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)
            with tf.name_scope("word"):
                # Map context x and question q token ids to vectors.
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Concatenate char-CNN and word embeddings on the feature axis.
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # Highway network over both context and question representations
    # (question side reuses the same variables).
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(d, state_is_tuple=True)
    # SwitchableDropoutWrapper: dropout active only when is_train is true.
    d_cell = SwitchableDropoutWrapper(cell, self.is_train,
                                      input_keep_prob=config.input_keep_prob)
    # Masks are booleans; summing their int casts gives true lengths.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    # ---- Contextual embedding layer: BiLSTM over question and context ----
    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat([fw_u, bw_u], 2)  # [N, JQ, 2d]
        if config.share_lstm_weights:
            # Reuse the question LSTM ('u1') for the context.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    # ---- Attention flow + modeling + output layers ----
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Tile the question encoding per sentence so the AttentionCell
            # can attend to it from every context position.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                                [N * M, JQ])
            first_cell = AttentionCell(cell, u, mask=q_mask, mapper='sim',
                                       input_keep_prob=self.config.input_keep_prob,
                                       is_train=self.is_train)
        else:
            # p0 corresponds to G in the BiDAF paper.
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # Two modeling BiLSTM layers.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g0')
        g0 = tf.concat([fw_g0, bw_g0], 3)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float', scope='g1')
        g1 = tf.concat([fw_g1, bw_g1], 3)

        # Start-position logits.
        logits = get_logits([g1, p0], d, True, wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train,
                            func=config.answer_func, scope='logits1')

        # Attended summary of g1 under the start distribution, broadcast back
        # over every position as extra input for the end-pointer LSTM.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell, d_cell, tf.concat([p0, g1, a1i, g1 * a1i], 3),
            x_len, dtype='float', scope='g2')
        g2 = tf.concat([fw_g2, bw_g2], 3)

        # End-position logits.
        logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train,
                             func=config.answer_func, scope='logits2')

        # Flatten (sentence, word) into one axis and softmax over all positions.
        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [-1, M, JX])
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2
        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
def _build_forward(self):
    """Build the BiDAF forward graph with separate forward/backward cells
    and two output heads: span prediction ("basic") and sentence
    classification ("basic-class"), selected by config.model_name.

    Side effects: populates self.tensor_dict and, depending on the head,
    self.logits / self.logits2 / self.yp / self.yp2 or
    self.logits0 / self.yp0 (aliased to self.logits / self.yp).
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # Dynamic sizes override the static config maxima (actual batch shapes).
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
    with tf.variable_scope("emb"):
        # Character embedding via char-CNN, table kept on CPU.
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])
                # Conv filter widths/heights come as comma-separated strings.
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Question side reuses the context conv ("xx") weights.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])
        # Word embedding (GloVe-initialized when training).
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append fixed glove rows for OOV tokens.
                    word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)
            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Concatenate char and word features along the last axis.
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
    # highway network (question side reuses the same variables)
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq
    # Distinct fw/bw cells per layer; d_cell* wrap them with
    # train-time-only input dropout (SwitchableDropoutWrapper).
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # Boolean masks -> integer sequence lengths.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
    # Contextual embedding layer: BiLSTM over question then context.
    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # Context reuses the question BiLSTM ('u1') variables.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h
    # Attention flow + modeling + output.
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Tile question encoding per sentence for per-step attention.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            # p0 corresponds to G in the BiDAF paper.
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw
        # Two modeling BiLSTM layers.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])
        with tf.variable_scope("output"):
            if config.model_name == "basic":
                # Span head: start logits, attended summary, end logits.
                logits = get_logits([g1, p0], d, True, wd=config.wd,
                                    input_keep_prob=config.input_keep_prob,
                                    mask=self.x_mask, is_train=self.is_train,
                                    func=config.answer_func, scope='logits1')
                a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                              tf.reshape(logits, [N, M * JX]))
                # Broadcast the start-weighted summary to every position.
                a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])
                (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                    d_cell4_fw, d_cell4_bw, tf.concat([p0, g1, a1i, g1 * a1i], 3),
                    x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
                g2 = tf.concat([fw_g2, bw_g2], 3)
                logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                                     input_keep_prob=config.input_keep_prob,
                                     mask=self.x_mask, is_train=self.is_train,
                                     func=config.answer_func, scope='logits2')
                # Flatten (sentence, word) and softmax over all positions.
                flat_logits = tf.reshape(logits, [-1, M * JX])
                flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
                yp = tf.reshape(flat_yp, [-1, M, JX])
                flat_logits2 = tf.reshape(logits2, [-1, M * JX])
                flat_yp2 = tf.nn.softmax(flat_logits2)
                yp2 = tf.reshape(flat_yp2, [-1, M, JX])
                self.tensor_dict['g1'] = g1
                self.tensor_dict['g2'] = g2
                self.logits = flat_logits
                self.logits2 = flat_logits2
                self.yp = yp
                self.yp2 = yp2
            elif config.model_name == "basic-class":
                # Classification head: 3-way for SNLI data, else binary.
                C = 3 if config.data_dir.startswith('data/snli') else 2
                (fw_g2, bw_g2) = (fw_g1, bw_g1)
                if config.classifier == 'maxpool':
                    g2 = tf.concat([fw_g2, bw_g2], 3)  # [N, M, JX, 2d]
                    g2 = tf.reduce_max(g2, 2)  # [N, M, 2d]
                    g2_dim = 2 * d
                elif config.classifier == 'sumpool':
                    g2 = tf.concat([fw_g2, bw_g2], 3)
                    g2 = tf.reduce_sum(g2, 2)
                    g2_dim = 2 * d
                else:
                    # Last fw state + first bw state per sentence.
                    fw_g2_ = tf.gather(tf.transpose(fw_g2, [2, 0, 1, 3]), JX - 1)
                    bw_g2_ = tf.gather(tf.transpose(bw_g2, [2, 0, 1, 3]), 0)
                    g2 = tf.concat([fw_g2_, bw_g2_], 2)
                    g2_dim = 2 * d
                # NOTE(review): this reshape collapses the M (sentence) axis
                # into the batch axis — only shape-consistent when M == 1;
                # confirm upstream guarantees single-sentence inputs.
                g2_ = tf.reshape(g2, [N, g2_dim])
                logits0 = linear(g2_, C, True, wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 is_train=self.is_train, scope='classifier')
                flat_yp0 = tf.nn.softmax(logits0)
                # NOTE(review): logits0 is [N, C]; reshaping to [N, M, C]
                # likewise assumes M == 1 — verify.
                yp0 = tf.reshape(flat_yp0, [N, M, C])
                self.tensor_dict['g1'] = g1
                self.logits0 = logits0
                self.yp0 = yp0
                self.logits = logits0
                self.yp = yp0
def _build_forward(self):
    """Build the tree-structured variant of the forward graph: char-CNN +
    word embedding -> shared BiLSTM -> tree RNN over the parse structure ->
    per-node answer logits.

    Side effects: populates self.logits and self.yp.

    FIXES:
    - ``VC`` (char vocab size) was used for ``char_emb_mat`` but never bound
      in this variant, causing a NameError at graph-build time; it is now
      taken from ``config.char_vocab_size`` like the other variants.
    - Deprecated pre-TF-1.0 ``tf.concat(dim, values)`` calls replaced with
      the modern ``tf.concat(values, axis)`` form used elsewhere in the file.
    - Local ``filter`` renamed to ``filt`` (shadowed the builtin); the
      TF variable name string "filter" is unchanged.
    """
    config = self.config
    N, M, JX, JQ, VW, d, dc, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.hidden_size, \
        config.char_emb_size, config.max_word_size
    # BUG FIX: VC was referenced below but never assigned in this variant.
    VC = config.char_vocab_size
    H = config.max_tree_height

    # id 0 is padding, so > 0 yields the validity masks.
    x_mask = self.x > 0
    q_mask = self.q > 0
    tx_mask = self.tx > 0  # [N, M, H, JX]

    with tf.variable_scope("char_emb"):
        char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
        Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
        Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
        filt = tf.get_variable("filter", shape=[1, config.char_filter_height, dc, d], dtype='float')
        bias = tf.get_variable("bias", shape=[d], dtype='float')
        strides = [1, 1, 1, 1]
        Acx = tf.reshape(Acx, [-1, JX, W, dc])
        Acq = tf.reshape(Acq, [-1, JQ, W, dc])
        # Char conv + bias, then ReLU and max-over-time pooling.
        xxc = tf.nn.conv2d(Acx, filt, strides, "VALID") + bias  # [N*M, JX, W/filter_stride, d]
        qqc = tf.nn.conv2d(Acq, filt, strides, "VALID") + bias  # [N, JQ, W/filter_stride, d]
        xxc = tf.reshape(tf.reduce_max(tf.nn.relu(xxc), 2), [-1, M, JX, d])
        qqc = tf.reshape(tf.reduce_max(tf.nn.relu(qqc), 2), [-1, JQ, d])

    with tf.variable_scope("word_emb"):
        if config.mode == 'train':
            word_emb_mat = tf.get_variable(
                "word_emb_mat", dtype='float', shape=[VW, config.word_emb_size],
                initializer=get_initializer(config.emb_mat))
        else:
            word_emb_mat = tf.get_variable(
                "word_emb_mat", shape=[VW, config.word_emb_size], dtype='float')
        Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
        Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]

    # Concatenate char-CNN and word features.
    xx = tf.concat([xxc, Ax], 3)  # [N, M, JX, 2d]
    qq = tf.concat([qqc, Aq], 2)  # [N, JQ, 2d]
    D = d + config.word_emb_size

    with tf.variable_scope("pos_emb"):
        pos_emb_mat = tf.get_variable("pos_emb_mat", shape=[config.pos_vocab_size, d], dtype='float')
        Atx = tf.nn.embedding_lookup(pos_emb_mat, self.tx)  # [N, M, H, JX, d]

    cell = BasicLSTMCell(D, state_is_tuple=True)
    cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("rnn"):
        (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
            cell, cell, xx, x_len, dtype='float', scope='start')  # [N, M, JX, 2d]
        # Question encoder shares the 'start' variables.
        tf.get_variable_scope().reuse_variables()
        (fw_us, bw_us), (_, (fw_u, bw_u)) = bidirectional_dynamic_rnn(
            cell, cell, qq, q_len, dtype='float', scope='start')  # [N, J, d], [N, d]
        # Average (not concat) of the two directions.
        u = (fw_u + bw_u) / 2.0
        h = (fw_h + bw_h) / 2.0

    with tf.variable_scope("h"):
        # Tree RNN: propagate leaf states up the parse tree, combining
        # children with tf.reduce_max.
        no_op_cell = NoOpCell(D)
        tree_rnn_cell = TreeRNNCell(no_op_cell, d, tf.reduce_max)
        initial_state = tf.reshape(h, [N * M * JX, D])  # [N*M*JX, D]
        inputs = tf.concat([Atx, tf.cast(self.tx_edge_mask, 'float')], 4)  # [N, M, H, JX, d+JX]
        inputs = tf.reshape(tf.transpose(inputs, [0, 1, 3, 2, 4]),
                            [N * M * JX, H, d + JX])  # [N*M*JX, H, d+JX]
        length = tf.reshape(tf.reduce_sum(tf.cast(tx_mask, 'int32'), 2), [N * M * JX])
        h, _ = dynamic_rnn(tree_rnn_cell, inputs, length,
                           initial_state=initial_state)  # [N*M*JX, H, D]
        h = tf.transpose(tf.reshape(h, [N, M, JX, H, D]), [0, 1, 3, 2, 4])  # [N, M, H, JX, D]

    # Score every tree node against the question summary.
    u = tf.expand_dims(tf.expand_dims(tf.expand_dims(u, 1), 1), 1)  # [N, 1, 1, 1, 4d]
    dot = linear(h * u, 1, True, squeeze=True, scope='dot')  # [N, M, H, JX]
    # Mask out invalid nodes before the softmax.
    self.logits = tf.reshape(exp_mask(dot, tx_mask), [N, M * H * JX])  # [N, M, H, JX]
    self.yp = tf.reshape(tf.nn.softmax(self.logits), [N, M, H, JX])
def _build_forward(self):
    """Build the classification variant of the forward graph: BiDAF-style
    embedding/attention stack, then sum-pooled modeling output through two
    dense layers to a 2-class score.

    Side effects: populates self.tensor_dict, self.probs, and self.score.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    print("parameters", N, M, JX, JQ, VW, VC, d, W)
    # Dynamic sizes override the static config maxima (actual batch shapes).
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
    with tf.variable_scope("emb"):
        # Character embedding via char-CNN, table kept on CPU.
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Question side reuses the context conv ("xx") weights.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])
        # Word embedding (GloVe-initialized when training).
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append fixed glove rows for OOV tokens.
                    word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])
            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Concatenate char and word features along the last axis.
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
    # highway network (question side reuses the same variables)
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq
    # prepro layer cell — separate fw/bw cells; d_cell* add train-only
    # input dropout via SwitchableDropoutWrapper.
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell_fwh = BasicLSTMCell(d, state_is_tuple=True)
    cell_bwh = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fwh = SwitchableDropoutWrapper(cell_fwh, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bwh = SwitchableDropoutWrapper(cell_bwh, self.is_train, input_keep_prob=config.input_keep_prob)
    # attention layer cell
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # out layer cell_bw
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # Boolean masks -> integer sequence lengths.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
    with tf.variable_scope("prepro"):
        # Context and question encoders use separate scopes here
        # (no weight sharing — the sharing branch is commented out below).
        (fw_h, bw_h), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn(
            cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')
        h = tf.concat(axis=3, values=[fw_h, bw_h])
        (fw_u, bw_u), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn(
            d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        # if config.share_lstm_weights:
        #     tf.get_variable_scope().reuse_variables()
        #     (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
        #     h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        # else:
        #     (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
        #     h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h
        # h = tf.Print(h, [tf.reduce_max(h), tf.reduce_min(h), "h"], summarize=1000)
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Tile question encoding per sentence for per-step attention.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob,
                                          is_train=self.is_train)
            first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob,
                                          is_train=self.is_train)
            second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob,
                                           is_train=self.is_train)
            second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob,
                                           is_train=self.is_train)
        else:
            # p0 corresponds to G in the BiDAF paper.
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw
        # p0 = tf.Print(p0, [p0, "lstm output-1:"], summarize=200)
        # two layer LSTM
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        # g0 = tf.Print(g0, [g0, "lstm output0:"], summarize=200)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])
        # g1 = tf.Print(g1, [g1, "lstm output:"], summarize=200)
        # output through a denselayer
        with tf.variable_scope("output"):
            #lstm_out = g1[:,0,-1,:]
            # Sum-pool over the word axis, then flatten to [batch*M, 2d].
            lstm_out = tf.reduce_sum(g1, axis = 2)
            lstm_out = tf.reshape(lstm_out, [-1, 2*d])
            # NOTE(review): dropout here is applied unconditionally with
            # keep_prob=0.5, i.e. also at inference time — it is not gated
            # on self.is_train like the SwitchableDropoutWrapper cells.
            # Confirm this is intended.
            lstm_out = tf.nn.dropout(lstm_out, 0.5)
            #logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
            #                    mask=self.x_mask, is_train=self.is_train, func='linear', scope='logits1')
            #print (logits)
            #lstm_out.set_shape([N, M*JX*2*d])
            # Two dense layers with L2 regularization -> 2-class scores.
            dense1 = tf.layers.dense(inputs=lstm_out, units=64, activation = tf.nn.relu,
                                     kernel_regularizer=tf.contrib.layers.l2_regularizer(0.003))
            score = tf.layers.dense(inputs=dense1, units = 2, activation = None,
                                    kernel_regularizer=tf.contrib.layers.l2_regularizer(0.003))
            self.probs = tf.nn.softmax(score)
        # self.tensor_dict['g1'] = g1
        # self.tensor_dict['g2'] = g2
        self.score = score
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W ,EW, WOW= \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size,config.word_vocab_size-config.vw_wo_entity_size,config.vw_wo_entity_size JX = tf.shape(self.x)[2] # words JQ = tf.shape(self.q)[1] # words M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size #print ("dhruv is here",N, self.x.get_shape(), JX, self.q.get_shape(), VW, VC, d, W,dc, dw, dco) with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list(map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': entity_emb_mat = tf.get_variable("entity_emb_mat", dtype='float', shape=[EW, EW], initializer=get_initializer(config.onehot_encoded)) entity_emb_out = _linear(entity_emb_mat, dw, True, 
bias_initializer=tf.constant_initializer(0.0)) word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[WOW, dw], initializer=get_initializer(config.emb_mat)) word_emb_mat = tf.concat(axis=0,values=[word_emb_mat, entity_emb_out]) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat]) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] i.e. [batch size, max sentences, max words, embedding size] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] i.e. [batch size, max words, embedding size] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat(axis=3, values=[xx, Ax]) # [N, M, JX, di] qq = tf.concat(axis=2, values=[qq, Aq]) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq #xx = tf.Print(xx,[tf.shape(xx),xx],message="DHRUV xx=",summarize=20) cell_fw = BasicLSTMCell(d, state_is_tuple=True) cell_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell2_fw = BasicLSTMCell(d, state_is_tuple=True) cell2_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell3_fw = BasicLSTMCell(d, state_is_tuple=True) cell3_bw = 
BasicLSTMCell(d, state_is_tuple=True) d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell4_fw = BasicLSTMCell(d, state_is_tuple=True) cell4_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(axis=2, values=[fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): if config.dynamic_att: # not true p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim', 
input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) # p0 seems to be G in paper first_cell_fw = d_cell2_fw second_cell_fw = d_cell3_fw first_cell_bw = d_cell2_bw second_cell_bw = d_cell3_bw (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(axis=3, values=[fw_g0, bw_g0]) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(axis=3, values=[fw_g1, bw_g1]) # g1 seems to be M in paper g1= tf.Print(g1,[tf.shape(g1)],message="g1 shape",first_n=5,summarize=200) p0 = tf.Print(p0, [tf.shape(p0)], message="p0 shape", first_n=5, summarize=200) my_cell_fw = BasicLSTMCell(d, state_is_tuple=True) my_cell_fw_d = SwitchableDropoutWrapper(my_cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) my_cell_bw = BasicLSTMCell(d, state_is_tuple=True) my_cell_bw_d = SwitchableDropoutWrapper(my_cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) (fw_g11,bw_g11),(my_fw_final_state, my_bw_final_state),g11_len = my_bidirectional_dynamic_rnn(my_cell_fw_d, my_cell_bw_d, g1, x_len, dtype='float', scope='my_g2') # [N, M, JX, 2d] g11 = tf.concat(axis=2, values=[fw_g11, bw_g11]) my_encoder_final_state_c = tf.concat(values = (my_fw_final_state.c, my_bw_final_state.c), axis = 1, name = "my_encoder_final_state_c") my_encoder_final_state_h = tf.concat(values = (my_fw_final_state.h, my_bw_final_state.h), axis = 1, name = "my_encoder_final_state_h") my_encoder_final_state = tf.contrib.rnn.LSTMStateTuple(c = my_encoder_final_state_c, h = my_encoder_final_state_h) #compute indices for finding span as the 
second task in multi task learning logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') logits = tf.Print(logits, [tf.shape(logits)], message="logits shape", first_n=5, summarize=200) a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell4_fw, d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat(axis=3, values=[fw_g2, bw_g2]) logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits = tf.reshape(logits, [-1, M * JX]) flat_logits = tf.Print(flat_logits, [tf.shape(flat_logits),flat_logits], message="flat_logits shape and contents", first_n=5, summarize=200) flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] flat_logits2 = tf.reshape(logits2, [-1, M * JX]) flat_yp2 = tf.nn.softmax(flat_logits2) tgt_vocab_size = config.len_new_emb_mat # hparam # FIXME: Obtain embeddings differently? 
print("length is",config.len_new_emb_mat) tgt_embedding_size = dw # hparam # Look up embedding decoder_emb_inp = tf.nn.embedding_lookup(word_emb_mat, self.decoder_inputs) # [batch_size, max words, embedding_size] def decode_with_attention(helper, scope, reuse=None,maximum_iterations=None): with tf.variable_scope(scope, reuse=reuse): attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=d, memory=g11) cell = tf.contrib.rnn.GRUCell(num_units=d) attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism,attention_layer_size=d /2) out_cell = tf.contrib.rnn.OutputProjectionWrapper(attn_cell, tgt_vocab_size, reuse=reuse) decoder = tf.contrib.seq2seq.BasicDecoder(cell=out_cell, helper=helper,initial_state=out_cell.zero_state( dtype=tf.float32, batch_size=N)) # initial_state=encoder_final_state) outputs = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, output_time_major=False, impute_finished=True, maximum_iterations=maximum_iterations) return outputs[0] def decode(helper, scope, reuse=None, maximum_iterations=None): with tf.variable_scope(scope, reuse=reuse): decoder_cell = BasicLSTMCell(2 * d, state_is_tuple=True) # hparam projection_layer = layers_core.Dense(tgt_vocab_size, use_bias=False) # hparam decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, my_encoder_final_state,output_layer=projection_layer) # decoder final_outputs, _ ,_= tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=False, impute_finished=True, maximum_iterations=maximum_iterations) # dynamic decoding return final_outputs # Decoder if config.mode == 'train': #TODO:doesnt seem to be correct to use this variable for dev training_helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inp, self.target_sequence_length,time_major=False) #final_outputs = decode(helper=training_helper, scope="HAHA", reuse=None) final_outputs = decode_with_attention(helper=training_helper, scope="HAHA", reuse=None) else: training_helper = 
tf.contrib.seq2seq.GreedyEmbeddingHelper(word_emb_mat, tf.fill([N], self.tgt_sos_id),self.tgt_eos_id) #final_outputs= decode(helper=training_helper, scope="HAHA", reuse=True,maximum_iterations=100) final_outputs= decode_with_attention(helper=training_helper, scope="HAHA", reuse=True,maximum_iterations=100) self.decoder_logits_train = final_outputs.rnn_output self.index_start = flat_logits self.index_end = flat_logits2
def build_forward(self, k, c_mask, query, query_st, q_mask, cand_emb, cand_mask, drop_one_doc=False, dropped_hops=None, reuse=False):
    """Assemble the sentences selected at each reasoning hop and score answer candidates.

    For every hop, slices out of ``k`` the sentence span the controller selected
    (via ``self.model.selected_sent_ids`` and the per-hop document indices in
    ``model.mac_rnn_cell.doc_indices_history``), concatenates those spans into one
    zero-padded "selected document" per batch entry, bi-attends it against either
    the first retrieved document or the query, encodes the result with one or two
    BiLSTM layers, attention-pools it into an answer state, and scores the
    candidate embeddings against that state.

    Args:
        k: document memory tensor the sentence spans are sliced from.
        c_mask: mask over ``k`` (cast to float for gather, back to bool after).
        query: query representation used as bi-attention memory in the default path.
        query_st: pooled query vector, merged into the answer state when
            ``config.assembler_merge_query_st`` is set.
        q_mask: mask over ``query``.
        cand_emb: candidate answer embeddings (arrives with a singleton axis 1).
        cand_mask: mask over the candidates.
        drop_one_doc: if True, concatenate one fewer document and take the hop
            documents via ``dropped_hops`` instead of hop order.
        dropped_hops: per-hop indices into the doc-selection history; only read
            when ``drop_one_doc`` is True.
        reuse: variable-scope reuse flag for the ``assembler`` scope.

    Side effects:
        Sets ``self.logits`` and ``self.yp``, plus many intermediates for
        inspection (``self.concat_selected_doc*``, ``self.doc_lens``,
        ``self.g1``, ``self.g1_shape``, ...).
    """
    self.selected_sent_ids = self.model.selected_sent_ids
    # Candidates arrive with a singleton dimension; drop it.
    cand_emb = tf.squeeze(cand_emb, axis=1)
    config = self.config
    model = self.model
    context_dim = self.context_dim
    # Cumulative (accumulated) sentence end offsets per document.
    x_acc_sents_len = model.x_sents_len_reconstruct
    self.x_acc_sents_len = x_acc_sents_len
    # Prepend a zero column so index j in this tensor is sentence j's START
    # offset while x_acc_sents_len[..., j] is its END offset.
    zero_prepend_x_acc_sents_len = tf.concat([
        tf.expand_dims(tf.zeros_like(x_acc_sents_len[:, :, 0]), axis=2),
        x_acc_sents_len
    ], axis=2)
    with tf.variable_scope('assembler', reuse=reuse):
        if config.assembler_biattn_w_first_doc:
            assert config.assembler_merge_query_st, (
                "If biattn with first doc, then has to merge query information in another way."
            )
            # Hop 0's document is reserved as the bi-attention memory below;
            # only hops 1..num_hops-1 get concatenated here.
            for hop in range(config.num_hops - 1):
                doc_indices = model.mac_rnn_cell.doc_indices_history[hop + 1]
                doc2 = tf.gather_nd(k, doc_indices)
                # gather_nd on bools is avoided by a float round-trip.
                doc2_mask = tf.cast(
                    tf.gather_nd(tf.cast(c_mask, 'float32'), doc_indices),
                    'bool')
                #doc2_first_sent_len = tf.gather_nd(x_sents_len[:, :, 0], doc_indices)
                # Pair each batch entry's doc index with the sentence the
                # controller selected at this hop.
                selected_sent_indices = tf.concat([
                    doc_indices,
                    tf.expand_dims(self.selected_sent_ids[:, hop + 1], axis=1)
                ], axis=-1)
                self.selected_sent_indices = selected_sent_indices
                # Token-offset range [start, end) of the selected sentence.
                doc2_selected_sent_end = tf.gather_nd(
                    x_acc_sents_len, selected_sent_indices)
                doc2_selected_sent_start = tf.gather_nd(
                    zero_prepend_x_acc_sents_len, selected_sent_indices)
                self.doc2_selected_sent_start = doc2_selected_sent_start
                self.doc2_selected_sent_end = doc2_selected_sent_end
                # Debug bookkeeping; reset every hop, so after the loop these
                # reflect the last hop only.
                self.doc_lens = []
                self.prev_doc_lens = []
                self.new_doc_lens = []
                self.padding_lens = []
                if hop == 0:
                    # First hop: seed the running concatenation with each
                    # entry's selected sentence.
                    self.concat_selected_doc = []
                    for i in range(config.batch_size):
                        new_doc = tf.slice(
                            doc2[i], [doc2_selected_sent_start[i], 0], [
                                doc2_selected_sent_end[i] -
                                doc2_selected_sent_start[i], -1
                            ])
                        self.concat_selected_doc.append(new_doc)
                    self.concat_selected_doc_len = doc2_selected_sent_end - doc2_selected_sent_start
                else:
                    # Later hops: append this hop's sentence to each running
                    # doc, right-padding every entry to the batch max length.
                    for i in range(config.batch_size):
                        prev_doc = tf.slice(
                            self.concat_selected_doc[i], [0, 0],
                            [self.concat_selected_doc_len[i], -1])
                        new_doc = tf.slice(
                            doc2[i], [doc2_selected_sent_start[i], 0], [
                                doc2_selected_sent_end[i] -
                                doc2_selected_sent_start[i], -1
                            ])
                        # Pad entry i up to the longest (prev + new) length
                        # across the batch.
                        padding_len = tf.reduce_max(
                            self.concat_selected_doc_len +
                            doc2_selected_sent_end - doc2_selected_sent_start
                        ) - self.concat_selected_doc_len[i] - (
                            doc2_selected_sent_end[i] -
                            doc2_selected_sent_start[i])
                        padding = tf.zeros([padding_len, context_dim])
                        self.concat_selected_doc[i] = tf.concat(
                            [prev_doc, new_doc, padding], axis=0)
                        self.doc_lens.append(
                            tf.shape(
                                tf.concat([prev_doc, new_doc, padding],
                                          axis=0))[0])
                        self.prev_doc_lens.append(tf.shape(prev_doc)[0])
                        self.new_doc_lens.append(tf.shape(new_doc)[0])
                        self.padding_lens.append(padding_len)
                    self.concat_selected_doc_len += (
                        doc2_selected_sent_end - doc2_selected_sent_start)
            # Per-entry boolean mask over the padded concatenation:
            # ones up to the true length, zeros over the padding.
            max_len = tf.reduce_max(self.concat_selected_doc_len)
            self.concat_selected_doc_mask = []
            for i in range(config.batch_size):
                concat_selected_doc_mask_i = tf.concat(values=[
                    tf.ones([self.concat_selected_doc_len[i]]),
                    tf.zeros([max_len - self.concat_selected_doc_len[i]])
                ], axis=0)
                self.concat_selected_doc_mask.append(
                    concat_selected_doc_mask_i)
            self.concat_selected_doc = tf.stack(self.concat_selected_doc,
                                                axis=0)
            self.concat_selected_doc_mask = tf.cast(
                tf.stack(self.concat_selected_doc_mask, axis=0), 'bool')
            # Bi-attend the concatenated selection against the FIRST
            # retrieved document (query info is merged later via query_st).
            first_doc_indices = model.mac_rnn_cell.doc_indices_history[0]
            doc1 = tf.gather_nd(k, first_doc_indices)
            doc1_mask = tf.cast(
                tf.gather_nd(tf.cast(c_mask, 'float32'), first_doc_indices),
                'bool')
            p0 = biattention_layer(self.is_train,
                                   self.concat_selected_doc,
                                   doc1,
                                   h_mask=self.concat_selected_doc_mask,
                                   u_mask=doc1_mask)
        else:
            # Decide how many hop-documents to concatenate; the repeat-first-doc
            # strategies interleave the first document between hop documents.
            if drop_one_doc:
                num_docs_to_concat = config.num_hops - 1
            else:
                num_docs_to_concat = (
                    config.num_hops + 2
                    if config.mac_read_strategy == 'one_doc_per_it_and_repeat_for_3_hops'
                    else (config.num_hops - 1) * 2
                ) if config.assembler_repeat_first_doc else config.num_hops
            for hop in range(num_docs_to_concat):
                if config.assembler_repeat_first_doc:
                    if drop_one_doc:
                        raise NotImplementedError
                    if config.mac_read_strategy == 'one_doc_per_it_and_repeat_for_3_hops':
                        # First doc at hop 0 and at every hop % 4 == 1;
                        # otherwise map hop back to its history slot.
                        doc_indices = model.mac_rnn_cell.doc_indices_history[
                            0] if (
                                hop % 4 == 1 or hop == 0
                            ) else model.mac_rnn_cell.doc_indices_history[
                                hop - int(hop / 5)]
                    else:
                        # Even hops replay the first doc, odd hops walk the
                        # remaining history.
                        doc_indices = model.mac_rnn_cell.doc_indices_history[
                            0] if hop % 2 == 0 else model.mac_rnn_cell.doc_indices_history[
                                int(hop / 2) + 1]
                else:
                    if drop_one_doc:
                        # Pick the history slot named by dropped_hops[hop].
                        doc_indices = tf.squeeze(tf.slice(
                            tf.stack(
                                model.mac_rnn_cell.doc_indices_history),
                            [dropped_hops[hop], 0, 0], [1, -1, -1]),
                            axis=0)
                    else:
                        doc_indices = model.mac_rnn_cell.doc_indices_history[
                            hop]
                doc2 = tf.gather_nd(k, doc_indices)
                doc2_mask = tf.cast(
                    tf.gather_nd(tf.cast(c_mask, 'float32'), doc_indices),
                    'bool')
                #doc2_first_sent_len = tf.gather_nd(x_sents_len[:, :, 0], doc_indices)
                # Choose which hop's selected-sentence id goes with this doc,
                # mirroring the doc_indices schedule above.
                if config.assembler_repeat_first_doc:
                    if config.mac_read_strategy == 'one_doc_per_it_and_repeat_for_3_hops':
                        if hop % 4 == 1 or hop == 0:
                            selected_sent_indices = tf.concat([
                                doc_indices,
                                tf.expand_dims(
                                    self.selected_sent_ids[:, 0], axis=1)
                            ], axis=-1)
                        else:
                            selected_sent_indices = tf.concat([
                                doc_indices,
                                tf.expand_dims(
                                    self.selected_sent_ids[:, hop - int(hop / 5)],
                                    axis=1)
                            ], axis=-1)
                    else:
                        if hop % 2 == 0:
                            print("concat first doc")
                            selected_sent_indices = tf.concat([
                                doc_indices,
                                tf.expand_dims(
                                    self.selected_sent_ids[:, 0], axis=1)
                            ], axis=-1)
                        else:
                            print("concat second doc")
                            selected_sent_indices = tf.concat([
                                doc_indices,
                                tf.expand_dims(
                                    self.selected_sent_ids[:, int(hop / 2) + 1],
                                    axis=1)
                            ], axis=-1)
                else:
                    selected_sent_indices = tf.concat([
                        doc_indices,
                        tf.expand_dims(self.selected_sent_ids[:, hop], axis=1)
                    ], axis=-1)
                self.selected_sent_indices = selected_sent_indices
                # Token-offset range [start, end) of the selected sentence.
                doc2_selected_sent_end = tf.gather_nd(
                    x_acc_sents_len, selected_sent_indices)
                doc2_selected_sent_start = tf.gather_nd(
                    zero_prepend_x_acc_sents_len, selected_sent_indices)
                self.doc2_selected_sent_start = doc2_selected_sent_start
                self.doc2_selected_sent_end = doc2_selected_sent_end
                if hop == 0:
                    # Seed the running concatenation and the debug lists.
                    self.doc_lens = []
                    self.prev_doc_lens = []
                    self.new_doc_lens = []
                    self.padding_lens = []
                    self.concat_selected_doc = []
                    for i in range(config.batch_size):
                        new_doc = tf.slice(
                            doc2[i], [doc2_selected_sent_start[i], 0], [
                                doc2_selected_sent_end[i] -
                                doc2_selected_sent_start[i], -1
                            ])
                        self.concat_selected_doc.append(new_doc)
                    self.concat_selected_doc_len = doc2_selected_sent_end - doc2_selected_sent_start
                else:
                    # Append this hop's sentence to each running doc,
                    # right-padding to the batch max length.
                    for i in range(config.batch_size):
                        prev_doc = tf.slice(
                            self.concat_selected_doc[i], [0, 0],
                            [self.concat_selected_doc_len[i], -1])
                        new_doc = tf.slice(
                            doc2[i], [doc2_selected_sent_start[i], 0], [
                                doc2_selected_sent_end[i] -
                                doc2_selected_sent_start[i], -1
                            ])
                        padding_len = tf.reduce_max(
                            self.concat_selected_doc_len +
                            doc2_selected_sent_end - doc2_selected_sent_start
                        ) - self.concat_selected_doc_len[i] - (
                            doc2_selected_sent_end[i] -
                            doc2_selected_sent_start[i])
                        padding = tf.zeros([padding_len, context_dim])
                        self.concat_selected_doc[i] = tf.concat(
                            [prev_doc, new_doc, padding], axis=0)
                        self.doc_lens.append(
                            tf.shape(
                                tf.concat([prev_doc, new_doc, padding],
                                          axis=0))[0])
                        self.prev_doc_lens.append(tf.shape(prev_doc)[0])
                        self.new_doc_lens.append(tf.shape(new_doc)[0])
                        self.padding_lens.append(padding_len)
                    self.concat_selected_doc_len += (
                        doc2_selected_sent_end - doc2_selected_sent_start)
            # Per-entry boolean mask over the padded concatenation.
            max_len = tf.reduce_max(self.concat_selected_doc_len)
            self.concat_selected_doc_mask = []
            for i in range(config.batch_size):
                concat_selected_doc_mask_i = tf.concat(values=[
                    tf.ones([self.concat_selected_doc_len[i]]),
                    tf.zeros([max_len - self.concat_selected_doc_len[i]])
                ], axis=0)
                self.concat_selected_doc_mask.append(
                    concat_selected_doc_mask_i)
            self.concat_selected_doc = tf.stack(self.concat_selected_doc,
                                                axis=0)
            self.concat_selected_doc_mask = tf.cast(
                tf.stack(self.concat_selected_doc_mask, axis=0), 'bool')
            # Default path: bi-attend the selection against the query.
            p0 = biattention_layer(self.is_train,
                                   self.concat_selected_doc,
                                   query,
                                   h_mask=self.concat_selected_doc_mask,
                                   u_mask=q_mask)
            p0 = tf.squeeze(p0, axis=1)
        # Encode the bi-attended selection with one or two BiLSTM layers
        # (hidden size hard-coded to 40 throughout this assembler).
        if config.assembler_bidaf_layer > 1:
            with tf.variable_scope("layer_1"):
                cell_fw = BasicLSTMCell(40, state_is_tuple=True)
                cell_bw = BasicLSTMCell(40, state_is_tuple=True)
                cell_fw = SwitchableDropoutWrapper(
                    cell_fw,
                    self.is_train,
                    input_keep_prob=config.input_keep_prob)
                cell_bw = SwitchableDropoutWrapper(
                    cell_bw,
                    self.is_train,
                    input_keep_prob=config.input_keep_prob)
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, p0, self.concat_selected_doc_len,
                    dtype='float')
                x = tf.concat(axis=2, values=[fw_h, bw_h])
            with tf.variable_scope("layer_2"):
                cell2_fw = BasicLSTMCell(40, state_is_tuple=True)
                cell2_bw = BasicLSTMCell(40, state_is_tuple=True)
                cell2_fw = SwitchableDropoutWrapper(
                    cell2_fw,
                    self.is_train,
                    input_keep_prob=config.input_keep_prob)
                cell2_bw = SwitchableDropoutWrapper(
                    cell2_bw,
                    self.is_train,
                    input_keep_prob=config.input_keep_prob)
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell2_fw, cell2_bw, x, self.concat_selected_doc_len,
                    dtype='float')
                x = tf.concat(axis=2, values=[fw_h, bw_h])
        else:
            cell_fw = BasicLSTMCell(40, state_is_tuple=True)
            cell_bw = BasicLSTMCell(40, state_is_tuple=True)
            cell_fw = SwitchableDropoutWrapper(
                cell_fw,
                self.is_train,
                input_keep_prob=config.input_keep_prob)
            cell_bw = SwitchableDropoutWrapper(
                cell_bw,
                self.is_train,
                input_keep_prob=config.input_keep_prob)
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, p0, self.concat_selected_doc_len,
                dtype='float')
            x = tf.concat(axis=2, values=[fw_h, bw_h])
        # Attention-pool the encoded selection into a single answer vector.
        logits = linear_logits([x], True,
                               input_keep_prob=config.input_keep_prob,
                               mask=self.concat_selected_doc_mask,
                               is_train=self.is_train,
                               scope='a_state_logits')
        probs = tf.nn.softmax(logits)
        new_ans = tf.einsum('ijk,ij->ik', self.concat_selected_doc, probs)
        if config.assembler_merge_query_st:
            # Merge the pooled query state with the pooled answer via a small
            # two-layer MLP over [ans, q_proj, ans * q_proj].
            W_c = tf.get_variable('W_c', [40, 40])
            b_c = tf.get_variable('b_c', [40])
            c_proj = tf.matmul(query_st, W_c) + b_c
            W1 = tf.get_variable('W1', [3 * 40, 2 * 40])
            b1 = tf.get_variable('b1', [2 * 40])
            W2 = tf.get_variable('W2', [2 * 40, 40])
            b2 = tf.get_variable('b2', [40])
            concat_in = tf.concat(
                axis=-1, values=[new_ans, c_proj, new_ans * c_proj])
            a_state = tf.matmul(tf.nn.relu(tf.matmul(concat_in, W1) + b1),
                                W2) + b2
        else:
            a_state = new_ans
        # Score every candidate against the answer state; output_unit is a
        # project-local module, semantics not visible from here.
        g1 = tf.expand_dims(self.output_unit(cand_emb, a_state), axis=1)
        self.g1 = g1
        self.cand_mask = cand_mask
        self.cand_emb = cand_emb
        logits = linear_logits([g1], True,
                               is_train=self.is_train,
                               input_keep_prob=config.input_keep_prob,
                               mask=cand_mask,
                               scope='g1_logits')
        JX = tf.shape(g1)[2]
        self.g1_shape = tf.shape(g1)
        flat_logits = tf.reshape(logits, [config.batch_size, JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [config.batch_size, 1, JX])
        self.logits = flat_logits
        self.yp = yp
def _build_forward(self):
    """Build the BiDAF-style span-prediction graph.

    Variant with optional second preprocessing BiLSTM layer
    (``config.two_prepro_layers``), a "late" answer head (``config.late``),
    teacher forcing of the start logits (``config.feed_gt`` /
    ``config.feed_hard``), and a configurable start-conditioned feature for
    the end-pointer (``config.prev_mode``).

    NOTE(review): this block uses the pre-r1.0 ``tf.concat(axis, values)``
    argument order throughout, unlike sibling blocks in this file that use
    ``tf.concat(axis=..., values=...)`` — presumably pinned to an old TF
    version; confirm before upgrading.

    Side effects:
        Sets self.logits, self.logits2, self.yp, self.yp2 and populates
        self.tensor_dict with intermediate tensors.
    """
    config = self.config
    # Static config sizes; JX/JQ/M are immediately re-bound to the *dynamic*
    # shapes of the fed tensors below.
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
    with tf.variable_scope("emb"):
        if config.use_char_emb:
            # Character embeddings + char-CNN over words.
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                # Per-filter output channels must sum to the char-CNN output size.
                assert sum(filter_sizes) == dco
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Run the question through the same CNN variables.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights,
                                          "VALID", self.is_train,
                                          config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights,
                                          "VALID", self.is_train,
                                          config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Initialized from pretrained vectors (config.emb_mat).
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # Extend the trainable table with the fixed new_emb_mat rows.
                    word_emb_mat = tf.concat(
                        0, [word_emb_mat, self.new_emb_mat])
            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Concatenate char-CNN and word features along the channel axis.
                xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            # Question shares the highway variables with the context.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq
    cell = BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # True (unpadded) lengths from the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
    with tf.variable_scope("prepro"):
        # Contextual encoding of the question.
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(2, [fw_u, bw_u])
        if config.two_prepro_layers:
            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
                d_cell, d_cell, u, q_len, dtype='float', scope='u2')  # [N, J, d], [N, d]
            u = tf.concat(2, [fw_u, bw_u])
        if config.share_lstm_weights:
            # Encode the context with the SAME LSTM variables as the question
            # (note: undropped `cell`, not `d_cell`).
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            if config.two_prepro_layers:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, h, x_len, dtype='float', scope='u2')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            if config.two_prepro_layers:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, h, x_len, dtype='float', scope='h2')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
    self.tensor_dict['u'] = u
    self.tensor_dict['h'] = h
    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Attend at every decoder step instead of precomputing p0:
            # tile the question encoding per sentence and wrap the cell.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            # Standard BiDAF bi-attention (G in the paper).
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell
        # Two modeling BiLSTM layers over the attended representation.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(3, [fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(3, [fw_g1, bw_g1])
        if config.late:
            # "Late" head: one shared extra layer feeds BOTH start and end
            # logits; the end head is not conditioned on the start prediction.
            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, tf.concat(3, [g1, p0]), x_len,
                dtype='float', scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(3, [fw_g2, bw_g2])
            # logits2 = u_logits(config, self.is_train, tf.concat(3, [g1, a1i]), u, h_mask=self.x_mask, u_mask=self.q_mask, scope="logits2")
            logits = get_logits([g1, g2, p0], d, True, wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask, is_train=self.is_train,
                                func=config.answer_func, scope='logits1')
            if config.feed_gt:
                # Teacher forcing: at train time replace start logits with
                # the (log of the) ground-truth distribution.
                logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER)
                logits = tf.cond(self.is_train, lambda: logy, lambda: logits)
            if config.feed_hard:
                # At eval time feed the argmax as a one-hot instead of the
                # soft logits.
                hard_yp = tf.argmax(tf.reshape(logits, [N, M * JX]), 1)
                hard_logits = tf.reshape(tf.one_hot(hard_yp, M * JX),
                                         [N, M, JX])  # [N, M, JX]
                logits = tf.cond(self.is_train, lambda: logits,
                                 lambda: hard_logits)
            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])
            logits2 = get_logits([g1, g2, p0], d, True, wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask, is_train=self.is_train,
                                 func=config.answer_func, scope='logits2')
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        else:
            # Classic BiDAF head: start logits first, then condition the end
            # pointer on a start-derived feature (`prev`).
            logits = get_logits([g1, p0], d, True, wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask, is_train=self.is_train,
                                func=config.answer_func, scope='logits1')
            # Soft-selected start representation (attended summary of g1).
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))
            if config.feed_gt:
                logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER)
                logits = tf.cond(self.is_train, lambda: logy, lambda: logits)
            if config.feed_hard:
                hard_yp = tf.argmax(tf.reshape(logits, [N, M * JX]), 1)
                hard_logits = tf.reshape(tf.one_hot(hard_yp, M * JX),
                                         [N, M, JX])  # [N, M, JX]
                logits = tf.cond(self.is_train, lambda: logits,
                                 lambda: hard_logits)
            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])
            # Broadcast the start summary to every position.
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                          [1, M, JX, 1])
            yp_aug = tf.expand_dims(yp, -1)
            g1yp = g1 * yp_aug
            # Choose the start-conditioning feature for the end pointer.
            if config.prev_mode == 'a':
                prev = a1i
            elif config.prev_mode == 'y':
                prev = yp_aug
            elif config.prev_mode == 'gy':
                prev = g1yp
            else:
                raise Exception()
            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, tf.concat(3, [p0, g1, prev, g1 * prev]),
                x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(3, [fw_g2, bw_g2])
            # logits2 = u_logits(config, self.is_train, tf.concat(3, [g1, a1i]), u, h_mask=self.x_mask, u_mask=self.q_mask, scope="logits2")
            logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask, is_train=self.is_train,
                                 func=config.answer_func, scope='logits2')
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2
    # Exposed outputs: flattened start/end logits and their softmax maps.
    self.logits = flat_logits
    self.logits2 = flat_logits2
    self.yp = yp
    self.yp2 = yp2
def _build_forward(self):
    """Build a word-embedding-only BiDAF variant with a sentence-level sigmoid head.

    Character embeddings, the highway network, and the span-pointer heads of
    the standard model are commented out; the remaining graph encodes context
    and question with BiLSTMs, bi-attends them, and emits one sigmoid logit
    per sentence (``[batch_size, max_num_sents]``) for sentence selection.

    Side effects:
        Sets self.logits and self.yp and populates self.tensor_dict.
    """
    config = self.config
    # Static config sizes; JX/JQ/M are immediately re-bound to dynamic shapes.
    # (No VC here — the char vocab is unused in this variant.)
    N, M, JX, JQ, VW, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.hidden_size, \
        config.max_word_size
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
    with tf.variable_scope("emb"):
        print('word embedding')
        # if config.use_char_emb:
        #     with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
        #         char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
        #     with tf.variable_scope("char"):
        #         Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
        #         Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
        #         Acx = tf.reshape(Acx, [-1, JX, W, dc])
        #         Acq = tf.reshape(Acq, [-1, JQ, W, dc])
        #         filter_sizes = list(map(int, config.out_channel_dims.split(',')))
        #         heights = list(map(int, config.filter_heights.split(',')))
        #         assert sum(filter_sizes) == dco, (filter_sizes, dco)
        #         with tf.variable_scope("conv"):
        #             xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
        #             if config.share_cnn_weights:
        #                 tf.get_variable_scope().reuse_variables()
        #                 qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
        #             else:
        #                 qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
        #         xx = tf.reshape(xx, [-1, M, JX, dco])
        #         qq = tf.reshape(qq, [-1, JQ, dco])
        if config.use_word_emb:
            # NOTE(review): embedding table is pinned to a hard-coded
            # "/gpu:7" (sibling variants use "/cpu:0") — confirm this is
            # intentional and that the machine has 8 GPUs.
            with tf.variable_scope("emb_var"), tf.device("/gpu:7"):
                if config.mode == 'train':
                    # Initialized from pretrained vectors (config.emb_mat).
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                # if config.use_glove_for_unk:
                #     word_emb_mat = tf.concat(0, [word_emb_mat])
                print(word_emb_mat)
            with tf.name_scope("word"):
                print('embedding lookup')
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
                print('embedding lookup ready')
            # if config.use_char_emb:
            #     xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
            #     qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
            # else:
            xx = Ax
            qq = Aq
    # highway network
    #if config.highway:
    #    with tf.variable_scope("highway"):
    #        xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
    #        tf.get_variable_scope().reuse_variables()
    #        qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq
    print('context emmbedding')
    cell = BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # True (unpadded) lengths from the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
    print('prepro')
    with tf.variable_scope("prepro"):
        # NOTE(review): dtype is 'float32' here but 'float' in the context
        # RNNs below — equivalent in TF, but inconsistent; confirm.
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float32', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(2, [fw_u, bw_u])
        if config.share_lstm_weights:
            # Encode the context with the SAME LSTM variables as the question
            # (note: undropped `cell`, not `d_cell`).
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
    self.tensor_dict['u'] = u
    self.tensor_dict['h'] = h
    print('main pro')
    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Attend per decoder step instead of precomputing p0.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            # Standard BiDAF bi-attention (G in the paper).
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell
        # Two modeling BiLSTM layers.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(3, [fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(3, [fw_g1, bw_g1])
        # One sigmoid logit per sentence ([batch_size, max_num_sents]) —
        # note the second get_logits argument is a shape here, not the
        # hidden size d used by the span heads.
        logits = get_logits([g1, p0],
                            [config.batch_size, config.max_num_sents], True,
                            wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train,
                            func='sigmoid', scope='logits1')
        # a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
        # a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])
        # (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell, d_cell, tf.concat(3, [p0, g1, a1i, g1 * a1i]),
        #                                               x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        # g2 = tf.concat(3, [fw_g2, bw_g2])
        # logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
        #                      mask=self.x_mask,
        #                      is_train=self.is_train, func=config.answer_func, scope='logits2')
        # flat_logits = tf.reshape(logits, [-1, M * JX])
        # flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        # yp = tf.reshape(flat_yp, [-1, M, JX])
        # NOTE(review): tf.greater(0.5, logits) is True where logits < 0.5.
        # For a sigmoid head one would usually predict positive where
        # logits > 0.5 — verify the comparison isn't inverted.
        yp = tf.greater(0.5, logits)
        # flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        # flat_yp2 = tf.nn.softmax(flat_logits2)
        # yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        self.tensor_dict['g1'] = g1
        # self.tensor_dict['g2'] = g2
        self.logits = logits
        # self.logits2 = flat_logits2
        self.yp = yp
def _build_forward(self):
    """Build the forward graph for the plot-classification variant.

    Pipeline (each stage gated by a ``config`` flag):
      1. char-CNN + word embeddings for context ``self.x`` and query ``self.q``;
      2. optional POS / semantic-class / negation one-hot features appended;
      3. optional highway network, then optional BiLSTM ("prepro") encoding;
      4. the first four context sentences are taken separately (h1..h4),
         optionally refined by two stacked self-attention layers;
      5. sentence encodings merged per ``config.plot_encoder``
         ("concate" | "sum" | "lstm"), optionally interacted with the query
         encoding and passed through dense layers;
      6. result: ``self.prediction`` — a single-unit linear output per example.

    NOTE(review): reconstructed from a whitespace-mangled source; the block
    nesting below is the most plausible reading — confirm against VCS history.
    """
    config = self.config
    # Static sizes from config: batch, #sentences, sentence len, query len,
    # word/char vocab sizes, hidden size, max word length.
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # Query length is forced equal to the sentence length, so qq is shaped
    # like a sentence below.  NOTE(review): intentional? confirm.
    JQ = JX
    print('VC:{} NEW_EMB:{}'.format(VW, self.new_emb_mat.get_shape()))
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            # Char embedding table pinned to CPU (lookup is cheap; saves GPU memory).
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                # Flatten batch and sentence dims so the char CNN sees [batch*, words, chars, dc].
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])
                # Per-height output channel counts; they must sum to the char output dim.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the context conv filters for the query.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Initialized from pre-trained embeddings (GloVe) at train time.
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append fixed GloVe rows for OOV words fed via self.new_emb_mat.
                    word_emb_mat = tf.concat(
                        axis=0, values=[word_emb_mat, self.new_emb_mat])
            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Concatenate char-CNN and word embeddings along the feature axis.
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
        # NOTE(review): this reshape assumes the combined feature dim equals d
        # (hidden size); with char+word concatenated that only holds if
        # dco + dw == d — confirm config.
        xx = tf.reshape(xx, [-1, M, JX, d])
        qq = tf.reshape(qq, [-1, JQ, d])
        if config.use_pos_emb:
            # Part-of-speech tags as one-hot features appended to each token.
            with tf.variable_scope("pos_onehot"), tf.device("/cpu:0"):
                pos_x = tf.one_hot(
                    self.x_pos, depth=config.pos_tag_num)  # [N,M,JX,depth]
                pos_q = tf.one_hot(
                    self.q_pos, depth=config.pos_tag_num)  # [N,JQ,depth]
                xx = tf.concat(axis=3, values=[xx, pos_x])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, pos_q])
        if config.use_sem_emb:
            # 3-way semantic-class one-hot feature per token.
            with tf.variable_scope("sem_onehot"), tf.device("/cpu:0"):
                sem_x = tf.one_hot(self.x_sem, depth=3)  # [N,M,JX,3]
                sem_q = tf.one_hot(self.q_sem, depth=3)  # [N,JQ,3]
                xx = tf.concat(axis=3, values=[xx, sem_x])
                qq = tf.concat(axis=2, values=[qq, sem_q])
        if config.use_neg_emb:
            # Binary negation-marker one-hot feature per token.
            with tf.variable_scope("neg_onehot"), tf.device("/cpu:0"):
                neg_x = tf.one_hot(self.x_neg, depth=2)  # [N,M,JX,2]
                neg_q = tf.one_hot(self.q_neg, depth=2)  # [N,JQ,2]
                xx = tf.concat(axis=3, values=[xx, neg_x])
                qq = tf.concat(axis=2, values=[qq, neg_q])
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx, config.highway_num_layers, True,
                                     wd=config.wd, is_train=self.is_train)
                # Share highway weights between context and query.
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq, config.highway_num_layers, True,
                                     wd=config.wd, is_train=self.is_train)
    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Two independent BiLSTM cell pairs, each with input dropout that is
    # switched off at inference via self.is_train.
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(
        cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(
        cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell_fw2 = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw2 = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw2 = SwitchableDropoutWrapper(
        cell_fw2, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw2 = SwitchableDropoutWrapper(
        cell_bw2, self.is_train, input_keep_prob=config.input_keep_prob)
    # True (unpadded) lengths derived from the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    if config.lstm:
        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
                d_cell_fw, d_cell_bw, qq, q_len, dtype='float',
                scope='u1')  # [N, J, d], [N, d]
            print('fw_u_f hsape:{}'.format(fw_u_f.get_shape()))
            u = tf.concat(axis=2, values=[fw_u, bw_u])  # [N,JQ,2d]
            if config.share_lstm_weights:
                # Encode the context with the same LSTM weights as the query.
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
                print('fw_u_f nn hsape:{}'.format(fw_u_f.get_shape()))
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h
    else:
        # No contextual encoding: use the raw (featurized) embeddings.
        h = xx
        u = qq

    # Split off the first four context sentences and their masks.
    # NOTE(review): hard-codes exactly 4 sentences — requires M >= 4.
    h1 = h[:, 0, :, :]
    h2 = h[:, 1, :, :]
    h3 = h[:, 2, :, :]
    h4 = h[:, 3, :, :]
    n_1 = tf.reshape(self.x_mask[:, 0, :], [N, JX])
    n_2 = tf.reshape(self.x_mask[:, 1, :], [N, JX])
    n_3 = tf.reshape(self.x_mask[:, 2, :], [N, JX])
    n_4 = tf.reshape(self.x_mask[:, 3, :], [N, JX])

    if config.self_attention:
        with tf.variable_scope("h_self_weight"):
            print(h.get_shape())
            # Two stacked self-attention layers; weights are shared across the
            # four sentences via reuse_variables().
            for i in range(2):
                with tf.variable_scope("self-attention"):
                    h1 = self_attention_layer(
                        config,
                        self.is_train,
                        h1,
                        p_mask=tf.expand_dims(n_1, -1),
                        scope="{}_layer_self_att_enc_e".format(i))  # [N, len, dim]
                    tf.get_variable_scope().reuse_variables()
                    h2 = self_attention_layer(
                        config,
                        self.is_train,
                        h2,
                        p_mask=tf.expand_dims(n_2, -1),
                        scope="{}_layer_self_att_enc_e".format(i))
                    tf.get_variable_scope().reuse_variables()
                    h3 = self_attention_layer(
                        config,
                        self.is_train,
                        h3,
                        p_mask=tf.expand_dims(n_3, -1),
                        scope="{}_layer_self_att_enc_e".format(i))
                    tf.get_variable_scope().reuse_variables()
                    h4 = self_attention_layer(
                        config,
                        self.is_train,
                        h4,
                        p_mask=tf.expand_dims(n_4, -1),
                        scope="{}_layer_self_att_enc_e".format(i))
                with tf.variable_scope("self-attention"):
                    # Separate parameters ("_p" scope) for the query.
                    u = self_attention_layer(
                        config,
                        self.is_train,
                        u,
                        p_mask=tf.expand_dims(self.q_mask, -1),
                        scope="{}_layer_self_att_enc_p".format(i))

    if config.plot_encoder == "concate":
        # Concatenate the four sentences along the time axis: [N, 4*JX, 2d].
        h = tf.concat([h1, h2, h3, h4], axis=1)
        # NOTE(review): format string has no {} placeholder, so the shape is
        # never printed — likely meant "h concate shape {}".format(...).
        print("h concate shape".format(h.get_shape()))
        n_n = tf.concat([n_1, n_2, n_3, n_4], axis=1)  # NOTE(review): unused below
    elif config.plot_encoder == "sum":
        # Element-wise sum of the four sentence encodings.
        h1 = tf.expand_dims(h1, axis=1)
        h2 = tf.expand_dims(h2, axis=1)
        h3 = tf.expand_dims(h3, axis=1)
        h4 = tf.expand_dims(h4, axis=1)
        h = tf.concat([h1, h2, h3, h4], axis=1)
        h = tf.reduce_sum(h, axis=1)
        # NOTE(review): same missing-{} format issue as above.
        print("h sum shape".format(h.get_shape()))
    elif config.plot_encoder == "lstm":
        # Collapse each sentence to a vector (sum over features), then run a
        # second BiLSTM over the 4-step sentence sequence.
        # h1 = tf.reduce_sum(h1, axis=1)
        h1 = tf.expand_dims(tf.reduce_sum(h1, axis=-1), axis=1)
        h2 = tf.expand_dims(tf.reduce_sum(h2, axis=-1), axis=1)
        h3 = tf.expand_dims(tf.reduce_sum(h3, axis=-1), axis=1)
        h4 = tf.expand_dims(tf.reduce_sum(h4, axis=-1), axis=1)
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw2, d_cell_bw2, tf.concat([h1, h2, h3, h4], axis=1),
            dtype='float', scope='1')  # [N, J, d], [N, d]
        print('fw_u_f hsape:{}'.format(fw_u_f.get_shape()))
        h = tf.concat(axis=2, values=[fw_u, bw_u])  # [N,JQ,2d]
        # Query is collapsed and re-encoded with the SAME LSTM (scope '1').
        u = tf.expand_dims(tf.reduce_sum(u, axis=-1), axis=1)
        tf.get_variable_scope().reuse_variables()
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw2, d_cell_bw2, tf.concat([u], axis=1),
            dtype='float', scope='1')  # [N, J, d], [N, d]
        print('fw_u_f hsape:{}'.format(fw_u_f.get_shape()))
        u = tf.concat(axis=2, values=[fw_u, bw_u])  # [N,JQ,2d]

    if config.interact:
        with tf.variable_scope("interact"):

            def get_attention(h, u, m):
                # Outer (pairwise) element-wise product of every context step
                # with every query step.  NOTE(review): parameter m is unused.
                JX = tf.shape(h)[1]
                JQ = tf.shape(u)[1]
                h = tf.expand_dims(h, 2)
                u = tf.expand_dims(u, 1)
                h = tf.tile(h, [1, 1, JQ, 1])
                u = tf.tile(u, [1, JX, 1, 1])
                attention = h * u  # N,JX,JQ,2d
                return attention

            if config.plot_encoder == "concate":
                attention = get_attention(h, u, M)
            else:
                attention = get_attention(h, u, 1)
        with tf.variable_scope('conv_dense'):
            if config.plot_encoder == "concate":
                out_final = dense_net(config, attention, self.is_train)
            else:
                out_final = tf.reshape(attention, shape=[N, -1])
    else:
        # No interaction: flatten context and query and feed a 1000-400-300
        # relu MLP.
        h = tf.reshape(h, [-1, M * 2 * d * JX])
        print("h shape {}".format(h.get_shape()))
        u = tf.reshape(u, [-1, 2 * d * JQ])
        print("U shape {}".format(u.get_shape()))
        attention = tf.concat([h, u], axis=-1)
        out_final = attention
        out_final = linear(tf.concat([attention], axis=-1), 1000, True,
                           bias_start=0.0, scope="logit8", squeeze=False,
                           wd=config.wd,
                           input_keep_prob=config.output_keep_pro,
                           is_train=self.is_train)
        out_final = tf.nn.relu(out_final)
        out_final = linear(tf.concat([out_final], axis=-1), 400, True,
                           bias_start=0.0, scope="logit9", squeeze=False,
                           wd=config.wd,
                           input_keep_prob=config.output_keep_pro,
                           is_train=self.is_train)
        out_final = tf.nn.relu(out_final)
        out_final = linear(out_final, 300, True, bias_start=0.0,
                           scope="logit3", squeeze=False, wd=config.wd,
                           input_keep_prob=config.output_keep_pro,
                           is_train=self.is_train)
        out_final = tf.nn.relu(out_final)

    # Final 200-100 head; optionally concatenates an external feature vector.
    with tf.variable_scope('conv_dense'):
        if config.hao:
            out_final = linear(tf.concat(
                [out_final, self.haoruopeng_feature], axis=-1), 200, True,
                bias_start=0.0, scope="logit", squeeze=False, wd=config.wd,
                input_keep_prob=config.output_keep_pro,
                is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
            out_final = linear(out_final, 100, True, bias_start=0.0,
                               scope="logit3", squeeze=False, wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
        else:
            # NOTE(review): unlike the branch above, no relu between the
            # 200-unit and 100-unit layers here — confirm intended.
            out_final = linear(tf.concat([out_final], axis=-1), 200, True,
                               bias_start=0.0, scope="logit", squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = linear(out_final, 100, True, bias_start=0.0,
                               scope="logit3", squeeze=False, wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
    self.tensor_dict['outfinal'] = out_final
    # Single-unit linear output (raw score / logit) per example.
    self.prediction = linear(tf.concat([out_final], axis=-1), 1, True,
                             bias_start=0.0, scope="logit2", squeeze=False,
                             wd=config.wd,
                             input_keep_prob=config.output_keep_pro,
                             is_train=self.is_train)
def get_drnncell(self):
    """Return a fresh LSTM cell wrapped with switchable input dropout.

    The base cell has ``config.hidden_size`` units; the wrapper applies
    ``config.input_keep_prob`` dropout only while ``self.is_train`` is true.
    """
    cfg = self.config
    base_cell = BasicLSTMCell(cfg.hidden_size, state_is_tuple=True, reuse=False)
    return SwitchableDropoutWrapper(base_cell, self.is_train,
                                    input_keep_prob=cfg.input_keep_prob)
def _build_forward(self):
    """Build the forward graph: BiDAF-style span prediction plus an
    attention-based RNN decoder (multi-task: span indices + generation).

    Outputs set on ``self``:
      flat_yp / flat_yp2       -- softmax over flattened start/end span logits
      index_start / index_end  -- the raw flattened logits
      decoder_logits_train     -- decoder logits from tf.nn.raw_rnn; when
                                  ``self.pointer_gen`` is true they are
                                  attention distributions over context tokens
                                  (pointer network), otherwise vocab logits.

    NOTE(review): reconstructed from a whitespace-mangled source; nesting of
    the decoder under variable_scope("main") is the most plausible reading.
    """
    config = self.config
    # NOTE(review): in this variant VW is config.len_new_emb_mat (not the word
    # vocab size); EW and WOW are only used by the commented-out entity-emb code.
    N, M, JX, JQ, VW, VC, d, W, EW, WOW = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.len_new_emb_mat, config.char_vocab_size, config.hidden_size, \
        config.max_word_size, config.word_vocab_size - config.vw_wo_entity_size, config.vw_wo_entity_size
    # Replace the static config sizes with the actual dynamic shapes.
    JX = tf.shape(self.x)[2]  # words
    JQ = tf.shape(self.q)[1]  # words
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            # Char embedding table pinned to CPU.
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])
                # Per-height conv channel counts must sum to the char output dim.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the context conv filters for the query.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Random init (pre-trained / entity-embedding init is
                    # intentionally disabled below).
                    init_word_emb = tf.random_normal_initializer(-0.5, 0.5)
                    #entity_emb_mat = tf.get_variable("entity_emb_mat", dtype='float', shape=[EW, EW], initializer=get_initializer(config.onehot_encoded))
                    #entity_emb_out = _linear(entity_emb_mat, dw, True, bias_initializer=tf.constant_initializer(0.0))
                    #word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=init_word_emb)
                    #word_emb_mat = tf.concat(axis=0,values=[word_emb_mat, entity_emb_out])
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                #if config.use_glove_for_unk:
                #    word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])
            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(
                    word_emb_mat, self.x
                )  # [N, M, JX, d] i.e. [batch size, max sentences, max words, embedding size]
                Aq = tf.nn.embedding_lookup(
                    word_emb_mat, self.q
                )  # [N, JQ, d] i.e. [batch size, max words, embedding size]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
        # highway network (weights shared between context and query)
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx, config.highway_num_layers, True,
                                     wd=config.wd, is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq, config.highway_num_layers, True,
                                     wd=config.wd, is_train=self.is_train)
    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq
    #xx = tf.Print(xx,[tf.shape(xx),xx],message="DHRUV xx=",summarize=20)

    # Four independent BiLSTM cell pairs (prepro, g0, g1, g2), each with
    # train-time-only input dropout.
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(
        cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(
        cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(
        cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(
        cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(
        cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(
        cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(
        cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(
        cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # True (unpadded) lengths from the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N,M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw, d_cell_bw, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # Context encoded with the same LSTM weights as the query.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), (fw_s, bw_s) = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:  # not true
            # Per-step attention inside the RNN cells (dynamic attention).
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell_fw = AttentionCell(
                cell2_fw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            first_cell_bw = AttentionCell(
                cell2_bw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            second_cell_fw = AttentionCell(
                cell3_fw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            second_cell_bw = AttentionCell(
                cell3_bw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            # Static bi-attention between context and query.
            p0 = attention_layer(
                config, self.is_train, h, u, h_mask=self.x_mask,
                u_mask=self.q_mask, scope="p0",
                tensor_dict=self.tensor_dict)  # p0 seems to be G in paper
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw
        #p1 = tf.reshape(p0,[N , M*JX, 8*d])
        # Two modelling BiLSTM layers over the attended representation.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell_fw, first_cell_bw, p0, x_len, dtype='float',
            scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), (my_fw_final_state, my_bw_final_state) = bidirectional_dynamic_rnn(
            second_cell_fw, second_cell_bw, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])  # g1 seems to be M in paper
        #g1= tf.reshape(g1,[N, M , JX, 2*d]) #reshaping here again, since g1 is used ahead
        g1 = tf.Print(g1, [tf.shape(g1)], message="g1 shape", first_n=5, summarize=200)
        p0 = tf.Print(p0, [tf.shape(p0)], message="p0 shape", first_n=5, summarize=200)
        # Flatten sentences: g11 is the encoder memory for the decoder below.
        g11 = tf.reshape(g1, [N, -1, 2 * d])
        # fw/bw final LSTM states concatenated -> decoder initial state.
        my_encoder_final_state_c = tf.concat(
            values=(my_fw_final_state.c, my_bw_final_state.c), axis=1,
            name="my_encoder_final_state_c")
        my_encoder_final_state_h = tf.concat(
            values=(my_fw_final_state.h, my_bw_final_state.h), axis=1,
            name="my_encoder_final_state_h")
        my_encoder_final_state = tf.contrib.rnn.LSTMStateTuple(
            c=my_encoder_final_state_c, h=my_encoder_final_state_h)
        #compute indices for finding span as the second task in multi task learning
        logits = get_logits([g1, p0], d, True, wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train,
                            func=config.answer_func, scope='logits1')
        logits = tf.Print(logits, [tf.shape(logits)], message="logits shape",
                          first_n=5, summarize=200)
        # Attended summary of g1 under the start-logit distribution, tiled to
        # every position for the second modelling pass.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])
        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell4_fw, d_cell4_bw,
            tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
            x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train,
                             func=config.answer_func, scope='logits2')
        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_logits = tf.Print(flat_logits, [tf.shape(flat_logits), flat_logits],
                               message="flat_logits shape and contents",
                               first_n=5, summarize=200)
        self.flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        self.flat_yp2 = tf.nn.softmax(flat_logits2)

        # ---- attention-based decoder (optional pointer network) ----
        tgt_vocab_size = config.len_new_emb_mat  # hparam # FIXME: Obtain embeddings differently?
        print("length is", config.len_new_emb_mat)
        nodes = d
        # Look up embedding for the teacher-forced decoder inputs.
        decoder_emb_inp = tf.nn.embedding_lookup(
            word_emb_mat,
            self.decoder_inputs)  # [batch_size, max words, embedding_size]
        with tf.variable_scope("rnn_decoder", reuse=tf.AUTO_REUSE):
            init = tf.random_normal_initializer(0.0, 0.5)
            # Output projection and Bahdanau-attention parameters.
            W_dense = tf.get_variable(name="W_dense",
                                      shape=[2 * nodes, tgt_vocab_size],
                                      dtype=tf.float32, initializer=init)
            b_dense = tf.get_variable(name="b_dense",
                                      shape=[tgt_vocab_size],
                                      dtype=tf.float32,
                                      initializer=tf.zeros_initializer)
            W_att_dec = tf.get_variable(name="W_att_dec",
                                        shape=[2 * nodes, 2 * nodes],
                                        dtype=tf.float32, initializer=init)
            W_att_enc = tf.get_variable(name="W_att_enc1",
                                        shape=[1, 1, 2 * nodes, 2 * nodes],
                                        dtype=tf.float32, initializer=init)
            v_blend = tf.get_variable(name="v_blend",
                                      shape=[1, 2 * nodes],
                                      dtype=tf.float32, initializer=init)
            # Embedded PAD token fed once a sequence in the batch is finished.
            pad_time_slice = tf.fill([N], 0, name='PAD')
            pad_step_embedded = tf.nn.embedding_lookup(
                word_emb_mat, pad_time_slice)
            decoder_cell = tf.contrib.rnn.BasicLSTMCell(
                2 * nodes, state_is_tuple=True
            )  # doesnt work without the factor of 2??
            '''Loop transition function is a mapping (time, previous_cell_output, previous_cell_state, previous_loop_state) -> (elements_finished, input, cell_state, output, loop_state). It is called before RNNCell to prepare its inputs and state. Everything is a Tensor except for initial call at time=0 when everything is None (except time).'''

            def execute_pointer_network(attn_dist):
                #this is to find the word in the summary, which recieved highest probability and pass it to the next step in decoder
                index_pos = tf.argmax(attn_dist, axis=1)
                index_pos = tf.expand_dims(index_pos, 1)
                # Build [batch_idx, 0, word_idx] triples to gather the actual
                # word ids out of self.x.
                index_pos = tf.concat([
                    tf.reshape(tf.range(start=0, limit=N, dtype=tf.int64),
                               [N, 1]),
                    tf.zeros([N, 1], tf.int64), index_pos
                ], axis=1)
                index_pos = tf.cast(tf.gather_nd(params=self.x,
                                                 indices=index_pos),
                                    dtype=tf.int64)
                return index_pos

            def execute_normal_decoder(previous_output, W_dense, b_dense):
                # Standard greedy decoding: argmax over projected vocab logits.
                output_logits = tf.add(tf.matmul(previous_output, W_dense),
                                       b_dense)
                return tf.argmax(output_logits, axis=1)

            def loop_fn_initial():
                # raw_rnn initial call (time == 0).
                initial_elements_finished = (
                    0 >= self.target_sequence_length
                )  # all False at the initial step
                #initial_input = tf.concat([decoder_emb_inp[:,0], my_encoder_final_state_h], 1)
                initial_input = decoder_emb_inp[:, 0]
                initial_cell_state = my_encoder_final_state
                #setting the correct shapes , as it is used to determine the emit structure
                initial_cell_output = tf.cond(
                    self.pointer_gen,
                    lambda: tf.zeros([M * JX], tf.float32),
                    lambda: tf.zeros([2 * nodes], tf.float32))
                initial_loop_state = None  # we don't need to pass any additional information
                return (initial_elements_finished, initial_input,
                        initial_cell_state, initial_cell_output,
                        initial_loop_state)

            # Encoder memory reshaped for the 1x1 conv attention projection.
            encoder_output = tf.expand_dims(g11, axis=2)

            def loop_fn_transition(time, previous_output, previous_state,
                                   previous_loop_state):
                def get_next_input():
                    # compute Badhanau style attention
                    #performing convolution or reshaping input to (-1,2*d) and then doing matmul, is essentially the same operation
                    #see matrix_mult.py...conv2d might be faster??
                    #https://stackoverflow.com/questions/38235555/tensorflow-matmul-of-input-matrix-with-batch-data
                    encoder_features = tf.nn.conv2d(
                        encoder_output, W_att_enc, [1, 1, 1, 1], "SAME"
                    )  # shape (batch_size,max_enc_steps,1,attention_vec_size)
                    dec_portion = tf.matmul(previous_state.h, W_att_dec)
                    decoder_features = tf.expand_dims(
                        tf.expand_dims(dec_portion, 1), 1
                    )  # reshape to (batch_size, 1, 1, attention_vec_size)
                    #python broadcasting will alllow the two features to get added
                    e_not_masked = tf.reduce_sum(
                        v_blend * tf.nn.tanh(encoder_features + decoder_features),
                        [2, 3])  # calculate e, (batch_size, max_enc_steps)
                    #The shape of output of a softmax is the same as the input: it just normalizes the values.
                    attn_dist = tf.nn.softmax(
                        e_not_masked)  # (batch_size, max_enc_steps)
                    attn_dist = tf.Print(attn_dist, [tf.shape(attn_dist)],
                                         message="attn_dist", first_n=5,
                                         summarize=200)
                    #Multiplying all the 2d vectors with same attn_dist values,and finally keeping 1 2d vector for every batch example
                    context_vector = tf.reduce_sum(
                        tf.reshape(attn_dist, [N, -1, 1, 1]) * encoder_output,
                        [1, 2])  # shape (batch_size, attn_size).
                    context_vector = tf.reshape(context_vector, [-1, 2 * nodes])
                    #next_input = tf.cond(self.is_train, lambda: tf.concat(
                    #    [tf.reshape(decoder_emb_inp[:, time], (N, dw)), context_vector], 1),
                    #    lambda: tf.concat([tf.nn.embedding_lookup(word_emb_mat, prediction), context_vector], 1))
                    #output_logits = tf.add(tf.matmul(previous_output, W_dense), b_dense)
                    prediction = tf.cond(
                        self.pointer_gen,
                        lambda: execute_pointer_network(attn_dist),
                        lambda: execute_normal_decoder(
                            previous_output, W_dense, b_dense))
                    # Next input: teacher forcing at train time, previous
                    # prediction at inference; both blended with the context
                    # vector through a shared linear layer.
                    with tf.variable_scope("modified_dec_inputs",
                                           reuse=tf.AUTO_REUSE):
                        next_input = tf.cond(
                            self.is_train,
                            lambda: _linear(args=[context_vector] + [
                                tf.reshape(decoder_emb_inp[:, time], (N, dw))
                            ], output_size=dw, bias=True),
                            lambda: _linear([context_vector] + [
                                tf.nn.embedding_lookup(
                                    word_emb_mat, prediction)
                            ], dw, True))
                    return next_input, attn_dist

                elements_finished = (
                    time >= self.target_sequence_length
                )  # this operation produces boolean tensor of [batch_size]
                # defining if corresponding sequence has ended
                finished = tf.reduce_all(
                    elements_finished)  # -> boolean scalar
                #input = tf.cond(finished, lambda: tf.concat([pad_step_embedded, my_encoder_final_state_h], 1),get_next_input)
                input, attn_distribution = tf.cond(
                    finished,
                    lambda: (pad_step_embedded,
                             tf.zeros([N, M * JX], tf.float32)),
                    get_next_input)
                attn_distribution = tf.Print(attn_distribution,
                                             [tf.shape(attn_distribution)],
                                             message="attn_distribution",
                                             first_n=5, summarize=200)
                state = previous_state
                # Emit the attention distribution when pointing, else the raw
                # cell output.
                output = tf.cond(self.pointer_gen,
                                 lambda: attn_distribution,
                                 lambda: previous_output)
                output = tf.Print(output, [tf.shape(output)],
                                  message="OUTPUT", first_n=5, summarize=200)
                loop_state = None
                return (elements_finished, input, state, output, loop_state)

            def loop_fn(time, previous_output, previous_state,
                        previous_loop_state):
                # Dispatch between the initial call and all later steps.
                if previous_state is None:  # time == 0
                    assert previous_output is None and previous_state is None
                    return loop_fn_initial()
                else:
                    return loop_fn_transition(time, previous_output,
                                              previous_state,
                                              previous_loop_state)

            decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(
                decoder_cell, loop_fn)
            decoder_outputs = decoder_outputs_ta.stack()
            decoder_outputs = tf.Print(decoder_outputs,
                                       [tf.shape(decoder_outputs)],
                                       message="decoder_outputs", first_n=5,
                                       summarize=200)
            # To do output projection, we have to temporarilly flatten decoder_outputs from [max_steps, batch_size, hidden_dim] to
            # [max_steps*batch_size, hidden_dim], as tf.matmul needs rank-2 tensors at most.
            decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(
                tf.shape(decoder_outputs))
            decoder_outputs_flat = tf.reshape(decoder_outputs,
                                              (-1, decoder_dim))
            #if pointer networks, no need to pass through dense layer
            decoder_logits_flat = tf.cond(
                self.pointer_gen, lambda: decoder_outputs_flat,
                lambda: tf.add(
                    tf.matmul(decoder_outputs_flat, W_dense), b_dense))
            decoder_logits = tf.cond(
                self.pointer_gen,
                lambda: tf.reshape(
                    decoder_logits_flat,
                    (decoder_max_steps, decoder_batch_size, decoder_dim)),
                lambda: tf.reshape(decoder_logits_flat,
                                   (decoder_max_steps, decoder_batch_size,
                                    tgt_vocab_size)))
            # raw_rnn emits time-major output; convert back to batch-major.
            decoder_logits = _transpose_batch_time(decoder_logits)
            #decoder_prediction = tf.argmax(decoder_logits, -1)
            #self.decoder_logits_train = final_outputs.rnn_output
            self.decoder_logits_train = decoder_logits
            self.index_start = flat_logits
            self.index_end = flat_logits2
def _build_forward(self):
    """Build the forward graph: embeddings -> highway -> contextual BiLSTMs ->
    bi-attention -> (optional ruminating layer) -> modeling layers -> start/end
    span logits.

    Reads placeholders/attributes set elsewhere on self (x, q, cx, cq, x_mask,
    q_mask, is_train, new_emb_mat, tensor_dict) and writes outputs to
    self.logits / self.logits2 / self.yp / self.yp2 / self.wyp (and the
    no-answer tensors when config.na is set).
    """
    config = self.config
    # Static config sizes; N (batch), VW/VC (vocab sizes), d (hidden), W (max word len).
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # Override M/JX/JQ with the *dynamic* shapes of the fed batch so the graph
    # handles variable sentence/question lengths.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            # Character embedding table lives on CPU (large lookup tables).
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cq)  # [N, JQ, W, dc]
                # Flatten batch/sentence dims so the char-CNN sees rank-4 input.
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])
                # Per-filter output channels and kernel heights, e.g. "100" / "5".
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                # Channels of all conv filters must add up to the char-CNN output size.
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx,
                                      filter_sizes,
                                      heights,
                                      "VALID",
                                      self.is_train,
                                      config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" conv variables for the question branch.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                    else:
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="qq")
                    # Restore the batch/sentence structure flattened above.
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Initialize from pretrained embeddings (config.emb_mat);
                    # at non-train time the variable is restored from checkpoint.
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # Append the non-trainable extra embedding rows so OOV ids
                    # past VW resolve to fixed GloVe vectors.
                    word_emb_mat = tf.concat(
                        axis=0, values=[word_emb_mat, self.new_emb_mat])
            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat,
                                            self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat,
                                            self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Concatenate char-CNN and word embeddings along the feature axis.
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)
            # Question passes through the *same* highway weights.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # One fw/bw LSTM pair per layer; SwitchableDropoutWrapper applies input
    # dropout only when is_train is true.
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(
        cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(
        cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(
        cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(
        cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(
        cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(
        cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(
        cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(
        cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)

    # True token counts derived from the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        # Question encoder; final states fw_u_f/bw_u_f are captured but unused here.
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw, d_cell_bw, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # Context encoder reuses the question encoder's 'u1' weights
            # (note: the non-dropout cells are passed here).
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Attention computed inside the recurrent cells: tile the question
            # encoding per sentence and wrap the modeling cells in AttentionCell.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell_fw = AttentionCell(
                cell2_fw,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            first_cell_bw = AttentionCell(
                cell2_bw,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            second_cell_fw = AttentionCell(
                cell3_fw,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            second_cell_bw = AttentionCell(
                cell3_bw,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            # Standard BiDAF bi-attention producing the query-aware context p0.
            p0 = attention_layer(config,
                                 self.is_train,
                                 h,
                                 u,
                                 h_mask=self.x_mask,
                                 u_mask=self.q_mask,
                                 scope="p0",
                                 tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw

        # NOTE(review): this unconditionally overrides the config flag, so the
        # ruminating layer is ALWAYS built and any user-provided
        # config.ruminating_layer=False is ignored — confirm this is intended
        # and not a debugging leftover.
        config.ruminating_layer = True

        if config.ruminating_layer:
            '''
            RUMINATING LAYER
            '''
            with tf.variable_scope('rum_layer'):
                print('-' * 5 + "RUMINATING LAYER" + '-' * 5)
                print("Context", xx)  #[N,M,JX,2d]
                print("Question", qq)  #[N,JQ,2d]
                print("p0", p0)  #[N,M,JX,8D]
                # Summarization: one BiLSTM (same cell reused fw and bw) over p0;
                # the last valid output of each direction summarizes each sentence.
                sum_cell = BasicLSTMCell(d, state_is_tuple=True)
                (s_f, s_b), _ = bidirectional_dynamic_rnn(sum_cell,
                                                          sum_cell,
                                                          p0,
                                                          x_len,
                                                          dtype=tf.float32,
                                                          scope="sum_layer")
                batch_lens = (tf.reshape(x_len, [N * M]))
                s_f = tf.reshape(s_f, [N * M, JX, d])
                s_b = tf.reshape(s_b, [N * M, JX, d])
                # extract_axis_1 picks the output at each sequence's last step.
                s_fout = tf.reshape(extract_axis_1(s_f, batch_lens),
                                    [N, M, d])
                s_bout = tf.reshape(extract_axis_1(s_b, batch_lens),
                                    [N, M, d])
                s = tf.concat(axis=2, values=[s_fout, s_bout])  # [N,M,2d]
                print("summarization layer", s)

                print('-' * 5 + "QUESTION RUMINATE LAYER" + '-' * 5)
                # Broadcast the summary across question positions, encode, and
                # gate it against the question representation.
                S_Q = tf.tile(tf.expand_dims(s, 2),
                              [1, 1, JQ, 1])  # [N,M,JQ,2d]
                S_cell_fw = BasicLSTMCell(d, state_is_tuple=True)
                S_cell_bw = BasicLSTMCell(d, state_is_tuple=True)
                (fw_hq, bw_hq), _ = bidirectional_dynamic_rnn(S_cell_fw,
                                                              S_cell_bw,
                                                              S_Q,
                                                              q_len,
                                                              dtype=tf.float32,
                                                              scope="S_Q")
                S_Q = tf.concat(axis=3, values=[fw_hq, bw_hq])
                q_m = tf.reshape(tf.expand_dims(qq, 1), [N, M, JQ, 2 * d])
                with tf.variable_scope("question_rum_layer"):
                    Q_hat = ruminating_layer(S_Q, q_m, N, M, JQ, d)
                print("Q_hat", Q_hat)  #[N,M,JQ,2d]

                print('-' * 5 + "CONTEXT RUMINATE LAYER" + '-' * 5)
                # Same construction over context positions.
                S_C = tf.tile(tf.expand_dims(s, 2),
                              [1, 1, JX, 1])  # [N,M,JX,2d]
                C_cell_fw = BasicLSTMCell(d, state_is_tuple=True)
                C_cell_bw = BasicLSTMCell(d, state_is_tuple=True)
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(C_cell_fw,
                                                            C_cell_bw,
                                                            S_C,
                                                            x_len,
                                                            dtype=tf.float32,
                                                            scope="S_C")
                S_C = tf.concat(axis=3, values=[fw_h, bw_h])  #[N,M,JX,2d]
                with tf.variable_scope("context_rum_layer"):
                    C_hat = ruminating_layer(S_C, xx, N, M, JX, d)
                print("C_hat", C_hat)  #[N,M,JX,2d]

                #Second Hop bi-Attention
                print('-' * 5 + "SECOND HOP ATTENTION" + '-' * 5)
                # Pairwise (context-position x question-position) augmentation
                # for a second bi-attention over C_hat/Q_hat.
                sh_aug = tf.tile(tf.expand_dims(C_hat, 3),
                                 [1, 1, 1, JQ, 1])  #[N,M,JX,2d]
                su_aug = tf.tile(tf.expand_dims(Q_hat, 2),
                                 [1, 1, JX, 1, 1])  #[N,M,JQ,2d]
                sh_mask_aug = tf.tile(tf.expand_dims(self.x_mask, -1),
                                      [1, 1, 1, JQ])
                su_mask_aug = tf.tile(
                    tf.expand_dims(tf.expand_dims(self.q_mask, 1), 1),
                    [1, M, JX, 1])
                shu_mask = sh_mask_aug & su_mask_aug
                # NOTE(review): is_train=True is hard-coded here while every
                # other get_logits call passes self.is_train — confirm this is
                # deliberate (it keeps dropout-style behavior on at eval time).
                su_logits = get_logits([sh_aug, su_aug],
                                       None,
                                       True,
                                       wd=config.wd,
                                       mask=shu_mask,
                                       is_train=True,
                                       func=config.logit_func,
                                       scope='su_logits')
                # Context-to-question and question-to-context attended vectors.
                su_a = softsel(su_aug, su_logits)
                sh_a = softsel(C_hat, tf.reduce_max(su_logits, 3))
                sh_a = tf.tile(tf.expand_dims(sh_a, 2), [1, 1, JX, 1])
                p00 = tf.concat(
                    axis=3,
                    values=[C_hat, su_a, C_hat * su_a, C_hat * sh_a])
                print("p00", p00)  #[N,M,JX,8d]
                # Replace the first-hop attention output with the second-hop one.
                p0 = p00
                print('-' * 5 + "END RUMINATING LAYER" + '-' * 5)

        # Modeling layers: two stacked BiLSTMs over the attended context.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell_fw,
            first_cell_bw,
            p0,
            x_len,
            dtype='float',
            scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            second_cell_fw,
            second_cell_bw,
            g0,
            x_len,
            dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

        # Start-position logits over all (sentence, token) slots.
        logits = get_logits([g1, p0],
                            d,
                            True,
                            wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask,
                            is_train=self.is_train,
                            func=config.answer_func,
                            scope='logits1')
        # Soft-selected start representation, tiled to every position so the
        # end-pointer layer can condition on the predicted start.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                      [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell4_fw,
            d_cell4_bw,
            tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
            x_len,
            dtype='float',
            scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        # End-position logits.
        logits2 = get_logits([g2, p0],
                             d,
                             True,
                             wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask,
                             is_train=self.is_train,
                             func=config.answer_func,
                             scope='logits2')

        # Flatten (sentence, token) into one axis and normalize.
        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        if config.na:
            # "No answer" option: prepend a learned bias logit so position 0 of
            # the softmax is the no-answer probability.
            na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
            na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]),
                                    [N, 1])  # [N, 1]
            concat_flat_logits = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits])
            concat_flat_yp = tf.nn.softmax(concat_flat_logits)
            na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]),
                                 [1])
            # Drop the no-answer slot to recover the per-position distribution.
            flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

            concat_flat_logits2 = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits2])
            concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
            na_prob2 = tf.squeeze(
                tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
            flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

            self.concat_logits = concat_flat_logits
            self.concat_logits2 = concat_flat_logits2
            # Joint no-answer probability from the start and end heads.
            self.na_prob = na_prob * na_prob2

        yp = tf.reshape(flat_yp, [-1, M, JX])
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        # Sigmoid over end logits (used by word-level answer scoring).
        wyp = tf.nn.sigmoid(logits2)

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
        self.wyp = wyp