def _answer_layer(self, doc, candidates):
    """Score every candidate against the document vector with a dot product.

    Shapes, as noted by the original author: ``doc`` is [N, dim] and
    ``candidates`` is [N, num_candidates, dim].

    The candidates are passed through the highway network and a dense
    projection onto ``doc``'s last dimension, multiplied with ``doc`` and
    the result is reshaped to two dimensions ([N, num_candidates]).
    """
    highway_out = nn.highway_network(
        candidates, self.config.highway_layers, True, is_train=self.is_train)
    # Project the candidates to the (static) width of the document vector.
    doc_width = doc.get_shape().as_list()[-1]
    projected = tf.layers.dense(highway_out, doc_width)

    # Insert a singleton axis so the document acts as a one-row matrix.
    doc_rows = tf.shape(doc)[0]
    doc_cols = tf.shape(doc)[1]
    doc_as_row = tf.reshape(doc, [doc_rows, 1, doc_cols])

    scores = tf.matmul(doc_as_row, projected, transpose_b=True)  # [N, 1, num_candidates]

    # Drop the singleton axis again.
    score_rows = tf.shape(scores)[0]
    score_cols = tf.shape(scores)[2]
    return tf.reshape(scores, [score_rows, score_cols])
def _build_forward(self):
    """Build the forward graph of the two-way (pairwise) classifier.

    Steps, in graph-construction order:
      1. look up word embeddings for ``self.x`` and ``self.q`` ("emb");
      2. optionally run both through a shared highway network ("highway");
      3. encode both with bidirectional LSTMs ("prepro");
      4. call ``attention_layer`` to obtain ``p0`` and ``p1`` and run both
         through one bidirectional LSTM ("main/two_lstm");
      5. reduce each result with max over axis 2 followed by sum over
         axis 1, and feed ``[g0, g1, |g0 - g1|, g0 * g1]`` to a 2-unit
         linear layer ("logits1").

    Sets ``self.logits`` (reshaped to [N, 2]) and ``self.yp`` (its softmax)
    and records intermediate tensors in ``self.tensor_dict``.

    Raises:
        NotImplementedError: if ``config.dynamic_att`` is set. That branch
            never produced ``p1`` and therefore always failed with an
            unbound-variable error further down.

    NOTE(review): the indentation of this file was lost; the variable-scope
    nesting below was reconstructed and should be checked against existing
    checkpoints. ``tf.concat`` is called with the (dim, values) argument
    order, exactly as in the original.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # The length dimensions are re-read from the fed tensors, overriding
    # the configured maxima unpacked above.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Only training supplies an explicit initializer.
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows below the main matrix.
                    word_emb_mat = tf.concat(
                        0, [word_emb_mat, self.new_emb_mat])
                print(word_emb_mat.get_shape().as_list())

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq

            xx = Ax
            qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            # The second call reuses the variables created by the first.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    cell = BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(2, [fw_u, bw_u])
        if config.share_lstm_weights:
            # Encode x with the weights already created under 'u1'.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Only attention_layer yields the second stream (p1) that the
            # 'two_lstm' stage requires, so fail with a clear message
            # instead of an unbound-variable error later on.
            raise NotImplementedError(
                "config.dynamic_att is not supported by this model: the "
                "dynamic-attention path does not produce p1")
        p0, p1 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
        first_cell = d_cell

        with tf.variable_scope('two_lstm'):
            # NOTE(review): the feature width 300 is hard-coded here;
            # presumably it must match what attention_layer returns.
            p0 = tf.reshape(p0, [N, 1, -1, 300])
            p1 = tf.reshape(p1, [N, 1, -1, 300])
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(3, [fw_g0, bw_g0])
            # Repeat the question lengths M times along a new second axis.
            q_len_new = tf.tile(tf.expand_dims(q_len, 1), [1, M])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
            # Same scope name ('g0') as the p0 pass above.
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p1, q_len_new, dtype='float', scope='g0')  # [N, M, JX, 2d]
            g1 = tf.concat(3, [fw_g1, bw_g1])

        # Max over axis 2, then sum over axis 1, for each stream.
        g0 = tf.reduce_sum(tf.reduce_max(g0, 2), 1)
        g1 = tf.reduce_sum(tf.reduce_max(g1, 2), 1)
        logits = _linear([g0, g1, tf.abs(tf.subtract(g0, g1)), g0 * g1], 2,
                         bias=0.01, bias_start=0.0, scope='logits1')
        flat_logits2 = tf.reshape(logits, [N, 2])
        yp = tf.nn.softmax(flat_logits2)

        self.tensor_dict['g0'] = g0
        self.tensor_dict['g1'] = g1
        self.logits = flat_logits2
        self.yp = yp
def _build_forward(self):
    """Build the forward graph: embeddings, encoders, self-match, pointer decoder.

    Steps, in graph-construction order:
      1. optional char-CNN and word embeddings, optional exact-match
         feature ("emb"), optional shared highway network ("highway");
      2. bidirectional encoders for question and context ("prepro");
      3. attention plus ``config.LSTM_num_layers`` bidirectional RNN
         passes ("main");
      4. a self-matching ``AttentionCell`` RNN ("SelfMatch") whose outputs
         and final states become the decoder's encoder inputs;
      5. ``ptr_decoder`` driven by freshly created ``answer_string*``
         placeholders ("output").

    Sets ``self.decoder_train_logits``, ``self.decoder_train_softmax``,
    ``self.decoder_inference``, the ``answer_string*`` attributes and
    entries of ``self.tensor_dict``. ``self.yp`` and ``self.yp2`` are
    filled with -1.

    The model assumes a single sentence per example (M == 1). The original
    code created ``tf.assert_equal(M, 1)`` but discarded the returned op,
    so the check never executed; it is now attached as a control
    dependency of the flattened answer tensors that feed the decoder.

    NOTE(review): the indentation of this file was lost; the variable-scope
    nesting below (in particular "SelfMatch" and "output" living inside
    "main") was reconstructed and should be checked against existing
    checkpoints.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    JA = config.max_answer_length
    # The length dimensions are re-read from the fed tensors, overriding
    # the configured maxima unpacked above.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        # Char-CNN Embedding
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable(
                    "char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Both settings are comma-separated integer lists.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Question characters reuse the context filters.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word Embedding
        if config.use_word_emb:
            with tf.variable_scope("emb_var") as scope, tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", shape=[VW, dw], dtype='float')
                tf.get_variable_scope().reuse_variables()
                # Keep the scope object on the instance.
                self.word_emb_scope = scope
                if config.use_glove_for_unk:
                    word_emb_mat = tf.concat(
                        [word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq

            # Concat Char-CNN Embedding and Word Embedding
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

        # exact match
        if config.use_exact_match:
            emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
            xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
            emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
            qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]

    # 2 layer highway network on Concat Embedding
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Bidirection-LSTM (3rd layer on paper)
    cell = GRUCell(d) if config.GRU else BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), _ = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat([fw_u, bw_u], 2)
        if config.share_lstm_weights:
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    # Attention Flow Layer (4th layer on paper)
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell, u, size=d, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # Modeling layer (5th layer on paper)
        # tp0 keeps the pre-modeling tensor; it is handed to the decoder.
        tp0 = p0
        for layer_idx in range(config.LSTM_num_layers - 1):
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float',
                scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
            p0 = tf.concat([fw_g0, bw_g0], 3)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

        # Self match layer
        with tf.variable_scope("SelfMatch"):
            s0 = tf.reshape(g1, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
            x_mask = tf.reshape(self.x_mask, [N * M, JX])
            # The attention cell attends over the sequence it also reads.
            first_cell = AttentionCell(cell, s0, size=d, mask=x_mask,
                                       is_train=self.is_train)
            (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
                first_cell, first_cell, s0, x_len, dtype='float', scope='s')  # [N, M, JX, 2d]
            s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1

        # prepare for PtrNet
        encoder_output = tf.expand_dims(s1, 1)  # [N, M, JX, 2d]
        # Multiply by the context mask cast to float.
        encoder_output = tf.expand_dims(
            tf.cast(self.x_mask, tf.float32), -1) * encoder_output  # [N, M, JX, 2d]

        # Join the forward and backward final states along axis 1.
        if config.GRU:
            encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1,
                                            name='encoder_concat')
        else:
            if isinstance(fw_s_f, LSTMStateTuple):
                encoder_state_c = tf.concat((fw_s_f.c, bw_s_f.c), 1,
                                            name='encoder_concat_c')
                encoder_state_h = tf.concat((fw_s_f.h, bw_s_f.h), 1,
                                            name='encoder_concat_h')
                encoder_state_final = LSTMStateTuple(c=encoder_state_c,
                                                     h=encoder_state_h)
            elif isinstance(fw_s_f, tf.Tensor):
                encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1,
                                                name='encoder_concat')
            else:
                encoder_state_final = None
                tf.logging.error("encoder_state_final not set")
        print("encoder_state_final:", encoder_state_final)

        with tf.variable_scope("output"):
            # currently dynamic M is not supported, thus we assume M==1
            m_is_one = tf.assert_equal(
                M, 1, message="dynamic M is not supported; expected M == 1")

            answer_string = tf.placeholder(
                shape=(N, 1, JA + 1), dtype=tf.int32,
                name='answer_string')  # [N, M, JA + 1]
            answer_string_mask = tf.placeholder(
                shape=(N, 1, JA + 1), dtype=tf.bool,
                name='answer_string_mask')  # [N, M, JA + 1]
            answer_string_length = tf.placeholder(
                shape=(N, 1), dtype=tf.int32,
                name='answer_string_length',
            )  # [N, M]
            self.tensor_dict['answer_string'] = answer_string
            self.tensor_dict['answer_string_mask'] = answer_string_mask
            self.tensor_dict['answer_string_length'] = answer_string_length
            self.answer_string = answer_string
            self.answer_string_mask = answer_string_mask
            self.answer_string_length = answer_string_length

            # An assert op only runs if something depends on it, so tie it
            # to the flattened answer tensors, one of which is a decoder input.
            with tf.control_dependencies([m_is_one]):
                answer_string_flattened = tf.reshape(
                    answer_string, [N * M, JA + 1])
                answer_string_length_flattened = tf.reshape(
                    answer_string_length, [N * M])
            self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
            print("answer_string_flattened:", answer_string_flattened)
            self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
            print("answer_string_length_flattened:",
                  answer_string_length_flattened)

            decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(
                2 * d, state_is_tuple=True)

            with tf.variable_scope("Decoder"):
                decoder_train_logits = ptr_decoder(
                    decoder_cell,
                    tf.reshape(tp0, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    tf.reshape(encoder_output, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    encoder_final_state=encoder_state_final,
                    max_encoder_length=config.sent_size_th,
                    decoder_output_length=answer_string_length_flattened,  # [N * M]
                    batch_size=N,  # N * M (M=1)
                    attention_proj_dim=self.config.decoder_proj_dim,
                    scope='ptr_decoder'
                )  # [batch_size, dec_len*, enc_seq_len + 1]

            self.decoder_train_logits = decoder_train_logits
            print("decoder_train_logits:", decoder_train_logits)
            self.decoder_train_softmax = tf.nn.softmax(
                self.decoder_train_logits)
            self.decoder_inference = tf.argmax(
                decoder_train_logits, axis=2,
                name='decoder_inference')  # [N, JA + 1]

            # yp and yp2 are filled with -1 rather than computed here.
            self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
            self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
def _build_forward(self):
    """Build the forward graph of the span-prediction model.

    Steps, in graph-construction order:
      1. optional char-CNN and word embeddings ("emb"), optional shared
         highway network ("highway");
      2. bidirectional encoders for question and context ("prepro");
      3. attention, two modeling RNN passes (g0, g1), first logits,
         a third RNN pass (g2) and second logits ("main");
      4. optional no-answer bias (``config.na``) prepended to both logit
         vectors before the softmax.

    Sets ``self.logits``, ``self.logits2``, ``self.yp``, ``self.yp2`` and
    ``self.wyp``; with ``config.na`` also ``self.concat_logits``,
    ``self.concat_logits2`` and ``self.na_prob``. Intermediate tensors are
    stored in ``self.tensor_dict``.

    NOTE(review): the indentation of this file was lost; the variable-scope
    nesting below was reconstructed and should be checked against existing
    checkpoints.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # The length dimensions are re-read from the fed tensors, overriding
    # the configured maxima unpacked above.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Both settings are comma-separated integer lists.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Question characters reuse the context filters.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # NOTE(review): the initializer is passed as a class,
                    # not an instance — confirm the TensorFlow version in
                    # use accepts that.
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw], initializer=tf.random_normal_initializer)
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows below the main matrix.
                    word_emb_mat = tf.concat(
                        axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq

            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # The second call reuses the variables created by the first.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Four forward/backward LSTM cell pairs, each with a dropout-wrapped
    # twin. Every cell is constructed with the same explicit name.
    cell_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    cell_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    d_cell_fw = SwitchableDropoutWrapper(
        cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(
        cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    cell2_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    d_cell2_fw = SwitchableDropoutWrapper(
        cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(
        cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    cell3_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    d_cell3_fw = SwitchableDropoutWrapper(
        cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(
        cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    cell4_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
    d_cell4_fw = SwitchableDropoutWrapper(
        cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(
        cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # Encode the context under the same 'u1' scope, with reuse on.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Attention is computed inside the RNN cells: tile the question
            # encoding and mask M times and wrap the second and third cell
            # pairs in AttentionCell.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell_fw = AttentionCell(
                cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            first_cell_bw = AttentionCell(
                cell2_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_fw = AttentionCell(
                cell3_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_bw = AttentionCell(
                cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            # Attention is computed up front; the modeling RNNs use the
            # dropout-wrapped cells.
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

        # First set of logits, computed from g1 and p0.
        logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
        # softsel combines g1 with the first logits; the result is tiled
        # back over the M and JX axes so it can be concatenated with
        # p0 and g1.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell4_fw, d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]), x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        # Second set of logits, computed from g2 and p0.
        logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)

        if config.na:
            # A single scalar bias is prepended to both logit vectors;
            # after the softmax, column 0 is sliced off as the no-answer
            # probability and the rest replaces flat_yp / flat_yp2.
            na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
            na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]), [N, 1])  # [N, 1]
            concat_flat_logits = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits])
            concat_flat_yp = tf.nn.softmax(concat_flat_logits)
            na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]), [1])
            flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

            concat_flat_logits2 = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits2])
            concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
            na_prob2 = tf.squeeze(
                tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
            flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

            self.concat_logits = concat_flat_logits
            self.concat_logits2 = concat_flat_logits2
            # Product of the two no-answer probabilities.
            self.na_prob = na_prob * na_prob2

        yp = tf.reshape(flat_yp, [-1, M, JX], name="yp")
        yp2 = tf.reshape(flat_yp2, [-1, M, JX], name="yp2")
        wyp = tf.nn.sigmoid(logits2, name="wyp")

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
        self.wyp = wyp
def _build_forward(self):
    """Build the forward graph of the NMN-based span-prediction model.

    Steps, in graph-construction order:
      1. optional char-CNN and word embeddings ("emb"), optional shared
         highway network ("highway");
      2. cuDNN bidirectional encoders for question and context ("prepro");
      3. ``NMN_Model`` reasoning, optional self-attention, two further
         encoder passes and two sets of span logits ("main");
      4. for ``config.dataset == 'hotpotqa'``, two extra 2-way heads
         (``predict_type`` and ``predict_yesno``).

    Sets ``self.snmn``, ``self.u_weights``, ``self.module_prob_list``,
    ``self.logits``, ``self.logits2``, ``self.yp``, ``self.yp2`` and
    ``self.wyp``; depending on the config also ``self.hop0_logits``,
    ``self.predict_type``, ``self.predict_yesno``, ``self.yp3`` and
    ``self.yp3_yesno``. Intermediate tensors go to ``self.tensor_dict``.

    Raises:
        NotImplementedError: if ``config.reasoning_layer`` is not 'snmn'
            (no other branch defines ``g0``, which every later stage
            consumes), or if ``config.nmn_qtype_class`` is neither
            'mem_last' nor 'ctrl_st' for the hotpotqa dataset.

    NOTE(review): the indentation of this file was lost; the variable-scope
    nesting below (for example which statements live inside 'g0', 'g1',
    'g2' and 'g3') was reconstructed and should be checked against existing
    checkpoints. The context tensors are squeezed on axis 1 throughout,
    which presumably requires M == 1.
    """
    config = self.config
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # The length dimensions are re-read from the fed tensors, overriding
    # the configured maxima unpacked above.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable(
                    "char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Both settings are comma-separated integer lists.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Question characters reuse the context filters.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(self.emb_mat))
                else:
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows below the main matrix.
                    word_emb_mat = tf.concat(
                        axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq

            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train,
                                 input_keep_prob=config.highway_keep_prob)
            # The second call reuses the variables created by the first.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train,
                                 input_keep_prob=config.highway_keep_prob)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    with tf.variable_scope("prepro"):
        with tf.variable_scope('u1'):
            u, _ = bi_cudnn_rnn_encoder(
                'lstm', config.hidden_size, 1, 1 - config.input_keep_prob,
                qq, q_len, self.is_train)
            if config.reasoning_layer == 'snmn':
                # Self-attention over the question encoding; the result is
                # handed to NMN_Model and the question-type head.
                u_st = zhong_selfatt(u[:, ax, :, :], config.hidden_size * 2,
                                     seq_len=q_len, transform='squeeze')

        # The context encoder takes 3-D input, so axis 1 is squeezed away
        # before the call and re-inserted afterwards.
        if config.share_lstm_weights:
            with tf.variable_scope('u1', reuse=True):
                h, _ = bi_cudnn_rnn_encoder(
                    'lstm', config.hidden_size, 1, 1 - config.input_keep_prob,
                    tf.squeeze(xx, axis=1), tf.squeeze(x_len, axis=1),
                    self.is_train)
                h = h[:, ax, :, :]
        else:
            with tf.variable_scope('h1'):
                h, _ = bi_cudnn_rnn_encoder(
                    'lstm', config.hidden_size, 1, 1 - config.input_keep_prob,
                    tf.squeeze(xx, axis=1), tf.squeeze(x_len, axis=1),
                    self.is_train)
                h = h[:, ax, :, :]

        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        ### Reconstruct before bidaf because otherwise we need to build a larger query tensor.
        x_mask = self.x_mask
        x_len_squeeze = tf.squeeze(x_len, axis=1)
        p0 = h

        ### Main model
        if config.reasoning_layer == 'snmn':
            module_names = ['_Find', '_Compare', '_Relocate', '_NoOp']
            self.snmn = NMN_Model(config, u, qq, u_st, self.q_mask, q_len,
                                  p0, x_mask, x_len, module_names,
                                  self.is_train)
            self.u_weights = self.snmn.cv_list  # question word distribution at each step
            self.module_prob_list = self.snmn.module_prob_list  # module probability at each step
            g0 = tf.squeeze(self.snmn.att_second, axis=-1)
        else:
            # Every later stage consumes g0 (and self.snmn), which only the
            # 'snmn' branch provides; fail with a clear message instead of
            # an unbound-variable error further down.
            raise NotImplementedError(
                "unsupported config.reasoning_layer: {!r} (only 'snmn' is "
                "implemented)".format(config.reasoning_layer))

        if config.supervise_bridge_entity:
            self.hop0_logits = self.snmn.bridge_logits

        if config.self_att:
            with tf.variable_scope('g0'):
                g0, _ = bi_cudnn_rnn_encoder(
                    'lstm', config.hidden_size, 1, 1 - config.input_keep_prob,
                    tf.squeeze(g0, axis=1), x_len_squeeze, self.is_train)
                g0 = g0[:, ax, :, :]
            # The context attends to itself: g0 is passed as both arguments.
            g0 = hotpot_biattention(config, self.is_train, g0,
                                    tf.squeeze(g0, axis=1),
                                    h_mask=x_mask,
                                    u_mask=tf.squeeze(x_mask, axis=1),
                                    scope="self_att",
                                    tensor_dict=self.tensor_dict)
            g0 = tf.layers.dense(g0, config.hidden_size * 2)

        with tf.variable_scope('g1'):
            g1, _ = bi_cudnn_rnn_encoder(
                'lstm', config.hidden_size, 1, 1 - config.input_keep_prob,
                tf.squeeze(g0, axis=1), tf.squeeze(x_len, axis=1),
                self.is_train)
            g1 = g1[:, ax, :, :]

        # First set of logits, computed from g1 and g0.
        logits = get_logits([g1, g0], d, True, wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=x_mask, is_train=self.is_train,
                            func=config.answer_func, scope='logits1')

        with tf.variable_scope('g2'):
            # softsel combines g1 with the first logits; the result is
            # tiled back over the M and JX axes.
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(a1i[:, ax, ax, :], [1, M, JX, 1])
            g2, _ = bi_cudnn_rnn_encoder(
                'lstm', config.hidden_size, 1, 1 - config.input_keep_prob,
                tf.squeeze(tf.concat(axis=3, values=[g0, g1, a1i, g0 * a1i]),
                           axis=1),
                x_len_squeeze, self.is_train)
            g2 = g2[:, ax, :, :]

        # Second set of logits, computed from g2 and g1.
        logits2 = get_logits([g2, g1], d, True, wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=x_mask, is_train=self.is_train,
                             func=config.answer_func, scope='logits2')

        if config.dataset == 'hotpotqa':
            with tf.variable_scope('g3'):
                # Select the input of the predict_type head.
                if config.nmn_qtype_class == 'mem_last':
                    g3 = tf.concat(
                        [self.snmn.mem_last[:, ax, :], u_st[:, ax, :]],
                        axis=-1)
                elif config.nmn_qtype_class == 'ctrl_st':
                    g3 = self.snmn.c_st_list[0][:, ax, :]
                else:
                    raise NotImplementedError

                self.predict_type = dense(g3, 2, scope='predict_type')
                g3_1 = self.snmn.mem_last[:, ax, :]
                self.predict_yesno = dense(g3_1, 2, scope='predict_yesno')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M * JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp = tf.reshape(flat_yp, [-1, M, JX])
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        wyp = tf.nn.sigmoid(logits2)

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
        self.wyp = wyp

        if config.dataset == 'hotpotqa':
            flat_predict_type = tf.reshape(self.predict_type, [-1, 2])
            flat_yp3 = tf.nn.softmax(flat_predict_type)
            self.yp3 = tf.reshape(flat_yp3, [-1, 1, 2])

            flat_predict_yesno = tf.reshape(self.predict_yesno, [-1, 2])
            flat_yp3_yesno = tf.nn.softmax(flat_predict_yesno)
            self.yp3_yesno = tf.reshape(flat_yp3_yesno, [-1, 1, 2])
def _build_forward(self):
    """Build the forward graph that ends in ``self.prediction``.

    Steps, in order:
      1. Embed context (``self.cx`` / ``self.x``) and question
         (``self.cq`` / ``self.q``) with optional char-CNN and word
         embeddings, then optionally append one-hot POS / sem / neg features.
      2. Optional highway network and BiLSTM encoding ("prepro" scope).
      3. Slice the first four entries along axis 1 of ``h`` and optionally
         run two rounds of self-attention over each slice and the question.
      4. Merge the slices according to ``config.plot_encoder``
         ("concate", "sum" or "lstm").
      5. Either a multiplicative interaction (``config.interact``) or a flat
         concatenation followed by a stack of ``linear`` + ReLU layers.
      6. A final ``linear`` layer of output size 1 stored in
         ``self.prediction``.

    Side effects: writes ``self.tensor_dict`` entries ``'x'``, ``'q'``,
    ``'xx'``, ``'qq'``, ``'u'``, ``'h'`` and ``'outfinal'`` and sets
    ``self.prediction``.

    NOTE(review): ``h[:, 0..3]`` and ``self.x_mask[:, 0..3]`` are indexed
    unconditionally, so axis 1 must hold at least four entries.
    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm block boundaries (in particular
    where the ``plot_encoder`` branches and the ``logit8/9/3`` stack sit)
    against the original file.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # The question length is forced to the sentence length; the "lstm"
    # plot encoder below reuses one RNN for both, which needs equal widths.
    JQ = JX
    # NOTE(review): the label says "VC" but the value printed is VW.
    print('VC:{} NEW_EMB:{}'.format(VW, self.new_emb_mat.get_shape()))
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Question reuses the context CNN filters.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob,
                                          scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob,
                                          scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows after the trained table.
                    word_emb_mat = tf.concat(
                        axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
            # NOTE(review): reshaping the last axis to d assumes the embedding
            # width equals config.hidden_size -- confirm.
            xx = tf.reshape(xx, [-1, M, JX, d])
            qq = tf.reshape(qq, [-1, JQ, d])

    if config.use_pos_emb:
        with tf.variable_scope("pos_onehot"), tf.device("/cpu:0"):
            pos_x = tf.one_hot(
                self.x_pos, depth=config.pos_tag_num)  # [N,M,JX,depth]
            pos_q = tf.one_hot(
                self.q_pos, depth=config.pos_tag_num)  # [N,JQ,depth]
            xx = tf.concat(axis=3, values=[xx, pos_x])  # [N, M, JX, di]
            qq = tf.concat(axis=2, values=[qq, pos_q])

    if config.use_sem_emb:
        with tf.variable_scope("sem_onehot"), tf.device("/cpu:0"):
            sem_x = tf.one_hot(self.x_sem, depth=3)  # [N,M,JX,3]
            sem_q = tf.one_hot(self.q_sem, depth=3)  # [N,JQ,3]
            xx = tf.concat(axis=3, values=[xx, sem_x])
            qq = tf.concat(axis=2, values=[qq, sem_q])

    if config.use_neg_emb:
        with tf.variable_scope("neg_onehot"), tf.device("/cpu:0"):
            neg_x = tf.one_hot(self.x_neg, depth=2)  # [N,M,JX,2]
            neg_q = tf.one_hot(self.q_neg, depth=2)  # [N,JQ,2]
            xx = tf.concat(axis=3, values=[xx, neg_x])
            qq = tf.concat(axis=2, values=[qq, neg_q])

    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)
            # Question shares the highway weights with the context.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(
        cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(
        cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell_fw2 = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw2 = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw2 = SwitchableDropoutWrapper(
        cell_fw2, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw2 = SwitchableDropoutWrapper(
        cell_bw2, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    if config.lstm:
        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
                d_cell_fw, d_cell_bw, qq, q_len, dtype='float',
                scope='u1')  # [N, J, d], [N, d]
            print('fw_u_f hsape:{}'.format(fw_u_f.get_shape()))
            u = tf.concat(axis=2, values=[fw_u, bw_u])  # [N,JQ,2d]
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
                print('fw_u_f nn hsape:{}'.format(fw_u_f.get_shape()))
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h
    else:
        h = xx
        u = qq

    # First four entries along axis 1 of the encoded context and their masks.
    h1 = h[:, 0, :, :]
    h2 = h[:, 1, :, :]
    h3 = h[:, 2, :, :]
    h4 = h[:, 3, :, :]
    n_1 = tf.reshape(self.x_mask[:, 0, :], [N, JX])
    n_2 = tf.reshape(self.x_mask[:, 1, :], [N, JX])
    n_3 = tf.reshape(self.x_mask[:, 2, :], [N, JX])
    n_4 = tf.reshape(self.x_mask[:, 3, :], [N, JX])

    if config.self_attention:
        with tf.variable_scope("h_self_weight"):
            print(h.get_shape())
            for i in range(2):
                # The four slices share one set of layer-i weights: the first
                # call creates them, the rest reuse them.
                with tf.variable_scope("self-attention"):
                    h1 = self_attention_layer(
                        config,
                        self.is_train,
                        h1,
                        p_mask=tf.expand_dims(n_1, -1),
                        scope="{}_layer_self_att_enc_e".format(
                            i))  # [N, len, dim]
                    tf.get_variable_scope().reuse_variables()
                    h2 = self_attention_layer(
                        config,
                        self.is_train,
                        h2,
                        p_mask=tf.expand_dims(n_2, -1),
                        scope="{}_layer_self_att_enc_e".format(i))
                    tf.get_variable_scope().reuse_variables()
                    h3 = self_attention_layer(
                        config,
                        self.is_train,
                        h3,
                        p_mask=tf.expand_dims(n_3, -1),
                        scope="{}_layer_self_att_enc_e".format(i))
                    tf.get_variable_scope().reuse_variables()
                    h4 = self_attention_layer(
                        config,
                        self.is_train,
                        h4,
                        p_mask=tf.expand_dims(n_4, -1),
                        scope="{}_layer_self_att_enc_e".format(i))
                # Re-entering the scope resets reuse, so the question gets
                # its own ("..._enc_p") weights.
                with tf.variable_scope("self-attention"):
                    u = self_attention_layer(
                        config,
                        self.is_train,
                        u,
                        p_mask=tf.expand_dims(self.q_mask, -1),
                        scope="{}_layer_self_att_enc_p".format(i))

            if config.plot_encoder == "concate":
                h = tf.concat([h1, h2, h3, h4], axis=1)
                # Fixed: the format string had no placeholder, so the shape
                # was never printed.
                print("h concate shape {}".format(h.get_shape()))
                # NOTE(review): n_n is not read again in this function.
                n_n = tf.concat([n_1, n_2, n_3, n_4], axis=1)
            elif config.plot_encoder == "sum":
                h1 = tf.expand_dims(h1, axis=1)
                h2 = tf.expand_dims(h2, axis=1)
                h3 = tf.expand_dims(h3, axis=1)
                h4 = tf.expand_dims(h4, axis=1)
                h = tf.concat([h1, h2, h3, h4], axis=1)
                h = tf.reduce_sum(h, axis=1)
                # Fixed: missing placeholder, same as above.
                print("h sum shape {}".format(h.get_shape()))
            elif config.plot_encoder == "lstm":
                # Collapse the feature axis of each slice, then run a BiLSTM
                # over the four collapsed slices.
                h1 = tf.expand_dims(tf.reduce_sum(h1, axis=-1), axis=1)
                h2 = tf.expand_dims(tf.reduce_sum(h2, axis=-1), axis=1)
                h3 = tf.expand_dims(tf.reduce_sum(h3, axis=-1), axis=1)
                h4 = tf.expand_dims(tf.reduce_sum(h4, axis=-1), axis=1)
                (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
                    d_cell_fw2, d_cell_bw2,
                    tf.concat([h1, h2, h3, h4], axis=1),
                    dtype='float', scope='1')  # [N, J, d], [N, d]
                print('fw_u_f hsape:{}'.format(fw_u_f.get_shape()))
                h = tf.concat(axis=2, values=[fw_u, bw_u])  # [N,JQ,2d]
                u = tf.expand_dims(tf.reduce_sum(u, axis=-1), axis=1)
                # Same RNN weights are reused for the question.
                tf.get_variable_scope().reuse_variables()
                (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
                    d_cell_fw2, d_cell_bw2,
                    tf.concat([u], axis=1),
                    dtype='float', scope='1')  # [N, J, d], [N, d]
                print('fw_u_f hsape:{}'.format(fw_u_f.get_shape()))
                u = tf.concat(axis=2, values=[fw_u, bw_u])  # [N,JQ,2d]

    if config.interact:
        with tf.variable_scope("interact"):

            def get_attention(h, u, m):
                """Return the element-wise product of every (h, u) position pair.

                ``m`` is accepted but not used.
                """
                JX = tf.shape(h)[1]
                JQ = tf.shape(u)[1]
                h = tf.expand_dims(h, 2)
                u = tf.expand_dims(u, 1)
                h = tf.tile(h, [1, 1, JQ, 1])
                u = tf.tile(u, [1, JX, 1, 1])
                attention = h * u  # N,JX,JQ,2d
                return attention

            if config.plot_encoder == "concate":
                attention = get_attention(h, u, M)
            else:
                attention = get_attention(h, u, 1)
            with tf.variable_scope('conv_dense'):
                if config.plot_encoder == "concate":
                    out_final = dense_net(config, attention, self.is_train)
                else:
                    out_final = tf.reshape(attention, shape=[N, -1])
    else:
        # No interaction: flatten both encodings and feed an MLP.
        h = tf.reshape(h, [-1, M * 2 * d * JX])
        print("h shape {}".format(h.get_shape()))
        u = tf.reshape(u, [-1, 2 * d * JQ])
        print("U shape {}".format(u.get_shape()))
        attention = tf.concat([h, u], axis=-1)
        out_final = attention
        out_final = linear(tf.concat([attention], axis=-1),
                           1000,
                           True,
                           bias_start=0.0,
                           scope="logit8",
                           squeeze=False,
                           wd=config.wd,
                           input_keep_prob=config.output_keep_pro,
                           is_train=self.is_train)
        out_final = tf.nn.relu(out_final)
        out_final = linear(tf.concat([out_final], axis=-1),
                           400,
                           True,
                           bias_start=0.0,
                           scope="logit9",
                           squeeze=False,
                           wd=config.wd,
                           input_keep_prob=config.output_keep_pro,
                           is_train=self.is_train)
        out_final = tf.nn.relu(out_final)
        out_final = linear(out_final,
                           300,
                           True,
                           bias_start=0.0,
                           scope="logit3",
                           squeeze=False,
                           wd=config.wd,
                           input_keep_prob=config.output_keep_pro,
                           is_train=self.is_train)
        out_final = tf.nn.relu(out_final)

    with tf.variable_scope('conv_dense'):
        if config.hao:
            # Extra hand-crafted features are concatenated before the head.
            out_final = linear(tf.concat(
                [out_final, self.haoruopeng_feature], axis=-1),
                               200,
                               True,
                               bias_start=0.0,
                               scope="logit",
                               squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
            out_final = linear(out_final,
                               100,
                               True,
                               bias_start=0.0,
                               scope="logit3",
                               squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
        else:
            out_final = linear(tf.concat([out_final], axis=-1),
                               200,
                               True,
                               bias_start=0.0,
                               scope="logit",
                               squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = linear(out_final,
                               100,
                               True,
                               bias_start=0.0,
                               scope="logit3",
                               squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = tf.nn.relu(out_final)

    self.tensor_dict['outfinal'] = out_final
    self.prediction = linear(tf.concat([out_final], axis=-1),
                             1,
                             True,
                             bias_start=0.0,
                             scope="logit2",
                             squeeze=False,
                             wd=config.wd,
                             input_keep_prob=config.output_keep_pro,
                             is_train=self.is_train)
def _build_forward(self):
    """Build the forward graph of the span model with a ruminating layer.

    Steps, in order: char-CNN and/or word embeddings, optional highway
    network, BiLSTM encoding ("prepro"), attention ("main"), the ruminating
    layer (summarisation, question/context rumination, second-hop
    attention), two modelling BiLSTMs, and start/end logits.

    Side effects: writes ``self.tensor_dict`` entries ``'x'``, ``'q'``,
    ``'xx'``, ``'qq'``, ``'u'``, ``'h'``, ``'g1'``, ``'g2'``; sets
    ``self.logits``, ``self.logits2``, ``self.yp``, ``self.yp2``,
    ``self.wyp`` and, when ``config.na`` is set, ``self.concat_logits``,
    ``self.concat_logits2`` and ``self.na_prob``. It also overwrites
    ``config.ruminating_layer`` with ``True``.

    NOTE(review): nesting was reconstructed from a whitespace-collapsed
    source -- confirm block boundaries against the original file.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # Replace the configured sizes with the dynamic shapes of the inputs.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Question reuses the context CNN filters.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob,
                                          scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob,
                                          scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows after the trained table.
                    word_emb_mat = tf.concat(
                        axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)
            # Question shares the highway weights with the context.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # One forward/backward cell pair (plus dropout wrappers) per BiLSTM
    # layer: prepro, g0, g1 and g2.
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(
        cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(
        cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(
        cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(
        cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(
        cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(
        cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(
        cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(
        cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths are the number of true entries in each mask.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw, d_cell_bw, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # Context reuses the question encoder's variables (scope 'u1').
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Tile the question (and its mask) once per sentence so each
            # sentence attends over its own copy.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell_fw = AttentionCell(
                cell2_fw,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            first_cell_bw = AttentionCell(
                cell2_bw,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            second_cell_fw = AttentionCell(
                cell3_fw,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            second_cell_bw = AttentionCell(
                cell3_bw,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config,
                                 self.is_train,
                                 h,
                                 u,
                                 h_mask=self.x_mask,
                                 u_mask=self.q_mask,
                                 scope="p0",
                                 tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw

        # NOTE(review): this unconditionally overrides whatever the config
        # held, so the check below is always true -- confirm this is intended.
        config.ruminating_layer = True
        if config.ruminating_layer:
            ''' RUMINATING LAYER '''
            with tf.variable_scope('rum_layer'):
                print('-' * 5 + "RUMINATING LAYER" + '-' * 5)
                print("Context", xx)  # [N,M,JX,2d]
                print("Question", qq)  # [N,JQ,2d]
                print("p0", p0)  # [N,M,JX,8D]

                # Summarisation: BiLSTM over p0. The same cell object is
                # passed for both directions.
                sum_cell = BasicLSTMCell(d, state_is_tuple=True)
                (s_f, s_b), _ = bidirectional_dynamic_rnn(sum_cell,
                                                          sum_cell,
                                                          p0,
                                                          x_len,
                                                          dtype=tf.float32,
                                                          scope="sum_layer")
                batch_lens = (tf.reshape(x_len, [N * M]))
                s_f = tf.reshape(s_f, [N * M, JX, d])
                s_b = tf.reshape(s_b, [N * M, JX, d])
                # presumably extract_axis_1 picks one time step per row using
                # batch_lens -- verify against its definition.
                s_fout = tf.reshape(extract_axis_1(s_f, batch_lens),
                                    [N, M, d])
                s_bout = tf.reshape(extract_axis_1(s_b, batch_lens),
                                    [N, M, d])
                s = tf.concat(axis=2, values=[s_fout, s_bout])  # [N,M,2d]
                print("summarization layer", s)

                print('-' * 5 + "QUESTION RUMINATE LAYER" + '-' * 5)
                S_Q = tf.tile(tf.expand_dims(s, 2),
                              [1, 1, JQ, 1])  # [N,M,JQ,2d]
                S_cell_fw = BasicLSTMCell(d, state_is_tuple=True)
                S_cell_bw = BasicLSTMCell(d, state_is_tuple=True)
                (fw_hq, bw_hq), _ = bidirectional_dynamic_rnn(S_cell_fw,
                                                              S_cell_bw,
                                                              S_Q,
                                                              q_len,
                                                              dtype=tf.float32,
                                                              scope="S_Q")
                S_Q = tf.concat(axis=3, values=[fw_hq, bw_hq])
                # NOTE(review): expand_dims yields a size-1 axis, so this
                # reshape to [N, M, JQ, 2d] only succeeds when M == 1 --
                # confirm.
                q_m = tf.reshape(tf.expand_dims(qq, 1), [N, M, JQ, 2 * d])
                with tf.variable_scope("question_rum_layer"):
                    Q_hat = ruminating_layer(S_Q, q_m, N, M, JQ, d)
                print("Q_hat", Q_hat)  # [N,M,JQ,2d]

                print('-' * 5 + "CONTEXT RUMINATE LAYER" + '-' * 5)
                S_C = tf.tile(tf.expand_dims(s, 2),
                              [1, 1, JX, 1])  # [N,M,JX,2d]
                C_cell_fw = BasicLSTMCell(d, state_is_tuple=True)
                C_cell_bw = BasicLSTMCell(d, state_is_tuple=True)
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(C_cell_fw,
                                                            C_cell_bw,
                                                            S_C,
                                                            x_len,
                                                            dtype=tf.float32,
                                                            scope="S_C")
                S_C = tf.concat(axis=3, values=[fw_h, bw_h])  # [N,M,JX,2d]
                with tf.variable_scope("context_rum_layer"):
                    C_hat = ruminating_layer(S_C, xx, N, M, JX, d)
                print("C_hat", C_hat)  # [N,M,JX,2d]

                # Second-hop bi-attention between the ruminated context and
                # question; both are tiled to a shared rank-5 grid.
                print('-' * 5 + "SECOND HOP ATTENTION" + '-' * 5)
                sh_aug = tf.tile(tf.expand_dims(C_hat, 3),
                                 [1, 1, 1, JQ, 1])  # presumably [N,M,JX,JQ,2d]
                su_aug = tf.tile(tf.expand_dims(Q_hat, 2),
                                 [1, 1, JX, 1, 1])  # presumably [N,M,JX,JQ,2d]
                sh_mask_aug = tf.tile(tf.expand_dims(self.x_mask, -1),
                                      [1, 1, 1, JQ])
                su_mask_aug = tf.tile(
                    tf.expand_dims(tf.expand_dims(self.q_mask, 1), 1),
                    [1, M, JX, 1])
                shu_mask = sh_mask_aug & su_mask_aug
                # NOTE(review): is_train is hard-coded to True here, whereas
                # other get_logits calls pass self.is_train -- confirm.
                su_logits = get_logits([sh_aug, su_aug],
                                       None,
                                       True,
                                       wd=config.wd,
                                       mask=shu_mask,
                                       is_train=True,
                                       func=config.logit_func,
                                       scope='su_logits')
                su_a = softsel(su_aug, su_logits)
                sh_a = softsel(C_hat, tf.reduce_max(su_logits, 3))
                sh_a = tf.tile(tf.expand_dims(sh_a, 2), [1, 1, JX, 1])
                p00 = tf.concat(
                    axis=3,
                    values=[C_hat, su_a, C_hat * su_a, C_hat * sh_a])
                print("p00", p00)  # [N,M,JX,8d]
                # The second-hop output replaces the first attention output.
                p0 = p00
                print('-' * 5 + "END RUMINATING LAYER" + '-' * 5)

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell_fw, first_cell_bw, p0, x_len, dtype='float',
            scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            second_cell_fw, second_cell_bw, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

        # First set of logits (stored in self.logits).
        logits = get_logits([g1, p0],
                            d,
                            True,
                            wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask,
                            is_train=self.is_train,
                            func=config.answer_func,
                            scope='logits1')
        # Summary of g1 weighted by the first logits, broadcast back over
        # every (sentence, word) position.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                      [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell4_fw, d_cell4_bw,
            tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
            x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        # Second set of logits (stored in self.logits2).
        logits2 = get_logits([g2, p0],
                             d,
                             True,
                             wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask,
                             is_train=self.is_train,
                             func=config.answer_func,
                             scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        if config.na:
            # Prepend a learned scalar bias as an extra logit, then split the
            # softmax back into that slot's probability and the positions.
            na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
            na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]),
                                    [N, 1])  # [N, 1]
            concat_flat_logits = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits])
            concat_flat_yp = tf.nn.softmax(concat_flat_logits)
            na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]),
                                 [1])
            flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

            concat_flat_logits2 = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits2])
            concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
            na_prob2 = tf.squeeze(
                tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
            flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

            self.concat_logits = concat_flat_logits
            self.concat_logits2 = concat_flat_logits2
            self.na_prob = na_prob * na_prob2

        yp = tf.reshape(flat_yp, [-1, M, JX])
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        wyp = tf.nn.sigmoid(logits2)

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
        self.wyp = wyp
def _build_forward(self):
    """Build the forward graph of the span model.

    Steps, in order: char-CNN and/or word embeddings, optional highway
    network, BiLSTM encoding ("prepro"), attention ("main"), two modelling
    BiLSTMs (g0, g1), first logits, a third BiLSTM (g2) and second logits.

    Side effects: writes ``self.tensor_dict`` entries ``'x'``, ``'q'``,
    ``'xx'``, ``'qq'``, ``'u'``, ``'h'``, ``'g1'``, ``'g2'`` and sets
    ``self.logits``, ``self.logits2``, ``self.yp`` and ``self.yp2``.

    NOTE(review): nesting was reconstructed from a whitespace-collapsed
    source -- confirm block boundaries against the original file. The
    concrete numbers quoted in comments (60, 100, 200, ...) are the original
    author's notes about one configuration, not values fixed by this code.
    """
    config = self.config
    N = config.batch_size
    M = config.max_num_sents
    JX = config.max_sent_size
    JQ = config.max_ques_size
    VW = config.word_vocab_size
    VC = config.char_vocab_size
    W = config.max_word_size
    d = config.hidden_size
    # Replace the configured sizes with the dynamic shapes of the inputs.
    JX = tf.shape(self.x)[2]  # sentence length
    JQ = tf.shape(self.q)[1]  # question length
    M = tf.shape(self.x)[1]  # number of sentences
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
    # dc is the per-character embedding width (author's note: 8).

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                # One dc-wide row per character id (author's note: 330 x 8).
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')

            with tf.variable_scope("char"):
                # Map every character id to its embedding vector.
                # [N, M, JX, W, dc]: batch, sentences, words per sentence,
                # characters per word, character embedding width
                # (author's note: 60, None, None, 16, 8).
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)
                # [N, JQ, W, dc]: batch, question words, characters per
                # word, character embedding width.
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Comma-separated config strings give the output channels
                # and the height of each filter bank (author's note: one
                # bank of 100 filters of height 5).
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Question reuses the context CNN filters.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob,
                                          scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob,
                                          scope="qq")
                    # xx and qq are now the char-CNN outputs.
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows after the trained table.
                    word_emb_mat = tf.concat(
                        [word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
            # Each word is now the concatenation of its char-CNN and word
            # embedding (author's note: 200 units):
            # xx is (batch, sentence, word, embedding),
            # qq is (batch, word, embedding).

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)
            # Question shares the highway weights with the context.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq
    # presumably the highway network preserves the input shape -- TODO confirm.

    cell = BasicLSTMCell(
        d, state_is_tuple=True)  # d hidden units (author's note: 100)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # The masks are boolean; summing them gives the sequence lengths.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        # Forward and backward outputs are concatenated, giving 2d features.
        u = tf.concat(
            [fw_u, bw_u],
            2)
        if config.share_lstm_weights:
            # Context reuses the question encoder's variables (scope 'u1').
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u  # question encoding
        self.tensor_dict['h'] = h  # context encoding

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # TODO (original author): work out what dynamic attention does.
            p0 = h
            # Tile the question (and its mask) once per sentence.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config,
                                 self.is_train,
                                 h,
                                 u,
                                 h_mask=self.x_mask,
                                 u_mask=self.q_mask,
                                 scope="p0",
                                 tensor_dict=self.tensor_dict)
            # A fresh cell is created for this layer instead of reusing the
            # encoder cell.
            cell2 = BasicLSTMCell(
                d, state_is_tuple=True)  # d hidden units
            first_cell = SwitchableDropoutWrapper(
                cell2, self.is_train, input_keep_prob=config.input_keep_prob)

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell,
            first_cell,
            inputs=p0,
            sequence_length=x_len,
            dtype='float',
            scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat([fw_g0, bw_g0], 3)

        cell3 = BasicLSTMCell(
            d, state_is_tuple=True)  # d hidden units
        first_cell3 = SwitchableDropoutWrapper(
            cell3, self.is_train, input_keep_prob=config.input_keep_prob)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell3, first_cell3, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat([fw_g1, bw_g1], 3)

        # First set of logits (stored in self.logits).
        logits = get_logits([g1, p0],
                            d,
                            True,
                            wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask,
                            is_train=self.is_train,
                            func=config.answer_func,
                            scope='logits1')
        # Summary of g1 weighted by the first logits, broadcast back over
        # every (sentence, word) position.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                      [1, M, JX, 1])

        cell4 = BasicLSTMCell(
            d, state_is_tuple=True)  # d hidden units
        first_cell4 = SwitchableDropoutWrapper(
            cell4, self.is_train, input_keep_prob=config.input_keep_prob)
        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            first_cell4, first_cell4,
            tf.concat([p0, g1, a1i, g1 * a1i], 3),
            x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat([fw_g2, bw_g2], 3)
        # Second set of logits (stored in self.logits2).
        logits2 = get_logits([g2, p0],
                             d,
                             True,
                             wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask,
                             is_train=self.is_train,
                             func=config.answer_func,
                             scope='logits2')

        # Softmax over all positions of all sentences jointly.
        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [-1, M, JX])
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
def _build_forward(self):
    """Build the forward graph: embeddings -> highway -> BiLSTM encoders -> attention -> two logit heads.

    Reads ``self.config``, the inputs ``self.x`` / ``self.q`` (and ``self.cx`` /
    ``self.cq`` when character embeddings are enabled), the masks
    ``self.x_mask`` / ``self.q_mask``, ``self.new_emb_mat`` and ``self.is_train``.

    Side effects: stores intermediate tensors in ``self.tensor_dict`` under the
    keys 'x', 'q', 'xx', 'qq', 'u', 'h', 'g1', 'g2' and sets ``self.logits``,
    ``self.logits2``, ``self.yp`` and ``self.yp2``. Returns nothing.

    NOTE(review): every ``tf.concat`` call here passes ``(axis, values)``
    positionally, which looks like the pre-1.0 TensorFlow argument order --
    confirm against the TensorFlow version this file is pinned to.
    """
    # config holds the pre-configured hyperparameters
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # The static JX / JQ / M read from config are replaced by the dynamic input shapes.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    # Embedding layer
    with tf.variable_scope("emb"):
        # Character embedding layer
        if config.use_char_emb:  # only when character embeddings are requested
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # CNN filter parameters (comma-separated lists in config)
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx,
                                      filter_sizes,
                                      heights,
                                      "VALID",
                                      self.is_train,
                                      config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" conv variables for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                    else:
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word embedding layer
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:  # append the extra pre-trained embedding rows
                    word_emb_mat = tf.concat(
                        0, [word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                # Convert the context (x) and the query (q) into word vectors.
                # embedding_lookup(params, ids) gathers the rows of params selected by ids.
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # When character embeddings were built, concatenate them with the
                # word embeddings along the last axis.
                xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # Highway network over both inputs (config.highway_num_layers layers; the
    # original note describes two layers producing the context and query vectors).
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)
            # Second call shares the variables created by the first.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(d, state_is_tuple=True)
    # SwitchableDropoutWrapper is a project-defined DropoutWrapper variant.
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # reduce_sum over the mask counts the non-padding positions of x and q;
    # cast converts the mask to int32 first.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    # Contextual embedding layer: run a BiLSTM over X and Q separately to capture
    # the local relations between the words inside each of them.
    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        # fw_u and bw_u are the forward / backward outputs of the BiLSTM.
        u = tf.concat(2, [fw_u, bw_u])  # [N, J, 2d]
        if config.share_lstm_weights:
            # Context encoder reuses the question encoder's 'u1' variables.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    # Core layer: attention flow
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # expand_dims() inserts an axis at the given position;
            # tile() replicates the tensor along the given axes.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [
                N * M, JQ, 2 * d
            ])  # insert an axis at index 1, then tile it M times (M = number of context sentences) so u lines up with h
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config,
                                 self.is_train,
                                 h,
                                 u,
                                 h_mask=self.x_mask,
                                 u_mask=self.q_mask,
                                 scope="p0",
                                 tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # NOTE(review): the same cell object is passed as both the forward and
        # the backward cell, and again for 'g1' -- confirm this is intended.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float',
            scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(3, [fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(3, [fw_g1, bw_g1])

        # First logit head, computed from g1 and p0.
        logits = get_logits([g1, p0],
                            d,
                            True,
                            wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask,
                            is_train=self.is_train,
                            func=config.answer_func,
                            scope='logits1')
        # Summary of g1 selected by the first logits, broadcast back over [M, JX].
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                      [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell,
            d_cell,
            tf.concat(3, [p0, g1, a1i, g1 * a1i]),
            x_len,
            dtype='float',
            scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(3, [fw_g2, bw_g2])
        # Second logit head, computed from g2 and p0.
        logits2 = get_logits([g2, p0],
                             d,
                             True,
                             wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask,
                             is_train=self.is_train,
                             func=config.answer_func,
                             scope='logits2')

        # Softmax is taken over all M * JX positions, then reshaped back per sentence.
        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [-1, M, JX])
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
def _build_forward(self):
    """Build the forward graph and an output head selected by ``config.model_name``.

    Reads ``self.config``, the inputs ``self.x`` / ``self.q`` (and ``self.cx`` /
    ``self.cq`` when character embeddings are enabled), the masks
    ``self.x_mask`` / ``self.q_mask``, ``self.new_emb_mat`` and ``self.is_train``.

    Side effects (returns nothing):
      * always: ``self.tensor_dict`` entries 'x', 'q', 'xx', 'qq', 'u', 'h';
      * ``model_name == "basic"``: ``tensor_dict['g1']``, ``tensor_dict['g2']``,
        ``self.logits``, ``self.logits2``, ``self.yp``, ``self.yp2``;
      * ``model_name == "basic-class"``: ``tensor_dict['g1']``, ``self.logits0``,
        ``self.yp0``, and ``self.logits`` / ``self.yp`` aliased to them.

    Any other ``model_name`` sets none of the output attributes.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # The static JX / JQ / M read from config are replaced by the dynamic input shapes.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Filter widths/heights arrive as comma-separated strings in config.
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" conv variables for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows below the base matrix.
                    word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # Second call shares the variables created by the first.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # One forward/backward LSTM pair (plus dropout-wrapped versions) per RNN layer:
    # cell* -> 'u1'/'h1', cell2* -> 'g0', cell3* -> 'g1', cell4* -> 'g2'.
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths = number of set mask entries along the word axis.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # Context encoder reuses the question encoder's 'u1' variables.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Repeat the question encoding (and its mask) once per context sentence.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

    # NOTE(review): "output" is placed as a sibling of "main" here -- confirm the
    # intended scope nesting against existing checkpoints' variable names.
    with tf.variable_scope("output"):
        if config.model_name == "basic":
            # Two logit heads over all M * JX positions.
            logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
            # Summary of g1 selected by the first logits, broadcast back over [M, JX].
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell4_fw, d_cell4_bw, tf.concat([p0, g1, a1i, g1 * a1i], 3),
                                                          x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat([fw_g2, bw_g2], 3)
            logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])

            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2

            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2

        elif config.model_name == "basic-class":
            # Number of classes: 3 when the data directory is an SNLI one, else 2.
            C = 3 if config.data_dir.startswith('data/snli') else 2
            # No extra 'g2' RNN in this mode: the classifier pools the 'g1' outputs.
            (fw_g2, bw_g2) = (fw_g1, bw_g1)
            if config.classifier == 'maxpool':
                g2 = tf.concat([fw_g2, bw_g2], 3)  # [N, M, JX, 2d]
                g2 = tf.reduce_max(g2, 2)  # [N, M, 2d]
                g2_dim = 2 * d
            elif config.classifier == 'sumpool':
                g2 = tf.concat([fw_g2, bw_g2], 3)
                g2 = tf.reduce_sum(g2, 2)
                g2_dim = 2 * d
            else:
                # Forward output at index JX - 1 and backward output at index 0 of the word axis.
                fw_g2_ = tf.gather(tf.transpose(fw_g2, [2, 0, 1, 3]), JX - 1)
                bw_g2_ = tf.gather(tf.transpose(bw_g2, [2, 0, 1, 3]), 0)
                g2 = tf.concat([fw_g2_, bw_g2_], 2)
                g2_dim = 2 * d
            # NOTE(review): reshaping to [N, g2_dim] presumably assumes M == 1 -- confirm.
            g2_ = tf.reshape(g2, [N, g2_dim])

            logits0 = linear(g2_, C, True, wd=config.wd, input_keep_prob=config.input_keep_prob, is_train=self.is_train, scope='classifier')
            flat_yp0 = tf.nn.softmax(logits0)
            yp0 = tf.reshape(flat_yp0, [N, M, C])
            self.tensor_dict['g1'] = g1
            self.logits0 = logits0
            self.yp0 = yp0
            # Also exposed under the generic names used by the "basic" branch.
            self.logits = logits0
            self.yp = yp0
def highway(self, X, name=""):
    """Run ``X`` through ``highway_network`` with two layers under scope ``name``.

    ``is_train`` is always passed as ``True``; the third positional argument is
    also fixed to ``True`` (presumably the bias flag -- confirm against
    ``highway_network``).
    """
    num_layers = 2
    transformed = highway_network(X, num_layers, True, is_train=True, scope=name)
    return transformed
def _build_forward(self):
    """Build the forward graph ending in a two-unit dense classifier.

    Reads ``self.config``, the inputs ``self.x`` / ``self.q`` (and ``self.cx`` /
    ``self.cq`` when character embeddings are enabled), the masks
    ``self.x_mask`` / ``self.q_mask``, ``self.new_emb_mat`` and ``self.is_train``.

    Side effects (returns nothing): prints the static size parameters, stores
    ``self.tensor_dict`` entries 'x', 'q', 'xx', 'qq', 'u', 'h', and sets
    ``self.score`` (the raw two-unit output) and ``self.probs`` (its softmax).
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # Debug output of the static sizes, before JX / JQ / M are replaced below.
    print("parameters", N, M, JX, JQ, VW, VC, d, W)
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Filter widths/heights arrive as comma-separated strings in config.
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" conv variables for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows below the base matrix.
                    word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # Second call shares the variables created by the first.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # prepro layer cell
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # NOTE(review): cell_fwh / cell_bwh and their dropout wrappers are created but
    # never used in this function; both 'h1' and 'u1' below run on cell_fw / cell_bw.
    # Presumably these were meant for the 'h1' encoder -- confirm.
    cell_fwh = BasicLSTMCell(d, state_is_tuple=True)
    cell_bwh = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fwh = SwitchableDropoutWrapper(cell_fwh, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bwh = SwitchableDropoutWrapper(cell_bwh, self.is_train, input_keep_prob=config.input_keep_prob)
    # attention layer cell
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # out layer cell (cell4_* and their wrappers are not used further down in this function)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths = number of set mask entries along the word axis.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        # Context and question are encoded in separate scopes ('h1' and 'u1'); the
        # final states fw_h_f / bw_h_f are assigned by both calls and never read.
        (fw_h, bw_h), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')
        h = tf.concat(axis=3, values=[fw_h, bw_h])
        (fw_u, bw_u), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        # Disabled alternative: optional weight sharing between the two encoders.
        # if config.share_lstm_weights:
        #     tf.get_variable_scope().reuse_variables()
        #     (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
        #     h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        # else:
        #     (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
        #     h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h
        # h = tf.Print(h, [tf.reduce_max(h), tf.reduce_min(h), "h"], summarize=1000)

    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Repeat the question encoding (and its mask) once per context sentence.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw
        # p0 = tf.Print(p0, [p0, "lstm output-1:"], summarize=200)
        # two layer LSTM
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        # g0 = tf.Print(g0, [g0, "lstm output0:"], summarize=200)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])
        # g1 = tf.Print(g1, [g1, "lstm output:"], summarize=200)

    # output through a dense layer
    # NOTE(review): "output" is placed as a sibling of "main" here -- confirm the
    # intended scope nesting against existing checkpoints' variable names.
    with tf.variable_scope("output"):
        #lstm_out = g1[:,0,-1,:]
        # Sum-pool g1 over axis 2, then flatten to rows of width 2*d.
        lstm_out = tf.reduce_sum(g1, axis = 2)
        lstm_out = tf.reshape(lstm_out, [-1, 2*d])
        # NOTE(review): dropout with keep_prob 0.5 is applied unconditionally, i.e.
        # it is not gated on self.is_train like the other dropout wrappers above --
        # confirm it is meant to stay active outside training.
        lstm_out = tf.nn.dropout(lstm_out, 0.5)
        #logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
        #                    mask=self.x_mask, is_train=self.is_train, func='linear', scope='logits1')
        #print (logits)
        #lstm_out.set_shape([N, M*JX*2*d])
        # 64-unit ReLU layer followed by a linear 2-unit layer, both L2-regularised.
        dense1 = tf.layers.dense(inputs=lstm_out, units=64, activation = tf.nn.relu, kernel_regularizer=tf.contrib.layers.l2_regularizer(0.003))
        score = tf.layers.dense(inputs=dense1, units = 2, activation = None, kernel_regularizer=tf.contrib.layers.l2_regularizer(0.003))
        self.probs = tf.nn.softmax(score)
        # self.tensor_dict['g1'] = g1
        # self.tensor_dict['g2'] = g2

        self.score = score
def _build_forward(self):
    """Build the forward graph with two span-logit heads and a seq2seq decoder.

    Reads ``self.config``, the inputs ``self.x`` / ``self.q`` (and ``self.cx`` /
    ``self.cq`` when character embeddings are enabled), the masks
    ``self.x_mask`` / ``self.q_mask``, ``self.new_emb_mat``, ``self.is_train``,
    ``self.decoder_inputs``, ``self.target_sequence_length`` (train mode) and
    ``self.tgt_sos_id`` / ``self.tgt_eos_id`` (other modes).

    Side effects (returns nothing): stores ``self.tensor_dict`` entries 'x',
    'q', 'xx', 'qq', 'u', 'h'; sets ``self.decoder_logits_train`` (the decoder's
    ``rnn_output``), ``self.index_start`` (flattened first logits) and
    ``self.index_end`` (flattened second logits). Also prints
    ``config.len_new_emb_mat`` and adds several ``tf.Print`` debug ops.
    """
    config = self.config
    # EW = number of entity words, WOW = vocabulary size without entities.
    N, M, JX, JQ, VW, VC, d, W, EW, WOW = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size, config.word_vocab_size - config.vw_wo_entity_size, config.vw_wo_entity_size
    # The static JX / JQ / M read from config are replaced by the dynamic input shapes.
    JX = tf.shape(self.x)[2]  # words
    JQ = tf.shape(self.q)[1]  # words
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Filter widths/heights arrive as comma-separated strings in config.
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" conv variables for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Entity rows: an [EW, EW] matrix initialised from config.onehot_encoded
                    # is projected to dw columns and stacked below the regular word rows.
                    entity_emb_mat = tf.get_variable("entity_emb_mat", dtype='float', shape=[EW, EW],
                                                     initializer=get_initializer(config.onehot_encoded))
                    entity_emb_out = _linear(entity_emb_mat, dw, True, bias_initializer=tf.constant_initializer(0.0))
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[WOW, dw],
                                                   initializer=get_initializer(config.emb_mat))
                    word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, entity_emb_out])
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows below the base matrix.
                    word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d] i.e. [batch size, max sentences, max words, embedding size]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d] i.e. [batch size, max words, embedding size]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # Second call shares the variables created by the first.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # One forward/backward LSTM pair (plus dropout-wrapped versions) per RNN layer:
    # cell* -> 'u1'/'h1', cell2* -> 'g0', cell3* -> 'g1', cell4* -> 'g2'.
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths = number of set mask entries along the word axis.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # Context encoder reuses the question encoder's 'u1' variables.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    # NOTE(review): everything below, including the decoder, is kept inside the
    # "main" scope -- confirm against existing checkpoints' variable names.
    with tf.variable_scope("main"):
        if config.dynamic_att:  # not true
            p0 = h
            # Repeat the question encoding (and its mask) once per context sentence.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)  # p0 seems to be G in paper
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])  # g1 seems to be M in paper
        # Debug ops: print the runtime shapes for the first 5 evaluations.
        g1 = tf.Print(g1, [tf.shape(g1)], message="g1 shape", first_n=5, summarize=200)
        p0 = tf.Print(p0, [tf.shape(p0)], message="p0 shape", first_n=5, summarize=200)

        # Extra encoder over g1 whose outputs (g11) serve as the decoder's attention memory.
        my_cell_fw = BasicLSTMCell(d, state_is_tuple=True)
        my_cell_fw_d = SwitchableDropoutWrapper(my_cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        my_cell_bw = BasicLSTMCell(d, state_is_tuple=True)
        my_cell_bw_d = SwitchableDropoutWrapper(my_cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)

        (fw_g11, bw_g11), (my_fw_final_state, my_bw_final_state), g11_len = my_bidirectional_dynamic_rnn(
            my_cell_fw_d, my_cell_bw_d, g1, x_len, dtype='float', scope='my_g2')  # [N, M, JX, 2d]
        g11 = tf.concat(axis=2, values=[fw_g11, bw_g11])

        # Concatenated fw/bw final state; only consumed by the unused `decode` helper below.
        my_encoder_final_state_c = tf.concat(values=(my_fw_final_state.c, my_bw_final_state.c), axis=1,
                                             name="my_encoder_final_state_c")
        my_encoder_final_state_h = tf.concat(values=(my_fw_final_state.h, my_bw_final_state.h), axis=1,
                                             name="my_encoder_final_state_h")
        my_encoder_final_state = tf.contrib.rnn.LSTMStateTuple(c=my_encoder_final_state_c, h=my_encoder_final_state_h)

        # compute indices for finding span as the second task in multi task learning
        logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
        logits = tf.Print(logits, [tf.shape(logits)], message="logits shape", first_n=5, summarize=200)
        # Summary of g1 selected by the first logits, broadcast back over [M, JX].
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell4_fw, d_cell4_bw,
                                                      tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
                                                      x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_logits = tf.Print(flat_logits, [tf.shape(flat_logits), flat_logits],
                               message="flat_logits shape and contents", first_n=5, summarize=200)
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)

        tgt_vocab_size = config.len_new_emb_mat  # hparam  # FIXME: Obtain embeddings differently?
        print("length is", config.len_new_emb_mat)
        tgt_embedding_size = dw  # hparam

        # Look up embedding
        decoder_emb_inp = tf.nn.embedding_lookup(word_emb_mat, self.decoder_inputs)  # [batch_size, max words, embedding_size]

        def decode_with_attention(helper, scope, reuse=None, maximum_iterations=None):
            """Decode with a GRU cell attending over ``g11``; returns the decoder's final outputs."""
            with tf.variable_scope(scope, reuse=reuse):
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=d, memory=g11)
                cell = tf.contrib.rnn.GRUCell(num_units=d)
                # Floor division keeps the layer size an int: `d / 2` is a float under
                # Python 3 (and non-integral for odd d), while a layer size must be an
                # integer. `d // 2` matches the Python 2 result of the old expression.
                attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism,
                                                                attention_layer_size=d // 2)
                out_cell = tf.contrib.rnn.OutputProjectionWrapper(attn_cell, tgt_vocab_size, reuse=reuse)
                # Starts from a zero state rather than the encoder's final state.
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=out_cell, helper=helper,
                    initial_state=out_cell.zero_state(dtype=tf.float32, batch_size=N))
                outputs = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=maximum_iterations)
                return outputs[0]

        def decode(helper, scope, reuse=None, maximum_iterations=None):
            """Attention-free alternative seeded with the encoder final state (currently not called)."""
            with tf.variable_scope(scope, reuse=reuse):
                decoder_cell = BasicLSTMCell(2 * d, state_is_tuple=True)  # hparam
                projection_layer = layers_core.Dense(tgt_vocab_size, use_bias=False)  # hparam
                decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, my_encoder_final_state,
                                                          output_layer=projection_layer)  # decoder
                final_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, output_time_major=False, impute_finished=True,
                    maximum_iterations=maximum_iterations)  # dynamic decoding
                return final_outputs

        # Decoder
        if config.mode == 'train':
            # TODO: doesnt seem to be correct to use this variable for dev
            training_helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inp, self.target_sequence_length,
                                                                time_major=False)
            final_outputs = decode_with_attention(helper=training_helper, scope="HAHA", reuse=None)
        else:
            # Greedy decoding from <sos> until <eos>, capped at 100 steps; reuses the
            # "HAHA" variables, so they must already exist in the graph.
            training_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(word_emb_mat,
                                                                       tf.fill([N], self.tgt_sos_id),
                                                                       self.tgt_eos_id)
            final_outputs = decode_with_attention(helper=training_helper, scope="HAHA", reuse=True,
                                                  maximum_iterations=100)

        self.decoder_logits_train = final_outputs.rnn_output
        self.index_start = flat_logits
        self.index_end = flat_logits2
def _build_forward(self):
    """Build the forward graph that ends in a pointer-network decoder.

    Reads ``self.config`` and the input tensors ``self.x``, ``self.q``,
    ``self.cx``, ``self.cq``, ``self.emx``, ``self.emq``, ``self.x_mask``,
    ``self.q_mask``, ``self.new_emb_mat`` and ``self.is_train`` (which of them
    are used depends on the config flags tested below).

    Side effects on ``self``:
      * ``tensor_dict`` gains the entries 'x', 'q', 'xx', 'qq', 'u', 'h' and the
        three ``answer_string*`` placeholders;
      * ``word_emb_scope`` is set when ``config.use_word_emb`` is true;
      * ``answer_string``, ``answer_string_mask``, ``answer_string_length`` and
        their flattened variants are created as placeholders / reshapes;
      * ``decoder_train_logits``, ``decoder_train_softmax``,
        ``decoder_inference``, ``yp`` and ``yp2`` are set.

    Returns None.

    NOTE(review): this method was recovered from a whitespace-collapsed source.
    The nesting of the 'modeling_layer', self-match and 'output' sections
    relative to the 'main' variable scope could not be read off the text and
    determines variable names (checkpoint compatibility) -- confirm against a
    known-good checkpoint.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    JA = config.max_answer_length
    # From here on M, JX and JQ are dynamic (taken from the fed tensors), not the config maxima.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        # Char-CNN Embedding
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(map(int, config.out_channel_dims.split(',')))  # [100]
                heights = list(map(int, config.filter_heights.split(',')))  # [5]
                assert sum(filter_sizes) == dco, (filter_sizes, dco)  # Make sure filter channels = char_cnn_out size
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Question reuses the context's "xx" filters.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word Embedding
        if config.use_word_emb:
            with tf.variable_scope("emb_var") as scope, tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                # Mark the scope reusable and keep a handle to it on self.
                tf.get_variable_scope().reuse_variables()
                self.word_emb_scope = scope
                if config.use_glove_for_unk:
                    word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq

            # Concat Char-CNN Embedding and Word Embedding
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

        # exact match
        if config.use_exact_match:  # TODO: What does it mean?
            emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
            xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
            emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
            qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]

    # 2 layer highway network on Concat Embedding
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # Same highway weights for the question.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Bidirection-LSTM (3rd layer on paper)
    cell = GRUCell(d) if config.GRU else BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
    flat_x_len = flatten(x_len, 0)  # [N * M]

    with tf.variable_scope("prepro"):
        if config.use_fused_lstm:  # yes
            with tf.variable_scope("u1"):
                fw_inputs = tf.transpose(qq, [1, 0, 2])  # [time_len, batch_size, input_size]
                bw_inputs = tf.reverse_sequence(fw_inputs, q_len, batch_dim=1, seq_dim=0)
                # NOTE(review): this dropout is not conditioned on self.is_train -- confirm
                # that input_keep_prob is set to 1.0 outside training.
                fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=q_len, scope="fw")
                bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=q_len, scope="bw")
                # Undo the reversal so both directions are aligned in time.
                bw_outputs = tf.reverse_sequence(bw_outputs, q_len, batch_dim=1, seq_dim = 0)
                current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                output = tf.transpose(current_inputs, [1, 0, 2])
                u = output

            flat_xx = flatten(xx, 2)  # [N * M, JX, d]
            if config.share_lstm_weights:  # Yes
                # reuse_variables() is called on the "prepro" scope so that re-opening
                # "u1" below picks up the question encoder's weights.
                tf.get_variable_scope().reuse_variables()
                with tf.variable_scope("u1"):
                    fw_inputs = tf.transpose(flat_xx, [1, 0, 2])  # [time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                    # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                    # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                    fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="fw")
                    bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="bw")
                    bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len, batch_dim=1, seq_dim=0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    output = tf.transpose(current_inputs, [1, 0, 2])
            else:  # No
                with tf.variable_scope("h1"):
                    fw_inputs = tf.transpose(flat_xx, [1, 0, 2])  # [time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                    # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                    # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                    prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="fw")
                    bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="bw")
                    bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len, batch_dim=1, seq_dim=0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    output = tf.transpose(current_inputs, [1, 0, 2])
            h = tf.expand_dims(output, 1)  # [N, M, JX, 2d]
        else:
            (fw_u, bw_u), _ = bidirectional_dynamic_rnn(d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
            u = tf.concat([fw_u, bw_u], 2)
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u  # hidden state of Q = u
        self.tensor_dict['h'] = h  # hidden state of C = h

    # Attention Flow Layer (4th layer on paper)
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell = AttentionCell(cell, u, size=d, mask=q_mask, mapper='sim',
                                       input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell  # a GRU cell with dropout wrapper
        tp0 = p0  # Output of Attention layer

        # Modeling layer (5th layer on paper)
        # NOTE(review): placed inside "main" here; the collapsed source does not show
        # whether this scope (and the sections below) were nested or siblings -- confirm.
        with tf.variable_scope('modeling_layer'):
            if config.use_fused_lstm:
                g1, encoder_state_final = build_fused_bidirectional_rnn(inputs=p0,
                                                                        num_units=config.hidden_size,
                                                                        num_layers=config.num_modeling_layers,
                                                                        inputs_length=flat_x_len,
                                                                        input_keep_prob=config.input_keep_prob,
                                                                        scope='modeling_layer_g')
            else:
                # All but the last modeling layer; each feeds its output to the next.
                for layer_idx in range(config.num_modeling_layers-1):
                    (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell, first_cell, p0, x_len, dtype='float', scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
                    p0 = tf.concat([fw_g0, bw_g0], 3)
                # Last layer also yields the final states used to seed the decoder.
                (fw_g1, bw_g1), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(first_cell, first_cell, p0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
                g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

        # Self match layer
        if config.use_self_match:
            s0 = tf.reshape(g1, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
            x_mask = tf.reshape(self.x_mask, [N * M, JX])  # [N * M, JX]
            if config.use_static_self_match:
                with tf.variable_scope("StaticSelfMatch"):  # implemented follow r-net section 3.3
                    W_x_Vj = tf.contrib.layers.fully_connected(  # [N * M, JX, d]
                        s0, int(d / 2), scope='row_first', activation_fn=None, biases_initializer=None
                    )
                    W_x_Vt = tf.contrib.layers.fully_connected(  # [N * M, JX, d]
                        s0, int(d / 2), scope='col_first', activation_fn=None, biases_initializer=None
                    )
                    # Broadcast-add the two projections to get one vector per position pair.
                    sum_rc = tf.add(  # [N * M, JX, JX, d]
                        tf.expand_dims(W_x_Vj, 1), tf.expand_dims(W_x_Vt, 2)
                    )
                    v = tf.get_variable('second', shape=[1, 1, 1, int(d / 2)], dtype=tf.float32)
                    Sj = tf.reduce_sum(tf.multiply(v, tf.tanh(sum_rc)), -1)  # [N * M, JX, JX]
                    Ai = softmax(Sj, mask = tf.expand_dims(x_mask, 1))  # [N * M, JX, JX]
                    Ai = tf.expand_dims(Ai, -1)  # [N * M, JX, JX, 1]
                    Vi = tf.expand_dims(s0, 1)  # [N * M, 1, JX, 2d]
                    Ct = tf.reduce_sum(  # [N * M, JX, 2d]
                        tf.multiply(Ai, Vi), axis = 2
                    )
                    inputs_Vt_Ct = tf.concat([s0, Ct], 2)  # [N * M, JX, 4d]
                    if config.use_fused_lstm:
                        fw_inputs = tf.transpose(inputs_Vt_Ct, [1, 0, 2])  # [time_len, batch_size, input_size]
                        bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                        fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                        bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                        prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        fw_outputs, fw_s_f = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="fw")
                        bw_outputs, bw_s_f = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="bw")
                        # Re-wrap the fused cell's state pair so the PtrNet preparation below
                        # can treat it like a regular LSTM state.
                        fw_s_f = LSTMStateTuple(c=fw_s_f[0], h=fw_s_f[1])
                        bw_s_f = LSTMStateTuple(c=bw_s_f[0], h=bw_s_f[1])
                        bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len, batch_dim=1, seq_dim=0)
                        current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                        s1 = tf.transpose(current_inputs, [1, 0, 2])
                    else:
                        (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(first_cell, first_cell, inputs_Vt_Ct, flat_x_len, dtype='float', scope='s')  # [N, M, JX, 2d]
                        s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1
            else:
                with tf.variable_scope("DynamicSelfMatch"):
                    first_cell = AttentionCell(cell, s0, size=d, mask=x_mask, is_train=self.is_train)
                    (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(first_cell, first_cell, s0, x_len, dtype='float', scope='s')  # [N, M, JX, 2d]
                    s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1
            g1 = tf.expand_dims(s1, 1)  # [N, M, JX, 2d]

        # prepare for PtrNet
        encoder_output = g1  # [N, M, JX, 2d]
        # Zero out the padded positions.
        encoder_output = tf.expand_dims(tf.cast(self.x_mask, tf.float32), -1) * encoder_output  # [N, M, JX, 2d]

        # In the fused, no-self-match case encoder_state_final already comes from
        # build_fused_bidirectional_rnn; otherwise join the fw/bw final states here.
        if config.use_self_match or not config.use_fused_lstm:
            if config.GRU:
                encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat')
            else:
                if isinstance(fw_s_f, LSTMStateTuple):
                    encoder_state_c = tf.concat(
                        (fw_s_f.c, bw_s_f.c), 1, name='encoder_concat_c')
                    encoder_state_h = tf.concat(
                        (fw_s_f.h, bw_s_f.h), 1, name='encoder_concat_h')
                    encoder_state_final = LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
                elif isinstance(fw_s_f, tf.Tensor):
                    encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat')
                else:
                    encoder_state_final = None
                    tf.logging.error("encoder_state_final not set")

        print("encoder_state_final:", encoder_state_final)

        with tf.variable_scope("output"):
            # eos_symbol = config.eos_symbol
            # next_symbol = config.next_symbol

            # currently dynamic M is not supported, thus we assume M==1
            # In graph mode an assert op only runs if something depends on it, so keep the
            # op and attach it to the decoder inputs below instead of discarding it.
            single_sentence_check = tf.assert_equal(M, 1)
            answer_string = tf.placeholder(
                shape=(N, 1, JA + 1),
                dtype=tf.int32,
                name='answer_string'
            )  # [N, M, JA + 1]
            answer_string_mask = tf.placeholder(
                shape=(N, 1, JA + 1),
                dtype=tf.bool,
                name='answer_string_mask'
            )  # [N, M, JA + 1]
            answer_string_length = tf.placeholder(
                shape=(N, 1),
                dtype=tf.int32,
                name='answer_string_length',
            )  # [N, M]
            self.tensor_dict['answer_string'] = answer_string
            self.tensor_dict['answer_string_mask'] = answer_string_mask
            self.tensor_dict['answer_string_length'] = answer_string_length
            self.answer_string = answer_string
            self.answer_string_mask = answer_string_mask
            self.answer_string_length = answer_string_length

            answer_string_flattened = tf.reshape(answer_string, [N * M, JA + 1])
            self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
            print("answer_string_flattened:", answer_string_flattened)

            answer_string_length_flattened = tf.reshape(answer_string_length, [N * M])
            self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
            print("answer_string_length_flattened:", answer_string_length_flattened)

            decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(2 * d, state_is_tuple=True)

            with tf.variable_scope("Decoder"):
                # Both reshapes already depend on self.x, so gating them on the check adds
                # no new feed requirement while making the M == 1 assumption enforced.
                with tf.control_dependencies([single_sentence_check]):
                    flat_attention_output = tf.reshape(tp0, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
                    flat_encoder_output = tf.reshape(encoder_output, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
                decoder_train_logits = ptr_decoder(decoder_cell,
                                                   flat_attention_output,
                                                   flat_encoder_output,
                                                   flat_x_len,
                                                   encoder_final_state=encoder_state_final,
                                                   max_encoder_length=config.sent_size_th,
                                                   decoder_output_length=answer_string_length_flattened,  # [N * M]
                                                   batch_size=N,  # N * M (M=1)
                                                   attention_proj_dim=self.config.decoder_proj_dim,
                                                   scope='ptr_decoder')  # [batch_size, dec_len*, enc_seq_len + 1]

                self.decoder_train_logits = decoder_train_logits
                print("decoder_train_logits:", decoder_train_logits)
                self.decoder_train_softmax = tf.nn.softmax(self.decoder_train_logits)
                self.decoder_inference = tf.argmax(decoder_train_logits, axis=2,
                                                   name='decoder_inference')  # [N, JA + 1]

                # Span outputs are not produced by this variant; fill them with -1.
                self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
                self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
def _build_forward(self):
    """Build the forward graph that produces start/end span logits.

    Reads ``self.config`` and the input tensors ``self.x``, ``self.q``,
    ``self.cx``, ``self.cq``, ``self.x_mask``, ``self.q_mask``,
    ``self.new_emb_mat`` and ``self.is_train`` (which of them are used depends
    on the config flags tested below).

    Side effects on ``self``:
      * ``tensor_dict`` gains the entries 'x', 'q', 'xx', 'qq', 'u', 'h', 'g1'
        and 'g2';
      * ``logits`` / ``logits2`` are set to the flattened logits from the
        'logits1' / 'logits2' heads, and ``yp`` / ``yp2`` to their softmax
        reshaped to [-1, M, JX].

    Returns None.

    NOTE(review): ``xx`` / ``qq`` are only bound when ``config.use_char_emb`` or
    ``config.use_word_emb`` is true; with both off the highway step raises
    NameError.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # From here on M, JX and JQ are dynamic (taken from the fed tensors), not the config maxima.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:  # compute the character embeddings
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Both settings are comma-separated integer lists in the config.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))  # TODO What?
                heights = list(map(int, config.filter_heights.split(',')))
                # The conv output channels must add up to the configured char output size.
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Question reuses the context's "xx" filters.
                        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(
                            config.emb_mat))  # emb_mat is glove
                else:
                    # Outside training no initializer is passed here.
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows after the variable's rows.
                    word_emb_mat = tf.concat(
                        [word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # Same highway weights for the question.
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths are the number of set mask entries along the last axis.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        # fw_u_f / bw_u_f (final states) are unpacked but not used below.
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        u = tf.concat([fw_u, bw_u], 2)
        if config.share_lstm_weights:
            # Context encoder reuses the question encoder's 'u1' weights
            # (note: with `cell`, not the dropout-wrapped `d_cell`).
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d] TODO JX == x_len?
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Repeat the question encoding and mask once per sentence, then flatten.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # Two stacked bidirectional RNN layers over the attention output.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float',
            scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat([fw_g0, bw_g0], 3)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat([fw_g1, bw_g1], 3)

        # Output Layer
        logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
        # Summarise g1 using the first logits, then broadcast that summary to every position.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                      [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell, d_cell, tf.concat([p0, g1, a1i, g1 * a1i], 3),
            x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat([fw_g2, bw_g2], 3)
        logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2')

        # Softmax over all M * JX positions, then restore the [M, JX] layout.
        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [-1, M, JX])
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
def __init__(self, config, seq_length, emb_dim, hidden_dim, emb_train, embeddings=None, pred_size=3,
             context_seq_len=None, query_seq_len=None):
    """Build the whole classification graph: inputs, encoder, logits and losses.

    Args:
        config: settings object; the attributes read are the ones referenced in
            the body (char/highway/self-attention sizes, l2 and diff-penalty
            options, ...). NOTE: ``config.keep_rate`` is overwritten with the
            decayed keep-rate tensor.
        seq_length: second dimension of every sequence placeholder.
        emb_dim: stored as ``self.embedding_dim``.
        hidden_dim: stored as ``self.dim``.
        emb_train: passed as ``trainable`` for the word-embedding variable.
        embeddings: initial value of the word-embedding variable. It is handed
            straight to ``tf.Variable``, so the default ``None`` is not usable.
        pred_size: output size of the final linear layer.
        context_seq_len: stored on ``self`` only.
        query_seq_len: stored on ``self`` only.

    Sets on ``self``: the input placeholders, ``global_step``,
    ``dropout_keep_rate``, ``prem_mask`` / ``hyp_mask``, ``E``, ``logits``,
    ``total_cost``, ``acc`` and ``summary``.

    Raises:
        ValueError: if the weight-difference penalty is enabled and one of the
            expected encoder kernels is not among the trainable variables.
    """
    ## Define hyperparameters
    # tf.reset_default_graph()
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.sequence_length = seq_length
    self.pred_size = pred_size
    self.context_seq_len = context_seq_len
    self.query_seq_len = query_seq_len
    # self.config = config

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length], name='premise')
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length], name='hypothesis')
    self.premise_pos = tf.placeholder(tf.int32, [None, self.sequence_length, 47], name='premise_pos')
    self.hypothesis_pos = tf.placeholder(tf.int32, [None, self.sequence_length, 47], name='hypothesis_pos')
    self.premise_char = tf.placeholder(
        tf.int32, [None, self.sequence_length, config.char_in_word_size], name='premise_char')
    self.hypothesis_char = tf.placeholder(
        tf.int32, [None, self.sequence_length, config.char_in_word_size], name='hypothesis_char')
    self.premise_exact_match = tf.placeholder(
        tf.int32, [None, self.sequence_length, 1], name='premise_exact_match')
    self.hypothesis_exact_match = tf.placeholder(
        tf.int32, [None, self.sequence_length, 1], name='hypothesis_exact_match')

    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    # The keep rate decays with the global step; from here on config.keep_rate is a tensor.
    self.dropout_keep_rate = tf.train.exponential_decay(
        config.keep_rate, self.global_step, config.dropout_decay_step, config.dropout_decay_rate,
        staircase=False, name='dropout_keep_rate')
    config.keep_rate = self.dropout_keep_rate
    tf.summary.scalar('dropout_keep_rate', self.dropout_keep_rate)

    self.y = tf.placeholder(tf.int32, [None], name='label_y')
    self.keep_rate_ph = tf.placeholder(tf.float32, [], name='keep_prob')
    self.is_train = tf.placeholder('bool', [], name='is_train')

    ## Function for embedding lookup and dropout at embedding layer
    def emb_drop(E, x):
        emb = tf.nn.embedding_lookup(E, x)
        # Dropout only while training.
        return tf.cond(self.is_train, lambda: tf.nn.dropout(emb, config.keep_rate), lambda: emb)

    # Get lengths of unpadded sentences
    prem_seq_lengths, prem_mask = blocks.length(self.premise_x)  # mask [N, L , 1]
    hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)
    self.prem_mask = prem_mask
    self.hyp_mask = hyp_mask

    ### Embedding layer ###
    with tf.variable_scope("emb"):
        with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
            self.E = tf.Variable(embeddings, trainable=emb_train)
            premise_in = emb_drop(self.E, self.premise_x)  # P
            hypothesis_in = emb_drop(self.E, self.hypothesis_x)  # H

    # NOTE(review): "char_emb" is placed at top level here; its nesting was not
    # recoverable from the collapsed source -- confirm against a checkpoint.
    with tf.variable_scope("char_emb"):
        char_emb_mat = tf.get_variable(
            "char_emb_mat", shape=[config.char_vocab_size, config.char_emb_size])
        with tf.variable_scope("char") as scope:
            char_pre = tf.nn.embedding_lookup(char_emb_mat, self.premise_char)
            char_hyp = tf.nn.embedding_lookup(char_emb_mat, self.hypothesis_char)

            filter_sizes = list(
                map(int, config.out_channel_dims.split(',')))  # [100]
            heights = list(map(int, config.filter_heights.split(',')))  # [5]
            assert sum(filter_sizes) == config.char_out_size, (
                filter_sizes, config.char_out_size)
            with tf.variable_scope("conv") as scope:
                conv_pre = multi_conv1d(char_pre, filter_sizes, heights, "VALID", self.is_train, config.keep_rate, scope='conv')
                # Hypothesis shares the premise's character filters.
                scope.reuse_variables()
                conv_hyp = multi_conv1d(char_hyp, filter_sizes, heights, "VALID", self.is_train, config.keep_rate, scope='conv')
                conv_pre = tf.reshape(
                    conv_pre, [-1, self.sequence_length, config.char_out_size])
                conv_hyp = tf.reshape(
                    conv_hyp, [-1, self.sequence_length, config.char_out_size])
        premise_in = tf.concat([premise_in, conv_pre], axis=2)
        hypothesis_in = tf.concat([hypothesis_in, conv_hyp], axis=2)

    # Append the POS features and the exact-match flag to every token vector.
    premise_in = tf.concat(
        (premise_in, tf.cast(self.premise_pos, tf.float32)), axis=2)
    hypothesis_in = tf.concat(
        (hypothesis_in, tf.cast(self.hypothesis_pos, tf.float32)), axis=2)

    premise_in = tf.concat(
        [premise_in, tf.cast(self.premise_exact_match, tf.float32)], axis=2)
    hypothesis_in = tf.concat(
        [hypothesis_in, tf.cast(self.hypothesis_exact_match, tf.float32)], axis=2)

    with tf.variable_scope("highway") as scope:
        premise_in = highway_network(premise_in, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
        # Same highway weights for the hypothesis.
        scope.reuse_variables()
        hypothesis_in = highway_network(hypothesis_in, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    with tf.variable_scope("prepro") as scope:
        pre = premise_in
        hyp = hypothesis_in
        for i in range(config.self_att_enc_layers):
            with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                # Premise and hypothesis get separately named (unshared) encoder layers.
                p = self_attention_layer(
                    config, self.is_train, pre, p_mask=prem_mask,
                    scope="{}_layer_self_att_enc".format(i))  # [N, len, dim]
                h = self_attention_layer(
                    config, self.is_train, hyp, p_mask=hyp_mask,
                    scope="{}_layer_self_att_enc_h".format(i))
                pre = p
                hyp = h
                variable_summaries(p, "p_self_enc_summary_layer_{}".format(i))
                variable_summaries(h, "h_self_enc_summary_layer_{}".format(i))

    with tf.variable_scope("main") as scope:

        def model_one_side(config, main, support, main_length, support_length, main_mask, support_mask, scope):
            # main_length, support_length and scope are accepted but not used.
            bi_att_mx = bi_attention_mx(config, self.is_train, main, support,
                                        p_mask=main_mask, h_mask=support_mask)  # [N, PL, HL]
            bi_att_mx = tf.cond(
                self.is_train,
                lambda: tf.nn.dropout(bi_att_mx, config.keep_rate),
                lambda: bi_att_mx)
            out_final = dense_net(config, bi_att_mx, self.is_train)
            return out_final

        premise_final = model_one_side(config, p, h, prem_seq_lengths, hyp_seq_lengths,
                                       prem_mask, hyp_mask, scope="premise_as_main")
        f0 = premise_final
        print('f0:', f0.get_shape().as_list())

    self.logits = linear(f0, self.pred_size, True, bias_start=0.0, scope="logit", squeeze=False,
                         wd=config.wd, input_keep_prob=config.keep_rate, is_train=self.is_train)
    tf.summary.histogram('logit_histogram', self.logits)

    # Define the cost function
    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits))
    # tf.argmax(axis=...) replaces the deprecated tf.arg_max(dimension=...).
    self.acc = tf.reduce_mean(
        tf.cast(
            tf.equal(tf.argmax(self.logits, axis=1), tf.cast(self.y, tf.int64)),
            tf.float32))
    tf.summary.scalar('acc', self.acc)
    tf.summary.scalar('loss', self.total_cost)

    # L2 Loss
    if config.l2_loss:
        if config.sigmoid_growing_l2loss:
            # Same filter as before, with the and/or precedence made explicit.
            weights_added = tf.add_n([
                tf.nn.l2_loss(tensor) for tensor in tf.trainable_variables()
                if (tensor.name.endswith("weights:0")
                    and not tensor.name.endswith("weighted_sum/weights:0"))
                or tensor.name.endswith('kernel:0')
            ])
            full_l2_step = tf.constant(config.weight_l2loss_step_full_reg, dtype=tf.int32,
                                       shape=[], name='full_l2reg_step')
            full_l2_ratio = tf.constant(config.l2_regularization_ratio, dtype=tf.float32,
                                        shape=[], name='l2_regularization_ratio')
            gs_flt = tf.cast(self.global_step, tf.float32)
            half_l2_step_flt = tf.cast(full_l2_step / 2, tf.float32)

            # ratio = sigmoid(8 * (step - half) / half) * full_ratio
            l2loss_ratio = tf.sigmoid(((gs_flt - half_l2_step_flt) * 8) / half_l2_step_flt) * full_l2_ratio
            tf.summary.scalar('l2loss_ratio', l2loss_ratio)
            l2loss = weights_added * l2loss_ratio
        else:
            l2loss = tf.add_n([
                tf.nn.l2_loss(tensor) for tensor in tf.trainable_variables()
                if tensor.name.endswith("weights:0") or tensor.name.endswith('kernel:0')
            ]) * tf.constant(config.l2_regularization_ratio, dtype='float', shape=[],
                             name='l2_regularization_ratio')
        tf.summary.scalar('l2loss', l2loss)
        self.total_cost += l2loss

    if config.wo_enc_sharing or config.wo_highway_sharing_but_penalize_diff:
        # Index the trainable variables once. The previous per-layer scan kept matches in
        # loop-carried locals, so a missing kernel either crashed with an unbound local
        # or silently reused the previous layer's tensor.
        trainable_by_name = {}
        for tensor in tf.trainable_variables():
            print(tensor.name)
            trainable_by_name[tensor.name] = tensor

        # Kernels compared between the premise encoder ("..._self_att_enc") and the
        # hypothesis encoder ("..._self_att_enc_h"), in the original order.
        kernel_suffixes = [
            "self_attention/h_logits/first/kernel:0",
            "self_att_fuse_gate/lhs_1/kernel:0",
            "self_att_fuse_gate/rhs_1/kernel:0",
            "self_att_fuse_gate/lhs_2/kernel:0",
            "self_att_fuse_gate/rhs_2/kernel:0",
        ]
        if config.two_gate_fuse_gate:
            kernel_suffixes += [
                "self_att_fuse_gate/lhs_3/kernel:0",
                "self_att_fuse_gate/rhs_3/kernel:0",
            ]

        diffs = []
        for i in range(config.self_att_enc_layers):
            for suffix in kernel_suffixes:
                premise_name = "prepro/{}_layer_self_att_enc/{}".format(i, suffix)
                hypothesis_name = "prepro/{}_layer_self_att_enc_h/{}".format(i, suffix)
                missing = [name for name in (premise_name, hypothesis_name)
                           if name not in trainable_by_name]
                if missing:
                    raise ValueError(
                        "diff penalty: trainable variable(s) not found: {}".format(", ".join(missing)))
                diffs.append(trainable_by_name[premise_name] - trainable_by_name[hypothesis_name])

        diff_loss = tf.add_n([tf.nn.l2_loss(tensor) for tensor in diffs]) * tf.constant(
            config.diff_penalty_loss_ratio, dtype='float', shape=[], name='diff_penalty_loss_ratio')
        tf.summary.scalar('diff_penalty_loss', diff_loss)
        self.total_cost += diff_loss

    self.summary = tf.summary.merge_all()

    # Report the parameter count of every weight/bias-like variable, then the total.
    counted_suffixes = ("weights:0", "biases:0", 'kernel:0', 'bias:0')
    total_parameters = 0
    for v in tf.global_variables():
        if not v.name.endswith(counted_suffixes):
            continue
        print(v.name)
        shape = v.get_shape().as_list()
        param_num = 1
        for dim in shape:
            param_num *= dim
        print(param_num)
        total_parameters += param_num
    print(total_parameters)
def _build_forward(self):
    """Build the forward pass of the model graph.

    Embeds the context (``self.x`` / ``self.cx``) and the question
    (``self.q`` / ``self.cq``), optionally runs both through a highway
    network with shared variables, encodes them with bidirectional LSTMs
    (scope ``"prepro"``), and then builds the attention and modeling
    layers (scope ``"main"``) that produce two sets of logits over the
    flattened ``M * JX`` context positions.

    Which parts are built is decided by the ``self.config`` flags read in
    the body: ``use_char_emb``, ``use_word_emb``, ``use_glove_for_unk``,
    ``highway``, ``two_prepro_layers``, ``share_lstm_weights``,
    ``dynamic_att``, ``late``, ``feed_gt``, ``feed_hard`` and
    ``prev_mode``.

    Side effects:
        * Stores intermediate tensors in ``self.tensor_dict`` under
          ``'x'`` / ``'q'`` (only when ``config.use_word_emb`` is set),
          ``'xx'``, ``'qq'``, ``'u'``, ``'h'``, ``'g1'`` and ``'g2'``.
          ``self.tensor_dict`` is also passed to ``attention_layer``.
        * Sets ``self.logits`` / ``self.logits2`` (logits reshaped to
          ``[-1, M * JX]``) and ``self.yp`` / ``self.yp2`` (their
          softmax reshaped to ``[-1, M, JX]``).

    Raises:
        AssertionError: if the char-CNN channel sizes parsed from
            ``config.out_channel_dims`` do not sum to
            ``config.char_out_size``.
        ValueError: if ``config.late`` is false and ``config.prev_mode``
            is not one of ``'a'``, ``'y'`` or ``'gy'``.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = (
        config.batch_size, config.max_num_sents, config.max_sent_size,
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size,
        config.hidden_size, config.max_word_size)
    # The configured maxima are replaced by the dynamic shapes of the inputs.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    # NOTE(review): tf.concat is called throughout this method as
    # (axis, values); confirm this matches the argument order of the
    # TensorFlow version this variant is meant to run on.
    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                # The conv outputs are reshaped to dco channels below, so
                # the per-filter channel counts must add up to it.
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" filters for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Only training supplies an initializer (from config.emb_mat).
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows fed via self.new_emb_mat.
                    word_emb_mat = tf.concat(0, [word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # Highway network; the second call reuses the variables of the first.
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm "highway" is a sibling of the "emb" scope.
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        # Question encoder (scopes u1 / u2).
        (fw_u, bw_u), _ = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(2, [fw_u, bw_u])
        if config.two_prepro_layers:
            (fw_u, bw_u), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, u, q_len, dtype='float', scope='u2')  # [N, J, d], [N, d]
            u = tf.concat(2, [fw_u, bw_u])

        # Context encoder: either reuses the question scopes (u1 / u2) or
        # gets scopes of its own (h1 / h2).
        if config.share_lstm_weights:
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            if config.two_prepro_layers:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, h, x_len, dtype='float', scope='u2')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            if config.two_prepro_layers:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, h, x_len, dtype='float', scope='h2')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Attention is computed inside the RNN cell: tile the question
            # (and its mask) once per sentence and wrap the LSTM cell.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell = AttentionCell(
                cell, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(3, [fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(3, [fw_g1, bw_g1])

        if config.late:
            # "late": g2 is built before either set of logits, and both
            # logit layers read [g1, g2, p0].
            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, tf.concat(3, [g1, p0]), x_len,
                dtype='float', scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(3, [fw_g2, bw_g2])

            logits = get_logits([g1, g2, p0], d, True, wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask, is_train=self.is_train,
                                func=config.answer_func, scope='logits1')

            if config.feed_gt:
                # While training, replace the logits by the log of self.y.
                logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER)
                logits = tf.cond(self.is_train, lambda: logy, lambda: logits)
            if config.feed_hard:
                # Outside training, replace the logits by a one-hot of their argmax.
                hard_yp = tf.argmax(tf.reshape(logits, [N, M * JX]), 1)
                hard_logits = tf.reshape(tf.one_hot(hard_yp, M * JX),
                                         [N, M, JX])  # [N, M, JX]
                logits = tf.cond(self.is_train, lambda: logits, lambda: hard_logits)

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])

            logits2 = get_logits([g1, g2, p0], d, True, wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask, is_train=self.is_train,
                                 func=config.answer_func, scope='logits2')
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        else:
            logits = get_logits([g1, p0], d, True, wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask, is_train=self.is_train,
                                func=config.answer_func, scope='logits1')
            # a1i is taken from the logits *before* the optional feed_gt /
            # feed_hard substitutions below.
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))

            if config.feed_gt:
                # While training, replace the logits by the log of self.y.
                logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER)
                logits = tf.cond(self.is_train, lambda: logy, lambda: logits)
            if config.feed_hard:
                # Outside training, replace the logits by a one-hot of their argmax.
                hard_yp = tf.argmax(tf.reshape(logits, [N, M * JX]), 1)
                hard_logits = tf.reshape(tf.one_hot(hard_yp, M * JX),
                                         [N, M, JX])  # [N, M, JX]
                logits = tf.cond(self.is_train, lambda: logits, lambda: hard_logits)

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])

            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])
            yp_aug = tf.expand_dims(yp, -1)
            g1yp = g1 * yp_aug
            # prev_mode selects what summary of the first prediction is
            # fed into the g2 layer.
            if config.prev_mode == 'a':
                prev = a1i
            elif config.prev_mode == 'y':
                prev = yp_aug
            elif config.prev_mode == 'gy':
                prev = g1yp
            else:
                raise ValueError(
                    "unsupported config.prev_mode: {!r} "
                    "(expected 'a', 'y' or 'gy')".format(config.prev_mode))

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, tf.concat(3, [p0, g1, prev, g1 * prev]), x_len,
                dtype='float', scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(3, [fw_g2, bw_g2])
            logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask, is_train=self.is_train,
                                 func=config.answer_func, scope='logits2')
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
def _build_forward(self):
    """Build the forward graph: span prediction plus a question generator.

    The first part embeds the context (``self.x`` / ``self.cx``) and the
    question (``self.q`` / ``self.cq``), optionally applies a highway
    network with shared variables, encodes both with bidirectional LSTMs
    (scope ``"prepro"``) and builds the attention/modeling layers (scope
    ``"main"``) that yield two sets of logits over the flattened
    ``M * JX`` context positions.  When ``config.na`` is set, a learned
    scalar ``na_bias`` is prepended to each logit row before the softmax.

    The second part (scope ``"q_gen"``) picks, per example, the sentence
    index with the largest sum of ``self.y`` and encodes that sentence's
    embeddings together with its ``yp`` / ``yp2`` values using a
    bidirectional GRU.  It then decodes token ids with an attention
    decoder: teacher-forced via ``TrainingHelper`` when
    ``config.mode == "train"``, beam search of width ``config.beam_width``
    otherwise.

    The ``"q_gen"`` part reads ``word_emb_mat``, which is only bound when
    ``config.use_word_emb`` is set.

    Side effects:
        * Stores ``'x'`` / ``'q'`` (word-embedding path only), ``'xx'``,
          ``'qq'``, ``'u'``, ``'h'``, ``'g1'`` and ``'g2'`` in
          ``self.tensor_dict``; the dict is also passed to
          ``attention_layer``.
        * Sets ``self.logits``, ``self.logits2``, ``self.yp``,
          ``self.yp2`` and ``self.wyp``.
        * When ``config.na`` is set, also sets ``self.concat_logits``,
          ``self.concat_logits2`` and ``self.na_prob``.
        * Sets ``self.inp_q_with_GO``, ``self.gen_q``, ``self.gen_q_prob``
          and ``self.gen_q_states``.

    Raises:
        AssertionError: if the char-CNN channel sizes parsed from
            ``config.out_channel_dims`` do not sum to
            ``config.char_out_size``.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = (
        config.batch_size, config.max_num_sents, config.max_sent_size,
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size,
        config.hidden_size, config.max_word_size)
    beam_width = config.beam_width
    GO_TOKEN = 0
    EOS_TOKEN = 1
    # The configured maxima are replaced by the dynamic shapes of the inputs.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                # The conv outputs are reshaped to dco channels below, so
                # the per-filter channel counts must add up to it.
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" filters for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Only training supplies an initializer (from config.emb_mat).
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat),
                        trainable=True)
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows fed via self.new_emb_mat.
                    word_emb_mat = tf.concat(
                        axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # Highway network; the second call reuses the variables of the first.
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm "highway" is a sibling of the "emb" scope.
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True,
                                 wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Separate LSTM cells (plus switchable-dropout wrappers) per layer and
    # direction: 1 = prepro, 2 = g0, 3 = g1, 4 = g2.
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(
        cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(
        cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(
        cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(
        cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(
        cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(
        cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(
        cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(
        cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), _ = bidirectional_dynamic_rnn(
            d_cell_fw, d_cell_bw, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        # The context encoder either reuses the question scope (u1) or
        # gets a scope of its own (h1).
        if config.share_lstm_weights:
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell_fw, cell_bw, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Attention is computed inside the RNN cells: tile the question
            # (and its mask) once per sentence and wrap the LSTM cells.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell_fw = AttentionCell(
                cell2_fw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            first_cell_bw = AttentionCell(
                cell2_bw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            second_cell_fw = AttentionCell(
                cell3_fw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
            second_cell_bw = AttentionCell(
                cell3_bw, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u,
                                 h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell_fw, first_cell_bw, p0, x_len, dtype='float',
            scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            second_cell_fw, second_cell_bw, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

        logits = get_logits([g1, p0], d, True, wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train,
                            func=config.answer_func, scope='logits1')
        # Summarise g1 using the first logits and broadcast the summary
        # back over every (sentence, word) position.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell4_fw, d_cell4_bw,
            tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]), x_len,
            dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train,
                             func=config.answer_func, scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)

        if config.na:
            # Prepend a learned scalar logit to every row, take the softmax
            # over the extended row, then split column 0 off again.
            na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
            na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]), [N, 1])  # [N, 1]
            concat_flat_logits = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits])
            concat_flat_yp = tf.nn.softmax(concat_flat_logits)
            na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]), [1])
            flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

            concat_flat_logits2 = tf.concat(
                axis=1, values=[na_bias_tiled, flat_logits2])
            concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
            na_prob2 = tf.squeeze(
                tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
            flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

            self.concat_logits = concat_flat_logits
            self.concat_logits2 = concat_flat_logits2
            self.na_prob = na_prob * na_prob2

        yp = tf.reshape(flat_yp, [-1, M, JX])
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        wyp = tf.nn.sigmoid(logits2)

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
        self.wyp = wyp

    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm "q_gen" is a sibling of "main" rather than nested in
    # it (this determines the variable name prefix).
    with tf.variable_scope("q_gen"):
        # Question generation from the selected sentence and yp / yp2.

        # Per example, pick the sentence index with the largest sum of
        # self.y over axis 2, and pair it with the batch index so it can be
        # used with gather_nd, e.g. [0, 9], [1, 11], [2, 8], ...
        q_worthy = tf.reduce_sum(tf.to_int32(self.y), axis=2)  # (N, M)
        q_worthy = tf.expand_dims(tf.to_int32(tf.argmax(q_worthy, axis=1)),
                                  axis=1)  # (N) -> (N, 1)
        q_worthy = tf.concat(
            [tf.expand_dims(tf.range(0, N, dtype=tf.int32), axis=1), q_worthy],
            axis=1)

        ss = tf.gather_nd(xx, q_worthy)
        syp = tf.expand_dims(tf.gather_nd(yp, q_worthy), axis=-1)
        syp2 = tf.expand_dims(tf.gather_nd(yp2, q_worthy), axis=-1)
        ss_with_ans = tf.concat([ss, syp, syp2], axis=2)

        # NOTE(review): hidden size is hard-coded rather than read from config.
        qg_dim = 600
        # NOTE(review): unlike the SwitchableDropoutWrapper cells above,
        # these DropoutWrapper cells are not given self.is_train; confirm
        # that input_keep_prob is meant to apply outside training too.
        enc_cell_fw = rnn.DropoutWrapper(rnn.GRUCell(qg_dim),
                                         input_keep_prob=config.input_keep_prob)
        enc_cell_bw = rnn.DropoutWrapper(rnn.GRUCell(qg_dim),
                                         input_keep_prob=config.input_keep_prob)
        s_outputs, s_states = tf.nn.bidirectional_dynamic_rnn(
            enc_cell_fw, enc_cell_bw, ss_with_ans, dtype=tf.float32)
        # Join forward and backward halves: outputs along the feature
        # axis, final states along axis 1.
        s_outputs = tf.concat(s_outputs, axis=2)
        s_states = tf.concat(s_states, axis=1)

        start_tokens = tf.fill([N], GO_TOKEN)
        self.inp_q_with_GO = tf.concat(
            [tf.expand_dims(start_tokens, axis=1), self.q], axis=1)
        if config.mode == "train":
            # Teacher forcing: feed the GO-prefixed question embeddings.
            emb_q = tf.nn.embedding_lookup(params=word_emb_mat,
                                           ids=self.inp_q_with_GO)
            train_helper = seq2seq.TrainingHelper(emb_q, [JQ] * N)
        else:
            # Beam search needs the encoder results repeated per beam.
            s_outputs = seq2seq.tile_batch(s_outputs, multiplier=beam_width)
            s_states = seq2seq.tile_batch(s_states, multiplier=beam_width)

        # The decoder is twice as wide as one encoder direction so that the
        # concatenated encoder state can initialise it.
        dec_cell = rnn.DropoutWrapper(rnn.GRUCell(num_units=qg_dim * 2),
                                      input_keep_prob=config.input_keep_prob)
        attention_mechanism = seq2seq.BahdanauAttention(
            num_units=qg_dim * 2, memory=s_outputs)
        attn_cell = seq2seq.AttentionWrapper(
            dec_cell, attention_mechanism, attention_layer_size=qg_dim * 2,
            output_attention=True, alignment_history=False)
        # NOTE(review): hard-coded count added to VW for the projection
        # size (an earlier value, 72686, was left in a comment); confirm it
        # matches the embedding data in use.
        total_glove_vocab_size = 78878
        out_cell = rnn.OutputProjectionWrapper(attn_cell,
                                               VW + total_glove_vocab_size)
        if config.mode == "train":
            decoder_initial_states = out_cell.zero_state(
                batch_size=N, dtype=tf.float32).clone(cell_state=s_states)
            decoder = seq2seq.BasicDecoder(
                cell=out_cell, helper=train_helper,
                initial_state=decoder_initial_states)
        else:
            decoder_initial_states = out_cell.zero_state(
                batch_size=N * beam_width,
                dtype=tf.float32).clone(cell_state=s_states)
            decoder = seq2seq.BeamSearchDecoder(
                cell=out_cell, embedding=word_emb_mat,
                start_tokens=start_tokens, end_token=EOS_TOKEN,
                initial_state=decoder_initial_states,
                beam_width=beam_width, length_penalty_weight=0.0)
        outputs = seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=JQ)

        if config.mode == "train":
            gen_q = outputs[0].sample_id
            gen_q_prob = outputs[0].rnn_output
        else:
            # Keep index 0 of the last axis of predicted_ids; in this mode
            # gen_q_prob holds the embeddings of those ids.
            gen_q = outputs[0].predicted_ids[:, :, 0]
            gen_q_prob = tf.nn.embedding_lookup(params=word_emb_mat, ids=gen_q)
        gen_q_states = outputs[1]

        self.gen_q = gen_q
        self.gen_q_prob = gen_q_prob
        self.gen_q_states = gen_q_states
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W ,EW, WOW= \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.len_new_emb_mat, config.char_vocab_size, config.hidden_size, \ config.max_word_size,config.word_vocab_size-config.vw_wo_entity_size,config.vw_wo_entity_size JX = tf.shape(self.x)[2] # words JQ = tf.shape(self.q)[1] # words M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list( map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': init_word_emb = tf.random_normal_initializer(-0.5, 0.5) #entity_emb_mat = tf.get_variable("entity_emb_mat", dtype='float', shape=[EW, EW], initializer=get_initializer(config.onehot_encoded)) #entity_emb_out = _linear(entity_emb_mat, dw, True, 
bias_initializer=tf.constant_initializer(0.0)) #word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat)) word_emb_mat = tf.get_variable( "word_emb_mat", dtype='float', shape=[VW, dw], initializer=init_word_emb) #word_emb_mat = tf.concat(axis=0,values=[word_emb_mat, entity_emb_out]) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') #if config.use_glove_for_unk: # word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat]) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup( word_emb_mat, self.x ) # [N, M, JX, d] i.e. [batch size, max sentences, max words, embedding size] Aq = tf.nn.embedding_lookup( word_emb_mat, self.q ) # [N, JQ, d] i.e. [batch size, max words, embedding size] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat(axis=3, values=[xx, Ax]) # [N, M, JX, di] qq = tf.concat(axis=2, values=[qq, Aq]) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq #xx = tf.Print(xx,[tf.shape(xx),xx],message="DHRUV xx=",summarize=20) cell_fw = BasicLSTMCell(d, state_is_tuple=True) cell_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell_fw = SwitchableDropoutWrapper( cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell_bw = SwitchableDropoutWrapper( cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell2_fw = BasicLSTMCell(d, state_is_tuple=True) cell2_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell2_fw = SwitchableDropoutWrapper( cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell2_bw = SwitchableDropoutWrapper( cell2_bw, 
self.is_train, input_keep_prob=config.input_keep_prob) cell3_fw = BasicLSTMCell(d, state_is_tuple=True) cell3_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell3_fw = SwitchableDropoutWrapper( cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell3_bw = SwitchableDropoutWrapper( cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell4_fw = BasicLSTMCell(d, state_is_tuple=True) cell4_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell4_fw = SwitchableDropoutWrapper( cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell4_bw = SwitchableDropoutWrapper( cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N,M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn( d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(axis=2, values=[fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), (fw_s, bw_s) = bidirectional_dynamic_rnn( cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): if config.dynamic_att: # not true p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape( tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell_fw = AttentionCell( cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) first_cell_bw = AttentionCell( cell2_bw, u, mask=q_mask, mapper='sim', 
input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_fw = AttentionCell( cell3_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_bw = AttentionCell( cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer( config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) # p0 seems to be G in paper first_cell_fw = d_cell2_fw second_cell_fw = d_cell3_fw first_cell_bw = d_cell2_bw second_cell_bw = d_cell3_bw #p1 = tf.reshape(p0,[N , M*JX, 8*d]) (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn( first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(axis=3, values=[fw_g0, bw_g0]) (fw_g1, bw_g1), (my_fw_final_state, my_bw_final_state) = bidirectional_dynamic_rnn( second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(axis=3, values=[fw_g1, bw_g1]) # g1 seems to be M in paper #g1= tf.reshape(g1,[N, M , JX, 2*d]) #reshaping here again, since g1 is used ahead g1 = tf.Print(g1, [tf.shape(g1)], message="g1 shape", first_n=5, summarize=200) p0 = tf.Print(p0, [tf.shape(p0)], message="p0 shape", first_n=5, summarize=200) g11 = tf.reshape(g1, [N, -1, 2 * d]) my_encoder_final_state_c = tf.concat( values=(my_fw_final_state.c, my_bw_final_state.c), axis=1, name="my_encoder_final_state_c") my_encoder_final_state_h = tf.concat( values=(my_fw_final_state.h, my_bw_final_state.h), axis=1, name="my_encoder_final_state_h") my_encoder_final_state = tf.contrib.rnn.LSTMStateTuple( c=my_encoder_final_state_c, h=my_encoder_final_state_h) #compute indices for finding span as the second task in multi task learning logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') logits 
= tf.Print(logits, [tf.shape(logits)], message="logits shape", first_n=5, summarize=200) a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn( d_cell4_fw, d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat(axis=3, values=[fw_g2, bw_g2]) logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits = tf.reshape(logits, [-1, M * JX]) flat_logits = tf.Print(flat_logits, [tf.shape(flat_logits), flat_logits], message="flat_logits shape and contents", first_n=5, summarize=200) self.flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] flat_logits2 = tf.reshape(logits2, [-1, M * JX]) self.flat_yp2 = tf.nn.softmax(flat_logits2) tgt_vocab_size = config.len_new_emb_mat # hparam # FIXME: Obtain embeddings differently? 
print("length is", config.len_new_emb_mat) nodes = d # Look up embedding decoder_emb_inp = tf.nn.embedding_lookup( word_emb_mat, self.decoder_inputs) # [batch_size, max words, embedding_size] with tf.variable_scope("rnn_decoder", reuse=tf.AUTO_REUSE): init = tf.random_normal_initializer(0.0, 0.5) W_dense = tf.get_variable(name="W_dense", shape=[2 * nodes, tgt_vocab_size], dtype=tf.float32, initializer=init) b_dense = tf.get_variable(name="b_dense", shape=[tgt_vocab_size], dtype=tf.float32, initializer=tf.zeros_initializer) W_att_dec = tf.get_variable(name="W_att_dec", shape=[2 * nodes, 2 * nodes], dtype=tf.float32, initializer=init) W_att_enc = tf.get_variable(name="W_att_enc1", shape=[1, 1, 2 * nodes, 2 * nodes], dtype=tf.float32, initializer=init) v_blend = tf.get_variable(name="v_blend", shape=[1, 2 * nodes], dtype=tf.float32, initializer=init) pad_time_slice = tf.fill([N], 0, name='PAD') pad_step_embedded = tf.nn.embedding_lookup( word_emb_mat, pad_time_slice) decoder_cell = tf.contrib.rnn.BasicLSTMCell( 2 * nodes, state_is_tuple=True ) # doesnt work without the factor of 2?? '''Loop transition function is a mapping (time, previous_cell_output, previous_cell_state, previous_loop_state) -> (elements_finished, input, cell_state, output, loop_state). It is called before RNNCell to prepare its inputs and state. 
Everything is a Tensor except for initial call at time=0 when everything is None (except time).''' def execute_pointer_network(attn_dist): #this is to find the word in the summary, which recieved highest probability and pass it to the next step in decoder index_pos = tf.argmax(attn_dist, axis=1) index_pos = tf.expand_dims(index_pos, 1) index_pos = tf.concat([ tf.reshape(tf.range(start=0, limit=N, dtype=tf.int64), [N, 1]), tf.zeros([N, 1], tf.int64), index_pos ], axis=1) index_pos = tf.cast(tf.gather_nd(params=self.x, indices=index_pos), dtype=tf.int64) return index_pos def execute_normal_decoder(previous_output, W_dense, b_dense): output_logits = tf.add(tf.matmul(previous_output, W_dense), b_dense) return tf.argmax(output_logits, axis=1) def loop_fn_initial(): initial_elements_finished = ( 0 >= self.target_sequence_length ) # all False at the initial step #initial_input = tf.concat([decoder_emb_inp[:,0], my_encoder_final_state_h], 1) initial_input = decoder_emb_inp[:, 0] initial_cell_state = my_encoder_final_state #setting the correct shapes , as it is used to determine the emit structure initial_cell_output = tf.cond( self.pointer_gen, lambda: tf.zeros([M * JX], tf.float32), lambda: tf.zeros([2 * nodes], tf.float32)) initial_loop_state = None # we don't need to pass any additional information return (initial_elements_finished, initial_input, initial_cell_state, initial_cell_output, initial_loop_state) encoder_output = tf.expand_dims(g11, axis=2) def loop_fn_transition(time, previous_output, previous_state, previous_loop_state): def get_next_input(): # compute Badhanau style attention #performing convolution or reshaping input to (-1,2*d) and then doing matmul, is essentially the same operation #see matrix_mult.py...conv2d might be faster?? 
#https://stackoverflow.com/questions/38235555/tensorflow-matmul-of-input-matrix-with-batch-data encoder_features = tf.nn.conv2d( encoder_output, W_att_enc, [1, 1, 1, 1], "SAME" ) # shape (batch_size,max_enc_steps,1,attention_vec_size) dec_portion = tf.matmul(previous_state.h, W_att_dec) decoder_features = tf.expand_dims( tf.expand_dims(dec_portion, 1), 1 ) # reshape to (batch_size, 1, 1, attention_vec_size) #python broadcasting will alllow the two features to get added e_not_masked = tf.reduce_sum( v_blend * tf.nn.tanh(encoder_features + decoder_features), [2, 3]) # calculate e, (batch_size, max_enc_steps) #The shape of output of a softmax is the same as the input: it just normalizes the values. attn_dist = tf.nn.softmax( e_not_masked) # (batch_size, max_enc_steps) attn_dist = tf.Print(attn_dist, [tf.shape(attn_dist)], message="attn_dist", first_n=5, summarize=200) #Multiplying all the 2d vectors with same attn_dist values,and finally keeping 1 2d vector for every batch example context_vector = tf.reduce_sum( tf.reshape(attn_dist, [N, -1, 1, 1]) * encoder_output, [1, 2]) # shape (batch_size, attn_size). 
context_vector = tf.reshape(context_vector, [-1, 2 * nodes]) #next_input = tf.cond(self.is_train, lambda: tf.concat( # [tf.reshape(decoder_emb_inp[:, time], (N, dw)), context_vector], 1), # lambda: tf.concat([tf.nn.embedding_lookup(word_emb_mat, prediction), context_vector], 1)) #output_logits = tf.add(tf.matmul(previous_output, W_dense), b_dense) prediction = tf.cond( self.pointer_gen, lambda: execute_pointer_network(attn_dist), lambda: execute_normal_decoder( previous_output, W_dense, b_dense)) with tf.variable_scope("modified_dec_inputs", reuse=tf.AUTO_REUSE): next_input = tf.cond( self.is_train, lambda: _linear(args=[context_vector] + [ tf.reshape(decoder_emb_inp[:, time], (N, dw)) ], output_size=dw, bias=True), lambda: _linear([context_vector] + [ tf.nn.embedding_lookup( word_emb_mat, prediction) ], dw, True)) return next_input, attn_dist elements_finished = ( time >= self.target_sequence_length ) # this operation produces boolean tensor of [batch_size] # defining if corresponding sequence has ended finished = tf.reduce_all( elements_finished) # -> boolean scalar #input = tf.cond(finished, lambda: tf.concat([pad_step_embedded, my_encoder_final_state_h], 1),get_next_input) input, attn_distribution = tf.cond( finished, lambda: (pad_step_embedded, tf.zeros([N, M * JX], tf.float32)), get_next_input) attn_distribution = tf.Print(attn_distribution, [tf.shape(attn_distribution)], message="attn_distribution", first_n=5, summarize=200) state = previous_state output = tf.cond(self.pointer_gen, lambda: attn_distribution, lambda: previous_output) output = tf.Print(output, [tf.shape(output)], message="OUTPUT", first_n=5, summarize=200) loop_state = None return (elements_finished, input, state, output, loop_state) def loop_fn(time, previous_output, previous_state, previous_loop_state): if previous_state is None: # time == 0 assert previous_output is None and previous_state is None return loop_fn_initial() else: return loop_fn_transition(time, previous_output, 
previous_state, previous_loop_state) decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn( decoder_cell, loop_fn) decoder_outputs = decoder_outputs_ta.stack() decoder_outputs = tf.Print(decoder_outputs, [tf.shape(decoder_outputs)], message="decoder_outputs", first_n=5, summarize=200) # To do output projection, we have to temporarilly flatten decoder_outputs from [max_steps, batch_size, hidden_dim] to # [max_steps*batch_size, hidden_dim], as tf.matmul needs rank-2 tensors at most. decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack( tf.shape(decoder_outputs)) decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim)) #if pointer networks, no need to pass through dense layer decoder_logits_flat = tf.cond( self.pointer_gen, lambda: decoder_outputs_flat, lambda: tf.add( tf.matmul(decoder_outputs_flat, W_dense), b_dense)) decoder_logits = tf.cond( self.pointer_gen, lambda: tf.reshape( decoder_logits_flat, (decoder_max_steps, decoder_batch_size, decoder_dim)), lambda: tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, tgt_vocab_size))) decoder_logits = _transpose_batch_time(decoder_logits) #decoder_prediction = tf.argmax(decoder_logits, -1) #self.decoder_logits_train = final_outputs.rnn_output self.decoder_logits_train = decoder_logits self.index_start = flat_logits self.index_end = flat_logits2
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num,
               drop_out=0.5, emb=None, ngram_embedding=None):
    """Build the tagging graph (one sub-model per bucket) and create the saver.

    :param trained_model: path under which the hyper-parameters are pickled;
        nothing is written when it is None. When ``self.metric == 'All'`` one
        file per entry of ``self.all_metrics`` is written, the metric name
        being prefixed to the file-name part of the path.
    :param scope: variable scope object; ``scope.reuse_variables()`` is called
        once, when the second bucket is built.
    :param emb_dim: embedding size handed to every ``EmbeddingLayer``.
    :param gru: forwarded unchanged to ``BiRNN(gru=...)``
        (presumably selects GRU cells instead of LSTM -- confirm in BiRNN).
    :param rnn_dim: size handed to ``BiRNN``; the tag/frequency hidden layers
        take ``rnn_dim * 2`` inputs, the language-model ones ``rnn_dim``.
    :param rnn_num: only printed and stored in the pickled hyper-parameters;
        not otherwise used in this method.
    :param drop_out: value stored in ``self.drop_out_v``; the placeholder that
        is fed to the RNN is stored in ``self.drop_out``.
    :param emb: optional weights for the character embedding layer.
    :param ngram_embedding: optional sequence of weights, one per entry of
        ``self.ngram``; defaults to ``None`` for every n-gram layer.
    :return: None. The method populates attributes on ``self``
        (``emb_layer``, ``input_v``, ``output``, ``output_``, ``params``,
        ``saver``, ...).
    """
    # trained_model: path where the model is stored.
    if trained_model is not None:
        param_dic = {
            'nums_chars': self.nums_chars,
            'nums_tags': self.nums_tags,
            'tag_scheme': self.tag_scheme,
            'crf': self.crf,
            'emb_dim': emb_dim,
            'gru': gru,
            'rnn_dim': rnn_dim,
            'rnn_num': rnn_num,
            'drop_out': drop_out,
            'buckets_char': self.buckets_char,
            'ngram': self.ngram
        }
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and also parses on Python 3.
        print("RNN dimension is %d" % rnn_dim)
        print("RNN number is %d" % rnn_num)
        print("Character embedding size is %d" % emb_dim)

        # Persist the model hyper-parameters.
        if self.metric == 'All':
            # Split the path just after the last '/', so the metric name can
            # be prefixed to the file name. rfind() returns -1 when there is
            # no '/', which gives an empty directory prefix instead of the
            # ValueError that rindex() would raise.
            pindex = trained_model.rfind('/') + 1
            param_paths = [
                trained_model[:pindex] + m + '_' + trained_model[pindex:]
                for m in self.all_metrics
            ]
        else:
            param_paths = [trained_model]
        for param_path in param_paths:
            # Pickle data belongs in a binary-mode file; the context manager
            # closes the handle even if pickle.dump raises.
            with open(param_path, 'wb') as f_model:
                pickle.dump(param_dic, f_model)

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    # Character embedding layer.
    # NOTE(review): the original author also asks why 500 is added to the
    # number of characters -- the reason is not visible here.
    # emb_dim is the feature-vector size of each character and weights are
    # the pre-trained character vectors (per the original comments, both can
    # be set from the command line).
    self.emb_layer = EmbeddingLayer(self.nums_chars + 500,
                                    emb_dim,
                                    weights=emb,
                                    name='emb_layer')

    if self.ngram is not None:
        if ngram_embedding is not None:
            assert len(ngram_embedding) == len(self.ngram)
        else:
            ngram_embedding = [None] * len(self.ngram)
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 1000 * (i + 2),
                               emb_dim,
                               weights=ngram_embedding[i],
                               name=str(i + 2) + 'gram_layer'))

    # Hidden layer: its input is the forward RNN output together with the
    # backward RNN output, hence the input size of rnn_dim * 2.
    # Its output size is the number of tags.
    tag_output_wrapper = TimeDistributed(HiddenLayer(rnn_dim * 2,
                                                     self.nums_tags[0],
                                                     activation='linear',
                                                     name='tag_hidden'),
                                         name='tag_output_wrapper')

    if self.char_freq_loss:
        freq_output_wrapper = TimeDistributed(HiddenLayer(
            rnn_dim * 2, 1, activation='sigmoid', name='freq_hidden'),
                                              name='freq_output_wrapper')

    if self.co_train:
        lm_fw_wrapper = TimeDistributed(HiddenLayer(rnn_dim,
                                                    self.nums_chars + 2,
                                                    activation='linear',
                                                    name='lm_fw_hidden'),
                                        name='lm_fw_wrapper')
        lm_bw_wrapper = TimeDistributed(HiddenLayer(rnn_dim,
                                                    self.nums_chars + 2,
                                                    activation='linear',
                                                    name='lm_bw_hidden'),
                                        name='lm_bw_wrapper')

    # define model for each bucket
    # Sentence length differs from bucket to bucket, so a separate model is
    # defined for each one.
    # bucket: the sentence length of that bucket.
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # scope is tf.variable_scope("tagger", reuse=None,
            # initializer=initializer) according to the original comment.
            # Reuse only has to be switched on once; every later bucket then
            # reuses the variables.
            scope.reuse_variables()
        t1 = time()

        # Input sentences; shape = (batch_size, sentence length).
        input_sentences = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_' + str(bucket))
        self.input_v.append([input_sentences])

        emb_set = []
        word_out = self.emb_layer(input_sentences)
        emb_set.append(word_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_g' + str(i) +
                                         str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if len(emb_set) > 1:
            # The different embeddings are simply concatenated (per the
            # original comment: character vectors, radicals, n-grams, image
            # information, etc.).
            word_embeddings = tf.concat(axis=2, values=emb_set)
        else:
            word_embeddings = emb_set[0]

        # Forward and backward RNN outputs, unpacked as two separate values
        # (concat_output=False).
        rnn_out_fw, rnn_out_bw = BiRNN(rnn_dim,
                                       p=dr,
                                       concat_output=False,
                                       gru=gru,
                                       name='BiLSTM' + str(bucket),
                                       scope='Tag-BiRNN')(word_embeddings,
                                                          input_sentences)

        tag_rnn_out_fw, tag_rnn_out_bw = rnn_out_fw, rnn_out_bw
        if self.co_train:
            if self.highway_layers > 0:
                tag_rnn_out_fw = highway_network(rnn_out_fw,
                                                 self.highway_layers,
                                                 True,
                                                 is_train=True,
                                                 scope="tag_fw")
                tag_rnn_out_bw = highway_network(rnn_out_bw,
                                                 self.highway_layers,
                                                 True,
                                                 is_train=True,
                                                 scope="tag_bw")
        tag_rnn_out = tf.concat(values=[tag_rnn_out_fw, tag_rnn_out_bw],
                                axis=2)

        # Apply the fully connected layer (Wx + b) to get the final output.
        output = tag_output_wrapper(tag_rnn_out)
        # NOTE(review): the original author asks why this appends [output]
        # rather than output; the consumer is not visible here.
        self.output.append([output])

        self.output_.append([
            tf.placeholder(tf.int32, [None, bucket],
                           name='tags' + str(bucket))
        ])

        self.bucket_dit[bucket] = idx

        if self.co_train:
            # language model
            lm_rnn_out_fw, lm_rnn_out_bw = rnn_out_fw, rnn_out_bw
            if self.highway_layers > 0:
                lm_rnn_out_fw = highway_network(rnn_out_fw,
                                                self.highway_layers,
                                                True,
                                                is_train=True,
                                                scope="lm_fw")
                lm_rnn_out_bw = highway_network(rnn_out_bw,
                                                self.highway_layers,
                                                True,
                                                is_train=True,
                                                scope="lm_bw")

            self.lm_fw_predictions.append([lm_fw_wrapper(lm_rnn_out_fw)])
            self.lm_bw_predictions.append([lm_bw_wrapper(lm_rnn_out_bw)])
            self.lm_fw_groundtruthes.append([
                tf.placeholder(tf.int32, [None, bucket],
                               name='lm_fw_targets' + str(bucket))
            ])
            self.lm_bw_groundtruthes.append([
                tf.placeholder(tf.int32, [None, bucket],
                               name='lm_bw_targets' + str(bucket))
            ])

        if self.char_freq_loss:
            freq_rnn_out_fw, freq_rnn_out_bw = rnn_out_fw, rnn_out_bw
            if self.highway_layers > 0:
                freq_rnn_out_fw = highway_network(rnn_out_fw,
                                                  self.highway_layers,
                                                  True,
                                                  is_train=True,
                                                  scope="freq_fw")
                freq_rnn_out_bw = highway_network(rnn_out_bw,
                                                  self.highway_layers,
                                                  True,
                                                  is_train=True,
                                                  scope="freq_bw")
            freq_rnn_out = tf.concat(
                values=[freq_rnn_out_fw, freq_rnn_out_bw], axis=2)

            self.char_freq_groundtruthes.append([
                tf.placeholder(tf.float32, [None, bucket],
                               name='freq_targets_%d' % bucket)
            ])
            self.char_freq_predictions.append(
                [freq_output_wrapper(freq_rnn_out)])

        print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

    # Internal consistency check: one entry per bucket in each list.
    assert \
        len(self.input_v) == len(self.output) and \
        len(self.output) == len(self.output_) and \
        len(self.output) == len(self.counts)

    self.params = tf.trainable_variables()

    self.saver = tf.train.Saver()