def _build_forward(self): #import pdb #pdb.set_trace() with tf.variable_scope("emb"): with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if self.config.mode == 'train': assert self.emb_mat is not None word_emb_mat = tf.get_variable("word_emb_mat", shape=[self.vocab_size-np.shape(self.emb_mat)[0], self.config.word_emb_size], dtype='float', trainable=True) pretrained_emb_mat = tf.get_variable("pretrained_emb_mat", shape=[np.shape(self.emb_mat)[0], self.config.word_emb_size], dtype='float', initializer=get_initializer(self.emb_mat), trainable=self.config.fine_tune) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[self.vocab_size-np.shape(self.emb_mat)[0], self.config.word_emb_size], dtype='float') pretrained_emb_mat = tf.get_variable("pretrained_emb_mat", shape=[np.shape(self.emb_mat)[0], self.config.word_emb_size], dtype='float') self.word_emb_mat = tf.concat(values=[word_emb_mat, pretrained_emb_mat], axis=0) with tf.name_scope("word"): self.doc_word_emb = tf.nn.embedding_lookup(self.word_emb_mat, self.doc) # [N, M, d] self.que_word_emb = tf.nn.embedding_lookup(self.word_emb_mat, self.que) # [N, M, d] self.ans_word_emb = tf.nn.embedding_lookup(self.word_emb_mat, self.c) # [N, 10, d] with tf.variable_scope("sentence_modeling"): self.doc_repre, _ = ops.m_cudnn_lstm(inputs=self.doc_word_emb, num_layers=1, hidden_size=self.config.hidden_size, weight_noise_std=None, is_training=self.is_train, scope='doc', input_drop_prob=0, i_m="linear_input", use_gru=self.config.use_gru) # [N,M,JX,d] self.que_repre, _ = ops.m_cudnn_lstm(inputs=self.que_word_emb, num_layers=1, hidden_size=self.config.hidden_size, weight_noise_std=None, is_training=self.is_train, scope='que', input_drop_prob=0, i_m="linear_input", use_gru=self.config.use_gru) # [N,M,JX,d] self.attend_passage = self._bi_attention(self.config, self.is_train, self.doc_repre, self.que_repre, h_mask=self.doc_mask, scope="bi_att", tensor_dict=self.tensor_dict) self.passage, _ = ops.m_cudnn_lstm(inputs=self.attend_passage, num_layers=1, hidden_size=self.config.hidden_size, weight_noise_std=None, is_training=self.is_train, scope='aggregate', input_drop_prob=0, i_m="linear_input", use_gru=self.config.use_gru) # Concatenate outputs and states of the forward and backward RNNs with tf.variable_scope("summary"): logit = tf.layers.dense(self.passage, 1, activation=None) rc_logits = tf.nn.softmax(logit, 1) passage = tf.reduce_sum(self.passage * rc_logits, 1) # [N,dim] with tf.variable_scope('get_answer'): self.prediction = self._answer_layer(passage, self.ans_word_emb)
def _build_forward(self): #import pdb #pdb.set_trace() with tf.variable_scope("emb"): with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if self.config.mode == 'train': assert self.emb_mat is not None word_emb_mat = tf.get_variable("word_emb_mat", shape=[self.vocab_size-np.shape(self.emb_mat)[0], self.config.word_emb_size], dtype='float', trainable=True) pretrained_emb_mat = tf.get_variable("pretrained_emb_mat", shape=[np.shape(self.emb_mat)[0], self.config.word_emb_size], dtype='float', initializer=get_initializer(self.emb_mat), trainable=self.config.fine_tune) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[self.vocab_size-np.shape(self.emb_mat)[0], self.config.word_emb_size],dtype='float') pretrained_emb_mat = tf.get_variable("pretrained_emb_mat", shape=[np.shape(self.emb_mat)[0], self.config.word_emb_size], dtype='float') word_emb_mat = tf.concat(values=[word_emb_mat, pretrained_emb_mat],axis=0) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.doc) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.que) # [N, JQ, d] passage, _ = ops.m_cudnn_lstm(inputs=Ax, num_layers=1, hidden_size=self.config.hidden_size, weight_noise_std=None, is_training=self.is_train, scope='doc', input_drop_prob=0, i_m="linear_input", use_gru=self.config.use_gru) # [N,M,JX,d] question, _ = ops.m_cudnn_lstm(inputs=Aq, num_layers=1, hidden_size=self.config.hidden_size, weight_noise_std=None, is_training=self.is_train, scope='question', input_drop_prob=0, i_m="linear_input", use_gru=self.config.use_gru) # [N,M,JX,d] with tf.variable_scope("bidaf"): passage = self.bi_attention(self.config, self.is_train, passage, question, self.doc_mask) passage, _ = ops.m_cudnn_lstm(inputs=passage, num_layers=1, hidden_size=self.config.hidden_size, weight_noise_std=None, is_training=self.is_train, scope='aggregate', input_drop_prob=0, i_m="linear_input", use_gru=self.config.use_gru) with tf.variable_scope('get_answer'): prediction = tf.layers.dense(passage, 1) prediction = tf.reshape(prediction, [tf.shape(prediction)[0], tf.shape(prediction)[1]]) self.prediction = prediction + self.answer_mask
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list( map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable( "word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat( axis=0, values=[word_emb_mat, self.new_emb_mat]) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat(axis=3, values=[xx, Ax]) # [N, M, JX, di] qq = tf.concat(axis=2, values=[qq, Aq]) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq cell_fw = BasicLSTMCell(d, state_is_tuple=True, reuse=tf.get_variable_scope().reuse) cell_bw = BasicLSTMCell(d, state_is_tuple=True, reuse=tf.get_variable_scope().reuse) d_cell_fw = SwitchableDropoutWrapper( cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell_bw = SwitchableDropoutWrapper( cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell2_fw = BasicLSTMCell(d, state_is_tuple=True, reuse=tf.get_variable_scope().reuse) cell2_bw = BasicLSTMCell(d, state_is_tuple=True, reuse=tf.get_variable_scope().reuse) d_cell2_fw = SwitchableDropoutWrapper( cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell2_bw = SwitchableDropoutWrapper( cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell3_fw = BasicLSTMCell(d, state_is_tuple=True, reuse=tf.get_variable_scope().reuse) cell3_bw = BasicLSTMCell(d, state_is_tuple=True, 
reuse=tf.get_variable_scope().reuse) d_cell3_fw = SwitchableDropoutWrapper( cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell3_bw = SwitchableDropoutWrapper( cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell4_fw = BasicLSTMCell(d, state_is_tuple=True, reuse=tf.get_variable_scope().reuse) cell4_bw = BasicLSTMCell(d, state_is_tuple=True, reuse=tf.get_variable_scope().reuse) d_cell4_fw = SwitchableDropoutWrapper( cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell4_bw = SwitchableDropoutWrapper( cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn( d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(axis=2, values=[fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h if not config.structured_sparsity: add_sparsity_regularization(config.l1wd, collection_name=SPARSITY_VARS, scope=tf.get_variable_scope().name) else: add_mixedlasso(config.row_col_wd, config.l1wd, coef_scaling=config.coef_scaling, collection_name=SPARSITY_VARS, scope=tf.get_variable_scope().name) with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape( tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell_fw = AttentionCell( cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) first_cell_bw = AttentionCell( cell2_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_fw = AttentionCell( cell3_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_bw = AttentionCell( cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell_fw = d_cell2_fw second_cell_fw = d_cell3_fw first_cell_bw = d_cell2_bw second_cell_bw = d_cell3_bw (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn( first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(axis=3, values=[fw_g0, bw_g0]) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn( second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(axis=3, values=[fw_g1, bw_g1]) logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn( d_cell4_fw, 
d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat(axis=3, values=[fw_g2, bw_g2]) logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits = tf.reshape(logits, [-1, M * JX]) flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] flat_logits2 = tf.reshape(logits2, [-1, M * JX]) flat_yp2 = tf.nn.softmax(flat_logits2) if not config.structured_sparsity: add_sparsity_regularization(config.l1wd, collection_name=SPARSITY_VARS, scope=tf.get_variable_scope().name) else: add_mixedlasso(config.row_col_wd, config.l1wd, coef_scaling=config.coef_scaling, collection_name=SPARSITY_VARS, scope=tf.get_variable_scope().name) if config.na: na_bias = tf.get_variable("na_bias", shape=[], dtype='float') na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]), [N, 1]) # [N, 1] concat_flat_logits = tf.concat( axis=1, values=[na_bias_tiled, flat_logits]) concat_flat_yp = tf.nn.softmax(concat_flat_logits) na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]), [1]) flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1]) concat_flat_logits2 = tf.concat( axis=1, values=[na_bias_tiled, flat_logits2]) concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2) na_prob2 = tf.squeeze( tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1]) # [N] flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1]) self.concat_logits = concat_flat_logits self.concat_logits2 = concat_flat_logits2 self.na_prob = na_prob * na_prob2 yp = tf.reshape(flat_yp, [-1, M, JX]) yp2 = tf.reshape(flat_yp2, [-1, M, JX]) wyp = tf.nn.sigmoid(logits2) self.tensor_dict['g1'] = g1 self.tensor_dict['g2'] = g2 self.logits = flat_logits self.logits2 = flat_logits2 self.yp = yp self.yp2 = yp2 self.wyp = wyp
def _build_forward(self): config = self.config x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list( map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable( "word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(self.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat( axis=0, values=[word_emb_mat, self.new_emb_mat]) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat(axis=3, values=[xx, Ax]) # [N, M, JX, di] qq = tf.concat(axis=2, values=[qq, Aq]) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train, input_keep_prob=config.highway_keep_prob) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train, input_keep_prob=config.highway_keep_prob) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq with tf.variable_scope("prepro"): with tf.variable_scope('u1'): u, _ = bi_cudnn_rnn_encoder('lstm', config.hidden_size, 1, 1 - config.input_keep_prob, qq, q_len, self.is_train) if config.reasoning_layer == 'snmn': u_st = zhong_selfatt(u[:, ax, :, :], config.hidden_size * 2, seq_len=q_len, transform='squeeze') if config.share_lstm_weights: with tf.variable_scope('u1', reuse=True): h, _ = bi_cudnn_rnn_encoder('lstm', config.hidden_size, 1, 1 - config.input_keep_prob, tf.squeeze(xx, axis=1), tf.squeeze(x_len, axis=1), self.is_train) h = h[:, ax, :, :] else: with tf.variable_scope('h1'): h, _ = bi_cudnn_rnn_encoder('lstm', config.hidden_size, 1, 1 - config.input_keep_prob, 
tf.squeeze(xx, axis=1), tf.squeeze(x_len, axis=1), self.is_train) h = h[:, ax, :, :] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): context_dim = config.hidden_size * 2 ### Reconstruct before bidaf because otherwise we need to build a larger query tensor. x_mask = self.x_mask x_len_squeeze = tf.squeeze(x_len, axis=1) p0 = h ### Main model if config.reasoning_layer == 'snmn': module_names = ['_Find', '_Compare', '_Relocate', '_NoOp'] self.snmn = NMN_Model(config, u, qq, u_st, self.q_mask, q_len, p0, x_mask, x_len, module_names, \ self.is_train) self.u_weights = self.snmn.cv_list # question word distribution at each step self.module_prob_list = self.snmn.module_prob_list # module probability at each step g0 = tf.squeeze(self.snmn.att_second, axis=-1) if config.supervise_bridge_entity: self.hop0_logits = self.snmn.bridge_logits if config.self_att: with tf.variable_scope('g0'): g0, _ = bi_cudnn_rnn_encoder( 'lstm', config.hidden_size, 1, 1 - config.input_keep_prob, tf.squeeze(g0, axis=1), x_len_squeeze, self.is_train) g0 = g0[:, ax, :, :] g0 = hotpot_biattention(config, self.is_train, g0, tf.squeeze(g0, axis=1), h_mask=x_mask, u_mask=tf.squeeze(x_mask, axis=1), scope="self_att", tensor_dict=self.tensor_dict) g0 = tf.layers.dense(g0, config.hidden_size * 2) with tf.variable_scope('g1'): g1, _ = bi_cudnn_rnn_encoder('lstm', config.hidden_size, 1, 1 - config.input_keep_prob, tf.squeeze(g0, axis=1), tf.squeeze(x_len, axis=1), self.is_train) g1 = g1[:, ax, :, :] logits = get_logits([g1, g0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') with tf.variable_scope('g2'): a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) a1i = tf.tile(a1i[:, ax, ax, :], [1, M, JX, 1]) g2, _ = bi_cudnn_rnn_encoder( 'lstm', config.hidden_size, 1, 1 - config.input_keep_prob, tf.squeeze(tf.concat(axis=3, values=[g0, g1, a1i, g0 * a1i]), axis=1), x_len_squeeze, self.is_train) g2 = g2[:, ax, :, :] logits2 = get_logits([g2, g1], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') if config.dataset == 'hotpotqa': with tf.variable_scope('g3'): if config.nmn_qtype_class == 'mem_last': g3 = tf.concat( [self.snmn.mem_last[:, ax, :], u_st[:, ax, :]], axis=-1) elif config.nmn_qtype_class == 'ctrl_st': g3 = self.snmn.c_st_list[0][:, ax, :] else: raise NotImplementedError self.predict_type = dense(g3, 2, scope='predict_type') g3_1 = self.snmn.mem_last[:, ax, :] self.predict_yesno = dense(g3_1, 2, scope='predict_yesno') flat_logits = tf.reshape(logits, [-1, M * JX]) flat_yp = tf.nn.softmax(flat_logits) # [-1, M * JX] flat_logits2 = tf.reshape(logits2, [-1, M * JX]) flat_yp2 = tf.nn.softmax(flat_logits2) yp = tf.reshape(flat_yp, [-1, M, JX]) yp2 = tf.reshape(flat_yp2, [-1, M, JX]) wyp = tf.nn.sigmoid(logits2) self.logits = flat_logits self.logits2 = flat_logits2 self.yp = yp self.yp2 = yp2 self.wyp = wyp if config.dataset == 'hotpotqa': flat_predict_type = tf.reshape(self.predict_type, [-1, 2]) flat_yp3 = tf.nn.softmax(flat_predict_type) self.yp3 = tf.reshape(flat_yp3, [-1, 1, 2]) flat_predict_yesno = tf.reshape(self.predict_yesno, [-1, 2]) flat_yp3_yesno = tf.nn.softmax(flat_predict_yesno) self.yp3_yesno = tf.reshape(flat_yp3_yesno, [-1, 1, 2])
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size beam_width = config.beam_width GO_TOKEN = 0 EOS_TOKEN = 1 JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list( map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable( "word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat), trainable=True) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat( axis=0, values=[word_emb_mat, self.new_emb_mat]) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat(axis=3, values=[xx, Ax]) # [N, M, JX, di] qq = tf.concat(axis=2, values=[qq, Aq]) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq cell_fw = BasicLSTMCell(d, state_is_tuple=True) cell_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell_fw = SwitchableDropoutWrapper( cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell_bw = SwitchableDropoutWrapper( cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell2_fw = BasicLSTMCell(d, state_is_tuple=True) cell2_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell2_fw = SwitchableDropoutWrapper( cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell2_bw = SwitchableDropoutWrapper( cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell3_fw = BasicLSTMCell(d, state_is_tuple=True) cell3_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell3_fw = SwitchableDropoutWrapper( cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell3_bw = 
SwitchableDropoutWrapper( cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell4_fw = BasicLSTMCell(d, state_is_tuple=True) cell4_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell4_fw = SwitchableDropoutWrapper( cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell4_bw = SwitchableDropoutWrapper( cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn( d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(axis=2, values=[fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn( cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn( cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape( tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell_fw = AttentionCell( cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) first_cell_bw = AttentionCell( cell2_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_fw = AttentionCell( cell3_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_bw = AttentionCell( cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell_fw = d_cell2_fw second_cell_fw = d_cell3_fw first_cell_bw = d_cell2_bw second_cell_bw = d_cell3_bw (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn( first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(axis=3, values=[fw_g0, bw_g0]) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn( second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(axis=3, values=[fw_g1, bw_g1]) logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn( d_cell4_fw, d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat(axis=3, values=[fw_g2, bw_g2]) logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits = tf.reshape(logits, [-1, M * JX]) flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] flat_logits2 = tf.reshape(logits2, [-1, M * 
JX]) flat_yp2 = tf.nn.softmax(flat_logits2) if config.na: na_bias = tf.get_variable("na_bias", shape=[], dtype='float') na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]), [N, 1]) # [N, 1] concat_flat_logits = tf.concat( axis=1, values=[na_bias_tiled, flat_logits]) concat_flat_yp = tf.nn.softmax(concat_flat_logits) na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]), [1]) flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1]) concat_flat_logits2 = tf.concat( axis=1, values=[na_bias_tiled, flat_logits2]) concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2) na_prob2 = tf.squeeze( tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1]) # [N] flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1]) self.concat_logits = concat_flat_logits self.concat_logits2 = concat_flat_logits2 self.na_prob = na_prob * na_prob2 yp = tf.reshape(flat_yp, [-1, M, JX]) yp2 = tf.reshape(flat_yp2, [-1, M, JX]) wyp = tf.nn.sigmoid(logits2) self.tensor_dict['g1'] = g1 self.tensor_dict['g2'] = g2 self.logits = flat_logits self.logits2 = flat_logits2 self.yp = yp self.yp2 = yp2 self.wyp = wyp with tf.variable_scope("q_gen"): # Question Generation Using (Paragraph & Predicted Ans Pos) NM = config.max_num_sents * config.batch_size # Separated encoder #ss = tf.reshape(xx, (-1, JX, dw+dco)) q_worthy = tf.reduce_sum( tf.to_int32(self.y), axis=2 ) # so we get probability distribution of answer-likely. (N, M) q_worthy = tf.expand_dims(tf.to_int32(tf.argmax(q_worthy, axis=1)), axis=1) # (N) -> (N, 1) q_worthy = tf.concat([ tf.expand_dims(tf.range(0, N, dtype=tf.int32), axis=1), q_worthy ], axis=1) # example : [0, 9], [1, 11], [2, 8], [3, 5], [4, 0], [5, 1] ... ss = tf.gather_nd(xx, q_worthy) syp = tf.expand_dims(tf.gather_nd(yp, q_worthy), axis=-1) syp2 = tf.expand_dims(tf.gather_nd(yp2, q_worthy), axis=-1) ss_with_ans = tf.concat([ss, syp, syp2], axis=2) qg_dim = 600 cell_fw, cell_bw = rnn.DropoutWrapper(rnn.GRUCell(qg_dim), input_keep_prob=config.input_keep_prob), \ rnn.DropoutWrapper(rnn.GRUCell(qg_dim), input_keep_prob=config.input_keep_prob) s_outputs, s_states = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ss_with_ans, dtype=tf.float32) s_outputs = tf.concat(s_outputs, axis=2) s_states = tf.concat(s_states, axis=1) start_tokens = tf.zeros([N], dtype=tf.int32) self.inp_q_with_GO = tf.concat( [tf.expand_dims(start_tokens, axis=1), self.q], axis=1) # supervise if mode is train if config.mode == "train": emb_q = tf.nn.embedding_lookup(params=word_emb_mat, ids=self.inp_q_with_GO) #emb_q = tf.reshape(tf.tile(tf.expand_dims(emb_q, axis=1), [1, M, 1, 1]), (NM, JQ+1, dw)) train_helper = seq2seq.TrainingHelper(emb_q, [JQ] * N) else: s_outputs = seq2seq.tile_batch(s_outputs, multiplier=beam_width) s_states = seq2seq.tile_batch(s_states, multiplier=beam_width) cell = rnn.DropoutWrapper(rnn.GRUCell(num_units=qg_dim * 2), input_keep_prob=config.input_keep_prob) attention_mechanism = seq2seq.BahdanauAttention(num_units=qg_dim * 2, memory=s_outputs) attn_cell = seq2seq.AttentionWrapper(cell, attention_mechanism, attention_layer_size=qg_dim * 2, output_attention=True, alignment_history=False) total_glove_vocab_size = 78878 #72686 out_cell = rnn.OutputProjectionWrapper(attn_cell, VW + total_glove_vocab_size) if config.mode == "train": decoder_initial_states = out_cell.zero_state( batch_size=N, dtype=tf.float32).clone(cell_state=s_states) decoder = seq2seq.BasicDecoder( cell=out_cell, helper=train_helper, initial_state=decoder_initial_states) else: decoder_initial_states = out_cell.zero_state( batch_size=N * beam_width, 
dtype=tf.float32).clone(cell_state=s_states) decoder = seq2seq.BeamSearchDecoder( cell=out_cell, embedding=word_emb_mat, start_tokens=start_tokens, end_token=EOS_TOKEN, initial_state=decoder_initial_states, beam_width=beam_width, length_penalty_weight=0.0) outputs = seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=JQ) if config.mode == "train": gen_q = outputs[0].sample_id gen_q_prob = outputs[0].rnn_output gen_q_states = outputs[1] else: gen_q = outputs[0].predicted_ids[:, :, 0] gen_q_prob = tf.nn.embedding_lookup( params=word_emb_mat, ids=outputs[0].predicted_ids[:, :, 0]) gen_q_states = outputs[1] self.gen_q = gen_q self.gen_q_prob = gen_q_prob self.gen_q_states = gen_q_states
def _build_forward(self): config = self.config N, M, JX, JQ, VW, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.hidden_size, \ config.max_word_size JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): print('word embedding') # if config.use_char_emb: # with tf.variable_scope("emb_var"), tf.device("/cpu:0"): # char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') # with tf.variable_scope("char"): # Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] # Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] # Acx = tf.reshape(Acx, [-1, JX, W, dc]) # Acq = tf.reshape(Acq, [-1, JQ, W, dc]) # filter_sizes = list(map(int, config.out_channel_dims.split(','))) # heights = list(map(int, config.filter_heights.split(','))) # assert sum(filter_sizes) == dco, (filter_sizes, dco) # with tf.variable_scope("conv"): # xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") # if config.share_cnn_weights: # tf.get_variable_scope().reuse_variables() # qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") # else: # qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") # xx = tf.reshape(xx, [-1, M, JX, dco]) # qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/gpu:7"): if config.mode == 'train': word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') # if config.use_glove_for_unk: # word_emb_mat = tf.concat(0, [word_emb_mat]) print(word_emb_mat) with tf.name_scope("word"): print('embedding lookup') Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq print('embedding lookup ready') # if config.use_char_emb: # xx = tf.concat(3, [xx, Ax]) # [N, M, JX, di] # qq = tf.concat(2, [qq, Aq]) # [N, JQ, di] # else: xx = Ax qq = Aq # highway network #if config.highway: # with tf.variable_scope("highway"): # xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) # tf.get_variable_scope().reuse_variables() # qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq print('context emmbedding') cell = BasicLSTMCell(d, state_is_tuple=True) d_cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] print('prepro') with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell, d_cell, qq, q_len, dtype='float32', scope='u1') # [N, J, d], [N, d] u = tf.concat(2, [fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(3, [fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, 
x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(3, [fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h print('main pro') with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell = AttentionCell(cell, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell = d_cell (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell, first_cell, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(3, [fw_g0, bw_g0]) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(first_cell, first_cell, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(3, [fw_g1, bw_g1]) logits = get_logits([g1, p0], [config.batch_size,config.max_num_sents] , True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func='sigmoid', scope='logits1') # a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) # a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) # (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell, d_cell, tf.concat(3, [p0, g1, a1i, g1 * a1i]), # x_len, dtype='float', scope='g2') # [N, M, JX, 2d] # g2 = tf.concat(3, [fw_g2, bw_g2]) # logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, # mask=self.x_mask, # is_train=self.is_train, func=config.answer_func, scope='logits2') # flat_logits = tf.reshape(logits, [-1, M * JX]) # flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] # yp = tf.reshape(flat_yp, [-1, M, JX]) yp = tf.greater(0.5, logits) # flat_logits2 = tf.reshape(logits2, [-1, M * JX]) # flat_yp2 = tf.nn.softmax(flat_logits2) # yp2 = tf.reshape(flat_yp2, [-1, M, JX]) self.tensor_dict['g1'] = g1 # self.tensor_dict['g2'] = g2 self.logits = logits # self.logits2 = flat_logits2 self.yp = yp
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list( map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable( "word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat( axis=0, values=[word_emb_mat, self.new_emb_mat]) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat(axis=3, values=[xx, Ax]) # [N, M, JX, di] qq = tf.concat(axis=2, values=[qq, Aq]) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq cell_fw = BasicLSTMCell(d, state_is_tuple=True) cell_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell_fw = SwitchableDropoutWrapper( cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell_bw = SwitchableDropoutWrapper( cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell2_fw = BasicLSTMCell(d, state_is_tuple=True) cell2_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell2_fw = SwitchableDropoutWrapper( cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell2_bw = SwitchableDropoutWrapper( cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell3_fw = BasicLSTMCell(d, state_is_tuple=True) cell3_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell3_fw = SwitchableDropoutWrapper( cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell3_bw = SwitchableDropoutWrapper( cell3_bw, self.is_train, 
input_keep_prob=config.input_keep_prob) cell4_fw = BasicLSTMCell(d, state_is_tuple=True) cell4_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell4_fw = SwitchableDropoutWrapper( cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell4_bw = SwitchableDropoutWrapper( cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn( d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(axis=2, values=[fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape( tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell_fw = AttentionCell( cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) first_cell_bw = AttentionCell( cell2_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_fw = AttentionCell( cell3_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_bw = AttentionCell( cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell_fw = d_cell2_fw second_cell_fw = d_cell3_fw first_cell_bw = d_cell2_bw second_cell_bw = d_cell3_bw config.ruminating_layer = True if config.ruminating_layer: ''' RUMINATING LAYER ''' with tf.variable_scope('rum_layer'): print('-' * 5 + "RUMINATING LAYER" + '-' * 5) print("Context", xx) #[N,M,JX,2d] print("Question", qq) #[N,JQ,2d] print("p0", p0) #[N,M,JX,8D] sum_cell = BasicLSTMCell(d, state_is_tuple=True) (s_f, s_b), _ = bidirectional_dynamic_rnn(sum_cell, sum_cell, p0, x_len, dtype=tf.float32, scope="sum_layer") batch_lens = (tf.reshape(x_len, [N * M])) s_f = tf.reshape(s_f, [N * M, JX, d]) s_b = tf.reshape(s_b, [N * M, JX, d]) s_fout = tf.reshape(extract_axis_1(s_f, batch_lens), [N, M, d]) s_bout = tf.reshape(extract_axis_1(s_b, batch_lens), [N, M, d]) s = tf.concat(axis=2, values=[s_fout, s_bout]) # [N,M,2d] print("summarization layer", s) print('-' * 5 + "QUESTION RUMINATE LAYER" + '-' * 5) S_Q = tf.tile(tf.expand_dims(s, 2), [1, 1, JQ, 1]) # [N,M,JQ,2d] S_cell_fw = BasicLSTMCell(d, state_is_tuple=True) S_cell_bw = BasicLSTMCell(d, state_is_tuple=True) (fw_hq, bw_hq), _ = bidirectional_dynamic_rnn(S_cell_fw, S_cell_bw, S_Q, q_len, dtype=tf.float32, scope="S_Q") S_Q = tf.concat(axis=3, values=[fw_hq, bw_hq]) q_m = tf.reshape(tf.expand_dims(qq, 1), [N, M, JQ, 2 * d]) with tf.variable_scope("question_rum_layer"): Q_hat = ruminating_layer(S_Q, q_m, N, M, JQ, d) print("Q_hat", Q_hat) #[N,M,JQ,2d] 
print('-' * 5 + "CONTEXT RUMINATE LAYER" + '-' * 5) S_C = tf.tile(tf.expand_dims(s, 2), [1, 1, JX, 1]) # [N,M,JX,2d] C_cell_fw = BasicLSTMCell(d, state_is_tuple=True) C_cell_bw = BasicLSTMCell(d, state_is_tuple=True) (fw_h, bw_h), _ = bidirectional_dynamic_rnn(C_cell_fw, C_cell_bw, S_C, x_len, dtype=tf.float32, scope="S_C") S_C = tf.concat(axis=3, values=[fw_h, bw_h]) #[N,M,JX,2d] with tf.variable_scope("context_rum_layer"): C_hat = ruminating_layer(S_C, xx, N, M, JX, d) print("C_hat", C_hat) #[N,M,JX,2d] #Second Hop bi-Attention print('-' * 5 + "SECOND HOP ATTENTION" + '-' * 5) sh_aug = tf.tile(tf.expand_dims(C_hat, 3), [1, 1, 1, JQ, 1]) #[N,M,JX,2d] su_aug = tf.tile(tf.expand_dims(Q_hat, 2), [1, 1, JX, 1, 1]) #[N,M,JQ,2d] sh_mask_aug = tf.tile(tf.expand_dims(self.x_mask, -1), [1, 1, 1, JQ]) su_mask_aug = tf.tile( tf.expand_dims(tf.expand_dims(self.q_mask, 1), 1), [1, M, JX, 1]) shu_mask = sh_mask_aug & su_mask_aug su_logits = get_logits([sh_aug, su_aug], None, True, wd=config.wd, mask=shu_mask, is_train=True, func=config.logit_func, scope='su_logits') su_a = softsel(su_aug, su_logits) sh_a = softsel(C_hat, tf.reduce_max(su_logits, 3)) sh_a = tf.tile(tf.expand_dims(sh_a, 2), [1, 1, JX, 1]) p00 = tf.concat( axis=3, values=[C_hat, su_a, C_hat * su_a, C_hat * sh_a]) print("p00", p00) #[N,M,JX,8d] p0 = p00 print('-' * 5 + "END RUMINATING LAYER" + '-' * 5) (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn( first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(axis=3, values=[fw_g0, bw_g0]) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn( second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(axis=3, values=[fw_g1, bw_g1]) logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn( d_cell4_fw, d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat(axis=3, values=[fw_g2, bw_g2]) logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits = tf.reshape(logits, [-1, M * JX]) flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] flat_logits2 = tf.reshape(logits2, [-1, M * JX]) flat_yp2 = tf.nn.softmax(flat_logits2) self.tensor_dict['g1'] = g1 self.tensor_dict['g2'] = g2 if config.na: na_bias = tf.get_variable("na_bias", shape=[], dtype='float') na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]), [N, 1]) # [N, 1] concat_flat_logits = tf.concat( axis=1, values=[na_bias_tiled, flat_logits]) concat_flat_yp = tf.nn.softmax(concat_flat_logits) na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]), [1]) flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1]) concat_flat_logits2 = tf.concat( axis=1, values=[na_bias_tiled, flat_logits2]) concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2) na_prob2 = tf.squeeze( tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1]) # [N] flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1]) self.concat_logits = concat_flat_logits self.concat_logits2 = concat_flat_logits2 self.na_prob = na_prob * na_prob2 yp = tf.reshape(flat_yp, [-1, M, JX]) yp2 = tf.reshape(flat_yp2, [-1, M, 
JX]) wyp = tf.nn.sigmoid(logits2) self.logits = flat_logits self.logits2 = flat_logits2 self.yp = yp self.yp2 = yp2 self.wyp = wyp
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list( map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable( "word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat( 0, [word_emb_mat, self.new_emb_mat]) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat(3, [xx, Ax]) # [N, M, JX, di] qq = tf.concat(2, [qq, Aq]) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq cell = BasicLSTMCell(d, state_is_tuple=True) d_cell = SwitchableDropoutWrapper( cell, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn( d_cell, d_cell, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(2, [fw_u, bw_u]) if config.two_prepro_layers: (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn( d_cell, d_cell, u, q_len, dtype='float', scope='u2') # [N, J, d], [N, d] u = tf.concat(2, [fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell, cell, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(3, [fw_h, bw_h]) # [N, M, JX, 2d] if config.two_prepro_layers: (fw_h, 
bw_h), _ = bidirectional_dynamic_rnn( cell, cell, h, x_len, dtype='float', scope='u2') # [N, M, JX, 2d] h = tf.concat(3, [fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell, cell, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(3, [fw_h, bw_h]) # [N, M, JX, 2d] if config.two_prepro_layers: (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell, cell, h, x_len, dtype='float', scope='h2') # [N, M, JX, 2d] h = tf.concat(3, [fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape( tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell = AttentionCell( cell, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell = d_cell (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn( first_cell, first_cell, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(3, [fw_g0, bw_g0]) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn( first_cell, first_cell, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(3, [fw_g1, bw_g1]) if config.late: (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn( d_cell, d_cell, tf.concat(3, [g1, p0]), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat(3, [fw_g2, bw_g2]) # logits2 = u_logits(config, self.is_train, tf.concat(3, [g1, a1i]), u, h_mask=self.x_mask, u_mask=self.q_mask, scope="logits2") logits = get_logits([g1, g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') if config.feed_gt: logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER) logits = tf.cond(self.is_train, lambda: logy, lambda: logits) if config.feed_hard: hard_yp = tf.argmax(tf.reshape(logits, [N, M * JX]), 1) hard_logits = tf.reshape(tf.one_hot(hard_yp, M * JX), [N, M, JX]) # [N, M, JX] logits = tf.cond(self.is_train, lambda: logits, lambda: hard_logits) flat_logits = tf.reshape(logits, [-1, M * JX]) flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] yp = tf.reshape(flat_yp, [-1, M, JX]) logits2 = get_logits([g1, g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits2 = tf.reshape(logits2, [-1, M * JX]) flat_yp2 = tf.nn.softmax(flat_logits2) yp2 = tf.reshape(flat_yp2, [-1, M, JX]) else: logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) if config.feed_gt: logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER) logits = tf.cond(self.is_train, lambda: logy, lambda: logits) if config.feed_hard: hard_yp = tf.argmax(tf.reshape(logits, [N, M * JX]), 1) hard_logits = tf.reshape(tf.one_hot(hard_yp, M * JX), [N, M, JX]) # [N, M, JX] logits = tf.cond(self.is_train, lambda: logits, lambda: hard_logits) flat_logits = tf.reshape(logits, [-1, M * JX]) flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] yp = tf.reshape(flat_yp, [-1, M, JX]) a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) yp_aug = 
tf.expand_dims(yp, -1) g1yp = g1 * yp_aug if config.prev_mode == 'a': prev = a1i elif config.prev_mode == 'y': prev = yp_aug elif config.prev_mode == 'gy': prev = g1yp else: raise Exception() (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn( d_cell, d_cell, tf.concat(3, [p0, g1, prev, g1 * prev]), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat(3, [fw_g2, bw_g2]) # logits2 = u_logits(config, self.is_train, tf.concat(3, [g1, a1i]), u, h_mask=self.x_mask, u_mask=self.q_mask, scope="logits2") logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits2 = tf.reshape(logits2, [-1, M * JX]) flat_yp2 = tf.nn.softmax(flat_logits2) yp2 = tf.reshape(flat_yp2, [-1, M, JX]) self.tensor_dict['g1'] = g1 self.tensor_dict['g2'] = g2 self.logits = flat_logits self.logits2 = flat_logits2 self.yp = yp self.yp2 = yp2
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable( "word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat( 0, [word_emb_mat, self.new_emb_mat]) print(word_emb_mat.get_shape().as_list()) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) cell = BasicLSTMCell(d, state_is_tuple=True) d_cell = SwitchableDropoutWrapper( cell, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn( d_cell, d_cell, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(2, [fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell, cell, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(3, [fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell, cell, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(3, [fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape( tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell = AttentionCell( cell, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0, p1 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell = d_cell # with tf.variable_scope("activate"): # p0 = tf.nn.relu(_linear(tf.reshape(p0,[-1,1200]),300,bias=0.01,bias_start=0.0,scope='relu')) # if config.share_lstm_weights: # tf.get_variable_scope().reuse_variables() # p1 = tf.nn.relu(_linear(tf.reshape(p1,[-1,1200]),300,bias=0.01,bias_start=0.0,scope='relu')) with tf.variable_scope('two_lstm'): p0 = tf.reshape(p0, [N, 1, -1, 300]) p1 = tf.reshape(p1, [N, 1, -1, 300]) (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn( first_cell, first_cell, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(3, [fw_g0, bw_g0]) q_len_new = tf.tile(tf.expand_dims(q_len, 1), [1, M]) if config.share_lstm_weights: 
tf.get_variable_scope().reuse_variables() (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn( first_cell, first_cell, p1, q_len_new, dtype='float', scope='g0') # [N, M, JX, 2d] g1 = tf.concat(3, [fw_g1, bw_g1]) # with tf.variable_scope('two_lstm_1'): # (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(first_cell, first_cell, g0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] # g2 = tf.concat(3, [fw_g2, bw_g2]) # q_len_new = tf.tile(tf.expand_dims(q_len,1),[1,M]) # if config.share_lstm_weights: # tf.get_variable_scope().reuse_variables() # (fw_g3, bw_g3), _ = bidirectional_dynamic_rnn(first_cell, first_cell, g1, q_len_new, dtype='float', scope='g0') # [N, M, JX, 2d] # g3 = tf.concat(3, [fw_g3, bw_g3]) g0 = tf.reduce_sum(tf.reduce_max(g0, 2), 1) g1 = tf.reduce_sum(tf.reduce_max(g1, 2), 1) logits = _linear([g0, g1, tf.abs(tf.subtract(g0, g1)), g0 * g1], 2, bias=0.01, bias_start=0.0, scope='logits1') flat_logits2 = tf.reshape(logits, [N, 2]) yp = tf.nn.softmax(flat_logits2) # [-1, M*JX] self.tensor_dict['g0'] = g0 self.tensor_dict['g1'] = g1 self.logits = flat_logits2 self.yp = yp
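# This variant pools the two encoded sequences into vectors g0 and g1 and
# classifies the pair from the matching features [a, b, |a - b|, a * b] fed to
# a linear layer. A NumPy sketch of that feature construction (the weights
# below are random placeholders, not trained parameters):
import numpy as np

def matching_features(a, b):
    """a, b: [N, d] -> concatenated matching features [N, 4d]."""
    return np.concatenate([a, b, np.abs(a - b), a * b], axis=-1)

rng = np.random.default_rng(0)
a, b = rng.normal(size=(4, 8)), rng.normal(size=(4, 8))
W, bias = rng.normal(size=(32, 2)), np.zeros(2)          # illustrative weights
logits = matching_features(a, b) @ W + bias              # [N, 2]
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)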
def _build_forward(self): config = self.config N, M, JX, JQ, VW, d, dc, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.hidden_size, \ config.char_emb_size, config.max_word_size H = config.max_tree_height x_mask = self.x > 0 q_mask = self.q > 0 tx_mask = self.tx > 0 # [N, M, H, JX] with tf.variable_scope("char_emb"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] filter = tf.get_variable("filter", shape=[1, config.char_filter_height, dc, d], dtype='float') bias = tf.get_variable("bias", shape=[d], dtype='float') strides = [1, 1, 1, 1] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) xxc = tf.nn.conv2d(Acx, filter, strides, "VALID") + bias # [N*M, JX, W/filter_stride, d] qqc = tf.nn.conv2d(Acq, filter, strides, "VALID") + bias # [N, JQ, W/filter_stride, d] xxc = tf.reshape(tf.reduce_max(tf.nn.relu(xxc), 2), [-1, M, JX, d]) qqc = tf.reshape(tf.reduce_max(tf.nn.relu(qqc), 2), [-1, JQ, d]) with tf.variable_scope("word_emb"): if config.mode == 'train': word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, config.word_emb_size], initializer=get_initializer(config.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, config.word_emb_size], dtype='float') Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] # Ax = linear([Ax], d, False, scope='Ax_reshape') # Aq = linear([Aq], d, False, scope='Aq_reshape') xx = tf.concat(3, [xxc, Ax]) # [N, M, JX, 2d] qq = tf.concat(2, [qqc, Aq]) # [N, JQ, 2d] D = d + config.word_emb_size with tf.variable_scope("pos_emb"): pos_emb_mat = tf.get_variable("pos_emb_mat", shape=[config.pos_vocab_size, d], dtype='float') Atx = tf.nn.embedding_lookup(pos_emb_mat, self.tx) # [N, M, H, JX, d] cell = BasicLSTMCell(D, state_is_tuple=True) cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1) # [N] with tf.variable_scope("rnn"): (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='start') # [N, M, JX, 2d] tf.get_variable_scope().reuse_variables() (fw_us, bw_us), (_, (fw_u, bw_u)) = bidirectional_dynamic_rnn(cell, cell, qq, q_len, dtype='float', scope='start') # [N, J, d], [N, d] u = (fw_u + bw_u) / 2.0 h = (fw_h + bw_h) / 2.0 with tf.variable_scope("h"): no_op_cell = NoOpCell(D) tree_rnn_cell = TreeRNNCell(no_op_cell, d, tf.reduce_max) initial_state = tf.reshape(h, [N*M*JX, D]) # [N*M*JX, D] inputs = tf.concat(4, [Atx, tf.cast(self.tx_edge_mask, 'float')]) # [N, M, H, JX, d+JX] inputs = tf.reshape(tf.transpose(inputs, [0, 1, 3, 2, 4]), [N*M*JX, H, d + JX]) # [N*M*JX, H, d+JX] length = tf.reshape(tf.reduce_sum(tf.cast(tx_mask, 'int32'), 2), [N*M*JX]) # length = tf.reshape(tf.reduce_sum(tf.cast(tf.transpose(tx_mask, [0, 1, 3, 2]), 'float'), 3), [-1]) h, _ = dynamic_rnn(tree_rnn_cell, inputs, length, initial_state=initial_state) # [N*M*JX, H, D] h = tf.transpose(tf.reshape(h, [N, M, JX, H, D]), [0, 1, 3, 2, 4]) # [N, M, H, JX, D] u = tf.expand_dims(tf.expand_dims(tf.expand_dims(u, 1), 1), 1) # [N, 1, 1, 1, 4d] dot = linear(h * u, 1, True, squeeze=True, scope='dot') # [N, M, H, JX] # self.logits = tf.reshape(dot, 
[N, M * H * JX]) self.logits = tf.reshape(exp_mask(dot, tx_mask), [N, M * H * JX]) # [N, M, H, JX] self.yp = tf.reshape(tf.nn.softmax(self.logits), [N, M, H, JX])
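# exp_mask (defined elsewhere in the codebase) conventionally adds a very
# large negative number to the logits wherever the mask is false, so the
# following softmax gives those tree positions essentially zero probability.
# A NumPy sketch of that convention (the constant is illustrative):
import numpy as np

VERY_NEGATIVE = -1e30

def exp_mask(logits, mask):
    """Push masked-out logits toward -inf before a softmax."""
    return logits + (1.0 - mask.astype(logits.dtype)) * VERY_NEGATIVE

logits = np.array([2.0, 1.0, 3.0])
mask = np.array([True, True, False])
masked = exp_mask(logits, mask)
probs = np.exp(masked - masked.max()) / np.exp(masked - masked.max()).sum()
# probs[-1] is effectively zero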
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list(map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat([xx, Ax], 3) # [N, M, JX, di] qq = tf.concat([qq, Aq], 2) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq cell_fw = BasicLSTMCell(d, state_is_tuple=True) cell_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell2_fw = BasicLSTMCell(d, state_is_tuple=True) cell2_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell3_fw = BasicLSTMCell(d, state_is_tuple=True) cell3_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell4_fw = BasicLSTMCell(d, 
state_is_tuple=True) cell4_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(axis=2, values=[fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell_fw = d_cell2_fw second_cell_fw = d_cell3_fw first_cell_bw = d_cell2_bw second_cell_bw = d_cell3_bw (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(axis=3, values=[fw_g0, bw_g0]) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(axis=3, values=[fw_g1, bw_g1]) with tf.variable_scope("output"): if config.model_name == "basic": logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell4_fw, d_cell4_bw, tf.concat([p0, g1, a1i, g1 * a1i], 3), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat([fw_g2, bw_g2], 3) logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits = tf.reshape(logits, [-1, M * JX]) flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] yp = tf.reshape(flat_yp, [-1, M, JX]) flat_logits2 = tf.reshape(logits2, [-1, M * JX]) flat_yp2 = tf.nn.softmax(flat_logits2) yp2 = tf.reshape(flat_yp2, [-1, M, JX]) 
self.tensor_dict['g1'] = g1 self.tensor_dict['g2'] = g2 self.logits = flat_logits self.logits2 = flat_logits2 self.yp = yp self.yp2 = yp2 elif config.model_name == "basic-class": C = 3 if config.data_dir.startswith('data/snli') else 2 (fw_g2, bw_g2) = (fw_g1, bw_g1) if config.classifier == 'maxpool': g2 = tf.concat([fw_g2, bw_g2], 3) # [N, M, JX, 2d] g2 = tf.reduce_max(g2, 2) # [N, M, 2d] g2_dim = 2 * d elif config.classifier == 'sumpool': g2 = tf.concat([fw_g2, bw_g2], 3) g2 = tf.reduce_sum(g2, 2) g2_dim = 2 * d else: fw_g2_ = tf.gather(tf.transpose(fw_g2, [2, 0, 1, 3]), JX - 1) bw_g2_ = tf.gather(tf.transpose(bw_g2, [2, 0, 1, 3]), 0) g2 = tf.concat([fw_g2_, bw_g2_], 2) g2_dim = 2 * d g2_ = tf.reshape(g2, [N, g2_dim]) logits0 = linear(g2_, C, True, wd=config.wd, input_keep_prob=config.input_keep_prob, is_train=self.is_train, scope='classifier') flat_yp0 = tf.nn.softmax(logits0) yp0 = tf.reshape(flat_yp0, [N, M, C]) self.tensor_dict['g1'] = g1 self.logits0 = logits0 self.yp0 = yp0 self.logits = logits0 self.yp = yp0
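# The basic-class branch reduces the [N, M, JX, 2d] modeling output to one
# vector per example before the linear classifier, either by max-pooling or
# sum-pooling over the token axis (or by taking the final forward/backward
# states). A NumPy sketch of the two pooling choices:
import numpy as np

def pool_tokens(g2, mode="maxpool"):
    """g2: [N, M, JX, 2d]; pool over the token axis JX -> [N, M, 2d]."""
    if mode == "maxpool":
        return g2.max(axis=2)
    if mode == "sumpool":
        return g2.sum(axis=2)
    raise ValueError(mode)

g2 = np.random.randn(2, 1, 7, 6)
assert pool_tokens(g2, "maxpool").shape == (2, 1, 6)
assert pool_tokens(g2, "sumpool").shape == (2, 1, 6)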
def _build_forward(self): with tf.variable_scope("emb"): with tf.variable_scope("bert_emb"): bs = self.doc.get_shape().as_list()[0] self.doc_bert_model = BertModel(config=self.bert_config, is_training=self.is_train, input_ids=self.doc, input_mask=self.doc_mask) tvars = tf.trainable_variables() initialized_variable_names = {} (assignment_map, initialized_variable_names ) = get_assignment_map_from_checkpoint( tvars, self.config.init_checkpoint) tf.train.init_from_checkpoint(self.config.init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) self.doc_bert_repre = self.doc_bert_model.get_sequence_output() with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if self.config.mode == 'train': assert self.emb_mat is not None word_emb_mat = tf.get_variable( "word_emb_mat", shape=[ self.vocab_size - np.shape(self.emb_mat)[0], self.config.word_emb_size ], dtype='float', trainable=True) pretrained_emb_mat = tf.get_variable( "pretrained_emb_mat", shape=[ np.shape(self.emb_mat)[0], self.config.word_emb_size ], dtype='float', initializer=get_initializer(self.emb_mat), trainable=self.config.fine_tune) else: word_emb_mat = tf.get_variable( "word_emb_mat", shape=[ self.vocab_size - np.shape(self.emb_mat)[0], self.config.word_emb_size ], dtype='float') pretrained_emb_mat = tf.get_variable( "pretrained_emb_mat", shape=[ np.shape(self.emb_mat)[0], self.config.word_emb_size ], dtype='float') self.word_emb_mat = tf.concat( values=[word_emb_mat, pretrained_emb_mat], axis=0) self.que_word_emb = tf.nn.embedding_lookup( self.word_emb_mat, self.que) # [N, M, d] self.ans_word_emb = tf.nn.embedding_lookup( self.word_emb_mat, self.c) # [N, 10, d] with tf.variable_scope("sentence_modeling"): self.que_repre, _ = ops.m_cudnn_lstm( inputs=self.que_word_emb, num_layers=1, hidden_size=self.config.hidden_size, weight_noise_std=None, is_training=self.is_train, scope='que', input_drop_prob=0, i_m="linear_input", use_gru=self.config.use_gru) # [N,M,d] self.attended_passage = self._bi_attention( self.config, self.is_train, self.doc_bert_repre, self.que_repre, h_mask=self.doc_mask, scope="bi_att", tensor_dict=self.tensor_dict) # Concatenate outputs and states of the forward and backward RNNs with tf.variable_scope("summary"): logit = tf.layers.dense(self.attended_passage, 1, activation=None) rc_logits = tf.nn.softmax(logit, 1) self.doc_final_repre = tf.reduce_sum(self.attended_passage * rc_logits, 1) # [N,dim] with tf.variable_scope('get_answer'): self.prediction = self._answer_layer(self.doc_final_repre, self.ans_word_emb)
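# The "summary" block collapses the attended passage [N, T, dim] into a single
# vector: a dense layer scores every position, the scores are softmaxed over
# the time axis, and the weighted sum is taken. A NumPy sketch of that
# self-attentive pooling, with the dense layer reduced to a single weight
# vector w (a placeholder, not a trained parameter):
import numpy as np

def attentive_summary(h, w, b=0.0):
    """h: [N, T, d]; w: [d] -> softmax-weighted sum over T -> [N, d]."""
    scores = h @ w + b                                   # [N, T]
    scores = scores - scores.max(axis=1, keepdims=True)
    alpha = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    return (h * alpha[..., None]).sum(axis=1)            # [N, d]

h = np.random.randn(3, 10, 8)
assert attentive_summary(h, np.random.randn(8)).shape == (3, 8)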
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size print("parameters", N, M, JX, JQ, VW, VC, d, W) JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list(map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat]) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat(axis=3, values=[xx, Ax]) # [N, M, JX, di] qq = tf.concat(axis=2, values=[qq, Aq]) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq # prepro layer cell cell_fw = BasicLSTMCell(d, state_is_tuple=True) cell_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell_fwh = BasicLSTMCell(d, state_is_tuple=True) cell_bwh = BasicLSTMCell(d, state_is_tuple=True) d_cell_fwh = SwitchableDropoutWrapper(cell_fwh, self.is_train, input_keep_prob=config.input_keep_prob) d_cell_bwh = SwitchableDropoutWrapper(cell_bwh, self.is_train, input_keep_prob=config.input_keep_prob) # attention layer cell cell2_fw = BasicLSTMCell(d, state_is_tuple=True) cell2_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob) 
d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell3_fw = BasicLSTMCell(d, state_is_tuple=True) cell3_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob) # out layer cell_bw cell4_fw = BasicLSTMCell(d, state_is_tuple=True) cell4_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_h, bw_h), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1') h = tf.concat(axis=3, values=[fw_h, bw_h]) (fw_u, bw_u), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(axis=2, values=[fw_u, bw_u]) # if config.share_lstm_weights: # tf.get_variable_scope().reuse_variables() # (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] # h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] # else: # (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] # h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h # h = tf.Print(h, [tf.reduce_max(h), tf.reduce_min(h), "h"], summarize=1000) with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell_fw = d_cell2_fw second_cell_fw = d_cell3_fw first_cell_bw = d_cell2_bw second_cell_bw = d_cell3_bw # p0 = tf.Print(p0, [p0, "lstm output-1:"], summarize=200) # two layer LSTM (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(axis=3, values=[fw_g0, bw_g0]) # g0 = tf.Print(g0, [g0, "lstm output0:"], summarize=200) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(axis=3, values=[fw_g1, bw_g1]) # g1 = tf.Print(g1, [g1, "lstm output:"], summarize=200) # output through a denselayer with tf.variable_scope("output"): #lstm_out = g1[:,0,-1,:] lstm_out = tf.reduce_sum(g1, axis = 2) lstm_out = 
tf.reshape(lstm_out, [-1, 2*d]) lstm_out = tf.nn.dropout(lstm_out, 0.5) #logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, # mask=self.x_mask, is_train=self.is_train, func='linear', scope='logits1') #print (logits) #lstm_out.set_shape([N, M*JX*2*d]) dense1 = tf.layers.dense(inputs=lstm_out, units=64, activation = tf.nn.relu, kernel_regularizer=tf.contrib.layers.l2_regularizer(0.003)) score = tf.layers.dense(inputs=dense1, units = 2, activation = None, kernel_regularizer=tf.contrib.layers.l2_regularizer(0.003)) self.probs = tf.nn.softmax(score) # self.tensor_dict['g1'] = g1 # self.tensor_dict['g2'] = g2 self.score = score
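# The output head above is: sum-pool the modeling output over tokens, apply
# dropout, a 64-unit ReLU layer, and a 2-way linear layer whose softmax
# becomes self.probs. A NumPy sketch of that head (weights are random
# stand-ins; dropout and L2 regularization are omitted):
import numpy as np

def dense_head(x, W1, b1, W2, b2):
    """x: [N, 2d] -> ReLU(x W1 + b1) W2 + b2 -> class probabilities [N, 2]."""
    hidden = np.maximum(x @ W1 + b1, 0.0)                # [N, 64]
    logits = hidden @ W2 + b2                            # [N, 2]
    logits = logits - logits.max(axis=1, keepdims=True)
    return np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)

rng = np.random.default_rng(1)
probs = dense_head(rng.normal(size=(4, 200)),
                   rng.normal(size=(200, 64)), np.zeros(64),
                   rng.normal(size=(64, 2)), np.zeros(2))
assert np.allclose(probs.sum(axis=1), 1.0)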
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W ,EW, WOW= \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size,config.word_vocab_size-config.vw_wo_entity_size,config.vw_wo_entity_size JX = tf.shape(self.x)[2] # words JQ = tf.shape(self.q)[1] # words M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size #print ("dhruv is here",N, self.x.get_shape(), JX, self.q.get_shape(), VW, VC, d, W,dc, dw, dco) with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list(map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': entity_emb_mat = tf.get_variable("entity_emb_mat", dtype='float', shape=[EW, EW], initializer=get_initializer(config.onehot_encoded)) entity_emb_out = _linear(entity_emb_mat, dw, True, bias_initializer=tf.constant_initializer(0.0)) word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[WOW, dw], initializer=get_initializer(config.emb_mat)) word_emb_mat = tf.concat(axis=0,values=[word_emb_mat, entity_emb_out]) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat]) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] i.e. [batch size, max sentences, max words, embedding size] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] i.e. 
[batch size, max words, embedding size] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat(axis=3, values=[xx, Ax]) # [N, M, JX, di] qq = tf.concat(axis=2, values=[qq, Aq]) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq #xx = tf.Print(xx,[tf.shape(xx),xx],message="DHRUV xx=",summarize=20) cell_fw = BasicLSTMCell(d, state_is_tuple=True) cell_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell2_fw = BasicLSTMCell(d, state_is_tuple=True) cell2_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell3_fw = BasicLSTMCell(d, state_is_tuple=True) cell3_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell4_fw = BasicLSTMCell(d, state_is_tuple=True) cell4_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(axis=2, values=[fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): if config.dynamic_att: # not true p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = 
attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) # p0 seems to be G in paper first_cell_fw = d_cell2_fw second_cell_fw = d_cell3_fw first_cell_bw = d_cell2_bw second_cell_bw = d_cell3_bw (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(axis=3, values=[fw_g0, bw_g0]) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(axis=3, values=[fw_g1, bw_g1]) # g1 seems to be M in paper g1= tf.Print(g1,[tf.shape(g1)],message="g1 shape",first_n=5,summarize=200) p0 = tf.Print(p0, [tf.shape(p0)], message="p0 shape", first_n=5, summarize=200) my_cell_fw = BasicLSTMCell(d, state_is_tuple=True) my_cell_fw_d = SwitchableDropoutWrapper(my_cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) my_cell_bw = BasicLSTMCell(d, state_is_tuple=True) my_cell_bw_d = SwitchableDropoutWrapper(my_cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) (fw_g11,bw_g11),(my_fw_final_state, my_bw_final_state),g11_len = my_bidirectional_dynamic_rnn(my_cell_fw_d, my_cell_bw_d, g1, x_len, dtype='float', scope='my_g2') # [N, M, JX, 2d] g11 = tf.concat(axis=2, values=[fw_g11, bw_g11]) my_encoder_final_state_c = tf.concat(values = (my_fw_final_state.c, my_bw_final_state.c), axis = 1, name = "my_encoder_final_state_c") my_encoder_final_state_h = tf.concat(values = (my_fw_final_state.h, my_bw_final_state.h), axis = 1, name = "my_encoder_final_state_h") my_encoder_final_state = tf.contrib.rnn.LSTMStateTuple(c = my_encoder_final_state_c, h = my_encoder_final_state_h) #compute indices for finding span as the second task in multi task learning logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') logits = tf.Print(logits, [tf.shape(logits)], message="logits shape", first_n=5, summarize=200) a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell4_fw, d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat(axis=3, values=[fw_g2, bw_g2]) logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits = tf.reshape(logits, [-1, M * JX]) flat_logits = tf.Print(flat_logits, [tf.shape(flat_logits),flat_logits], message="flat_logits shape and contents", first_n=5, summarize=200) flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] flat_logits2 = tf.reshape(logits2, [-1, M * JX]) flat_yp2 = tf.nn.softmax(flat_logits2) tgt_vocab_size = config.len_new_emb_mat # hparam # FIXME: Obtain embeddings differently? 
print("length is",config.len_new_emb_mat) tgt_embedding_size = dw # hparam # Look up embedding decoder_emb_inp = tf.nn.embedding_lookup(word_emb_mat, self.decoder_inputs) # [batch_size, max words, embedding_size] def decode_with_attention(helper, scope, reuse=None,maximum_iterations=None): with tf.variable_scope(scope, reuse=reuse): attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=d, memory=g11) cell = tf.contrib.rnn.GRUCell(num_units=d) attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism,attention_layer_size=d /2) out_cell = tf.contrib.rnn.OutputProjectionWrapper(attn_cell, tgt_vocab_size, reuse=reuse) decoder = tf.contrib.seq2seq.BasicDecoder(cell=out_cell, helper=helper,initial_state=out_cell.zero_state( dtype=tf.float32, batch_size=N)) # initial_state=encoder_final_state) outputs = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, output_time_major=False, impute_finished=True, maximum_iterations=maximum_iterations) return outputs[0] def decode(helper, scope, reuse=None, maximum_iterations=None): with tf.variable_scope(scope, reuse=reuse): decoder_cell = BasicLSTMCell(2 * d, state_is_tuple=True) # hparam projection_layer = layers_core.Dense(tgt_vocab_size, use_bias=False) # hparam decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, my_encoder_final_state,output_layer=projection_layer) # decoder final_outputs, _ ,_= tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=False, impute_finished=True, maximum_iterations=maximum_iterations) # dynamic decoding return final_outputs # Decoder if config.mode == 'train': #TODO:doesnt seem to be correct to use this variable for dev training_helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inp, self.target_sequence_length,time_major=False) #final_outputs = decode(helper=training_helper, scope="HAHA", reuse=None) final_outputs = decode_with_attention(helper=training_helper, scope="HAHA", reuse=None) else: training_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(word_emb_mat, tf.fill([N], self.tgt_sos_id),self.tgt_eos_id) #final_outputs= decode(helper=training_helper, scope="HAHA", reuse=True,maximum_iterations=100) final_outputs= decode_with_attention(helper=training_helper, scope="HAHA", reuse=True,maximum_iterations=100) self.decoder_logits_train = final_outputs.rnn_output self.index_start = flat_logits self.index_end = flat_logits2
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, dc, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.char_emb_size, config.max_word_size H = config.max_tree_height x_mask = self.x > 0 q_mask = self.q > 0 tx_mask = self.tx > 0 # [N, M, H, JX] with tf.variable_scope("char_emb"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] filter = tf.get_variable("filter", shape=[1, config.char_filter_height, dc, d], dtype='float') bias = tf.get_variable("bias", shape=[d], dtype='float') strides = [1, 1, 1, 1] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) xxc = tf.nn.conv2d(Acx, filter, strides, "VALID") + bias # [N*M, JX, W/filter_stride, d] qqc = tf.nn.conv2d(Acq, filter, strides, "VALID") + bias # [N, JQ, W/filter_stride, d] xxc = tf.reshape(tf.reduce_max(tf.nn.relu(xxc), 2), [-1, M, JX, d]) qqc = tf.reshape(tf.reduce_max(tf.nn.relu(qqc), 2), [-1, JQ, d]) with tf.variable_scope("word_emb"): if config.mode == 'train': word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, config.word_emb_size], initializer=get_initializer(config.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, config.word_emb_size], dtype='float') Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] # Ax = linear([Ax], d, False, scope='Ax_reshape') # Aq = linear([Aq], d, False, scope='Aq_reshape') xx = tf.concat(3, [xxc, Ax]) # [N, M, JX, 2d] qq = tf.concat(2, [qqc, Aq]) # [N, JQ, 2d] D = d + config.word_emb_size with tf.variable_scope("pos_emb"): pos_emb_mat = tf.get_variable("pos_emb_mat", shape=[config.pos_vocab_size, d], dtype='float') Atx = tf.nn.embedding_lookup(pos_emb_mat, self.tx) # [N, M, H, JX, d] cell = BasicLSTMCell(D, state_is_tuple=True) cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1) # [N] with tf.variable_scope("rnn"): (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='start') # [N, M, JX, 2d] tf.get_variable_scope().reuse_variables() (fw_us, bw_us), (_, (fw_u, bw_u)) = bidirectional_dynamic_rnn(cell, cell, qq, q_len, dtype='float', scope='start') # [N, J, d], [N, d] u = (fw_u + bw_u) / 2.0 h = (fw_h + bw_h) / 2.0 with tf.variable_scope("h"): no_op_cell = NoOpCell(D) tree_rnn_cell = TreeRNNCell(no_op_cell, d, tf.reduce_max) initial_state = tf.reshape(h, [N*M*JX, D]) # [N*M*JX, D] inputs = tf.concat(4, [Atx, tf.cast(self.tx_edge_mask, 'float')]) # [N, M, H, JX, d+JX] inputs = tf.reshape(tf.transpose(inputs, [0, 1, 3, 2, 4]), [N*M*JX, H, d + JX]) # [N*M*JX, H, d+JX] length = tf.reshape(tf.reduce_sum(tf.cast(tx_mask, 'int32'), 2), [N*M*JX]) # length = tf.reshape(tf.reduce_sum(tf.cast(tf.transpose(tx_mask, [0, 1, 3, 2]), 'float'), 3), [-1]) h, _ = dynamic_rnn(tree_rnn_cell, inputs, length, initial_state=initial_state) # [N*M*JX, H, D] h = tf.transpose(tf.reshape(h, [N, M, JX, H, D]), [0, 1, 3, 2, 4]) # [N, M, H, JX, D] u = tf.expand_dims(tf.expand_dims(tf.expand_dims(u, 1), 1), 1) # [N, 1, 1, 1, 4d] dot = linear(h * u, 1, True, squeeze=True, scope='dot') # [N, M, H, JX] # 
self.logits = tf.reshape(dot, [N, M * H * JX]) self.logits = tf.reshape(exp_mask(dot, tx_mask), [N, M * H * JX]) # [N, M, H, JX] self.yp = tf.reshape(tf.nn.softmax(self.logits), [N, M, H, JX])
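# The character encoder above slides a conv filter of height
# config.char_filter_height over each word's characters, applies ReLU, and
# max-pools over the remaining positions, so every word gets a fixed-size
# vector regardless of its length. A NumPy sketch of that ReLU + max-over-time
# step on precomputed convolution activations:
import numpy as np

def max_over_time(conv_out):
    """conv_out: [num_words, positions, d] conv features -> [num_words, d]."""
    return np.maximum(conv_out, 0.0).max(axis=1)

conv_out = np.random.randn(6, 12, 100)   # 6 words, 12 filter positions, 100 filters
word_vecs = max_over_time(conv_out)
assert word_vecs.shape == (6, 100)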
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ) JA = config.max_answer_length JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ) dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): # Char-CNN Embedding if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list( map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) # Word Embedding if config.use_word_emb: with tf.variable_scope("emb_var") as scope, tf.device( "/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable( "word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat)) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') tf.get_variable_scope().reuse_variables() self.word_emb_scope = scope if config.use_glove_for_unk: word_emb_mat = tf.concat( [word_emb_mat, self.new_emb_mat], 0) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq # Concat Char-CNN Embedding and Word Embedding if config.use_char_emb: xx = tf.concat([xx, Ax], 3) # [N, M, JX, di] qq = tf.concat([qq, Aq], 2) # [N, JQ, di] else: xx = Ax qq = Aq # exact match if config.use_exact_match: emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1) xx = tf.concat([xx, emx], 3) # [N, M, JX, di+1] emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1) qq = tf.concat([qq, emq], 2) # [N, JQ, di+1] # 2 layer highway network on Concat Embedding if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq # Bidirection-LSTM (3rd layer on paper) cell = GRUCell(d) if config.GRU else BasicLSTMCell(d, state_is_tuple=True) d_cell = SwitchableDropoutWrapper( cell, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with 
tf.variable_scope("prepro"): (fw_u, bw_u), _ = bidirectional_dynamic_rnn( d_cell, d_cell, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat([fw_u, bw_u], 2) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell, cell, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat([fw_h, bw_h], 3) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell, cell, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat([fw_h, bw_h], 3) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h # Attention Flow Layer (4th layer on paper) with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape( tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell = AttentionCell( cell, u, size=d, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell = d_cell # Modeling layer (5th layer on paper) tp0 = p0 for layer_idx in range(config.LSTM_num_layers - 1): (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn( first_cell, first_cell, p0, x_len, dtype='float', scope="g_{}".format(layer_idx)) # [N, M, JX, 2d] p0 = tf.concat([fw_g0, bw_g0], 3) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn( first_cell, first_cell, p0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat([fw_g1, bw_g1], 3) # [N, M, JX, 2d] # Self match layer with tf.variable_scope("SelfMatch"): s0 = tf.reshape(g1, [N * M, JX, 2 * d]) # [N * M, JX, 2d] x_mask = tf.reshape(self.x_mask, [N * M, JX]) first_cell = AttentionCell(cell, s0, size=d, mask=x_mask, is_train=self.is_train) (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn( first_cell, first_cell, s0, x_len, dtype='float', scope='s') # [N, M, JX, 2d] s1 = tf.concat([fw_s, bw_s], 2) # [N * M, JX, 2d], M == 1 # prepare for PtrNet encoder_output = tf.expand_dims(s1, 1) # [N, M, JX, 2d] encoder_output = tf.expand_dims( tf.cast(self.x_mask, tf.float32), -1) * encoder_output # [N, M, JX, 2d] if config.GRU: encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat') else: if isinstance(fw_s_f, LSTMStateTuple): encoder_state_c = tf.concat((fw_s_f.c, bw_s_f.c), 1, name='encoder_concat_c') encoder_state_h = tf.concat((fw_s_f.h, bw_s_f.h), 1, name='encoder_concat_h') encoder_state_final = LSTMStateTuple(c=encoder_state_c, h=encoder_state_h) elif isinstance(fw_s_f, tf.Tensor): encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat') else: encoder_state_final = None tf.logging.error("encoder_state_final not set") print("encoder_state_final:", encoder_state_final) with tf.variable_scope("output"): # eos_symbol = config.eos_symbol # next_symbol = config.next_symbol tf.assert_equal( M, 1) # currently dynamic M is not supported, thus we assume M==1 answer_string = tf.placeholder( shape=(N, 1, JA + 1), dtype=tf.int32, name='answer_string') # [N, M, JA + 1] answer_string_mask = tf.placeholder( shape=(N, 1, JA + 1), dtype=tf.bool, name='answer_string_mask') # [N, M, JA + 1] answer_string_length = tf.placeholder( shape=(N, 1), dtype=tf.int32, name='answer_string_length', ) # [N, M] self.tensor_dict['answer_string'] = answer_string self.tensor_dict['answer_string_mask'] = answer_string_mask 
self.tensor_dict['answer_string_length'] = answer_string_length self.answer_string = answer_string self.answer_string_mask = answer_string_mask self.answer_string_length = answer_string_length answer_string_flattened = tf.reshape(answer_string, [N * M, JA + 1]) self.answer_string_flattened = answer_string_flattened # [N * M, JA+1] print("answer_string_flattened:", answer_string_flattened) answer_string_length_flattened = tf.reshape( answer_string_length, [N * M]) self.answer_string_length_flattened = answer_string_length_flattened # [N * M] print("answer_string_length_flattened:", answer_string_length_flattened) decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell( 2 * d, state_is_tuple=True) with tf.variable_scope("Decoder"): decoder_train_logits = ptr_decoder( decoder_cell, tf.reshape(tp0, [N * M, JX, 2 * d]), # [N * M, JX, 2d] tf.reshape(encoder_output, [N * M, JX, 2 * d]), # [N * M, JX, 2d] encoder_final_state=encoder_state_final, max_encoder_length=config.sent_size_th, decoder_output_length= answer_string_length_flattened, # [N * M] batch_size=N, # N * M (M=1) attention_proj_dim=self.config.decoder_proj_dim, scope='ptr_decoder' ) # [batch_size, dec_len*, enc_seq_len + 1] self.decoder_train_logits = decoder_train_logits print("decoder_train_logits:", decoder_train_logits) self.decoder_train_softmax = tf.nn.softmax( self.decoder_train_logits) self.decoder_inference = tf.argmax( decoder_train_logits, axis=2, name='decoder_inference') # [N, JA + 1] self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1 self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
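# The pointer decoder above emits, at every step, a distribution over the
# enc_seq_len + 1 encoder positions (the extra slot acting as a stop symbol),
# and inference takes the argmax index. ptr_decoder itself is not shown; the
# NumPy sketch below is a generic additive-attention pointer step with made-up
# weights, only to illustrate the idea:
import numpy as np

def pointer_step(dec_state, enc_outputs, W_d, W_e, v):
    """Score every encoder position (+1 'stop' slot) and return the argmax."""
    T = enc_outputs.shape[0]
    scores = np.empty(T + 1)
    for t in range(T):
        scores[t] = v @ np.tanh(W_d @ dec_state + W_e @ enc_outputs[t])
    scores[T] = v @ np.tanh(W_d @ dec_state)             # 'stop' slot
    return int(np.argmax(scores)), scores

rng = np.random.default_rng(2)
d = 8
idx, _ = pointer_step(rng.normal(size=d), rng.normal(size=(20, d)),
                      rng.normal(size=(d, d)), rng.normal(size=(d, d)),
                      rng.normal(size=d))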
def _build_forward(self): #import pdb #pdb.set_trace() with tf.variable_scope("emb"): with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if self.config.mode == 'train': assert self.emb_mat is not None word_emb_mat = tf.get_variable( "word_emb_mat", shape=[ self.vocab_size - np.shape(self.emb_mat)[0], self.config.word_emb_size ], dtype='float', trainable=True) pretrained_emb_mat = tf.get_variable( "pretrained_emb_mat", shape=[ np.shape(self.emb_mat)[0], self.config.word_emb_size ], dtype='float', initializer=get_initializer(self.emb_mat), trainable=self.config.fine_tune) else: word_emb_mat = tf.get_variable( "word_emb_mat", shape=[ self.vocab_size - np.shape(self.emb_mat)[0], self.config.word_emb_size ], dtype='float') pretrained_emb_mat = tf.get_variable( "pretrained_emb_mat", shape=[ np.shape(self.emb_mat)[0], self.config.word_emb_size ], dtype='float') word_emb_mat = tf.concat( values=[word_emb_mat, pretrained_emb_mat], axis=0) with tf.name_scope("word"): Aq = tf.nn.embedding_lookup(word_emb_mat, self.que) # [N, JQ, d] candidates = tf.nn.embedding_lookup(word_emb_mat, self.c) Aq = nn.dropout(Aq, self.config.keep_prob, self.is_train) question, _ = ops.m_cudnn_lstm( inputs=Aq, num_layers=1, hidden_size=self.config.hidden_size, weight_noise_std=None, is_training=self.is_train, scope='question', input_drop_prob=0, i_m="linear_input", use_gru=self.config.use_gru) # [N,M,JX,d] question = nn.dropout(question, self.config.keep_prob, self.is_train) with tf.variable_scope("summary"): rc_logits = tf.layers.dense(question, 1, activation=None) rc_logits = nn.dropout(rc_logits, self.config.keep_prob, self.is_train) rc_logits = tf.nn.softmax(rc_logits, 1) question = tf.reduce_sum(question * rc_logits, 1) # [N,dim] with tf.variable_scope('get_answer'): self.prediction = self._answer_layer(question, candidates)
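# This question-only baseline summarizes the question into one vector and
# scores it against the candidate embeddings in _answer_layer, whose body is
# not shown here; a natural reading is a dot product followed by a softmax
# over candidates, sketched below in NumPy under that assumption:
import numpy as np

def score_candidates(question_vec, candidate_embs):
    """question_vec: [d]; candidate_embs: [C, d] -> probabilities over C."""
    logits = candidate_embs @ question_vec               # [C]
    logits = logits - logits.max()
    return np.exp(logits) / np.exp(logits).sum()

probs = score_candidates(np.random.randn(16), np.random.randn(10, 16))
assert probs.shape == (10,) and np.isclose(probs.sum(), 1.0)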
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_char_emb: # compute char embeddings with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list( map(int, config.out_channel_dims.split(','))) # TODO What? heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: with tf.variable_scope(tf.get_variable_scope(), reuse=True): qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable( "word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer( config.emb_mat)) # emb_mat is glove else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat( [word_emb_mat, self.new_emb_mat], 0) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat([xx, Ax], 3) # [N, M, JX, di] qq = tf.concat([qq, Aq], 2) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) with tf.variable_scope(tf.get_variable_scope(), reuse=True): qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq cell = BasicLSTMCell(d, state_is_tuple=True) d_cell = SwitchableDropoutWrapper( cell, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn( d_cell, d_cell, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat([fw_u, bw_u], 2) if config.share_lstm_weights: with tf.variable_scope(tf.get_variable_scope(), reuse=True): (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell, cell, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] TODO JX == x_len? 
h = tf.concat([fw_h, bw_h], 3) # [N, M, JX, 2d] else: (fw_h, bw_h), _ = bidirectional_dynamic_rnn( cell, cell, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat([fw_h, bw_h], 3) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape( tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell = AttentionCell( cell, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell = d_cell (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn( first_cell, first_cell, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat([fw_g0, bw_g0], 3) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn( first_cell, first_cell, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat([fw_g1, bw_g1], 3) # Output Layer logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn( d_cell, d_cell, tf.concat([p0, g1, a1i, g1 * a1i], 3), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat([fw_g2, bw_g2], 3) logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits = tf.reshape(logits, [-1, M * JX]) flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] yp = tf.reshape(flat_yp, [-1, M, JX]) flat_logits2 = tf.reshape(logits2, [-1, M * JX]) flat_yp2 = tf.nn.softmax(flat_logits2) yp2 = tf.reshape(flat_yp2, [-1, M, JX]) self.tensor_dict['g1'] = g1 self.tensor_dict['g2'] = g2 self.logits = flat_logits self.logits2 = flat_logits2 self.yp = yp self.yp2 = yp2
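# softsel above turns the start logits into attention weights over all M*JX
# positions and uses them to pool g1 into a single vector a1i, which is then
# tiled and fed into the second-pass LSTM for the end pointer. Assuming the
# usual softmax-weighted-sum implementation, a NumPy sketch:
import numpy as np

def softsel(target, logits):
    """target: [N, T, d]; logits: [N, T] -> softmax-weighted sum -> [N, d]."""
    logits = logits - logits.max(axis=1, keepdims=True)
    alpha = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    return (target * alpha[..., None]).sum(axis=1)

a1i = softsel(np.random.randn(2, 15, 200), np.random.randn(2, 15))
assert a1i.shape == (2, 200)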
def _build_forward(self):
    config = self.config
    N = config.batch_size
    M = config.max_num_sents
    JX = config.max_sent_size
    JQ = config.max_ques_size
    VW = config.word_vocab_size
    VC = config.char_vocab_size
    W = config.max_word_size
    d = config.hidden_size
    JX = tf.shape(self.x)[2]  # JX: max sentence length
    JQ = tf.shape(self.q)[1]  # JQ: max question length
    M = tf.shape(self.x)[1]   # M: max number of sentences
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
    # dc = 8: each char is mapped to an 8-dim vector ("char-level word embedding size [100]")

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
                # [330, 8]: a matrix mapping each char to its 8-dim vector
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc], e.g. 60, None, None, 16, 8
                # N: batch size, M: max number of sentences, JX: max sentence length,
                # W: max word length, dc: char vector size; maps each char to a vector
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                # JQ: max question length, W: max word length; maps each char in the question to a vector
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])  # [max question length, max_word_size (16), char_emb_size (8)]

                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                # here there are 100 filters, each of width 5; different heights can have
                # different filter counts, but here it is just 100 width-5 filters
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])  # xx and qq are the CNN outputs

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw],
                                                   initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # create a new word embedding or use the GloVe one?
                    word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
    # The CNN and word embeddings together represent each word with a 200-unit vector, so
    # xx is (batch_size, #sentences, #words, embedding) and qq is (batch_size, #words, embedding).

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq  # same shape as above

    cell = BasicLSTMCell(d, state_is_tuple=True)  # d = 100 hidden units
    d_cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M], e.g. [60, ?]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N], e.g. [60]
    # the masks are booleans; summing the True entries gives the true lengths

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat([fw_u, bw_u], 2)  # (60, ?, 200); 200 because the two 100-dim hidden states are concatenated
        if config.share_lstm_weights:
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u  # [60, ?, 200] for the question
        self.tensor_dict['h'] = h  # [60, ?, ?, 200] for the article

    with tf.variable_scope("main"):
        if config.dynamic_att:  # TODO: what is this dynamic attention?
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell = AttentionCell(cell, u, mask=q_mask, mapper='sim',
                                       input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            cell2 = BasicLSTMCell(d, state_is_tuple=True)  # d = 100 hidden units
            first_cell = SwitchableDropoutWrapper(cell2, self.is_train, input_keep_prob=config.input_keep_prob)

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell, first_cell, inputs=p0, sequence_length=x_len,
                                                      dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat([fw_g0, bw_g0], 3)

        cell3 = BasicLSTMCell(d, state_is_tuple=True)  # d = 100 hidden units
        first_cell3 = SwitchableDropoutWrapper(cell3, self.is_train, input_keep_prob=config.input_keep_prob)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(first_cell3, first_cell3, g0, x_len,
                                                      dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat([fw_g1, bw_g1], 3)

        logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

        cell4 = BasicLSTMCell(d, state_is_tuple=True)  # d = 100 hidden units
        first_cell4 = SwitchableDropoutWrapper(cell4, self.is_train, input_keep_prob=config.input_keep_prob)
        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(first_cell4, first_cell4, tf.concat([p0, g1, a1i, g1 * a1i], 3),
                                                      x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat([fw_g2, bw_g2], 3)
        logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [-1, M, JX])
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2
        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
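# softsel(target, logits), used in the answer layer above, comes from the BiDAF utility code.
# Conceptually it is a softmax-weighted sum of `target` over its second-to-last axis. Below is
# a minimal unmasked sketch of that operation (the original helper also supports masking); it
# assumes `tensorflow` is imported as `tf`, as elsewhere in this file.
def softsel_sketch(target, logits):
    """target: [..., J, d], logits: [..., J] -> softmax(logits)-weighted sum over J, shape [..., d]."""
    weights = tf.nn.softmax(logits)                                  # [..., J]
    return tf.reduce_sum(tf.expand_dims(weights, -1) * target, -2)   # [..., d]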
def _build_forward(self):
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    JQ = JX
    print('VW:{} NEW_EMB:{}'.format(VW, self.new_emb_mat.get_shape()))
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw],
                                                   initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
            xx = tf.reshape(xx, [-1, M, JX, d])
            qq = tf.reshape(qq, [-1, JQ, d])

        if config.use_pos_emb:
            with tf.variable_scope("pos_onehot"), tf.device("/cpu:0"):
                pos_x = tf.one_hot(self.x_pos, depth=config.pos_tag_num)  # [N, M, JX, depth]
                pos_q = tf.one_hot(self.q_pos, depth=config.pos_tag_num)  # [N, JQ, depth]
                xx = tf.concat(axis=3, values=[xx, pos_x])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, pos_q])
        if config.use_sem_emb:
            with tf.variable_scope("sem_onehot"), tf.device("/cpu:0"):
                sem_x = tf.one_hot(self.x_sem, depth=3)  # [N, M, JX, 3]
                sem_q = tf.one_hot(self.q_sem, depth=3)  # [N, JQ, 3]
                xx = tf.concat(axis=3, values=[xx, sem_x])
                qq = tf.concat(axis=2, values=[qq, sem_q])
        if config.use_neg_emb:
            with tf.variable_scope("neg_onehot"), tf.device("/cpu:0"):
                neg_x = tf.one_hot(self.x_neg, depth=2)  # [N, M, JX, 2]
                neg_q = tf.one_hot(self.q_neg, depth=2)  # [N, JQ, 2]
                xx = tf.concat(axis=3, values=[xx, neg_x])
                qq = tf.concat(axis=2, values=[qq, neg_q])

    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell_fw2 = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw2 = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw2 = SwitchableDropoutWrapper(cell_fw2, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw2 = SwitchableDropoutWrapper(cell_bw2, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    if config.lstm:
        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
                d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
            print('fw_u_f shape:{}'.format(fw_u_f.get_shape()))
            u = tf.concat(axis=2, values=[fw_u, bw_u])  # [N, JQ, 2d]
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
                print('fw_u_f nn shape:{}'.format(fw_u_f.get_shape()))
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h
    else:
        h = xx
        u = qq

    h1 = h[:, 0, :, :]
    h2 = h[:, 1, :, :]
    h3 = h[:, 2, :, :]
    h4 = h[:, 3, :, :]
    n_1 = tf.reshape(self.x_mask[:, 0, :], [N, JX])
    n_2 = tf.reshape(self.x_mask[:, 1, :], [N, JX])
    n_3 = tf.reshape(self.x_mask[:, 2, :], [N, JX])
    n_4 = tf.reshape(self.x_mask[:, 3, :], [N, JX])

    if config.self_attention:
        with tf.variable_scope("h_self_weight"):
            print(h.get_shape())
            for i in range(2):
                with tf.variable_scope("self-attention"):
                    h1 = self_attention_layer(config, self.is_train, h1, p_mask=tf.expand_dims(n_1, -1),
                                              scope="{}_layer_self_att_enc_e".format(i))  # [N, len, dim]
                    tf.get_variable_scope().reuse_variables()
                    h2 = self_attention_layer(config, self.is_train, h2, p_mask=tf.expand_dims(n_2, -1),
                                              scope="{}_layer_self_att_enc_e".format(i))
                    tf.get_variable_scope().reuse_variables()
                    h3 = self_attention_layer(config, self.is_train, h3, p_mask=tf.expand_dims(n_3, -1),
                                              scope="{}_layer_self_att_enc_e".format(i))
                    tf.get_variable_scope().reuse_variables()
                    h4 = self_attention_layer(config, self.is_train, h4, p_mask=tf.expand_dims(n_4, -1),
                                              scope="{}_layer_self_att_enc_e".format(i))
                with tf.variable_scope("self-attention"):
                    u = self_attention_layer(config, self.is_train, u, p_mask=tf.expand_dims(self.q_mask, -1),
                                             scope="{}_layer_self_att_enc_p".format(i))

    if config.plot_encoder == "concate":
        h = tf.concat([h1, h2, h3, h4], axis=1)
        print("h concat shape {}".format(h.get_shape()))
        n_n = tf.concat([n_1, n_2, n_3, n_4], axis=1)
    elif config.plot_encoder == "sum":
        h1 = tf.expand_dims(h1, axis=1)
        h2 = tf.expand_dims(h2, axis=1)
        h3 = tf.expand_dims(h3, axis=1)
        h4 = tf.expand_dims(h4, axis=1)
        h = tf.concat([h1, h2, h3, h4], axis=1)
        h = tf.reduce_sum(h, axis=1)
        print("h sum shape {}".format(h.get_shape()))
    elif config.plot_encoder == "lstm":
        # h1 = tf.reduce_sum(h1, axis=1)
        h1 = tf.expand_dims(tf.reduce_sum(h1, axis=-1), axis=1)
        h2 = tf.expand_dims(tf.reduce_sum(h2, axis=-1), axis=1)
        h3 = tf.expand_dims(tf.reduce_sum(h3, axis=-1), axis=1)
        h4 = tf.expand_dims(tf.reduce_sum(h4, axis=-1), axis=1)
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw2, d_cell_bw2, tf.concat([h1, h2, h3, h4], axis=1), dtype='float', scope='1')  # [N, J, d], [N, d]
        print('fw_u_f shape:{}'.format(fw_u_f.get_shape()))
        h = tf.concat(axis=2, values=[fw_u, bw_u])  # [N, JQ, 2d]
        u = tf.expand_dims(tf.reduce_sum(u, axis=-1), axis=1)
        tf.get_variable_scope().reuse_variables()
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw2, d_cell_bw2, tf.concat([u], axis=1), dtype='float', scope='1')  # [N, J, d], [N, d]
        print('fw_u_f shape:{}'.format(fw_u_f.get_shape()))
        u = tf.concat(axis=2, values=[fw_u, bw_u])  # [N, JQ, 2d]

    if config.interact:
        with tf.variable_scope("interact"):

            def get_attention(h, u, m):
                JX = tf.shape(h)[1]
                JQ = tf.shape(u)[1]
                h = tf.expand_dims(h, 2)
                u = tf.expand_dims(u, 1)
                h = tf.tile(h, [1, 1, JQ, 1])
                u = tf.tile(u, [1, JX, 1, 1])
                attention = h * u  # [N, JX, JQ, 2d]
                return attention

            if config.plot_encoder == "concate":
                attention = get_attention(h, u, M)
            else:
                attention = get_attention(h, u, 1)
        with tf.variable_scope('conv_dense'):
            if config.plot_encoder == "concate":
                out_final = dense_net(config, attention, self.is_train)
            else:
                out_final = tf.reshape(attention, shape=[N, -1])
    else:
        h = tf.reshape(h, [-1, M * 2 * d * JX])
        print("h shape {}".format(h.get_shape()))
        u = tf.reshape(u, [-1, 2 * d * JQ])
        print("U shape {}".format(u.get_shape()))
        attention = tf.concat([h, u], axis=-1)
        out_final = attention
        out_final = linear(tf.concat([attention], axis=-1), 1000, True, bias_start=0.0, scope="logit8",
                           squeeze=False, wd=config.wd, input_keep_prob=config.output_keep_pro,
                           is_train=self.is_train)
        out_final = tf.nn.relu(out_final)
        out_final = linear(tf.concat([out_final], axis=-1), 400, True, bias_start=0.0, scope="logit9",
                           squeeze=False, wd=config.wd, input_keep_prob=config.output_keep_pro,
                           is_train=self.is_train)
        out_final = tf.nn.relu(out_final)
        out_final = linear(out_final, 300, True, bias_start=0.0, scope="logit3", squeeze=False,
                           wd=config.wd, input_keep_prob=config.output_keep_pro, is_train=self.is_train)
        out_final = tf.nn.relu(out_final)

    with tf.variable_scope('conv_dense'):
        if config.hao:
            out_final = linear(tf.concat([out_final, self.haoruopeng_feature], axis=-1), 200, True,
                               bias_start=0.0, scope="logit", squeeze=False, wd=config.wd,
                               input_keep_prob=config.output_keep_pro, is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
            out_final = linear(out_final, 100, True, bias_start=0.0, scope="logit3", squeeze=False,
                               wd=config.wd, input_keep_prob=config.output_keep_pro, is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
        else:
            out_final = linear(tf.concat([out_final], axis=-1), 200, True, bias_start=0.0, scope="logit",
                               squeeze=False, wd=config.wd, input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = linear(out_final, 100, True, bias_start=0.0, scope="logit3", squeeze=False,
                               wd=config.wd, input_keep_prob=config.output_keep_pro, is_train=self.is_train)
            out_final = tf.nn.relu(out_final)

    self.tensor_dict['outfinal'] = out_final
    self.prediction = linear(tf.concat([out_final], axis=-1), 1, True, bias_start=0.0, scope="logit2",
                             squeeze=False, wd=config.wd, input_keep_prob=config.output_keep_pro,
                             is_train=self.is_train)
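# get_attention above builds the element-wise interaction tensor by tiling h over JQ and u over
# JX before multiplying. The same [N, JX, JQ, 2d] result can be sketched with broadcasting
# alone, which avoids materializing the tiled copies; this is an equivalent formulation for
# illustration, not a change to the model (the unused `m` argument is dropped here).
def get_attention_broadcast(h, u):
    """h: [N, JX, 2d], u: [N, JQ, 2d] -> element-wise interaction tensor [N, JX, JQ, 2d]."""
    return tf.expand_dims(h, 2) * tf.expand_dims(u, 1)  # [N, JX, 1, 2d] * [N, 1, JQ, 2d]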
def _build_forward(self):
    # config holds the pre-configured hyperparameters, etc.
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    # Embedding layer
    with tf.variable_scope("emb"):
        # Character embedding layer
        if config.use_char_emb:  # if character embeddings are enabled
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # CNN filter parameters
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word embedding layer
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw],
                                                   initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:  # if using the pre-trained word-embedding file
                    word_emb_mat = tf.concat(0, [word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                # Map the context x and the query q to word vectors;
                # embedding_lookup(params, ids) gathers the rows of params indexed by ids.
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # If character embeddings were used, concatenate them with the word embeddings
                # along the given axis.
                xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # Two highway-network layers yield the context vector X ∈ R^(d×T) and the query vector Q ∈ R^(d×J).
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(d, state_is_tuple=True)
    # SwitchableDropoutWrapper is a custom DropoutWrapper class.
    d_cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # reduce_sum sums over the given axis (the number of non-padding tokens in x and q);
    # cast converts the tensor to the given dtype (here x_mask to int32).
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    # Contextual Embedding Layer: run BiLSTMs over X and Q to capture the local
    # interactions among the words within each of them.
    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        # fw_u and bw_u are the outputs of the forward and backward LSTMs.
        u = tf.concat(2, [fw_u, bw_u])  # [N, J, 2d]
        if config.share_lstm_weights:
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    # Core layer: Attention Flow Layer
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # expand_dims() inserts a dimension at the given position; tile() replicates the
            # tensor along the given dimensions. Add a dimension at index 1 and replicate it
            # M times (the max number of sentences in the context) so u matches h's shape.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell = AttentionCell(cell, u, mask=q_mask, mapper='sim',
                                       input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask,
                                 scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell, first_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(3, [fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(first_cell, first_cell, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(3, [fw_g1, bw_g1])

        logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell, d_cell, tf.concat(3, [p0, g1, a1i, g1 * a1i]),
                                                      x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(3, [fw_g2, bw_g2])
        logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [-1, M, JX])
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2
        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
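# attention_layer above implements BiDAF's bi-directional attention. The sketch below shows the
# standard context-to-query (C2Q) and query-to-context (Q2C) computation for a single flattened
# passage, using a plain dot-product similarity instead of the trilinear similarity used by the
# real layer; it is an illustrative simplification, not the exact attention_layer.
def bi_attention_sketch(h, u, h_mask, u_mask):
    """h: [N, JX, 2d] context, u: [N, JQ, 2d] query, h_mask: [N, JX], u_mask: [N, JQ] (booleans)."""
    s = tf.matmul(h, u, transpose_b=True)                                  # similarity [N, JX, JQ]
    s += (tf.expand_dims(tf.cast(u_mask, 'float'), 1) - 1.0) * 1e30        # mask padded query words
    u_a = tf.matmul(tf.nn.softmax(s), u)                                   # C2Q attended query [N, JX, 2d]
    b = tf.nn.softmax(tf.reduce_max(s, 2) + (tf.cast(h_mask, 'float') - 1.0) * 1e30)  # [N, JX]
    h_b = tf.reduce_sum(tf.expand_dims(b, -1) * h, 1)                      # Q2C summary [N, 2d]
    h_b = tf.tile(tf.expand_dims(h_b, 1), [1, tf.shape(h)[1], 1])          # broadcast back to [N, JX, 2d]
    return tf.concat([h, u_a, h * u_a, h * h_b], axis=-1)                  # G: [N, JX, 8d]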