def att_match(mid, pat, mid_mask, pat_mask, hidden, keep_prob, is_train):
    mid_d = dropout(mid, keep_prob=keep_prob, is_train=is_train)
    pat_d = dropout(pat, keep_prob=keep_prob, is_train=is_train)
    mid_a = attention(mid_d, hidden, mask=mid_mask)
    pat_a = attention(pat_d, hidden, mask=pat_mask)
    mid_v = tf.reduce_sum(tf.expand_dims(mid_a, axis=2) * mid, axis=1)
    pat_v = tf.reduce_sum(tf.expand_dims(pat_a, axis=2) * pat, axis=1)
    pat_v_d = tf.reduce_sum(tf.expand_dims(pat_a, axis=2) * pat_d, axis=1)
    sur_sim = cosine(mid_v, pat_v_d)
    pat_sim = cosine(pat_v, pat_v_d)
    return sur_sim, pat_sim
def mean_match(mid, pat, mid_mask, pat_mask, keep_prob, is_train):
    pat_d = dropout(pat, keep_prob=keep_prob, is_train=is_train)
    mid_v = mean(mid, mask=mid_mask)
    pat_v = mean(pat, mask=pat_mask)
    pat_v_d = mean(pat_d, mask=pat_mask)
    sur_sim = cosine(mid_v, pat_v, weighted=False)
    pat_sim = cosine(pat_v, pat_v_d, weighted=False)
    return sur_sim, pat_sim
def lstm_match(mid, pat, mid_mask, pat_mask, mid_len, pat_len, hidden, keep_prob, is_train):
    rnn = Cudnn_RNN(num_layers=1, num_units=hidden // 2)
    mid, _ = rnn(mid, seq_len=mid_len, concat_layers=False)
    pat, _ = rnn(pat, seq_len=pat_len, concat_layers=False)
    mid_d = dropout(mid, keep_prob=keep_prob, is_train=is_train)
    pat_d = dropout(pat, keep_prob=keep_prob, is_train=is_train)
    mid_a = attention(mid_d, hidden, mask=mid_mask)
    pat_a = attention(pat_d, hidden, mask=pat_mask)
    mid_v = tf.reduce_sum(tf.expand_dims(mid_a, axis=2) * mid, axis=1)
    pat_v = tf.reduce_sum(tf.expand_dims(pat_a, axis=2) * pat, axis=1)
    pat_v_d = tf.reduce_sum(tf.expand_dims(pat_a, axis=2) * pat_d, axis=1)
    sur_sim = cosine(mid_v, pat_v_d)
    pat_sim = cosine(pat_v, pat_v_d)
    return sur_sim, pat_sim
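# Illustrative sketch (not part of the model): how masked attention pooling and
# cosine matching combine, as in att_match/lstm_match above, using NumPy stand-ins.
# The linear scoring below is an assumed stand-in for the repo's `attention` helper,
# and the cosine here is elementwise per pair; the repo's `cosine` may instead
# return an all-pairs similarity matrix.
def _demo_masked_attention_cosine():
    import numpy as np

    def masked_softmax(scores, mask):
        # Push padded positions to -inf before the softmax so they get ~0 weight.
        scores = np.where(mask, scores, -1e30)
        e = np.exp(scores - scores.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    rng = np.random.RandomState(0)
    batch, seq_len, dim = 2, 5, 8
    mid = rng.randn(batch, seq_len, dim)           # token representations
    pat = rng.randn(batch, seq_len, dim)           # pattern representations
    mid_mask = np.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]], dtype=bool)
    pat_mask = np.ones((batch, seq_len), dtype=bool)

    w = rng.randn(dim)                             # assumed linear scorer
    mid_a = masked_softmax(mid @ w, mid_mask)      # [batch, seq_len]
    pat_a = masked_softmax(pat @ w, pat_mask)

    # Attention-pooled vectors, mirroring the expand_dims / reduce_sum pattern above.
    mid_v = (mid_a[..., None] * mid).sum(axis=1)   # [batch, dim]
    pat_v = (pat_a[..., None] * pat).sum(axis=1)

    def cosine(a, b):
        return (a * b).sum(-1) / (np.linalg.norm(a, axis=-1) * np.linalg.norm(b, axis=-1) + 1e-8)

    return cosine(mid_v, pat_v)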
def __init__(self, cell, num_layers, num_units, batch_size, input_size,
             keep_prob=1.0, is_train=None, scope="native_rnn"):
    self.num_layers = num_layers
    self.cell_type = cell
    self.inits = []
    self.dropout_mask = []
    self.num_units = num_units
    self.scope = scope
    for layer in range(num_layers):
        input_size_ = input_size if layer == 0 else 2 * num_units
        init_fw = rnn.get_cell(cell, num_units).get_init_state(
            shape=[batch_size], scope="fw_{}".format(layer))
        init_bw = rnn.get_cell(cell, num_units).get_init_state(
            shape=[batch_size], scope="bw_{}".format(layer))
        mask_fw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                          keep_prob=keep_prob, is_train=is_train, mode=None)
        mask_bw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                          keep_prob=keep_prob, is_train=is_train, mode=None)
        self.inits.append((init_fw, init_bw))
        self.dropout_mask.append((mask_fw, mask_bw))
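# Illustrative sketch (an assumption based on the mask shape above): because the
# dropout masks are built with shape [batch_size, 1, input_size], the same mask is
# broadcast over every time step, i.e. the same feature columns are dropped at all
# steps rather than being resampled per step. A NumPy rendering:
def _demo_shared_timestep_dropout(keep_prob=0.7):
    import numpy as np
    rng = np.random.RandomState(0)
    batch, steps, dim = 2, 4, 6
    x = rng.randn(batch, steps, dim)
    # One mask per (example, feature), reused across the `steps` axis.
    mask = (rng.uniform(size=(batch, 1, dim)) < keep_prob) / keep_prob
    return x * mask   # zeros the same feature columns at every time step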
def ready(self):
    config = self.config
    x, senti, neg_senti, negation = self.x, self.senti, self.neg_senti, self.negation
    word_mat, op_word_mat = self.word_mat, self.op_word_mat
    score_scale = config.score_scale

    with tf.variable_scope("encoder"):
        x = tf.nn.embedding_lookup(word_mat, x)
        x = tf.expand_dims(x, -1)
        x = dropout(x, keep_prob=config.keep_prob, is_train=self.is_train)
        pooled_outputs = []
        for f_size in self.filter_sizes:
            conv = tf.layers.conv2d(x, filters=self.num_filters,
                                    kernel_size=[f_size, self.emb_dim],
                                    strides=(1, 1), padding='VALID', activation=tf.nn.relu)
            pool = tf.layers.max_pooling2d(conv, pool_size=[config.max_len - f_size + 1, 1],
                                           strides=(1, 1), padding='VALID')
            pooled_outputs.append(pool)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, self.num_filters * len(self.filter_sizes)])
        h_drop = dropout(h_pool_flat, config.keep_prob, is_train=self.is_train)

    with tf.variable_scope("predict"):
        logit = tf.layers.dense(h_drop, config.score_scale, activation=None)
        self.prob = tf.nn.softmax(logit)
        self.pred = tf.argmax(self.prob, axis=-1)
        self.golden = self.y
        self.loss = tf.reduce_mean(
            tf.reduce_sum(-self.golden * tf.log(self.prob + 1e-6), axis=1))

    with tf.variable_scope("decoder"):
        senti_emb = tf.nn.embedding_lookup(op_word_mat, senti)
        self.senti_emb = senti_emb
        neg_senti_emb = tf.nn.embedding_lookup(op_word_mat, neg_senti)
        self.neg_senti_emb = neg_senti_emb
        self.vae_loss, entropy_term, self.W_decoder, self.u, self.u_neg_sample, \
            self.log_u, self.log_u_neg_sample = selectional_preference(
                senti_emb, neg_senti_emb, negation, self.prob, score_scale)
        self.entropy_term_loss = tf.multiply(self.alpha, entropy_term, name="entropy_term")
        opinion_reg, self.similarity, self.b_x_b_mean, self.b_x_b_min, self.b_x_b_max = \
            get_regularizer_score_pairwise(config, self.prob, senti_emb, negation)
        self.opinion_reg_loss = tf.multiply(self.beta, opinion_reg,
                                            name="opinion_words_regularization")
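# Illustrative shape check for the conv/pool stack above (the concrete numbers
# max_len=50, emb_dim=300, num_filters=100, filter_sizes=(3, 4, 5) are assumptions,
# not values taken from the config). Each 'VALID' conv with kernel [f, emb_dim]
# produces max_len - f + 1 positions; max-pooling over that span leaves one value
# per filter, so h_pool_flat is len(filter_sizes) * num_filters wide.
def _demo_textcnn_shapes(max_len=50, emb_dim=300, num_filters=100, filter_sizes=(3, 4, 5)):
    flat_dim = 0
    for f in filter_sizes:
        conv_positions = max_len - f + 1   # 48, 47, 46 valid windows
        assert conv_positions > 0
        flat_dim += num_filters            # global max-pool keeps one value per filter
    return flat_dim                        # 3 * 100 = 300 == h_pool_flat width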
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])
            ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                 keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)  # [10, ?, 300]

    with tf.variable_scope("pointer"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    with tf.variable_scope("content_modeling"):
        logits4, c_semantics = content_model(init, match, config.hidden)

    with tf.variable_scope("cross_passage_attention"):
        self.query_num = int(config.batch_size / config.passage_num)
        c_semantics = tf.reshape(c_semantics, shape=[self.query_num, config.passage_num, -1])
        attnc_key = tf.tile(tf.expand_dims(c_semantics, axis=2), [1, 1, config.passage_num, 1])
        attnc_mem = tf.tile(tf.expand_dims(c_semantics, axis=1), [1, config.passage_num, 1, 1])
        attnc_w = tf.reduce_sum(attnc_key * attnc_mem, axis=-1)
        attnc_mask = tf.ones([config.passage_num, config.passage_num]) - tf.diag([1.0] * config.passage_num)
        attnc_w = tf.nn.softmax(attnc_w * attnc_mask, axis=-1)
        attncp = tf.reduce_sum(
            tf.tile(tf.expand_dims(attnc_w, axis=-1), [1, 1, 1, 2 * config.hidden]) * attnc_mem, axis=2)

    with tf.variable_scope("pseudo_label"):
        self.is_select = tf.reshape(tf.squeeze(self.is_select),
                                    shape=[self.query_num, config.passage_num])
        self.is_select = self.is_select / tf.tile(
            tf.reduce_sum(self.is_select, axis=-1, keepdims=True), [1, config.passage_num])
        sim_matrix = attnc_w
        lb_matrix = tf.tile(tf.expand_dims(self.is_select, axis=1), [1, config.passage_num, 1])
        self.pse_is_select = tf.reduce_sum(sim_matrix * lb_matrix, axis=-1) + \
            tf.constant([0.00000001] * config.passage_num, dtype=tf.float32)  # avoid all zero
        self.pse_is_select = self.pse_is_select / tf.tile(
            tf.reduce_sum(self.pse_is_select, axis=-1, keepdims=True), [1, config.passage_num])
        alpha = 0.7
        self.fuse_label = alpha * self.is_select + (1 - alpha) * tf.stop_gradient(self.pse_is_select)

    with tf.variable_scope("predict_passage"):
        init = tf.reshape(init, shape=[self.query_num, config.passage_num, -1])
        attn_concat = tf.concat([init, attncp, c_semantics], axis=-1)
        d1 = tf.layers.dense(attn_concat, 2 * config.hidden, activation=tf.nn.leaky_relu,
                             bias_initializer=tf.glorot_uniform_initializer())  # 150
        d2 = tf.layers.dense(d1, config.hidden, activation=tf.nn.leaky_relu,
                             bias_initializer=tf.glorot_uniform_initializer())  # 75
        logits3 = tf.squeeze(tf.layers.dense(d2, 1, activation=None,
                                             bias_initializer=tf.glorot_uniform_initializer()))

    with tf.variable_scope("predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 30)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        # logits3 = tf.reduce_max(tf.reduce_max(outer, axis=2), axis=1)
        self.is_select_p = tf.nn.sigmoid(logits3)
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits1, labels=tf.stop_gradient(self.y1))
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits2, labels=tf.stop_gradient(self.y2))
        weighted_losses = weighted_loss(config, 0.000001, self.y1, losses)  # 0.01
        weighted_losses2 = weighted_loss(config, 0.000001, self.y2, losses2)  # 0.01
        losses3 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits3, labels=tf.stop_gradient(self.fuse_label)))
        in_answer_weight = tf.ones_like(self.in_answer) + 3 * self.in_answer
        losses4 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits4, labels=tf.stop_gradient(self.in_answer)) * in_answer_weight, axis=-1)
        weighted_losses4 = weighted_loss(config, 0.000001, self.in_answer, losses4)
        self.loss_dict = {'pos_s loss': losses, 'pos_e loss': losses2,
                          'select loss': losses3, 'in answer': losses4}
        for key, values in self.loss_dict.items():
            self.loss_dict[key] = tf.reduce_mean(values)
        self.loss = tf.reduce_mean(weighted_losses + weighted_losses2 + losses3 + weighted_losses4)
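# Illustrative sketch of the pseudo-label fusion above (an assumption-based NumPy
# rendering, not the graph code): propagate the normalized gold `is_select`
# distribution through the cross-passage similarity matrix, renormalize, and mix
# it back with weight alpha, as in `fuse_label`.
def _demo_fuse_select_labels(alpha=0.7):
    import numpy as np
    is_select = np.array([[1.0, 0.0, 0.0]])              # gold selection: one query, 3 passages
    is_select = is_select / is_select.sum(-1, keepdims=True)
    sim = np.array([[[0.0, 0.7, 0.3],
                     [0.6, 0.0, 0.4],
                     [0.5, 0.5, 0.0]]])                   # attnc_w with zeroed diagonal
    pse = (sim * is_select[:, None, :]).sum(-1) + 1e-8    # similarity-weighted votes
    pse = pse / pse.sum(-1, keepdims=True)                # renormalize per query
    return alpha * is_select + (1 - alpha) * pse          # fused soft target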
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru
    gi = []
    att_vP = []

    for i in range(config.max_para):
        print(i)
        with tf.variable_scope("emb" + str(i)):
            with tf.variable_scope("char" + str(i)):
                # CL = tf.Print(CL,[CL],message="CL:")
                # PL = tf.Print(PL,[PL],message="PL:")
                # self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr.get_shape()],message="ch_pr:")
                self.ch_pr_ = self.ch_pr[:, i * 400:(i + 1) * 400, :]
                print(self.ch_pr_.get_shape())
                # self.c_pr = tf.reshape(self.c_pr, [N, 12, PL])
                # print(self.ch.get_shape())
                # print(self.ch_pr.get_shape())
                # print(self.c.get_shape())
                # print(self.c_pr.get_shape())
                # self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr[:,2:,:]],message="ch_pr")
                ch_emb = tf.reshape(tf.nn.embedding_lookup(
                    self.char_mat, self.ch_pr_), [N * PL, CL, dc])
                # self.char_mat, self.ch), [N * PL, CL, dc])
                qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                    [N * QL, CL, dc])
                ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                # ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb")
                # qh_emb = tf.Print(qh_emb,[qh_emb],message="qh_emb")
                qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                # state_fw = tf.Print(state_fw,[state_fw],message="state_fw")
                # state_bw = tf.Print(state_bw,[state_bw],message="state_bw")
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
                # ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb")
            with tf.name_scope("word" + str(i)):
                c_emb = tf.nn.embedding_lookup(self.word_mat,
                                               self.c_pr[:, i * 400:(i + 1) * 400])
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

        with tf.variable_scope("encoding" + str(i)):
            rnn = gru(num_layers=3, num_units=d, batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention" + str(i)):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob, is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)
            # att is the v_P
            if i == 0:
                att_vP = att
            else:
                att_vP = tf.concat([att_vP, att], axis=1)
            # att = tf.Print(att,[att],message="att:")
            print("att:", att.get_shape().as_list())
            print("att_vP:", att_vP.get_shape().as_list())
            # att_vP = tf.Print(att_vP,[tf.shape(att_vP)],message="att_vP:")

    """
    with tf.variable_scope("match"):
        self_att = dot_attention(
            att, att, mask=self.c_mask, hidden=d,
            keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)
    """

    with tf.variable_scope("pointer"):
        # r_Q:
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        print("rQ:", init.get_shape().as_list())
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, att, d, self.c_mask)

    with tf.variable_scope("predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        # losses1_2 = tf.reduce_mean(losses1_2, axis=0)
        self.loss = tf.reduce_mean(losses + losses2)
        # print(losses)
        # condition = tf.greater(self.loss, 11)
        # self.yp1 = tf.where(condition, tf.Print(self.yp1,[self.yp1],message="Yp1:"), self.yp1)
        # self.yp2 = tf.where(condition, tf.Print(self.yp2,[self.yp2],message="Yp2:"), self.yp1)

    if config.with_passage_ranking:
        gi = None
        for i in range(config.max_para):
            # Passage ranking
            with tf.variable_scope("passage-ranking-attention" + str(i)):
                # att_vP = tf.Print(att_vP,[att_vP.get_shape()],message="att_vP:")
                vj_P = att_vP[:, i * 400:(i + 1) * 400, :]
                pr_att = pr_attention(batch=N, hidden=init.get_shape().as_list()[-1],
                                      keep_prob=config.keep_prob, is_train=self.is_train)
                r_P = pr_att(init, vj_P, d, self.c_mask)
                # r_P = tf.Print(r_P,[r_P],message="r_p")
                # Wg
                concatenate = tf.concat([init, r_P], axis=1)
                g = tf.nn.tanh(dense(concatenate, hidden=d, use_bias=False, scope="g" + str(i)))
                g_ = dense(g, 1, use_bias=False, scope="g_" + str(i))
                # g = tf.Print(g,[g],message="g")
                if i == 0:
                    gi = tf.reshape(g_, [N, 1])
                else:
                    gi = tf.concat([gi, tf.reshape(g_, [N, 1])], axis=1)
        # gi_ = tf.convert_to_tensor(gi,dtype=tf.float32)
        # self.gi = tf.nn.softmax(gi_)
        # self.losses3 = tf.nn.softmax_cross_entropy_with_logits(
        #     logits=gi_, labels=tf.reshape(self.pr,[-1,1]))
        self.losses3 = tf.nn.softmax_cross_entropy_with_logits(logits=gi, labels=self.pr)
        # self.losses3 = tf.Print(self.losses3,[self.losses3,tf.reduce_max(self.losses3),
        #     tf.reduce_max(self.pr),tf.reduce_max(gi)],message="losses3:")
        self.pr_loss = tf.reduce_mean(self.losses3)
        # self.pr_loss = tf.Print(self.pr_loss,[self.pr_loss])
        self.r = tf.constant(0.8)
        self.e_loss1 = tf.multiply(self.r, self.loss)
        self.e_loss2 = tf.multiply(tf.subtract(tf.constant(1.0), self.r), self.pr_loss)
        self.e_loss = tf.add(self.e_loss1, self.e_loss2)
def ready(self):
    config = self.config
    x, w_mask, w_len, num_sent, senti, weight, neg_senti = \
        self.x, self.w_mask, self.w_len, self.sent_num, self.senti, self.weight, self.neg_senti
    word_mat, asp_word_mat, query_mat = self.word_mat, self.asp_word_mat, self.query_mat
    num_aspect = self.num_aspect
    score_scale = config.score_scale
    batch = tf.floordiv(tf.shape(x)[0], num_sent)
    query_mat = tf.reshape(query_mat, [config.num_aspects, -1, config.emb_dim])

    with tf.variable_scope("word_level"):
        x = dropout(tf.nn.embedding_lookup(word_mat, x),
                    keep_prob=config.keep_prob, is_train=self.is_train)
        x = cudnn_lstm(x, config.hidden // 2, sequence_length=w_len)
        query = tf.tanh(dense(query_mat, config.hidden))
        doc = tf.expand_dims(x, axis=0)
        mask = tf.expand_dims(w_mask, axis=0)
        att = iter_attention(query, doc, mask, hop=config.hop_word)
        att = tf.reshape(att, [num_aspect * batch, num_sent, config.hidden * config.hop_word])

    with tf.variable_scope("sent_level"):
        att = dropout(att, keep_prob=config.keep_prob, is_train=self.is_train)
        att = cudnn_lstm(att, config.hidden // 2)
        query = tf.tanh(dense(query_mat, config.hidden))
        doc = tf.reshape(att, [num_aspect, batch, num_sent, config.hidden])
        att = iter_attention(query, doc, hop=config.hop_sent)

    with tf.variable_scope("predict"):
        probs = []
        att = dropout(att, keep_prob=config.keep_prob, is_train=self.is_train)
        aspects = [config.aspect] if config.unsupervised else list(range(num_aspect))
        for i in aspects:
            with tf.variable_scope("aspect_{}".format(i)):
                probs.append(tf.nn.softmax(dense(att[i], score_scale)))
        self.prob = tf.stack(probs, axis=0)
        self.pred = tf.argmax(self.prob, axis=2)
        self.golden = self.y if config.overall else self.ay
        self.loss = tf.reduce_sum(tf.reduce_mean(
            tf.reduce_sum(-self.golden * tf.log(self.prob + 1e-6), axis=2), axis=1))

    with tf.variable_scope("decoder"):
        sent_emb = tf.nn.embedding_lookup(asp_word_mat, senti)
        neg_sent_emb = tf.nn.embedding_lookup(asp_word_mat, neg_senti)
        self.r_loss, self.u_loss = selectional_preference(
            sent_emb, neg_sent_emb, weight, self.prob[0], score_scale, alpha=config.alpha)
def get_vp(self, i):
    config = self.config
    gru = cudnn_gru if config.use_cudnn else native_gru
    opt = True
    MPL = config.single_para_limit
    zero = tf.constant(0)
    i_ = tf.constant(i)
    start = i * MPL
    end = (i + 1) * MPL
    c_pr = self.c_pr[:, start:end]
    ch_pr = self.ch_pr[:, start:end, :]

    # local masks
    c_mask = tf.cast(c_pr, tf.bool)
    q_mask = tf.cast(self.q, tf.bool)
    c_len = tf.reduce_sum(tf.cast(c_mask, tf.int32), axis=1)
    q_len = tf.reduce_sum(tf.cast(q_mask, tf.int32), axis=1)

    """
    ### this line will replace the c_len with values 8 as it is some
    # unnecessary padding from the examples which does not have
    # passages with the same number as the max number of passage in the batch
    eight_indexes = tf.not_equal(c_len, tf.constant(8, dtype=tf.int32))
    eight_indexes = tf.cast(eight_indexes, tf.int32)
    c_len = c_len * eight_indexes
    """

    if opt:
        N, CL = config.batch_size, config.char_limit
        c_maxlen = tf.reduce_max(c_len)
        q_maxlen = tf.reduce_max(q_len)
        c_pr = tf.slice(c_pr, [0, 0], [N, c_maxlen])
        q = tf.slice(self.q, [0, 0], [N, q_maxlen])
        c_mask = tf.slice(c_mask, [0, 0], [N, c_maxlen])
        q_mask = tf.slice(q_mask, [0, 0], [N, q_maxlen])
        ch_pr = tf.slice(ch_pr, [0, 0, 0], [N, c_maxlen, CL])
        qh = tf.slice(self.qh, [0, 0, 0], [N, q_maxlen, CL])
        y1 = tf.slice(self.y1, [0, 0], [N, c_maxlen])
        y2 = tf.slice(self.y2, [0, 0], [N, c_maxlen])
        seq_mask = tf.sequence_mask(c_len, maxlen=c_maxlen)
    else:
        self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit

    ch_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(ch_pr, tf.bool), tf.int32), axis=2), [-1])
    qh_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(qh, tf.bool), tf.int32), axis=2), [-1])

    N, PL, QL, CL, d, dc, dg = config.batch_size, c_maxlen, q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, ch_pr), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, qh), [N * QL, CL, dc])
            ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            # self.cell_fw = tf.contrib.rnn.GRUCell(dg)
            # self.cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                self.cell_fw, self.cell_bw, ch_emb, ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                self.cell_fw, self.cell_bw, qh_emb, qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, c_pr)
            q_emb = tf.nn.embedding_lookup(self.word_mat, q)
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

    with tf.variable_scope("encoding"):
        # gru1 = lambda: gru(num_layers=3, num_units=d, batch_size=N,
        #     input_size=c_emb.get_shape().as_list()[-1],
        #     keep_prob=config.keep_prob, is_train=self.is_train)
        # self.rnn1 = tf.cond(tf.equal(i_, zero), gru1, lambda: self.rnn1)
        # c = self.rnn1(c_emb, seq_len=c_len)
        # q = self.rnn1(q_emb, seq_len=q_len)
        if i == 0:
            self.rnn1 = gru(num_layers=3, num_units=d, batch_size=N,
                            input_size=c_emb.get_shape().as_list()[-1],
                            keep_prob=config.keep_prob, is_train=self.is_train)
            self.q_enc = self.rnn1(q_emb, seq_len=q_len)
        c = self.rnn1(c_emb, seq_len=c_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, self.q_enc, mask=q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train,
                               name_scope="attention_layer")
        # gru2 = lambda: gru(num_layers=1, num_units=d, batch_size=N,
        #     input_size=qc_att.get_shape().as_list()[-1],
        #     keep_prob=config.keep_prob, is_train=self.is_train)
        # self.rnn2 = tf.cond(tf.equal(i_, zero), gru2, lambda: self.rnn2)
        # att = self.rnn2(qc_att, seq_len=c_len)
        if i == 0:
            self.rnn2 = gru(num_layers=1, num_units=d, batch_size=N,
                            input_size=qc_att.get_shape().as_list()[-1],
                            keep_prob=config.keep_prob, is_train=self.is_train)
        att = self.rnn2(qc_att, seq_len=c_len)

    return att, c_len, c_mask, y1, y2, seq_mask
def ready(self):
    config = self.config
    d = config.hidden
    batch_size = tf.shape(self.sent)[0]

    sent_mask = tf.cast(self.sent, tf.bool)
    sent_len = tf.reduce_sum(tf.cast(sent_mask, tf.int32), axis=1)
    sent_maxlen = tf.reduce_max(sent_len)
    sent_mask = tf.slice(sent_mask, [0, 0], [batch_size, sent_maxlen])
    sent = tf.slice(self.sent, [0, 0], [batch_size, sent_maxlen])

    mid_mask = tf.cast(self.mid, tf.bool)
    mid_len = tf.reduce_sum(tf.cast(mid_mask, tf.int32), axis=1)
    mid_maxlen = tf.reduce_max(mid_len)
    mid_mask = tf.slice(mid_mask, [0, 0], [batch_size, mid_maxlen])
    mid = tf.slice(self.mid, [0, 0], [batch_size, mid_maxlen])

    pat_mask = tf.cast(self.pats, tf.bool)
    pat_len = tf.reduce_sum(tf.cast(pat_mask, tf.int32), axis=1)

    with tf.variable_scope("embedding"):
        sent_emb = tf.nn.embedding_lookup(self.word_mat, sent)
        mid_emb = tf.nn.embedding_lookup(self.word_mat, mid)
        sent_emb = dropout(sent_emb, keep_prob=config.word_keep_prob,
                           is_train=self.is_train, mode="embedding")
        pat_emb = tf.nn.embedding_lookup(self.word_mat, self.pats)

    with tf.variable_scope("encoder"):
        rnn = Cudnn_RNN(num_layers=2, num_units=d // 2)
        cont, _ = rnn(sent_emb, seq_len=sent_len, concat_layers=False)
        pat, _ = rnn(pat_emb, seq_len=pat_len, concat_layers=False)
        cont_d = dropout(cont, keep_prob=config.keep_prob, is_train=self.is_train)
        pat_d = dropout(pat, keep_prob=config.keep_prob, is_train=self.is_train)

    with tf.variable_scope("attention"):
        att_a = attention(cont_d, config.att_hidden, mask=sent_mask)
        pat_a = self.pat_a = attention(pat_d, config.att_hidden, mask=pat_mask)

    with tf.variable_scope("sim"):
        sim, pat_sim = att_match(mid_emb, pat_emb, mid_mask, pat_mask, d,
                                 keep_prob=config.keep_prob, is_train=self.is_train)
        self.sim = sim  # exposed for the sim_pred / sim_max_val ops below
        neg_idxs = tf.matmul(self.rels, tf.transpose(self.rels, [1, 0]))
        pat_pos = tf.square(tf.maximum(config.tau - pat_sim, 0.))
        pat_pos = tf.reduce_max(pat_pos - (1 - neg_idxs) * 1e30, axis=1)
        pat_neg = tf.square(tf.maximum(pat_sim, 0.))
        pat_neg = tf.reduce_max(pat_neg - 1e30 * neg_idxs, axis=1)
        l_sim = tf.reduce_sum(self.weight * (pat_pos + pat_neg), axis=0)

    with tf.variable_scope("pred"):
        att2_d = tf.reduce_sum(tf.expand_dims(att_a, axis=2) * cont_d, axis=1)
        pat2_d = tf.reduce_sum(tf.expand_dims(pat_a, axis=2) * pat_d, axis=1)

        logit = self.logit = dense(att2_d, config.num_class, use_bias=False)
        pred = tf.nn.softmax(logit)
        l_a = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logit[:config.batch_size], labels=self.rel[:config.batch_size]), axis=0)

        xsim = tf.stop_gradient(sim[config.batch_size:])
        pseudo_rel = tf.gather(self.rels, tf.argmax(xsim, axis=1))
        bound = tf.reduce_max(xsim, axis=1)
        weight = tf.nn.softmax(10 * bound)
        l_u = tf.reduce_sum(weight * tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logit[config.batch_size:], labels=pseudo_rel), axis=0)

        logit = dense(pat2_d, config.num_class, use_bias=False)
        l_pat = self.pat_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logit, labels=self.rels), axis=0)

    self.max_val = tf.reduce_sum(pred * -log(pred), axis=1)
    self.pred = tf.argmax(pred, axis=1)
    self.loss = l_a + config.alpha * l_pat + config.beta * l_sim + config.gamma * l_u
    self.sim_pred = tf.argmax(tf.gather(self.rels, tf.argmax(self.sim, axis=1)), axis=1)
    self.sim_max_val = tf.reduce_max(self.sim, axis=1)
    self.gold = tf.argmax(self.rel, axis=1)
    self.max_logit = tf.reduce_max(self.logit, axis=1)
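# Illustrative sketch (assumption-based NumPy rendering) of the pattern-similarity
# loss above: for each pattern, the hardest same-relation pair is pushed above the
# margin tau and the hardest cross-relation pair is pushed toward zero; the 1e30
# terms mask out the entries that do not belong to each case. Here tau=0.8 and an
# unweighted mean replace config.tau and the weighted sum used in the model.
def _demo_pattern_contrastive_loss(tau=0.8):
    import numpy as np
    pat_sim = np.array([[1.0, 0.9, 0.2],
                        [0.9, 1.0, 0.1],
                        [0.2, 0.1, 1.0]])                    # pairwise pattern similarities
    rels = np.array([[1, 0], [1, 0], [0, 1]], dtype=float)   # one-hot relation per pattern
    same_rel = rels @ rels.T                                 # 1 where two patterns share a relation
    pat_pos = np.square(np.maximum(tau - pat_sim, 0.0))
    pat_pos = np.max(pat_pos - (1 - same_rel) * 1e30, axis=1)   # hardest same-relation pair
    pat_neg = np.square(np.maximum(pat_sim, 0.0))
    pat_neg = np.max(pat_neg - same_rel * 1e30, axis=1)         # hardest cross-relation pair
    return (pat_pos + pat_neg).mean()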
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])
            ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        self.c_emb = tf.stop_gradient(c_emb)
        self.q_emb = tf.stop_gradient(q_emb)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        self.c_rnn = c = rnn(c_emb, seq_len=self.c_len)
        self.q_rnn = q = rnn(q_emb, seq_len=self.q_len)
        c = tf.stop_gradient(c)
        q = tf.stop_gradient(q)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        self.att = [rnn(qc_att, seq_len=self.c_len)]
        self.att += [self.att[-1][:, -1, :]]

    with tf.variable_scope("binary"):
        for _ in range(3):
            self.att += [tf.nn.dropout(tf.keras.layers.Dense(300)(self.att[-1]),
                                       keep_prob=config.keep_prob)]

    with tf.variable_scope("badptr"):
        init = self.att[-1]
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, self.att[0], d, self.c_mask)

    with tf.variable_scope("badptr_predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1_distrib = tf.reduce_max(outer, axis=2)
        self.yp2_distrib = tf.reduce_max(outer, axis=1)
        self.yp1 = tf.argmax(self.yp1_distrib, axis=1)
        self.yp2 = tf.argmax(self.yp2_distrib, axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits1, labels=tf.stop_gradient(self.y1))
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits2, labels=tf.stop_gradient(self.y2))
        self.loss = tf.reduce_mean(losses + losses2)
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])
            ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        c_emb = tf.stop_gradient(c_emb)
        q_emb = tf.stop_gradient(q_emb)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        self.c_rnn = c = rnn(c_emb, seq_len=self.c_len)
        self.q_rnn = q = rnn(q_emb, seq_len=self.q_len)
        c = tf.stop_gradient(c)
        q = tf.stop_gradient(q)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        self.att = [rnn(qc_att, seq_len=self.c_len)[:, -1, :]]
        # self.att = [tf.concat([self.c_rnn[:,-1,:], self.q_rnn[:,-1,:]], 1)]
        # self.att += [tf.stop_gradient(self.att[-1])]

    with tf.variable_scope("binary"):
        for _ in range(3):
            self.att += [tf.nn.dropout(
                tf.keras.layers.Dense(300, activation='relu')(self.att[-1]),
                keep_prob=config.keep_prob)]
        self.prediction = tf.keras.layers.Dense(2)(self.att[-1])
        # self.loss = tf.reduce_mean(tf.squared_difference(self.prediction, tf.cast(self.y_target, tf.float32)))
        self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.prediction, labels=tf.stop_gradient(self.y_target))
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])
            ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

    with tf.variable_scope("encoding"):
        # encode the inputs with a 3-layer RNN
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        # with the size (batch_size, max_len, hidden_dim)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("relation_analysis"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        g_theta_layers = [256, 128, 1]  # attention component
        md = Relation_Module(config, self.c_maxlen, self.q_maxlen, g_theta_layers)
        # r: add attention weight with q_summary
        r, alpha = md.hop_2(c, init, phase=self.is_train, activation=tf.nn.relu)
        c = r[-1]

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                 keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)

    # obtain r_Q by summarizing the embedded q
    with tf.variable_scope("pointer"):
        # init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
        #             keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    with tf.variable_scope("predict"):
        self.start_logits = tf.nn.softmax(logits1)
        self.stop_logits = tf.nn.softmax(logits2)
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)
def ready(self):
    config = self.config
    N, PL, QL, CL, BL, d, dc, dg, dbpe, dbpeh = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.bpe_limit, config.hidden, \
        config.glove_dim if config.pretrained_char else config.char_dim, config.char_hidden, \
        config.bpe_glove_dim if config.pretrained_bpe_emb else config.bpe_dim, config.bpe_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        if config.use_char:
            with tf.variable_scope("char"):
                ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])
                qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])
                ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        if config.use_bpe:
            with tf.variable_scope("bpe"):
                cb_emb = tf.reshape(tf.nn.embedding_lookup(self.bpe_mat, self.cb), [N * PL, BL, dbpe])
                qb_emb = tf.reshape(tf.nn.embedding_lookup(self.bpe_mat, self.qb), [N * QL, BL, dbpe])
                cb_emb = dropout(cb_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                qb_emb = dropout(qb_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                cell_fw = tf.contrib.rnn.GRUCell(dbpeh)
                cell_bw = tf.contrib.rnn.GRUCell(dbpeh)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, cb_emb, self.cb_len, dtype=tf.float32)
                cb_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qb_emb, self.qb_len, dtype=tf.float32)
                qb_emb = tf.concat([state_fw, state_bw], axis=1)
                qb_emb = tf.reshape(qb_emb, [N, QL, 2 * dbpeh])
                cb_emb = tf.reshape(cb_emb, [N, PL, 2 * dbpeh])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

        if config.use_char:
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)
        if config.use_bpe:
            c_emb = tf.concat([c_emb, cb_emb], axis=2)
            q_emb = tf.concat([q_emb, qb_emb], axis=2)
        if config.use_pos:
            cp_emb = tf.nn.embedding_lookup(self.pos_mat, self.cp)
            qp_emb = tf.nn.embedding_lookup(self.pos_mat, self.qp)
            c_emb = tf.concat([c_emb, cp_emb], axis=2)
            q_emb = tf.concat([q_emb, qp_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                 keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)

    with tf.variable_scope("pointer"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    with tf.variable_scope("predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)
def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True, opt=True):
    self.config = config
    self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32,
                                       initializer=tf.constant_initializer(0), trainable=False)
    self.c, self.q, self.ch, self.qh, self.y1, self.y2, self.qa_id = batch.get_next()
    self.emb_keep_prob = tf.get_variable("emb_keep_prob", shape=[], dtype=tf.float32,
                                         trainable=False,
                                         initializer=tf.constant_initializer(config.emb_keep_prob))
    self.keep_prob = tf.get_variable("keep_prob", shape=[], dtype=tf.float32, trainable=False,
                                     initializer=tf.constant_initializer(config.keep_prob))
    self.ptr_keep_prob = tf.get_variable("ptr_keep_prob", shape=[], dtype=tf.float32,
                                         trainable=False,
                                         initializer=tf.constant_initializer(config.ptr_keep_prob))
    self.is_train = tf.get_variable("is_train", shape=[], dtype=tf.bool, trainable=False)
    self.word_mat = dropout(
        tf.get_variable("word_mat", initializer=tf.constant(word_mat, dtype=tf.float32),
                        trainable=False),
        keep_prob=self.emb_keep_prob, is_train=self.is_train, mode="embedding")
    self.char_mat = dropout(
        tf.get_variable("char_mat", initializer=tf.constant(char_mat, dtype=tf.float32)),
        keep_prob=self.emb_keep_prob, is_train=self.is_train, mode="embedding")

    self.c_mask = tf.cast(self.c, tf.bool)
    self.q_mask = tf.cast(self.q, tf.bool)
    self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
    self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

    if opt:
        N, CL = config.batch_size, config.char_limit
        self.c_maxlen = tf.reduce_max(self.c_len)
        self.q_maxlen = tf.reduce_max(self.q_len)
        self.c = tf.slice(self.c, [0, 0], [N, self.c_maxlen])
        self.q = tf.slice(self.q, [0, 0], [N, self.q_maxlen])
        self.c_mask = tf.slice(self.c_mask, [0, 0], [N, self.c_maxlen])
        self.q_mask = tf.slice(self.q_mask, [0, 0], [N, self.q_maxlen])
        self.ch = tf.slice(self.ch, [0, 0, 0], [N, self.c_maxlen, CL])
        self.qh = tf.slice(self.qh, [0, 0, 0], [N, self.q_maxlen, CL])
        self.y1 = tf.slice(self.y1, [0, 0], [N, self.c_maxlen])
        self.y2 = tf.slice(self.y2, [0, 0], [N, self.c_maxlen])
    else:
        self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit

    self.ch_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2), [-1])
    self.qh_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2), [-1])

    self.ready()

    if trainable:
        self.lr = tf.get_variable("lr", shape=[], dtype=tf.float32, trainable=False)
        self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.lr, epsilon=1e-6)
        grads = self.opt.compute_gradients(self.loss)
        gradients, variables = zip(*grads)
        capped_grads, _ = tf.clip_by_global_norm(gradients, config.grad_clip)
        self.train_op = self.opt.apply_gradients(
            zip(capped_grads, variables), global_step=self.global_step)
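# Illustrative sketch (NumPy, assumption-based) of what the `opt` branch above does:
# treat id 0 as padding, derive per-example lengths from the mask, and trim the
# padded batch to its longest sequence so the encoders never run over all-pad columns.
def _demo_trim_to_maxlen():
    import numpy as np
    c = np.array([[4, 7, 9, 0, 0, 0],
                  [3, 5, 0, 0, 0, 0]])          # padded word ids
    c_mask = c.astype(bool)                     # True on real tokens
    c_len = c_mask.sum(axis=1)                  # [3, 2]
    c_maxlen = c_len.max()                      # 3
    return c[:, :c_maxlen], c_mask[:, :c_maxlen], c_len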
def ready(self):
    config = self.config
    N, QL, CL, d, dc, dg = config.batch_size, self.q_maxlen, config.char_limit, \
        config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru
    SN, SL = self.c_s_maxnum, self.c_s_maxlen
    W = config.glove_dim

    print('embedding part')
    with tf.variable_scope("emb"):
        # with tf.variable_scope("char"):
        #     ch_emb = tf.reshape(tf.nn.embedding_lookup(
        #         self.char_mat, self.csh_slice), [N, SN * SL, CL, dc], name='char_reshape')
        #     qh_emb = tf.reshape(tf.nn.embedding_lookup(
        #         self.char_mat, self.qh_slice), [N, QL, CL, dc])
        #     ch_emb = dropout(
        #         ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
        #     qh_emb = dropout(
        #         qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
        #     ch_emb_char = tf.unstack(ch_emb, axis=0)
        #     qh_emb_char = tf.unstack(qh_emb, axis=0)
        '''
        filter_size = [3, 4, 5]
        att_char = []
        merge_char = []
        q_merge_char = []
        for filter in filter_size:
            with tf.variable_scope("char-cnnencoder-%s" % filter):
                step_merge_char = []
                step_att_char = []
                q_step_merge_char = []
                q_step_att_char = []
                for i in range(2):
                    if i == 0:
                        input_char = ch_emb
                    else:
                        input_char = qh_emb
                    conv_branch_char = tf.layers.conv2d(
                        inputs=input_char,
                        # use as many filters as the hidden size
                        filters=50,
                        kernel_size=filter,
                        use_bias=True,
                        activation=tf.nn.relu,
                        trainable=True,
                        padding='SAME',
                        name='conv_char_' + str(filter),
                        reuse=tf.AUTO_REUSE,
                        data_format='channels_last')
                    if i == 0:
                        step_att_char.append(conv_branch_char)
                        # pool over the words to obtain: [first_dim x 1 * hidden_size]
                        pool_branch_char = tf.reduce_max(conv_branch_char, axis=2)
                        merge_char.append(pool_branch_char)
                    else:
                        q_step_att_char.append(conv_branch_char)
                        # pool over the words to obtain: [first_dim x 1 * hidden_size]
                        q_pool_branch_char = tf.reduce_max(conv_branch_char, axis=2)
                        q_merge_char.append(q_pool_branch_char)
                # batch_merge = tf.stack(step_merge_char, axis=0)
                # merge_char.append(batch_merge)
                # batch_merge_q = tf.stack(q_step_merge_char, axis=0)
                # q_merge_char.append(batch_merge_q)
        ch_con = tf.concat(merge_char, axis=-1)
        ch_con = tf.reshape(ch_con, [N, SN, SL, 150])
        qh_con = tf.concat(q_merge_char, axis=-1)
        '''
        # if(use_char):
        #     with tf.variable_scope("char"):
        #         ch_emb = tf.reshape(tf.nn.embedding_lookup(
        #             self.char_mat, self.csh), [N * SN * SL, CL, dc], name='char_reshape')
        #         qh_emb = tf.reshape(tf.nn.embedding_lookup(
        #             self.char_mat, self.qh), [N * QL, CL, dc])
        #         ch_emb = dropout(
        #             ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
        #         qh_emb = dropout(
        #             qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
        #         cell_fw = tf.contrib.rnn.GRUCell(dg)
        #         cell_bw = tf.contrib.rnn.GRUCell(dg)
        #         _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
        #             cell_fw, cell_bw, ch_emb, self.csh_len, dtype=tf.float32)
        #         ch_emb = tf.concat([state_fw, state_bw], axis=1)
        #         _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
        #             cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
        #         qh_emb = tf.concat([state_fw, state_bw], axis=1)
        #         qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
        #         ch_emb = tf.reshape(ch_emb, [N, SN, SL, 2 * dg])
        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.cs_slice)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q_slice)
        with tf.name_scope("softemb"):
            c_emb_linear = tf.nn.relu(dense(c_emb, d, use_bias=True, scope="c_emb_linear"))
            q_emb_linear = tf.nn.relu(dense(q_emb, d, use_bias=True, scope="q_emb_linear"))
            c_emb_linear = tf.reshape(c_emb_linear, [N, self.c_s_maxnum * self.c_s_maxlen, d])
            align_cq = tf.matmul(c_emb_linear, tf.transpose(q_emb_linear, [0, 2, 1]))
            cq_mask = tf.tile(tf.expand_dims(self.q_mask, axis=1),
                              [1, self.c_s_maxnum * self.c_s_maxlen, 1])
            self.align_cq = tf.nn.softmax(softmax_mask(align_cq, cq_mask))
            align_c_emb = tf.matmul(self.align_cq, q_emb_linear)
            align_c_emb = tf.reshape(align_c_emb, [N, self.c_s_maxnum, self.c_s_maxlen, d])
        c_emb = tf.concat([c_emb, align_c_emb, self.ce_slice, self.ct_slice], axis=3)
        c_emb = tf.reshape(c_emb, [N, self.c_s_maxnum, self.c_s_maxlen, W + d + 3 + 19],
                           name='c_emb_reshape')
        q_emb = tf.concat([q_emb, self.qt_slice], axis=2)
        self.c_emb = c_emb
        self.q_emb = q_emb
        # c_emb = tf.reshape(c_emb, [N, self.c_s_maxnum, self.c_s_maxlen, W + self.q_maxlen])

    print('encode-part')
    # c_s_len = tf.unstack(self.c_s_len, axis=1)
    cnn_out = []
    c_s_emb = tf.unstack(c_emb, axis=0)
    # q_s_emb = tf.expand_dims(q_emb, axis=1)
    # q_sample_emb = tf.unstack(q_s_emb, axis=0)
    filter_size = [3, 4, 5]
    att = []
    merge = []
    q_merge = []
    with tf.variable_scope("cnnencoder"):
        for filter in filter_size:
            step_merge = []
            step_att = []
            q_step_merge = []
            q_step_att = []
            with tf.variable_scope("cnnencoder-%s" % filter):
                for i in range(N):
                    conv_branch = tf.layers.conv1d(
                        inputs=c_s_emb[i],
                        # use as many filters as the hidden size
                        filters=100,
                        kernel_size=[filter],
                        use_bias=True,
                        activation=tf.nn.relu,
                        trainable=True,
                        padding='SAME',
                        name='conv_' + str(filter),
                        reuse=tf.AUTO_REUSE)
                    # tf.get_variable_scope().reuse_variables()
                    step_att.append(conv_branch)
                    # pool over the words to obtain: [first_dim x 1 * hidden_size]
                    pool_branch = tf.reduce_max(conv_branch, axis=1)
                    pool_branch = dropout(pool_branch, keep_prob=config.keep_prob,
                                          is_train=self.is_train)
                    step_merge.append(pool_branch)
            batch_merge = tf.stack(step_merge, axis=0)
            merge.append(batch_merge)
            # batch_merge_q = tf.stack(q_step_merge, axis=0)
            # q_merge.append(batch_merge_q)
        con = tf.concat(merge, axis=-1)
        # q_con = tf.concat(q_merge, axis=-1)
    # attention_vis = tf.stack(att, axis=0)
    # attention_vis = tf.reduce_mean(attention_vis, axis=0)
    # cnn_out.append(con)
    # c_sen_emb = tf.concat(con, axis=0)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=con.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        print('passage-encoder')
        c_s = rnn(con, seq_len=self.c_p_len)
        # q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("qencode"):
        with tf.variable_scope("encoding"):
            rnn = gru(num_layers=3, num_units=d, batch_size=N,
                      input_size=q_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            q = rnn(q_emb, seq_len=self.q_len)
            self.q_enc = q

    print('qc_att')
    with tf.variable_scope("attention"):
        qc_att = dot_attention(c_s, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        self.att_s = rnn(qc_att, seq_len=self.c_p_len)

    # print('pointer')
    with tf.variable_scope("pointer"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train,
                          is_sentence=True)
        logits1 = pointer(init, self.att_s, d, self.c_p_mask)
        self.lo = logits1

    with tf.variable_scope("predict"):
        self.outer = tf.nn.softmax(logits1)
        self.yp = tf.argmax(self.outer, axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits1, labels=tf.stop_gradient(self.y_slice))
        self.out1 = tf.nn.top_k(self.outer, config.k).values
        self.policy = tf.nn.top_k(self.outer, 1).values
        self.policy = tf.reduce_sum(tf.nn.top_k(self.outer, config.k).values,
                                    axis=-1, keepdims=True)
        self.policy_log_part = tf.log(self.policy)
        # self.loss = tf.reduce_mean(-1 * self.policy_log_part * self.reward)
        reward = self.advantage
        reward_mean, reward_var = tf.nn.moments(reward, axes=[0])
        reward_std = tf.sqrt(reward_var) + 1e-6
        self.reward_mean = reward_mean
        self.reward_var = reward_std
        reward = tf.div(reward - reward_mean, reward_std)
        self.final_reward = reward - self.baseline
        self.loss = tf.reduce_mean(-1 * self.policy_log_part * self.advantage)
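# Illustrative sketch (NumPy, assumption-based) of the reward shaping computed in
# the "predict" scope above: the advantage is standardised across the batch and a
# baseline is subtracted to form final_reward. Note that the loss as written above
# still scales the log-policy by the raw advantage, not by this normalised reward.
def _demo_standardise_reward(baseline=0.0):
    import numpy as np
    advantage = np.array([0.2, -0.1, 0.4, 0.0])
    mean, var = advantage.mean(), advantage.var()
    std = np.sqrt(var) + 1e-6
    reward = (advantage - mean) / std            # zero-mean, unit-variance rewards
    return reward - baseline                     # final_reward for a REINFORCE-style update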
def ready(self):
    config = self.config
    d = config.hidden
    batch_size = tf.shape(self.sent_word)[0]
    sent_mask = tf.cast(self.sent_word, tf.bool)
    sent_len = tf.reduce_sum(tf.cast(sent_mask, tf.int32), axis=1)
    sent_maxlen = config.length
    sent = self.sent_word
    pretrain_sent_mask = tf.cast(self.pretrain_sents, tf.bool)

    rnn = Cudnn_RNN(num_layers=2, num_units=d // 2,
                    keep_prob=config.keep_prob, is_train=self.is_train)

    label_mat, _ = FIND_module(sent, self.raw_pats, self.word_mat, config,
                               tf.constant(False, tf.bool), rnn)
    label_mat = tf.sigmoid(label_mat) * tf.tile(
        tf.reshape(tf.cast(sent_mask, tf.float32), [batch_size, sent_maxlen, 1]),
        [1, 1, self.raw_pats.get_shape()[0]])
    # label_mat = tf.cast(tf.greater(label_mat, 0.7), tf.float32)
    _, keywords_sim = FIND_module(sent, self.pats, self.word_mat, config, self.is_train, rnn)
    # keywords_sim = tf.sigmoid(keywords_sim)
    pretrain_pred_labels, _ = FIND_module(self.pretrain_sents, self.pretrain_pats,
                                          self.word_mat, config, self.is_train, rnn)
    pretrain_pred_labels = tf.transpose(pretrain_pred_labels, [0, 2, 1])
    gather_order = tf.tile(
        tf.reshape(tf.range(max(config.pretrain_size, config.pretrain_size_together)), [-1, 1]),
        [1, 2])
    pretrain_pred_labels = tf.gather_nd(pretrain_pred_labels, gather_order)
    self.pretrain_loss = tf.reduce_mean(
        tf.reduce_sum(tf.nn.weighted_cross_entropy_with_logits(
            targets=self.pretrain_labels, logits=pretrain_pred_labels,
            pos_weight=config.pos_weight) * tf.cast(pretrain_sent_mask, tf.float32), axis=1)
        / tf.reduce_sum(tf.cast(pretrain_sent_mask, tf.float32), axis=1))
    # tf.losses.mean_squared_error(labels=self.pretrain_labels, predictions=pretrain_pred_labels)
    self.prt_loss = tf.nn.weighted_cross_entropy_with_logits(
        targets=self.pretrain_labels, logits=pretrain_pred_labels,
        pos_weight=config.pos_weight) * tf.cast(pretrain_sent_mask, tf.float32)
    self.prt_pred = tf.sigmoid(pretrain_pred_labels) * tf.cast(pretrain_sent_mask, tf.float32)
    self.pretrain_pred_labels = tf.reshape(tf.cast(tf.greater(
        tf.sigmoid(pretrain_pred_labels) * tf.cast(pretrain_sent_mask, tf.float32),
        config.pretrain_threshold), tf.int32), [-1])

    neg_idxs = tf.matmul(self.keywords_rels, tf.transpose(self.keywords_rels, [1, 0]))
    pat_pos = tf.square(tf.maximum(0.9 - keywords_sim, 0.))
    pat_pos = tf.reduce_max(
        pat_pos - tf.cast(1 - neg_idxs, tf.float32) * tf.constant(1e30, tf.float32), axis=1)
    pat_neg = tf.square(tf.maximum(keywords_sim, 0.))
    pat_neg = tf.reduce_max(
        pat_neg - tf.constant(1e30, tf.float32) * tf.cast(neg_idxs, tf.float32), axis=1)
    pat_simloss = tf.reduce_mean(pat_pos + pat_neg, axis=0)

    # clustering loss
    self.sim_loss = sim_loss = pat_simloss
    self.pretrain_loss_v2 = self.pretrain_loss + self.pretrain_alpha * self.sim_loss

    sim_raw = []
    for i, soft_labeling_function in enumerate(self.labeling_functions_soft):
        try:
            sim_raw.append(soft_labeling_function(label_mat, self.raw_keyword_dict, self.mask_mat)(
                self.phrases_input) * self.type_restrict(i))
        except:
            print(i)
            sim_raw.append(tf.cast(tf.reshape(0 * self.phrases_input[:, 0], [1, -1]), tf.float32))
    # [tf.shape == (batch_size, 1)] * num_functions -> [batch_size,]
    self.sim = sim = tf.transpose(tf.concat(sim_raw, axis=0), [1, 0])

    with tf.variable_scope("classifier"):
        sent_emb = tf.nn.embedding_lookup(self.word_mat, sent)
        sent_emb = dropout(sent_emb, keep_prob=config.word_keep_prob,
                           is_train=self.is_train, mode="embedding")
        rnn = Cudnn_RNN(num_layers=2, num_units=d // 2,
                        keep_prob=config.keep_prob, is_train=self.is_train)
        cont, _ = rnn(sent_emb, seq_len=sent_len, concat_layers=False)
        cont_d = dropout(cont, keep_prob=config.keep_prob, is_train=self.is_train)
        att_a = attention(cont_d, config.att_hidden, mask=sent_mask)
        att2_d = tf.reduce_sum(tf.expand_dims(att_a, axis=2) * cont_d, axis=1)
        logit = dense(att2_d, config.num_class, use_bias=False)
        pred = tf.nn.softmax(logit)

    with tf.variable_scope("pred"):
        if not self.pseudo:
            sent_loss = self.sent_loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=logit, labels=self.rel), axis=0)
        else:
            self.hard_train_loss = sent_loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=logit[:config.batch_size], labels=self.rel[:config.batch_size]), axis=0)
            lsim = sim[:config.batch_size]
            index_tensor = tf.reshape(tf.constant(np.arange(config.batch_size), tf.int32),
                                      [config.batch_size, 1])
            select_tensor = tf.reshape(self.hard_match_func_idx, [config.batch_size, 1])
            probs = tf.reshape(
                tf.gather_nd(lsim, tf.concat([index_tensor, select_tensor], axis=1)),
                [config.batch_size, 1])
            self.labeled_loss = labeled_loss = tf.reduce_mean(tf.square((1 - probs)))
            xsim = tf.stop_gradient(sim[config.batch_size:])
            pseudo_rel = tf.gather(self.rels, tf.argmax(xsim, axis=1))
            bound = tf.reduce_max(xsim, axis=1)
            weight = tf.nn.softmax(10.0 * bound)
            self.unlabeled_loss = unlabeled_loss = tf.reduce_sum(
                weight * tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=logit[config.batch_size:], labels=pseudo_rel), axis=0)
            sent_loss = self.sent_loss = sent_loss + self.gamma * unlabeled_loss + \
                self.alpha * self.pretrain_loss  # + self.alpha * labeled_loss

        # compute the entropy, used to infer no_relation
        self.max_val = entropy = tf.reduce_sum(pred * -log(pred), axis=1)
        # pred is used at test time
        self.pred = tf.argmax(pred, axis=1)
        self.loss = sent_loss + self.beta * sim_loss
        # predictions from the similarity model
        self.sim_pred = tf.argmax(tf.gather(self.rels, tf.argmax(self.sim, axis=1)), axis=1)
        self.sim_max_val = tf.reduce_max(self.sim, axis=1)
        # true label
        self.gold = tf.argmax(self.rel, axis=1)
        self.entropy = tf.reduce_mean(entropy, axis=0)
def ready(self): config = self.config N, PL, QL, d_gl, dc = config.batch_size, self.c_maxlen, self.q_maxlen, config.glove_dim, config.char_dim gru = cudnn_gru if config.use_cudnn else native_gru logging.info("feature embedding") # glove c_emb = tf.reshape(tf.nn.embedding_lookup( self.word_mat, self.c), [N, PL, d_gl]) q_emb = tf.reshape(tf.nn.embedding_lookup( self.word_mat, self.q), [N, QL, d_gl]) c_emb = tf.reshape(dropout( c_emb, keep_prob=config.keep_prob, is_train=self.is_train),[N,PL,d_gl]) q_emb = tf.reshape(dropout( q_emb, keep_prob=config.keep_prob, is_train=self.is_train),[N,QL,d_gl]) # cove ch_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.c), [N, PL, dc]) qh_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.q), [N, QL, dc]) ch_emb = tf.reshape(dropout( ch_emb, keep_prob=config.keep_prob, is_train=self.is_train),[N,PL,dc]) qh_emb = tf.reshape(dropout( qh_emb, keep_prob=config.keep_prob, is_train=self.is_train),[N,QL,dc]) # new_feature c_f = tf.reshape(tf.cast(self.context_feature, tf.float32), [N, PL, config.feature_dim]) logging.info("Word level infusion") # fused_a = fuse(para_glove, ques_glove, attention_dim, 'test') # high_level para_q_fused_glove = word_fusion(c_emb, q_emb, self.c_mask, self.q_mask) # low_level para_w_rep = tf.concat([c_emb, ch_emb, c_f], axis=2) # low_level ques_w_rep = tf.concat([q_emb, qh_emb],axis=2) # enhanced input vector for context para_enhanced_rep = tf.concat([para_w_rep, para_q_fused_glove], axis=2) # ---------------------reading logging.info("Building Reading section") # change LSTM to GRU with tf.variable_scope("Reading"): # hQh f_read_q_low = tf.contrib.rnn.LSTMCell(config.reading_rep_dim // 2) # in paper 125 b_read_q_low = tf.contrib.rnn.LSTMCell(config.reading_rep_dim // 2) inp = dropout(ques_w_rep, keep_prob=config.keep_prob, is_train=self.is_train) ques_low_h, _ = birnn(cell_fw=f_read_q_low, cell_bw=b_read_q_low, inputs=inp, dtype=tf.float32, scope='ques_low_under', sequence_length=self.q_len) ques_low_h = tf.concat(ques_low_h, axis=2) # Hqh f_read_q_high = tf.contrib.rnn.LSTMCell(config.reading_rep_dim // 2) b_read_q_high = tf.contrib.rnn.LSTMCell(config.reading_rep_dim // 2) inp = dropout(ques_low_h, keep_prob=config.keep_prob, is_train=self.is_train) ques_high_h, _ = birnn(cell_fw=f_read_q_high, cell_bw=b_read_q_high, inputs=inp, dtype=tf.float32, scope='ques_high_under', sequence_length=self.q_len) ques_high_h = tf.concat(ques_high_h, axis=2) # Hcl f_read_p_low = tf.contrib.rnn.LSTMCell(config.reading_rep_dim // 2) b_read_p_low = tf.contrib.rnn.LSTMCell(config.reading_rep_dim // 2) inp = dropout(para_enhanced_rep, keep_prob=config.keep_prob, is_train=self.is_train) para_low_h, _ = birnn(cell_fw=f_read_p_low, cell_bw=b_read_p_low, inputs=inp, dtype=tf.float32, scope='para_low_under', sequence_length=self.c_len) para_low_h = tf.concat(para_low_h, axis=2) # Hch f_read_p_high = tf.contrib.rnn.LSTMCell(config.reading_rep_dim // 2) b_read_p_high = tf.contrib.rnn.LSTMCell(config.reading_rep_dim // 2) inp = dropout(para_low_h, keep_prob=config.keep_prob, is_train=self.is_train) para_high_h, _ = birnn(cell_fw=f_read_p_high, cell_bw=b_read_p_high, inputs=inp, dtype=tf.float32, scope='para_high_under', sequence_length=self.c_len) para_high_h = tf.concat(para_high_h, axis=2) logging.info("Final Question Understanding") with tf.variable_scope("final_q_und"): f_uq = tf.contrib.rnn.LSTMCell(config.final_ques_under_dim // 2) b_uq = tf.contrib.rnn.LSTMCell(config.final_ques_under_dim // 2) inp = tf.concat([ques_low_h, 
ques_high_h], axis=2) inp = dropout(inp, keep_prob=config.keep_prob, is_train=self.is_train) final_q_und, _ = birnn(cell_fw=f_uq, cell_bw=b_uq, inputs=inp, dtype=tf.float32, scope='final_q_und', sequence_length=self.q_len) final_q_und = tf.concat(final_q_und, axis=2) logging.info("Fusion High level") with tf.variable_scope("high_level_fusion"): para_HoW = tf.concat([c_emb, ch_emb, para_low_h, para_high_h], axis=2) ques_HoW = tf.concat([q_emb, qh_emb, ques_low_h, ques_high_h], axis=2) para_fused_l = fuse(para_HoW, ques_HoW, self.c_mask, self.q_mask, config.sl_att_dim, B=ques_low_h, scope='low_level_fusion') para_fused_h = fuse(para_HoW, ques_HoW, self.c_mask, self.q_mask, config.sh_att_dim, B=ques_high_h, scope='high_level_fusion') para_fused_u = fuse(para_HoW, ques_HoW, self.c_mask, self.q_mask, config.su_att_dim, B=final_q_und, scope='understanding_fusion') inp = tf.concat([para_low_h, para_high_h, para_fused_l, para_fused_h, para_fused_u], axis=2) inp = dropout(inp, keep_prob=config.keep_prob, is_train=self.is_train) f_vc = tf.contrib.rnn.LSTMCell(config.fully_fused_para_dim // 2) b_vc = tf.contrib.rnn.LSTMCell(config.fully_fused_para_dim // 2) ff_para, _ = birnn(cell_fw=f_vc, cell_bw=b_vc, inputs=inp, dtype=tf.float32, scope='full_fused_para', sequence_length=self.c_len) ff_para = tf.concat(ff_para, axis=2) logging.info("Self boosting fusion") with tf.variable_scope("self_boosting_fusion"): para_HoW = tf.concat([c_emb, ch_emb, para_low_h, para_high_h, para_fused_l, para_fused_h, para_fused_u, ff_para], axis=2) ff_fused_para = fuse(para_HoW, para_HoW, self.c_mask, self.q_mask, config.selfboost_att_dim, B=ff_para, scope='self_boosted_fusion') f_sb = tf.contrib.rnn.LSTMCell(config.selfboost_rep_dim // 2) b_sb = tf.contrib.rnn.LSTMCell(config.selfboost_rep_dim // 2) inp = tf.concat([ff_para, ff_fused_para], axis=2) inp = dropout(inp, keep_prob=config.keep_prob, is_train=self.is_train) final_para_rep, _ = birnn(cell_fw=f_sb, cell_bw=b_sb, inputs=inp, dtype=tf.float32, scope='self_boosted') final_para_rep = tf.concat(final_para_rep, axis=2) logging.info("Fusion Net construction complete") logging.info("SQuAD specific construction begins") # now we have U_c, U_q = final_para_rep, final_q_und # The rest of the network is for SQuAD # TODO: This part is a little confusing logging.info("Sumarized question understanding vector") with tf.variable_scope("summarized_question"): w = tf.get_variable("W", shape=(config.final_ques_under_dim, 1), dtype=tf.float32) uq_s = tf.unstack(final_q_und, axis=1) attention_weight = [] for i, uq in enumerate(tqdm(uq_s, desc='Question Summary Vector')): s = tf.matmul(uq, w) attention_weight.append(s) attention_weight = tf.nn.softmax(tf.stack(attention_weight, axis=1)) summarized_question = tf.reduce_sum(tf.multiply(final_q_und, attention_weight), axis=1) logging.info("Span generation") # 通过embedding q 获得rQ with tf.variable_scope("pointer"): pointer = ptr_net(batch=N, hidden=summarized_question.get_shape().as_list( )[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(summarized_question, final_para_rep, summarized_question.get_shape().as_list()[-1], self.c_mask) with tf.variable_scope("predict"): self.start_logits = tf.nn.softmax(logits1) self.stop_logits = tf.nn.softmax(logits2) outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, 
axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits( logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits( logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(losses + losses2)
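The predict block above, like the analogous blocks in the later ready() variants, decodes an answer span by taking the outer product of the start and end distributions and keeping only the band where end >= start and end - start <= 15 (the tf.matrix_band_part(outer, 0, 15) call). A NumPy sketch of that decoding rule, with illustrative names:

import numpy as np

def decode_span(p_start, p_end, max_len=15):
    """p_start, p_end: [B, T] softmax distributions over context positions."""
    outer = p_start[:, :, None] * p_end[:, None, :]               # [B, T, T] joint span score
    idx = np.arange(outer.shape[1])
    band = (idx[None, :] >= idx[:, None]) & (idx[None, :] - idx[:, None] <= max_len)
    outer = outer * band[None]                                    # keep only spans of length <= max_len
    yp1 = outer.max(axis=2).argmax(axis=1)                        # best start position
    yp2 = outer.max(axis=1).argmax(axis=1)                        # best end position
    return yp1, yp2

p_start = np.random.dirichlet(np.ones(20), size=2)
p_end = np.random.dirichlet(np.ones(20), size=2)
print(decode_span(p_start, p_end))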
def FIND_module(sent,pats,word_mat,config,is_train,rnn,scope='Find_module'):#sents_emb [batch,maxlength_sent] pats [num_pats,maxlength_pat] [batch,maxlength_sent,dim] with tf.variable_scope(scope,reuse=tf.AUTO_REUSE): keep_prob = config.keep_prob d = config.hidden batch_size = tf.shape(sent)[0] maxlength_sent = tf.shape(sent)[1] dim = tf.shape(word_mat)[1] num_pats = tf.shape(pats)[0] sent_mask = tf.cast(sent, tf.bool) pat_mask = tf.cast(pats, tf.bool) pat_len = tf.reduce_sum(tf.cast(pat_mask, tf.int32), axis=1) with tf.variable_scope('embedding'): sent_emb = tf.nn.embedding_lookup(word_mat, sent) sent_emb_d = dropout(sent_emb, keep_prob=config.word_keep_prob, is_train=is_train, mode="embedding") pat_emb = tf.nn.embedding_lookup(word_mat, pats) pat_emb_d = dropout(pat_emb, keep_prob=config.word_keep_prob, is_train=is_train,mode='embedding') with tf.variable_scope('stack'): pad = tf.zeros([batch_size,1,dim],tf.float32) sent_emb_pad = tf.concat([pad,sent_emb,pad],axis=1) sent_emb_stack_2 = tf.reshape(sent_emb_pad,[batch_size,maxlength_sent+2,1,dim]) sent_emb_stack_2 = tf.concat([sent_emb_stack_2[:,0:-1,:],sent_emb_stack_2[:,1:,:]],axis=2) sent_emb_stack_2 = tf.reshape(sent_emb_stack_2,[batch_size*(maxlength_sent+1),2,dim]) sent_emb_pad2 = tf.concat([pad,pad,sent_emb,pad,pad],axis=1) sent_emb_stack_3 = tf.reshape(sent_emb_pad2,[batch_size,maxlength_sent+4,1,dim]) sent_emb_stack_3 = tf.concat([sent_emb_stack_3[:, 0:-2, :], sent_emb_stack_3[:, 1:-1, :], sent_emb_stack_3[:, 2:, :]], axis=2) sent_emb_stack_3 = tf.reshape(sent_emb_stack_3,[batch_size*(maxlength_sent+2),3,dim]) sent_emb_stack_1 = tf.reshape(sent_emb,[batch_size*maxlength_sent,1,dim]) with tf.variable_scope('stack_d'): pad = tf.zeros([batch_size,1,dim],tf.float32) sent_emb_pad_d = tf.concat([pad,sent_emb_d,pad],axis=1) sent_emb_stack_2_d = tf.reshape(sent_emb_pad_d,[batch_size,maxlength_sent+2,1,dim]) sent_emb_stack_2_d = tf.concat([sent_emb_stack_2_d[:,0:-1,:],sent_emb_stack_2_d[:,1:,:]],axis=2) sent_emb_stack_2_d = tf.reshape(sent_emb_stack_2_d,[batch_size*(maxlength_sent+1),2,dim]) sent_emb_pad2_d = tf.concat([pad,pad,sent_emb_d,pad,pad],axis=1) sent_emb_stack_3_d = tf.reshape(sent_emb_pad2_d,[batch_size,maxlength_sent+4,1,dim]) sent_emb_stack_3_d = tf.concat([sent_emb_stack_3_d[:, 0:-2, :], sent_emb_stack_3_d[:, 1:-1, :], sent_emb_stack_3_d[:, 2:, :]], axis=2) sent_emb_stack_3_d = tf.reshape(sent_emb_stack_3_d,[batch_size*(maxlength_sent+2),3,dim]) sent_emb_stack_1_d = tf.reshape(sent_emb_d,[batch_size*maxlength_sent,1,dim]) with tf.variable_scope("encoder"): with tf.variable_scope('encode_pat'): pat, _ = rnn(pat_emb, seq_len=pat_len, concat_layers=False) #[numpats,d] pat_d = dropout(pat, keep_prob=config.keep_prob, is_train=is_train) with tf.variable_scope('encode_sent'): cont_stack_3, _ = rnn(sent_emb_stack_3,seq_len=3 * tf.ones([batch_size * (maxlength_sent + 2)], tf.int32),concat_layers=False) cont_stack_2, _ = rnn(sent_emb_stack_2, seq_len=2*tf.ones([batch_size*(maxlength_sent+1)],tf.int32), concat_layers=False) #[batch_size*(maxlength_sent+1),d] cont_stack_1, _ = rnn(sent_emb_stack_1, seq_len=tf.ones([batch_size*maxlength_sent],tf.int32), concat_layers=False) #[batch_size*maxlength_sent,d] cont_stack_3_d = dropout(cont_stack_3, keep_prob=keep_prob, is_train=is_train) cont_stack_2_d = dropout(cont_stack_2, keep_prob=keep_prob, is_train=is_train) cont_stack_1_d = dropout(cont_stack_1, keep_prob=keep_prob, is_train=is_train) with tf.variable_scope('attention'): pat_d_a = attention(pat_d,config.att_hidden, mask=pat_mask) 
cont_stack_2_d_a = attention(cont_stack_2_d,config.att_hidden) cont_stack_3_d_a = attention(cont_stack_3_d,config.att_hidden) cont_stack_3_att = tf.reduce_sum(tf.expand_dims(cont_stack_3_d_a, axis=2) * cont_stack_3, axis=1) cont_stack_2_att = tf.reduce_sum(tf.expand_dims(cont_stack_2_d_a, axis=2) * cont_stack_2, axis=1) pat_d_att = tf.reduce_sum(tf.expand_dims(pat_d_a, axis=2) * pat_d, axis=1) pat_att = tf.reduce_sum(tf.expand_dims(pat_d_a, axis=2) * pat, axis=1) cont_stack_1_att = tf.squeeze(cont_stack_1) with tf.variable_scope('emb_attention'): pat_emb_d_a = attention(pat_emb_d, config.att_hidden, mask=pat_mask) pat_emb_d_att = tf.reduce_sum(tf.expand_dims(pat_emb_d_a, axis=2) * pat_emb_d, axis=1) pat_emb_att = tf.reduce_sum(tf.expand_dims(pat_emb_d_a, axis=2) * pat_emb, axis=1) sent_emb_stack_3_d_a = attention(sent_emb_stack_3_d, config.att_hidden) sent_emb_stack_3_att = tf.reduce_sum(tf.expand_dims(sent_emb_stack_3_d_a, axis=2) * sent_emb_stack_3, axis=1) sent_emb_stack_2_d_a = attention(sent_emb_stack_2_d, config.att_hidden) sent_emb_stack_2_att = tf.reduce_sum(tf.expand_dims(sent_emb_stack_2_d_a, axis=2) * sent_emb_stack_2, axis=1) sent_emb_stack_1_att = tf.squeeze(sent_emb_stack_1) with tf.variable_scope('Score'): scores_stack_2 = cosine(cont_stack_2_att,pat_d_att,weighted=False) scores_stack_1 = cosine(cont_stack_1_att,pat_d_att,weighted=False) scores_stack_3 = cosine(cont_stack_3_att, pat_d_att, weighted=False) scores_stack_3 = tf.reshape(scores_stack_3, [batch_size, 1, maxlength_sent + 2, num_pats]) scores_stack_2 = tf.reshape(scores_stack_2,[batch_size,1,maxlength_sent+1,num_pats]) scores_stack_1 = tf.reshape(scores_stack_1,[batch_size,1,maxlength_sent,num_pats]) scores_sim = cosine(pat_att, pat_d_att, weighted=False) with tf.variable_scope('emb_Score'): scores_stack_3_emb = cosine(sent_emb_stack_3_att,pat_emb_d_att) scores_stack_2_emb = cosine(sent_emb_stack_2_att,pat_emb_d_att) scores_stack_1_emb = cosine(sent_emb_stack_1_att,pat_emb_d_att) scores_stack_3_emb = tf.reshape(scores_stack_3_emb, [batch_size, 1, maxlength_sent + 2, num_pats]) scores_stack_2_emb = tf.reshape(scores_stack_2_emb,[batch_size,1,maxlength_sent+1,num_pats]) scores_stack_1_emb = tf.reshape(scores_stack_1_emb,[batch_size,1,maxlength_sent,num_pats]) phi = 0 scores_stack_3 = phi * scores_stack_3_emb + (1 - phi) * scores_stack_3 scores_stack_2 = phi*scores_stack_2_emb+(1-phi)*scores_stack_2 scores_stack_1 = phi*scores_stack_1_emb+(1-phi)*scores_stack_1 scores = tf.concat([scores_stack_3[:,:,0:-2,:],scores_stack_3[:,:,1:-1,:],scores_stack_3[:,:,2:,:],scores_stack_2[:,:,0:-1,:],scores_stack_2[:,:,1:,:],scores_stack_1],axis=1) scores = tf.reshape(scores,[batch_size,6,maxlength_sent,num_pats]) scores = tf.transpose(scores,[0,3,1,2]) scores = tf.reshape(scores,[batch_size*num_pats,6,maxlength_sent]) scores_sim_emb = cosine(pat_emb_att, pat_emb_d_att) scores_sim = phi*scores_sim_emb+(1-phi)*scores_sim with tf.variable_scope('SeqLabel'): seq = tf.layers.dense(tf.transpose(scores,[0,2,1]),1) seq = tf.squeeze(seq) seq = tf.reshape(seq,[batch_size,num_pats,maxlength_sent]) #seq = tf.reshape(tf.reduce_max(scores,axis=1),[batch_size,num_pats,maxlength_sent]) seq = tf.transpose(seq,[0,2,1]) seq = seq*tf.tile(tf.cast(tf.reshape(sent_mask,[batch_size,maxlength_sent,1]),tf.float32),[1,1,num_pats]) return seq,scores_sim
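FIND_module above compares every 1-, 2- and 3-token window of the sentence against every pattern: the 'stack' blocks build the overlapping windows by zero-padding and concatenating shifted copies, and the 'Score' blocks score each encoded window against each encoded pattern with cosine similarity (blended with the embedding-level scores by phi, which is 0 here). A NumPy sketch of both steps, with illustrative names and with mean-pooled vectors standing in for the RNN-and-attention summaries of windows and patterns:

import numpy as np

def stack_ngrams(sent_emb, n):
    """sent_emb: [B, L, D] -> [B*(L+n-1), n, D] overlapping n-token windows, zero padded."""
    B, L, D = sent_emb.shape
    pad = np.zeros((B, n - 1, D))
    padded = np.concatenate([pad, sent_emb, pad], axis=1)                    # [B, L + 2(n-1), D]
    windows = np.stack([padded[:, i:i + L + n - 1] for i in range(n)], axis=2)
    return windows.reshape(B * (L + n - 1), n, D)

def pairwise_cosine(a, b, eps=1e-8):
    """a: [M, D] window summaries, b: [P, D] pattern summaries -> [M, P] similarities."""
    a = a / (np.linalg.norm(a, axis=1, keepdims=True) + eps)
    b = b / (np.linalg.norm(b, axis=1, keepdims=True) + eps)
    return a @ b.T

sent_emb = np.random.randn(2, 5, 4)
windows = stack_ngrams(sent_emb, n=3)                 # (2*7, 3, 4), one window per padded position
window_vec = windows.mean(axis=1)                     # stand-in for the encoder + attention pooling
pattern_vec = np.random.randn(6, 4)                   # stand-in for the encoded patterns
phi = 0.0                                             # same constant as in FIND_module
scores = (1 - phi) * pairwise_cosine(window_vec, pattern_vec)   # [num_windows, num_patterns]
print(windows.shape, scores.shape)                    # (14, 3, 4) (14, 6)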
def __init__(self, config, word_mat=None, char_mat=None, trainable=True, opt=True): N = config.batch_size * 4 self.article_maxlen, self.question_maxlen, self.opt_maxlen = config.para_limit, config.ques_limit, config.opt_limit self.config = config self.article_input = tf.placeholder(tf.int32, name='article', shape=[N, self.article_maxlen]) self.question_input = tf.placeholder(tf.int32, name='question', shape=[N, self.question_maxlen]) self.option_input = tf.placeholder(tf.int32, name='option', shape=[N, self.opt_maxlen]) self.labels_input = tf.placeholder(tf.int32, name='label', shape=[N]) self.global_step = tf.get_variable( 'global_step', shape=[], dtype=tf.int32, initializer=tf.constant_initializer(0), trainable=False) self.article = self.article_input self.question = self.question_input self.option = self.option_input self.labels = self.labels_input self.question = tf.concat([self.question, self.option], axis=1) # concat question and option self.emb_keep_prob = tf.get_variable( "emb_keep_prob", shape=[], dtype=tf.float32, trainable=False, initializer=tf.constant_initializer(config.emb_keep_prob)) self.keep_prob = tf.get_variable("keep_prob", shape=[], dtype=tf.float32, trainable=False, initializer=tf.constant_initializer( config.keep_prob)) self.is_train = tf.get_variable("is_train", shape=[], dtype=tf.bool, trainable=False) self.word_mat = dropout(tf.get_variable( "word_mat", initializer=tf.constant(word_mat, dtype=tf.float32), trainable=False), keep_prob=self.emb_keep_prob, is_train=self.is_train, mode="embedding") self.article_mask = tf.cast(self.article, tf.bool) self.question_mask = tf.cast(self.question, tf.bool) self.labels = tf.cast(self.labels, tf.float32) self.article_len = tf.reduce_sum(tf.cast(self.article_mask, tf.int32), axis=1) self.question_len = tf.reduce_sum(tf.cast(self.question_mask, tf.int32), axis=1) self.article_maxlen = tf.reduce_max(self.article_len) self.question_maxlen = tf.reduce_max(self.question_len) # self.article = tf.slice(self.article, [0, 0], [N, self.article_maxlen]) # self.question = tf.slice(self.question, [0, 0], [N, self.question_maxlen]) # self.article_mask = tf.slice(self.article_mask, [0, 0], [N, self.article_maxlen]) # self.question_mask = tf.slice(self.question_mask, [0, 0], [N, self.question_maxlen]) # self.labels = tf.slice(self.labels, [0], [N]) self.define_model() if trainable: self.opt = tf.train.AdadeltaOptimizer(config.learning_rate) self.train_op = self.opt.minimize(self.loss) # self.lr = tf.get_variable("lr", shape=[], dtype=tf.float32, trainable=False) # self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.lr, epsilon=1e-6) grads = self.opt.compute_gradients(self.loss) for grad, var in grads: tf.summary.histogram(var.name, var) tf.summary.histogram(var.name + '/gradient', grad) # gradients, variables = zip(*grads) # capped_grads, _ = tf.clip_by_global_norm(gradients, config.grad_clipping) # self.train_op = self.opt.apply_gradients(zip(capped_grads, variables), global_step=self.global_step) self.merged_summary_op = tf.summary.merge_all()
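The constructor above derives masks and true lengths directly from the padded id tensors (a nonzero id marks a real token) and, in the commented-out slicing, trims the batch to the longest real sequence. The same bookkeeping in NumPy, with a hypothetical padded batch:

import numpy as np

ids = np.array([[4, 7, 9, 0, 0],
                [3, 5, 0, 0, 0]])            # 0 is the padding id

mask = ids.astype(bool)                      # tf.cast(ids, tf.bool)
lengths = mask.sum(axis=1)                   # tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
maxlen = lengths.max()                       # tf.reduce_max(lengths)

ids_trimmed = ids[:, :maxlen]                # the commented-out tf.slice trick
mask_trimmed = mask[:, :maxlen]
print(lengths, maxlen, ids_trimmed.shape)    # [3 2] 3 (2, 3)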
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope("emb"): # with tf.variable_scope("char"): # ch_emb = tf.reshape(tf.nn.embedding_lookup( # self.char_mat, self.ch), [N * PL, CL, dc]) # qh_emb = tf.reshape(tf.nn.embedding_lookup( # self.char_mat, self.qh), [N * QL, CL, dc]) # # ch_emb = dropout( # ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) # qh_emb = dropout( # qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) # # cell_fw = tf.contrib.rnn.GRUCell(dg) # cell_bw = tf.contrib.rnn.GRUCell(dg) # # _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( # cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) # ch_emb = tf.concat([state_fw, state_bw], axis=1) # _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( # cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) # qh_emb = tf.concat([state_fw, state_bw], axis=1) # qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) # ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) embedding = tf.get_variable( 'embedding', [config.vocab_size, config.embedding_size], initializer=tf.random_uniform_initializer(minval=-0.05, maxval=0.05)) self.regularizer = tf.nn.l2_loss(embedding) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(embedding, self.c) q_emb = tf.nn.embedding_lookup(embedding, self.q) c_emb = dropout(c_emb, keep_prob=config.keep_prob, is_train=self.is_train) q_emb = dropout(q_emb, keep_prob=config.keep_prob, is_train=self.is_train) c_emb = tf.reshape(c_emb, [N, PL, config.embedding_size]) q_emb = tf.reshape(q_emb, [N, QL, config.embedding_size]) # c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) # q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) # c_emb = tf.concat([c_emb, ch_emb], axis=2) # q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): # 1层 lstm对输出进行编码 rnn_c = gru(num_layers=1, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) rnn_q = gru(num_layers=1, num_units=d, batch_size=N, input_size=q_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn_c(c_emb, seq_len=self.c_len) q = rnn_q(q_emb, seq_len=self.q_len) with tf.variable_scope("attention"): M = tf.matmul(c, q, adjoint_b=True) M_mask = tf.to_float( tf.matmul(tf.cast(tf.expand_dims(self.c_mask, -1), tf.int32), tf.cast(tf.expand_dims(self.q_mask, 1), tf.int32))) alpha = softmax(M, 1, M_mask) # (batch_size,M,N) beta = softmax(M, 2, M_mask) # (batch_size,M,N) # query_importance = tf.expand_dims(tf.reduce_mean(beta, reduction_indices=1), -1) query_importance = tf.expand_dims( tf.reduce_sum(beta, 1) / tf.to_float(tf.expand_dims(PL, -1)), -1) # (batch_size,N,1) s = tf.squeeze(tf.matmul(alpha, query_importance), [2]) # (batch_size,M) #unpacked_s = zip(tf.unstack(s, config.batch_size), tf.unstack(self.c, config.batch_size)) #y_hat=(batch_size,config.vocab_size) (代表每个词为答案的概率) #y_hat = tf.stack([tf.unsorted_segment_sum(attentions, sentence_ids, config.vocab_size) for (attentions, sentence_ids) in unpacked_s]) match = c * tf.reshape(s, [-1, PL, 1]) #(batch_size,max_c_len,dim) #通过embedding q 获得rQ with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], 
keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope("predict"): self.start_logits = tf.nn.softmax(logits1) self.stop_logits = tf.nn.softmax(logits2) outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2) self.loss = tf.reduce_mean( losses + losses2) + config.l2_reg * self.regularizer
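The attention block in the ready() above is an attention-sum style reader: it forms a context-question match matrix M, normalizes it along each axis, averages the question-side weights over context positions (the division by PL) to get a per-question-word importance, and projects that back onto context positions to score each context token. A NumPy sketch of those steps, with the padding mask omitted for brevity and illustrative names:

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def attention_sum(c, q):
    """c: [B, M, D] context states, q: [B, N, D] question states -> [B, M] token scores."""
    M = c @ q.transpose(0, 2, 1)                        # [B, M, N] match matrix
    alpha = softmax(M, axis=1)                          # normalize over context positions
    beta = softmax(M, axis=2)                           # normalize over question positions
    query_importance = beta.mean(axis=1)[..., None]     # [B, N, 1], mean over context positions
    return (alpha @ query_importance)[..., 0]           # [B, M] per-token answer score

c = np.random.randn(2, 7, 6)
q = np.random.randn(2, 4, 6)
print(attention_sum(c, q).shape)                        # (2, 7)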
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope('emb'): with tf.variable_scope('char'): ch_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout( ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout( qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope('word'): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope('encoding'): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope('attention'): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) with tf.variable_scope('match'): self_att = dot_attention( att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) with tf.variable_scope('pointer'): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list( )[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope('predict'): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits1, labels=tf.stop_gradient(self.y1)) losses2 = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits2, labels=tf.stop_gradient(self.y2)) self.loss = tf.reduce_mean(losses + losses2)
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = \ config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, \ config.char_dim, config.char_hidden gru = CudnnGRU if config.use_cudnn else NativeGRU with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) # representation of paragraph q = rnn(q_emb, seq_len=self.q_len) # representation of question with tf.variable_scope( "attention" ): # gated att rnn (using dot att from Attention is All You Need actually) qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) with tf.variable_scope("match"): # self-matching rnn self_att = dot_attention(att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = PointerNet(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(losses + losses2)
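The attention and match blocks in this and the neighbouring ready() variants all go through the same dot_attention primitive: dot-product scores between the sequence and its memory, a mask that pushes padded memory positions to a large negative value before the softmax, and a weighted sum of the memory. A minimal NumPy sketch of that core; the repo's dot_attention additionally applies dropout, dense projections and a gating layer, which this sketch leaves out:

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def masked_dot_attention(inputs, memory, memory_mask):
    """inputs: [B, M, D], memory: [B, N, D], memory_mask: [B, N] bool -> [B, M, D]."""
    scores = inputs @ memory.transpose(0, 2, 1) / np.sqrt(inputs.shape[-1])   # [B, M, N]
    scores = np.where(memory_mask[:, None, :], scores, -1e30)                 # mask out padding
    weights = softmax(scores, axis=-1)
    return weights @ memory                                                   # attended memory

x = np.random.randn(2, 5, 8)
m = np.random.randn(2, 4, 8)
mask = np.array([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=bool)
print(masked_dot_attention(x, m, mask).shape)    # (2, 5, 8)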
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru gi = [] att_vP = [] for i in range(config.max_para): with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape(tf.nn.embedding_lookup(\ self.char_mat, self.pr_ch), [N * PL, CL, dc]) # self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope("attention"): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) # att is the v_P att_vP.append(att) """ with tf.variable_scope("match"): self_att = dot_attention( att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) """ with tf.variable_scope("pointer"): # r_Q: init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, att, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul( tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits( logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits( logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(losses + losses2) # print losses #condition = tf.greater(self.loss, 11) #self.yp1 = tf.where(condition, tf.Print(self.yp1,[self.yp1],message="Yp1:"), self.yp1) #self.yp2 = tf.where(condition, tf.Print(self.yp2,[self.yp2],message="Yp2:"), self.yp1) for i in range(config.max_para): # Passage ranking with 
tf.variable_scope("passage-ranking-attention"): vj_P = dropout(att, keep_prob=keep_prob, is_train=is_train) r_Q = dropout(init, keep_prob=keep_prob, is_train=is_train) r_P = attention(r_Q, vj_P, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) #rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=pr_att.get_shape( #).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) #att_rp = rnn(qc_att, seq_len=self.c_len) # Wg concatenate = tf.concat([init, att_rp], axis=2) g = tf.nn.tanh( dense(concatenate, hidden=d, use_bias=False, scope="g")) g_ = dense(g, 1, use_bias=False, scope="g_") gi.append(g_) gi_ = tf.convert_to_tensor(gi) gi = tf.nn.softmax(gi_) self.pr_loss = tf.nn.softmax_cross_entropy_with_logits(logits=gi, labels=self.pr)
def ready(self): config = self.config N, PL, QL, d = config.batch_size, self.c_maxlen, self.q_maxlen, config.hidden keep_prob, is_train = config.keep_prob, config.is_train gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope("emb"): with tf.name_scope("word"): c = tf.nn.embedding_lookup(self.word_mat, self.c) q = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c, self.fs, self.fe], axis=2) q_emb = q with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c_enc, bw_final_state_c = rnn(c_emb, seq_len=self.c_len) q_enc, bw_final_state_q = rnn(q_emb, seq_len=self.q_len) encoder_outputs = tf.concat([c_enc, q_enc], axis=1) bw_final_state = (bw_final_state_c, bw_final_state_q) with tf.variable_scope("attention"): bi_final_hidden = dropout(bw_final_state, keep_prob=keep_prob, is_train=is_train) source_sequence_length = tf.add(PL, QL) logits, sample_id, final_context_state = _build_decoder( encoder_outputs, bi_final_hidden, config, is_train, source_sequence_length, target_sequence_length, target_input, embedding_decoder) """ qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train, name_scope="attention_layer") rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) # att is the v_P if i==0: att_vP = att else: att_vP = tf.concat([att_vP, att], axis=1) #att = tf.Print(att,[att],message="att:") print("att:",att.get_shape().as_list()) print("att_vP:",att_vP.get_shape().as_list()) """ with tf.variable_scope("pointer"): # r_Q: init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) print("rQ:", init.get_shape().as_list()) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, att_vP, d, self.c_pr_mask) tf.summary.histogram('rQ_init', init) tf.summary.histogram('pointer_logits_1', logits1) tf.summary.histogram('pointer_logits_2', logits2) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1_pr) losses2 = tf.nn.softmax_cross_entropy_with_logits( logits=logits2, labels=self.y2_pr) #losses1_2 = tf.reduce_mean(losses1_2, axis=0) self.loss = tf.reduce_mean(losses + losses2) # print losses #condition = tf.greater(self.loss, 11) #self.yp1 = tf.where(condition, tf.Print(self.yp1,[self.yp1],message="Yp1:"), self.yp1) #self.yp2 = tf.where(condition, tf.Print(self.yp2,[self.yp2],message="Yp2:"), self.yp1) if config.with_passage_ranking: gi = None for i in range(config.max_para): # Passage ranking if i == 0: with tf.variable_scope("passage-ranking-attention"): #att_vP = tf.Print(att_vP,[att_vP.get_shape()],message="att_vP:") vj_P = att_vP[:, i * 400:(i + 1) * 400, :] pr_att = pr_attention( batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train, name_scope="passage_ranking_att_layer") r_P = pr_att(init, vj_P, d, self.c_mask) tf.summary.histogram('r_P_' + 
str(i), r_P) #r_P = tf.Print(r_P,[r_P],message="r_p") # Wg concatenate = tf.concat([init, r_P], axis=1) g = tf.nn.tanh( dense(concatenate, hidden=d, use_bias=False, scope="g", name_scope="dense_pr_att_layer_1")) g_ = dense(g, 1, use_bias=False, scope="g_", name_scope="dense_pr_att_layer_2") #g = tf.Print(g,[g],message="g") if i == 0: gi = tf.reshape(g_, [N, 1]) else: gi = tf.concat([gi, tf.reshape(g_, [N, 1])], axis=1) else: with tf.variable_scope("passage-ranking-attention", reuse=True): #att_vP = tf.Print(att_vP,[att_vP.get_shape()],message="att_vP:") vj_P = att_vP[:, i * 400:(i + 1) * 400, :] pr_att = pr_attention( batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train, name_scope="passage_ranking_att_layer") r_P = pr_att(init, vj_P, d, self.c_mask) tf.summary.histogram('r_P_' + str(i), r_P) #r_P = tf.Print(r_P,[r_P],message="r_p") # Wg concatenate = tf.concat([init, r_P], axis=1) g = tf.nn.tanh( dense(concatenate, hidden=d, use_bias=False, scope="g", name_scope="dense_pr_att_layer_1")) g_ = dense(g, 1, use_bias=False, scope="g_", name_scope="dense_pr_att_layer_2") #g = tf.Print(g,[g],message="g") if i == 0: gi = tf.reshape(g_, [N, 1]) else: gi = tf.concat([gi, tf.reshape(g_, [N, 1])], axis=1) tf.summary.histogram('gi', gi) #gi_ = tf.convert_to_tensor(gi,dtype=tf.float32) #self.gi = tf.nn.softmax(gi_) #self.losses3 = tf.nn.softmax_cross_entropy_with_logits( # logits=gi_, labels=tf.reshape(self.pr,[-1,1])) self.losses3 = tf.nn.softmax_cross_entropy_with_logits( logits=gi, labels=self.pr) #self.losses3 = tf.Print(self.losses3,[self.losses3,tf.reduce_max(self.losses3), # tf.reduce_max(self.pr),tf.reduce_max(gi)],message="losses3:") self.pr_loss = tf.reduce_mean(self.losses3) #self.pr_loss = tf.Print(self.pr_loss,[self.pr_loss]) self.r = tf.constant(0.8) self.e_loss1 = tf.multiply(self.r, self.loss) self.e_loss2 = tf.multiply(tf.subtract(tf.constant(1.0), self.r), self.pr_loss) self.e_loss = tf.add(self.e_loss1, self.e_loss2)
def get_vP(self, i, att_vP, q_, answer_info, y1, y2, c_pr_mask, cmax_c, clen_c): # max para limit config = self.config opt = True MPL = config.para_limit zero = tf.constant(0, dtype=tf.int32) j = tf.constant(0, dtype=tf.int32) c = self.c_pr[:, i * MPL:(i + 1) * MPL] ch = self.ch_pr[:, i * MPL:(i + 1) * MPL, :] qh = self.qh q = self.q c_mask = tf.cast(c, tf.bool) q_mask = self.q_mask # passage ranking line: #self.pr_mask = tf.cast(self.p, tf.bool) c_len = tf.reduce_sum(tf.cast(c_mask, tf.int32), axis=1) c_len_int = tf.reshape(c_len, [config.batch_size, 1]) q_len = self.q_len if opt: N, CL = config.batch_size, config.char_limit c_maxlen = tf.reduce_max(c_len) c_maxlen_int = tf.reshape(tf.reduce_max(c_len_int), [1]) q_maxlen = q_len c = tf.slice(c, [0, 0], [N, c_maxlen]) c_mask = tf.slice(c_mask, [0, 0], [N, c_maxlen]) q_mask = self.q_mask ch = tf.slice(ch, [0, 0, 0], [N, c_maxlen, CL]) qh = self.qh temp = self.y2[:, i * MPL:(i + 1) * MPL] #self.y1 = tf.Print(self.y1,["y1:",tf.shape(self.y1)]) #self.y2 = tf.Print(self.y2,["y2:",tf.shape(self.y2)]) y1__ = tf.slice(self.y1, [0, i * MPL], [N, c_maxlen]) #y1__ = tf.Print(y1__,["y1__:",tf.shape(y1__)]) y2__ = tf.slice(self.y2, [0, i * MPL], [N, c_maxlen]) def b1(): return c_mask def b2(): return tf.concat([c_pr_mask, c_mask], axis=1) c_pr_mask = tf.cond(tf.equal(i, zero), b1, b2) def b3(): return c_maxlen_int, c_len_int def b4(): print(clen_c.get_shape(), c_len_int.get_shape()) a = tf.concat([cmax_c, c_maxlen_int], axis=0) b = tf.concat([clen_c, c_len_int], axis=1) return a, b cmax_c, clen_c = tf.cond(tf.equal(i, zero), b3, b4) # passage ranking #print(self.ch_pr.get_shape()) #print(self.c_pr.get_shape()) #c_pr_mask = tf.cast(self.c_pr, tf.bool) #c_pr_mask = tf.slice(self.c_pr_mask, [0, i*MPL], [N, c_maxlen]) ### ### #ch_pr = tf.slice(self.ch_pr, [0, i*MPL, 0], [N, c_maxlen, CL]) else: self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit ch_len = tf.reshape( tf.reduce_sum(tf.cast(tf.cast(ch, tf.bool), tf.int32), axis=2), [-1]) qh_len = self.qh_len config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, c_maxlen, self.q_maxlen, \ config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn_gru else native_gru with tf.variable_scope("emb"): with tf.variable_scope("char"): #CL = tf.Print(CL,[CL],message="CL:") #PL = tf.Print(PL,[PL],message="PL:") #self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr.get_shape()],message="ch_pr:") #self.c_pr = tf.reshape(self.c_pr, [N, 12, PL]) #print(self.ch.get_shape()) #print(self.ch_pr.get_shape()) #print(self.c.get_shape()) #print(self.c_pr.get_shape()) #self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr[:,2:,:]],message="ch_pr") ch_emb = tf.reshape(tf.nn.embedding_lookup(\ self.char_mat, ch), [N * PL, CL, dc]) # self.char_mat, self.ch), [N * PL, CL, dc]) print(ch.shape, PL) print(qh.shape, QL) qh_emb = tf.reshape(tf.nn.embedding_lookup(\ self.char_mat, qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) #ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb") #qh_emb = tf.Print(qh_emb,[qh_emb],message="qh_emb") qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( self.cell_fw, self.cell_bw, ch_emb, ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( self.cell_fw, self.cell_bw, qh_emb, qh_len, dtype=tf.float32) #state_fw = 
tf.Print(state_fw,[state_fw],message="state_fw") #state_bw = tf.Print(state_bw,[state_bw],message="state_bw") qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) #ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb") with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, c) q_emb = tf.nn.embedding_lookup(self.word_mat, q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding", reuse=tf.AUTO_REUSE): """ def f1(): self.rnn1 = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) return self.rnn1(c_emb, seq_len=self.c_len) def f2(): return self.rnn1(c_emb, seq_len=self.c_len) c = tf.cond(tf.equal(i, zero), f1, f2) #q = tf.cond(tf.equal(i, zero), f1, f2) #c = rnn(c_emb, seq_len=self.c_len) q = self.rnn1(q_emb, seq_len=self.q_len) self.q_enc = q #self.rnn1 = rnn """ rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=c_len) q = rnn(q_emb, seq_len=q_len) #c_len = tf.Print(c_len,[c_len,tf.shape(c)],message="C:") #self.q_enc = q q__ = q with tf.variable_scope("attention", reuse=tf.AUTO_REUSE): qc_att = dot_attention(c, q, mask=q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train, name_scope="attention_layer") """ print("qc_att:",qc_att.shape) def f3(): self.rnn2 = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) return self.rnn2(qc_att, seq_len=self.c_len) def f4(): return self.rnn2(qc_att, seq_len=self.c_len) att = tf.cond(tf.equal(self.i, zero), f3, f4) """ rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=c_len) ### #att = tf.Print(att,[tf.greater(tf.cast(tf.shape(att)[1],tf.int64),y1_), # tf.shape(att)],message="att:") def f5(): return att def f6(): return tf.concat([att_vP, att], axis=1) #att = rnn(qc_att, seq_len=self.c_len) #self.rnn2 = rnn # att is the v_P att_vP = tf.cond(tf.equal(i, zero), f5, f6) def f7(): return y1__, y2__ def f8(): return tf.concat([y1, y1__], axis=1), tf.concat([y2, y2__], axis=1) y1, y2 = tf.cond(tf.equal(i, zero), f7, f8) return tf.add(i, tf.constant( 1)), att_vP, q__, answer_info, y1, y2, c_pr_mask, cmax_c, clen_c
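get_vP above is the body of a paragraph-wise loop: on the first paragraph the tf.cond branches initialize att_vP, y1, y2 and the combined mask, and on later iterations they concatenate the new paragraph's tensors along the time axis. The same accumulation written as a plain Python loop over per-paragraph encodings (names illustrative):

import numpy as np

def accumulate_paragraphs(paragraph_encodings):
    """paragraph_encodings: list of [B, T_i, D] arrays -> [B, sum(T_i), D]."""
    att_vP = None
    for i, att in enumerate(paragraph_encodings):
        att_vP = att if i == 0 else np.concatenate([att_vP, att], axis=1)   # the f5/f6 cond
    return att_vP

parts = [np.random.randn(2, t, 4) for t in (3, 5, 2)]
print(accumulate_paragraphs(parts).shape)    # (2, 10, 4)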
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope("attention"): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train, name_scope="attention_layer") rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) tf.summary.histogram('vt_P', att) self.att_logits = tf.get_collection('Softmax_logits')[0] self.att_outputs = tf.get_collection('MatMul_outputs')[0] with tf.variable_scope("match"): self_att = dot_attention(att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train, name_scope="match_layer") rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) tf.summary.histogram('self_match', match) self.match_logits = tf.get_collection('Softmax_logits')[1] self.match_outputs = tf.get_collection('MatMul_outputs')[1] with tf.variable_scope("pointer"): # r_Q: init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) tf.summary.histogram('rQ_init', init) tf.summary.histogram('pointer_logits_1', logits1) tf.summary.histogram('pointer_logits_2', logits2) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, 
labels=self.y2) self.loss = tf.reduce_mean(losses + losses2) #### self.predict_outer_start = tf.reduce_max(outer, axis=2) self.predict_outer_end = tf.reduce_max(outer, axis=1)
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout( ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout( qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope("attention"): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) with tf.variable_scope("match"): self_att = dot_attention( att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list( )[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits( logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits( logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(losses + losses2)
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = native_rnn c_elmo_features = self.elmo(self.c_elmo) q_elmo_features = self.elmo(self.q_elmo) with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_elmo_emb = weight_layers('embedding', c_elmo_features, l2_coef=0.0, do_layer_norm=False)['weighted_op'] tf.get_variable_scope().reuse_variables() q_elmo_emb = weight_layers('embedding', q_elmo_features, l2_coef=0.0, do_layer_norm=False)['weighted_op'] c_elmo_emb = dropout(c_elmo_emb, keep_prob=config.elmo_keep_prob, is_train=self.is_train) q_elmo_emb = dropout(q_elmo_emb, keep_prob=config.elmo_keep_prob, is_train=self.is_train) c_emb = tf.concat([c_emb, ch_emb, c_elmo_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb, q_elmo_emb], axis=2) with tf.variable_scope("encoding"): rnn = gru(config.cell, num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope("attention"): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(config.cell, num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) with tf.variable_scope("match"): self_att = dot_attention(att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(config.cell, num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) c_elmo_enc = weight_layers('encoding', c_elmo_features, l2_coef=0.0, do_layer_norm=False)['weighted_op'] tf.get_variable_scope().reuse_variables() q_elmo_enc = weight_layers('encoding', q_elmo_features, l2_coef=0.0, do_layer_norm=False)['weighted_op'] c_elmo_enc = dropout(c_elmo_enc, keep_prob=config.elmo_keep_prob, is_train=self.is_train) q_elmo_enc = dropout(q_elmo_enc, keep_prob=config.elmo_keep_prob, is_train=self.is_train) match = tf.concat([match, c_elmo_enc], -1) q = tf.concat([q, q_elmo_enc], -1) with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], 
keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits1, labels=tf.stop_gradient(self.y1)) losses2 = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits2, labels=tf.stop_gradient(self.y2)) self.loss = tf.reduce_mean(losses + losses2)
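weight_layers in the variant above learns a scalar mixture of the biLM layers, which is then dropped out and concatenated to the word and char embeddings at the input and again before the pointer. A NumPy sketch of that scalar mix, assuming the standard ELMo formulation (softmax-normalized per-layer weights plus a global scale); the variable names s and gamma are illustrative:

import numpy as np

def softmax(x):
    x = x - x.max()
    e = np.exp(x)
    return e / e.sum()

def scalar_mix(layers, s, gamma):
    """layers: [L, B, T, D] biLM layer activations -> [B, T, D] weighted combination."""
    w = softmax(s)                                     # one normalized weight per layer
    mixed = np.tensordot(w, layers, axes=([0], [0]))   # sum_l w_l * layers[l]
    return gamma * mixed

layers = np.random.randn(3, 2, 7, 16)                  # e.g. 3 ELMo layers
print(scalar_mix(layers, np.zeros(3), 1.0).shape)      # (2, 7, 16)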
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) self.c_emb = c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) bad_c_emb = tf.stop_gradient(c_emb) bad_q_emb = tf.stop_gradient(q_emb) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=bad_c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) self.c_rnn = rnn(bad_c_emb, seq_len=self.c_len) self.q_rnn = rnn(bad_q_emb, seq_len=self.q_len) badptr_c = tf.stop_gradient(self.c_rnn) badptr_q = tf.stop_gradient(self.q_rnn) old_rnn = rnn with tf.variable_scope("badptr_attention"): qc_att, self.badptr_qc_att = dot_attention( badptr_c, badptr_q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train, give=True) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) self.att = [rnn(qc_att, seq_len=self.c_len)] self.att += [self.att[-1][:, -1, :]] with tf.variable_scope("badptr_dense"): for _ in range(3): self.att += [ tf.nn.dropout(tf.keras.layers.Dense(300)(self.att[-1]), keep_prob=config.keep_prob) ] with tf.variable_scope("badptr"): init = self.att[-1] pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, self.att[0], d, self.c_mask) with tf.variable_scope("badptr_predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.bad_yp1_distrib = tf.reduce_max(outer, axis=2) self.bad_yp2_distrib = tf.reduce_max(outer, axis=1) self.bad_yp1 = tf.argmax(self.bad_yp1_distrib, axis=1) self.bad_yp2 = tf.argmax(self.bad_yp2_distrib, axis=1) losses = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits1, labels=tf.stop_gradient(self.bad_y1)) losses2 = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits2, labels=tf.stop_gradient(self.bad_y2)) self.loss = tf.reduce_mean(losses + losses2) # recompute c with bitmask left = tf.sequence_mask(self.bad_yp1, tf.shape(c_emb)[1]) right = tf.logical_not( tf.sequence_mask(self.bad_yp2 + 1, tf.shape(c_emb)[1])) self.combo = combo = tf.logical_or(left, right) ### FOR TESTING ### ## self.combo = 
combo = tf.cast(tf.ones_like(combo), tf.bool) def adjust(c_emb_combo): c_emb, combo = c_emb_combo foo = c_emb bar = tf.boolean_mask(foo, combo) return tf.cond( tf.logical_and(tf.equal(combo[0], False), tf.equal(combo[1], True)), false_fn=lambda: tf.pad( bar, [[0, tf.shape(foo)[0] - tf.shape(bar)[0]], [0, 0]]), true_fn=lambda: foo) self.c_emb_new = c_emb_new = tf.map_fn(adjust, (c_emb, combo), dtype=(tf.float32)) self.c_len = tf.reduce_sum(tf.cast( tf.logical_and(self.c_mask, self.combo), tf.int32), axis=-1) self.c_mask = tf.sequence_mask( tf.reduce_sum(tf.cast(tf.logical_and(self.c_mask, self.combo), tf.int32), axis=-1), tf.shape(self.c_mask)[1]) with tf.variable_scope("encoding", reuse=True): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train, super_hacky_reload=True) #### SEQ LEN HAS TO BE FIXED!!!! #### c = rnn(c_emb_new, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) self.c_ck = c self.q_ck = c ### MAKE SURE THESE ARE RUN!!! ### print('RUN ASSIGN TRICK OPS (model.assign_trick_ops)!!') self.assign_trick_ops = [] for i in range(len(rnn.init_fw)): self.assign_trick_ops += [ tf.assign(rnn.init_fw[i], old_rnn.init_fw[i]) ] self.assign_trick_ops += [ tf.assign(rnn.init_bw[i], old_rnn.init_bw[i]) ] with tf.variable_scope("attention"): qc_att, self.qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train, give=True) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) self.att_ck = att with tf.variable_scope("match"): self_att = dot_attention(att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) self.match_ck = match with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1_distrib = tf.reduce_max(outer, axis=2) self.yp2_distrib = tf.reduce_max(outer, axis=1) self.yp1 = tf.argmax(self.yp1_distrib, axis=1) self.yp2 = tf.argmax(self.yp2_distrib, axis=1)
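The adjust function above removes the tokens inside the predicted bad span (the positions where combo is False) and re-pads each example back to its original length before the context is re-encoded; the tf.cond around it only guards an edge case where the span starts at position 0. A NumPy sketch of the per-example operation, without that guard (names illustrative):

import numpy as np

def adjust(c_emb, combo):
    """c_emb: [T, D] one example's embeddings, combo: [T] boolean keep-mask."""
    kept = c_emb[combo]                                          # tf.boolean_mask
    pad = np.zeros((c_emb.shape[0] - kept.shape[0], c_emb.shape[1]))
    return np.concatenate([kept, pad], axis=0)                   # left-pack and re-pad

c_emb = np.random.randn(6, 4)
combo = np.array([True, True, False, False, True, True])         # drop positions 2..3
print(adjust(c_emb, combo).shape)    # (6, 4)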
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) _, qh_emb = stacked_gru(qh_emb, dg, num_layers=1, seq_len=self.qh_len, keep_prob=self.keep_prob, is_train=self.is_train) tf.get_variable_scope().reuse_variables() _, ch_emb = stacked_gru(ch_emb, dg, num_layers=1, seq_len=self.ch_len, keep_prob=self.keep_prob, is_train=self.is_train) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): c, _ = stacked_gru(c_emb, d, batch=N, num_layers=3, seq_len=self.c_len, keep_prob=self.keep_prob, is_train=self.is_train) tf.get_variable_scope().reuse_variables() q, _ = stacked_gru(q_emb, d, batch=N, num_layers=3, seq_len=self.q_len, keep_prob=self.keep_prob, is_train=self.is_train) with tf.variable_scope("attention"): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=self.keep_prob, is_train=self.is_train) att, _ = stacked_gru(qc_att, d, num_layers=1, seq_len=self.c_len, keep_prob=self.keep_prob, is_train=self.is_train) with tf.variable_scope("match"): self_att = dot_attention(att, att, mask=self.c_mask, hidden=d, keep_prob=self.keep_prob, is_train=self.is_train) match, _ = stacked_gru(self_att, d, num_layers=1, seq_len=self.c_len, keep_prob=self.keep_prob, is_train=self.is_train) with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=self.ptr_keep_prob, is_train=self.is_train) d_match = dropout(match, keep_prob=self.ptr_keep_prob, is_train=self.is_train) hidden = init.get_shape().as_list()[-1] cell_fw = GRUCell(hidden) cell_bw = GRUCell(hidden) with tf.variable_scope("fw"): inp, logits1_fw = pointer(d_match, init, d, mask=self.c_mask) _, state = cell_fw(inp, init) tf.get_variable_scope().reuse_variables() _, logits2_fw = pointer(d_match, state, d, mask=self.c_mask) with tf.variable_scope("bw"): inp, logits2_bw = pointer(d_match, init, d, mask=self.c_mask) _, state = cell_bw(inp, init) tf.get_variable_scope().reuse_variables() _, logits1_bw = pointer(d_match, state, d, mask=self.c_mask) logits1 = (logits1_fw + logits1_bw) / 2. logits2 = (logits2_fw + logits2_bw) / 2. with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(losses + losses2)
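The pointer block above runs the pointer mechanism in both orders (start-then-end and end-then-start, with a GRU step in between) and averages the two sets of logits before the usual band-limited span decoding. A sketch of that averaging with a toy bilinear scorer standing in for the real pointer and GRU; simple_pointer, W and the state vectors are all illustrative stand-ins:

import numpy as np

def simple_pointer(match, state, W):
    """match: [B, T, D], state: [B, D] -> [B, T] logits via a bilinear score."""
    return np.einsum('btd,de,be->bt', match, W, state)

B, T, D = 2, 7, 8
match = np.random.randn(B, T, D)
init = np.random.randn(B, D)                     # r_Q-style initial state
state_fw = np.random.randn(B, D)                 # stand-ins for the fw/bw GRU states
state_bw = np.random.randn(B, D)
W = np.random.randn(D, D)

logits1 = (simple_pointer(match, init, W) + simple_pointer(match, state_bw, W)) / 2.0
logits2 = (simple_pointer(match, state_fw, W) + simple_pointer(match, init, W)) / 2.0
print(logits1.shape, logits2.shape)    # (2, 7) (2, 7)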