def create_attention(self):
    with tf.name_scope('attention'):
        self.ct = func.dot_attention(self.decoder_h, self.passage_enc,
                                     self.passage_mask,
                                     config.dot_attention_dim,
                                     self.input_keep_prob)
        self.combined_h = tf.concat(
            [self.decoder_h, self.ct], -1,
            name='combined_h')  # [batch, question_len, 450]
        self.wt = tf.get_variable('wt', shape=[
            config.max_question_len,
            self.combined_h.get_shape()[-1],
            config.decoder_hidden_dim
        ])
        self.ws = tf.get_variable(
            'ws', shape=[config.decoder_hidden_dim, self.vocab_size])
        question_len = tf.shape(self.combined_h)[1]
        self.wt_h = tf.einsum('bij,ijk->bik', self.combined_h,
                              self.wt[:question_len, :, :], name='wt_h')
        self.ws_tanh_wt = tf.einsum('bik,kj->bij', tf.tanh(self.wt_h), self.ws)
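# --- Illustrative sketch (not part of the model code above) -----------------
# Shape check for the two einsum projections in create_attention: assuming a
# combined dimension of 450 (see the comment above) and made-up sizes for the
# decoder hidden dimension and vocabulary, 'bij,ijk->bik' applies a separate
# [combined_dim, decoder_hidden_dim] matrix at every time step, and
# 'bik,kj->bij' then maps the hidden dimension onto the vocabulary.
import numpy as np

batch, q_len, combined_dim, hidden_dim, vocab = 2, 5, 450, 300, 1000
combined_h = np.random.rand(batch, q_len, combined_dim)
wt = np.random.rand(q_len, combined_dim, hidden_dim)
ws = np.random.rand(hidden_dim, vocab)

wt_h = np.einsum('bij,ijk->bik', combined_h, wt)          # (2, 5, 300)
ws_tanh_wt = np.einsum('bik,kj->bij', np.tanh(wt_h), ws)  # (2, 5, 1000)
print(wt_h.shape, ws_tanh_wt.shape)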
def ready(self):
    config = self.config
    N, PL, QL, d = config.batch_size, self.c_maxlen, self.q_maxlen, config.hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    # word embedding layer
    with tf.variable_scope("emb"):
        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)
            alter_emb = tf.nn.embedding_lookup(self.word_mat, self.alternatives)

    # encoding layer
    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        # [batch, c, 2*d*3]
        # 2: bidirectional GRU; 3: the outputs of the three layers are
        # concatenated as the final output
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    # with tf.variable_scope("mru_encoder"):
    #     c_m = mru(c, self.c_maxlen, self.c_mask, mru_range, 250)

    with tf.variable_scope("q2c"):
        # q2c.shape=[b,c,c.shape[-1]+q.shape[-1]]=[b,c,12d]
        q2c = dot_attention(c, q, mask=self.q_mask,
                            keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=q2c.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        # v_c.shape=[b,c,2d]
        v_c = rnn(q2c, seq_len=self.c_len)

    with tf.variable_scope("q2o"):
        # alter.shape=[b,3,6d]
        alter = tf.layers.dense(alter_emb, units=6 * d, activation=tf.nn.relu)
        # q2o.shape=[b,3,12d]
        q2o = dot_attention(alter, q, mask=self.q_mask,
                            keep_prob=config.keep_prob, is_train=self.is_train)

    with tf.variable_scope("o2c"):
        # v_o.shape=[b,3,2d]
        v_o = tf.layers.dense(q2o, units=2 * d, activation=tf.nn.relu)
        # o2c.shape=[b,c,4d]
        o2c = dot_attention(v_c, v_o, mask=self.alter_mask,
                            keep_prob=config.keep_prob, is_train=self.is_train)
        r_c = tf.reduce_mean(o2c, axis=1, keepdims=True)

    with tf.variable_scope("predict"):
        # logits.shape=[b,3]
        logits = tf.reshape(bilinear(r_c, v_o),
                            [N, v_o.get_shape().as_list()[1]])
        self.yp = tf.argmax(tf.nn.softmax(logits), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits, labels=tf.stop_gradient(self.y))
        self.loss = tf.reduce_mean(losses)
def ready(self):
    config = self.config
    N, PL, QL, d = config.batch_size, self.c_maxlen, self.q_maxlen, config.hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.name_scope("embedding"):
        with tf.name_scope("title"):
            t_emb = tf.nn.embedding_lookup(self.word_mat, self.t)
        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)
        # c_emb = tf.concat([c_emb, ch_emb], axis=2)
        # q_emb = tf.concat([q_emb, qh_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                 keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)

    with tf.variable_scope("pointer"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    # answer prediction
    with tf.variable_scope("predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        # restrict the answer span length
        # outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    # document selection
    with tf.variable_scope("select"):
        # [batch_size, dim]
        c_cum = attention_pooling(match, init, self.c_mask, hidden=d)
        fuse = tf.concat([c_cum, init], axis=1)
        fuse = dense(fuse, hidden=d, use_bias=False, scope="fully1")
        fuse = dense(fuse, hidden=1, use_bias=False, scope="fully2")
        # [batch_size, 1]
        logits_s = tf.sigmoid(fuse)
        fuse = tf.squeeze(fuse)
        self.s = tf.cast(self.s, tf.float32)
        self.loss_s = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=fuse, labels=self.s)
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.qh), [N * QL, CL, dc])
            ch_emb = dropout(
                ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            qh_emb = dropout(
                qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(
            att, att, mask=self.c_mask, hidden=d,
            keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)

    with tf.variable_scope("pointer"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    with tf.variable_scope("predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)
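# --- Illustrative sketch (not part of the model code above) -----------------
# The "predict" block forms the outer product of the start and end
# distributions and keeps only the upper band j - i <= 15 via
# tf.matrix_band_part(outer, 0, 15), i.e. spans that end no earlier than they
# start and are at most 16 tokens long. A minimal numpy analogue with made-up
# toy distributions:
import numpy as np

def best_span(p_start, p_end, max_span=15):
    outer = np.outer(p_start, p_end)
    # keep 0 <= j - i <= max_span, the numpy analogue of band_part(0, max_span)
    outer = np.triu(outer) - np.triu(outer, k=max_span + 1)
    start = outer.max(axis=1).argmax()
    end = outer.max(axis=0).argmax()
    return start, end

print(best_span(np.array([0.1, 0.6, 0.2, 0.1]),
                np.array([0.1, 0.1, 0.7, 0.1])))  # -> (1, 2)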
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.qh), [N * QL, CL, dc])
            ch_emb = dropout(
                ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            qh_emb = dropout(
                qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(
            att, att, mask=self.c_mask, hidden=d,
            keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)  # [10, ?, 300]

    with tf.variable_scope("pointer"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    with tf.variable_scope("content_modeling"):
        logits4, c_semantics = content_model(init, match, config.hidden)

    with tf.variable_scope("cross_passage_attention"):
        self.query_num = int(config.batch_size / config.passage_num)
        c_semantics = tf.reshape(
            c_semantics, shape=[self.query_num, config.passage_num, -1])
        attnc_key = tf.tile(tf.expand_dims(c_semantics, axis=2),
                            [1, 1, config.passage_num, 1])
        attnc_mem = tf.tile(tf.expand_dims(c_semantics, axis=1),
                            [1, config.passage_num, 1, 1])
        attnc_w = tf.reduce_sum(attnc_key * attnc_mem, axis=-1)
        attnc_mask = tf.ones([config.passage_num, config.passage_num]) - \
            tf.diag([1.0] * config.passage_num)
        attnc_w = tf.nn.softmax(attnc_w * attnc_mask, axis=-1)
        attncp = tf.reduce_sum(
            tf.tile(tf.expand_dims(attnc_w, axis=-1),
                    [1, 1, 1, 2 * config.hidden]) * attnc_mem, axis=2)

    with tf.variable_scope("pseudo_label"):
        self.is_select = tf.reshape(
            tf.squeeze(self.is_select),
            shape=[self.query_num, config.passage_num])
        self.is_select = self.is_select / tf.tile(
            tf.reduce_sum(self.is_select, axis=-1, keepdims=True),
            [1, config.passage_num])
        sim_matrix = attnc_w
        lb_matrix = tf.tile(tf.expand_dims(self.is_select, axis=1),
                            [1, config.passage_num, 1])
        self.pse_is_select = tf.reduce_sum(sim_matrix * lb_matrix, axis=-1) + \
            tf.constant([0.00000001] * config.passage_num,
                        dtype=tf.float32)  # avoid all zero
        self.pse_is_select = self.pse_is_select / tf.tile(
            tf.reduce_sum(self.pse_is_select, axis=-1, keepdims=True),
            [1, config.passage_num])
        alpha = 0.7
        self.fuse_label = alpha * self.is_select + \
            (1 - alpha) * tf.stop_gradient(self.pse_is_select)

    with tf.variable_scope("predict_passage"):
        init = tf.reshape(init, shape=[self.query_num, config.passage_num, -1])
        attn_concat = tf.concat([init, attncp, c_semantics], axis=-1)
        d1 = tf.layers.dense(attn_concat, 2 * config.hidden,
                             activation=tf.nn.leaky_relu,
                             bias_initializer=tf.glorot_uniform_initializer())  # 150
        d2 = tf.layers.dense(d1, config.hidden,
                             activation=tf.nn.leaky_relu,
                             bias_initializer=tf.glorot_uniform_initializer())  # 75
        logits3 = tf.squeeze(tf.layers.dense(
            d2, 1, activation=None,
            bias_initializer=tf.glorot_uniform_initializer()))

    with tf.variable_scope("predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 30)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        # logits3 = tf.reduce_max(tf.reduce_max(outer, axis=2), axis=1)
        self.is_select_p = tf.nn.sigmoid(logits3)
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits1, labels=tf.stop_gradient(self.y1))
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits2, labels=tf.stop_gradient(self.y2))
        weighted_losses = weighted_loss(config, 0.000001, self.y1, losses)  # 0.01
        weighted_losses2 = weighted_loss(config, 0.000001, self.y2, losses2)  # 0.01
        losses3 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits3, labels=tf.stop_gradient(self.fuse_label)))
        in_answer_weight = tf.ones_like(self.in_answer) + 3 * self.in_answer
        losses4 = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits4,
                labels=tf.stop_gradient(self.in_answer)) * in_answer_weight,
            axis=-1)
        weighted_losses4 = weighted_loss(config, 0.000001, self.in_answer, losses4)
        self.loss_dict = {'pos_s loss': losses, 'pos_e loss': losses2,
                          'select loss': losses3, 'in answer': losses4}
        for key, values in self.loss_dict.items():
            self.loss_dict[key] = tf.reduce_mean(values)
        self.loss = tf.reduce_mean(
            weighted_losses + weighted_losses2 + losses3 + weighted_losses4)
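# --- Illustrative sketch (not part of the model code above) -----------------
# Numpy walk-through of the cross_passage_attention / pseudo_label idea:
# passages of the same query attend to each other, and the gold is_select
# distribution is propagated through that similarity matrix before being mixed
# back with the gold labels (alpha = 0.7, as in the code). The example labels
# are made up. Note that, like the code, the diagonal score is multiplied by
# zero rather than masked to -inf before the softmax.
import numpy as np

query_num, passage_num, dim = 2, 3, 4
c_sem = np.random.rand(query_num, passage_num, dim)

attnc_w = np.einsum('qid,qjd->qij', c_sem, c_sem)   # pairwise passage similarity
scores = attnc_w * (1.0 - np.eye(passage_num))      # zero out self-similarity
attnc_w = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)
attncp = np.einsum('qij,qjd->qid', attnc_w, c_sem)  # cross-passage summary

is_select = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 1.0]])
is_select = is_select / is_select.sum(-1, keepdims=True)
pse = np.einsum('qij,qj->qi', attnc_w, is_select) + 1e-8
pse = pse / pse.sum(-1, keepdims=True)
fuse_label = 0.7 * is_select + 0.3 * pse
print(attncp.shape, fuse_label.shape)  # (2, 3, 4) (2, 3)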
def get_vP(self, i, att_vP, q_, answer_info, y1, y2, c_pr_mask, cmax_c, clen_c):
    # max para limit
    config = self.config
    opt = True
    MPL = config.para_limit
    zero = tf.constant(0, dtype=tf.int32)
    j = tf.constant(0, dtype=tf.int32)
    c = self.c_pr[:, i * MPL:(i + 1) * MPL]
    ch = self.ch_pr[:, i * MPL:(i + 1) * MPL, :]
    qh = self.qh
    q = self.q
    c_mask = tf.cast(c, tf.bool)
    q_mask = self.q_mask
    # passage ranking line:
    # self.pr_mask = tf.cast(self.p, tf.bool)
    c_len = tf.reduce_sum(tf.cast(c_mask, tf.int32), axis=1)
    c_len_int = tf.reshape(c_len, [config.batch_size, 1])
    q_len = self.q_len

    if opt:
        N, CL = config.batch_size, config.char_limit
        c_maxlen = tf.reduce_max(c_len)
        c_maxlen_int = tf.reshape(tf.reduce_max(c_len_int), [1])
        q_maxlen = q_len
        c = tf.slice(c, [0, 0], [N, c_maxlen])
        c_mask = tf.slice(c_mask, [0, 0], [N, c_maxlen])
        q_mask = self.q_mask
        ch = tf.slice(ch, [0, 0, 0], [N, c_maxlen, CL])
        qh = self.qh
        temp = self.y2[:, i * MPL:(i + 1) * MPL]
        # self.y1 = tf.Print(self.y1, ["y1:", tf.shape(self.y1)])
        # self.y2 = tf.Print(self.y2, ["y2:", tf.shape(self.y2)])
        y1__ = tf.slice(self.y1, [0, i * MPL], [N, c_maxlen])
        # y1__ = tf.Print(y1__, ["y1__:", tf.shape(y1__)])
        y2__ = tf.slice(self.y2, [0, i * MPL], [N, c_maxlen])

        def b1():
            return c_mask

        def b2():
            return tf.concat([c_pr_mask, c_mask], axis=1)

        c_pr_mask = tf.cond(tf.equal(i, zero), b1, b2)

        def b3():
            return c_maxlen_int, c_len_int

        def b4():
            print(clen_c.get_shape(), c_len_int.get_shape())
            a = tf.concat([cmax_c, c_maxlen_int], axis=0)
            b = tf.concat([clen_c, c_len_int], axis=1)
            return a, b

        cmax_c, clen_c = tf.cond(tf.equal(i, zero), b3, b4)
        # passage ranking
        # print(self.ch_pr.get_shape())
        # print(self.c_pr.get_shape())
        # c_pr_mask = tf.cast(self.c_pr, tf.bool)
        # c_pr_mask = tf.slice(self.c_pr_mask, [0, i*MPL], [N, c_maxlen])
        ###
        ###
        # ch_pr = tf.slice(self.ch_pr, [0, i*MPL, 0], [N, c_maxlen, CL])
    else:
        self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit

    ch_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(ch, tf.bool), tf.int32), axis=2), [-1])
    qh_len = self.qh_len

    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn_gru else native_gru

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            # CL = tf.Print(CL, [CL], message="CL:")
            # PL = tf.Print(PL, [PL], message="PL:")
            # self.ch_pr = tf.Print(self.ch_pr, [self.ch_pr.get_shape()], message="ch_pr:")
            # self.c_pr = tf.reshape(self.c_pr, [N, 12, PL])
            # print(self.ch.get_shape())
            # print(self.ch_pr.get_shape())
            # print(self.c.get_shape())
            # print(self.c_pr.get_shape())
            # self.ch_pr = tf.Print(self.ch_pr, [self.ch_pr[:, 2:, :]], message="ch_pr")
            ch_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, ch), [N * PL, CL, dc])
            # self.char_mat, self.ch), [N * PL, CL, dc])
            print(ch.shape, PL)
            print(qh.shape, QL)
            qh_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, qh), [N * QL, CL, dc])
            ch_emb = dropout(ch_emb, keep_prob=config.keep_prob,
                             is_train=self.is_train)
            # ch_emb = tf.Print(ch_emb, [ch_emb], message="ch_emb")
            # qh_emb = tf.Print(qh_emb, [qh_emb], message="qh_emb")
            qh_emb = dropout(qh_emb, keep_prob=config.keep_prob,
                             is_train=self.is_train)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                self.cell_fw, self.cell_bw, ch_emb, ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                self.cell_fw, self.cell_bw, qh_emb, qh_len, dtype=tf.float32)
            # state_fw = tf.Print(state_fw, [state_fw], message="state_fw")
            # state_bw = tf.Print(state_bw, [state_bw], message="state_bw")
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
            # ch_emb = tf.Print(ch_emb, [ch_emb], message="ch_emb")

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, q)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

    with tf.variable_scope("encoding", reuse=tf.AUTO_REUSE):
        """
        def f1():
            self.rnn1 = gru(num_layers=3, num_units=d, batch_size=N,
                            input_size=c_emb.get_shape().as_list()[-1],
                            keep_prob=config.keep_prob, is_train=self.is_train)
            return self.rnn1(c_emb, seq_len=self.c_len)

        def f2():
            return self.rnn1(c_emb, seq_len=self.c_len)

        c = tf.cond(tf.equal(i, zero), f1, f2)
        # q = tf.cond(tf.equal(i, zero), f1, f2)
        # c = rnn(c_emb, seq_len=self.c_len)
        q = self.rnn1(q_emb, seq_len=self.q_len)
        self.q_enc = q
        # self.rnn1 = rnn
        """
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=c_len)
        q = rnn(q_emb, seq_len=q_len)
        # c_len = tf.Print(c_len, [c_len, tf.shape(c)], message="C:")
        # self.q_enc = q
        q__ = q

    with tf.variable_scope("attention", reuse=tf.AUTO_REUSE):
        qc_att = dot_attention(c, q, mask=q_mask, hidden=d,
                               keep_prob=config.keep_prob,
                               is_train=self.is_train,
                               name_scope="attention_layer")
        """
        print("qc_att:", qc_att.shape)

        def f3():
            self.rnn2 = gru(num_layers=1, num_units=d, batch_size=N,
                            input_size=qc_att.get_shape().as_list()[-1],
                            keep_prob=config.keep_prob, is_train=self.is_train)
            return self.rnn2(qc_att, seq_len=self.c_len)

        def f4():
            return self.rnn2(qc_att, seq_len=self.c_len)

        att = tf.cond(tf.equal(self.i, zero), f3, f4)
        """
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=c_len)
        ###
        # att = tf.Print(att, [tf.greater(tf.cast(tf.shape(att)[1], tf.int64), y1_),
        #                      tf.shape(att)], message="att:")

        def f5():
            return att

        def f6():
            return tf.concat([att_vP, att], axis=1)

        # att = rnn(qc_att, seq_len=self.c_len)
        # self.rnn2 = rnn
        # att is the v_P
        att_vP = tf.cond(tf.equal(i, zero), f5, f6)

        def f7():
            return y1__, y2__

        def f8():
            return tf.concat([y1, y1__], axis=1), tf.concat([y2, y2__], axis=1)

        y1, y2 = tf.cond(tf.equal(i, zero), f7, f8)

    return tf.add(i, tf.constant(1)), att_vP, q__, answer_info, y1, y2, \
        c_pr_mask, cmax_c, clen_c
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru
    gi = []
    att_vP = []

    for i in range(config.max_para):
        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                ch_emb = tf.reshape(tf.nn.embedding_lookup(
                    self.char_mat, self.pr_ch), [N * PL, CL, dc])
                # self.char_mat, self.ch), [N * PL, CL, dc])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.qh),
                    [N * QL, CL, dc])
                ch_emb = dropout(ch_emb, keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                qh_emb = dropout(qh_emb, keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

        with tf.variable_scope("encoding"):
            rnn = gru(num_layers=3, num_units=d, batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)
            # att is the v_P
            att_vP.append(att)

        """
        with tf.variable_scope("match"):
            self_att = dot_attention(
                att, att, mask=self.c_mask, hidden=d,
                keep_prob=config.keep_prob, is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N,
                      input_size=self_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            match = rnn(self_att, seq_len=self.c_len)
        """

    with tf.variable_scope("pointer"):
        # r_Q:
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, att, d, self.c_mask)

    with tf.variable_scope("predict"):
        outer = tf.matmul(
            tf.expand_dims(tf.nn.softmax(logits1), axis=2),
            tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

        # print losses
        # condition = tf.greater(self.loss, 11)
        # self.yp1 = tf.where(condition, tf.Print(self.yp1, [self.yp1], message="Yp1:"), self.yp1)
        # self.yp2 = tf.where(condition, tf.Print(self.yp2, [self.yp2], message="Yp2:"), self.yp1)

    for i in range(config.max_para):
        # Passage ranking
        with tf.variable_scope("passage-ranking-attention"):
            vj_P = dropout(att, keep_prob=config.keep_prob,
                           is_train=self.is_train)
            r_Q = dropout(init, keep_prob=config.keep_prob,
                          is_train=self.is_train)
            r_P = attention(r_Q, vj_P, mask=self.c_mask, hidden=d,
                            keep_prob=config.keep_prob, is_train=self.is_train)
            # rnn = gru(num_layers=1, num_units=d, batch_size=N,
            #           input_size=pr_att.get_shape().as_list()[-1],
            #           keep_prob=config.keep_prob, is_train=self.is_train)
            # att_rp = rnn(qc_att, seq_len=self.c_len)
            # Wg
            concatenate = tf.concat([init, r_P], axis=2)
            g = tf.nn.tanh(
                dense(concatenate, hidden=d, use_bias=False, scope="g"))
            g_ = dense(g, 1, use_bias=False, scope="g_")
            gi.append(g_)

    gi_ = tf.convert_to_tensor(gi)
    gi = tf.nn.softmax(gi_)
    self.pr_loss = tf.nn.softmax_cross_entropy_with_logits(
        logits=gi, labels=self.pr)
def encoder(source, params):
    mask = dtype.tf_to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size
    initializer = tf.random_normal_initializer(0.0, hidden_size ** -0.5)

    source, mask = util.remove_invalid_seq(source, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size],
                              initializer=initializer)
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source) * (hidden_size ** 0.5)
    inputs = tf.nn.bias_add(inputs, src_bias)

    inputs = func.add_timing_signal(inputs)
    inputs = util.valid_apply_dropout(inputs, params.dropout)

    with tf.variable_scope("encoder"):
        x = inputs
        for layer in range(params.num_encoder_layer):
            if params.deep_transformer_init:
                layer_initializer = tf.variance_scaling_initializer(
                    params.initializer_gain * (layer + 1) ** -0.5,
                    mode="fan_avg",
                    distribution="uniform")
            else:
                layer_initializer = None
            with tf.variable_scope("layer_{}".format(layer),
                                   initializer=layer_initializer):
                with tf.variable_scope("self_attention"):
                    y = func.dot_attention(
                        x,
                        None,
                        func.attention_bias(mask, "masking"),
                        hidden_size,
                        num_heads=params.num_heads,
                        dropout=params.attention_dropout)

                    y = y['output']
                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("feed_forward"):
                    y = func.ffn_layer(
                        x,
                        params.filter_size,
                        hidden_size,
                        dropout=params.relu_dropout,
                    )

                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

    source_encodes = x
    x_shp = util.shape_list(x)

    return {
        "encodes": source_encodes,
        "decoder_initializer": {
            "layer_{}".format(l): {
                # plan aan
                "aan": dtype.tf_to_float(tf.zeros([x_shp[0], 1, hidden_size])),
            }
            for l in range(params.num_decoder_layer)
        },
        "mask": mask
    }
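# --- Illustrative sketch (not part of the model code above) -----------------
# The self-attention call above relies on func.attention_bias(mask, "masking").
# Assuming that helper follows the usual Transformer convention of turning a
# 0/1 mask into a large negative additive bias, padded positions effectively
# receive zero attention weight:
import numpy as np

def attention_bias_masking(mask, neg=-1e9):
    return (1.0 - mask) * neg   # 0 where valid, -1e9 where padded

mask = np.array([[1.0, 1.0, 0.0]])   # last position is padding
logits = np.array([[0.3, 0.1, 0.9]])
biased = logits + attention_bias_masking(mask)
weights = np.exp(biased) / np.exp(biased).sum(-1, keepdims=True)
print(weights.round(3))              # ~[0.55, 0.45, 0.0]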
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = native_rnn

    c_elmo_features = self.elmo(self.c_elmo)
    q_elmo_features = self.elmo(self.q_elmo)

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])
            qh_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.qh),
                [N * QL, CL, dc])
            ch_emb = dropout(ch_emb, keep_prob=config.keep_prob,
                             is_train=self.is_train)
            qh_emb = dropout(qh_emb, keep_prob=config.keep_prob,
                             is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

        c_elmo_emb = weight_layers('embedding', c_elmo_features,
                                   l2_coef=0.0,
                                   do_layer_norm=False)['weighted_op']
        tf.get_variable_scope().reuse_variables()
        q_elmo_emb = weight_layers('embedding', q_elmo_features,
                                   l2_coef=0.0,
                                   do_layer_norm=False)['weighted_op']
        c_elmo_emb = dropout(c_elmo_emb, keep_prob=config.elmo_keep_prob,
                             is_train=self.is_train)
        q_elmo_emb = dropout(q_elmo_emb, keep_prob=config.elmo_keep_prob,
                             is_train=self.is_train)

        c_emb = tf.concat([c_emb, ch_emb, c_elmo_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb, q_elmo_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = gru(config.cell, num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(config.cell, num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                 keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(config.cell, num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)

        c_elmo_enc = weight_layers('encoding', c_elmo_features,
                                   l2_coef=0.0,
                                   do_layer_norm=False)['weighted_op']
        tf.get_variable_scope().reuse_variables()
        q_elmo_enc = weight_layers('encoding', q_elmo_features,
                                   l2_coef=0.0,
                                   do_layer_norm=False)['weighted_op']
        c_elmo_enc = dropout(c_elmo_enc, keep_prob=config.elmo_keep_prob,
                             is_train=self.is_train)
        q_elmo_enc = dropout(q_elmo_enc, keep_prob=config.elmo_keep_prob,
                             is_train=self.is_train)
        match = tf.concat([match, c_elmo_enc], -1)
        q = tf.concat([q, q_elmo_enc], -1)

    with tf.variable_scope("pointer"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    with tf.variable_scope("predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits1, labels=tf.stop_gradient(self.y1))
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits2, labels=tf.stop_gradient(self.y2))
        self.loss = tf.reduce_mean(losses + losses2)
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])
            qh_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.qh),
                [N * QL, CL, dc])
            ch_emb = dropout(ch_emb, keep_prob=config.keep_prob,
                             is_train=self.is_train)
            qh_emb = dropout(qh_emb, keep_prob=config.keep_prob,
                             is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob,
                               is_train=self.is_train,
                               name_scope="attention_layer")
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)
        tf.summary.histogram('vt_P', att)
        self.att_logits = tf.get_collection('Softmax_logits')[0]
        self.att_outputs = tf.get_collection('MatMul_outputs')[0]

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train,
                                 name_scope="match_layer")
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)
        tf.summary.histogram('self_match', match)
        self.match_logits = tf.get_collection('Softmax_logits')[1]
        self.match_outputs = tf.get_collection('MatMul_outputs')[1]

    with tf.variable_scope("pointer"):
        # r_Q:
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)
        tf.summary.histogram('rQ_init', init)
        tf.summary.histogram('pointer_logits_1', logits1)
        tf.summary.histogram('pointer_logits_2', logits2)

    with tf.variable_scope("predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

        ####
        self.predict_outer_start = tf.reduce_max(outer, axis=2)
        self.predict_outer_end = tf.reduce_max(outer, axis=1)
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = \
        config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, \
        config.hidden, config.char_dim, config.char_hidden
    gru = CudnnGRU if config.use_cudnn else NativeGRU

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])
            qh_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.qh),
                [N * QL, CL, dc])
            ch_emb = dropout(ch_emb, keep_prob=config.keep_prob,
                             is_train=self.is_train)
            qh_emb = dropout(qh_emb, keep_prob=config.keep_prob,
                             is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=self.c_len)  # representation of paragraph
        q = rnn(q_emb, seq_len=self.q_len)  # representation of question

    # gated att rnn (using dot att from Attention is All You Need actually)
    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):  # self-matching rnn
        self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                 keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)

    with tf.variable_scope("pointer"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = PointerNet(batch=N, hidden=init.get_shape().as_list()[-1],
                             keep_prob=config.ptr_keep_prob,
                             is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    with tf.variable_scope("predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.qh), [N * QL, CL, dc])
            ch_emb = dropout(
                ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            qh_emb = dropout(
                qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        c_emb = tf.stop_gradient(c_emb)
        q_emb = tf.stop_gradient(q_emb)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        self.c_rnn = c = rnn(c_emb, seq_len=self.c_len)
        self.q_rnn = q = rnn(q_emb, seq_len=self.q_len)
        c = tf.stop_gradient(c)
        q = tf.stop_gradient(q)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        self.att = [rnn(qc_att, seq_len=self.c_len)[:, -1, :]]
        # self.att = [tf.concat([self.c_rnn[:, -1, :], self.q_rnn[:, -1, :]], 1)]
        # self.att += [tf.stop_gradient(self.att[-1])]

    with tf.variable_scope("binary"):
        for _ in range(3):
            self.att += [tf.nn.dropout(
                tf.keras.layers.Dense(300, activation='relu')(self.att[-1]),
                keep_prob=config.keep_prob)]
        self.prediction = tf.keras.layers.Dense(2)(self.att[-1])
        # self.loss = tf.reduce_mean(tf.squared_difference(
        #     self.prediction, tf.cast(self.y_target, tf.float32)))
        self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.prediction, labels=tf.stop_gradient(self.y_target))
def ready(self):
    config = self.config
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])
            qh_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.qh),
                [N * QL, CL, dc])
            ch_emb = dropout(ch_emb, keep_prob=config.keep_prob,
                             is_train=self.is_train)
            qh_emb = dropout(qh_emb, keep_prob=config.keep_prob,
                             is_train=self.is_train)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

    with tf.variable_scope("encoding"):
        # encode the embedded inputs with a 3-layer RNN
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        # with the size (batch_size, max_len, hidden_dim)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("relation_analysis"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        g_theta_layers = [256, 128, 1]  # attention component
        md = Relation_Module(config, self.c_maxlen, self.q_maxlen,
                             g_theta_layers)
        # r: add attention weight with q_summary
        r, alpha = md.hop_2(c, init, phase=self.is_train,
                            activation=tf.nn.relu)
        c = r[-1]

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                 keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)

    # r_Q is obtained from the question encoding q
    with tf.variable_scope("pointer"):
        # init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
        #             keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    with tf.variable_scope("predict"):
        self.start_logits = tf.nn.softmax(logits1)
        self.stop_logits = tf.nn.softmax(logits2)
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)
def ready(self):
    config = self.config
    N, PL, QL, CL, BL, d, dc, dg, dbpe, dbpeh = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.bpe_limit, config.hidden, \
        config.glove_dim if config.pretrained_char else config.char_dim, config.char_hidden, \
        config.bpe_glove_dim if config.pretrained_bpe_emb else config.bpe_dim, config.bpe_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        if config.use_char:
            with tf.variable_scope("char"):
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.ch),
                    [N * PL, CL, dc])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.qh),
                    [N * QL, CL, dc])
                ch_emb = dropout(ch_emb, keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                qh_emb = dropout(qh_emb, keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        if config.use_bpe:
            with tf.variable_scope("bpe"):
                cb_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.bpe_mat, self.cb),
                    [N * PL, BL, dbpe])
                qb_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.bpe_mat, self.qb),
                    [N * QL, BL, dbpe])
                cb_emb = dropout(cb_emb, keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                qb_emb = dropout(qb_emb, keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                cell_fw = tf.contrib.rnn.GRUCell(dbpeh)
                cell_bw = tf.contrib.rnn.GRUCell(dbpeh)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, cb_emb, self.cb_len, dtype=tf.float32)
                cb_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qb_emb, self.qb_len, dtype=tf.float32)
                qb_emb = tf.concat([state_fw, state_bw], axis=1)
                qb_emb = tf.reshape(qb_emb, [N, QL, 2 * dbpeh])
                cb_emb = tf.reshape(cb_emb, [N, PL, 2 * dbpeh])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

        if config.use_char:
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

        if config.use_bpe:
            c_emb = tf.concat([c_emb, cb_emb], axis=2)
            q_emb = tf.concat([q_emb, qb_emb], axis=2)

        if config.use_pos:
            cp_emb = tf.nn.embedding_lookup(self.pos_mat, self.cp)
            qp_emb = tf.nn.embedding_lookup(self.pos_mat, self.qp)
            c_emb = tf.concat([c_emb, cp_emb], axis=2)
            q_emb = tf.concat([q_emb, qp_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                 keep_prob=config.keep_prob, is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)

    with tf.variable_scope("pointer"):
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                          keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    with tf.variable_scope("predict"):
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)
def ptrspan(self):
    config = self.config
    N, QL, CL, d, dc, dg = config.batch_size, self.q_maxlen, config.char_limit, \
        config.hidden, config.char_dim, config.char_hidden
    gru = cudnn_gru if config.use_cudnn else native_gru
    SN = self.k
    W = config.glove_dim
    d = config.hidden

    print('embedding part')
    with tf.name_scope("word"):
        para_emb = tf.nn.embedding_lookup(self.word_mat, self.para_slice)
        c_emb = self.sentence_slice
        q_emb = self.q_slice

    with tf.name_scope("para_encode"):
        para_emb_linear = tf.layers.dense(
            para_emb, d, use_bias=False,
            kernel_initializer=tf.ones_initializer(),
            trainable=self.is_train, name='para_emb_line')
        q_emb_linear = tf.layers.dense(
            q_emb, d, use_bias=False,
            kernel_initializer=tf.ones_initializer(),
            trainable=self.is_train, name='q_emb_line')
        align_pq = tf.matmul(para_emb_linear,
                             tf.transpose(q_emb_linear, [0, 2, 1]))
        pq_mask = tf.tile(tf.expand_dims(self.q_mask, axis=1),
                          [1, self.para_maxlen, 1])
        align_pq = tf.nn.softmax(softmax_mask(align_pq, pq_mask))
        align_para_emb = tf.matmul(align_pq, q_emb_linear)
        para_emb_concat = tf.concat([
            para_emb, align_para_emb, self.para_e_slice, self.para_t_slice
        ], axis=2)
        self.para_emb = para_emb_concat

    print('encode-part')
    # c_emb = self.sentence_slice
    c_emb_sen = tf.unstack(c_emb, axis=1)
    sentence_len = tf.unstack(self.sentence_len, axis=1)
    c_s = []
    with tf.variable_scope("sentence_encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb_sen[0].get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        print('passage-encoder')
        for i in range(SN):
            c_s_emb = rnn(c_emb_sen[i], seq_len=sentence_len[i],
                          concat_layers=False)
            c_s.append(c_s_emb)
        para_gru = rnn(para_emb_concat, seq_len=self.para_len,
                       concat_layers=False)

    with tf.variable_scope("q_encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=q_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        q = rnn(q_emb, seq_len=self.q_len, concat_layers=False)

    # c_s_h = []
    # with tf.variable_scope("highway_encoding", reuse=tf.AUTO_REUSE):
    #     highway = Highway(hidden_size=2*d, is_train=self.is_train)
    #     for i in range(SN):
    #         c_s_highway = highway(c_s[i])
    #         c_s_h.append(c_s_highway)
    #     para_gru = highway(para_gru)
    #     q = highway(q)
    # c_s = c_s_h

    print('qc_att')
    self.c_s = c_s
    self.para_gru = para_gru
    qc_att = []
    sen_mask = tf.unstack(self.sentence_mask, axis=1)
    with tf.variable_scope("sentence_attention", reuse=tf.AUTO_REUSE):
        for i in range(SN):
            qc_att_sample = dot_attention(c_s[i], q, mask=self.q_mask, hidden=d,
                                          keep_prob=config.keep_prob,
                                          is_train=self.is_train)
            qc_att.append(qc_att_sample)
        para_att = dot_attention(para_gru, q, mask=self.q_mask, hidden=d,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)

    att_s = []
    with tf.variable_scope("sentence_qcatt_rnn"):
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att[0].get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        for i in range(SN):
            att_s_single = rnn(qc_att[i], seq_len=sentence_len[i])
            att_s.append(att_s_single)
        para_s = rnn(para_att, seq_len=self.para_len)
    self.sentence_att = qc_att
    self.para_att = para_att

    self_att = []
    with tf.variable_scope("sentence_cpattention", reuse=tf.AUTO_REUSE):
        for i in range(SN):
            self_att_single = dot_attention(att_s[i], para_s,
                                            mask=self.para_mask, hidden=d,
                                            keep_prob=config.keep_prob,
                                            is_train=self.is_train)
            self_att.append(self_att_single)

    with tf.variable_scope("para_selfattn"):
        # self.para_enc_slice, mask=self.para_enc_mask_slice,
        para_self_att = dot_attention(para_s, para_s, mask=self.para_mask,
                                      hidden=d, keep_prob=config.keep_prob,
                                      is_train=self.is_train)
    self.sentence_selfatt = self_att
    self.para_selfatt = para_self_att

    match = []
    with tf.variable_scope("sentence_cp_rnn"):
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=self_att[0].get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        for i in range(SN):
            match_single = rnn(self_att[i], seq_len=sentence_len[i])
            match.append(match_single)
        para_match = rnn(para_self_att, seq_len=self.para_len)
    self.match = match

    dense_prob = []
    dense_con = []
    with tf.variable_scope("dense_prob", reuse=tf.AUTO_REUSE):
        for i in range(SN):
            sentence_con = tf.concat([c_s[i], att_s[i], match[i]], axis=2)
            prob = dense_summ(sentence_con, d, mask=sen_mask[i],
                              keep_prob=config.keep_prob, is_train=self.is_train)
            dense_prob.append(prob)
            dense_con.append(sentence_con)

    # with tf.variable_scope("para_prob"):
    para_con = tf.concat([para_gru, para_s, para_match], axis=2)
    para_prob = dense_summ(para_con, d, mask=self.para_mask,
                           keep_prob=config.keep_prob, is_train=self.is_train)
    dense_prob.append(para_prob)
    dense_prob = tf.concat(dense_prob, axis=1)
    self.topk = tf.nn.softmax(dense_prob)

    batch_nums = tf.range(0, limit=N)
    batch_nums = tf.expand_dims(batch_nums, 1)
    batch_nums = tf.tile(batch_nums, [1, self.sentence_maxlen])
    lo_shape = tf.constant([N, config.para_limit])
    sentence_index_slice = tf.unstack(self.sentence_index_slice, axis=1)
    # how to ensure the probability
    # sentence1, sentence2, sentence3, q, para = ?*4
    lo1 = []
    lo2 = []
    with tf.variable_scope("sentence_pointer", reuse=tf.AUTO_REUSE):
        self.init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                         keep_prob=config.keep_prob, is_train=self.is_train)
        pointer = ptr_net_span(batch=N,
                               hidden=self.init.get_shape().as_list()[-1],
                               keep_prob=config.keep_prob,
                               is_train=self.is_train)
        indice_test = []
        lo1_test = []
        lo2_test = []
        present = []
        present_inp = []
        for i in range(SN):
            logits1, logits2, inp1, inp2 = pointer(self.init, dense_con[i],
                                                   d, sen_mask[i])
            logits1 = logits1 * tf.cast(sen_mask[i], tf.float32)
            logits2 = logits2 * tf.cast(sen_mask[i], tf.float32)
            indice = tf.stack([batch_nums, sentence_index_slice[i]], axis=2)
            inp = tf.stack([inp1, inp2], axis=1)
            present.append(inp)
            present_inp.append(inp2)
            lo1_test.append(logits1)
            lo2_test.append(logits2)
            indice_test.append(indice)
        self.lo1 = lo1_test[0]
        self.lo2 = lo1_test[1]
        self.lo3 = lo1_test[2]
        lo1 = [
            tf.slice(tf.scatter_nd(in1, in2, lo_shape),
                     [0, 0], [N, self.para_maxlen])
            for (in1, in2) in zip(indice_test, lo1_test)
        ]
        lo2 = [
            tf.slice(tf.scatter_nd(in1, in2, lo_shape),
                     [0, 0], [N, self.para_maxlen])
            for (in1, in2) in zip(indice_test, lo2_test)
        ]

    with tf.variable_scope("para_pointer"):
        para_pointer = ptr_net_span(
            batch=N, hidden=self.init.get_shape().as_list()[-1],
            keep_prob=config.keep_prob, is_train=self.is_train)
        para_lo1, para_lo2, inp1, inp2 = para_pointer(
            self.init, para_match, d, self.para_mask)
        present_para = tf.stack([inp1, inp2], axis=1)
        para_lo1 = softmax_mask(para_lo1, self.para_mask)
        para_lo2 = softmax_mask(para_lo2, self.para_mask)
        present.append(tf.tile(present_para, [1, 1, 3]))
        present_inp.append(inp2)
        lo1.append(para_lo1)
        lo2.append(para_lo2)
        self.lo4 = para_lo2
    self.present = tf.stack(present, axis=2)

    out_lo1 = tf.stack(lo1, axis=1)
    out_lo2 = tf.stack(lo2, axis=1)
    out_lo1 = (tf.expand_dims(self.topk, axis=2)) * out_lo1
    out_logits1 = tf.reduce_sum(out_lo1, axis=1)
    # out_logits1 = tf.slice(out_logits1, [0, 0], [N, self.para_maxlen])
    # out_logits1 = softmax_mask(out_logits1, self.para_mask)
    out_lo2 = (tf.expand_dims(self.topk, axis=2)) * out_lo2
    out_logits2 = tf.reduce_sum(out_lo2, axis=1)
    # out_logits2 = tf.slice(out_logits2, [0, 0], [N, self.para_maxlen])
    # out_logits2 = softmax_mask(out_logits2, self.para_mask)
    self.out_lo1 = out_lo1
    self.out_lo2 = out_logits1
    # out_logits1 = tf.nn.softmax(out_logits1)
    # out_logits2 = tf.nn.softmax(out_logits2)
    outer = tf.matmul(
        tf.expand_dims(tf.nn.softmax(out_logits1), axis=2),
        tf.expand_dims(tf.nn.softmax(out_logits2), axis=1))
    outer = tf.matrix_band_part(outer, 0, 15)

    with tf.variable_scope("predict"):
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=out_logits1, labels=tf.stop_gradient(self.y1_slice))
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=out_logits2, labels=tf.stop_gradient(self.y2_slice))
        prob_y1 = tf.expand_dims(
            tf.reduce_max(tf.reduce_max(outer, axis=2), axis=1), axis=1)
        prob_y2 = tf.expand_dims(
            tf.reduce_max(tf.reduce_max(outer, axis=1), axis=1), axis=1)
        prob = tf.concat([prob_y1, prob_y2], axis=1)
        lossRL = -tf.log(prob) * self.reward_Diff
        self.out1 = losses
        self.out2 = losses2
        loss = tf.concat([
            tf.expand_dims(losses, axis=1),
            tf.expand_dims(losses2, axis=1)
        ], axis=1)
        final_reward = loss * self.reward_Diff
        self.loss3 = tf.reduce_mean((losses + losses2))
        lam = config.lam
        self.loss_span = tf.reduce_mean(final_reward)
def decoder(target, state, params): mask = dtype.tf_to_float(tf.cast(target, tf.bool)) hidden_size = params.hidden_size initializer = tf.random_normal_initializer(0.0, hidden_size**-0.5) is_training = ('decoder' not in state) if is_training: target, mask = util.remove_invalid_seq(target, mask) embed_name = "embedding" if params.shared_source_target_embedding \ else "tgt_embedding" tgt_emb = tf.get_variable(embed_name, [params.tgt_vocab.size(), params.embed_size], initializer=initializer) tgt_bias = tf.get_variable("bias", [params.embed_size]) inputs = tf.gather(tgt_emb, target) * (hidden_size**0.5) inputs = tf.nn.bias_add(inputs, tgt_bias) # shift if is_training: inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]]) inputs = inputs[:, :-1, :] inputs = func.add_timing_signal(inputs) else: inputs = tf.cond( tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())), lambda: tf.zeros_like(inputs), lambda: inputs) mask = tf.ones_like(mask) inputs = func.add_timing_signal(inputs, time=dtype.tf_to_float(state['time'])) inputs = util.valid_apply_dropout(inputs, params.dropout) with tf.variable_scope("decoder"): x = inputs for layer in range(params.num_decoder_layer): if params.deep_transformer_init: layer_initializer = tf.variance_scaling_initializer( params.initializer_gain * (layer + 1)**-0.5, mode="fan_avg", distribution="uniform") else: layer_initializer = None with tf.variable_scope("layer_{}".format(layer), initializer=layer_initializer): with tf.variable_scope("average_attention"): x_fwds = [] for strategy in params.strategies: with tf.variable_scope(strategy): x_fwd = average_attention_strategy( strategy, x, mask, state, layer, params) x_fwds.append(x_fwd) x_fwd = tf.add_n(x_fwds) / len(x_fwds) # FFN activation if params.use_ffn: y = func.ffn_layer( x_fwd, params.filter_size, hidden_size, dropout=params.relu_dropout, ) else: y = x_fwd # Gating layer z = func.linear(tf.concat([x, y], axis=-1), hidden_size * 2, scope="z_project") i, f = tf.split(z, 2, axis=-1) y = tf.sigmoid(i) * x + tf.sigmoid(f) * y x = func.residual_fn(x, y, dropout=params.residual_dropout) x = func.layer_norm(x) with tf.variable_scope("cross_attention"): y = func.dot_attention( x, state['encodes'], func.attention_bias(state['mask'], "masking"), hidden_size, num_heads=params.num_heads, dropout=params.attention_dropout, cache=None if is_training else state['decoder']['state']['layer_{}'.format(layer)]) if not is_training: # mk, mv state['decoder']['state']['layer_{}'.format(layer)]\ .update(y['cache']) y = y['output'] x = func.residual_fn(x, y, dropout=params.residual_dropout) x = func.layer_norm(x) with tf.variable_scope("feed_forward"): y = func.ffn_layer( x, params.filter_size, hidden_size, dropout=params.relu_dropout, ) x = func.residual_fn(x, y, dropout=params.residual_dropout) x = func.layer_norm(x) feature = x if 'dev_decode' in state: feature = x[:, -1, :] embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \ else "softmax_embedding" embed_name = "embedding" if params.shared_source_target_embedding \ else embed_name softmax_emb = tf.get_variable(embed_name, [params.tgt_vocab.size(), params.embed_size], initializer=initializer) feature = tf.reshape(feature, [-1, params.embed_size]) logits = tf.matmul(feature, softmax_emb, False, True) logits = tf.cast(logits, tf.float32) soft_label, normalizer = util.label_smooth(target, util.shape_list(logits)[-1], factor=params.label_smooth) centropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=soft_label) centropy -= normalizer centropy = 
tf.reshape(centropy, tf.shape(target)) mask = tf.cast(mask, tf.float32) per_sample_loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum( mask, -1) loss = tf.reduce_mean(per_sample_loss) # this mask trick is mainly used to handle zero-sized batches, e.g. shape [0, 1] loss = tf.cond(tf.equal(tf.shape(target)[0], 0), lambda: tf.constant(0, dtype=tf.float32), lambda: loss) return loss, logits, state, per_sample_loss
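# A hedged NumPy sketch of the label-smoothing step used above. The real
# util.label_smooth may differ in detail; this shows one common formulation:
# the one-hot target is mixed with a uniform distribution over the remaining
# vocabulary, and the returned normalizer is the cross-entropy of the smoothed
# label with itself, so `centropy -= normalizer` makes a perfect prediction
# cost roughly zero.
import numpy as np

def label_smooth(target_ids, vocab_size, factor=0.1):
    on = 1.0 - factor
    off = factor / (vocab_size - 1)
    soft = np.full((len(target_ids), vocab_size), off)
    soft[np.arange(len(target_ids)), target_ids] = on
    # constant per-token entropy term of the smoothed distribution
    normalizer = -(on * np.log(on) + (vocab_size - 1) * off * np.log(off + 1e-20))
    return soft, normalizer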
def ready(self): config = self.config N, QL, CL, d, dc, dg = config.batch_size, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru SN, SL = self.c_s_maxnum, self.c_s_maxlen W = config.glove_dim print('embedding part') with tf.variable_scope("emb"): # with tf.variable_scope("char"): # ch_emb = tf.reshape(tf.nn.embedding_lookup( # self.char_mat, self.csh_slice), [N, SN * SL, CL, dc], name='char_reshape') # qh_emb = tf.reshape(tf.nn.embedding_lookup( # self.char_mat, self.qh_slice), [N, QL, CL, dc]) # ch_emb = dropout( # ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) # qh_emb = dropout( # qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) # ch_emb_char = tf.unstack(ch_emb, axis=0) # qh_emb_char = tf.unstack(qh_emb, axis=0) ''' filter_size = [3, 4, 5] att_char = [] merge_char = [] q_merge_char = [] for filter in filter_size: with tf.variable_scope("char-cnnencoder-%s" % filter): step_merge_char = [] step_att_char = [] q_step_merge_char = [] q_step_att_char = [] for i in range(2): if i==0: input_char=ch_emb else: input_char=qh_emb conv_branch_char = tf.layers.conv2d( inputs=input_char, # use as many filters as the hidden size filters=50, kernel_size=filter, use_bias=True, activation=tf.nn.relu, trainable=True, padding='SAME', name = 'conv_char_' + str(filter), reuse = tf.AUTO_REUSE, data_format='channels_last' ) if i ==0: step_att_char.append(conv_branch_char) # pool over the words to obtain: [first_dim x 1* hidden_size] pool_branch_char = tf.reduce_max(conv_branch_char, axis=2) merge_char.append(pool_branch_char) else: q_step_att_char.append(conv_branch_char) # pool over the words to obtain: [first_dim x 1* hidden_size] q_pool_branch_char = tf.reduce_max(conv_branch_char, axis=2) q_merge_char.append(q_pool_branch_char) # batch_merge = tf.stack(step_merge_char, axis=0) # merge_char.append(batch_merge) # batch_merge_q = tf.stack(q_step_merge_char, axis=0) # q_merge_char.append(batch_merge_q) ch_con = tf.concat(merge_char, axis=-1) ch_con = tf.reshape(ch_con,[N,SN,SL,150]) qh_con = tf.concat(q_merge_char,axis=-1) ''' # if(use_char): # with tf.variable_scope("char"): # ch_emb = tf.reshape(tf.nn.embedding_lookup( # self.char_mat, self.csh), [N * SN * SL, CL, dc], name='char_reshape') # qh_emb = tf.reshape(tf.nn.embedding_lookup( # self.char_mat, self.qh), [N * QL, CL, dc]) # ch_emb = dropout( # ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) # qh_emb = dropout( # qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) # cell_fw = tf.contrib.rnn.GRUCell(dg) # cell_bw = tf.contrib.rnn.GRUCell(dg) # _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( # cell_fw, cell_bw, ch_emb, self.csh_len, dtype=tf.float32) # ch_emb = tf.concat([state_fw, state_bw], axis=1) # _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( # cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) # qh_emb = tf.concat([state_fw, state_bw], axis=1) # qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) # ch_emb = tf.reshape(ch_emb, [N, SN, SL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.cs_slice) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q_slice) with tf.name_scope("softemb"): c_emb_linear = tf.nn.relu( dense(c_emb, d, use_bias=True, scope="c_emb_linear")) q_emb_linear = tf.nn.relu( dense(q_emb, d, use_bias=True, scope="q_emb_linear")) c_emb_linear = tf.reshape( c_emb_linear, [N, self.c_s_maxnum * self.c_s_maxlen, d]) align_cq = 
tf.matmul(c_emb_linear, tf.transpose(q_emb_linear, [0, 2, 1])) cq_mask = tf.tile(tf.expand_dims(self.q_mask, axis=1), [1, self.c_s_maxnum * self.c_s_maxlen, 1]) self.align_cq = tf.nn.softmax(softmax_mask(align_cq, cq_mask)) align_c_emb = tf.matmul(self.align_cq, q_emb_linear) align_c_emb = tf.reshape( align_c_emb, [N, self.c_s_maxnum, self.c_s_maxlen, d]) c_emb = tf.concat( [c_emb, align_c_emb, self.ce_slice, self.ct_slice], axis=3) c_emb = tf.reshape( c_emb, [N, self.c_s_maxnum, self.c_s_maxlen, W + d + 3 + 19], name='c_emb_reshape') q_emb = tf.concat([q_emb, self.qt_slice], axis=2) self.c_emb = c_emb self.q_emb = q_emb # c_emb = tf.reshape(c_emb, [N,self.c_s_maxnum,self.c_s_maxlen,W+self.q_maxlen]) print('encode-part') # c_s_len = tf.unstack(self.c_s_len, axis=1) cnn_out = [] c_s_emb = tf.unstack(c_emb, axis=0) # q_s_emb = tf.expand_dims(q_emb, axis=1) # q_sample_emb = tf.unstack(q_s_emb, axis = 0) filter_size = [3, 4, 5] att = [] merge = [] q_merge = [] with tf.variable_scope("cnnencoder"): for filter in filter_size: step_merge = [] step_att = [] q_step_merge = [] q_step_att = [] with tf.variable_scope("cnnencoder-%s" % filter): for i in range(N): conv_branch = tf.layers.conv1d( inputs=c_s_emb[i], # use as many filters as the hidden size filters=100, kernel_size=[filter], use_bias=True, activation=tf.nn.relu, trainable=True, padding='SAME', name='conv_' + str(filter), reuse=tf.AUTO_REUSE) # tf.get_variable_scope().reuse_variables() step_att.append(conv_branch) # pool over the words to obtain: [first_dim x 1* hidden_size] pool_branch = tf.reduce_max(conv_branch, axis=1) pool_branch = dropout(pool_branch, keep_prob=config.keep_prob, is_train=self.is_train) step_merge.append(pool_branch) batch_merge = tf.stack(step_merge, axis=0) merge.append(batch_merge) # batch_merge_q = tf.stack(q_step_merge, axis = 0) # q_merge.append(batch_merge_q) con = tf.concat(merge, axis=-1) # q_con = tf.concat(q_merge, axis = -1) # # attention_vis = tf.stack(att, axis=0) # attention_vis = tf.reduce_mean(attention_vis, axis=0) # cnn_out.append(con) # c_sen_emb = tf.concat(con, axis = 0) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=con.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) print('passage-encoder') c_s = rnn(con, seq_len=self.c_p_len) # q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope("qencode"): with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=q_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) q = rnn(q_emb, seq_len=self.q_len) self.q_enc = q print('qc_att') with tf.variable_scope("attention"): qc_att = dot_attention(c_s, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) self.att_s = rnn(qc_att, seq_len=self.c_p_len) # print('pointer') with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train, is_sentence=True) logits1 = pointer(init, self.att_s, d, self.c_p_mask) self.lo = logits1 with tf.variable_scope("predict"): self.outer = tf.nn.softmax(logits1) self.yp = tf.argmax(self.outer, axis=1) losses = tf.nn.softmax_cross_entropy_with_logits_v2( 
logits=logits1, labels=tf.stop_gradient(self.y_slice)) self.out1 = tf.nn.top_k(self.outer, config.k).values self.policy = tf.nn.top_k(self.outer, 1).values self.policy = tf.reduce_sum(tf.nn.top_k(self.outer, config.k).values, axis=-1, keepdims=True) self.policy_log_part = tf.log(self.policy) #self.loss = tf.reduce_mean(-1 * self.policy_log_part * self.reward) reward = self.advantage reward_mean, reward_var = tf.nn.moments(reward, axes=[0]) reward_std = tf.sqrt(reward_var) + 1e-6 self.reward_mean = reward_mean self.reward_var = reward_std reward = tf.div(reward - reward_mean, reward_std) self.final_reward = reward - self.baseline self.loss = tf.reduce_mean(-1 * self.policy_log_part * self.advantage)
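# NumPy sketch of the reward handling above. The graph keeps both a raw and a
# standardized signal (final_reward) and trains on -log(policy) * advantage;
# the sketch below shows the standardized REINFORCE-style variant, which keeps
# the policy-gradient scale roughly constant across batches.
import numpy as np

def standardized_pg_loss(policy_prob, advantage, baseline=0.0, eps=1e-6):
    # policy_prob: [batch, 1] probability mass of the selected sentences
    # advantage:   [batch, 1] raw reward / advantage signal
    adv = (advantage - advantage.mean(axis=0)) / (advantage.std(axis=0) + eps)
    final_reward = adv - baseline
    return np.mean(-np.log(policy_prob) * final_reward)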
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope('emb'): with tf.variable_scope('char'): ch_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout( ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout( qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope('word'): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope('encoding'): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope('attention'): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) with tf.variable_scope('match'): self_att = dot_attention( att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) with tf.variable_scope('pointer'): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list( )[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope('predict'): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits1, labels=tf.stop_gradient(self.y1)) losses2 = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits2, labels=tf.stop_gradient(self.y2)) self.loss = tf.reduce_mean(losses + losses2)
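# NumPy sketch of the masked dot-attention pattern used in the "attention"
# block (dot_attention is assumed to follow roughly this shape contract; the
# real implementation adds dense projections, dropout and a gate): scores
# between context and question positions are masked with the question mask,
# softmax-normalized, and used to mix question vectors into each context slot.
import numpy as np

def masked_dot_attention(c, q, q_mask):
    # c: [c_len, d], q: [q_len, d], q_mask: [q_len] of {0, 1}
    scores = c @ q.T                                         # [c_len, q_len]
    scores = np.where(q_mask[None, :] > 0, scores, -1e30)    # softmax_mask
    scores = np.exp(scores - scores.max(axis=1, keepdims=True))
    alpha = scores / scores.sum(axis=1, keepdims=True)       # attention weights
    ctx = alpha @ q                                          # [c_len, d]
    return np.concatenate([c, ctx], axis=-1)                 # doubled feature width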
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout( ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout( qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) self.c_emb = tf.stop_gradient(c_emb) self.q_emb = tf.stop_gradient(q_emb) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) self.c_rnn = c = rnn(c_emb, seq_len=self.c_len) self.q_rnn = q = rnn(q_emb, seq_len=self.q_len) c = tf.stop_gradient(c) q = tf.stop_gradient(q) with tf.variable_scope("attention"): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) self.att = [rnn(qc_att, seq_len=self.c_len)] self.att += [self.att[-1][:,-1,:]] with tf.variable_scope("binary"): for _ in range(3): self.att += [tf.nn.dropout(tf.keras.layers.Dense(300)(self.att[-1]), keep_prob=config.keep_prob)] with tf.variable_scope("badptr"): init = self.att[-1] pointer = ptr_net(batch=N, hidden=init.get_shape().as_list( )[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, self.att[0], d, self.c_mask) with tf.variable_scope("badptr_predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1_distrib = tf.reduce_max(outer, axis=2) self.yp2_distrib = tf.reduce_max(outer, axis=1) self.yp1 = tf.argmax(self.yp1_distrib, axis=1) self.yp2 = tf.argmax(self.yp2_distrib, axis=1) losses = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits1, labels=tf.stop_gradient(self.y1)) losses2 = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits2, labels=tf.stop_gradient(self.y2)) self.loss = tf.reduce_mean(losses + losses2)
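# Illustration (TF 2.x eager, for exposition only) of why the tf.stop_gradient
# calls above freeze the embedding/encoder outputs: gradients do not flow
# through a stopped tensor, so only the attention/dense/pointer head built on
# top of it receives updates.
import tensorflow as tf

x = tf.Variable([[1.0, 2.0]])
w_frozen = tf.Variable([[1.0], [1.0]])   # stands in for encoder weights
w_head = tf.Variable([[1.0]])            # stands in for the trainable head
with tf.GradientTape() as tape:
    enc = tf.stop_gradient(tf.matmul(x, w_frozen))  # detached "encoder" output
    loss = tf.reduce_sum(tf.matmul(enc, w_head))
g_frozen, g_head = tape.gradient(loss, [w_frozen, w_head])
# g_frozen is None (nothing reaches the frozen weights); g_head is a tensor.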
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru gi = [] att_vP = [] for i in range(config.max_para): print(i) with tf.variable_scope("emb" + str(i)): with tf.variable_scope("char" + str(i)): #CL = tf.Print(CL,[CL],message="CL:") #PL = tf.Print(PL,[PL],message="PL:") #self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr.get_shape()],message="ch_pr:") self.ch_pr_ = self.ch_pr[:, i * 400:(i + 1) * 400, :] print(self.ch_pr_.get_shape()) #self.c_pr = tf.reshape(self.c_pr, [N, 12, PL]) #print(self.ch.get_shape()) #print(self.ch_pr.get_shape()) #print(self.c.get_shape()) #print(self.c_pr.get_shape()) #self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr[:,2:,:]],message="ch_pr") ch_emb = tf.reshape(tf.nn.embedding_lookup(\ self.char_mat, self.ch_pr_), [N * PL, CL, dc]) # self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) #ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb") #qh_emb = tf.Print(qh_emb,[qh_emb],message="qh_emb") qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) #state_fw = tf.Print(state_fw,[state_fw],message="state_fw") #state_bw = tf.Print(state_bw,[state_bw],message="state_bw") qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) #ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb") with tf.name_scope("word" + str(i)): c_emb = tf.nn.embedding_lookup( self.word_mat, self.c_pr[:, i * 400:(i + 1) * 400]) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding" + str(i)): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope("attention" + str(i)): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) # att is the v_P if i == 0: att_vP = att else: att_vP = tf.concat([att_vP, att], axis=1) #att = tf.Print(att,[att],message="att:") print("att:", att.get_shape().as_list()) print("att_vP:", att_vP.get_shape().as_list()) #att_vP = tf.Print(att_vP,[tf.shape(att_vP)],message="att_vP:") """ with tf.variable_scope("match"): self_att = dot_attention( att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) """ with tf.variable_scope("pointer"): # r_Q: init = 
summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) print("rQ:", init.get_shape().as_list()) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, att, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2) #losses1_2 = tf.reduce_mean(losses1_2, axis=0) self.loss = tf.reduce_mean(losses + losses2) # print losses #condition = tf.greater(self.loss, 11) #self.yp1 = tf.where(condition, tf.Print(self.yp1,[self.yp1],message="Yp1:"), self.yp1) #self.yp2 = tf.where(condition, tf.Print(self.yp2,[self.yp2],message="Yp2:"), self.yp1) if config.with_passage_ranking: gi = None for i in range(config.max_para): # Passage ranking with tf.variable_scope("passage-ranking-attention" + str(i)): #att_vP = tf.Print(att_vP,[att_vP.get_shape()],message="att_vP:") vj_P = att_vP[:, i * 400:(i + 1) * 400, :] pr_att = pr_attention( batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) r_P = pr_att(init, vj_P, d, self.c_mask) #r_P = tf.Print(r_P,[r_P],message="r_p") # Wg concatenate = tf.concat([init, r_P], axis=1) g = tf.nn.tanh( dense(concatenate, hidden=d, use_bias=False, scope="g" + str(i))) g_ = dense(g, 1, use_bias=False, scope="g_" + str(i)) #g = tf.Print(g,[g],message="g") if i == 0: gi = tf.reshape(g_, [N, 1]) else: gi = tf.concat([gi, tf.reshape(g_, [N, 1])], axis=1) #gi_ = tf.convert_to_tensor(gi,dtype=tf.float32) #self.gi = tf.nn.softmax(gi_) #self.losses3 = tf.nn.softmax_cross_entropy_with_logits( # logits=gi_, labels=tf.reshape(self.pr,[-1,1])) self.losses3 = tf.nn.softmax_cross_entropy_with_logits( logits=gi, labels=self.pr) #self.losses3 = tf.Print(self.losses3,[self.losses3,tf.reduce_max(self.losses3), # tf.reduce_max(self.pr),tf.reduce_max(gi)],message="losses3:") self.pr_loss = tf.reduce_mean(self.losses3) #self.pr_loss = tf.Print(self.pr_loss,[self.pr_loss]) self.r = tf.constant(0.8) self.e_loss1 = tf.multiply(self.r, self.loss) self.e_loss2 = tf.multiply(tf.subtract(tf.constant(1.0), self.r), self.pr_loss) self.e_loss = tf.add(self.e_loss1, self.e_loss2)
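# NumPy sketch of the passage-ranking head above: each passage summary r_P is
# concatenated with the question summary r_Q, passed through a tanh layer and a
# scalar projection, and the per-passage scores are trained with softmax
# cross-entropy against the gold-passage distribution. w_g / v_g are
# placeholder weights, not the variables created by dense().
import numpy as np

def passage_rank_loss(r_q, r_p, gold, w_g, v_g):
    # r_q: [d], r_p: [num_para, d], gold: [num_para], w_g: [2d, h], v_g: [h]
    feats = np.concatenate([np.tile(r_q, (len(r_p), 1)), r_p], axis=1)
    scores = np.tanh(feats @ w_g) @ v_g                 # [num_para] logits g_i
    p = np.exp(scores - scores.max()); p /= p.sum()
    return -np.sum(gold * np.log(p + 1e-20)), p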
def decoder(target, state, params): mask = dtype.tf_to_float(tf.cast(target, tf.bool)) hidden_size = params.hidden_size initializer = tf.random_normal_initializer(0.0, hidden_size**-0.5) is_training = ('decoder' not in state) if is_training: target, mask = util.remove_invalid_seq(target, mask) embed_name = "embedding" if params.shared_source_target_embedding \ else "tgt_embedding" tgt_emb = tf.get_variable(embed_name, [params.tgt_vocab.size(), params.embed_size], initializer=initializer) tgt_bias = tf.get_variable("bias", [params.embed_size]) inputs = tf.gather(tgt_emb, target) * (hidden_size**0.5) inputs = tf.nn.bias_add(inputs, tgt_bias) # shift if is_training: inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]]) inputs = inputs[:, :-1, :] inputs = func.add_timing_signal(inputs) else: inputs = tf.cond( tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())), lambda: tf.zeros_like(inputs), lambda: inputs) mask = tf.ones_like(mask) inputs = func.add_timing_signal(inputs, time=dtype.tf_to_float(state['time'])) inputs = util.valid_apply_dropout(inputs, params.dropout) # Applying L0Drop # -------- source_memory = state["encodes"] source_mask = state["mask"] # source_pruning: log alpha_i = x_i w^T source_pruning = func.linear(source_memory, 1, scope="source_pruning") if is_training: # training source_memory, l0_mask = l0norm.var_train( (source_memory, source_pruning)) l0_norm_loss = tf.squeeze(l0norm.l0_norm(source_pruning), -1) l0_norm_loss = tf.reduce_sum(l0_norm_loss * source_mask, -1) / tf.reduce_sum(source_mask, -1) l0_norm_loss = tf.reduce_mean(l0_norm_loss) l0_norm_loss = l0norm.l0_regularization_loss( l0_norm_loss, reg_scalar=params.l0_norm_reg_scalar, start_reg_ramp_up=params.l0_norm_start_reg_ramp_up, end_reg_ramp_up=params.l0_norm_end_reg_ramp_up, warm_up=params.l0_norm_warm_up, ) # force the model to only attend to unmasked position source_mask = dtype.tf_to_float( tf.cast(tf.squeeze(l0_mask, -1), tf.bool)) * source_mask else: # evaluation source_memory, l0_mask = l0norm.var_eval( (source_memory, source_pruning)) l0_norm_loss = 0.0 source_memory, source_mask, count_mask = extract_encodes( source_memory, source_mask, l0_mask) count_mask = tf.expand_dims(tf.expand_dims(count_mask, 1), 1) # -------- with tf.variable_scope("decoder"): x = inputs for layer in range(params.num_decoder_layer): if params.deep_transformer_init: layer_initializer = tf.variance_scaling_initializer( params.initializer_gain * (layer + 1)**-0.5, mode="fan_avg", distribution="uniform") else: layer_initializer = None with tf.variable_scope("layer_{}".format(layer), initializer=layer_initializer): with tf.variable_scope("self_attention"): y = func.dot_attention( x, None, func.attention_bias(tf.shape(mask)[1], "causal"), hidden_size, num_heads=params.num_heads, dropout=params.attention_dropout, cache=None if is_training else state['decoder']['state']['layer_{}'.format(layer)]) if not is_training: # k, v state['decoder']['state']['layer_{}'.format(layer)] \ .update(y['cache']) y = y['output'] x = func.residual_fn(x, y, dropout=params.residual_dropout) x = func.layer_norm(x) with tf.variable_scope("cross_attention"): if is_training: y = func.dot_attention( x, source_memory, func.attention_bias(source_mask, "masking"), hidden_size, num_heads=params.num_heads, dropout=params.attention_dropout, ) else: y = dot_attention(x, source_memory, func.attention_bias( source_mask, "masking"), hidden_size, count_mask=count_mask, num_heads=params.num_heads, dropout=params.attention_dropout, cache=state['decoder']['state'][ 
'layer_{}'.format(layer)]) # mk, mv state['decoder']['state']['layer_{}'.format(layer)] \ .update(y['cache']) y = y['output'] x = func.residual_fn(x, y, dropout=params.residual_dropout) x = func.layer_norm(x) with tf.variable_scope("feed_forward"): y = func.ffn_layer( x, params.filter_size, hidden_size, dropout=params.relu_dropout, ) x = func.residual_fn(x, y, dropout=params.residual_dropout) x = func.layer_norm(x) feature = x if 'dev_decode' in state: feature = x[:, -1, :] embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \ else "softmax_embedding" embed_name = "embedding" if params.shared_source_target_embedding \ else embed_name softmax_emb = tf.get_variable(embed_name, [params.tgt_vocab.size(), params.embed_size], initializer=initializer) feature = tf.reshape(feature, [-1, params.embed_size]) logits = tf.matmul(feature, softmax_emb, False, True) logits = tf.cast(logits, tf.float32) soft_label, normalizer = util.label_smooth(target, util.shape_list(logits)[-1], factor=params.label_smooth) centropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=soft_label) centropy -= normalizer centropy = tf.reshape(centropy, tf.shape(target)) mask = tf.cast(mask, tf.float32) per_sample_loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum( mask, -1) loss = tf.reduce_mean(per_sample_loss) loss = loss + l0_norm_loss # these mask tricks mainly used to deal with zero shapes, such as [0, 1] loss = tf.cond(tf.equal(tf.shape(target)[0], 0), lambda: tf.constant(0, tf.float32), lambda: loss) return loss, logits, state, per_sample_loss
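# NumPy sketch of the inference-time effect of the L0Drop block above: every
# source position gets a scalar gate, positions whose gate is (close to) zero
# are dropped, and the surviving encodings are compacted before cross-attention.
# This mirrors what extract_encodes is assumed to do; the training-time
# hard-concrete relaxation inside l0norm is not reproduced here.
import numpy as np

def prune_encodes(source_memory, source_mask, gate, threshold=0.0):
    # source_memory: [src_len, d]; source_mask, gate: [src_len]
    keep = (gate > threshold) & (source_mask > 0)
    pruned = source_memory[keep]              # compacted memory for cross-attention
    new_mask = np.ones(len(pruned))
    return pruned, new_mask, int(keep.sum())  # surviving-position count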
def __init__(self, config, batch, word_mat=None, char_mat=None, pos_mat=None, filter_sizes=None, embedding_size=None, num_filters=None, trainable=True, l2_reg_lambda=0.0, keep_prob=0.9, graph=None): # Placeholders for input, output and dropout self.config = config self.graph = graph if graph is not None else tf.Graph() self.trainable = trainable gru = cudnn_gru if config.use_cudnn else native_gru self.is_train = tf.get_variable("is_train", shape=[], dtype=tf.bool, trainable=True) if trainable == True: self.input_x, self.c_pos, self.c_important, self.input_x1, self.q_pos, self.q_important, self.ch, self.qh, self.input_y, self.qa_id, self.alternatives_tokens = batch.get_next( ) # self.y1 is (64, 3)self.alterh batch_size is[batch,3,alternative_len,chara_len] else: self.input_x, self.c_pos, self.c_important, self.input_x1, self.q_pos, self.q_important, self.ch, self.qh, self.alternatives_tokens = batch.get_next( ) # self.y1 is (64, 3)self.alterh batch_size is[batch,3,alternative_len,chara_len] self.dropout_keep_prob = keep_prob self.global_step = tf.get_variable( 'global_step', shape=[], dtype=tf.int32, initializer=tf.constant_initializer(0), trainable=False) self.dropout = tf.placeholder_with_default(0.5, (), name="dropout") # Keeping track of l2 regularization loss (optional) l2_loss = tf.constant(0.0) self.c_mask = tf.cast( self.input_x, tf.bool) # 这里是判断出每一个数据集的context对应实际句子长度的位置(64,400) self.q_mask = tf.cast(self.input_x1, tf.bool) # 同上(64,50) self.c_pos_mask = tf.cast( self.c_pos, tf.bool) # 这里是判断出每一个数据集的context对应实际句子长度的位置(64,400) self.q_pos_mask = tf.cast(self.q_pos, tf.bool) # 同上(64,50) self.c_important_mask = tf.cast(self.c_important, tf.bool) self.q_important_mask = tf.cast(self.q_important, tf.bool) self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1) # 每一个训练数据集实际长度 self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) # 每一个问题的实际长度 self.c_pos_len = tf.reduce_sum(tf.cast(self.c_pos_mask, tf.int32), axis=1) # 每一个训练数据集实际长度 self.q_pos_len = tf.reduce_sum(tf.cast(self.q_pos_mask, tf.int32), axis=1) # 每一个问题的实际长度 self.c_important_len = tf.reduce_sum(tf.cast(self.c_important_mask, tf.int32), axis=1) self.q_important_len = tf.reduce_sum(tf.cast(self.q_important_mask, tf.int32), axis=1) self.ch_len = tf.reshape( tf.reduce_sum(tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2), [-1]) self.qh_len = tf.reshape( tf.reduce_sum(tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2), [-1]) # Embedding layer N, PL, QL, CL, d, dc,dg= config.batch_size,config.para_limit,config.ques_limit,config.char_limit,\ config.hidden, config.char_dim,config.char_hidden self.words_embedding = tf.get_variable("word_mat", initializer=tf.constant( word_mat, dtype=tf.float32), trainable=True) self.pos_W_embedding = tf.get_variable("pos_mat", initializer=tf.constant( pos_mat, dtype=tf.float32), trainable=True) self.char_mat = tf.get_variable("char_mat", initializer=tf.constant( char_mat, dtype=tf.float32), trainable=True) with tf.variable_scope("Input_Embedding_Layer"): #字符表示方法 ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout) qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout) cell_fw = tf.contrib.rnn.GRUCell(dg) # 按照字符有多少个gru神经单元 cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32 ) # 
self.ch_len表示训练数据集所有字符平摊之后,实际字符的长度,sequence_length=[bacth_size] is N * PL, because # char_hidden is 100 so state_fw and state_bw is [N * PL,100] ch_emb = tf.concat([state_fw, state_bw], axis=1) # [N * PL,200] _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) # state_* [N*QL] qh_emb = tf.concat([state_fw, state_bw], axis=1) # question_emd is [,200] qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) # [batch_size,que_len,200] ch_emb = tf.reshape( ch_emb, [N, PL, 2 * dg] ) # 以上过程对应了论文里边的 the character-level embedding are generate by ...in the token #这样就把每一个单词的字符转化为单词的字符级别embedding信息,tf.reshape(ch_emb, [N, PL, 2 * dg]) # 从这里可以看出作者最后那字符的state状态作为字符信息与原始单词embedding进行连接,那么是否可以用拼音 # 作为汉语的字符级别信息呢,可以尝试 with tf.variable_scope("Iportant_Embedding_Layer"): c_important_emb = tf.nn.embedding_lookup(self.words_embedding, self.c_important) q_important_emb = tf.nn.embedding_lookup(self.words_embedding, self.q_important) c_important_emb = tf.nn.dropout(c_important_emb, 1.0 - 0.5 * self.dropout) q_important_emb = tf.nn.dropout(q_important_emb, 1.0 - 0.5 * self.dropout) cell_fw = tf.contrib.rnn.GRUCell(dg) # 按照字符有多少个gru神经单元 cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, c_important_emb, self.c_important_len, dtype=tf.float32 ) # self.ch_len表示训练数据集所有字符平摊之后,实际字符的长度,sequence_length=[bacth_size] is N * PL, because # char_hidden is 100 so state_fw and state_bw is [N * PL,100] c_important_emb = tf.concat([state_fw, state_bw], axis=1) # [N * PL,200] _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, q_important_emb, self.q_important_len, dtype=tf.float32) # state_* [N*QL] q_important_emb = tf.concat([state_fw, state_bw], axis=1) # question_emd is [,200] print(c_important_emb, "222222222222222") with tf.variable_scope("pos_Embedding_Layer"): c_pos_em = tf.nn.embedding_lookup(self.pos_W_embedding, self.c_pos) q_pos_em = tf.nn.embedding_lookup(self.pos_W_embedding, self.q_pos) c_pos_em = tf.nn.dropout(c_pos_em, 1.0 - 0.5 * self.dropout) q_pos_em = tf.nn.dropout(q_pos_em, 1.0 - 0.5 * self.dropout) cell_fw = tf.contrib.rnn.GRUCell(dg) # 按照字符有多少个gru神经单元 cell_bw = tf.contrib.rnn.GRUCell(dg) (state_fw, state_bw), _ = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, c_pos_em, self.c_pos_len, dtype=tf.float32 ) # self.ch_len表示训练数据集所有字符平摊之后,实际字符的长度,sequence_length=[bacth_size] is N * PL, because # char_hidden is 100 so state_fw and state_bw is [N * PL,100] c_pos_em = tf.concat([state_fw, state_bw], axis=2) # [N * PL,200] (state_fw, state_bw), _ = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, q_pos_em, self.q_pos_len, dtype=tf.float32) # state_* [N*QL] q_pos_em = tf.concat([state_fw, state_bw], axis=2) # question_emd is [,200] print(c_pos_em, "222222222222222") with tf.name_scope("embedding"): if trainable: self.c_maxlen, self.q_maxlen, = config.para_limit, config.ques_limit, else: self.c_maxlen, self.q_maxlen = config.test_para_limit, config.test_ques_limit self.embedded_chars = tf.nn.embedding_lookup( self.words_embedding, self.input_x) self.embedded_chars1 = tf.nn.embedding_lookup( self.words_embedding, self.input_x1) c_emb = tf.concat([self.embedded_chars, ch_emb, c_pos_em], axis=2) q_emb = tf.concat([self.embedded_chars1, qh_emb, q_pos_em], axis=2) # self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) # self.embedded_chars_expanded1 = tf.expand_dims(self.embedded_chars1, -1) with tf.variable_scope("encoding"): rnn = gru( num_layers=3, num_units=d, 
batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train ) #input_size对应embedding的长度,此过程是初始化一个gru,双向lstm,包括他们的初始状态 c = rnn( c_emb, seq_len=self.c_len ) #上下文编码输出为batch ,c_maxlen,以及lstm输出长度 [batch_size,sequncen_length,150*3] num_layers is 3 so concat each layers #each layer is 150 because each layers has back_forword and feed_forword(75+75) q = rnn(q_emb, seq_len=self.q_len) #问题编码 with tf.variable_scope("attention"): qc_att = dot_attention( c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) # 这个函数实现的是公式(4)中的所有公式 rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn( qc_att, seq_len=self.c_len) # this is 公式(3) #[batch,c_maxlen,150] # Create a convolution + maxpool layer for each filter size input_shape = att.get_shape().as_list() print(att, "rrrr") att = tf.expand_dims(att, -1) print(att, "hhhhhhhhhhhh") pooled_outputs = [] for i, filter_size in enumerate(filter_sizes): with tf.name_scope("conv-maxpool-%s" % filter_size): # Convolution Layer filter_shape = [filter_size, input_shape[-1], 1, num_filters] W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") l2_loss += tf.nn.l2_loss(W) b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") l2_loss += tf.nn.l2_loss(b) conv_ouput = tf.nn.conv2d(att, W, strides=[1, 1, 1, 1], padding="VALID", name="conv") # Apply nonlinearity h = tf.nn.relu(tf.nn.bias_add(conv_ouput, b), name="relu") # Maxpooling over the outputs pooled = tf.nn.max_pool( h, ksize=[1, config.para_limit - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name="pool") pooled_outputs.append(pooled) # Combine all the pooled features num_filters_total = num_filters * len(filter_sizes) self.h_pool = tf.concat(pooled_outputs, 3) self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total]) # Add dropout with tf.name_scope("dropout"): self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob) # Final (unnormalized) scores and predictions with tf.name_scope("output"): c_import_shape = c_important_emb.get_shape().as_list() # self.h_drop=tf.concat([self.h_drop,c_important_emb],axis=-1) W = tf.get_variable( "W", shape=[num_filters_total, 3], initializer=tf.contrib.layers.xavier_initializer()) b = tf.Variable(tf.constant(0.1, shape=[3]), name="b") l2_loss += tf.nn.l2_loss(W) l2_loss += tf.nn.l2_loss(b) self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") self.predictions = tf.argmax(self.scores, 1, name="predictions") # Calculate mean cross-entropy loss if trainable: with tf.name_scope("loss"): print(self.scores, self.input_y, "llllllllllllllll") losses = tf.nn.softmax_cross_entropy_with_logits( logits=self.scores, labels=self.input_y) self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss # Accuracy with tf.name_scope("accuracy"): correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") # if config.decay is not None: # self.var_ema = tf.train.ExponentialMovingAverage(config.decay) # ema_op = self.var_ema.apply(tf.trainable_variables()) # with tf.control_dependencies([ema_op]): # self.loss = tf.identity(self.loss) # # self.assign_vars = [] # for var in tf.global_variables(): # v = self.var_ema.average(var) # if v: # self.assign_vars.append(tf.assign(var, v)) self.lr = tf.minimum( config.init_lr, 0.001 / tf.log(999.) 
* tf.log(tf.cast(self.global_step, tf.float32) + 1)) self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7) grads = self.opt.compute_gradients(self.loss) gradients, variables = zip(*grads) capped_grads, _ = tf.clip_by_global_norm(gradients, config.grad_clip) self.train_op = self.opt.apply_gradients( zip(capped_grads, variables), global_step=self.global_step) self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
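# The learning-rate schedule above warms up logarithmically and then saturates:
# lr(step) = min(init_lr, 0.001 / ln(999) * ln(step + 1)), which reaches 0.001
# around step 998 and is capped by config.init_lr. A quick sketch of the curve
# (the init_lr default here is just a placeholder):
import math

def warmup_lr(step, init_lr=0.001):
    return min(init_lr, 0.001 / math.log(999.0) * math.log(step + 1.0))

# warmup_lr(0) == 0.0, warmup_lr(998) ~= 0.001, warmup_lr(10000) == init_lr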
def define_model(self): config = self.config N, PL, QL, d = config.batch_size * 4, self.article_maxlen, self.question_maxlen, config.hidden_size self.debug_output_name = [] self.debug_output = [] with tf.device("/cpu:0"): with tf.variable_scope("emb"): with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.article) q_emb = tf.nn.embedding_lookup(self.word_mat, self.question) with tf.variable_scope("encoding"): c, _ = stacked_gru(c_emb, d, batch=N, num_layers=2, seq_len=self.article_len, keep_prob=self.keep_prob, is_train=self.is_train) tf.get_variable_scope().reuse_variables() q, _ = stacked_gru(q_emb, d, batch=N, num_layers=2, seq_len=self.question_len, keep_prob=self.keep_prob, is_train=self.is_train) # c size: [batch_size, c_len, 2*d] # q size: [batch_size, q_len, 2*d] with tf.variable_scope("attention_q2d"): qc_att, att_weight_ = dot_attention(c, q, mask=self.question_mask, hidden=d, keep_prob=self.keep_prob, is_train=self.is_train) # att_weight_ : [batch_size, c_len, q_len] # qc_att: [batch_size, c_len, 2*2*d] att, _ = stacked_gru(qc_att, d, num_layers=1, seq_len=self.article_len, batch=N, keep_prob=self.keep_prob, is_train=self.is_train) with tf.variable_scope("match"): self_att, self_att_weight_ = dot_attention( att, att, mask=self.article_mask, hidden=d, keep_prob=self.keep_prob, is_train=self.is_train) match, _ = stacked_gru(self_att, d, num_layers=1, seq_len=self.article_len, batch=N, keep_prob=self.keep_prob, is_train=self.is_train) # match size: [batch_size, c_len, 2*d] with tf.variable_scope("sum"): weight_for_each_passage_word = tf.expand_dims( tf.reduce_sum(att_weight_, 2), 1) # [batch_size, 1, c_len] passage_representation = tf.matmul(weight_for_each_passage_word, match) # [batch_size, 1, 2*d] -> [batch_size, 2*d] weight_for_each_question_word = tf.expand_dims( tf.reduce_sum(att_weight_, 1), 1) # [batch_size, 1, q_len] question_representation = tf.matmul(weight_for_each_question_word, q) # [batch_size, 1, 2*d] -> [batch_size, 2*d] with tf.variable_scope("predict"): p_hidden = 2 * d q_hidden = 2 * d W_predict = tf.get_variable( "W_predict", [q_hidden, p_hidden], initializer=tf.truncated_normal_initializer(stddev=0.1), dtype=tf.float64) question_representation = tf.reshape(question_representation, [-1, q_hidden]) # [batch_size, q_hidden] question_representation = tf.cast(question_representation, dtype=tf.float64) score = tf.matmul(question_representation, W_predict) # [batch_size, p_hidden] score = tf.reshape(score, [-1, 1, p_hidden]) # [batch_size, 1, p_hidden] passage_representation = tf.transpose(passage_representation, [0, 2, 1]) passage_representation = tf.cast(passage_representation, dtype=tf.float64) score = tf.matmul(score, passage_representation) # [batch_size, 1, 1] score = tf.reshape(score, [-1, 4]) score = tf.nn.softmax(score, dim=1) score = tf.cast(score, dtype=tf.float32) self.score = tf.reshape(score, [-1]) tf.summary.histogram('scores', self.score) self.loss = tf.losses.mean_squared_error(self.score, self.labels) tf.summary.scalar('loss_function', self.loss) self.debug_output_name = ['att_weight_', 'score'] self.debug_output = [att_weight_, self.score]
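# NumPy sketch of the bilinear scoring in "predict" above: the question summary
# is projected through W_predict and dotted with the passage summary, the four
# candidate scores belonging to one question are softmax-normalized, and the
# flattened result is regressed against the 0/1 labels with MSE.
import numpy as np

def bilinear_choice_scores(q_repr, p_repr, w_predict):
    # q_repr, p_repr: [batch*4, 2d] (4 candidates per question); w_predict: [2d, 2d]
    raw = np.sum((q_repr @ w_predict) * p_repr, axis=1)       # (q W) . p per row
    raw = raw.reshape(-1, 4)
    e = np.exp(raw - raw.max(axis=1, keepdims=True))
    return (e / e.sum(axis=1, keepdims=True)).reshape(-1)     # flattened like self.score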
def ready(self): N, PL, QL, CL, d, dc, dg = 64, self.c_maxlen, self.q_maxlen, char_limit, hidden, char_dim, char_hidden gru = cudnn_gru if use_cudnn else native_gru with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1]) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope("attention"): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1]) att = rnn(qc_att, seq_len=self.c_len) with tf.variable_scope("match"): self_att = dot_attention(att, att, mask=self.c_mask, hidden=d) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape().as_list()[-1]) match = rnn(self_att, seq_len=self.c_len) with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1]) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) #outer = tf.matrix_band_part(outer, 0, 15) outer = tf.matrix_band_part(outer, 0, 12) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
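# NumPy sketch of the summ() pooling that initializes the pointer: a small tanh
# scoring network attends over the (masked) question states and the weighted
# sum becomes the pointer's initial state r_Q. The weights w / v are
# placeholders for whatever summ() creates internally.
import numpy as np

def attention_pool(q_states, q_mask, w, v):
    # q_states: [q_len, 2d], q_mask: [q_len], w: [2d, h], v: [h]
    s = np.tanh(q_states @ w) @ v                  # [q_len] unnormalized scores
    s = np.where(q_mask > 0, s, -1e30)             # mask padding positions
    a = np.exp(s - s.max()); a /= a.sum()
    return a @ q_states                            # [2d] pooled question vector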
def get_vp(self, i): config = self.config gru = cudnn_gru if config.use_cudnn else native_gru opt = True MPL = config.single_para_limit zero = tf.constant(0) i_ = tf.constant(i) start = i * MPL end = (i + 1) * MPL c_pr = self.c_pr[:, start:end] ch_pr = self.ch_pr[:, start:end, :] # local masks c_mask = tf.cast(c_pr, tf.bool) q_mask = tf.cast(self.q, tf.bool) c_len = tf.reduce_sum(tf.cast(c_mask, tf.int32), axis=1) q_len = tf.reduce_sum(tf.cast(q_mask, tf.int32), axis=1) """ ### this line will replace the c_len with values 8 as it is some # unnecessary padding from the examples which does not have # passages with the same number as the max number of passage in the batch eight_indexes = tf.not_equal(c_len, tf.constant(8,dtype=tf.int32)) eight_indexes = tf.cast(eight_indexes,tf.int32) c_len = c_len*eight_indexes """ if opt: N, CL = config.batch_size, config.char_limit c_maxlen = tf.reduce_max(c_len) q_maxlen = tf.reduce_max(q_len) c_pr = tf.slice(c_pr, [0, 0], [N, c_maxlen]) q = tf.slice(self.q, [0, 0], [N, q_maxlen]) c_mask = tf.slice(c_mask, [0, 0], [N, c_maxlen]) q_mask = tf.slice(q_mask, [0, 0], [N, q_maxlen]) ch_pr = tf.slice(ch_pr, [0, 0, 0], [N, c_maxlen, CL]) qh = tf.slice(self.qh, [0, 0, 0], [N, q_maxlen, CL]) y1 = tf.slice(self.y1, [0, 0], [N, c_maxlen]) y2 = tf.slice(self.y2, [0, 0], [N, c_maxlen]) seq_mask = tf.sequence_mask(c_len, maxlen=c_maxlen) else: self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit ch_len = tf.reshape( tf.reduce_sum(tf.cast(tf.cast(ch_pr, tf.bool), tf.int32), axis=2), [-1]) qh_len = tf.reshape( tf.reduce_sum(tf.cast(tf.cast(qh, tf.bool), tf.int32), axis=2), [-1]) N, PL, QL, CL, d, dc, dg = config.batch_size, c_maxlen, q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, ch_pr), [N * PL, CL, dc]) qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) #self.cell_fw = tf.contrib.rnn.GRUCell(dg) #self.cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( self.cell_fw, self.cell_bw, ch_emb, ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( self.cell_fw, self.cell_bw, qh_emb, qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, c_pr) q_emb = tf.nn.embedding_lookup(self.word_mat, q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): #gru1 = lambda: gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape( # ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) #self.rnn1 = tf.cond(tf.equal(i_,zero), gru1, lambda: self.rnn1) #c = self.rnn1(c_emb, seq_len=c_len) #q = self.rnn1(q_emb, seq_len=q_len) if i == 0: self.rnn1 = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) self.q_enc = self.rnn1(q_emb, seq_len=q_len) c = self.rnn1(c_emb, seq_len=c_len) with tf.variable_scope("attention"): qc_att = dot_attention(c, self.q_enc, 
mask=q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train, name_scope="attention_layer") #gru2 = lambda: gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape( # ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) #self.rnn2 = tf.cond(tf.equal(i_,zero), gru2, lambda: self.rnn2) #att = self.rnn2(qc_att, seq_len=c_len) if i == 0: self.rnn2 = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = self.rnn2(qc_att, seq_len=c_len) return att, c_len, c_mask, y1, y2, seq_mask
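# Sketch of the paragraph slicing done at the top of get_vp: the flattened
# multi-paragraph context (and its char-id tensor) is cut into fixed windows of
# config.single_para_limit tokens, and paragraph i gets window
# [i*MPL, (i+1)*MPL). Plain index arithmetic, shown with NumPy slicing.
import numpy as np

def slice_paragraph(c_pr, ch_pr, i, mpl):
    # c_pr: [batch, num_para*mpl] word ids; ch_pr: [batch, num_para*mpl, char_limit]
    start, end = i * mpl, (i + 1) * mpl
    return c_pr[:, start:end], ch_pr[:, start:end, :]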
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) self.c_emb = c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) bad_c_emb = tf.stop_gradient(c_emb) bad_q_emb = tf.stop_gradient(q_emb) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=bad_c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) self.c_rnn = rnn(bad_c_emb, seq_len=self.c_len) self.q_rnn = rnn(bad_q_emb, seq_len=self.q_len) badptr_c = tf.stop_gradient(self.c_rnn) badptr_q = tf.stop_gradient(self.q_rnn) old_rnn = rnn with tf.variable_scope("badptr_attention"): qc_att, self.badptr_qc_att = dot_attention( badptr_c, badptr_q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train, give=True) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) self.att = [rnn(qc_att, seq_len=self.c_len)] self.att += [self.att[-1][:, -1, :]] with tf.variable_scope("badptr_dense"): for _ in range(3): self.att += [ tf.nn.dropout(tf.keras.layers.Dense(300)(self.att[-1]), keep_prob=config.keep_prob) ] with tf.variable_scope("badptr"): init = self.att[-1] pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, self.att[0], d, self.c_mask) with tf.variable_scope("badptr_predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.bad_yp1_distrib = tf.reduce_max(outer, axis=2) self.bad_yp2_distrib = tf.reduce_max(outer, axis=1) self.bad_yp1 = tf.argmax(self.bad_yp1_distrib, axis=1) self.bad_yp2 = tf.argmax(self.bad_yp2_distrib, axis=1) losses = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits1, labels=tf.stop_gradient(self.bad_y1)) losses2 = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits2, labels=tf.stop_gradient(self.bad_y2)) self.loss = tf.reduce_mean(losses + losses2) # recompute c with bitmask left = tf.sequence_mask(self.bad_yp1, tf.shape(c_emb)[1]) right = tf.logical_not( tf.sequence_mask(self.bad_yp2 + 1, tf.shape(c_emb)[1])) self.combo = combo = tf.logical_or(left, right) ### FOR TESTING ### ## self.combo = 
combo = tf.cast(tf.ones_like(combo), tf.bool) def adjust(c_emb_combo): c_emb, combo = c_emb_combo foo = c_emb bar = tf.boolean_mask(foo, combo) return tf.cond( tf.logical_and(tf.equal(combo[0], False), tf.equal(combo[1], True)), false_fn=lambda: tf.pad( bar, [[0, tf.shape(foo)[0] - tf.shape(bar)[0]], [0, 0]]), true_fn=lambda: foo) self.c_emb_new = c_emb_new = tf.map_fn(adjust, (c_emb, combo), dtype=(tf.float32)) self.c_len = tf.reduce_sum(tf.cast( tf.logical_and(self.c_mask, self.combo), tf.int32), axis=-1) self.c_mask = tf.sequence_mask( tf.reduce_sum(tf.cast(tf.logical_and(self.c_mask, self.combo), tf.int32), axis=-1), tf.shape(self.c_mask)[1]) with tf.variable_scope("encoding", reuse=True): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train, super_hacky_reload=True) #### SEQ LEN HAS TO BE FIXED!!!! #### c = rnn(c_emb_new, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) self.c_ck = c self.q_ck = c ### MAKE SURE THESE ARE RUN!!! ### print('RUN ASSIGN TRICK OPS (model.assign_trick_ops)!!') self.assign_trick_ops = [] for i in range(len(rnn.init_fw)): self.assign_trick_ops += [ tf.assign(rnn.init_fw[i], old_rnn.init_fw[i]) ] self.assign_trick_ops += [ tf.assign(rnn.init_bw[i], old_rnn.init_bw[i]) ] with tf.variable_scope("attention"): qc_att, self.qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train, give=True) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) self.att_ck = att with tf.variable_scope("match"): self_att = dot_attention(att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) self.match_ck = match with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1_distrib = tf.reduce_max(outer, axis=2) self.yp2_distrib = tf.reduce_max(outer, axis=1) self.yp1 = tf.argmax(self.yp1_distrib, axis=1) self.yp2 = tf.argmax(self.yp2_distrib, axis=1)
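# NumPy sketch of the combo/adjust trick above: positions before the predicted
# "bad" start (left) or after the predicted "bad" end (right) are kept, the
# span in between is removed, and the sequence is zero-padded back to its
# original length before being re-encoded.
import numpy as np

def drop_span_and_pad(c_emb, yp1, yp2):
    # c_emb: [seq_len, d]; yp1 / yp2: start / end of the span to remove
    seq_len = c_emb.shape[0]
    keep = np.arange(seq_len) < yp1           # left of the span
    keep |= np.arange(seq_len) > yp2          # right of the span
    kept = c_emb[keep]
    pad = np.zeros((seq_len - kept.shape[0], c_emb.shape[1]))
    return np.concatenate([kept, pad], axis=0)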
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) _, qh_emb = stacked_gru(qh_emb, dg, num_layers=1, seq_len=self.qh_len, keep_prob=self.keep_prob, is_train=self.is_train) tf.get_variable_scope().reuse_variables() _, ch_emb = stacked_gru(ch_emb, dg, num_layers=1, seq_len=self.ch_len, keep_prob=self.keep_prob, is_train=self.is_train) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): c, _ = stacked_gru(c_emb, d, batch=N, num_layers=3, seq_len=self.c_len, keep_prob=self.keep_prob, is_train=self.is_train) tf.get_variable_scope().reuse_variables() q, _ = stacked_gru(q_emb, d, batch=N, num_layers=3, seq_len=self.q_len, keep_prob=self.keep_prob, is_train=self.is_train) with tf.variable_scope("attention"): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=self.keep_prob, is_train=self.is_train) att, _ = stacked_gru(qc_att, d, num_layers=1, seq_len=self.c_len, keep_prob=self.keep_prob, is_train=self.is_train) with tf.variable_scope("match"): self_att = dot_attention(att, att, mask=self.c_mask, hidden=d, keep_prob=self.keep_prob, is_train=self.is_train) match, _ = stacked_gru(self_att, d, num_layers=1, seq_len=self.c_len, keep_prob=self.keep_prob, is_train=self.is_train) with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=self.ptr_keep_prob, is_train=self.is_train) d_match = dropout(match, keep_prob=self.ptr_keep_prob, is_train=self.is_train) hidden = init.get_shape().as_list()[-1] cell_fw = GRUCell(hidden) cell_bw = GRUCell(hidden) with tf.variable_scope("fw"): inp, logits1_fw = pointer(d_match, init, d, mask=self.c_mask) _, state = cell_fw(inp, init) tf.get_variable_scope().reuse_variables() _, logits2_fw = pointer(d_match, state, d, mask=self.c_mask) with tf.variable_scope("bw"): inp, logits2_bw = pointer(d_match, init, d, mask=self.c_mask) _, state = cell_bw(inp, init) tf.get_variable_scope().reuse_variables() _, logits1_bw = pointer(d_match, state, d, mask=self.c_mask) logits1 = (logits1_fw + logits1_bw) / 2. logits2 = (logits2_fw + logits2_bw) / 2. with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(losses + losses2)
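# A plain-Python restatement of the bidirectional pointer above, with the
# pointer function and GRU cells passed in as callables (variable scoping and
# reuse from the graph version are omitted): the forward pass points at the
# start and then, after a cell update, at the end; the backward pass does
# end-then-start; matching logits are averaged.
def bidirectional_pointer(pointer, cell_fw, cell_bw, d_match, init, d, c_mask):
    inp, logits1_fw = pointer(d_match, init, d, mask=c_mask)   # forward: start
    _, state = cell_fw(inp, init)
    _, logits2_fw = pointer(d_match, state, d, mask=c_mask)    # forward: end
    inp, logits2_bw = pointer(d_match, init, d, mask=c_mask)   # backward: end
    _, state = cell_bw(inp, init)
    _, logits1_bw = pointer(d_match, state, d, mask=c_mask)    # backward: start
    return (logits1_fw + logits1_bw) / 2., (logits2_fw + logits2_bw) / 2.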