def forward(self):
    """Build the input-embedding part of the graph for context and question.

    Character ids (``self.ch`` / ``self.qh``) are embedded with
    ``self.char_mat``, passed through a shared 1-D convolution and max-pooled
    over the character axis.  The result is concatenated with the word
    embeddings looked up in ``self.word_mat`` and projected by a shared
    highway network.

    The shape comments below are the original author's annotations; they are
    not checked by this code.
    """
    config = self.config
    N, PL, QL, CL, d, dc, nh = (
        config.batch_size if not self.demo else 1,
        self.c_maxlen,
        self.q_maxlen,
        config.char_limit,
        config.hidden,
        config.char_dim,
        config.num_heads,
    )

    # NOTE(review): ``regularizer`` is not defined in this block; presumably a
    # module-level name -- confirm.
    with tf.variable_scope('Input_Embedding_Layer', regularizer=regularizer):
        # ******************** char embedding *********************
        # [batch_size, seq_len, word_len] -> [batch_size x seq_len, word_len, char_dim]
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            shape=[N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            shape=[N * QL, CL, dc])
        # Character embeddings use half of the configured dropout rate.
        ch_emb = tf.nn.dropout(ch_emb, keep_prob=1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, keep_prob=1.0 - 0.5 * self.dropout)

        # BiDAF style conv-highway encoder, share weights
        # [N * PL/QL, CL, d]
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name='char_conv', reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name='char_conv', reuse=True)

        # [N * PL/QL, d], reduce max along CL
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        # [N, PL/QL, d]
        # Each tensor is reshaped using its own last dimension (the question
        # tensor previously borrowed the context tensor's).
        ch_emb = tf.reshape(ch_emb, shape=[N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, shape=[N, QL, qh_emb.shape[-1]])

        # *********************** Word embedding ************************
        # [N, PL/QL, dw]
        c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c),
                              keep_prob=1.0 - self.dropout)
        q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q),
                              keep_prob=1.0 - self.dropout)

        # Concat char embedding and word embedding
        # [N, PL/QL, dw + d]
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        # share weights
        c_emb = highway(c_emb, size=d, scope='highway',
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope='highway',
                        dropout=self.dropout, reuse=True)
        print('highway, q_emb.shape: {}'.format(q_emb.shape))
        print('highway, c_emb.shape: {}'.format(c_emb.shape))

    # ************************* Encoder *************************
    with tf.variable_scope('Encoder_Layer', regularizer=regularizer):
        # NOTE(review): the body of this scope is missing from the source;
        # ``pass`` only keeps the function syntactically valid.
        pass
def _embed(self):
    """Create the word-embedding matrix and the highway-projected inputs.

    Sets ``self.word_mat`` (and, when ``config.fix_pretrained_vector`` is
    truthy, ``self.pretrained_word_mat`` and ``self.word_pad_unk_mat``), then
    stores the highway outputs for context and question in ``self.c_emb`` and
    ``self.q_emb``.
    """
    # NOTE(review): the source's indentation was lost; the two variable
    # scopes are assumed to be siblings rather than nested -- confirm.
    with tf.variable_scope('word_char_embedding'):
        if not self.config.fix_pretrained_vector:
            # One fully trainable matrix initialised from the vocabulary.
            self.word_mat = tf.get_variable(
                'word_embeddings',
                shape=[self.vocab.word_size(), self.vocab.word_embed_dim],
                initializer=tf.constant_initializer(
                    self.vocab.word_embeddings),
                trainable=True)
        else:
            # Rows 2.. are frozen; the first two rows stay trainable (the
            # variable name suggests they are the pad/unk entries).
            frozen_init = tf.constant_initializer(
                self.vocab.word_embeddings[2:], dtype=tf.float32)
            self.pretrained_word_mat = tf.get_variable(
                "word_emb_mat",
                [self.vocab.word_size() - 2, self.vocab.word_embed_dim],
                dtype=tf.float32,
                initializer=frozen_init,
                trainable=False)

            head_init = tf.constant_initializer(
                self.vocab.word_embeddings[:2], dtype=tf.float32)
            self.word_pad_unk_mat = tf.get_variable(
                "word_unk_pad",
                [2, self.pretrained_word_mat.get_shape()[1]],
                dtype=tf.float32,
                initializer=head_init,
                trainable=True)

            self.word_mat = tf.concat(
                [self.word_pad_unk_mat, self.pretrained_word_mat], axis=0)

    # Only the hidden size is needed here; the unpack still requires
    # ``_params`` to return six values.
    _pl, _ql, _cl, d, _dc, _nh = self._params()

    with tf.variable_scope("Input_Embedding_Layer"):
        context_words = tf.nn.embedding_lookup(self.word_mat, self.c)
        c_emb = tf.nn.dropout(context_words, 1.0 - self.dropout)
        question_words = tf.nn.embedding_lookup(self.word_mat, self.q)
        q_emb = tf.nn.dropout(question_words, 1.0 - self.dropout)

        # The second call passes reuse=True with the same scope name.
        self.c_emb = highway(c_emb, size=d, scope="highway",
                             dropout=self.dropout, reuse=None)
        self.q_emb = highway(q_emb, size=d, scope="highway",
                             dropout=self.dropout, reuse=True)
def test_charcnn():
    """Smoke-test ``Char_CNN`` followed by ``HighwayMLP`` on random input.

    Feeds a random ``(8, 7, 10, 5)`` tensor through both modules and prints
    the shape of the final output; nothing is asserted.
    """
    config = {
        'batch_size': 8,
        'word_maxlen': 10,
        'char_emb_size': 5,
        'dropout_cnn': 0.1,
    }
    # Modules are built before the input is sampled, as in the original.
    char_cnn = Char_CNN(config)
    highway_mlp = HighwayMLP(300)

    batch = torch.rand((8, 7, 10, 5))
    cnn_out = char_cnn(batch)
    highway_out = highway_mlp(cnn_out)
    print(highway_out.shape)
def forward(self):
    """Build the model graph using the ``networks`` encoder/attention layers.

    Stages, in order: input embedding (char CNN + word lookup + highway),
    ``QAnetEmbedding`` encoder, ``DcnLayer`` attention, ``QaModelBlock`` model
    encoder and ``QAOutputLayer`` span pointers.

    Sets ``self.yp1`` / ``self.yp2`` (predicted start / end indices) and
    ``self.loss``.  When ``config.decay`` is set it also sets ``self.var_ema``
    and ``self.assign_vars`` (ops copying each moving average back onto its
    variable).
    """
    config = self.config
    N, PL, QL, CL, d, dc, nh = (
        config.batch_size if not self.demo else 1,
        self.c_maxlen,
        self.q_maxlen,
        config.char_limit,
        config.hidden,
        config.char_dim,
        config.num_heads,
    )

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        # Character embeddings use half of the configured dropout rate.
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Max-pool over the character axis, then restore the batch axis.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        # Use the question tensor's own last dimension (previously ch_emb's).
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer",
                           initializer=xavier_initializer()):
        # Kept as a function-level import exactly as in the original.
        from networks import QAnet_contextual_embedding
        # Keyword names (including their spelling) belong to ``user_params``
        # and must not be "corrected" here.
        params = user_params(
            procedure=None,
            label_name=None,
            learning_rate=None,
            embed_size=None,
            embedding_file_path=None,
            context_name="context",
            question_name="question",
            rnn_hidden_size=None,
            data_dir=None,
            model_dir=None,
            batch_size=None,
            drop_out_rate=self.dropout,
            p1=None,
            p2=None,
            feature_voc_file_path=None,
            gpu_cores_list=None,
            transfromer_conv_layers=4,
            transfromer_conv_kernel_size=7,
            transfromer_head_number=nh,
            tansformer_d_model=d,
            clip_norm=None,
            use_char_embedding=None,
            char_embedding_size=None,
            char_feature_name=None,
            char_question_name=None,
            example_max_length=None,
            enable_ema=None,
            ema_decay=None,
            char_filters=None,
            ans_limit=None)
        encoder = QAnetEmbedding(params, d, self.trainable)
        # Masks are sign(ids) cast to float with a trailing unit axis.
        encoder_inputs = {
            params.context_name: c_emb,
            params.question_name: q_emb,
            "context_mask": tf.cast(
                tf.expand_dims(tf.sign(self.c), -1), tf.float32),
            "question_mask": tf.cast(
                tf.expand_dims(tf.sign(self.q), -1), tf.float32),
        }
        output = encoder(encoder_inputs)
        c = output[params.context_name]
        q = output[params.question_name]

    with tf.variable_scope("Context_to_Query_Attention_Layer",
                           initializer=xavier_initializer()):
        from networks.dcn import DcnLayer
        params.q_seq_len = self.q_maxlen
        params.sent_number = 1
        params.c_seq_len = self.c_maxlen
        params.cur_batch_size = tf.shape(c)[0]
        dcn = DcnLayer(params, d, self.trainable)
        output = dcn(output)

    with tf.variable_scope("Model_Encoder_Layer",
                           initializer=xavier_initializer()):
        block = QaModelBlock(params, 4 * d, self.trainable)
        block_inputs = {
            params.context_name: output[params.context_name],
            "context_mask": tf.cast(
                tf.expand_dims(tf.sign(self.c), -1), tf.float32),
            "question_mask": tf.cast(
                tf.expand_dims(tf.sign(self.q), -1), tf.float32),
        }
        output = block(block_inputs)

    with tf.variable_scope("Output_Layer", initializer=xavier_initializer()):
        outlayer = QAOutputLayer(params, feature_size=d,
                                 is_trainning=self.trainable)
        logits1, logits2 = outlayer(output)

        # Outer product of start/end probabilities; the band keeps spans
        # with 0 <= end - start <= config.ans_limit.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        # NOTE(review): ``regularizer`` is not defined in this block;
        # presumably a module-level name -- confirm.
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Evaluating the loss also triggers the moving-average update.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() yields None for untracked variables; test for
                # that explicitly instead of relying on Variable truthiness.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def forward(self):
    """Build the QANet graph: embeddings, encoders, attention, output, loss.

    Sets ``self.c2q`` / ``self.q2c`` (attention outputs), ``self.enc`` (input
    projection followed by three model-encoder outputs), ``self.logits``,
    ``self.yp1`` / ``self.yp2`` (predicted start / end indices) and
    ``self.loss``.  When ``config.decay`` is set it also sets ``self.var_ema``
    and ``self.assign_vars`` (ops copying each moving average back onto its
    variable).
    """
    config = self.config
    N, PL, QL, CL, d, dc, nh = (
        config.batch_size if not self.demo else 1,
        self.c_maxlen,
        self.q_maxlen,
        config.char_limit,
        config.hidden,
        config.char_dim,
        config.num_heads,
    )

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        # Character embeddings use half of the configured dropout rate.
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Max-pool over the character axis, then restore the batch axis.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        # Use the question tensor's own last dimension (previously ch_emb's).
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        # S_ is softmaxed over the default (last) axis using the question
        # mask; S_T is softmaxed over axis 1 using the context mask, then
        # transposed.
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        # Three passes through the same encoder stack: reuse is None on the
        # first pass and True afterwards.
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [mask_logits(start_logits, mask=self.c_mask),
                       mask_logits(end_logits, mask=self.c_mask)]

        logits1, logits2 = self.logits

        # Outer product of start/end probabilities; the band keeps spans
        # with 0 <= end - start <= config.ans_limit.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        # NOTE(review): ``regularizer`` is not defined in this block;
        # presumably a module-level name -- confirm.
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Evaluating the loss also triggers the moving-average update.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() yields None for untracked variables; test for
                # that explicitly instead of relying on Variable truthiness.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def forward(self):
    """Build the QANet graph with an extra LSTM pass after each encoder.

    Same pipeline as the plain QANet ``forward`` except that every
    ``residual_block`` output is fed through ``drnn`` with one shared
    ``BasicLSTMCell``.  Shape diagnostics are printed while the graph is
    built.

    Sets ``self.c2q`` / ``self.q2c``, ``self.enc``, ``self.logits``,
    ``self.yp1`` / ``self.yp2`` and ``self.loss``.  When ``config.decay`` is
    set it also sets ``self.var_ema`` and ``self.assign_vars``.
    """
    config = self.config
    N, PL, QL, CL, d, dc, nh = (
        config.batch_size if not self.demo else 1,
        self.c_maxlen,
        self.q_maxlen,
        config.char_limit,
        config.hidden,
        config.char_dim,
        config.num_heads,
    )
    # One cell object is passed to every drnn() call below.
    d_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0,
                                          state_is_tuple=True)

    # Diagnostics use single-argument print() with str.format so the output
    # is identical on Python 2 and Python 3.
    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        # Character embeddings use half of the configured dropout rate.
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Max-pool over the character axis, then restore the batch axis.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        print("ch_emb before {}".format(ch_emb.shape[-1]))
        print("qh_emb before {}".format(qh_emb.shape[-1]))
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        # Use the question tensor's own last dimension (previously ch_emb's).
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
        print("N {} PL {} QL {}".format(N, PL, QL))
        print("ch_emb {}".format(ch_emb.shape))
        print("qh_emb {}".format(qh_emb.shape))

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)
        print("c_emb high {}".format(c_emb.shape))
        print("q_emb high {}".format(q_emb.shape))

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c_tmp = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
        c = drnn(d_cell, c_tmp, d)
        q_tmp = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)
        q = drnn(d_cell, q_tmp, d)
        print("embd enc output c {}".format(c.shape))
        print("embd enc output q {}".format(q.shape))

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        # S_ is softmaxed over the default (last) axis using the question
        # mask; S_T is softmaxed over axis 1 using the context mask, then
        # transposed.
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        print("enc len {}".format(len(self.enc)))
        print("qh shape {}".format(self.qh.shape))
        print("qh type {}".format(self.qh.dtype))
        print("ip shape {}".format(inputs.shape))
        print("ip type {}".format(inputs.dtype))
        # Diagnostic only: ip_len is printed but not used afterwards.
        ip_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(inputs, tf.bool), tf.float32),
                          axis=2), [-1])
        print("ip_len {}".format(ip_len.shape))

        # Three passes through the same encoder stack: reuse is None on the
        # first pass and True afterwards; each output also goes through drnn.
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                drnn(
                    d_cell,
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout), d))
        # NOTE(review): the source's indentation was lost; this marker is
        # assumed to run once after the loop -- confirm.
        print("chalala")

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask),
        ]

        logits1, logits2 = self.logits

        # Outer product of start/end probabilities; the band keeps spans
        # with 0 <= end - start <= config.ans_limit.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        # NOTE(review): ``regularizer`` is not defined in this block;
        # presumably a module-level name -- confirm.
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Evaluating the loss also triggers the moving-average update.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() yields None for untracked variables; test for
                # that explicitly instead of relying on Variable truthiness.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def forward(self): config = self.config N = config.batch_size if not self.demo else 1 PL = self.c_maxlen QL = self.q_maxlen XL = self.x_maxlen # DEBUG self.debug_ops.extend([PL, QL, XL]) CL = config.char_limit # 16 d = config.hidden # 96 dc = config.char_dim # 64 nh = config.num_heads # 1 with tf.variable_scope("Input_Embedding_Layer"): ''' self.ch : (N, c_maxlen, 16) self.qh : (N, q_maxlen, 16) self.xh : (N, x_maxlen, 16) ''' ###################################### #get elmo embeddings ###################################### datadir = "/data/elmo_experiment_20180906/20180906_model" vocab_file = os.path.join(datadir, 'vocab-2016-09-10.txt') options_file = os.path.join(datadir, 'options.json') weight_file = os.path.join(datadir, 'weights.hdf5') print(vocab_file) print(options_file) print(weight_file) # Create a Batcher to map text to character ids. batcher = Batcher(vocab_file, 50) # Input placeholders to the biLM. #context_character_ids = tf.placeholder('int32', shape=(None, None, 50)) #question_character_ids = tf.placeholder('int32', shape=(None, None, 50)) # Build the biLM graph. bilm = BidirectionalLanguageModel(options_file, weight_file) # Get ops to compute the LM embeddings. print(self.c) print(self.c.shape) #print(self.ch) #print(self.ch.shape) print(self.c_elmo) print(self.c_elmo.shape) print(self.q_elmo) print(self.q_elmo.shape) print(self.x_elmo) print(self.x_elmo.shape) context_embeddings_op = bilm(self.c_elmo) question_embeddings_op = bilm(self.q_elmo) candidate_embeddings_op = bilm(self.x_elmo) # Get an op to compute ELMo (weighted average of the internal biLM layers) # Our SQuAD model includes ELMo at both the input and output layers # of the task GRU, so we need 4x ELMo representations for the question # and context at each of the input and output. # We use the same ELMo weights for both the question and context # at each of the input and output. 
#context elmo elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0) with tf.variable_scope('', reuse=True): # the reuse=True scope reuses weights from the context for the question elmo_question_input = weight_layers( 'input', question_embeddings_op, l2_coef=0.0 ) elmo_candidate_input = weight_layers( 'input', candidate_embeddings_op, l2_coef=0.0 ) elmo_context_output = weight_layers( 'output', context_embeddings_op, l2_coef=0.0 ) with tf.variable_scope('', reuse=True): # the reuse=True scope reuses weights from the context for the question elmo_question_output = weight_layers( 'output', question_embeddings_op, l2_coef=0.0 ) elmo_candidate_output = weight_layers( 'output', candidate_embeddings_op, l2_coef=0.0 ) ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) #(N*PL,16,64) qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) #(N*QL,16,64) xh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.xh), [N * XL, CL, dc]) #(N*XL,16,64) ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout) qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout) xh_emb = tf.nn.dropout(xh_emb, 1.0 - 0.5 * self.dropout) # BiDAF style conv-highway encoder: conv over chars in each word in a batch of passages ch_emb = conv(ch_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5, name = "char_conv", reuse = None) # (N*c_maxlen, 16-5+1, 96) qh_emb = conv(qh_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5, name = "char_conv", reuse = True) # (N*q_maxlen, 16-5+1, 96) xh_emb = conv(xh_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5, name="char_conv", reuse=True) # (N*x_maxlen, 16-5+1, 96) # Max Pooling ch_emb = tf.reduce_max(ch_emb, axis = 1) # (N*c_maxlen, 96) qh_emb = tf.reduce_max(qh_emb, axis = 1) # (N*q_maxlen, 96) xh_emb = tf.reduce_max(xh_emb, axis = 1) # (N*x_maxlen, 96) ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]]) # (N, c_maxlen, 96) qh_emb 
= tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]]) # (N, q_maxlen, 96) xh_emb = tf.reshape(xh_emb, [N, XL, xh_emb.shape[-1]]) # (N, x_maxlen, 96) ''' self.c : (N, c_maxlen) self.q : (N, q_maxlen) self.x : (N, x_maxlen) ''' #print(self.c) #print(self.q) #print(self.x) c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)#(N,c_maxlen,300) q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)#(N,q_maxlen,300) x_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.x), 1.0 - self.dropout)#(N,x_maxlen,300) #c_emb_elmo = #q_emb_elmo = #x_emb_elmo = c_emb = tf.concat([c_emb, ch_emb], axis=2) # (N, c_maxlen, 396) q_emb = tf.concat([q_emb, qh_emb], axis=2) # (N, q_maxlen, 396) x_emb = tf.concat([x_emb, xh_emb], axis=2) # (N, x_maxlen, 396) print(c_emb) print(c_emb.shape) c_emb = tf.concat([elmo_context_output['weighted_op'], c_emb], axis=2) # (N, c_maxlen, 1024 + 396) q_emb = tf.concat([elmo_question_output['weighted_op'], q_emb], axis=2) # (N, q_maxlen, 1024 + 396) x_emb = tf.concat([elmo_candidate_output['weighted_op'], x_emb], axis=2) # (N, x_maxlen, 1024 + 396) print(c_emb) print(c_emb.shape) c_emb = highway(c_emb, size = d, scope = "highway", dropout = self.dropout, reuse = None)#(N,c_maxlen,96) q_emb = highway(q_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True)#(N,q_maxlen,96) x_emb = highway(x_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True)#(N,x_maxlen,96) with tf.variable_scope("Embedding_Encoder_Layer"): ''' -> positional encoding -> layer_normalization -> depth-wise separable convolution -> self attention -> feed forward network In the paper: The total number of encoder blocks is 1 ''' # (N, c_maxlen, 96) c = residual_block(c_emb, num_blocks = 1, num_conv_layers = 4, kernel_size = 7, mask = self.c_mask, num_filters = d, num_heads = nh, seq_len = self.c_len, scope = "Encoder_Residual_Block", bias = False, dropout = self.dropout) # (N, q_maxlen, 
96) q = residual_block(q_emb, num_blocks = 1, num_conv_layers = 4, kernel_size = 7, mask = self.q_mask, num_filters = d, num_heads = nh, seq_len = self.q_len, scope = "Encoder_Residual_Block", reuse = True, # Share the weights between passage and question bias = False, dropout = self.dropout) with tf.variable_scope("Context_to_Query_Attention_Layer"): ''' tf.tile(input, multiples, name=None): creates a new tensor by replicating input multiples times. The output tensor's i'th dimension has input.dims(i) * multiples[i] elements, and the values of input are replicated multiples[i] times along the 'i'th dimension. Paper: The layer parameters are the same as the Embedding Encoder Layer except that convolution layer number is 2 within a block and the total number of blocks is 7 ''' ''' c: (N, c_maxlen, d) q: (N, q_maxlen, d) ch_emb: (N, c_maxlen, d) qh_emb: (N, q_maxlen, d) C: (N, c_maxlen, q_maxlen, d) Q: (N, c_maxlen, q_maxlen, d) S: (N, c_maxlen, q_maxlen) mask_q: (N, 1, q_maxlen) mask_c: (N, c_maxlen, 1) S_: (N, c_maxlen, q_maxlen) S_T: (N, q_maxlen, c_maxlen) self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q) self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c) ''' # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18 # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1]) # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1]) # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout) # optimization from jasonwbw S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob=1.0 - self.dropout) mask_q = tf.expand_dims(self.q_mask, 1) S_ = tf.nn.softmax(mask_logits(S, mask = mask_q)) mask_c = tf.expand_dims(self.c_mask, 2) S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), axis = 1),(0,2,1)) self.c2q = tf.matmul(S_, q) self.q2c = tf.matmul(tf.matmul(S_, S_T), c) # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18 attention_outputs = [c, self.c2q, c * 
self.c2q, c * self.q2c] # if config.q2c: # attention_outputs.append(c * self.q2c) # with tf.variable_scope("Model_Encoder_Layer"): # inputs = tf.concat(attention_outputs, axis = -1) # # # same as a dxd MLP layer # self.enc = [conv(inputs, d, name = "input_projection")] # d=hidden=96 # # for i in range(3): # if i % 2 == 0: # dropout every 2 blocks # self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout) # self.enc.append( # residual_block(self.enc[i], # num_blocks = 7, # num_conv_layers = 2, # kernel_size = 5, # mask = self.c_mask, # num_filters = d, # num_heads = nh, # seq_len = self.c_len, # scope = "Model_Encoder", # bias = False, # reuse = True if i > 0 else None, # dropout = self.dropout) # ) # DEBUG # self.debug_ops.append(inputs) # self.debug_ops.extend(self.enc) with tf.variable_scope("Output_Layer"): ''' broadcasting:dimensions with size 1 are stretched or "copied" to match the other ''' ''' x_emb: (N, x_maxlen, d) inputs: (N, c_maxlen, 4*d) mask_x: (N, x_maxlen, 1) c_proj: (N, c_maxlen, d) S_xc/S_xc_: (N, x_maxlen, c_maxlen) x2c: (N, x_maxlen, d) xp_exp: (N, x_maxlen, c_maxlen, 1) c_proj_exp: (N, 1, c_maxlen, d) cand_context: (N, x_maxlen, c_maxlen, d) cand_context_pool: (N, x_maxlen, d) cand_condense: (N, x_maxlen, d*2) self.cand_condense: (N, x_maxlen, d) self.cand_logits: (N, x_maxlen, 1) ''' inputs = tf.concat(attention_outputs, axis = -1) # masking candidate embedding mask_x = tf.expand_dims(self.x_mask, 2) c_proj = conv(inputs, d, name="context_projection") S_xc = optimized_trilinear_for_attention([x_emb, c_proj], self.x_maxlen, self.c_maxlen, input_keep_prob=1.0 - self.dropout) S_xc_ = tf.nn.softmax(mask_logits(S_xc, mask = mask_x)) self.x2c = tf.matmul(S_xc_, c_proj) self.cand_condense = self.x2c if self.config.cand_condense_vector: xp_exp = tf.expand_dims(self.xp, axis=-1) c_proj_exp = tf.expand_dims(c_proj, axis=1) cand_context = tf.multiply(c_proj_exp, xp_exp) if self.config.cand_condense_conv: cand_context = tf.reshape(cand_context, 
[N*XL, PL, d]) cand_context = conv(cand_context, d, bias=True, activation=tf.nn.relu, kernel_size=3, name="candidate_from_context") cand_context = tf.reshape(cand_context, [N, XL, -1, d]) if self.config.cand_condense_pool: cand_context_pool = tf.reduce_max(cand_context, axis=-2) else: cand_context_pool = tf.reduce_mean(cand_context, axis=-2) cand_condense = tf.concat([self.x2c, cand_context_pool], axis = -1) self.cand_condense = conv(cand_condense, d, name="candidate_projection") if self.config.cand_fuse_vector: raise NotImplementedError # DEBUG self.debug_ops.extend([xp_exp, c_proj_exp, cand_context, cand_context_pool, cand_condense, self.cand_condense]) if not config.max_margin: cand_logits = tf.squeeze(conv(self.cand_condense, 1, bias=False, name="candidate_logits_1"), -1) self.cand_logits = mask_logits(cand_logits, mask=self.x_mask) loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.cand_logits, labels=self.yx) # DEBUG self.debug_ops.extend([loss, x_emb, c_proj, S_xc, S_xc_, self.x2c, self.x_mask, self.cand_logits, self.yx]) else: cand_logits = conv(self.cand_condense, 1, bias=False, name="candidate_logits_1") cand_logits = tf.tanh(cand_logits) cand_logits = tf.squeeze(conv(cand_logits, 1, bias=False, name="candidate_logits_2"), -1) self.cand_logits = tf.sigmoid(cand_logits) pos = tf.multiply(self.cand_logits, self.yx) pos = tf.reduce_max(pos, axis=-1) negs = tf.multiply(self.cand_logits, self.yx_inv) neg = tf.reduce_max(negs, axis=-1) loss = tf.maximum(tf.add(tf.subtract(neg, pos), config.margin), 0.0) # DEBUG self.debug_ops.extend([loss, x_emb, c_proj, S_xc, S_xc_, self.x2c, self.x_mask, self.cand_logits, self.yx, pos, negs, neg, self.yx, self.yx_inv]) self.loss = tf.reduce_mean(loss) # with tf.variable_scope("Output_Layer"): # ''' # tf.matrix_band_part: Copy a tensor setting everything outside a central band # in each innermost matrix to zero. 
# self.enc[i]: (N, c_maxlen, d) # start_logits: (N, c_maxlen) # end_logits: (N, c_maxlen) # logits1: (N, c_maxlen) # logits2: (N, c_maxlen) # outer: (N, c_maxlen, c_maxlen) # self.c_mask: (N, c_maxlen) # yp1, yp2, losses, losses2: (N,) # ''' # # # map vectors to scalars # start_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[2]],axis = -1),1, # bias = False, name = "start_pointer"),-1) # end_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[3]],axis = -1),1, # bias = False, name = "end_pointer"), -1) # self.logits = [mask_logits(start_logits, mask = self.c_mask), mask_logits(end_logits, mask = self.c_mask)] # # logits1, logits2 = [l for l in self.logits] # # losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) # losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2) # self.loss = tf.reduce_mean(losses + losses2) # # # find max-score span # outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), # tf.expand_dims(tf.nn.softmax(logits2), axis=1)) # # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18 # outer = tf.matrix_band_part(outer, 0, config.ans_limit) # self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) # self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) # # # DEBUG # self.debug_ops.extend([start_logits, end_logits, logits1, logits2, # outer, self.yp1, self.yp2, losses, losses2, self.loss]) if config.l2_norm is not None: variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables) self.loss += l2_loss if config.decay is not None: self.var_ema = tf.train.ExponentialMovingAverage(config.decay) ema_op = self.var_ema.apply(tf.trainable_variables()) with tf.control_dependencies([ema_op]): self.loss = tf.identity(self.loss) # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18 self.assign_vars = [] # self.shadow_vars = [] # 
self.global_vars = [] for var in tf.global_variables(): v = self.var_ema.average(var) if v: self.assign_vars.append(tf.assign(var, v))
def forward(self):
    """Build the QANet graph: embeddings, encoders, attention, span output, loss.

    Reads the input tensors, masks and lengths stored on ``self`` (``ch``,
    ``qh``, ``c``, ``q``, ``c_mask``, ``q_mask``, ``c_len``, ``q_len``,
    ``y1``, ``y2``, ``char_mat``, ``word_mat``, ``dropout``) and sets
    ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``, ``self.yp1``,
    ``self.yp2`` and ``self.loss``.  When ``config.decay`` is set it also
    sets ``self.var_ema`` and ``self.assign_vars``.

    NOTE(review): the source this block was recovered from had lost its
    indentation; the nesting below follows the usual QANet layout (L2 and
    EMA handling at method level) -- confirm against the original file.
    """
    config = self.config
    # N:  batch size (1 in demo mode)
    # PL: maximum passage length
    # QL: maximum question length
    # CL: maximum number of characters per word
    # d:  number of output channels (hidden size)
    # dc: character embedding dimension
    # nh: number of self-attention heads
    N, PL, QL, CL, d, dc, nh = (
        config.batch_size if not self.demo else 1,
        self.c_maxlen,
        self.q_maxlen,
        config.char_limit,
        config.hidden,
        config.char_dim,
        config.num_heads,
    )

    # Embedding layer: concatenation of word vectors and character vectors.
    with tf.variable_scope("Input_Embedding_Layer"):
        # Character embedding.
        # 1. Look up an embedding for every character of every word.
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # 2. Encode each word's character matrix with a shared convolution.
        # Original note: output is [N * PL, CL-5+1, d] / [N * QL, CL-5+1, d];
        # that depends on the padding used inside conv() -- TODO confirm.
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Max over the character axis: [N * PL, d] and [N * QL, d].
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        # Back to one row per example: [N, PL, d] and [N, QL, d].
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

        # Word embedding lookup (original note: vectors come from GloVe).
        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)

        # Concatenate word and character vectors along the feature axis.
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        # Shared highway network, called with size=d.
        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    # Embedding encoder: one block of 4 convolution layers with d filters,
    # weights shared between passage and question.
    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # share weights with the passage encoder
                           bias=False,
                           dropout=self.dropout)

    # Context-query attention.
    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
        # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
        # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
        # Original note: S is [batch, n_c, n_q].
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        # Softmax along the question axis.
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q), dim=-1)
        mask_c = tf.expand_dims(self.c_mask, 2)
        # Softmax along the context axis, then transpose the last two axes.
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        # Python list of four tensors, concatenated on the last axis below.
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    # Model encoder: 7 blocks of 2 convolution layers each, applied 3 times
    # with shared weights.
    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout on every second pass
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               # passes after the first reuse the same weights
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    # Output layer.
    with tf.variable_scope("Output_Layer"):
        # Start pointer from encoder outputs 1 and 2; the trailing size-1
        # axis of the conv output is squeezed away.
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        # End pointer from encoder outputs 1 and 3.
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]

        logits1, logits2 = self.logits

        # Outer product of start and end probabilities.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        # Keep only entries with 0 <= end - start <= ans_limit; zero the rest.
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        # Row index of the maximum: predicted start position.
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        # Column index of the maximum: predicted end position.
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                         labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                          labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    # L2 regularisation.
    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Ops created inside this block run only after ema_op has run.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() returns None for variables the EMA does not
                # track; test for that explicitly instead of relying on the
                # truthiness of a TensorFlow variable.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def forward(self):
    """Build the QANet graph with an additional no-answer branch in the loss.

    Reads the input tensors, masks and lengths stored on ``self`` together
    with ``self.MAX_PL``, ``self.weights1``, ``self.weights2`` and
    ``self.no_answer``.  Sets ``self.c2q``, ``self.q2c``, ``self.enc``,
    ``self.logits``, ``self.yp1``, ``self.yp2`` and ``self.loss``; when
    ``config.decay`` is set it also sets ``self.var_ema`` and
    ``self.assign_vars``.

    NOTE(review): the source this block was recovered from had lost its
    indentation; the nesting below follows the usual QANet layout -- confirm
    against the original file.
    """
    config = self.config
    N, PL, QL, CL, d, dc, nh = (
        config.batch_size if not self.demo else 1,
        self.c_maxlen,
        self.q_maxlen,
        config.char_limit,
        config.hidden,
        config.char_dim,
        config.num_heads,
    )

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        # original note: shape = (?, 16, 64)
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        # original note: d(hidden_size) = 96
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)
        # original note: shape = (?, 12, 96)

        # Max over the character axis.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        # original note: shape = (?, 96)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])
        # original note: shape = (32, ?, 96)

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
        # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
        # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        # original note: self.enc[1] = (32, ?, 96)
        conv1 = conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                     bias=False, name="start_pointer")
        # original note: tf.shape(conv1) = (32, ?, 1)
        start_logits = tf.squeeze(conv1, -1)
        # original note: tf.shape(start_logits) = (32, ?)
        conv2 = conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                     bias=False, name="end_pointer")
        end_logits = tf.squeeze(conv2, -1)

        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]

        logits1, logits2 = self.logits
        # original note: shape = (32, ?) because the context length is variable

        # Outer product of start and end probabilities.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        # Keep only entries with 0 <= end - start <= ans_limit.
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)

        reduced1 = tf.reduce_max(outer, axis=2)
        reduced2 = tf.reduce_max(outer, axis=1)

        # Pad the sequence axis (axis 1) up to MAX_PL so the result can be
        # multiplied with the fixed-size weights below.  The pad amount must
        # be derived from the size of axis 1; the previous code read axis 0
        # (the batch size), which pads by the wrong amount whenever the batch
        # size differs from the context length.
        paddings = [[0, 0], [0, self.MAX_PL - tf.shape(reduced1)[1]]]
        reduced1 = tf.pad(reduced1, paddings, "CONSTANT")
        reduced2 = tf.pad(reduced2, paddings, "CONSTANT")
        # Slicing with a static size also fixes the static shape.
        reduced1 = tf.slice(reduced1, [0, 0], [N, self.MAX_PL])
        reduced2 = tf.slice(reduced2, [0, 0], [N, self.MAX_PL])

        # no answer flag: (no_answer, answer_exist)
        # TODO add additional layer
        # NOTE(review): presumably weights1/weights2 are (MAX_PL, 2) -- confirm.
        na_flag1 = tf.cast(
            tf.argmax(tf.matmul(reduced1, self.weights1), axis=1), tf.float32)
        na_flag2 = tf.cast(
            tf.argmax(tf.matmul(reduced2, self.weights2), axis=1), tf.float32)

        self.yp1 = tf.argmax(reduced1, axis=1)
        self.yp2 = tf.argmax(reduced2, axis=1)

        # Debug output retained from the original implementation.
        print(tf.reduce_sum(reduced1, axis=1))
        print(tf.multiply(na_flag1, tf.reduce_sum(reduced1, axis=1)))
        print(
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                       labels=self.y1))

        # Where no_answer > 0 use the flag-weighted probability mass,
        # otherwise the usual span cross-entropy.
        losses = tf.where(
            self.no_answer > 0,
            tf.multiply(na_flag1, tf.reduce_sum(reduced1, axis=1)),
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                       labels=self.y1))
        losses2 = tf.where(
            self.no_answer > 0,
            tf.multiply(na_flag2, tf.reduce_sum(reduced2, axis=1)),
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2,
                                                       labels=self.y2))

        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() returns None for variables the EMA does not
                # track; test for that explicitly instead of relying on the
                # truthiness of a TensorFlow variable.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def __init__(self, config, batch, word_mat=None,char_mat=None, filter_sizes=None, embedding_size=None,num_filters=None,trainable=True, l2_reg_lambda=0.0, keep_prob=0.9, graph=None):
    """Build a three-way classifier graph combining conv encoders and GRUs.

    Pulls the input tensors from ``batch.get_next()``, embeds characters
    (bidirectional GRU) and words, then runs two branches: a "cnn_predict"
    branch (highway + residual blocks + attention + conv/max-pool) and an
    "encoding"/"attention" branch (GRU + attention + conv/max-pool).  The
    two pooled feature vectors are concatenated and projected to 3 scores.

    Args:
        config: configuration object; the attributes read here include
            use_cudnn, batch_size, para_limit, ques_limit, char_limit,
            hidden, char_dim, char_hidden, num_heads, keep_prob, init_lr
            and grad_clip.
        batch: iterator whose ``get_next()`` yields the input tensors
            (7 tensors when ``trainable``, 5 otherwise).
        word_mat: initial word embedding matrix.
        char_mat: initial character embedding matrix.
        filter_sizes: iterable of convolution filter heights.
        embedding_size: not used in this method.
        num_filters: number of filters per filter size.
        trainable: whether to build the loss/accuracy part of the graph.
        l2_reg_lambda: weight of the accumulated L2 loss.
        keep_prob: keep probability for the final dropout.
        graph: graph to store on ``self.graph``; a new one if None.

    NOTE(review): the source this block was recovered from had lost its
    indentation, so the nesting of the ``with``/``if`` bodies below is a
    best guess -- confirm against the original file, since variable-scope
    nesting determines checkpoint variable names.
    """

    # Placeholders for input, output and dropout
    self.config = config
    self.graph = graph if graph is not None else tf.Graph()
    self.trainable = trainable
    gru = cudnn_gru if config.use_cudnn else native_gru
    # NOTE(review): a boolean flag created with trainable=True -- confirm intended.
    self.is_train = tf.get_variable("is_train", shape=[], dtype=tf.bool, trainable=True)
    if trainable == True:
        self.input_x, self.input_x1, self.ch, self.qh, self.input_y, self.qa_id,self.alternatives_tokens = batch.get_next()  # original note: self.y1 is (64, 3); self.alterh batch is [batch, 3, alternative_len, char_len]
    else:
        self.input_x, self.input_x1, self.ch, self.qh,self.alternatives_tokens= batch.get_next()  # original note: self.y1 is (64, 3); self.alterh batch is [batch, 3, alternative_len, char_len]
    self.dropout_keep_prob =keep_prob
    self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32, initializer=tf.constant_initializer(0), trainable=False)
    self.dropout = tf.placeholder_with_default(0.5, (), name="dropout")
    # Keeping track of l2 regularization loss (optional)
    l2_loss = tf.constant(0.0)
    self.c_mask = tf.cast(self.input_x, tf.bool)  # True at non-zero (non-padding) context positions; original note: (64, 400)
    self.q_mask = tf.cast(self.input_x1, tf.bool)  # same for the question; original note: (64, 50)
    self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)  # number of non-zero tokens in each context
    self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)  # number of non-zero tokens in each question
    # Number of non-zero characters per word, flattened over all words.
    self.ch_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2), [-1])
    self.qh_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2), [-1])

    # Embedding layer
    N, PL, QL, CL, d, dc,dg,nh= config.batch_size,config.para_limit,config.ques_limit,config.char_limit,\
        config.hidden, config.char_dim,config.char_hidden,config.num_heads
    with tf.variable_scope("Input_Embedding_Layer"):
        self.char_mat = tf.get_variable("char_mat", initializer=tf.constant(char_mat, dtype=tf.float32),trainable=True)
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
        cell_fw = tf.contrib.rnn.GRUCell(dg)  # GRU cells with dg units for the character-level RNN
        cell_bw = tf.contrib.rnn.GRUCell(dg)
        _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
        # original note: self.ch_len holds the real character count of every
        # word after flattening, so sequence_length has N * PL entries;
        # char_hidden is 100, so state_fw and state_bw are [N * PL, 100]
        ch_emb = tf.concat([state_fw, state_bw], axis=1)  # original note: [N * PL, 200]
        # The same two cells are reused for the question characters.
        _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, qh_emb, self.qh_len,dtype=tf.float32)
        # original note: state_* is [N*QL]
        qh_emb = tf.concat([state_fw, state_bw], axis=1)  # original note: question embedding is [, 200]
        qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])  # original note: [batch_size, que_len, 200]
        ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
        # original note: the steps above correspond to the paper's
        # "the character-level embeddings are generated by ... in the token";
        # each word's characters are turned into a character-level word
        # embedding.  The final RNN state is used as the character
        # information and concatenated with the word embedding -- could
        # pinyin serve as the character-level information for Chinese?
        # Worth trying.
        print(qh_emb,"llllllllllllll")

    with tf.name_scope("embedding"):
        self.W = tf.get_variable("word_mat", initializer=tf.constant(word_mat, dtype=tf.float32), trainable=True)
        self.c_mask = tf.cast(self.input_x, tf.bool)  # original note: contexts are zero-padded to a common length
        self.q_mask = tf.cast(self.input_x1, tf.bool)
        if trainable:
            self.c_maxlen, self.q_maxlen, = config.para_limit, config.ques_limit,
        else:
            self.c_maxlen, self.q_maxlen = config.test_para_limit, config.test_ques_limit
        self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
        self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)
        # Word vectors followed by character-level vectors on the feature axis.
        c_emb = tf.concat([self.embedded_chars, ch_emb], axis=2)
        q_emb= tf.concat([self.embedded_chars1, qh_emb], axis=2)
        # self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
        # self.embedded_chars_expanded1 = tf.expand_dims(self.embedded_chars1, -1)

    with tf.variable_scope("cnn_predict"):
        pooled_outputs = []
        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        # original note: acts as a filter on the information and reduces the
        # representation to 75 dims, [batch, seq_len, 75]
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        # original note: this function implements everything in equation (4)
        rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape(
        ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)
        # original note: this is equation (3); [batch, c_maxlen, 150]
        print(att,"111111111111111111111111")
        c_emb_expanded_shape=att.get_shape().as_list()
        # Add a trailing channel axis for conv2d.
        c_emb_expanded=tf.expand_dims(att, -1)
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size,c_emb_expanded_shape[-1], 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                l2_loss += tf.nn.l2_loss(W)
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                l2_loss += tf.nn.l2_loss(b)
                conv_ouput = tf.nn.conv2d(
                    c_emb_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv_ouput, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, c_emb_expanded_shape[1]- filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                print(pooled,"222222222222222222222")
                pooled_outputs.append(pooled)
        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat_cnn = tf.reshape(self.h_pool, [-1, num_filters_total])

    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape(
        ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
        # original note: input_size is the embedding length; this initialises
        # a bidirectional GRU including its initial states
        c = rnn(c_emb, seq_len=self.c_len)
        # original note: context encoding is [batch_size, sequence_length,
        # 150*3]; num_layers is 3 and the layers are concatenated, each layer
        # being 150 wide (75 forward + 75 backward)
        q = rnn(q_emb, seq_len=self.q_len)  # question encoding

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train)
        # original note: this function implements all of equation (4)
        rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape(
        ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)
        # original note: this is equation (3); [batch, c_maxlen, 150]

    # Create a convolution + maxpool layer for each filter size
    input_shape=att.get_shape().as_list()
    print(att,"rrrr")
    att=tf.expand_dims(att,-1)
    print(att,"hhhhhhhhhhhh")
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, input_shape[-1], 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            l2_loss += tf.nn.l2_loss(W)
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            l2_loss += tf.nn.l2_loss(b)
            conv_ouput = tf.nn.conv2d(
                att,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv_ouput, b), name="relu")
            # Maxpooling over the outputs
            # NOTE(review): the pool height here comes from config.para_limit,
            # while the first branch uses the static shape of its input --
            # confirm both agree when trainable is False.
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, config.para_limit - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            print(pooled,"3333333333333333333333333")
            pooled_outputs.append(pooled)

    # Combine all the pooled features
    num_filters_total = num_filters * len(filter_sizes)
    self.h_pool = tf.concat(pooled_outputs, 3)
    self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

    # Add dropout
    with tf.name_scope("dropout"):
        self.h_drop_lstm = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
        self.h_drop_cnn=tf.nn.dropout(self.h_pool_flat_cnn, self.dropout_keep_prob)
        # Features of both branches side by side.
        self.h_drop=tf.concat([self.h_drop_lstm,self.h_drop_cnn],axis=-1)

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        W = tf.get_variable(
            "W",
            shape=[num_filters_total*2, 3],
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    # Calculate mean cross-entropy loss
    # NOTE(review): self.loss and self.input_y only exist when trainable is
    # true, so the optimizer is placed inside this branch here -- confirm
    # against the original indentation.
    if trainable:
        with tf.name_scope("loss"):
            print(self.scores,self.input_y, "llllllllllllllll")
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

        # if config.decay is not None:
        #     self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        #     ema_op = self.var_ema.apply(tf.trainable_variables())
        #     with tf.control_dependencies([ema_op]):
        #         self.loss = tf.identity(self.loss)
        #
        #         self.assign_vars = []
        #         for var in tf.global_variables():
        #             v = self.var_ema.average(var)
        #             if v:
        #                 self.assign_vars.append(tf.assign(var, v))

        # Logarithmic warm-up of the learning rate, capped at config.init_lr.
        self.lr = tf.minimum(config.init_lr, 0.001 / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1))
        self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7)
        grads = self.opt.compute_gradients(self.loss)
        gradients, variables = zip(*grads)
        # Clip by global norm before applying the update.
        capped_grads, _ = tf.clip_by_global_norm(
            gradients, config.grad_clip)
        self.train_op = self.opt.apply_gradients(
            zip(capped_grads, variables), global_step=self.global_step)
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
def forward(self):
    """Build a QANet-style graph that scores three answer alternatives.

    Encodes the passage, the question and three alternatives with shared
    embedding/encoder weights, runs context-query attention and the model
    encoder, summarises the question and each alternative with a shared
    GRU, and stores a softmax over three cosine similarities in
    ``self.logits1``.

    NOTE(review): the source this block was recovered from had lost its
    indentation; the nesting below is a best guess -- confirm against the
    original file.
    """
    config = self.config
    N, PL, QL, CL, d, dc, nh, AL1,AL2,AL3= config.batch_size,self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads,self.aletr1_maxlen, \
        self.aletr2_maxlen,self.aletr3_maxlen

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.ch),
            [N * PL, CL, dc])  # [total number of words, characters per word, char embedding dim]
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        self.alternati_emb1 = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.alter1h),
            [N * AL1, CL, dc])  # original note: (875, 25, 20)
        self.alternati_emb2 = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.alter2h),
            [N * AL2, CL, dc])  # original note: (768, 16, 300)
        self.alternati_emb3 = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.alter3h),
            [N * AL3, CL, dc])  # original note: (768, 16, 300)
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
        alternati_emb1 = tf.nn.dropout(self.alternati_emb1, 1.0 - 0.5 * self.dropout)
        alternati_emb2 = tf.nn.dropout(self.alternati_emb2, 1.0 - 0.5 * self.dropout)
        alternati_emb3 = tf.nn.dropout(self.alternati_emb3, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder; the calls below produce the
        # convolved character features (one shared "char_conv").
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)  # original note: [batch, feature_len, d]
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)
        alternati_emb1 = conv(alternati_emb1, d, bias=True, activation=tf.nn.relu,
                              kernel_size=5, name="char_conv", reuse=True)
        alternati_emb2 = conv(alternati_emb2, d, bias=True, activation=tf.nn.relu,
                              kernel_size=5, name="char_conv", reuse=True)
        alternati_emb3 = conv(alternati_emb3, d, bias=True, activation=tf.nn.relu,
                              kernel_size=5, name="char_conv", reuse=True)

        ch_emb = tf.reduce_max(
            ch_emb, axis=1)  # max feature along axis 1; original note: k-max pooling could be tried here instead
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        alternati_emb1 = tf.reduce_max(alternati_emb1, axis=1)
        alternati_emb2 = tf.reduce_max(alternati_emb2, axis=1)
        alternati_emb3 = tf.reduce_max(alternati_emb3, axis=1)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])  # back to one row per example
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
        alternati_emb1 = tf.reshape(alternati_emb1, [N, AL1, qh_emb.shape[-1]])
        alternati_emb2 = tf.reshape(alternati_emb2, [N, AL2, qh_emb.shape[-1]])
        alternati_emb3 = tf.reshape(alternati_emb3, [N, AL3, qh_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)
        # Word vectors of the three alternatives (no dropout applied here).
        alter_embedding1 = tf.nn.embedding_lookup(self.word_mat, self.alter1)
        alter_embedding2 = tf.nn.embedding_lookup(self.word_mat, self.alter2)
        alter_embedding3 = tf.nn.embedding_lookup(self.word_mat, self.alter3)

        c_emb = tf.concat(
            [c_emb, ch_emb],
            axis=2)  # word vectors joined with character features on the last axis
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        alter_embedding1 = tf.concat([alter_embedding1, alternati_emb1], axis=2)
        alter_embedding2 = tf.concat([alter_embedding2, alternati_emb2], axis=2)
        alter_embedding3 = tf.concat([alter_embedding3, alternati_emb3], axis=2)

        c_emb = highway(
            c_emb, size=d, scope="highway", dropout=self.dropout,
            reuse=None)  # original note: filters the information and reduces the width to 75, [batch, seq_len, 75]
        # NOTE(review): this stores the passage embedding under an
        # "alter_embedding1" attribute name -- confirm intended.
        self.alter_embedding1 = c_emb
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)
        alter_embedding1 = highway(alter_embedding1, size=d, scope="highway",
                                   dropout=self.dropout, reuse=True)
        alter_embedding2 = highway(alter_embedding2, size=d, scope="highway",
                                   dropout=self.dropout, reuse=True)
        alter_embedding3 = highway(alter_embedding3, size=d, scope="highway",
                                   dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)
        # NOTE(review): seq_len here is self.alterh1_len while the other two
        # alternatives use self.alter2_len / self.alter3_len -- confirm this
        # is not a typo for self.alter1_len.
        alter1 = residual_block(
            alter_embedding1,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.alter1_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.alterh1_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)
        alter2 = residual_block(
            alter_embedding2,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.alter2_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.alter2_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)
        alter3 = residual_block(
            alter_embedding3,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.alter3_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.alter3_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
        # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
        # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        # Softmax along axis 1, then transpose the last two axes.
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope('question_rnn'):
        # One GRU cell object is used for the question and all alternatives.
        self.gru = tf.contrib.rnn.GRUCell(d)
        initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
        output, state = tf.nn.dynamic_rnn(self.gru, q, initial_state=initstate)
        # self.qandc=tf.concat([self.q2c,self.c2q],axis=2)
        # self.qandc=dense(self.qandc,d)
        # output,state=tf.nn.dynamic_rnn(self.gru,self.qandc,initial_state=initstate)#(32,?,75)
        # Each alternative is read starting from the final question state.
        output1, state1 = tf.nn.dynamic_rnn(self.gru, alter1, initial_state=state)
        output2, state2 = tf.nn.dynamic_rnn(self.gru, alter2, initial_state=state)
        output3, state3 = tf.nn.dynamic_rnn(self.gru, alter3, initial_state=state)

        # Add a trailing axis so the question state can be matrix-multiplied
        # with the encoder outputs.
        state = tf.expand_dims(state, axis=2)
        weight1 = tf.matmul(self.enc[1], state)
        weight2 = tf.matmul(self.enc[2], state)
        weight3 = tf.matmul(self.enc[3], state)
        # Weight each encoder output and sum over axis 1.
        weight_enc1 = tf.multiply(self.enc[1], weight1)
        weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)
        weight_enc2 = tf.multiply(self.enc[2], weight2)
        weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)
        weight_enc3 = tf.multiply(self.enc[3], weight3)
        weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

    with tf.variable_scope("Output_Layer"):
        # start_logits = tf.squeeze(
        #     conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1, bias=False, name="start_pointer"), -1)
        # end_logits = tf.squeeze(
        #     conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1, bias=False, name="end_pointer"), -1)
        # self.logits = [mask_logits(start_logits, mask=self.c_mask),
        #                mask_logits(end_logits, mask=self.c_mask)]
        #
        # logits1, logits2 = [l for l in self.logits]
        #
        # outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
        #                   tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        # outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        # self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        # self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        similary1 = tf.expand_dims(self.cos_sine(weight_enc1, state1), axis=1)
        similary2 = tf.expand_dims(self.cos_sine(weight_enc2, state2), axis=1)
        similary3 = tf.expand_dims(self.cos_sine(weight_enc3, state3), axis=1)
        # Despite the name, logits1 holds the softmax output over the three
        # similarities.
        self.logits1 = tf.nn.softmax(
            tf.concat([similary1, similary2, similary3], axis=1))
        print(self.logits1, "lllllllllllllllllllllllllllllllllllll")
def _embed(self):
    """Create the embedding tables and the embedded context/question inputs.

    Steps, in graph-construction order:

    1. Build ``self.word_emb_mat`` from a frozen zero row
       (``word_pad_embedding``), a trainable row (``word_unk_embedding``)
       and the remaining frozen rows (``normal_word_embeddings``).
    2. Build ``self.char_emb_mat`` from a frozen zero row and trainable
       remaining rows.
    3. Look up ``self.c`` / ``self.q`` / ``self.cc`` / ``self.qc`` and apply
       dropout (character inputs use half the dropout rate).
    4. Max-reduce the character embeddings over axis 2, project them with
       ``fc`` and fuse them with the word embeddings through a two-layer
       ``highway`` whose weights are shared between context and question.

    Results are left in ``self.emb_c`` and ``self.emb_q``; the intermediate
    tensors stay available as ``self.emb_cc``, ``self.emb_qc``,
    ``self.conv_emb_cc`` and ``self.conv_emb_qc``.
    """
    word_vocab = self.word_vocab
    char_vocab = self.char_vocab
    word_dim = word_vocab.embed_dim
    char_dim = char_vocab.embed_dim

    with tf.device('/cpu:0'):
        # --- word table: [pad row, unk row, remaining rows] ---
        pad_word_row = tf.get_variable(
            'word_pad_embedding',
            shape=(1, word_dim),
            initializer=tf.zeros_initializer,
            trainable=False)
        unk_word_row = tf.get_variable(
            'word_unk_embedding',
            shape=(1, word_dim),
            initializer=tf.zeros_initializer,
            trainable=True)
        # Use the vocabulary's vectors when it has any (skipping the first
        # two rows, which are covered by the variables above); otherwise
        # fall back to random-normal initialisation.
        if word_vocab.embeddings is not None:
            word_init = tf.constant_initializer(word_vocab.embeddings[2:])
        else:
            word_init = tf.random_normal_initializer()
        other_word_rows = tf.get_variable(
            'normal_word_embeddings',
            shape=(word_vocab.size() - 2, word_dim),
            initializer=word_init,
            trainable=False)
        self.word_emb_mat = tf.concat(
            [pad_word_row, unk_word_row, other_word_rows], 0)

        # --- char table: [pad row, remaining rows] ---
        pad_char_row = tf.get_variable(
            'char_pad_embedding',
            shape=(1, char_dim),
            initializer=tf.zeros_initializer,
            trainable=False)
        if char_vocab.embeddings is not None:
            char_init = tf.constant_initializer(char_vocab.embeddings[1:])
        else:
            char_init = tf.random_normal_initializer()
        other_char_rows = tf.get_variable(
            'normal_char_embeddings',
            shape=(char_vocab.size() - 1, char_dim),
            initializer=char_init,
            trainable=True)
        self.char_emb_mat = tf.concat([pad_char_row, other_char_rows], 0)

        # --- lookups + dropout ---
        context_words = tf.nn.embedding_lookup(self.word_emb_mat, self.c)
        self.emb_c = tf.nn.dropout(context_words, 1.0 - self.dropout)
        question_words = tf.nn.embedding_lookup(self.word_emb_mat, self.q)
        self.emb_q = tf.nn.dropout(question_words, 1.0 - self.dropout)

        context_chars = tf.nn.embedding_lookup(self.char_emb_mat, self.cc)
        self.emb_cc = tf.nn.dropout(
            context_chars, 1.0 - 0.5 * self.dropout)
        question_chars = tf.nn.embedding_lookup(self.char_emb_mat, self.qc)
        self.emb_qc = tf.nn.dropout(
            question_chars, 1.0 - 0.5 * self.dropout)

    # Original author's note: "check the paper, it seems to use another
    # operation".  A conv-based character encoder (kernel_size=5, relu,
    # shared weights) was left disabled in favour of the plain max-reduce
    # below.
    # Axis 2 is presumably the characters-per-word axis -- TODO confirm.
    pooled_context_chars = tf.reduce_max(self.emb_cc, 2)
    pooled_question_chars = tf.reduce_max(self.emb_qc, 2)
    self.conv_emb_cc = fc(pooled_context_chars, char_dim, activation_fn=None)
    self.conv_emb_qc = fc(pooled_question_chars, char_dim, activation_fn=None)

    # Fuse word + char features; the second call reuses the first call's
    # highway variables.
    context_joined = tf.concat([self.emb_c, self.conv_emb_cc], axis=2)
    self.emb_c = highway(context_joined,
                         size=self.hidden_size,
                         dropout=self.dropout,
                         num_layers=2,
                         scope='highway',
                         reuse=None)
    question_joined = tf.concat([self.emb_q, self.conv_emb_qc], axis=2)
    self.emb_q = highway(question_joined,
                         size=self.hidden_size,
                         dropout=self.dropout,
                         num_layers=2,
                         scope='highway',
                         reuse=True)
def forward(self, trainable):
    """Build the encoder stack and a 3-way classification head.

    The graph is assembled scope by scope:

    * ``Input_Embedding_Layer`` -- char conv + max-reduce, word lookup,
      concatenation and a shared ``highway``.
    * ``Embedding_Encoder_Layer`` -- one ``residual_block`` shared between
      passage and question.
    * ``Context_to_Query_Attention_Layer`` -- trilinear similarity and the
      ``self.c2q`` / ``self.q2c`` attention outputs.
    * ``Model_Encoder_Layer`` -- three passes through a shared 7-block
      ``residual_block`` stack, collected in ``self.enc``.
    * ``question_rnn`` -- a GRU over the encoded question; its final state
      scores every passage position of ``self.enc[1..3]`` and the weighted
      positions are summed into one vector per encoder output.
    * ``Output_Layer`` -- one linear layer (3 outputs) applied to each of
      the three vectors; ``self.scores`` is their mean and
      ``self.predictions`` its argmax.

    Args:
        trainable: when truthy, the loss, accuracy, optional EMA of the
            trainable variables, the learning-rate schedule and the Adam
            ``self.train_op`` are added as well.

    Side effects:
        Sets many attributes on ``self`` (``c2q``, ``q2c``, ``enc``,
        ``gru``, ``scores*``, ``predictions``, ``saver`` and, when
        ``trainable``, ``loss``, ``accuracy``, ``var_ema``,
        ``assign_vars``, ``lr``, ``opt``, ``train_op``) and accumulates
        into ``self.l2_loss``.
    """
    config = self.config
    N, PL, QL, CL, d, dc, nh = (config.batch_size, self.c_maxlen,
                                self.q_maxlen, config.char_limit,
                                config.hidden, config.char_dim,
                                config.num_heads)

    with tf.variable_scope("Input_Embedding_Layer"):
        # [total number of words, characters per word, char embedding dim]
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.ch),
            [N * PL, CL, dc])
        qh_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.qh),
            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder: the convolution below produces
        # the per-word character features (weights shared via reuse).
        # Original note on the output: [batch, feature_len, d].
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Keep the maximum feature along axis 1.  Original note: k-max
        # pooling could be tried here; max_pooling is not used.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        # Back to one row per sentence position.
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        # Join word and char features: [batch, sequence_len, feature dim].
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        # Highway acts as a filter over the joined features and projects
        # them to size d (original note mentions 75 -- TODO confirm
        # against the config).
        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           # Share the weights between passage and question
                           reuse=True,
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        # Normalise once along the last axis (question mask) and once
        # along axis 1 (passage mask).
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i],
                                            1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               # the three passes share one set of weights
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope('question_rnn'):
        self.gru = tf.contrib.rnn.GRUCell(d)
        initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
        # Only the final GRU state is used; per-step outputs are dropped.
        _, state = tf.nn.dynamic_rnn(self.gru, q, initial_state=initstate)

        # Score every passage position against the question state, then
        # sum the score-weighted positions over axis 1.
        state = tf.expand_dims(state, axis=2)
        weight1 = tf.matmul(self.enc[1], state)
        weight2 = tf.matmul(self.enc[2], state)
        weight3 = tf.matmul(self.enc[3], state)
        weight_enc1 = tf.multiply(self.enc[1], weight1)
        weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)
        weight_enc2 = tf.multiply(self.enc[2], weight2)
        weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)
        weight_enc3 = tf.multiply(self.enc[3], weight3)
        weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

    with tf.variable_scope("Output_Layer"):
        inputs_shape = weight_enc1.get_shape().as_list()
        W = tf.get_variable(
            "W",
            shape=[inputs_shape[-1], 3],
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
        self.l2_loss += tf.nn.l2_loss(W)
        self.l2_loss += tf.nn.l2_loss(b)
        # The same linear layer scores each pooled encoder output; the
        # final score is the mean of the three.
        self.scores1 = tf.nn.xw_plus_b(weight_enc1, W, b, name="scores")
        self.scores2 = tf.nn.xw_plus_b(weight_enc2, W, b, name="scores")
        self.scores3 = tf.nn.xw_plus_b(weight_enc3, W, b, name="scores")
        self.scores = (self.scores1 + self.scores2 + self.scores3) / 3.0
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    if trainable:
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.input_y)
            self.loss = (tf.reduce_mean(losses)
                         + self.l2_reg_lambda * self.l2_loss)

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                # Evaluating the loss now also updates the averages.
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() yields None for variables without a
                    # shadow copy; compare against None explicitly
                    # instead of truth-testing a Variable.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))

        # Learning rate grows with log(global_step + 1) and is capped at
        # config.init_lr.
        self.lr = tf.minimum(
            config.init_lr,
            0.001 / tf.log(999.)
            * tf.log(tf.cast(self.global_step, tf.float32) + 1))
        self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                          beta1=0.8,
                                          beta2=0.999,
                                          epsilon=1e-7)
        grads = self.opt.compute_gradients(self.loss)
        gradients, variables = zip(*grads)
        capped_grads, _ = tf.clip_by_global_norm(gradients,
                                                 config.grad_clip)
        self.train_op = self.opt.apply_gradients(
            zip(capped_grads, variables), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
def build_model(self):
    """Assemble the span-prediction graph with an extra "no answer" slot.

    Builds, scope by scope, the input embedding (char conv + word lookup,
    optional ELMo features, shared highway), the embedding encoder, the
    context-to-query attention, the three-pass model encoder and the
    output layer.  In the output layer a learned scalar
    (``unanswer_bias``) is prepended to the masked start/end logits, so
    index 0 of ``self.logits1`` / ``self.logits2`` is that extra slot and
    ``self.output1`` / ``self.output2`` are shifted back by one.

    Sets ``self.logits1``, ``self.logits2``, ``self.loss``,
    ``self.output1``, ``self.output2`` and, when ``self.decay`` is not
    ``None``, ``self.var_ema`` and ``self.assign_vars``.
    """
    passage_len = self.c_maxlen
    question_len = self.q_maxlen
    char_limit = self.char_limit
    filters = self.filters
    char_dim = self.char_dim
    heads = self.num_heads

    with tf.variable_scope("Input_Embedding_Layer"):
        # Character path: lookup -> dropout -> shared conv -> max over
        # axis 1 -> one feature row per token.
        cont_chars = tf.nn.embedding_lookup(self.char_mat, self.contc_input)
        cont_chars = tf.reshape(cont_chars, [-1, char_limit, char_dim])
        ques_chars = tf.nn.embedding_lookup(self.char_mat, self.quesc_input)
        ques_chars = tf.reshape(ques_chars, [-1, char_limit, char_dim])
        cont_chars = tf.nn.dropout(cont_chars, 1.0 - 0.5 * self.dropout)
        ques_chars = tf.nn.dropout(ques_chars, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        cont_chars = conv(cont_chars, filters, bias=True,
                          activation=tf.nn.relu, kernel_size=5,
                          name="char_conv", reuse=None)
        ques_chars = conv(ques_chars, filters, bias=True,
                          activation=tf.nn.relu, kernel_size=5,
                          name="char_conv", reuse=True)
        cont_chars = tf.reduce_max(cont_chars, axis=1)
        ques_chars = tf.reduce_max(ques_chars, axis=1)

        cont_chars = tf.reshape(
            cont_chars, [-1, passage_len, cont_chars.shape[-1]])
        # The question reshape takes its last dimension from the context
        # tensor, exactly as the original code does.
        ques_chars = tf.reshape(
            ques_chars, [-1, question_len, cont_chars.shape[-1]])

        # Word path.
        cont_words = tf.nn.embedding_lookup(self.word_mat, self.contw_input)
        cont_words = tf.nn.dropout(cont_words, 1.0 - self.dropout)
        ques_words = tf.nn.embedding_lookup(self.word_mat, self.quesw_input)
        ques_words = tf.nn.dropout(ques_words, 1.0 - self.dropout)

        # (A CoVe-based feature branch existed here in the original and
        # was left disabled.)
        cont_feats = tf.concat([cont_words, cont_chars], axis=2)
        ques_feats = tf.concat([ques_words, ques_chars], axis=2)
        if self.use_elmo:
            cont_feats = tf.concat([cont_feats, self.cont_elmo], axis=-1)
            ques_feats = tf.concat([ques_feats, self.ques_elmo], axis=-1)

        cont_feats = highway(cont_feats, size=filters, scope="highway",
                             dropout=self.dropout, reuse=None)
        ques_feats = highway(ques_feats, size=filters, scope="highway",
                             dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        # Passage and question go through the same block (reuse=True on
        # the second call).
        cont_enc = residual_block(cont_feats,
                                  num_blocks=1,
                                  num_conv_layers=4,
                                  kernel_size=7,
                                  mask=self.c_mask,
                                  num_filters=filters,
                                  num_heads=heads,
                                  seq_len=self.cont_len,
                                  scope="Encoder_Residual_Block",
                                  bias=False,
                                  dropout=self.dropout)
        ques_enc = residual_block(ques_feats,
                                  num_blocks=1,
                                  num_conv_layers=4,
                                  kernel_size=7,
                                  mask=self.q_mask,
                                  num_filters=filters,
                                  num_heads=heads,
                                  seq_len=self.ques_len,
                                  scope="Encoder_Residual_Block",
                                  reuse=True,
                                  bias=False,
                                  dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        similarity = optimized_trilinear_for_attention(
            [cont_enc, ques_enc], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        ques_mask = tf.expand_dims(self.q_mask, 1)
        over_question = tf.nn.softmax(
            mask_logits(similarity, mask=ques_mask))
        cont_mask = tf.expand_dims(self.c_mask, 2)
        over_context = tf.nn.softmax(
            mask_logits(similarity, mask=cont_mask), dim=1)
        over_context_t = tf.transpose(over_context, (0, 2, 1))
        c2q = tf.matmul(over_question, ques_enc)
        q2c = tf.matmul(tf.matmul(over_question, over_context_t), cont_enc)
        attention_outputs = [cont_enc, c2q, cont_enc * c2q, cont_enc * q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        attention_inputs = tf.concat(attention_outputs, axis=-1)
        stack = [conv(attention_inputs, filters, name="input_projection")]
        for step in range(3):
            if step % 2 == 0:  # dropout every 2 blocks
                stack[step] = tf.nn.dropout(stack[step],
                                            1.0 - self.dropout)
            # First pass creates the variables, later passes reuse them.
            share = True if step > 0 else None
            stack.append(
                residual_block(stack[step],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=filters,
                               num_heads=heads,
                               seq_len=self.cont_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=share,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_feats = tf.concat([stack[1], stack[2]], axis=-1)
        end_feats = tf.concat([stack[1], stack[3]], axis=-1)
        if self.use_elmo:
            start_feats = tf.concat((start_feats, self.cont_elmo), axis=-1)
            end_feats = tf.concat((end_feats, self.cont_elmo), axis=-1)

        start_scores = conv(start_feats, 1, bias=False,
                            name="start_pointer")
        start_scores = tf.squeeze(start_scores, -1)
        end_scores = conv(end_feats, 1, bias=False, name="end_pointer")
        end_scores = tf.squeeze(end_scores, -1)

        # One learned scalar, repeated self.batch_size times and shaped
        # as a column, is prepended to both logit rows.
        bias_var = tf.get_variable(
            "unanswer_bias", [1],
            regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            initializer=tf.zeros_initializer())
        bias_col = tf.reshape(tf.tile(bias_var, [self.batch_size]), [-1, 1])

        masked_start = mask_logits(start_scores, mask=self.c_mask)
        self.logits1 = tf.concat((bias_col, masked_start), axis=-1)
        masked_end = mask_logits(end_scores, mask=self.c_mask)
        self.logits2 = tf.concat((bias_col, masked_end), axis=-1)

        start_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits1, labels=self.y_start)
        end_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits2, labels=self.y_end)
        self.loss = tf.reduce_mean(start_loss + end_loss)

        if self.l2_norm is not None:
            reg_terms = tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES)
            self.loss += tf.contrib.layers.apply_regularization(
                regularizer, reg_terms)

        # output: outer product of start/end probabilities, limited to a
        # band (start <= end <= start + ans_limit); indices are shifted
        # by -1 because of the prepended bias column.
        start_probs = tf.expand_dims(tf.nn.softmax(self.logits1), axis=2)
        end_probs = tf.expand_dims(tf.nn.softmax(self.logits2), axis=1)
        joint = tf.matmul(start_probs, end_probs)
        joint = tf.matrix_band_part(joint, 0, self.ans_limit)
        self.output1 = tf.argmax(tf.reduce_max(joint, axis=2), axis=1) - 1
        self.output2 = tf.argmax(tf.reduce_max(joint, axis=1), axis=1) - 1

    if self.decay is None:
        return

    self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
    ema_op = self.var_ema.apply(tf.trainable_variables())
    with tf.control_dependencies([ema_op]):
        self.loss = tf.identity(self.loss)
        # Pair every global variable with its average (None when it has
        # none) and keep an assign op for those that have one.
        averaged = ((var, self.var_ema.average(var))
                    for var in tf.global_variables())
        self.assign_vars = [tf.assign(var, avg)
                            for var, avg in averaged
                            if avg is not None]
def forward(self):
    """Build the span-prediction graph (embedding, encoders, attention, output).

    Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
    ``self.loss``, ``self.yp1`` and ``self.yp2``; extends
    ``self.debug_ops``; and, when ``config.decay`` is not ``None``, sets
    ``self.var_ema``, ``self.shadow_vars``, ``self.global_vars`` and
    ``self.assign_vars``.

    Shape notes in the comments below are carried over from the original
    author's annotations (N = batch size, d = ``config.hidden``); the
    concrete numbers are examples from those annotations, not values
    checked here.
    """
    config = self.config
    N = config.batch_size if not self.demo else 1
    PL = self.c_maxlen
    QL = self.q_maxlen
    CL = config.char_limit  # e.g. 16
    d = config.hidden  # e.g. 96
    dc = config.char_dim  # e.g. 64
    nh = config.num_heads  # e.g. 1

    with tf.variable_scope("Input_Embedding_Layer"):
        # self.ch : (N, c_maxlen, CL)
        # self.qh : (N, q_maxlen, CL)
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.ch),
            [N * PL, CL, dc])  # (N*c_maxlen, CL, dc)
        qh_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.qh),
            [N * QL, CL, dc])  # (N*q_maxlen, CL, dc)
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF style conv-highway encoder: conv over the chars of each
        # word in a batch of passages (weights shared with the question).
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv",
                      reuse=None)  # (N*c_maxlen, CL-5+1, d)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv",
                      reuse=True)  # (N*q_maxlen, CL-5+1, d)

        ch_emb = tf.reduce_max(ch_emb, axis=1)  # (N*c_maxlen, d)
        qh_emb = tf.reduce_max(qh_emb, axis=1)  # (N*q_maxlen, d)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        # Use the question tensor's own feature size (the original read it
        # from ch_emb, which has the same last dimension).
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

        # self.c : (N, c_maxlen)
        # self.q : (N, q_maxlen)
        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)  # (N, c_maxlen, word dim)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)  # (N, q_maxlen, word dim)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)  # word dim + d
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)  # (N, c_maxlen, d)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)  # (N, q_maxlen, d)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        # Per the original note, each block is: positional encoding ->
        # layer normalization -> depth-wise separable convolution ->
        # self attention -> feed forward network.  In the paper the total
        # number of encoder blocks here is 1.
        # (N, c_maxlen, d)
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # (N, q_maxlen, d)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           # Share the weights between passage and question
                           reuse=True,
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # tf.tile replicates its input multiples[i] times along axis i.
        #
        # c:      (N, c_maxlen, d)        q:      (N, q_maxlen, d)
        # C, Q:   (N, c_maxlen, q_maxlen, d)
        # S:      (N, c_maxlen, q_maxlen)
        # mask_q: (N, 1, q_maxlen)        mask_c: (N, c_maxlen, 1)
        # S_:     (N, c_maxlen, q_maxlen) S_T:    (N, q_maxlen, c_maxlen)
        # self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
        # self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)
        C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q]
        if config.q2c:
            attention_outputs.append(c * self.q2c)

    with tf.variable_scope("Model_Encoder_Layer"):
        # Paper note (from the original annotations): the layer parameters
        # are the same as the Embedding Encoder Layer except that the
        # convolution layer number is 2 within a block and the total
        # number of blocks is 7.
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i],
                                            1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               # the three passes share one set of weights
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        # tf.matrix_band_part zeroes everything outside a central band of
        # each innermost matrix.
        #
        # self.enc[i]:              (N, c_maxlen, d)
        # start_logits, end_logits: (N, c_maxlen)
        # logits1, logits2:         (N, c_maxlen)
        # outer:                    (N, c_maxlen, c_maxlen)
        # yp1, yp2, losses, losses2: (N,)
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                 1, bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                 1, bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask),
        ]
        logits1, logits2 = self.logits

        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

        # find max-score span: keep only pairs with
        # start <= end <= start + 15.
        outer = tf.matmul(
            tf.expand_dims(tf.nn.softmax(logits1), axis=2),
            tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        # DEBUG
        self.debug_ops.extend([
            self.enc[1], start_logits, end_logits, logits1, logits2,
            outer, self.yp1, self.yp2, losses, losses2, self.loss
        ])

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            # Evaluating the loss now also updates the averages.
            self.loss = tf.identity(self.loss)

            self.shadow_vars = []
            self.global_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() yields None for variables without a shadow
                # copy; compare against None explicitly instead of
                # truth-testing a Variable.
                if v is not None:
                    self.shadow_vars.append(v)
                    self.global_vars.append(var)

            self.assign_vars = [
                tf.assign(g, v)
                for g, v in zip(self.global_vars, self.shadow_vars)
            ]
def build_model(self):
    """Build the span-prediction graph, optionally with a top-k re-ranking head.

    The base graph (input embedding, embedding encoder, context-to-query
    attention, model encoder, output layer) produces ``self.logits1`` /
    ``self.logits2``, the span loss ``self.loss`` and the best span
    ``self.output1`` / ``self.output2``.

    When ``self.use_topk`` is set, a ``Topk_Layer`` takes the 3 highest
    scoring (start, end) pairs, builds a pooled answer vector for each and
    a pooled question vector, scores them against each other and adds a
    loss on those scores (``self.topk_loss`` selects ``'f1'`` or ``'em'``
    targets).  ``self.diversity_loss`` optionally adds a term computed
    from the product of the candidate span masks.

    Finally the optional L2 term (``self.l2_norm``) and the exponential
    moving average of the trainable variables (``self.decay``) are added.

    Raises:
        ValueError: if ``self.use_topk`` is set and ``self.topk_loss`` is
            neither ``'f1'`` nor ``'em'`` (the original code failed later
            with an unbound-variable error in that case).
    """
    PL, QL, CL, d, dc, nh = (self.c_maxlen, self.q_maxlen, self.char_limit,
                             self.filters, self.char_dim, self.num_heads)

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.contc_input),
            [-1, CL, dc])
        qh_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.quesc_input),
            [-1, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder (conv weights shared).
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        ch_emb = tf.reshape(ch_emb, [-1, PL, ch_emb.shape[-1]])
        # Use the question tensor's own feature size (the original read it
        # from ch_emb, which has the same last dimension).
        qh_emb = tf.reshape(qh_emb, [-1, QL, qh_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.contw_input),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.quesw_input),
            1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        if self.use_elmo:
            c_emb = tf.concat([c_emb, self.cont_elmo], axis=-1)
            q_emb = tf.concat([q_emb, self.ques_elmo], axis=-1)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.cont_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # Second call reuses the passage encoder's weights.
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.ques_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        c2q = tf.matmul(S_, q)
        q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, c2q, c * c2q, c * q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        attention_inputs = tf.concat(attention_outputs, axis=-1)
        enc = [conv(attention_inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                enc[i] = tf.nn.dropout(enc[i], 1.0 - self.dropout)
            enc.append(
                residual_block(enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.cont_len,
                               scope="Model_Encoder",
                               bias=False,
                               # the three passes share one set of weights
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.concat([enc[1], enc[2]], axis=-1)
        end_logits = tf.concat([enc[1], enc[3]], axis=-1)
        if self.use_elmo:
            start_logits = tf.concat((start_logits, self.cont_elmo),
                                     axis=-1)
            end_logits = tf.concat((end_logits, self.cont_elmo), axis=-1)

        start_logits = tf.squeeze(
            conv(start_logits, 1, bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(end_logits, 1, bias=False, name="end_pointer"), -1)

        # The learned "unanswer_bias" column used for the "2.0 Dataset"
        # is disabled in this variant: the logits are just the masked
        # pointer scores.
        self.logits1 = mask_logits(start_logits, mask=self.c_mask)
        self.logits2 = mask_logits(end_logits, mask=self.c_mask)

        start_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits1, labels=self.y_start)
        end_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits2, labels=self.y_end)
        self.loss = tf.reduce_mean(start_loss + end_loss)

        # output: outer product of start/end probabilities restricted to
        # start <= end <= start + ans_limit.
        outer = tf.matmul(
            tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
            tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, self.ans_limit)
        self.output1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.output2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

    if self.use_topk:
        with tf.variable_scope("Topk_Layer"):
            top_size = 3

            # Flatten the (start, end) grid and take the top_size best
            # cells; recover start/end from the flat index.
            outer = tf.reshape(outer, [self.batch_size, -1])
            outer_inds = tf.nn.top_k(outer, top_size).indices  # [N, top_size]
            self.yp1 = outer_inds // tf.shape(self.logits1)[-1]
            self.yp2 = outer_inds % tf.shape(self.logits2)[-1]

            def sen_mask(tensor):
                """Map rows of (a, b) index pairs to 0/1 span masks."""

                def sen_mask_(a, b, filters):
                    # Rows [0, a) are 0, rows [a, b) are 1 and rows
                    # [b, length) are 0; each row has `filters` columns.
                    try:
                        mata = tf.zeros([a, filters], tf.int32)
                    except Exception:
                        # Fallback kept from the original (which used a
                        # bare ``except``).
                        # NOTE(review): an empty Python list is unlikely
                        # to concat with the int32 matrices below --
                        # confirm this path is reachable/needed.
                        mata = []
                    matb = tf.ones([b - a, filters], tf.int32)
                    matc = tf.zeros(
                        [tf.shape(self.logits1)[-1] - b, filters], tf.int32)
                    return tf.concat((mata, matb, matc), axis=0)

                return tf.map_fn(
                    lambda x: sen_mask_(x[0], x[1], self.filters), tensor)

            # yp3 = end + 1 serves as the exclusive upper bound of the span.
            self.yp3 = self.yp2 + 1
            self.yp1 = tf.expand_dims(self.yp1, -1)
            self.yp2 = tf.expand_dims(self.yp2, -1)
            self.yp3 = tf.expand_dims(self.yp3, -1)
            self.y_mask = tf.concat([self.yp1, self.yp3], axis=-1)
            self.y_mask = tf.map_fn(sen_mask, self.y_mask)

            # answer: mask c2q down to each candidate span and pool it.
            c = tf.tile(tf.expand_dims(c2q, 1), [1, top_size, 1, 1])
            c_topk = tf.multiply(tf.cast(self.y_mask, tf.float32), c)
            W1 = tf.get_variable(
                "W1", initializer=tf.ones([1, 1, 1, self.filters]))
            W1 = tf.tile(W1, [self.batch_size, top_size, 1, 1])
            # NOTE(review): the matmul result looks like
            # [N, top_size, 1, length], which would make axis=2 a size-1
            # axis and the softmax a constant 1 -- confirm the intended
            # axis.
            alpha1 = tf.nn.softmax(
                tf.matmul(W1, c_topk, transpose_b=True), axis=2)
            answer = tf.matmul(alpha1, c_topk)  # [N, top_size, 1, filters]

            # question: pool the encoded question the same way.
            W2 = tf.get_variable(
                "W2", initializer=tf.ones([1, 1, self.filters]))
            W2 = tf.tile(W2, [self.batch_size, 1, 1])
            # NOTE(review): same concern as alpha1 -- axis=1 appears to
            # be a size-1 axis here; confirm.
            alpha2 = tf.nn.softmax(
                tf.matmul(W2, q, transpose_b=True), axis=1)
            ques = tf.matmul(alpha2, q)
            ques = tf.tile(tf.expand_dims(ques, 1),
                           [1, top_size, 1, 1])  # [N, top_size, 1, filters]

            # question & answer: bilinear score squashed by a sigmoid.
            W3 = tf.get_variable(
                "W3",
                initializer=tf.ones([1, 1, self.filters, self.filters]))
            W3 = tf.tile(W3, [self.batch_size, top_size, 1, 1])
            y_topk_logits = tf.nn.sigmoid(
                tf.matmul(ques, tf.matmul(W3, answer, transpose_b=True))
            )  # [N, top_size, 1, 1]
            # Squeeze only the two trailing singleton axes so that a batch
            # of size 1 keeps its batch dimension.
            y_topk_logits = tf.squeeze(y_topk_logits, axis=[2, 3])
            self.yp1 = tf.squeeze(self.yp1, axis=-1)  # [N, top_size]
            self.yp2 = tf.squeeze(self.yp2, axis=-1)  # [N, top_size]

            # Span indicator from one-hot start/end positions, e.g.
            # start=[0,1,0,0,0], end=[0,0,0,1,0]:
            #   cumsum(start) - cumsum(end) + end
            #   = [0,1,1,1,1] - [0,0,0,1,1] + [0,0,0,1,0] = [0,1,1,1,0]
            coeff1_topk = tf.one_hot(self.yp1, self.c_maxlen, axis=-1)
            coeff2_topk = tf.one_hot(self.yp2, self.c_maxlen, axis=-1)
            coeff1_topk_cumsum = tf.cumsum(coeff1_topk, axis=-1)
            coeff2_topk_cumsum = tf.cumsum(coeff2_topk, axis=-1)
            self.y_d = (coeff1_topk_cumsum - coeff2_topk_cumsum
                        + coeff2_topk)  # [N, top_size, c_maxlen]

            def clip_for_sigmoid(output):
                """Turn probabilities back into logits (clipped log-odds)."""
                _epsilon = tf.convert_to_tensor(
                    1e-7, dtype=output.dtype.base_dtype)
                output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
                return tf.log(output / (1 - output))

            if self.topk_loss == 'f1':
                # f1 loss: target is the token-level F1 between each
                # candidate span and the ground-truth span.
                y_start_ind = tf.cumsum(self.y_start, axis=-1)
                y_end_ind = tf.cumsum(self.y_end, axis=-1)
                y_gtd = y_start_ind - y_end_ind + self.y_end  # [N, c_maxlen]

                def cal_num_same(y_pred, y_truth):
                    # y_pred: [top_size, c_maxlen], y_truth: [c_maxlen]
                    def cal_num_same_(y_pred_, y_truth):
                        overlap = tf.logical_and(
                            tf.cast(y_pred_, tf.bool),
                            tf.cast(y_truth, tf.bool))
                        return tf.reduce_sum(
                            tf.cast(overlap, tf.float32), axis=-1)

                    # The same result is returned twice; presumably so the
                    # output structure matches the two-element input of
                    # the outer map_fn -- TODO confirm.
                    return [
                        tf.map_fn(lambda x: cal_num_same_(x, y_truth),
                                  y_pred),
                        tf.map_fn(lambda x: cal_num_same_(x, y_truth),
                                  y_pred),
                    ]

                num_same = tf.map_fn(
                    lambda x: cal_num_same(x[0], x[1]),
                    [self.y_d, y_gtd])[0]  # [N, top_size]
                y_precision = num_same / (
                    tf.cast(tf.reduce_sum(self.y_d, axis=-1), tf.float32)
                    + 1e-8)  # [N, top_size]
                y_recall = num_same / tf.expand_dims(
                    tf.cast(tf.reduce_sum(y_gtd, axis=-1), tf.float32)
                    + 1e-8, axis=-1)  # [N, top_size]
                y_f1 = (2.0 * y_precision * y_recall) / (
                    tf.cast(y_precision + y_recall, tf.float32)
                    + 1e-8)  # [N, top_size]
                topk_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=clip_for_sigmoid(y_topk_logits),
                        labels=y_f1))
            elif self.topk_loss == 'em':
                # em loss: target is 1 when both start and end match the
                # ground truth exactly.
                start_em = tf.equal(
                    tf.cast(tf.expand_dims(
                        tf.argmax(self.y_start, axis=-1), axis=1), tf.int32),
                    tf.cast(self.yp1, tf.int32))  # [N, top_size]
                end_em = tf.equal(
                    tf.cast(tf.expand_dims(
                        tf.argmax(self.y_end, axis=-1), axis=1), tf.int32),
                    tf.cast(self.yp2, tf.int32))  # [N, top_size]
                y_em = tf.cast(tf.logical_and(start_em, end_em),
                               tf.float32)  # [N, top_size]
                topk_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=clip_for_sigmoid(y_topk_logits),
                        labels=y_em))
            else:
                # Fail at the cause instead of with an unbound
                # `topk_loss` a few lines further down.
                raise ValueError(
                    "unsupported topk_loss: {!r} (expected 'f1' or 'em')"
                    .format(self.topk_loss))

            # final loss: fixed (non-trainable) mix of span and top-k loss.
            self.Lambda1 = tf.get_variable(
                "Lambda1", initializer=tf.constant([0.9]), trainable=False)
            self.loss = tf.reduce_mean(
                self.Lambda1 * (start_loss + end_loss)
                + (1 - self.Lambda1) * topk_loss)

            # output (recomputed from the logits; same formula as in the
            # output layer).
            outer_topk = tf.matmul(
                tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
            outer_topk = tf.matrix_band_part(outer_topk, 0, self.ans_limit)
            self.output1 = tf.argmax(
                tf.reduce_max(outer_topk, axis=2), axis=1)
            self.output2 = tf.argmax(
                tf.reduce_max(outer_topk, axis=1), axis=1)

            # diversity loss: product of the candidate masks over the
            # top_size axis, averaged over positions.
            if self.diversity_loss:
                self.Lambda2 = tf.get_variable(
                    "Lambda2", initializer=tf.constant([0.1]),
                    trainable=False)
                # [N, top_size, c_maxlen] -> [N, c_maxlen] -> [N]
                diversity_loss = tf.reduce_mean(
                    tf.reduce_prod(self.y_d, axis=1), axis=-1)
                self.loss = self.loss + tf.reduce_mean(
                    self.Lambda2 * diversity_loss)

    if self.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if self.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            # Evaluating the loss now also updates the averages.
            self.loss = tf.identity(self.loss)
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def BuildModel(self):
    """Assemble the multi-turn response-matching graph and its training op.

    Creates the input placeholders, two non-trainable embedding tables that
    are filled through ``self.embedding_init`` / ``self.char_embeddings_init``,
    one matching vector per utterance turn (word-level and GRU-level
    similarity matrices -> conv2d -> max-pool -> dense), a GRU over the
    matching vectors, and a 2-way softmax head optimised with Adam.

    Sets ``self.y_pred``, ``self.total_loss`` and ``self.train_op`` among
    other attributes.
    """
    print("preprocessing build model....")

    n_turns = self.max_num_utterance
    sent_len = self.max_sentence_len

    # ---- word-level inputs --------------------------------------------
    self.utterance_ph = tf.placeholder(
        tf.int32, shape=(None, n_turns, sent_len))
    self.response_ph = tf.placeholder(tf.int32, shape=(None, sent_len))
    self.y_true = tf.placeholder(tf.int32, shape=(None, ))
    self.embedding_ph = tf.placeholder(
        tf.float32, shape=(self.total_words, self.word_embedding_size))
    self.response_len = tf.placeholder(tf.int32, shape=(None, ))
    self.all_utterance_len_ph = tf.placeholder(
        tf.int32, shape=(None, n_turns))

    word_table = tf.get_variable(
        'word_embeddings_v',
        shape=(self.total_words, self.word_embedding_size),
        dtype=tf.float32,
        trainable=False)
    # Run this op with `embedding_ph` fed to load the word vectors.
    self.embedding_init = word_table.assign(self.embedding_ph)

    utterance_words = tf.nn.embedding_lookup(word_table, self.utterance_ph)
    response_words = tf.nn.embedding_lookup(word_table, self.response_ph)
    sentence_cell = tf.nn.rnn_cell.GRUCell(
        self.rnn_units, kernel_initializer=tf.orthogonal_initializer())
    # One tensor per utterance turn.
    per_turn_words = tf.unstack(utterance_words, num=n_turns, axis=1)
    per_turn_lengths = tf.unstack(
        self.all_utterance_len_ph, num=n_turns, axis=1)

    # ---- char-level inputs --------------------------------------------
    self.response_cph = tf.placeholder(
        tf.int32, shape=(None, sent_len, self.wordlen))
    self.embedding_cph = tf.placeholder(
        tf.float32, shape=(self.total_chars, self.char_embedding_dim))
    char_table = tf.get_variable(
        'char_embeddings_v',
        shape=(self.total_chars, self.char_embedding_dim),
        dtype=tf.float32,
        trainable=False)
    self.char_embeddings_init = char_table.assign(self.embedding_cph)
    response_chars = tf.nn.embedding_lookup(char_table, self.response_cph)
    self.utterance_cph = tf.placeholder(
        tf.int32, shape=(None, n_turns, sent_len, self.wordlen))
    utterance_chars = tf.nn.embedding_lookup(char_table, self.utterance_cph)
    per_turn_chars = tf.unstack(utterance_chars, num=n_turns, axis=1)

    # ---- response: char + word features -------------------------------
    self.N = tf.placeholder(tf.int32, shape=(None))
    d = 96
    # Original note: "2 means (negative_samples + 1)".
    self.sample_numbers = tf.placeholder(tf.int32, shape=(None))
    self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")

    def char_cnn(char_vectors, reuse):
        """Char conv + max-over-characters, reshaped to one row per word."""
        flat = tf.reshape(char_vectors, [
            self.sample_numbers * self.N * sent_len,
            self.wordlen,
            self.char_embedding_dim,
        ])
        flat = tf.nn.dropout(flat, 1.0 - 0.5 * self.dropout)
        feats = conv(flat,
                     d,
                     bias=True,
                     activation=tf.nn.relu,
                     kernel_size=5,
                     name="char_conv",
                     reuse=reuse)
        pooled = tf.reduce_max(feats, axis=1)
        return tf.reshape(pooled, [
            self.sample_numbers * self.N,
            sent_len,
            int(pooled.shape[-1]),
        ])

    def word_highway(word_vectors, reuse):
        """Dropout followed by the shared highway projection to size d."""
        dropped = tf.nn.dropout(word_vectors, 1.0 - self.dropout)
        return highway(dropped,
                       size=d,
                       scope="highway",
                       dropout=self.dropout,
                       reuse=reuse)

    # The char features are built (this creates the "char_conv" variables)
    # but are not concatenated onto the word features: that concat is
    # disabled in this version of the model.
    char_cnn(response_chars, reuse=None)
    response_feats = word_highway(response_words, reuse=None)

    A_matrix = tf.get_variable(
        'A_matrix_v',
        shape=(self.rnn_units, self.rnn_units),
        initializer=tf.contrib.layers.xavier_initializer(),
        dtype=tf.float32)
    final_cell = tf.nn.rnn_cell.GRUCell(
        self.rnn_units, kernel_initializer=tf.orthogonal_initializer())

    response_states, _ = tf.nn.dynamic_rnn(sentence_cell,
                                           response_feats,
                                           sequence_length=self.response_len,
                                           dtype=tf.float32,
                                           scope='sentence_GRU')
    self.response_embedding_save = response_states
    # Swap the last two axes so the response is the right-hand matmul operand.
    response_feats_t = tf.transpose(response_feats, perm=[0, 2, 1])
    response_states_t = tf.transpose(response_states, perm=[0, 2, 1])

    # ---- one matching vector per utterance turn -----------------------
    matching_vectors = []
    turns = zip(per_turn_words, per_turn_lengths, per_turn_chars)
    for turn, (turn_words, turn_len, turn_chars) in enumerate(turns):
        # conv2d/dense variables are created on the first turn, then shared.
        layer_reuse = True if turn else None

        char_cnn(turn_chars, reuse=True)  # built but unused, as above
        utterance_feats = word_highway(turn_words, reuse=True)

        word_match = tf.matmul(utterance_feats, response_feats_t)
        utterance_states, _ = tf.nn.dynamic_rnn(sentence_cell,
                                                utterance_feats,
                                                sequence_length=turn_len,
                                                dtype=tf.float32,
                                                scope='sentence_GRU')
        # TODO: check this
        state_match = tf.einsum('aij,jk->aik', utterance_states, A_matrix)
        state_match = tf.matmul(state_match, response_states_t)

        match_planes = tf.stack([word_match, state_match],
                                axis=3,
                                name='matrix_stack')
        # TODO: check other params
        conv_out = tf.layers.conv2d(
            match_planes,
            filters=8,
            kernel_size=(3, 3),
            padding='VALID',
            kernel_initializer=tf.contrib.keras.initializers.he_normal(),
            activation=tf.nn.relu,
            reuse=layer_reuse,
            name='conv')
        # TODO: check other params
        pooled = tf.layers.max_pooling2d(conv_out, (3, 3),
                                         strides=(3, 3),
                                         padding='VALID',
                                         name='max_pooling')
        # TODO: check whether this is correct
        matching_vectors.append(
            tf.layers.dense(
                tf.contrib.layers.flatten(pooled),
                50,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                activation=tf.tanh,
                reuse=layer_reuse,
                name='matching_v'))

    # ---- accumulate over turns and classify ---------------------------
    # Matching vectors are stacked on axis 0 and consumed time-major.
    # TODO: check time_major
    _, last_hidden = tf.nn.dynamic_rnn(
        final_cell,
        tf.stack(matching_vectors, axis=0, name='matching_stack'),
        dtype=tf.float32,
        time_major=True,
        scope='final_GRU')
    logits = tf.layers.dense(
        last_hidden,
        2,
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        name='final_v')
    self.y_pred = tf.nn.softmax(logits)

    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.y_true, logits=logits)
    self.total_loss = tf.reduce_mean(per_example_loss)
    tf.summary.scalar('loss', self.total_loss)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    self.train_op = optimizer.minimize(self.total_loss)
def forward(self):
    """Build the encoder, the attention decoder and the pointer output graph.

    Stages, each under its own variable scope: input embedding (char CNN +
    word lookup + highway), embedding encoder, context-to-query attention,
    model encoder, an attention decoder over ``self.a`` (beam search when
    ``self.loop_function`` is set) and start/end pointer logits.

    Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.gen_loss``,
    ``self.symbols``, ``self.logits``, ``self.yp1``, ``self.yp2``,
    ``self.loss`` and, when ``config.decay`` is set, ``self.var_ema`` and
    ``self.assign_vars``.
    """
    config = self.config
    N = config.test_batch_size if self.loop_function else config.batch_size
    PL, QL = self.c_maxlen, self.q_maxlen
    CL, d, dc = config.char_limit, config.hidden, config.char_dim
    nh, dw = config.num_heads, config.glove_dim

    with tf.variable_scope("Input_Embedding_Layer"):
        # One row per word: [N * PL/QL, CL, dc].
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder; the question reuses the
        # context's "char_conv" weights.
        ch_emb = conv(ch_emb,
                      d,
                      bias=True,
                      activation=tf.nn.relu,
                      kernel_size=5,
                      name="char_conv",
                      reuse=None)
        qh_emb = conv(qh_emb,
                      d,
                      bias=True,
                      activation=tf.nn.relu,
                      kernel_size=5,
                      name="char_conv",
                      reuse=True)

        # Max over the character axis, then back to [N, PL/QL, channels].
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        # Both tensors come from the same conv, so the channel size is the
        # same; each tensor now reads its own for clarity.
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb,
                        size=d,
                        scope="highway",
                        dropout=self.dropout,
                        reuse=None)
        q_emb = highway(q_emb,
                        size=d,
                        scope="highway",
                        dropout=self.dropout,
                        reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=2,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=2,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q],
            self.c_maxlen,
            self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        # `dim` is the older spelling of `axis` for tf.nn.softmax; kept so
        # the block still runs on the TF release it was written for.
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=2,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Decoder_Layer"):
        memory = tf.concat([self.enc[1], self.enc[2], self.enc[3]], axis=-1)
        # One [N, 1] slice of the target sequence per decoding step.
        oups = tf.split(self.a, [1] * self.a_maxlen, 1)
        h = tf.tanh(
            _linear(tf.reduce_mean(memory, axis=1),
                    output_size=d,
                    bias=False,
                    scope="h_initial"))
        # From here on `c` is the initial cell state, not the encoded
        # context (which is no longer needed).
        c = tf.tanh(
            _linear(tf.reduce_mean(memory, axis=1),
                    output_size=d,
                    bias=False,
                    scope="c_initial"))
        state = (c, h)
        outputs = []
        prev = None
        prev_probs = [0.0]
        symbols = []
        for i, inp in enumerate(oups):
            einp = tf.reshape(tf.nn.embedding_lookup(self.word_mat, inp),
                              [N, dw])
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            if self.loop_function is not None and prev is not None:
                # NOTE(review): the extent of this scope was reconstructed
                # from whitespace-collapsed source; it only affects op names.
                with tf.variable_scope("loop_function", reuse=True):
                    einp, prev_probs, index, prev_symbol = self.loop_function(
                        prev, prev_probs, self.beam_size, i)
                    # Re-order everything kept so far to follow the
                    # surviving beams.
                    h = tf.gather(h, index)
                    state = tuple(tf.gather(s, index) for s in state)
                    for j, symbol in enumerate(symbols):
                        symbols[j] = tf.gather(symbol, index)
                    for j, output in enumerate(outputs):
                        outputs[j] = tf.gather(output, index)
                    symbols.append(prev_symbol)

            attn = tf.reshape(
                multihead_attention(tf.expand_dims(h, 1),
                                    units=d,
                                    num_heads=nh,
                                    memory=memory,
                                    mask=self.c_mask,
                                    bias=False), [-1, nh * d])

            cinp = tf.concat([einp, attn], 1)
            h, state = self.cell(cinp, state)

            with tf.variable_scope("AttnOutputProjection"):
                output = _linear([h] + [cinp],
                                 output_size=dw * 2,
                                 bias=False,
                                 scope="output")
                output = tf.reshape(output, [-1, dw, 2])
                output = tf.reduce_max(output, 2)  # maxout
                outputs.append(output)

            if self.loop_function is not None:
                prev = output

        if self.loop_function is not None:
            # process the last symbol
            einp, prev_probs, index, prev_symbol = self.loop_function(
                prev, prev_probs, self.beam_size, i + 1)
            for j, symbol in enumerate(symbols):
                symbols[j] = tf.gather(symbol, index)
            for j, output in enumerate(outputs):
                outputs[j] = tf.gather(output, index)
            symbols.append(prev_symbol)

            # output the final best result of beam search
            for k, symbol in enumerate(symbols):
                symbols[k] = tf.gather(symbol, 0)
            for k, output in enumerate(outputs):
                outputs[k] = tf.expand_dims(tf.gather(output, 0), 0)

        self.gen_loss = self._compute_loss(outputs, oups, N)
        self.symbols = symbols

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                 1,
                 bias=False,
                 name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                 1,
                 bias=False,
                 name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]

        logits1, logits2 = self.logits

        # Joint start/end probabilities, restricted to spans with
        # start <= end <= start + ans_limit.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                         labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                          labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    # NOTE(review): the span loss built just above is replaced by the
    # generation loss here, so only gen_loss is optimised. Kept as in the
    # original -- confirm this is intended.
    self.loss = self.gen_loss

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Evaluating the loss now also updates the moving averages.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() returns None for variables without a shadow
                # copy. Compare against None explicitly rather than relying
                # on the truthiness of a tf.Variable.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def _embed(self):
    """Create the embedding tables and the highway-encoded input features.

    With ``config.fix_pretrained_vector`` each table is split into a
    trainable 2-row head (the first two vocabulary rows) and a frozen
    remainder, then concatenated back together; otherwise a single fully
    trainable table is created. Context and question are then embedded
    (char CNN + word lookup) and passed through a shared highway layer.

    Sets ``self.word_mat``, ``self.char_mat``, ``self.ch_len``,
    ``self.qh_len``, ``self.c_emb`` and ``self.q_emb`` (plus the four
    partial tables in the fixed-vector case).
    """
    vocab = self.vocab

    def split_table(frozen_name, head_name, n_rows, n_cols, values):
        """Return (head, frozen, full) for a table whose rows 2.. are frozen."""
        frozen = tf.get_variable(
            frozen_name, [n_rows - 2, n_cols],
            dtype=tf.float32,
            initializer=tf.constant_initializer(values[2:],
                                                dtype=tf.float32),
            trainable=False)
        head = tf.get_variable(
            head_name, [2, frozen.get_shape()[1]],
            dtype=tf.float32,
            initializer=tf.constant_initializer(values[:2],
                                                dtype=tf.float32),
            trainable=True)
        return head, frozen, tf.concat([head, frozen], axis=0)

    with tf.variable_scope('word_char_embedding'):
        if not self.config.fix_pretrained_vector:
            self.word_mat = tf.get_variable(
                'word_embeddings',
                shape=[vocab.word_size(), vocab.word_embed_dim],
                initializer=tf.constant_initializer(vocab.word_embeddings),
                trainable=True)
            self.char_mat = tf.get_variable(
                'char_embeddings',
                shape=[vocab.char_size(), vocab.char_embed_dim],
                initializer=tf.constant_initializer(vocab.char_embeddings),
                trainable=True)
        else:
            (self.word_pad_unk_mat,
             self.pretrained_word_mat,
             self.word_mat) = split_table("word_emb_mat", "word_unk_pad",
                                          vocab.word_size(),
                                          vocab.word_embed_dim,
                                          vocab.word_embeddings)
            (self.char_pad_unk_mat,
             self.pretrained_char_mat,
             self.char_mat) = split_table("char_emb_mat", "char_unk_pad",
                                          vocab.char_size(),
                                          vocab.char_embed_dim,
                                          vocab.char_embeddings)

        # Count of non-zero character ids per word, flattened to 1-D.
        # NOTE(review): whether these two ops sit inside this variable scope
        # could not be recovered from the collapsed source; it only affects
        # op names.
        ch_nonzero = tf.cast(tf.cast(self.ch, tf.bool), tf.int32)
        self.ch_len = tf.reshape(tf.reduce_sum(ch_nonzero, axis=2), [-1])
        qh_nonzero = tf.cast(tf.cast(self.qh, tf.bool), tf.int32)
        self.qh_len = tf.reshape(tf.reduce_sum(qh_nonzero, axis=2), [-1])

    N, PL, QL, CL, d, dc, nh = self._params()
    if self.config.fix_pretrained_vector:
        # Take the char dimension from the table that was actually built.
        dc = self.char_mat.get_shape()[-1]

    with tf.variable_scope("Input_Embedding_Layer"):
        seq_lens = (PL, QL)  # (context, question), in that order throughout

        # One row per word: [N * PL/QL * max_p_num, CL, dc].
        char_feats = [
            tf.reshape(tf.nn.embedding_lookup(self.char_mat, char_ids),
                       [N * seq_len * self.max_p_num, CL, dc])
            for char_ids, seq_len in zip((self.ch, self.qh), seq_lens)
        ]
        char_feats = [
            tf.nn.dropout(feats, 1.0 - 0.5 * self.dropout)
            for feats in char_feats
        ]
        # The question reuses the "char_conv" weights created for the context.
        char_feats = [
            conv(feats,
                 d,
                 bias=True,
                 activation=tf.nn.relu,
                 kernel_size=5,
                 name="char_conv",
                 reuse=share)
            for feats, share in zip(char_feats, (None, True))
        ]
        # Max over the character axis, then regroup the words per sequence.
        char_feats = [tf.reduce_max(feats, axis=1) for feats in char_feats]
        char_feats = [
            tf.reshape(feats, [N * self.max_p_num, seq_len, -1])
            for feats, seq_len in zip(char_feats, seq_lens)
        ]

        word_feats = [
            tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, word_ids),
                          1.0 - self.dropout)
            for word_ids in (self.c, self.q)
        ]
        c_joint, q_joint = [
            tf.concat([words, chars], axis=2)
            for words, chars in zip(word_feats, char_feats)
        ]

        self.c_emb = highway(c_joint,
                             size=d,
                             scope="highway",
                             dropout=self.dropout,
                             reuse=None)
        self.q_emb = highway(q_joint,
                             size=d,
                             scope="highway",
                             dropout=self.dropout,
                             reuse=True)
def __init__(self,
             config,
             batch,
             word_mat=None,
             char_mat=None,
             filter_sizes=None,
             embedding_size=None,
             num_filters=None,
             trainable=True,
             l2_reg_lambda=0.0,
             keep_prob=0.9,
             graph=None):
    """Build a 3-way text-CNN classifier over word + char features.

    Two heads share the word embeddings and differ in their character
    features (bi-GRU final states vs. char CNN); ``self.scores`` is the
    mean of their logits.

    Args:
        config: hyper-parameter object; this method reads para_limit,
            ques_limit, test_para_limit, test_ques_limit, batch_size,
            char_limit, hidden, char_dim, num_heads, char_hidden, decay,
            init_lr and grad_clip.
        batch: iterator whose ``get_next()`` yields 7 tensors when
            ``trainable`` and 5 (no labels, no qa id) otherwise.
        word_mat: initial word embedding matrix.
        char_mat: initial char embedding matrix.
        filter_sizes: iterable of convolution filter heights.
        embedding_size: unused in this method.
        num_filters: number of filters per filter size.
        trainable: build loss/accuracy/optimizer ops when truthy.
        l2_reg_lambda: weight of the L2 penalty added to the loss.
        keep_prob: keep probability for the dropout on pooled CNN features.
        graph: stored on ``self.graph``; a new ``tf.Graph`` if None.

    Changes from the previous version:
      * The four char-feature dropouts passed ``self.dropout`` (a drop
        rate) as ``keep_prob``; they now pass ``1.0 - self.dropout`` like
        the rest of the method. Identical at the 0.5 default.
      * Loss, accuracy, EMA loss wrapping and optimizer are only built when
        ``trainable``; before, ``trainable=False`` failed on the missing
        ``self.loss`` / ``self.input_y``.
      * The max-pool window uses the active sequence length instead of
        always ``config.para_limit``.
      * EMA shadow lookup compares with ``is not None``.
    """
    self.config = config
    self.graph = graph if graph is not None else tf.Graph()
    self.trainable = trainable

    if trainable:
        (self.input_x, self.input_x1, self.ch, self.qh, self.input_y,
         self.qa_id, self.alternatives_tokens) = batch.get_next()
    else:
        (self.input_x, self.input_x1, self.ch, self.qh,
         self.alternatives_tokens) = batch.get_next()

    self.dropout_keep_prob = keep_prob
    self.global_step = tf.get_variable(
        'global_step',
        shape=[],
        dtype=tf.int32,
        initializer=tf.constant_initializer(0),
        trainable=False)
    # Drop rate (not keep probability); every use below converts it.
    self.dropout = tf.placeholder_with_default(0.5, (), name="dropout")

    # Keeping track of l2 regularization loss (optional)
    l2_loss = tf.constant(0.0)

    # Embedding layer
    # NOTE(review): the extent of this name scope was reconstructed from
    # whitespace-collapsed source; tf.get_variable names are unaffected.
    with tf.name_scope("embedding"):
        self.char_mat = tf.get_variable(
            "char_mat",
            initializer=tf.constant(char_mat, dtype=tf.float32),
            trainable=True)
        self.W = tf.get_variable(
            "word_mat",
            initializer=tf.constant(word_mat, dtype=tf.float32),
            trainable=True)

    # Inputs are padded with 0 to a uniform length, so non-zero ids mark
    # real tokens.
    self.c_mask = tf.cast(self.input_x, tf.bool)
    self.q_mask = tf.cast(self.input_x1, tf.bool)

    if trainable:
        self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit
    else:
        self.c_maxlen, self.q_maxlen = (config.test_para_limit,
                                        config.test_ques_limit)

    # Number of non-zero character ids per word, flattened to 1-D.
    self.ch_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2),
        [-1])
    self.qh_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2),
        [-1])

    N, PL, QL = config.batch_size, self.c_maxlen, self.q_maxlen
    CL, d, dc = config.char_limit, config.hidden, config.char_dim
    dg = config.char_hidden

    # One row per word: [N * PL/QL, CL, dc].
    ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                        [N * PL, CL, dc])
    qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                        [N * QL, CL, dc])
    ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
    qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

    with tf.variable_scope("cnn_char_Embedding_Layer"):
        # Bidaf style conv-highway encoder
        ch_emb_cnn = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
        qh_emb_cnn = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)
        # Max over the character axis (original note: k-max pooling could
        # be tried here instead).
        ch_emb_cnn = tf.reduce_max(ch_emb_cnn, axis=1)
        qh_emb_cnn = tf.reduce_max(qh_emb_cnn, axis=1)
        ch_emb_cnn = tf.reshape(ch_emb_cnn, [N, PL, ch_emb_cnn.shape[-1]])
        qh_emb_cnn = tf.reshape(qh_emb_cnn, [N, QL, qh_emb_cnn.shape[-1]])

    with tf.variable_scope('lstm_char_embedding'):
        # Despite the scope name these are GRU cells with dg units each.
        cell_fw = tf.contrib.rnn.GRUCell(dg)
        cell_bw = tf.contrib.rnn.GRUCell(dg)
        # Each word is one sequence of characters; ch_len / qh_len give the
        # real character count per word. Only the final states are kept.
        _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
        ch_emb_lstm = tf.concat([state_fw, state_bw], axis=1)
        _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
        qh_emb_lstm = tf.concat([state_fw, state_bw], axis=1)
        # Final fw/bw states become the word's character-level embedding.
        qh_emb_lstm = tf.reshape(qh_emb_lstm, [N, QL, 2 * dg])
        ch_emb_lstm = tf.reshape(ch_emb_lstm, [N, PL, 2 * dg])

    self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
    self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)

    # self.dropout is a drop rate, so the keep probability is its
    # complement (the previous code passed the rate itself).
    ch_emb_cnn = tf.nn.dropout(ch_emb_cnn, 1.0 - self.dropout)
    ch_emb_lstm = tf.nn.dropout(ch_emb_lstm, 1.0 - self.dropout)
    qh_emb_cnn = tf.nn.dropout(qh_emb_cnn, 1.0 - self.dropout)
    qh_emb_lstm = tf.nn.dropout(qh_emb_lstm, 1.0 - self.dropout)

    def text_cnn_head(scope_name, c_char_feats, q_char_feats, l2_acc):
        """Highway + conv/max-pool + linear head; returns (scores, l2_acc)."""
        with tf.variable_scope(scope_name):
            c_emb = tf.concat([self.embedded_chars, c_char_feats], axis=2)
            q_emb = tf.concat([self.embedded_chars1, q_char_feats], axis=2)
            # Highway layers project the concatenated features to size d.
            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)
            self.embedded_chars_expanded = tf.expand_dims(c_emb, -1)
            # The question branch is built but not consumed by the CNN.
            self.embedded_chars_expanded1 = tf.expand_dims(q_emb, -1)

            # Create a convolution + maxpool layer for each filter size
            input_shape = c_emb.get_shape().as_list()
            pooled_outputs = []
            for filter_size in filter_sizes:
                with tf.name_scope("conv-maxpool-%s" % filter_size):
                    # Convolution Layer
                    filter_shape = [
                        filter_size, input_shape[-1], 1, num_filters
                    ]
                    W = tf.Variable(tf.truncated_normal(filter_shape,
                                                        stddev=0.1),
                                    name="W")
                    b = tf.Variable(tf.constant(0.1, shape=[num_filters]),
                                    name="b")
                    l2_acc += tf.nn.l2_loss(W)
                    l2_acc += tf.nn.l2_loss(b)
                    conv_output = tf.nn.conv2d(self.embedded_chars_expanded,
                                               W,
                                               strides=[1, 1, 1, 1],
                                               padding="VALID",
                                               name="conv")
                    # Apply nonlinearity
                    h = tf.nn.relu(tf.nn.bias_add(conv_output, b),
                                   name="relu")
                    # Max-pool over every position of the VALID conv
                    # output. PL is the active context length (equal to
                    # config.para_limit when training).
                    pooled = tf.nn.max_pool(
                        h,
                        ksize=[1, PL - filter_size + 1, 1, 1],
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name="pool")
                    pooled_outputs.append(pooled)

            # Combine all the pooled features
            num_filters_total = num_filters * len(filter_sizes)
            self.h_pool = tf.concat(pooled_outputs, 3)
            self.h_pool_flat = tf.reshape(self.h_pool,
                                          [-1, num_filters_total])

            # Add dropout
            with tf.name_scope("dropout"):
                self.h_drop = tf.nn.dropout(self.h_pool_flat,
                                            self.dropout_keep_prob)

            # Final (unnormalized) scores
            with tf.name_scope("output"):
                W = tf.get_variable(
                    "W",
                    shape=[num_filters_total, 3],
                    initializer=tf.contrib.layers.xavier_initializer())
                b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
                l2_acc += tf.nn.l2_loss(W)
                l2_acc += tf.nn.l2_loss(b)
                scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
        return scores, l2_acc

    self.lstm_scores, l2_loss = text_cnn_head("lstm_output", ch_emb_lstm,
                                              qh_emb_lstm, l2_loss)
    self.cnn_scores, l2_loss = text_cnn_head("cnn_output", ch_emb_cnn,
                                             qh_emb_cnn, l2_loss)

    self.scores = tf.add(self.lstm_scores, self.cnn_scores) / 2.0
    print(self.scores)
    print(self.lstm_scores)
    print("3333333333333333333333333")
    self.predictions = tf.argmax(self.scores, 1, name="predictions")

    if trainable:
        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy (needs the labels, so training graphs only)
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                   "float"),
                                           name="accuracy")

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        # apply() also creates the shadow variables, which an inference
        # graph needs in order to restore and assign them.
        ema_op = self.var_ema.apply(tf.trainable_variables())
        if trainable:
            # Evaluating the loss now also updates the moving averages.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

        self.assign_vars = []
        for var in tf.global_variables():
            v = self.var_ema.average(var)
            # average() returns None for variables without a shadow copy.
            if v is not None:
                self.assign_vars.append(tf.assign(var, v))

    if trainable:
        # Logarithmic warm-up capped at config.init_lr.
        self.lr = tf.minimum(
            config.init_lr, 0.001 / tf.log(999.) *
            tf.log(tf.cast(self.global_step, tf.float32) + 1))
        self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                          beta1=0.8,
                                          beta2=0.999,
                                          epsilon=1e-7)
        grads = self.opt.compute_gradients(self.loss)
        gradients, variables = zip(*grads)
        capped_grads, _ = tf.clip_by_global_norm(gradients,
                                                 config.grad_clip)
        self.train_op = self.opt.apply_gradients(
            zip(capped_grads, variables), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
def forward(self):
    """Build the encoder stack and the start/end pointer output graph.

    ``config.type`` selects the input representation: ``"all"`` uses a char
    CNN concatenated with word embeddings, ``"char"`` looks ``self.c`` /
    ``self.q`` up directly in ``self.char_mat``. Both feed a shared highway
    layer, the embedding encoder, context-to-query attention, the model
    encoder and the pointer layer.

    Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
    ``self.yp1``, ``self.yp2``, ``self.loss`` and, when ``config.decay`` is
    set, ``self.var_ema`` and ``self.assign_vars``.

    Raises:
        ValueError: if ``config.type`` is neither ``"all"`` nor ``"char"``.
            (Previously this surfaced later as an UnboundLocalError.)
    """
    config = self.config
    N = config.batch_size if not self.demo else 1
    PL, QL = self.c_maxlen, self.q_maxlen
    CL, d, dc = config.char_limit, config.hidden, config.tw_char_dim
    nh = config.num_heads

    with tf.variable_scope("Input_Embedding_Layer"):
        if config.type == "all":
            # One row per word: [N * PL/QL, CL, dc].
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])
            qh_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.qh),
                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder; the question reuses the
            # context's "char_conv" weights.
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Max over the character axis, then back to [N, PL/QL, channels].
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            # Same channel size as ch_emb (same conv); each tensor now reads
            # its own for clarity.
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)
        elif config.type == 'char':
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.char_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.char_mat, self.q),
                1.0 - self.dropout)
        else:
            # Fail here with a clear message rather than on the first use
            # of the unbound c_emb below.
            raise ValueError(
                "Unsupported config.type: {!r} (expected 'all' or 'char')"
                .format(config.type))

        # Shared highway projection (identical call in both branches of the
        # original, so it is made once here).
        c_emb = highway(c_emb,
                        size=d,
                        scope="highway",
                        dropout=self.dropout,
                        reuse=None)
        q_emb = highway(q_emb,
                        size=d,
                        scope="highway",
                        dropout=self.dropout,
                        reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q],
            self.c_maxlen,
            self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                 1,
                 bias=False,
                 name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                 1,
                 bias=False,
                 name="end_pointer"), -1)
        # Original author's guess: masks the padding at the end of the
        # passage.
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]

        logits1, logits2 = self.logits

        # Joint start/end probabilities, restricted to spans with
        # start <= end <= start + ans_limit.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                            labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Evaluating the loss now also updates the moving averages.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() returns None for variables without a shadow
                # copy. Compare against None explicitly rather than relying
                # on the truthiness of a tf.Variable.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def forward(self):
    """Build the TensorFlow graph: inputs, encoders, span pointers, loss, training ops.

    Reads ``self.config`` (``context_len``, ``question_len``, ``max_char_len``,
    ``embed_size``, ``filters``, ``l2``, ``grad_clip``, ``ema_decay``) together
    with ``self.word_embed``, ``self.char_embed``, ``self.dropout``,
    ``self.lr`` and ``self.global_step``; all of them must exist before this
    method is called.

    Attributes created on ``self``:
        c_words, c_chars, q_words, q_chars: int32 input placeholders.
        start, end: int32 placeholders holding the gold span indices.
        c_mask, q_mask: ``tf.sign`` of the word-id placeholders.
        c_len, q_len: per-example sum of the corresponding mask.
        start_linear, end_linear: un-normalised span logits.
        pred_start, pred_end: softmax of the span logits.
        loss: mean span cross-entropy plus the L2 penalty.
        optimize: Adam update op with global-norm gradient clipping.
        assign_vars: only when ``config.ema_decay > 0``; ops that copy each
            moving average back onto its source variable.

    Returns:
        None. All results are exposed as attributes.
    """
    cfg = self.config

    # ------------------------------ inputs ------------------------------
    self.c_words = tf.placeholder(
        tf.int32, [None, cfg.context_len], 'context-words')
    self.c_chars = tf.placeholder(
        tf.int32, [None, cfg.context_len, cfg.max_char_len], 'context-chars')
    # sign(id) is 1 for ids > 0 and 0 for id 0
    # (presumably 0 is the padding id -- confirm against the vocabulary).
    self.c_mask = tf.sign(self.c_words)

    self.q_words = tf.placeholder(
        tf.int32, [None, cfg.question_len], 'query-words')
    self.q_chars = tf.placeholder(
        tf.int32, [None, cfg.question_len, cfg.max_char_len], 'query-chars')
    self.q_mask = tf.sign(self.q_words)

    self.c_len = tf.cast(tf.reduce_sum(self.c_mask, -1), tf.int32)
    self.q_len = tf.cast(tf.reduce_sum(self.q_mask, -1), tf.int32)

    self.start = tf.placeholder(tf.int32, [None], 'start-index')
    self.end = tf.placeholder(tf.int32, [None], 'end-index')

    # ------------------- embedding + highway + projection -------------------
    # In every stage the question branch passes reuse=True so that it shares
    # the weights created by the context branch.
    with tf.variable_scope('input-embedding'):
        c_w = tf.nn.embedding_lookup(self.word_embed, self.c_words)
        q_w = tf.nn.embedding_lookup(self.word_embed, self.q_words)
        c_ch = layers.char_embed(self.c_chars, self.char_embed,
                                 dropout=self.dropout)
        q_ch = layers.char_embed(self.q_chars, self.char_embed,
                                 dropout=self.dropout, reuse=True)
        c = tf.concat([c_w, c_ch], -1)
        q = tf.concat([q_w, q_ch], -1)

    with tf.variable_scope('highway-1'):
        c = layers.highway(c, cfg.embed_size, dropout=self.dropout)
        q = layers.highway(q, cfg.embed_size, dropout=self.dropout,
                           reuse=True)

    with tf.variable_scope('highway-2'):
        c = layers.highway(c, cfg.embed_size, dropout=self.dropout)
        q = layers.highway(q, cfg.embed_size, dropout=self.dropout,
                           reuse=True)

    with tf.variable_scope('projection'):
        c = tf.layers.conv1d(c, cfg.filters, 1, padding='same')
        q = tf.layers.conv1d(q, cfg.filters, 1, padding='same', reuse=True)

    # ------------------------- embedding encoder -------------------------
    with tf.variable_scope('input-encoder'):
        c = layers.encoder_block(c, num_blocks=1, num_convolutions=4,
                                 kernel=7, mask=self.c_mask,
                                 dropout=self.dropout)
        q = layers.encoder_block(q, num_blocks=1, num_convolutions=4,
                                 kernel=7, mask=self.q_mask,
                                 dropout=self.dropout, reuse=True)

    # --------------------- context-query attention ---------------------
    with tf.variable_scope('attention'):
        attention = layers.bi_attention(c, q, layers.trilinear(c, q),
                                        self.c_mask, self.q_mask)
        attention = tf.layers.conv1d(attention, cfg.filters, 1,
                                     padding='same')

    # -------------------------- model encoder --------------------------
    # NOTE(review): this loop is placed outside the 'attention' scope; the
    # nesting had to be reconstructed, so confirm the resulting variable
    # names against any existing checkpoints.
    # The same encoder block is applied three times: weights are created on
    # the first pass and reused on the following two.
    modeling = [attention]
    for i in range(3):
        m = layers.encoder_block(modeling[i], num_blocks=7,
                                 num_convolutions=2, kernel=5,
                                 mask=self.c_mask, dropout=self.dropout,
                                 reuse=i > 0)
        if i % 2 == 0:
            # Extra dropout on the outputs of passes 0 and 2 only.
            m = tf.nn.dropout(m, 1.0 - self.dropout)
        modeling.append(m)

    # --------------------------- span pointers ---------------------------
    # modeling[-3], [-2], [-1] are the outputs of the three encoder passes.
    with tf.variable_scope('start-index'):
        self.start_linear = tf.concat([modeling[-3], modeling[-2]], -1)
        self.start_linear = tf.squeeze(
            tf.layers.dense(self.start_linear, 1, use_bias=False), -1)
        self.pred_start = tf.nn.softmax(self.start_linear, name='pred-start')

    with tf.variable_scope('end-index'):
        self.end_linear = tf.concat([modeling[-3], modeling[-1]], -1)
        self.end_linear = tf.squeeze(
            tf.layers.dense(self.end_linear, 1, use_bias=False), -1)
        self.pred_end = tf.nn.softmax(self.end_linear, name='pred-end')

    # The model is fully built here; nothing below creates trainable
    # variables, so a single snapshot serves the loss, optimizer and EMA.
    trainable_vars = tf.trainable_variables()

    # ------------------------------- loss -------------------------------
    with tf.variable_scope('loss'):
        loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.start_linear, labels=self.start)
        loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.end_linear, labels=self.end)
        loss = tf.reduce_mean(loss1 + loss2)
        # L2 penalty over every trainable variable whose name does not
        # contain 'bias'.
        lossL2 = tf.add_n([
            tf.nn.l2_loss(v) for v in trainable_vars if 'bias' not in v.name
        ]) * cfg.l2
        self.loss = loss + lossL2

    # ----------------------------- optimizer -----------------------------
    with tf.variable_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        grads = tf.gradients(self.loss, trainable_vars)
        grads, _ = tf.clip_by_global_norm(grads, cfg.grad_clip)
        grads_and_vars = zip(grads, trainable_vars)
        self.optimize = optimizer.apply_gradients(
            grads_and_vars, global_step=self.global_step)

    # ---------------- exponential moving average (optional) ----------------
    if cfg.ema_decay > 0:
        with tf.variable_scope('ema'):
            ema = tf.train.ExponentialMovingAverage(decay=cfg.ema_decay)
            ema_op = ema.apply(trainable_vars)
            # Re-bind self.loss so that evaluating it also runs the EMA
            # update. self.optimize was built from the previous loss tensor
            # and therefore does not depend on ema_op by itself.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

            # Built outside the control-dependency block so that running
            # these ops only copies the averages and does not first trigger
            # another EMA update.
            assign_vars = []
            for var in tf.global_variables():
                v = ema.average(var)
                # average() yields None for variables the EMA does not
                # track; compare with None explicitly because truth-testing
                # a TF tensor/variable can raise in graph mode.
                if v is not None:
                    assign_vars.append(tf.assign(var, v))
        self.assign_vars = assign_vars