def forward(self):
    config = self.config
    N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, \
        self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])  # shape = (?, 16, 64)
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder
        # d (hidden_size) = 96
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)
        # shape = (?, 12, 96)

        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        # shape = (?, 96)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])
        # shape = (32, ?, 96)

        c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c),
                              1.0 - self.dropout)
        q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q),
                              1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        # Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        # S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        # self.enc[1] = (32, ?, 96)
        conv1 = conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                     bias=False, name="start_pointer")
        # tf.shape(conv1) = (32, ?, 1)
        start_logits = tf.squeeze(conv1, -1)  # tf.shape(start_logits) = (32, ?)
        conv2 = conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                     bias=False, name="end_pointer")
        end_logits = tf.squeeze(conv2, -1)

        # mask out logits at the padded positions
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]
        logits1, logits2 = [l for l in self.logits]
        # shape = (32, ?) -> because the context length is variable

        # matmul([32, ?, 1] x [32, 1, ?])
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        # outer = (32, ?, ?)
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        reduced1 = tf.reduce_max(outer, axis=2)
        reduced2 = tf.reduce_max(outer, axis=1)
        # tf.shape(reduced) = (32, ?)

        ###############################################
        # pad the variable context axis (axis 1) up to MAX_PL, then slice
        paddings = [[0, 0], [0, self.MAX_PL - tf.shape(reduced1)[1]]]
        reduced1 = tf.pad(reduced1, paddings, "CONSTANT")
        reduced2 = tf.pad(reduced2, paddings, "CONSTANT")
        reduced1 = tf.slice(reduced1, [0, 0], [N, self.MAX_PL])
        reduced2 = tf.slice(reduced2, [0, 0], [N, self.MAX_PL])
        # tf.shape(reduced) = (32, MAX_PL)

        # no-answer flag: (no_answer, answer_exist)
        # TODO add an additional layer
        # TODO check the dimensions between reduced and the weights
        na_flag1 = tf.cast(
            tf.argmax(tf.matmul(reduced1, self.weights1), axis=1), tf.float32)
        na_flag2 = tf.cast(
            tf.argmax(tf.matmul(reduced2, self.weights2), axis=1), tf.float32)

        self.yp1 = tf.argmax(reduced1, axis=1)
        self.yp2 = tf.argmax(reduced2, axis=1)

        # no_answer: for unanswerable examples use the gated span-probability
        # mass as the loss, otherwise the usual span cross entropy
        losses = tf.where(
            self.no_answer > 0,
            tf.multiply(na_flag1, tf.reduce_sum(reduced1, axis=1)),
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                       labels=self.y1))
        losses2 = tf.where(
            self.no_answer > 0,
            tf.multiply(na_flag2, tf.reduce_sum(reduced2, axis=1)),
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2,
                                                       labels=self.y2))
        #################################################
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.assign_vars.append(tf.assign(var, v))
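# ----------------------------------------------------------------------
# `mask_logits` is imported from the shared layers module rather than defined
# in this file. A minimal sketch of what it is assumed to do, following the
# reference QANet implementation (the -1e30 mask value is an assumption, not
# taken from this file):
def _mask_logits_sketch(inputs, mask, mask_value=-1e30):
    # keep logits where mask == 1 and push masked positions to a large
    # negative value so softmax assigns them ~0 probability
    mask = tf.cast(mask, tf.float32)
    return inputs * mask + mask_value * (1 - mask)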
def forward(self):
    config = self.config
    N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, \
        self.q_maxlen, config.char_limit, config.hidden, config.tw_char_dim, config.num_heads

    with tf.variable_scope("Input_Embedding_Layer"):
        if config.type == "all":
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF-style conv-highway encoder
            ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                          kernel_size=5, name="char_conv", reuse=None)
            qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                          kernel_size=5, name="char_conv", reuse=True)

            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c),
                                  1.0 - self.dropout)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q),
                                  1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=True)
        elif config.type == 'char':
            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.char_mat, self.c),
                                  1.0 - self.dropout)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.char_mat, self.q),
                                  1.0 - self.dropout)

            c_emb = highway(c_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        # Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        # S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)

        # presumably masks the padded positions at the end of the passage
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]
        logits1, logits2 = [l for l in self.logits]

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                            labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2,
                                                             labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.assign_vars.append(tf.assign(var, v))
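# ----------------------------------------------------------------------
# `highway` is likewise imported from the shared layers module. A sketch of
# the assumed behaviour, following the reference QANet layers (names and
# defaults here are assumptions): each layer mixes a transformed signal with
# the raw input through a learned sigmoid gate.
def _highway_sketch(x, size=None, activation=None, num_layers=2,
                    scope="highway", dropout=0.0, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        if size is None:
            size = x.shape.as_list()[-1]
        else:
            x = conv(x, size, name="input_projection", reuse=reuse)
        for i in range(num_layers):
            T = conv(x, size, bias=True, activation=tf.sigmoid,
                     name="gate_%d" % i, reuse=reuse)        # transform gate
            H = conv(x, size, bias=True, activation=activation,
                     name="activation_%d" % i, reuse=reuse)  # candidate signal
            H = tf.nn.dropout(H, 1.0 - dropout)
            x = H * T + x * (1 - T)                          # gated residual mix
        return x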
def forward(self):
    config = self.config
    N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, \
        self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads
    d_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0, state_is_tuple=True)

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        print("ch_emb before", ch_emb.shape[-1])
        print("qh_emb before", qh_emb.shape[-1])

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])
        print("N", N, "PL", PL, "QL", QL)
        print("ch_emb", ch_emb.shape)
        print("qh_emb", qh_emb.shape)

        c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c),
                              1.0 - self.dropout)
        q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q),
                              1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)
        print("c_emb high", c_emb.shape)
        print("q_emb high", q_emb.shape)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c_tmp = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
        # c_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0, state_is_tuple=True)
        c = drnn(d_cell, c_tmp, d)
        q_tmp = residual_block(q_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.q_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.q_len,
                               scope="Encoder_Residual_Block",
                               reuse=True,  # Share the weights between passage and question
                               bias=False,
                               dropout=self.dropout)
        # q_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0, state_is_tuple=True)
        q = drnn(d_cell, q_tmp, d)
        print("embd enc output c", c.shape)
        print("embd enc output q", q.shape)
        # exit()

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        # Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        # S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        print("enc len", len(self.enc))
        # print(self.ch_len.shape)
        # print(self.qh_len.shape)
        # print(self.c_len.shape)
        # print(self.q_len.shape)
        # print(ip_len.shape)
        print("qh shape", self.qh.shape)
        print("qh type", self.qh.dtype)
        print("ip shape", inputs.shape)
        print("ip type", inputs.dtype)
        ip_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(inputs, tf.bool), tf.float32), axis=2),
            [-1])
        print("ip_len", ip_len.shape)

        # fw0 = drnn(d_cell, self.enc[0], d)
        # f_cell = tf.contrib.rnn.BasicLSTMCell(fw0[2], forget_bias=1.0, state_is_tuple=True)
        # fw1 = drnn(d_cell, fw0, d)
        # fw2 = drnn(d_cell, fw1, d)
        # self.enc.append(fw0)
        # self.enc.append(fw1)
        # self.enc.append(fw2)
        # print "fw1 shape", fw1
        #
        # (fw0, bw0), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
        #                                           initial_state_fw=None, initial_state_bw=None,
        #                                           dtype=None, parallel_iterations=None,
        #                                           swap_memory=False, time_major=False, scope=None):
        # bw_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0, state_is_tuple=True)
        # g0 = bidirlstm(fw_cell, bw_cell, inputs, d)
        # g1 = bidirlstm(fw_cell, bw_cell, g0, d)
        # g2 = bidirlstm(fw_cell, bw_cell, g1, d)
        # fw0 = bidirlstm(d_cell, d_cell, inputs, d)
        # d_cell1 = tf.contrib.rnn.BasicLSTMCell(fw0[1], forget_bias=1.0, state_is_tuple=True)
        # fw1 = bidirlstm(d_cell1, d_cell1, fw0, d)
        # (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(d_cell, d_cell, self.enc[0], dtype='float', scope='g0')  # [N, M, JX, 2d]
        # g0 = tf.concat([fw_g0, bw_g0], 4)
        # (fw_g1, bw_g1) = bidirectional_dynamic_rnn(d_cell, d_cell, fw_g0, dtype='float', scope='g1')  # [N, M, JX, 2d]
        # print "fw_g0", fw_g0.shape
        # print "bw_g0", bw_g0.shape
        # print g0.shape
        # (fw_g1, bw_g1), _ = bidirlstm(d_cell, d_cell, g0, dtype='float', scope='g1')  # [N, M, JX, 2d]
        # g1 = tf.concat([fw_g1, bw_g1], 3)
        # flat_output_fw = nest.flatten(fw_g0)
        # flat_output_bw = nest.flatten(bw_g0)
        # flat_outputs = tuple(array_ops.concat(1, [fw, bw])
        #                      for fw, bw in zip(flat_output_fw, flat_output_bw))
        # outputs = nest.pack_sequence_as(structure=output_fw,
        #                                 flat_sequence=flat_outputs)
        # print "output", outputs.shape

        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                drnn(d_cell,
                     residual_block(self.enc[i],
                                    num_blocks=7,
                                    num_conv_layers=2,
                                    kernel_size=5,
                                    mask=self.c_mask,
                                    num_filters=d,
                                    num_heads=nh,
                                    seq_len=self.c_len,
                                    scope="Model_Encoder",
                                    bias=False,
                                    reuse=True if i > 0 else None,
                                    dropout=self.dropout),
                     d))
        # print("enc[0] shape", self.enc[0].shape)
        print("chalala")
        # exit()

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]
        logits1, logits2 = [l for l in self.logits]

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                         labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                          labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.assign_vars.append(tf.assign(var, v))
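# ----------------------------------------------------------------------
# `drnn` is not defined in this excerpt; the call sites above pass
# (cell, inputs, d) and treat the result as a sequence tensor. A hypothetical
# sketch consistent with that usage (the scope name and AUTO_REUSE are
# assumptions, since the same cell is applied to both passage and question):
def _drnn_sketch(cell, inputs, d, scope="drnn"):
    # run the shared LSTM cell over the time axis and return the
    # per-step outputs, shape (batch, seq_len, d)
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    return outputs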
def forward(self):
    config = self.config
    N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, \
        self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

        c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c),
                              1.0 - self.dropout)
        q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q),
                              1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        # Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        # S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]
        logits1, logits2 = [l for l in self.logits]

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                         labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                          labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.assign_vars.append(tf.assign(var, v))
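# ----------------------------------------------------------------------
# `conv` is imported from the shared layers module. A simplified sketch of
# the assumed helper: a 1-D convolution with VALID padding, so kernel_size=5
# maps char length 16 -> 12 as the shape comments above note, and
# kernel_size=1 acts as a per-position linear projection (variable names and
# initializers are assumptions):
def _conv_sketch(inputs, output_size, bias=None, activation=None,
                 kernel_size=1, name="conv", reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        in_dim = inputs.shape.as_list()[-1]
        kernel = tf.get_variable("kernel_", [kernel_size, in_dim, output_size])
        out = tf.nn.conv1d(inputs, kernel, stride=1, padding="VALID")
        if bias:
            b = tf.get_variable("bias_", [output_size],
                                initializer=tf.zeros_initializer())
            out += b
        return activation(out) if activation is not None else out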
def forward(self):
    config = self.config
    N, PL, QL, CL, d, dc, nh, dw = config.test_batch_size if self.loop_function else config.batch_size, \
        self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, \
        config.num_heads, config.glove_dim

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

        c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c),
                              1.0 - self.dropout)
        q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q),
                              1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=2,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=2,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        # Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        # S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=2,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Decoder_Layer"):
        memory = tf.concat([self.enc[1], self.enc[2], self.enc[3]], axis=-1)
        oups = tf.split(self.a, [1] * self.a_maxlen, 1)
        h = tf.tanh(_linear(tf.reduce_mean(memory, axis=1), output_size=d,
                            bias=False, scope="h_initial"))
        c = tf.tanh(_linear(tf.reduce_mean(memory, axis=1), output_size=d,
                            bias=False, scope="c_initial"))
        state = (c, h)
        outputs = []
        prev = None
        prev_probs = [0.0]
        symbols = []
        for i, inp in enumerate(oups):
            einp = tf.reshape(tf.nn.embedding_lookup(self.word_mat, inp), [N, dw])
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            if self.loop_function is not None and prev is not None:
                with tf.variable_scope("loop_function", reuse=True):
                    einp, prev_probs, index, prev_symbol = self.loop_function(
                        prev, prev_probs, self.beam_size, i)
                    h = tf.gather(h, index)  # update prev state
                    state = tuple(tf.gather(s, index) for s in state)  # update prev state
                    for j, symbol in enumerate(symbols):
                        symbols[j] = tf.gather(symbol, index)  # update prev symbols
                    for j, output in enumerate(outputs):
                        outputs[j] = tf.gather(output, index)  # update prev outputs
                    symbols.append(prev_symbol)

            attn = tf.reshape(
                multihead_attention(tf.expand_dims(h, 1),
                                    units=d,
                                    num_heads=nh,
                                    memory=memory,
                                    mask=self.c_mask,
                                    bias=False),
                [-1, nh * d])
            cinp = tf.concat([einp, attn], 1)
            h, state = self.cell(cinp, state)

            with tf.variable_scope("AttnOutputProjection"):
                output = _linear([h] + [cinp], output_size=dw * 2,
                                 bias=False, scope="output")
                output = tf.reshape(output, [-1, dw, 2])
                output = tf.reduce_max(output, 2)  # maxout
            outputs.append(output)
            if self.loop_function is not None:
                prev = output

        if self.loop_function is not None:
            # process the last symbol
            einp, prev_probs, index, prev_symbol = self.loop_function(
                prev, prev_probs, self.beam_size, i + 1)
            for j, symbol in enumerate(symbols):
                symbols[j] = tf.gather(symbol, index)  # update prev symbols
            for j, output in enumerate(outputs):
                outputs[j] = tf.gather(output, index)  # update prev outputs
            symbols.append(prev_symbol)

            # output the final best result of beam search
            for k, symbol in enumerate(symbols):
                symbols[k] = tf.gather(symbol, 0)
            for k, output in enumerate(outputs):
                outputs[k] = tf.expand_dims(tf.gather(output, 0), 0)

        self.gen_loss = self._compute_loss(outputs, oups, N)
        self.symbols = symbols

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]
        logits1, logits2 = [l for l in self.logits]

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                         labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                          labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)
        # the decoder's generation loss replaces the span loss computed above
        self.loss = self.gen_loss

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.assign_vars.append(tf.assign(var, v))
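# ----------------------------------------------------------------------
# Usage sketch (assumption, not part of the original file): the EMA shadow
# weights built under `config.decay` above are meant to be copied into the
# live variables before evaluation; `assign_vars` holds one
# tf.assign(var, ema_average) op per variable.
def _restore_ema_weights_sketch(sess, model, config):
    if config.decay is not None:
        sess.run(model.assign_vars)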
def forward(self):
    config = self.config
    N = config.batch_size if not self.demo else 1
    PL = self.c_maxlen
    QL = self.q_maxlen
    XL = self.x_maxlen
    # DEBUG
    self.debug_ops.extend([PL, QL, XL])
    CL = config.char_limit  # 16
    d = config.hidden       # 96
    dc = config.char_dim    # 64
    nh = config.num_heads   # 1

    with tf.variable_scope("Input_Embedding_Layer"):
        '''
        self.ch : (N, c_maxlen, 16)
        self.qh : (N, q_maxlen, 16)
        self.xh : (N, x_maxlen, 16)
        '''
        ######################################
        # get ELMo embeddings
        ######################################
        datadir = "/data/elmo_experiment_20180906/20180906_model"
        vocab_file = os.path.join(datadir, 'vocab-2016-09-10.txt')
        options_file = os.path.join(datadir, 'options.json')
        weight_file = os.path.join(datadir, 'weights.hdf5')
        print(vocab_file)
        print(options_file)
        print(weight_file)

        # Create a Batcher to map text to character ids.
        batcher = Batcher(vocab_file, 50)

        # Input placeholders to the biLM.
        # context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
        # question_character_ids = tf.placeholder('int32', shape=(None, None, 50))

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(options_file, weight_file)

        # Get ops to compute the LM embeddings.
        print(self.c)
        print(self.c.shape)
        # print(self.ch)
        # print(self.ch.shape)
        print(self.c_elmo)
        print(self.c_elmo.shape)
        print(self.q_elmo)
        print(self.q_elmo.shape)
        print(self.x_elmo)
        print(self.x_elmo.shape)
        context_embeddings_op = bilm(self.c_elmo)
        question_embeddings_op = bilm(self.q_elmo)
        candidate_embeddings_op = bilm(self.x_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM
        # layers). Our SQuAD model includes ELMo at both the input and output
        # layers of the task GRU, so we need 4x ELMo representations for the
        # question and context at each of the input and output. We use the
        # same ELMo weights for both the question and context at each of the
        # input and output.

        # context ELMo
        elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_input = weight_layers('input', question_embeddings_op,
                                                l2_coef=0.0)
            elmo_candidate_input = weight_layers('input', candidate_embeddings_op,
                                                 l2_coef=0.0)
        elmo_context_output = weight_layers('output', context_embeddings_op,
                                            l2_coef=0.0)
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_output = weight_layers('output', question_embeddings_op,
                                                 l2_coef=0.0)
            elmo_candidate_output = weight_layers('output', candidate_embeddings_op,
                                                  l2_coef=0.0)

        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])  # (N*PL, 16, 64)
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])  # (N*QL, 16, 64)
        xh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.xh),
                            [N * XL, CL, dc])  # (N*XL, 16, 64)
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
        xh_emb = tf.nn.dropout(xh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder: conv over chars in each word in a
        # batch of passages
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5,
                      name="char_conv", reuse=None)  # (N*c_maxlen, 16-5+1, 96)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5,
                      name="char_conv", reuse=True)  # (N*q_maxlen, 16-5+1, 96)
        xh_emb = conv(xh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5,
                      name="char_conv", reuse=True)  # (N*x_maxlen, 16-5+1, 96)

        # max pooling
        ch_emb = tf.reduce_max(ch_emb, axis=1)  # (N*c_maxlen, 96)
        qh_emb = tf.reduce_max(qh_emb, axis=1)  # (N*q_maxlen, 96)
        xh_emb = tf.reduce_max(xh_emb, axis=1)  # (N*x_maxlen, 96)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])  # (N, c_maxlen, 96)
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])  # (N, q_maxlen, 96)
        xh_emb = tf.reshape(xh_emb, [N, XL, xh_emb.shape[-1]])  # (N, x_maxlen, 96)

        '''
        self.c : (N, c_maxlen)
        self.q : (N, q_maxlen)
        self.x : (N, x_maxlen)
        '''
        # print(self.c)
        # print(self.q)
        # print(self.x)
        c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c),
                              1.0 - self.dropout)  # (N, c_maxlen, 300)
        q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q),
                              1.0 - self.dropout)  # (N, q_maxlen, 300)
        x_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.x),
                              1.0 - self.dropout)  # (N, x_maxlen, 300)
        # c_emb_elmo =
        # q_emb_elmo =
        # x_emb_elmo =
        c_emb = tf.concat([c_emb, ch_emb], axis=2)  # (N, c_maxlen, 396)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)  # (N, q_maxlen, 396)
        x_emb = tf.concat([x_emb, xh_emb], axis=2)  # (N, x_maxlen, 396)
        print(c_emb)
        print(c_emb.shape)
        c_emb = tf.concat([elmo_context_output['weighted_op'], c_emb],
                          axis=2)  # (N, c_maxlen, 1024 + 396)
        q_emb = tf.concat([elmo_question_output['weighted_op'], q_emb],
                          axis=2)  # (N, q_maxlen, 1024 + 396)
        x_emb = tf.concat([elmo_candidate_output['weighted_op'], x_emb],
                          axis=2)  # (N, x_maxlen, 1024 + 396)
        print(c_emb)
        print(c_emb.shape)
        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout,
                        reuse=None)  # (N, c_maxlen, 96)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout,
                        reuse=True)  # (N, q_maxlen, 96)
        x_emb = highway(x_emb, size=d, scope="highway", dropout=self.dropout,
                        reuse=True)  # (N, x_maxlen, 96)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        '''
        -> positional encoding
        -> layer normalization
        -> depth-wise separable convolution
        -> self attention
        -> feed-forward network
        In the paper: the total number of encoder blocks is 1.
        '''
        # (N, c_maxlen, 96)
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # (N, q_maxlen, 96)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        '''
        tf.tile(input, multiples, name=None): creates a new tensor by
        replicating input `multiples` times. The output tensor's i-th
        dimension has input.dims(i) * multiples[i] elements, and the values
        of input are replicated multiples[i] times along the i-th dimension.
        Paper: the layer parameters are the same as the Embedding Encoder
        Layer, except that the number of convolution layers is 2 within a
        block and the total number of blocks is 7.
        '''
        '''
        c: (N, c_maxlen, d)
        q: (N, q_maxlen, d)
        ch_emb: (N, c_maxlen, d)
        qh_emb: (N, q_maxlen, d)
        C: (N, c_maxlen, q_maxlen, d)
        Q: (N, c_maxlen, q_maxlen, d)
        S: (N, c_maxlen, q_maxlen)
        mask_q: (N, 1, q_maxlen)
        mask_c: (N, c_maxlen, 1)
        S_: (N, c_maxlen, q_maxlen)
        S_T: (N, q_maxlen, c_maxlen)
        self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
        self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)
        '''
        # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
        # C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        # Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        # S = trilinear([C, Q, C*Q], input_keep_prob=1.0 - self.dropout)
        # optimization from jasonwbw
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]
        # if config.q2c:
        #     attention_outputs.append(c * self.q2c)

    # with tf.variable_scope("Model_Encoder_Layer"):
    #     inputs = tf.concat(attention_outputs, axis=-1)
    #
    #     # same as a d x d MLP layer
    #     self.enc = [conv(inputs, d, name="input_projection")]  # d = hidden = 96
    #
    #     for i in range(3):
    #         if i % 2 == 0:  # dropout every 2 blocks
    #             self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
    #         self.enc.append(
    #             residual_block(self.enc[i],
    #                            num_blocks=7,
    #                            num_conv_layers=2,
    #                            kernel_size=5,
    #                            mask=self.c_mask,
    #                            num_filters=d,
    #                            num_heads=nh,
    #                            seq_len=self.c_len,
    #                            scope="Model_Encoder",
    #                            bias=False,
    #                            reuse=True if i > 0 else None,
    #                            dropout=self.dropout))
    #
    # DEBUG
    # self.debug_ops.append(inputs)
    # self.debug_ops.extend(self.enc)

    with tf.variable_scope("Output_Layer"):
        '''
        broadcasting: dimensions with size 1 are stretched or "copied" to
        match the other operand
        '''
        '''
        x_emb: (N, x_maxlen, d)
        inputs: (N, c_maxlen, 4*d)
        mask_x: (N, x_maxlen, 1)
        c_proj: (N, c_maxlen, d)
        S_xc/S_xc_: (N, x_maxlen, c_maxlen)
        x2c: (N, x_maxlen, d)
        xp_exp: (N, x_maxlen, c_maxlen, 1)
        c_proj_exp: (N, 1, c_maxlen, d)
        cand_context: (N, x_maxlen, c_maxlen, d)
        cand_context_pool: (N, x_maxlen, d)
        cand_condense: (N, x_maxlen, d*2)
        self.cand_condense: (N, x_maxlen, d)
        self.cand_logits: (N, x_maxlen, 1)
        '''
        inputs = tf.concat(attention_outputs, axis=-1)

        # masking candidate embedding
        mask_x = tf.expand_dims(self.x_mask, 2)
        c_proj = conv(inputs, d, name="context_projection")
        S_xc = optimized_trilinear_for_attention([x_emb, c_proj], self.x_maxlen,
                                                 self.c_maxlen,
                                                 input_keep_prob=1.0 - self.dropout)
        S_xc_ = tf.nn.softmax(mask_logits(S_xc, mask=mask_x))
        self.x2c = tf.matmul(S_xc_, c_proj)
        self.cand_condense = self.x2c

        if self.config.cand_condense_vector:
            xp_exp = tf.expand_dims(self.xp, axis=-1)
            c_proj_exp = tf.expand_dims(c_proj, axis=1)
            cand_context = tf.multiply(c_proj_exp, xp_exp)
            if self.config.cand_condense_conv:
                cand_context = tf.reshape(cand_context, [N * XL, PL, d])
                cand_context = conv(cand_context, d, bias=True,
                                    activation=tf.nn.relu, kernel_size=3,
                                    name="candidate_from_context")
                cand_context = tf.reshape(cand_context, [N, XL, -1, d])
            if self.config.cand_condense_pool:
                cand_context_pool = tf.reduce_max(cand_context, axis=-2)
            else:
                cand_context_pool = tf.reduce_mean(cand_context, axis=-2)
            cand_condense = tf.concat([self.x2c, cand_context_pool], axis=-1)
            self.cand_condense = conv(cand_condense, d, name="candidate_projection")
            if self.config.cand_fuse_vector:
                raise NotImplementedError
            # DEBUG
            self.debug_ops.extend([xp_exp, c_proj_exp, cand_context,
                                   cand_context_pool, cand_condense,
                                   self.cand_condense])

        if not config.max_margin:
            cand_logits = tf.squeeze(
                conv(self.cand_condense, 1, bias=False,
                     name="candidate_logits_1"), -1)
            self.cand_logits = mask_logits(cand_logits, mask=self.x_mask)
            loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.cand_logits, labels=self.yx)
            # DEBUG
            self.debug_ops.extend([loss, x_emb, c_proj, S_xc, S_xc_, self.x2c,
                                   self.x_mask, self.cand_logits, self.yx])
        else:
            cand_logits = conv(self.cand_condense, 1, bias=False,
                               name="candidate_logits_1")
            cand_logits = tf.tanh(cand_logits)
            cand_logits = tf.squeeze(
                conv(cand_logits, 1, bias=False, name="candidate_logits_2"), -1)
            self.cand_logits = tf.sigmoid(cand_logits)
            pos = tf.multiply(self.cand_logits, self.yx)
            pos = tf.reduce_max(pos, axis=-1)
            negs = tf.multiply(self.cand_logits, self.yx_inv)
            neg = tf.reduce_max(negs, axis=-1)
            loss = tf.maximum(tf.add(tf.subtract(neg, pos), config.margin), 0.0)
            # DEBUG
            self.debug_ops.extend([loss, x_emb, c_proj, S_xc, S_xc_, self.x2c,
                                   self.x_mask, self.cand_logits, self.yx,
                                   pos, negs, neg, self.yx, self.yx_inv])
        self.loss = tf.reduce_mean(loss)

    # with tf.variable_scope("Output_Layer"):
    #     '''
    #     tf.matrix_band_part: Copy a tensor setting everything outside a
    #     central band in each innermost matrix to zero.
    #     self.enc[i]: (N, c_maxlen, d)
    #     start_logits: (N, c_maxlen)
    #     end_logits: (N, c_maxlen)
    #     logits1: (N, c_maxlen)
    #     logits2: (N, c_maxlen)
    #     outer: (N, c_maxlen, c_maxlen)
    #     self.c_mask: (N, c_maxlen)
    #     yp1, yp2, losses, losses2: (N,)
    #     '''
    #
    #     # map vectors to scalars
    #     start_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
    #                                    bias=False, name="start_pointer"), -1)
    #     end_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
    #                                  bias=False, name="end_pointer"), -1)
    #     self.logits = [mask_logits(start_logits, mask=self.c_mask),
    #                    mask_logits(end_logits, mask=self.c_mask)]
    #
    #     logits1, logits2 = [l for l in self.logits]
    #
    #     losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
    #     losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
    #     self.loss = tf.reduce_mean(losses + losses2)
    #
    #     # find max-score span
    #     outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
    #                       tf.expand_dims(tf.nn.softmax(logits2), axis=1))
    #     # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
    #     outer = tf.matrix_band_part(outer, 0, config.ans_limit)
    #     self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
    #     self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
    #
    #     # DEBUG
    #     self.debug_ops.extend([start_logits, end_logits, logits1, logits2,
    #                            outer, self.yp1, self.yp2, losses, losses2, self.loss])

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
            self.assign_vars = []
            # self.shadow_vars = []
            # self.global_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.assign_vars.append(tf.assign(var, v))
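# ----------------------------------------------------------------------
# Feeding sketch (assumption, not part of the original file): self.c_elmo /
# self.q_elmo / self.x_elmo appear to be character-id placeholders for the
# biLM, filled with the bilm-tf Batcher built above, e.g.
#
#   context_ids = batcher.batch_sentences([["Some", "tokenized", "passage"]])
#   feed_dict[model.c_elmo] = context_ids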
def forward(self, trainable):
    config = self.config
    N, PL, QL, CL, d, dc, nh = config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.num_heads

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.ch),
            [N * PL, CL, dc])  # [words per sentence, chars per word, char embedding dim]
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder: the following produces the
        # convolved feature outputs
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)  # [batch, feature_len, d]
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # take the max feature along the char axis; k-max pooling could be
        # tried here instead of plain max pooling
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        # finally reshape back to the sentence-length dimension
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

        c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c),
                              1.0 - self.dropout)
        q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q),
                              1.0 - self.dropout)

        # concatenate word and char features: [batch, sequence_len, combined dim]
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        # acts as a gate over the information and reduces the representation
        # to 75 dims: [batch, seq_len, 75]
        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        # Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        # S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope('question_rnn'):
        self.gru = tf.contrib.rnn.GRUCell(d)
        initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
        output, state = tf.nn.dynamic_rnn(self.gru, q, initial_state=initstate)
        # self.qandc = tf.concat([self.q2c, self.c2q], axis=2)
        # self.qandc = dense(self.qandc, d)
        # output, state = tf.nn.dynamic_rnn(self.gru, self.qandc, initial_state=initstate)  # (32, ?, 75)

        # attend over each model-encoder output with the final question state
        state = tf.expand_dims(state, axis=2)
        weight1 = tf.matmul(self.enc[1], state)
        weight2 = tf.matmul(self.enc[2], state)
        weight3 = tf.matmul(self.enc[3], state)
        weight_enc1 = tf.multiply(self.enc[1], weight1)
        weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)
        weight_enc2 = tf.multiply(self.enc[2], weight2)
        weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)
        weight_enc3 = tf.multiply(self.enc[3], weight3)
        weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

    with tf.variable_scope("Output_Layer"):
        print(weight_enc1, "ggggggggggggggggg")
        inputs_shape = weight_enc1.get_shape().as_list()
        W = tf.get_variable("W", shape=[inputs_shape[-1], 3],
                            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
        self.l2_loss += tf.nn.l2_loss(W)
        self.l2_loss += tf.nn.l2_loss(b)
        self.scores1 = tf.nn.xw_plus_b(weight_enc1, W, b, name="scores")
        self.scores2 = tf.nn.xw_plus_b(weight_enc2, W, b, name="scores")
        self.scores3 = tf.nn.xw_plus_b(weight_enc3, W, b, name="scores")
        self.scores = (self.scores1 + self.scores2 + self.scores3) / 3.0
        print(self.scores)
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    if trainable:
        with tf.name_scope("loss"):
            print(self.scores, self.input_y, "llllllllllllllll")
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores,
                                                             labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + self.l2_reg_lambda * self.l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"),
                                           name="accuracy")
        # losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.assign_vars.append(tf.assign(var, v))

        self.lr = tf.minimum(
            config.init_lr,
            0.001 / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1))
        self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8,
                                          beta2=0.999, epsilon=1e-7)
        grads = self.opt.compute_gradients(self.loss)
        gradients, variables = zip(*grads)
        capped_grads, _ = tf.clip_by_global_norm(gradients, config.grad_clip)
        self.train_op = self.opt.apply_gradients(zip(capped_grads, variables),
                                                 global_step=self.global_step)
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
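# ----------------------------------------------------------------------
# Note on the schedule above: lr rises logarithmically from 0 and reaches
# 0.001 at roughly step 999, after which tf.minimum caps it at config.init_lr,
# matching the QANet-style warm-up.
# Usage sketch (assumption, not part of the original file): one training step
# for the classifier built above; `feed` must supply the context/question
# inputs and the input_y labels.
#
#   loss, acc, _ = sess.run([model.loss, model.accuracy, model.train_op],
#                           feed_dict=feed)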
def forward(self):
    config = self.config
    N, PL, QL, CL, d, dc, nh, AL1, AL2, AL3 = (
        config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit,
        config.hidden, config.char_dim, config.num_heads,
        self.aletr1_maxlen, self.aletr2_maxlen, self.aletr3_maxlen)

    with tf.variable_scope("Input_Embedding_Layer"):
        # [words per batch, chars per word, char embedding dim]
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])
        qh_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])
        self.alternati_emb1 = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.alter1h), [N * AL1, CL, dc])
        self.alternati_emb2 = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.alter2h), [N * AL2, CL, dc])
        self.alternati_emb3 = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.alter3h), [N * AL3, CL, dc])

        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
        alternati_emb1 = tf.nn.dropout(self.alternati_emb1, 1.0 - 0.5 * self.dropout)
        alternati_emb2 = tf.nn.dropout(self.alternati_emb2, 1.0 - 0.5 * self.dropout)
        alternati_emb3 = tf.nn.dropout(self.alternati_emb3, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder: convolve the char embeddings,
        # sharing the "char_conv" weights across all five inputs.
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)  # [batch, feature_len, d]
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)
        alternati_emb1 = conv(alternati_emb1, d, bias=True, activation=tf.nn.relu,
                              kernel_size=5, name="char_conv", reuse=True)
        alternati_emb2 = conv(alternati_emb2, d, bias=True, activation=tf.nn.relu,
                              kernel_size=5, name="char_conv", reuse=True)
        alternati_emb3 = conv(alternati_emb3, d, bias=True, activation=tf.nn.relu,
                              kernel_size=5, name="char_conv", reuse=True)

        # Max over the char axis (k-max pooling could be tried here instead of max pooling).
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        alternati_emb1 = tf.reduce_max(alternati_emb1, axis=1)
        alternati_emb2 = tf.reduce_max(alternati_emb2, axis=1)
        alternati_emb3 = tf.reduce_max(alternati_emb3, axis=1)

        # Reshape back to [batch, seq_len, d].
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
        alternati_emb1 = tf.reshape(alternati_emb1, [N, AL1, qh_emb.shape[-1]])
        alternati_emb2 = tf.reshape(alternati_emb2, [N, AL2, qh_emb.shape[-1]])
        alternati_emb3 = tf.reshape(alternati_emb3, [N, AL3, qh_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)
        # Word embeddings of the three alternative answers.
        alter_embedding1 = tf.nn.embedding_lookup(self.word_mat, self.alter1)
        alter_embedding2 = tf.nn.embedding_lookup(self.word_mat, self.alter2)
        alter_embedding3 = tf.nn.embedding_lookup(self.word_mat, self.alter3)

        # Concatenate word and char features: [batch, seq_len, word_dim + char_dim].
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        alter_embedding1 = tf.concat([alter_embedding1, alternati_emb1], axis=2)
        alter_embedding2 = tf.concat([alter_embedding2, alternati_emb2], axis=2)
        alter_embedding3 = tf.concat([alter_embedding3, alternati_emb3], axis=2)

        # Highway network: gates the combined features and projects them
        # down to d, giving [batch, seq_len, d].
        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        self.alter_embedding1 = c_emb
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)
        alter_embedding1 = highway(alter_embedding1, size=d, scope="highway",
                                   dropout=self.dropout, reuse=True)
        alter_embedding2 = highway(alter_embedding2, size=d, scope="highway",
                                   dropout=self.dropout, reuse=True)
        alter_embedding3 = highway(alter_embedding3, size=d, scope="highway",
                                   dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb, num_blocks=1, num_conv_layers=4, kernel_size=7,
                           mask=self.c_mask, num_filters=d, num_heads=nh,
                           seq_len=self.c_len, scope="Encoder_Residual_Block",
                           bias=False, dropout=self.dropout)
        q = residual_block(q_emb, num_blocks=1, num_conv_layers=4, kernel_size=7,
                           mask=self.q_mask, num_filters=d, num_heads=nh,
                           seq_len=self.q_len, scope="Encoder_Residual_Block",
                           reuse=True,  # share the weights between passage and question
                           bias=False, dropout=self.dropout)
        alter1 = residual_block(alter_embedding1, num_blocks=1, num_conv_layers=4,
                                kernel_size=7, mask=self.alter1_mask, num_filters=d,
                                num_heads=nh, seq_len=self.alterh1_len,
                                scope="Encoder_Residual_Block", reuse=True,
                                bias=False, dropout=self.dropout)
        alter2 = residual_block(alter_embedding2, num_blocks=1, num_conv_layers=4,
                                kernel_size=7, mask=self.alter2_mask, num_filters=d,
                                num_heads=nh, seq_len=self.alter2_len,
                                scope="Encoder_Residual_Block", reuse=True,
                                bias=False, dropout=self.dropout)
        alter3 = residual_block(alter_embedding3, num_blocks=1, num_conv_layers=4,
                                kernel_size=7, mask=self.alter3_mask, num_filters=d,
                                num_heads=nh, seq_len=self.alter3_len,
                                scope="Encoder_Residual_Block", reuse=True,
                                bias=False, dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
        # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
        # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i], num_blocks=7, num_conv_layers=2,
                               kernel_size=5, mask=self.c_mask, num_filters=d,
                               num_heads=nh, seq_len=self.c_len,
                               scope="Model_Encoder", bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope('question_rnn'):
        # Encode the question with a GRU, then run each alternative answer
        # through the same cell, initialized from the question's final state.
        self.gru = tf.contrib.rnn.GRUCell(d)
        initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
        output, state = tf.nn.dynamic_rnn(self.gru, q, initial_state=initstate)
        # Alternative: run the GRU over the fused attention representation instead.
        # self.qandc = tf.concat([self.q2c, self.c2q], axis=2)
        # self.qandc = dense(self.qandc, d)
        # output, state = tf.nn.dynamic_rnn(self.gru, self.qandc, initial_state=initstate)  # (32, ?, 75)
        output1, state1 = tf.nn.dynamic_rnn(self.gru, alter1, initial_state=state)
        output2, state2 = tf.nn.dynamic_rnn(self.gru, alter2, initial_state=state)
        output3, state3 = tf.nn.dynamic_rnn(self.gru, alter3, initial_state=state)

        # Attention-pool each model-encoder output against the question state:
        # [N, n_c, d] x [N, d, 1] -> [N, n_c, 1] weights, then weighted sum -> [N, d].
        state = tf.expand_dims(state, axis=2)
        weight1 = tf.matmul(self.enc[1], state)
        weight2 = tf.matmul(self.enc[2], state)
        weight3 = tf.matmul(self.enc[3], state)
        weight_enc1 = tf.reduce_sum(tf.multiply(self.enc[1], weight1), axis=1)
        weight_enc2 = tf.reduce_sum(tf.multiply(self.enc[2], weight2), axis=1)
        weight_enc3 = tf.reduce_sum(tf.multiply(self.enc[3], weight3), axis=1)

    with tf.variable_scope("Output_Layer"):
        # The standard QANet start/end pointer head is dropped here; instead,
        # each alternative is scored by the cosine similarity between the pooled
        # passage encoding and that alternative's GRU state.
        similary1 = tf.expand_dims(self.cos_sine(weight_enc1, state1), axis=1)
        similary2 = tf.expand_dims(self.cos_sine(weight_enc2, state2), axis=1)
        similary3 = tf.expand_dims(self.cos_sine(weight_enc3, state3), axis=1)
        self.logits1 = tf.nn.softmax(
            tf.concat([similary1, similary2, similary3], axis=1))
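# `self.cos_sine` is called above but not defined in this snippet. A minimal
# sketch of a batched cosine-similarity helper matching the shapes used here
# ([batch, d] x [batch, d] -> [batch]) -- an assumption, not the author's code:
def cos_sine(self, a, b):
    a_norm = tf.nn.l2_normalize(a, axis=-1)  # unit-normalize each row
    b_norm = tf.nn.l2_normalize(b, axis=-1)
    return tf.reduce_sum(a_norm * b_norm, axis=-1)  # cosine similarity per example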
def build_model(self):
    PL, QL, CL, d, dc, nh = (self.c_maxlen, self.q_maxlen, self.char_limit,
                             self.filters, self.char_dim, self.num_heads)

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.contc_input), [-1, CL, dc])
        qh_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.quesc_input), [-1, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        ch_emb = tf.reshape(ch_emb, [-1, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [-1, QL, ch_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.contw_input), 1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.quesw_input), 1.0 - self.dropout)
        # if self.use_cove:
        #     c_emb_cove = self.cove_model(c_emb)
        #     q_emb_cove = self.cove_model(q_emb)
        #     c_emb = tf.concat([c_emb, c_emb_cove], axis=-1)
        #     q_emb = tf.concat([q_emb, q_emb_cove], axis=-1)
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        if self.use_elmo:
            c_emb = tf.concat([c_emb, self.cont_elmo], axis=-1)
            q_emb = tf.concat([q_emb, self.ques_elmo], axis=-1)
        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb, num_blocks=1, num_conv_layers=4, kernel_size=7,
                           mask=self.c_mask, num_filters=d, num_heads=nh,
                           seq_len=self.cont_len, scope="Encoder_Residual_Block",
                           bias=False, dropout=self.dropout)
        q = residual_block(q_emb, num_blocks=1, num_conv_layers=4, kernel_size=7,
                           mask=self.q_mask, num_filters=d, num_heads=nh,
                           seq_len=self.ques_len, scope="Encoder_Residual_Block",
                           reuse=True, bias=False, dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        c2q = tf.matmul(S_, q)
        q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, c2q, c * c2q, c * q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        attention_inputs = tf.concat(attention_outputs, axis=-1)
        enc = [conv(attention_inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                enc[i] = tf.nn.dropout(enc[i], 1.0 - self.dropout)
            enc.append(
                residual_block(enc[i], num_blocks=7, num_conv_layers=2, kernel_size=5,
                               mask=self.c_mask, num_filters=d, num_heads=nh,
                               seq_len=self.cont_len, scope="Model_Encoder",
                               bias=False, reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.concat([enc[1], enc[2]], axis=-1)
        end_logits = tf.concat([enc[1], enc[3]], axis=-1)
        if self.use_elmo:
            start_logits = tf.concat((start_logits, self.cont_elmo), axis=-1)
            end_logits = tf.concat((end_logits, self.cont_elmo), axis=-1)
        start_logits = tf.squeeze(
            conv(start_logits, 1, bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(end_logits, 1, bias=False, name="end_pointer"), -1)

        # Learnable "no answer" bias prepended to both logit vectors
        # (SQuAD 2.0-style unanswerable handling).
        unanswer_bias = tf.get_variable(
            "unanswer_bias", [1],
            regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            initializer=tf.zeros_initializer())
        unanswer_bias = tf.reshape(tf.tile(unanswer_bias, [self.batch_size]), [-1, 1])
        self.logits1 = tf.concat(
            (unanswer_bias, mask_logits(start_logits, mask=self.c_mask)), axis=-1)
        self.logits2 = tf.concat(
            (unanswer_bias, mask_logits(end_logits, mask=self.c_mask)), axis=-1)

        start_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits1, labels=self.y_start)
        end_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits2, labels=self.y_end)
        self.loss = tf.reduce_mean(start_loss + end_loss)

        if self.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
            self.loss += l2_loss

        # Output: index 0 is the "no answer" slot, so shift span indices by -1.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, self.ans_limit)
        self.output1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) - 1
        self.output2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) - 1

    if self.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
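# `mask_logits` pushes padded positions to a large negative value so that the
# following softmax assigns them near-zero probability. The usual QANet-style
# implementation looks like this (assumed; the imported version may differ):
def mask_logits(inputs, mask, mask_value=-1e30):
    mask = tf.cast(mask, tf.float32)
    return inputs + mask_value * (1.0 - mask)  # masked slots become ~ -1e30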
def pred(self):
    with tf.variable_scope("embedding_layer"):
        (self.questions, question_lengths), (self.contexts, context_lengths), \
            self.answers = self.iterator.get_next()
        max_context_length = tf.reduce_max(context_lengths)
        max_question_length = tf.reduce_max(question_lengths)
        # max_context_length = self.train_max_context_length
        # max_question_length = self.train_max_question_length
        context_mask = tf.sequence_mask(context_lengths, maxlen=max_context_length)
        question_mask = tf.sequence_mask(question_lengths, maxlen=max_question_length)
        question_embeddings = tf.nn.embedding_lookup(self.embedding, self.questions)
        context_embeddings = tf.nn.embedding_lookup(self.embedding, self.contexts)
        print('question_embeddings', question_embeddings.get_shape().as_list())
        print('context_embeddings', context_embeddings.get_shape().as_list())

    with tf.variable_scope("embedding_layer"):
        c = residual_block(context_embeddings, num_blocks=1, num_conv_layers=1,
                           kernel_size=7, mask=context_mask,
                           num_filters=self.lstm_hidden_size, num_heads=1,
                           seq_len=max_context_length,
                           scope="Encoder_Residual_Block", bias=False,
                           dropout=1.0 - self.keep_prob)
        print('c', c.get_shape().as_list())
        q = residual_block(question_embeddings, num_blocks=1, num_conv_layers=1,
                           kernel_size=7, mask=question_mask,
                           num_filters=self.lstm_hidden_size, num_heads=1,
                           seq_len=max_question_length,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False, dropout=1.0 - self.keep_prob)
        print('q', q.get_shape().as_list())

    # context_output dimension is BS * max_context_length * d,
    # where d = 2 * lstm_hidden_size
    with tf.variable_scope("attention_layer"):
        S = optimized_trilinear_for_attention([c, q], max_context_length,
                                              max_question_length,
                                              input_keep_prob=self.keep_prob)
        mask_q = tf.expand_dims(question_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(context_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("modeling_layer"):
        attention = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(attention, self.lstm_hidden_size, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], self.keep_prob)
            self.enc.append(
                residual_block(self.enc[i], num_blocks=1, num_conv_layers=1,
                               kernel_size=5, mask=context_mask,
                               num_filters=self.lstm_hidden_size, num_heads=1,
                               seq_len=max_context_length, scope="Model_Encoder",
                               bias=False, reuse=True if i > 0 else None,
                               dropout=1.0 - self.keep_prob))
            print('self.enc[i]', self.enc[i].get_shape().as_list())

    with tf.variable_scope("output_layer_start"):
        pred_start = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        print('pred_start', pred_start.get_shape().as_list())
        self.pred_start = preprocess_softmax(pred_start, context_mask)
        print('self.pred_start', self.pred_start.get_shape().as_list())

    with tf.variable_scope("output_layer_end"):
        pred_end = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        print('pred_end', pred_end.get_shape().as_list())
        self.pred_end = preprocess_softmax(pred_end, context_mask)
        print('self.pred_end', self.pred_end.get_shape().as_list())

    self.preds = tf.transpose([
        tf.argmax(self.pred_start, axis=1),
        tf.argmax(self.pred_end, axis=1)
    ])
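# `preprocess_softmax` is not defined in this snippet; from its call sites it
# appears to mask the pointer logits and normalize them over valid positions.
# A plausible sketch (an assumption about the helper's behavior):
def preprocess_softmax(logits, mask, mask_value=-1e30):
    masked = logits + mask_value * (1.0 - tf.cast(mask, tf.float32))
    return tf.nn.softmax(masked, axis=-1)  # probabilities over unpadded positions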
def build_model(self):
    PL, QL, CL, d, dc, nh = (self.c_maxlen, self.q_maxlen, self.char_limit,
                             self.filters, self.char_dim, self.num_heads)

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.contc_input), [-1, CL, dc])
        qh_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.quesc_input), [-1, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        ch_emb = tf.reshape(ch_emb, [-1, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [-1, QL, ch_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.contw_input), 1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.quesw_input), 1.0 - self.dropout)
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        if self.use_elmo:
            c_emb = tf.concat([c_emb, self.cont_elmo], axis=-1)
            q_emb = tf.concat([q_emb, self.ques_elmo], axis=-1)
        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb, num_blocks=1, num_conv_layers=4, kernel_size=7,
                           mask=self.c_mask, num_filters=d, num_heads=nh,
                           seq_len=self.cont_len, scope="Encoder_Residual_Block",
                           bias=False, dropout=self.dropout)
        q = residual_block(q_emb, num_blocks=1, num_conv_layers=4, kernel_size=7,
                           mask=self.q_mask, num_filters=d, num_heads=nh,
                           seq_len=self.ques_len, scope="Encoder_Residual_Block",
                           reuse=True, bias=False, dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        c2q = tf.matmul(S_, q)
        q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, c2q, c * c2q, c * q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        attention_inputs = tf.concat(attention_outputs, axis=-1)
        enc = [conv(attention_inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                enc[i] = tf.nn.dropout(enc[i], 1.0 - self.dropout)
            enc.append(
                residual_block(enc[i], num_blocks=7, num_conv_layers=2, kernel_size=5,
                               mask=self.c_mask, num_filters=d, num_heads=nh,
                               seq_len=self.cont_len, scope="Model_Encoder",
                               bias=False, reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.concat([enc[1], enc[2]], axis=-1)
        end_logits = tf.concat([enc[1], enc[3]], axis=-1)
        if self.use_elmo:
            start_logits = tf.concat((start_logits, self.cont_elmo), axis=-1)
            end_logits = tf.concat((end_logits, self.cont_elmo), axis=-1)
        start_logits = tf.squeeze(
            conv(start_logits, 1, bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(end_logits, 1, bias=False, name="end_pointer"), -1)

        # SQuAD 2.0-style unanswerable bias, kept for reference:
        # unanswer_bias = tf.get_variable("unanswer_bias", [1],
        #     regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
        #     initializer=tf.zeros_initializer())
        # unanswer_bias = tf.reshape(tf.tile(unanswer_bias, [self.batch_size]), [-1, 1])
        # self.logits1 = tf.concat((unanswer_bias, mask_logits(start_logits, mask=self.c_mask)), axis=-1)
        # self.logits2 = tf.concat((unanswer_bias, mask_logits(end_logits, mask=self.c_mask)), axis=-1)
        self.logits1 = mask_logits(start_logits, mask=self.c_mask)
        self.logits2 = mask_logits(end_logits, mask=self.c_mask)

        start_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits1, labels=self.y_start)
        end_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits2, labels=self.y_end)
        self.loss = tf.reduce_mean(start_loss + end_loss)

        # output
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, self.ans_limit)
        self.output1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.output2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

    if self.use_topk:
        with tf.variable_scope("Topk_Layer"):
            top_size = 3
            # Flatten the (start, end) score matrix and take the top-k spans.
            outer = tf.reshape(outer, [self.batch_size, -1])
            outer_inds = tf.nn.top_k(outer, top_size).indices  # [N, top_size]
            self.yp1 = outer_inds // tf.shape(self.logits1)[-1]  # start indices
            self.yp2 = outer_inds % tf.shape(self.logits2)[-1]   # end indices

            def sen_mask(tensor):
                # Build a [context_len, filters] 0/1 mask that is 1 inside [a, b).
                def sen_mask_(a, b, filters):
                    try:
                        mata = tf.zeros([a, filters], tf.int32)
                    except Exception:
                        mata = []
                    matb = tf.ones([b - a, filters], tf.int32)
                    matc = tf.zeros([tf.shape(self.logits1)[-1] - b, filters], tf.int32)
                    return tf.concat((mata, matb, matc), axis=0)

                return tf.map_fn(lambda x: sen_mask_(x[0], x[1], self.filters), tensor)

            self.yp3 = self.yp2 + 1
            self.yp1 = tf.expand_dims(self.yp1, -1)
            self.yp2 = tf.expand_dims(self.yp2, -1)
            self.yp3 = tf.expand_dims(self.yp3, -1)
            self.y_mask = tf.concat([self.yp1, self.yp3], axis=-1)
            self.y_mask = tf.map_fn(lambda x: sen_mask(x), self.y_mask)

            # answer: pool the c2q representation inside each candidate span
            c = tf.tile(tf.expand_dims(c2q, 1), [1, top_size, 1, 1])
            c_topk = tf.multiply(tf.cast(self.y_mask, tf.float32), c)
            W1 = tf.get_variable("W1", initializer=tf.ones([1, 1, 1, self.filters]))
            W1 = tf.tile(W1, [self.batch_size, top_size, 1, 1])
            alpha1 = tf.nn.softmax(tf.matmul(W1, c_topk, transpose_b=True), axis=2)
            answer = tf.matmul(alpha1, c_topk)  # [32, top_size, 1, 128]

            # question: attention-pool the question encoding
            W2 = tf.get_variable("W2", initializer=tf.ones([1, 1, self.filters]))
            W2 = tf.tile(W2, [self.batch_size, 1, 1])
            alpha2 = tf.nn.softmax(tf.matmul(W2, q, transpose_b=True), axis=1)
            ques = tf.matmul(alpha2, q)
            ques = tf.tile(tf.expand_dims(ques, 1), [1, top_size, 1, 1])  # [32, top_size, 1, 128]

            # question & answer bilinear score
            W3 = tf.get_variable("W3", initializer=tf.ones([1, 1, self.filters, self.filters]))
            W3 = tf.tile(W3, [self.batch_size, top_size, 1, 1])
            y_topk_logits = tf.nn.sigmoid(
                tf.matmul(ques, tf.matmul(W3, answer, transpose_b=True)))  # [32, top_size, 1, 1]
            y_topk_logits = tf.squeeze(y_topk_logits)  # [32, top_size]

            self.yp1 = tf.squeeze(self.yp1)
            self.yp2 = tf.squeeze(self.yp2)
            coeff1_topk = tf.one_hot(self.yp1, self.c_maxlen, axis=-1)  # [32, top_size, 400] one-hot
            coeff2_topk = tf.one_hot(self.yp2, self.c_maxlen, axis=-1)
            # One-hot start/end -> inclusive 0/1 span mask, e.g.
            # [0,1,0,0,0], [0,0,0,1,0] -> [0,1,1,1,1] - [0,0,0,1,1] + [0,0,0,1,0] = [0,1,1,1,0]
            coeff1_topk_cumsum = tf.cumsum(coeff1_topk, axis=-1)
            coeff2_topk_cumsum = tf.cumsum(coeff2_topk, axis=-1)
            self.y_d = coeff1_topk_cumsum - coeff2_topk_cumsum + coeff2_topk  # [32, top_size, 400]

            def clip_for_sigmoid(output):
                # Convert probabilities back to logits, clipping away the 0/1 endpoints.
                _epsilon = tf.convert_to_tensor(1e-7, dtype=output.dtype.base_dtype)
                output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
                return tf.log(output / (1 - output))

            if self.topk_loss == 'f1':
                # f1 loss: supervise each candidate span with its token-level F1
                y_start_ind = tf.cumsum(self.y_start, axis=-1)
                y_end_ind = tf.cumsum(self.y_end, axis=-1)
                y_gtd = y_start_ind - y_end_ind + self.y_end  # [32, 400]

                def cal_num_same(y_pred, y_truth):  # [top_size, 400], [400,]
                    def cal_num_same_(y_pred_, y_truth):  # [400,], [400,]
                        return tf.reduce_sum(
                            tf.cast(tf.logical_and(tf.cast(y_pred_, tf.bool),
                                                   tf.cast(y_truth, tf.bool)),
                                    tf.float32), axis=-1)

                    # tf.map_fn over a two-tensor input must return a two-tensor
                    # structure, so the result is returned twice and [0] is taken below.
                    return [tf.map_fn(lambda x: cal_num_same_(x, y_truth), y_pred),
                            tf.map_fn(lambda x: cal_num_same_(x, y_truth), y_pred)]

                num_same = tf.map_fn(lambda x: cal_num_same(x[0], x[1]),
                                     [self.y_d, y_gtd])[0]  # [32, top_size]
                y_precision = num_same / (
                    tf.cast(tf.reduce_sum(self.y_d, axis=-1), tf.float32) + 1e-8)  # [32, top_size]
                y_recall = num_same / tf.expand_dims(
                    tf.cast(tf.reduce_sum(y_gtd, axis=-1), tf.float32) + 1e-8, axis=-1)  # [32, top_size]
                y_f1 = (2.0 * y_precision * y_recall) / (
                    tf.cast(y_precision + y_recall, tf.float32) + 1e-8)  # [32, top_size]
                topk_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=clip_for_sigmoid(y_topk_logits), labels=y_f1))
            elif self.topk_loss == 'em':
                # em loss: supervise each candidate span with exact match
                start_em = tf.equal(
                    tf.cast(tf.expand_dims(tf.argmax(self.y_start, axis=-1), axis=1), tf.int32),
                    tf.cast(self.yp1, tf.int32))  # [32, top_size]
                end_em = tf.equal(
                    tf.cast(tf.expand_dims(tf.argmax(self.y_end, axis=-1), axis=1), tf.int32),
                    tf.cast(self.yp2, tf.int32))  # [32, top_size]
                y_em = tf.cast(tf.logical_and(start_em, end_em), tf.float32)  # [32, top_size]
                topk_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=clip_for_sigmoid(y_topk_logits), labels=y_em))

            # final loss
            self.Lambda1 = tf.get_variable("Lambda1", initializer=tf.constant([0.9]),
                                           trainable=False)
            self.loss = tf.reduce_mean(self.Lambda1 * (start_loss + end_loss)
                                       + (1 - self.Lambda1) * topk_loss)

            # output
            outer_topk = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                                   tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
            outer_topk = tf.matrix_band_part(outer_topk, 0, self.ans_limit)
            self.output1 = tf.argmax(tf.reduce_max(outer_topk, axis=2), axis=1)
            self.output2 = tf.argmax(tf.reduce_max(outer_topk, axis=1), axis=1)

            # diversity loss: penalize overlap among the top-k spans
            if self.diversity_loss:
                self.Lambda2 = tf.get_variable("Lambda2", initializer=tf.constant([0.1]),
                                               trainable=False)
                diversity_loss = tf.reduce_mean(
                    tf.reduce_prod(self.y_d, axis=1), axis=-1)  # [32, top_size, 400] -> [32, 400] -> [32,]
                self.loss = self.loss + tf.reduce_mean(self.Lambda2 * diversity_loss)

    if self.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
        self.loss += l2_loss

    if self.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
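# The cumsum identity used above (span = cumsum(start) - cumsum(end) + end) turns
# one-hot start/end vectors into an inclusive 0/1 span mask. A quick NumPy check
# of that identity (illustrative only, not part of the model):
import numpy as np
s = np.array([0, 1, 0, 0, 0], dtype=np.float32)  # start one-hot at index 1
e = np.array([0, 0, 0, 1, 0], dtype=np.float32)  # end one-hot at index 3
span = np.cumsum(s) - np.cumsum(e) + e
assert span.tolist() == [0.0, 1.0, 1.0, 1.0, 0.0]  # positions 1..3 marked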
def forward(self):
    config = self.config
    '''
    N:  batch_size
    PL: maximum passage length
    QL: maximum question length
    CL: maximum word length in characters
    d:  number of output channels
    dc: character embedding dimension
    nh: number of self-attention heads
    '''
    N, PL, QL, CL, d, dc, nh = (
        config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen,
        config.char_limit, config.hidden, config.char_dim, config.num_heads)

    # Embedding layer: concatenate word vectors and character vectors
    with tf.variable_scope("Input_Embedding_Layer"):
        # Character embedding:
        # 1. embed every character of every word (char2vec)
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # 2. encode each word's character matrix with a convolution
        # ch_emb_shape = [N * PL, CL - 5 + 1, d], qh_emb_shape = [N * QL, CL - 5 + 1, d]
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # max-over-time pooling
        # ch_emb_shape = [N * PL, d], qh_emb_shape = [N * QL, d]
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        # ch_emb_shape = [N, PL, d], qh_emb_shape = [N, QL, d]
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

        # Word embedding: taken from GloVe
        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)

        # Concatenate word and character vectors
        # c_emb_size = [batch, n_c, c_emb + ch_emb]
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        # q_emb_size = [batch, n_q, q_emb + qh_emb]
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        # Pass each through a highway network
        # c_emb_size = [batch, n_c, d]
        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        # q_emb_size = [batch, n_q, d]
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

    # Stacked Embedding Encoder Blocks: one encoder block with four conv layers
    # (kernel size 7), d = 96 filters
    with tf.variable_scope("Embedding_Encoder_Layer"):
        # c_size = [batch, n_c, d]
        c = residual_block(c_emb, num_blocks=1, num_conv_layers=4, kernel_size=7,
                           mask=self.c_mask, num_filters=d, num_heads=nh,
                           seq_len=self.c_len, scope="Encoder_Residual_Block",
                           bias=False, dropout=self.dropout)
        # q_size = [batch, n_q, d]
        q = residual_block(q_emb, num_blocks=1, num_conv_layers=4, kernel_size=7,
                           mask=self.q_mask, num_filters=d, num_heads=nh,
                           seq_len=self.q_len, scope="Encoder_Residual_Block",
                           reuse=True,  # passage and question share the encoder weights
                           bias=False, dropout=self.dropout)

    # Context-Query attention:
    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
        # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
        # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
        # S_size = [batch, n_c, n_q], q_size = [batch, n_q, d], c_size = [batch, n_c, d]
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        # softmax along the n_q axis
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q), dim=-1)
        mask_c = tf.expand_dims(self.c_mask, 2)
        # softmax along the n_c axis
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        # c2q_size = [batch, n_c, d]
        self.c2q = tf.matmul(S_, q)
        # q2c_size = [batch, n_c, d]
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        # attention_size = [4, batch, n_c, d]
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    # Stacked Model Encoder Blocks: seven blocks, two conv layers each, d = 96 filters
    with tf.variable_scope("Model_Encoder_Layer"):
        # Concatenate c, c2q, c*c2q, c*q2c along the channel axis
        # input_shape = [batch, n_c, 4d]
        inputs = tf.concat(attention_outputs, axis=-1)
        # self.enc[i]_shape = [batch, n_c, d]
        self.enc = [conv(inputs, d, name="input_projection")]
        # three stacks of Model Encoder Blocks
        for i in range(3):
            if i % 2 == 0:  # dropout every two blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i], num_blocks=7, num_conv_layers=2,
                               kernel_size=5, mask=self.c_mask, num_filters=d,
                               num_heads=nh, seq_len=self.c_len,
                               scope="Model_Encoder", bias=False,
                               reuse=True if i > 0 else None,  # the three stacks share weights
                               dropout=self.dropout))

    # Output layer:
    with tf.variable_scope("Output_Layer"):
        # Concatenate the first and second stack outputs along the channel axis
        # start_logits_shape = [batch, n_c] after the squeeze
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        # Concatenate the first and third stack outputs along the channel axis
        # end_logits_shape = [batch, n_c] after the squeeze
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [mask_logits(start_logits, mask=self.c_mask),
                       mask_logits(end_logits, mask=self.c_mask)]
        logits1, logits2 = [l for l in self.logits]

        # outer_shape = [batch, n_c, n_c]
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        # Keep entries with start <= end and end - start <= ans_limit; zero the rest
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        # row index of the maximum: the start position
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        # column index of the maximum: the end position
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    # L2 regularization
    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # ops passed to control_dependencies run before the ops created inside the block
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.assign_vars.append(tf.assign(var, v))
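# tf.matrix_band_part(outer, 0, ans_limit) keeps outer[i, j] only where
# 0 <= j - i <= ans_limit, i.e. start <= end and the span is at most ans_limit
# tokens long. An equivalent NumPy illustration with ans_limit = 1
# (illustrative only, not part of the model):
import numpy as np
p = np.arange(9, dtype=np.float32).reshape(3, 3)
band = np.triu(p) - np.triu(p, 2)  # same effect as tf.matrix_band_part(p, 0, 1)
# band == [[0., 1., 0.],
#          [0., 4., 5.],
#          [0., 0., 8.]]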
def __init__(self, config, batch, word_mat=None, filter_sizes=None,
             embedding_size=None, num_filters=None, trainable=True,
             l2_reg_lambda=0.0, keep_prob=0.9, graph=None):
    # Placeholders for input, output and dropout
    self.config = config
    self.graph = graph if graph is not None else tf.Graph()
    self.trainable = trainable
    if trainable:
        # self.input_y has shape (64, 3); self.qh has shape
        # [batch, 3, alternative_len, char_len]
        self.input_x, self.input_x1, self.ch, self.qh, self.input_y, self.qa_id = batch.get_next()
    else:
        self.input_x, self.input_x1, self.ch, self.qh = batch.get_next()
    self.dropout_keep_prob = keep_prob
    self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32,
                                       initializer=tf.constant_initializer(0),
                                       trainable=False)
    self.dropout = tf.placeholder_with_default(0.5, (), name="dropout")

    # Keeping track of l2 regularization loss (optional)
    l2_loss = tf.constant(0.0)

    # Embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        self.W = tf.get_variable("word_mat",
                                 initializer=tf.constant(word_mat, dtype=tf.float32),
                                 trainable=True)
        # inputs are zero-padded to a uniform length, so nonzero ids mark real tokens
        self.c_mask = tf.cast(self.input_x, tf.bool)
        self.q_mask = tf.cast(self.input_x1, tf.bool)
        if trainable:
            self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit
        else:
            self.c_maxlen, self.q_maxlen = config.test_para_limit, config.test_ques_limit
        self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
        self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)
        self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
        self.embedded_question = tf.expand_dims(self.embedded_chars1, -1)
        S = optimized_trilinear_for_attention(
            [self.embedded_chars_expanded, self.embedded_question],
            self.c_maxlen, self.q_maxlen, input_keep_prob=1.0 - self.dropout)

    # Create a convolution + maxpool layer for each filter size
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv = tf.nn.conv2d(self.embedded_chars_expanded, W,
                                strides=[1, 1, 1, 1], padding="VALID", name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Maxpooling over the outputs
            pooled = tf.nn.max_pool(h,
                                    ksize=[1, config.para_limit - filter_size + 1, 1, 1],
                                    strides=[1, 1, 1, 1], padding='VALID', name="pool")
            pooled_outputs.append(pooled)

    # Combine all the pooled features
    num_filters_total = num_filters * len(filter_sizes)
    self.h_pool = tf.concat(pooled_outputs, 3)
    self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

    # Add dropout
    with tf.name_scope("dropout"):
        self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        W = tf.get_variable("W", shape=[num_filters_total, 3],
                            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    # Calculate mean cross-entropy loss
    with tf.name_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores,
                                                         labels=self.input_y)
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

    # Accuracy
    with tf.name_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"),
                                       name="accuracy")

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.assign_vars.append(tf.assign(var, v))

    # Logarithmic warmup: the learning rate rises to 0.001 over roughly the
    # first 1000 steps, capped at config.init_lr.
    self.lr = tf.minimum(
        config.init_lr,
        0.001 / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1))
    self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8,
                                      beta2=0.999, epsilon=1e-7)
    grads = self.opt.compute_gradients(self.loss)
    gradients, variables = zip(*grads)
    capped_grads, _ = tf.clip_by_global_norm(gradients, config.grad_clip)
    self.train_op = self.opt.apply_gradients(zip(capped_grads, variables),
                                             global_step=self.global_step)
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
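# The `assign_vars` ops built above are meant to be run once before evaluation so
# that inference uses the exponentially averaged weights instead of the raw ones.
# A minimal usage sketch (the session, saver, and checkpoint path are assumptions):
def restore_ema_weights(sess, model, saver, checkpoint_path):
    saver.restore(sess, checkpoint_path)  # load the trained variables
    sess.run(model.assign_vars)           # overwrite each variable with its EMA shadow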