def _encode(self):
    """Encode the context and question embeddings with one shared block.

    Runs ``self.c_emb`` and ``self.q_emb`` through ``residual_block`` inside
    the "Embedding_Encoder_Layer" variable scope and stores the results on
    ``self.c_embed_encoding`` and ``self.q_embed_encoding``.
    """
    # _params() yields six values; only the 4th and 6th are needed here.
    _, _, _, d, _, nh = self._params()

    # Settings common to both encoder calls.
    shared = dict(
        num_blocks=1,
        num_conv_layers=4,
        kernel_size=5,
        num_filters=d,
        num_heads=nh,
        scope="Encoder_Residual_Block",
        bias=False,
        dropout=self.dropout,
    )

    with tf.variable_scope("Embedding_Encoder_Layer"):
        self.c_embed_encoding = residual_block(
            self.c_emb, mask=self.c_mask, seq_len=self.c_len, **shared)
        # reuse=True: share the weights between passage and question.
        self.q_embed_encoding = residual_block(
            self.q_emb, mask=self.q_mask, seq_len=self.q_len, reuse=True,
            **shared)
def _encode(self):
    """Encode the context and question embeddings with one shared block.

    Runs ``self.c_emb`` and ``self.q_emb`` through ``residual_block`` inside
    the "Embedding_Encoder_Layer" variable scope and stores the results on
    ``self.c_embed_encoding`` and ``self.q_embed_encoding``.
    """
    # _params() yields seven values; only d and nh feed the encoder calls.
    _, _, _, _, d, dc, nh = self._params()
    if self.config.fix_pretrained_vector:
        # NOTE(review): dc is never read again in this method, so this
        # override has no visible effect here.
        dc = self.char_mat.get_shape()[-1]

    # Settings common to both encoder calls.
    shared = dict(
        num_blocks=1,
        num_conv_layers=2,
        kernel_size=7,
        num_filters=d,
        num_heads=nh,
        scope="Encoder_Residual_Block",
        bias=False,
        dropout=self.dropout,
    )

    with tf.variable_scope("Embedding_Encoder_Layer"):
        self.c_embed_encoding = residual_block(
            self.c_emb, mask=self.c_mask, seq_len=self.c_len, **shared)
        # reuse=True: share the weights between passage and question.
        self.q_embed_encoding = residual_block(
            self.q_emb, mask=self.q_mask, seq_len=self.q_len, reuse=True,
            **shared)
def style_transformer_network(inputs, style_params):
    """Chain the encode, residual and upsample stages of the transformer.

    Args:
        inputs: tensor fed to the first ``layers.conv2d`` call.
        style_params: passed unchanged to every ``layers`` call.

    Returns:
        The output of the final ``layers.upsampling`` call, which is given
        ``tf.nn.sigmoid`` as its last argument.
    """
    # The tuple field names below are a reading of the positional arguments;
    # confirm against layers.conv2d / layers.upsampling.
    encode_specs = (
        (32, 9, 1, "conv1"),
        (64, 3, 2, "conv2"),
        (128, 3, 2, "conv3"),
    )
    residual_names = ("residual1", "residual2", "residual3")

    with tf.variable_scope('style_transformer'):
        net = inputs
        with tf.variable_scope('encode') as scope:
            for n_out, ksize, step, name in encode_specs:
                net = layers.conv2d(net, n_out, ksize, step, name,
                                    scope.name, style_params)

        with tf.variable_scope('residual') as scope:
            for name in residual_names:
                net = layers.residual_block(net, 3, name, scope.name,
                                            style_params)

        with tf.variable_scope('upsample') as scope:
            net = layers.upsampling(net, 3, 2, 64, 'upsample1', scope.name,
                                    style_params)
            net = layers.upsampling(net, 3, 2, 32, 'upsample2', scope.name,
                                    style_params)
            return layers.upsampling(net, 9, 2, 3, 'upsample3', scope.name,
                                     style_params, tf.nn.sigmoid)
def discriminator(sequence, training=tf.constant(True)):
    """Score ``sequence`` with a conv stem, three residual blocks and a dense head.

    Args:
        sequence: input tensor; its leading dimension is read with
            ``tf.shape`` and used as the row count for the final reshape.
        training: accepted but not used anywhere in this function.

    Returns:
        The output of the final ``lyr.dense`` call (output size 1).
    """
    rows = tf.shape(sequence)[0]

    net = lyr.conv('discriminator.conv1.filter', 'discriminator.conv1.bias',
                   'discriminator', (5, encode_length, 64), sequence, max_size)
    net = tf.nn.leaky_relu(net)

    # The variable names jump from res1 to res4 and res5; kept as-is.
    for block in ('res1', 'res4', 'res5'):
        prefix = 'discriminator.' + block
        # NOTE(review): the fourth name repeats '.bias1' rather than using
        # '.bias2' -- confirm that this is intended.
        net = lyr.residual_block(prefix + '.filter1', prefix + '.bias1',
                                 prefix + '.filter2', prefix + '.bias1',
                                 'discriminator', 64, 64, net, max_size)
        net = lyr.layernorm(net, rows)

    flat = tf.reshape(net, (rows, max_size * 64))
    return lyr.dense('discriminator.dense1.matrix', 'discriminator.dense1.bias',
                     'discriminator', max_size * 64, 1, flat)
def discriminator(sequence):
    """Score ``sequence`` with a conv stem, five residual blocks and a dense head.

    Relies on the module-level ``batch_size``, ``max_size`` and
    ``encode_length`` values.

    Args:
        sequence: input tensor passed to the first ``lyr.conv`` call.

    Returns:
        The output of the final ``lyr.dense`` call (output size 1).
    """
    net = lyr.conv('discriminator.conv1.filter', 'discriminator.conv1.bias',
                   'discriminator', (5, encode_length, 64), sequence, max_size)
    net = tf.nn.leaky_relu(net)

    for idx in range(1, 6):
        prefix = 'discriminator.res%d' % idx
        # NOTE(review): the fourth name repeats '.bias1' rather than using
        # '.bias2' -- confirm that this is intended.
        net = lyr.residual_block(prefix + '.filter1', prefix + '.bias1',
                                 prefix + '.filter2', prefix + '.bias1',
                                 'discriminator', 64, 64, net, max_size)
        net = lyr.layernorm(net, batch_size)

    flat = tf.reshape(net, (batch_size, max_size * 64))
    return lyr.dense('discriminator.dense1.matrix', 'discriminator.dense1.bias',
                     'discriminator', max_size * 64, 1, flat)
def _fuse(self):
    """Build the context-query attention and the stacked model encoder.

    Sets ``self.c2q``, ``self.q2c``, ``self.attention_outputs`` and
    ``self.enc``. ``self.enc`` ends up holding four tensors: the projected
    attention input followed by three ``residual_block`` outputs, each
    reshaped to ``[N, -1, last_dim]`` where ``N`` is the first value returned
    by ``self._params()``.
    """
    ctx = self.c_embed_encoding
    qry = self.q_embed_encoding

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        tiled_ctx = tf.tile(tf.expand_dims(ctx, 2),
                            [1, 1, self.max_q_len, 1])
        tiled_qry = tf.tile(tf.expand_dims(qry, 1),
                            [1, self.max_p_len, 1, 1])
        similarity = trilinear([tiled_ctx, tiled_qry, tiled_ctx * tiled_qry],
                               input_keep_prob=1.0 - self.dropout)

        # Softmax over the last axis, masked with the question mask.
        q_mask = tf.expand_dims(self.q_mask, 1)
        row_softmax = tf.nn.softmax(mask_logits(similarity, mask=q_mask))

        # Softmax over axis 1, masked with the context mask, then transposed.
        c_mask = tf.expand_dims(self.c_mask, 2)
        col_softmax = tf.nn.softmax(mask_logits(similarity, mask=c_mask),
                                    dim=1)
        col_softmax_t = tf.transpose(col_softmax, (0, 2, 1))

        self.c2q = tf.matmul(row_softmax, qry)
        self.q2c = tf.matmul(tf.matmul(row_softmax, col_softmax_t), ctx)
        self.attention_outputs = [
            ctx,
            self.c2q,
            ctx * self.c2q,
            ctx * self.q2c,
        ]

    # Per the inline note kept from the original, _params() returns, in order:
    #   self.config.batch_size if not self.demo else 1,
    #   self.max_p_len,
    #   self.max_q_len,
    #   self.config.max_ch_len,
    #   self.config.hidden_size,
    #   self.config.char_embed_size,
    #   self.config.head_size
    N, _, _, _, d, dc, nh = self._params()
    if self.config.fix_pretrained_vector:
        # NOTE(review): dc is never read again in this method.
        dc = self.char_mat.get_shape()[-1]

    with tf.variable_scope("Model_Encoder_Layer"):
        fused = tf.concat(self.attention_outputs, axis=-1)
        self.enc = [conv(fused, d, name="input_projection")]
        for step in range(3):
            if step % 2 == 0:
                # Dropout is applied in place on the inputs of steps 0 and 2.
                self.enc[step] = tf.nn.dropout(self.enc[step],
                                               1.0 - self.dropout)
            # The first pass creates the block's variables; later passes
            # reuse them.
            reuse_flag = True if step > 0 else None
            self.enc.append(
                residual_block(self.enc[step],
                               num_blocks=1,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=True,
                               reuse=reuse_flag,
                               dropout=self.dropout))

        for pos, layer in enumerate(self.enc):
            self.enc[pos] = tf.reshape(layer, [N, -1, layer.get_shape()[-1]])
def predictor(sequence):
    """Map ``sequence`` to ``num_classes`` outputs via conv, one residual block and dense.

    Args:
        sequence: input tensor; its leading dimension is read with
            ``tf.shape`` and used as the row count for the final reshape.

    Returns:
        The output of the final ``lyr.dense`` call.
    """
    rows = tf.shape(sequence)[0]

    features = lyr.conv('predictor.conv1.filter', 'predictor.conv1.bias',
                        'predictor', (5, encode_length, 16), sequence,
                        max_size)
    features = tf.nn.leaky_relu(features)

    # NOTE(review): the fourth name repeats '.bias1' rather than using
    # '.bias2' -- confirm that this is intended.
    features = lyr.residual_block('predictor.res1.filter1',
                                  'predictor.res1.bias1',
                                  'predictor.res1.filter2',
                                  'predictor.res1.bias1',
                                  'predictor', 16, 16, features, max_size,
                                  channels=16)

    flat = tf.reshape(features, (rows, max_size * 16))
    return lyr.dense('predictor.dense1.matrix', 'predictor.dense1.bias',
                     'predictor', max_size * 16, num_classes, flat)
def generator(seed, training=True):
    """Turn ``seed`` into a softmax output via dense, five residual blocks and conv.

    Relies on the module-level ``batch_size``, ``max_size`` and
    ``encode_length`` values.

    Args:
        seed: tensor reshaped to ``(batch_size, 100)`` before use.
        training: accepted but not used anywhere in this function.

    Returns:
        ``tf.nn.softmax`` of the final ``lyr.conv`` output.
    """
    flat_seed = tf.reshape(seed, (batch_size, 100))
    hidden = lyr.dense('generator.dense1.matrix', 'generator.dense1.bias',
                       'generator', 100, max_size * 64, flat_seed)
    hidden = tf.nn.leaky_relu(hidden)
    net = tf.reshape(hidden, [batch_size, max_size, 64])

    for idx in range(1, 6):
        prefix = 'generator.res%d' % idx
        net = lyr.residual_block(prefix + '.filter1', prefix + '.bias1',
                                 prefix + '.filter2', prefix + '.bias2',
                                 'generator', 64, 64, net, max_size)

    logits = lyr.conv('generator.conv1.filter', 'generator.conv1.bias',
                      'generator', (5, 64, encode_length), net, max_size)
    return tf.nn.softmax(logits)
def predictor_stem(sequence):
    """Map ``sequence`` to ``num_classes`` outputs via conv, five residual blocks and dense.

    Relies on the module-level ``batch_size``, ``stem_size``,
    ``encode_length`` and ``num_classes`` values.

    Args:
        sequence: input tensor passed to the first ``lyr.conv`` call.

    Returns:
        The output of the final ``lyr.dense`` call.
    """
    net = lyr.conv('predictor_stem.conv1.filter', 'predictor_stem.conv1.bias',
                   'predictor_stem', (5, encode_length, 64), sequence,
                   stem_size)
    net = tf.nn.leaky_relu(net)

    for idx in range(1, 6):
        prefix = 'predictor_stem.res%d' % idx
        # NOTE(review): the fourth name repeats '.bias1' rather than using
        # '.bias2' -- confirm that this is intended.
        net = lyr.residual_block(prefix + '.filter1', prefix + '.bias1',
                                 prefix + '.filter2', prefix + '.bias1',
                                 'predictor_stem', 64, 64, net, stem_size)

    flat = tf.reshape(net, (batch_size, stem_size * 64))
    return lyr.dense('predictor_stem.dense1.matrix',
                     'predictor_stem.dense1.bias', 'predictor_stem',
                     stem_size * 64, num_classes, flat)
def generator(seed, training=tf.constant(True)):
    """Turn ``seed`` into a softmax output with batch-normalised residual blocks.

    Args:
        seed: tensor reshaped to ``(rows, 100)``, where ``rows`` is its
            leading dimension as read by ``tf.shape``.
        training: forwarded to every ``lyr.batchnorm`` call.

    Returns:
        ``tf.nn.softmax`` of the final ``lyr.conv`` output.
    """

    def _batchnorm(tensor, index, shape):
        # Every batchnorm layer passes the same 'generator.num_means' name.
        prefix = 'generator.batchnorm%d' % index
        return lyr.batchnorm(tensor,
                             prefix + '.offset',
                             prefix + '.scale',
                             prefix + '.average_means',
                             prefix + '.average_variances',
                             'generator.num_means',
                             'generator',
                             shape,
                             training=training)

    rows = tf.shape(seed)[0]
    flat_seed = tf.reshape(seed, (rows, 100))

    hidden = lyr.dense('generator.dense1.matrix', 'generator.dense1.bias',
                       'generator', 100, max_size * 64, flat_seed)
    hidden = tf.nn.leaky_relu(hidden)
    hidden = _batchnorm(hidden, 1, (max_size * 64,))
    net = tf.reshape(hidden, [rows, max_size, 64])

    # Residual block k is followed by batchnorm layer k + 1.
    for idx in range(1, 4):
        prefix = 'generator.res%d' % idx
        net = lyr.residual_block(prefix + '.filter1', prefix + '.bias1',
                                 prefix + '.filter2', prefix + '.bias2',
                                 'generator', 64, 64, net, max_size)
        net = _batchnorm(net, idx + 1, (max_size, 64))

    logits = lyr.conv('generator.conv1.filter', 'generator.conv1.bias',
                      'generator', (5, 64, encode_length), net, max_size)
    return tf.nn.softmax(logits)
def encoder(sequence, training=True):
    """Encode ``sequence`` into a ``2 * latent_dim``-wide output.

    Pipeline: conv -> leaky ReLU -> batchnorm -> residual block -> batchnorm
    -> flatten -> dense -> leaky ReLU -> batchnorm. Relies on the
    module-level ``args.channels``, ``max_size``, ``encode_length`` and
    ``latent_dim`` values.

    Args:
        sequence: input tensor; its leading dimension is read with
            ``tf.shape`` and used as the row count for the flatten step.
        training: forwarded to every ``lyr.batchnorm`` call.

    Returns:
        The output of the final ``lyr.batchnorm`` call.
    """
    num = tf.shape(sequence)[0]
    channels = args.channels

    x = lyr.conv('encoder.conv1.filter', 'encoder.conv1.bias', 'encoder',
                 (5, encode_length, channels), sequence, max_size)
    x = tf.nn.leaky_relu(x)
    x = lyr.batchnorm(x,
                      'encoder.batchnorm1.offset',
                      'encoder.batchnorm1.scale',
                      'encoder.batchnorm1.average_means',
                      'encoder.batchnorm1.average_variances',
                      'encoder.num_means',
                      'encoder',
                      (max_size, channels),
                      training=training)

    # NOTE(review): the fourth name repeats '.bias1' rather than using
    # '.bias2' -- confirm that this is intended. Left unchanged so existing
    # variable names are preserved.
    x = lyr.residual_block('encoder.res1.filter1', 'encoder.res1.bias1',
                           'encoder.res1.filter2', 'encoder.res1.bias1',
                           'encoder', channels, channels, x, max_size,
                           channels=channels)
    x = lyr.batchnorm(x,
                      'encoder.batchnorm2.offset',
                      'encoder.batchnorm2.scale',
                      'encoder.batchnorm2.average_means',
                      'encoder.batchnorm2.average_variances',
                      'encoder.num_means',
                      'encoder',
                      (max_size, channels),
                      training=training)

    x = tf.reshape(x, (num, max_size * channels))
    x = lyr.dense('encoder.dense1.matrix', 'encoder.dense1.bias', 'encoder',
                  max_size * channels, 2 * latent_dim, x)
    x = tf.nn.leaky_relu(x)

    # The shape must be a 1-tuple: "(2 * latent_dim)" without the trailing
    # comma is just a parenthesised int, unlike the tuple shapes passed to
    # the other batchnorm calls.
    output = lyr.batchnorm(x,
                           'encoder.batchnorm3.offset',
                           'encoder.batchnorm3.scale',
                           'encoder.batchnorm3.average_means',
                           'encoder.batchnorm3.average_variances',
                           'encoder.num_means',
                           'encoder',
                           (2 * latent_dim,),
                           training=training)
    return output
def _fuse(self):
    """Build the context-query attention and the stacked model encoder.

    Sets ``self.c2q``, ``self.q2c``, ``self.attention_outputs`` and
    ``self.enc``. ``self.enc`` ends up holding four tensors: the projected
    attention input followed by three ``residual_block`` outputs.
    """
    ctx = self.c_embed_encoding
    qry = self.q_embed_encoding

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        tiled_ctx = tf.tile(tf.expand_dims(ctx, 2),
                            [1, 1, self.max_q_len, 1])
        tiled_qry = tf.tile(tf.expand_dims(qry, 1),
                            [1, self.max_p_len, 1, 1])
        similarity = trilinear([tiled_ctx, tiled_qry, tiled_ctx * tiled_qry],
                               input_keep_prob=1.0 - self.dropout)

        # Softmax over the last axis, masked with the question mask.
        q_mask = tf.expand_dims(self.q_mask, 1)
        row_softmax = tf.nn.softmax(mask_logits(similarity, mask=q_mask))

        # Softmax over axis 1, masked with the context mask, then transposed.
        c_mask = tf.expand_dims(self.c_mask, 2)
        col_softmax = tf.nn.softmax(mask_logits(similarity, mask=c_mask),
                                    dim=1)
        col_softmax_t = tf.transpose(col_softmax, (0, 2, 1))

        self.c2q = tf.matmul(row_softmax, qry)
        self.q2c = tf.matmul(tf.matmul(row_softmax, col_softmax_t), ctx)
        self.attention_outputs = [
            ctx,
            self.c2q,
            ctx * self.c2q,
            ctx * self.q2c,
        ]

    # _params() yields six values; only the 4th and 6th are needed here.
    _, _, _, d, _, nh = self._params()

    with tf.variable_scope("Model_Encoder_Layer"):
        fused = tf.concat(self.attention_outputs, axis=-1)
        self.enc = [conv(fused, d, name="input_projection")]
        for step in range(3):
            if step % 2 == 0:
                # Dropout is applied in place on the inputs of steps 0 and 2.
                self.enc[step] = tf.nn.dropout(self.enc[step],
                                               1.0 - self.dropout)
            # The first pass creates the block's variables; later passes
            # reuse them.
            reuse_flag = True if step > 0 else None
            self.enc.append(
                residual_block(self.enc[step],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=3,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=reuse_flag,
                               dropout=self.dropout))
def forward(self):
    """Build the full graph: embeddings, encoders, decoder, pointers, loss.

    Stages, each under its own variable scope: "Input_Embedding_Layer",
    "Embedding_Encoder_Layer", "Context_to_Query_Attention_Layer",
    "Model_Encoder_Layer", "Decoder_Layer" and "Output_Layer".

    Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.gen_loss``,
    ``self.symbols``, ``self.logits``, ``self.yp1``, ``self.yp2`` and
    ``self.loss``. When ``config.decay`` is not None it also sets
    ``self.var_ema`` and ``self.assign_vars``.
    """
    config = self.config
    N, PL, QL, CL, d, dc, nh, dw = (
        config.test_batch_size if self.loop_function else config.batch_size,
        self.c_maxlen,
        self.q_maxlen,
        config.char_limit,
        config.hidden,
        config.char_dim,
        config.num_heads,
        config.glove_dim,
    )

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        # The question reshape also takes its last dimension from ch_emb.
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=2,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=2,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # optimized_trilinear_for_attention stands in for the earlier
        # tile + trilinear formulation:
        # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
        # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
        # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=2,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Decoder_Layer"):
        memory = tf.concat([self.enc[1], self.enc[2], self.enc[3]], axis=-1)
        # One slice of self.a per decoding step.
        oups = tf.split(self.a, [1] * self.a_maxlen, 1)

        # Initial cell state comes from the mean of memory over axis 1.
        # Note that `c` is rebound here; the context encoding is not needed
        # past this point.
        h = tf.tanh(
            _linear(tf.reduce_mean(memory, axis=1), output_size=d,
                    bias=False, scope="h_initial"))
        c = tf.tanh(
            _linear(tf.reduce_mean(memory, axis=1), output_size=d,
                    bias=False, scope="c_initial"))
        state = (c, h)

        outputs = []
        prev = None
        prev_probs = [0.0]
        symbols = []
        for i, inp in enumerate(oups):
            einp = tf.reshape(tf.nn.embedding_lookup(self.word_mat, inp),
                              [N, dw])
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            if self.loop_function is not None and prev is not None:
                # With a loop function, the step input comes from the
                # previous output, and everything accumulated so far is
                # re-gathered with the indices the loop function returns.
                with tf.variable_scope("loop_function", reuse=True):
                    einp, prev_probs, index, prev_symbol = self.loop_function(
                        prev, prev_probs, self.beam_size, i)
                    h = tf.gather(h, index)  # update prev state
                    state = tuple(tf.gather(s, index)
                                  for s in state)  # update prev state
                    for j, symbol in enumerate(symbols):
                        symbols[j] = tf.gather(
                            symbol, index)  # update prev symbols
                    for j, output in enumerate(outputs):
                        outputs[j] = tf.gather(
                            output, index)  # update prev outputs
                    symbols.append(prev_symbol)

            attn = tf.reshape(
                multihead_attention(tf.expand_dims(h, 1),
                                    units=d,
                                    num_heads=nh,
                                    memory=memory,
                                    mask=self.c_mask,
                                    bias=False),
                [-1, nh * d])
            cinp = tf.concat([einp, attn], 1)
            h, state = self.cell(cinp, state)

            with tf.variable_scope("AttnOutputProjection"):
                output = _linear([h] + [cinp], output_size=dw * 2,
                                 bias=False, scope="output")
                output = tf.reshape(output, [-1, dw, 2])
                output = tf.reduce_max(output, 2)  # maxout
                outputs.append(output)

            if self.loop_function is not None:
                prev = output

        if self.loop_function is not None:
            # process the last symbol
            einp, prev_probs, index, prev_symbol = self.loop_function(
                prev, prev_probs, self.beam_size, i + 1)
            for j, symbol in enumerate(symbols):
                symbols[j] = tf.gather(symbol, index)  # update prev symbols
            for j, output in enumerate(outputs):
                outputs[j] = tf.gather(output, index)  # update prev outputs
            symbols.append(prev_symbol)

            # output the final best result of beam search
            for k, symbol in enumerate(symbols):
                symbols[k] = tf.gather(symbol, 0)
            for k, output in enumerate(outputs):
                outputs[k] = tf.expand_dims(tf.gather(output, 0), 0)

        self.gen_loss = self._compute_loss(outputs, oups, N)
        self.symbols = symbols

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask),
        ]

        logits1, logits2 = self.logits

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        # Keep only the band from the diagonal up to ans_limit above it.
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                         labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                          labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    # NOTE(review): the pointer loss assigned just above is immediately
    # replaced by the generation loss -- confirm that this is intended.
    self.loss = self.gen_loss

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() returns None for variables without a shadow
                # copy. Compare against None explicitly instead of relying
                # on the truth value of a variable object.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def forward(self):
    """Build the full graph: embeddings, encoders, pointers, loss.

    Stages, each under its own variable scope: "Input_Embedding_Layer",
    "Embedding_Encoder_Layer", "Context_to_Query_Attention_Layer",
    "Model_Encoder_Layer" and "Output_Layer".

    Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
    ``self.yp1``, ``self.yp2`` and ``self.loss``. When ``config.decay`` is
    not None it also sets ``self.var_ema`` and ``self.assign_vars``.
    """
    config = self.config
    N, PL, QL, CL, d, dc, nh = (
        config.batch_size if not self.demo else 1,
        self.c_maxlen,
        self.q_maxlen,
        config.char_limit,
        config.hidden,
        config.char_dim,
        config.num_heads,
    )

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        # The question reshape also takes its last dimension from ch_emb.
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,  # Share the weights between passage and question
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # optimized_trilinear_for_attention stands in for the earlier
        # tile + trilinear formulation:
        # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
        # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
        # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask),
        ]

        logits1, logits2 = self.logits

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        # Keep only the band from the diagonal up to ans_limit above it.
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() returns None for variables without a shadow
                # copy. Compare against None explicitly instead of relying
                # on the truth value of a variable object.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def pred(self):
    """Build the prediction graph from the iterator's next batch.

    Pulls questions, contexts and answers from ``self.iterator``, embeds them
    with ``self.embedding``, encodes both with a shared ``residual_block``,
    applies context-query attention and three model-encoder passes, then
    produces start and end pointers. Tensor shapes are printed along the way.

    Sets ``self.questions``, ``self.contexts``, ``self.answers``,
    ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.pred_start``,
    ``self.pred_end`` and ``self.preds``.
    """
    hidden = self.lstm_hidden_size

    with tf.variable_scope("embedding_layer"):
        ((self.questions, question_lengths),
         (self.contexts, context_lengths),
         self.answers) = self.iterator.get_next()

        # Sequence lengths are bounded by the longest item in the batch.
        # Alternative (disabled): self.train_max_context_length /
        # self.train_max_question_length.
        max_context_length = tf.reduce_max(context_lengths)
        max_question_length = tf.reduce_max(question_lengths)

        context_mask = tf.sequence_mask(context_lengths,
                                        maxlen=max_context_length)
        question_mask = tf.sequence_mask(question_lengths,
                                         maxlen=max_question_length)

        question_embeddings = tf.nn.embedding_lookup(self.embedding,
                                                     self.questions)
        context_embeddings = tf.nn.embedding_lookup(self.embedding,
                                                    self.contexts)
        print('question_embeddings', question_embeddings.get_shape().as_list())
        print('context_embeddings', context_embeddings.get_shape().as_list())

    # Same scope name as above, entered a second time for the encoders.
    with tf.variable_scope("embedding_layer"):
        c = residual_block(context_embeddings,
                           num_blocks=1,
                           num_conv_layers=1,
                           kernel_size=7,
                           mask=context_mask,
                           num_filters=hidden,
                           num_heads=1,
                           seq_len=max_context_length,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=1.0 - self.keep_prob)
        print('c', c.get_shape().as_list())
        # reuse=True: share the weights between passage and question.
        q = residual_block(question_embeddings,
                           num_blocks=1,
                           num_conv_layers=1,
                           kernel_size=7,
                           mask=question_mask,
                           num_filters=hidden,
                           num_heads=1,
                           seq_len=max_question_length,
                           scope="Encoder_Residual_Block",
                           reuse=True,
                           bias=False,
                           dropout=1.0 - self.keep_prob)
        print('q', q.get_shape().as_list())

    with tf.variable_scope("attention_layer"):
        similarity = optimized_trilinear_for_attention(
            [c, q], max_context_length, max_question_length,
            input_keep_prob=self.keep_prob)

        # Softmax over the last axis, masked with the question mask.
        q_mask = tf.expand_dims(question_mask, 1)
        row_softmax = tf.nn.softmax(mask_logits(similarity, mask=q_mask))

        # Softmax over axis 1, masked with the context mask, then transposed.
        c_mask = tf.expand_dims(context_mask, 2)
        col_softmax = tf.nn.softmax(mask_logits(similarity, mask=c_mask),
                                    dim=1)
        col_softmax_t = tf.transpose(col_softmax, (0, 2, 1))

        self.c2q = tf.matmul(row_softmax, q)
        self.q2c = tf.matmul(tf.matmul(row_softmax, col_softmax_t), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("modeling_layer"):
        attention = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(attention, hidden, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:
                # Dropout is applied in place on the inputs of steps 0 and 2.
                self.enc[i] = tf.nn.dropout(self.enc[i], self.keep_prob)
            # The first pass creates the block's variables; later passes
            # reuse them.
            reuse_flag = True if i > 0 else None
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=1,
                               num_conv_layers=1,
                               kernel_size=5,
                               mask=context_mask,
                               num_filters=hidden,
                               num_heads=1,
                               seq_len=max_context_length,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=reuse_flag,
                               dropout=1.0 - self.keep_prob))
            print('self.enc[i]', self.enc[i].get_shape().as_list())

    with tf.variable_scope("output_layer_start"):
        start_input = tf.concat([self.enc[1], self.enc[2]], axis=-1)
        pred_start = tf.squeeze(
            conv(start_input, 1, bias=False, name="start_pointer"), -1)
        print('pred_start', pred_start.get_shape().as_list())
        self.pred_start = preprocess_softmax(pred_start, context_mask)
        print('self.pred_start', self.pred_start.get_shape().as_list())

    with tf.variable_scope("output_layer_end"):
        end_input = tf.concat([self.enc[1], self.enc[3]], axis=-1)
        pred_end = tf.squeeze(
            conv(end_input, 1, bias=False, name="end_pointer"), -1)
        print('pred_end', pred_end.get_shape().as_list())
        self.pred_end = preprocess_softmax(pred_end, context_mask)
        print('self.pred_end', self.pred_end.get_shape().as_list())

    # Stack the two argmax vectors and transpose the result.
    best_start = tf.argmax(self.pred_start, axis=1)
    best_end = tf.argmax(self.pred_end, axis=1)
    self.preds = tf.transpose([best_start, best_end])
def forward(self):
    """Build the model graph: encoders followed by ``drnn`` passes.

    Reads the placeholders/tensors already stored on ``self`` (``ch``,
    ``qh``, ``c``, ``q``, masks, lengths, ``y1``, ``y2``) and sets
    ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
    ``self.yp1``, ``self.yp2`` and ``self.loss``.  When ``config.decay`` is
    set it also creates ``self.var_ema`` and ``self.assign_vars``.

    Diagnostic output is written with single-string ``print(...)`` calls so
    the text is the same under Python 2 and Python 3.
    """
    config = self.config
    N = config.batch_size if not self.demo else 1
    PL, QL = self.c_maxlen, self.q_maxlen
    CL, d, dc, nh = (config.char_limit, config.hidden, config.char_dim,
                     config.num_heads)

    # NOTE(review): this one cell object is passed to every drnn() call
    # below, in several variable scopes -- confirm drnn() supports that.
    d_cell = tf.contrib.rnn.BasicLSTMCell(d,
                                          forget_bias=1.0,
                                          state_is_tuple=True)

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Max over the character axis leaves one vector per word.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        print("ch_emb before {}".format(ch_emb.shape[-1]))
        print("qh_emb before {}".format(qh_emb.shape[-1]))

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        # Use the question tensor's own feature size (was ch_emb's).
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
        print("N {} PL {} QL {}".format(N, PL, QL))
        print("ch_emb {}".format(ch_emb.shape))
        print("qh_emb {}".format(qh_emb.shape))

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)
        print("c_emb high {}".format(c_emb.shape))
        print("q_emb high {}".format(q_emb.shape))

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c_tmp = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
        c = drnn(d_cell, c_tmp, d)
        q_tmp = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)
        q = drnn(d_cell, q_tmp, d)
        print("embd enc output c {}".format(c.shape))
        print("embd enc output q {}".format(q.shape))

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        print("enc len {}".format(len(self.enc)))
        print("qh shape {}".format(self.qh.shape))
        print("qh type {}".format(self.qh.dtype))
        print("ip shape {}".format(inputs.shape))
        print("ip type {}".format(inputs.dtype))
        # Diagnostic only: per-position count of non-zero features.
        ip_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(inputs, tf.bool), tf.float32),
                          axis=2), [-1])
        print("ip_len {}".format(ip_len.shape))

        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            encoded = residual_block(self.enc[i],
                                     num_blocks=7,
                                     num_conv_layers=2,
                                     kernel_size=5,
                                     mask=self.c_mask,
                                     num_filters=d,
                                     num_heads=nh,
                                     seq_len=self.c_len,
                                     scope="Model_Encoder",
                                     bias=False,
                                     reuse=True if i > 0 else None,
                                     dropout=self.dropout)
            self.enc.append(drnn(d_cell, encoded, d))
        print("chalala")

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]
        logits1, logits2 = self.logits

        # Outer product of start/end distributions; the band keeps only
        # spans with start <= end <= start + ans_limit.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                         labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                          labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        # NOTE(review): `regularizer` is not defined in this method;
        # presumably a module-level object -- verify.
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # NOTE(review): nesting reconstructed from flattened source; the
        # assign ops are assumed to sit under the EMA control dependency.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() yields None for untracked variables; compare
                # explicitly instead of truth-testing a TF variable.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def build_model(self):
    """Build the graph with an extra "no answer" logit at position 0.

    Reads the input tensors stored on ``self`` (``contc_input``,
    ``quesc_input``, ``contw_input``, ``quesw_input``, masks, lengths,
    optional ELMo tensors, ``y_start``, ``y_end``) and sets
    ``self.logits1``, ``self.logits2``, ``self.loss``, ``self.output1`` and
    ``self.output2``.  When ``self.decay`` is set it also creates
    ``self.var_ema`` and ``self.assign_vars``.

    The no-answer bias is tiled to the runtime batch size, so the graph
    also accepts batches whose size differs from ``self.batch_size``.
    """
    PL, QL = self.c_maxlen, self.q_maxlen
    CL, d, dc, nh = (self.char_limit, self.filters, self.char_dim,
                     self.num_heads)

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.contc_input),
            [-1, CL, dc])
        qh_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.quesc_input),
            [-1, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Max over the character axis leaves one vector per word.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        ch_emb = tf.reshape(ch_emb, [-1, PL, ch_emb.shape[-1]])
        # Use the question tensor's own feature size (was ch_emb's).
        qh_emb = tf.reshape(qh_emb, [-1, QL, qh_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.contw_input),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.quesw_input),
            1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        if self.use_elmo:
            c_emb = tf.concat([c_emb, self.cont_elmo], axis=-1)
            q_emb = tf.concat([q_emb, self.ques_elmo], axis=-1)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.cont_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # reuse=True shares the encoder weights with the context encoder.
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.ques_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        c2q = tf.matmul(S_, q)
        q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, c2q, c * c2q, c * q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        attention_inputs = tf.concat(attention_outputs, axis=-1)
        enc = [conv(attention_inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                enc[i] = tf.nn.dropout(enc[i], 1.0 - self.dropout)
            enc.append(
                residual_block(enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.cont_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.concat([enc[1], enc[2]], axis=-1)
        end_logits = tf.concat([enc[1], enc[3]], axis=-1)
        if self.use_elmo:
            start_logits = tf.concat((start_logits, self.cont_elmo), axis=-1)
            end_logits = tf.concat((end_logits, self.cont_elmo), axis=-1)

        start_logits = tf.squeeze(
            conv(start_logits, 1, bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(end_logits, 1, bias=False, name="end_pointer"), -1)

        # One learned scalar that becomes the logit of the "no answer"
        # slot prepended at index 0 of both distributions.
        unanswer_bias = tf.get_variable(
            "unanswer_bias", [1],
            regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            initializer=tf.zeros_initializer())
        # Tile to the batch actually fed instead of the configured size so
        # partial batches do not fail in the concat below.
        runtime_batch = tf.shape(start_logits)[0]
        unanswer_bias = tf.reshape(
            tf.tile(unanswer_bias, [runtime_batch]), [-1, 1])

        self.logits1 = tf.concat(
            (unanswer_bias, mask_logits(start_logits, mask=self.c_mask)),
            axis=-1)
        self.logits2 = tf.concat(
            (unanswer_bias, mask_logits(end_logits, mask=self.c_mask)),
            axis=-1)

        start_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits1, labels=self.y_start)
        end_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits2, labels=self.y_end)
        self.loss = tf.reduce_mean(start_loss + end_loss)

    if self.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        # NOTE(review): `regularizer` is not defined in this method;
        # presumably a module-level object -- verify.
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    # output
    outer = tf.matmul(
        tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
        tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
    outer = tf.matrix_band_part(outer, 0, self.ans_limit)
    # Subtract 1 to undo the prepended no-answer slot; -1 therefore means
    # the no-answer slot itself was selected.
    self.output1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) - 1
    self.output2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) - 1

    if self.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # NOTE(review): nesting reconstructed from flattened source; the
        # assign ops are assumed to sit under the EMA control dependency.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def forward(self):
    """Build the candidate-scoring graph with ELMo features.

    Embeds context (``c``), question (``q``) and candidates (``x``) from
    word, character and ELMo representations, encodes context and question,
    applies context-query attention, attends candidates over the projected
    context and scores each candidate.  Sets ``self.c2q``, ``self.q2c``,
    ``self.x2c``, ``self.cand_condense``, ``self.cand_logits`` and
    ``self.loss`` and appends diagnostic tensors to ``self.debug_ops``.
    When ``config.decay`` is set it also creates ``self.var_ema`` and
    ``self.assign_vars``.

    Raises:
        NotImplementedError: if ``config.cand_fuse_vector`` is set while
            ``config.cand_condense_vector`` is enabled.
    """
    config = self.config
    N = config.batch_size if not self.demo else 1
    PL = self.c_maxlen
    QL = self.q_maxlen
    XL = self.x_maxlen
    # DEBUG
    self.debug_ops.extend([PL, QL, XL])
    # Trailing values are the original author's notes on typical settings.
    CL = config.char_limit  # 16
    d = config.hidden  # 96
    dc = config.char_dim  # 64
    nh = config.num_heads  # 1

    with tf.variable_scope("Input_Embedding_Layer"):
        # self.ch : (N, c_maxlen, 16)
        # self.qh : (N, q_maxlen, 16)
        # self.xh : (N, x_maxlen, 16)

        ######################################
        # get elmo embeddings
        ######################################
        datadir = "/data/elmo_experiment_20180906/20180906_model"
        vocab_file = os.path.join(datadir, 'vocab-2016-09-10.txt')
        options_file = os.path.join(datadir, 'options.json')
        weight_file = os.path.join(datadir, 'weights.hdf5')
        print(vocab_file)
        print(options_file)
        print(weight_file)

        # Create a Batcher to map text to character ids.
        # (Not referenced again in this method; kept as in the original.)
        batcher = Batcher(vocab_file, 50)

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(options_file, weight_file)

        # Get ops to compute the LM embeddings.
        print(self.c)
        print(self.c.shape)
        print(self.c_elmo)
        print(self.c_elmo.shape)
        print(self.q_elmo)
        print(self.q_elmo.shape)
        print(self.x_elmo)
        print(self.x_elmo.shape)
        context_embeddings_op = bilm(self.c_elmo)
        question_embeddings_op = bilm(self.q_elmo)
        candidate_embeddings_op = bilm(self.x_elmo)

        # Get ops to compute ELMo (weighted average of the internal biLM
        # layers).  Two weightings are built, 'input' and 'output'; the
        # context call creates the weights and the reuse=True scopes share
        # them with question and candidate.  Only the 'output' weighting
        # is consumed below; the 'input' calls are kept as in the original.
        elmo_context_input = weight_layers('input', context_embeddings_op,
                                           l2_coef=0.0)
        with tf.variable_scope('', reuse=True):
            elmo_question_input = weight_layers(
                'input', question_embeddings_op, l2_coef=0.0)
            elmo_candidate_input = weight_layers(
                'input', candidate_embeddings_op, l2_coef=0.0)

        elmo_context_output = weight_layers(
            'output', context_embeddings_op, l2_coef=0.0)
        with tf.variable_scope('', reuse=True):
            elmo_question_output = weight_layers(
                'output', question_embeddings_op, l2_coef=0.0)
            elmo_candidate_output = weight_layers(
                'output', candidate_embeddings_op, l2_coef=0.0)

        # Flatten to one row of characters per word.
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        xh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.xh),
                            [N * XL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
        xh_emb = tf.nn.dropout(xh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF style conv-highway encoder: conv over chars in each word
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)
        xh_emb = conv(xh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Max pooling over the character axis
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        xh_emb = tf.reduce_max(xh_emb, axis=1)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
        xh_emb = tf.reshape(xh_emb, [N, XL, xh_emb.shape[-1]])

        # self.c : (N, c_maxlen)
        # self.q : (N, q_maxlen)
        # self.x : (N, x_maxlen)
        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)
        x_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.x),
            1.0 - self.dropout)

        # word + char features
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        x_emb = tf.concat([x_emb, xh_emb], axis=2)
        print(c_emb)
        print(c_emb.shape)

        # ELMo 'output' weighting + word + char features
        c_emb = tf.concat([elmo_context_output['weighted_op'], c_emb],
                          axis=2)
        q_emb = tf.concat([elmo_question_output['weighted_op'], q_emb],
                          axis=2)
        x_emb = tf.concat([elmo_candidate_output['weighted_op'], x_emb],
                          axis=2)
        print(c_emb)
        print(c_emb.shape)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)
        x_emb = highway(x_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        # One encoder block, weights shared between passage and question.
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # Shapes as recorded by the original author:
        #   c: (N, c_maxlen, d)          q: (N, q_maxlen, d)
        #   S, S_: (N, c_maxlen, q_maxlen)
        #   S_T: (N, q_maxlen, c_maxlen)
        #   mask_q: (N, 1, q_maxlen)     mask_c: (N, c_maxlen, 1)
        #   self.c2q, self.q2c: (N, c_maxlen, d)
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Output_Layer"):
        # Shapes as recorded by the original author:
        #   x_emb: (N, x_maxlen, d)      inputs: (N, c_maxlen, 4*d)
        #   mask_x: (N, x_maxlen, 1)     c_proj: (N, c_maxlen, d)
        #   S_xc, S_xc_: (N, x_maxlen, c_maxlen)
        #   x2c: (N, x_maxlen, d)
        #   xp_exp: (N, x_maxlen, c_maxlen, 1)
        #   c_proj_exp: (N, 1, c_maxlen, d)
        #   cand_context: (N, x_maxlen, c_maxlen, d)
        #   cand_context_pool: (N, x_maxlen, d)
        #   cand_condense: (N, x_maxlen, d*2)
        #   self.cand_condense: (N, x_maxlen, d)
        inputs = tf.concat(attention_outputs, axis=-1)

        # NOTE(review): the softmax below runs over the context axis but
        # is masked with the candidate mask -- confirm this is intended.
        mask_x = tf.expand_dims(self.x_mask, 2)
        c_proj = conv(inputs, d, name="context_projection")
        S_xc = optimized_trilinear_for_attention(
            [x_emb, c_proj], self.x_maxlen, self.c_maxlen,
            input_keep_prob=1.0 - self.dropout)
        S_xc_ = tf.nn.softmax(mask_logits(S_xc, mask=mask_x))
        self.x2c = tf.matmul(S_xc_, c_proj)
        self.cand_condense = self.x2c

        if self.config.cand_condense_vector:
            # Weight every context position by self.xp per candidate
            # (broadcast multiply), optionally convolve, then pool over
            # the context axis.
            xp_exp = tf.expand_dims(self.xp, axis=-1)
            c_proj_exp = tf.expand_dims(c_proj, axis=1)
            cand_context = tf.multiply(c_proj_exp, xp_exp)
            if self.config.cand_condense_conv:
                cand_context = tf.reshape(cand_context, [N * XL, PL, d])
                cand_context = conv(cand_context, d, bias=True,
                                    activation=tf.nn.relu, kernel_size=3,
                                    name="candidate_from_context")
                cand_context = tf.reshape(cand_context, [N, XL, -1, d])
            if self.config.cand_condense_pool:
                cand_context_pool = tf.reduce_max(cand_context, axis=-2)
            else:
                cand_context_pool = tf.reduce_mean(cand_context, axis=-2)
            cand_condense = tf.concat([self.x2c, cand_context_pool],
                                      axis=-1)
            self.cand_condense = conv(cand_condense, d,
                                      name="candidate_projection")
            # NOTE(review): nesting reconstructed from flattened source;
            # this check and the debug hook are kept in the only branch
            # where the tensors they reference exist.
            if self.config.cand_fuse_vector:
                raise NotImplementedError
            # DEBUG
            self.debug_ops.extend([
                xp_exp, c_proj_exp, cand_context, cand_context_pool,
                cand_condense, self.cand_condense
            ])

        if not config.max_margin:
            # Softmax cross-entropy over the masked candidate logits.
            cand_logits = tf.squeeze(
                conv(self.cand_condense, 1, bias=False,
                     name="candidate_logits_1"), -1)
            self.cand_logits = mask_logits(cand_logits, mask=self.x_mask)
            loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.cand_logits, labels=self.yx)
            # DEBUG
            self.debug_ops.extend([
                loss, x_emb, c_proj, S_xc, S_xc_, self.x2c, self.x_mask,
                self.cand_logits, self.yx
            ])
        else:
            # Hinge loss between the best-scoring positive (yx) and the
            # best-scoring negative (yx_inv) candidate.
            cand_logits = conv(self.cand_condense, 1, bias=False,
                               name="candidate_logits_1")
            cand_logits = tf.tanh(cand_logits)
            cand_logits = tf.squeeze(
                conv(cand_logits, 1, bias=False,
                     name="candidate_logits_2"), -1)
            self.cand_logits = tf.sigmoid(cand_logits)
            pos = tf.multiply(self.cand_logits, self.yx)
            pos = tf.reduce_max(pos, axis=-1)
            negs = tf.multiply(self.cand_logits, self.yx_inv)
            neg = tf.reduce_max(negs, axis=-1)
            loss = tf.maximum(
                tf.add(tf.subtract(neg, pos), config.margin), 0.0)
            # DEBUG
            self.debug_ops.extend([
                loss, x_emb, c_proj, S_xc, S_xc_, self.x2c, self.x_mask,
                self.cand_logits, self.yx, pos, negs, neg, self.yx,
                self.yx_inv
            ])
        self.loss = tf.reduce_mean(loss)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        # NOTE(review): `regularizer` is not defined in this method;
        # presumably a module-level object -- verify.
        l2_loss = tf.contrib.layers.apply_regularization(regularizer,
                                                         variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # NOTE(review): nesting reconstructed from flattened source; the
        # assign ops are assumed to sit under the EMA control dependency.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() yields None for untracked variables; compare
                # explicitly instead of truth-testing a TF variable.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def darknet(inputs):
    """Build the darknet backbone and return its three feature maps.

    Applies an initial 3x3 conv layer, then five stages that each consist
    of a down-sampling 3x3 conv layer followed by a run of residual blocks.

    Returns:
        A tuple ``(route_1, route_2, final)``: the outputs of the third,
        fourth and fifth stage respectively.
    """
    # (downsample conv name, conv filters, residual group index,
    #  residual block count, residual num_filters)
    # The conv names are listed explicitly because they do not follow one
    # pattern (conv_4 and conv_5 live under the group they precede).
    stages = (
        ("darknet/conv_1", 64, 0, 1, 32),
        ("darknet/residual_group_0/conv_2", 128, 1, 2, 64),
        ("darknet/residual_group_1/conv_3", 256, 2, 8, 128),
        ("darknet/residual_group_3/conv_4", 512, 3, 8, 256),
        ("darknet/residual_group_4/conv_5", 1024, 4, 4, 512),
    )

    with tf.name_scope("darknet"):
        net = layers.conv_layer(name="darknet/conv_0",
                                inputs=inputs,
                                filters=32,
                                kernel_size=3)

        stage_outputs = []
        for conv_name, conv_filters, group, repeats, block_filters in stages:
            net = layers.conv_layer(name=conv_name,
                                    inputs=net,
                                    filters=conv_filters,
                                    kernel_size=3,
                                    downsample=True)
            for i in range(repeats):
                net = layers.residual_block(
                    f"darknet/residual_group_{group}/residual_{i}",
                    net,
                    num_filters=block_filters)
            stage_outputs.append(net)

        darknet_route_1 = stage_outputs[2]
        darknet_route_2 = stage_outputs[3]
        return darknet_route_1, darknet_route_2, stage_outputs[4]
def forward(self, trainable):
    """Build the 3-way classification graph on top of the QANet encoders.

    Encodes context and question, applies context-query attention and
    three model-encoder passes, summarises each pass with the final GRU
    state of the question, and averages three linear classifiers into
    ``self.scores`` / ``self.predictions``.

    Args:
        trainable: when true, also build ``self.loss``, ``self.accuracy``,
            the optional EMA ops (``self.var_ema``, ``self.assign_vars``),
            the learning-rate schedule ``self.lr``, the optimizer
            ``self.opt``, ``self.train_op`` and ``self.saver``.
    """
    config = self.config
    N, PL, QL = config.batch_size, self.c_maxlen, self.q_maxlen
    CL, d, dc, nh = (config.char_limit, config.hidden, config.char_dim,
                     config.num_heads)

    with tf.variable_scope("Input_Embedding_Layer"):
        # [total number of words, characters per word, char embedding dim]
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.ch),
            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder; the lines below produce the
        # convolved character features: [batch, feature_len, d]
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Keep the largest feature along the character axis; k-max could
        # be tried here instead of plain max pooling.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        # Reshape back to one row per sequence position.
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        # Join word and character features:
        # [batch, sequence_len, output dim]
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        # Acts as a filter over the information and projects to size d
        # (the original note gives 75: [batch, seq_len, 75]).
        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope('question_rnn'):
        # Summarise the encoded question with the final GRU state.
        self.gru = tf.contrib.rnn.GRUCell(d)
        initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
        output, state = tf.nn.dynamic_rnn(self.gru, q,
                                          initial_state=initstate)

        # Score every context position of each encoder pass against the
        # question state, then take the score-weighted sum over positions.
        state = tf.expand_dims(state, axis=2)
        weight1 = tf.matmul(self.enc[1], state)
        weight2 = tf.matmul(self.enc[2], state)
        weight3 = tf.matmul(self.enc[3], state)
        weight_enc1 = tf.multiply(self.enc[1], weight1)
        weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)
        weight_enc2 = tf.multiply(self.enc[2], weight2)
        weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)
        weight_enc3 = tf.multiply(self.enc[3], weight3)
        weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

    with tf.variable_scope("Output_Layer"):
        print(weight_enc1, "ggggggggggggggggg")
        inputs_shape = weight_enc1.get_shape().as_list()
        # One linear classifier (3 classes) shared by all three summaries.
        W = tf.get_variable(
            "W",
            shape=[inputs_shape[-1], 3],
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
        self.l2_loss += tf.nn.l2_loss(W)
        self.l2_loss += tf.nn.l2_loss(b)
        self.scores1 = tf.nn.xw_plus_b(weight_enc1, W, b, name="scores")
        self.scores2 = tf.nn.xw_plus_b(weight_enc2, W, b, name="scores")
        self.scores3 = tf.nn.xw_plus_b(weight_enc3, W, b, name="scores")
        self.scores = (self.scores1 + self.scores2 + self.scores3) / 3.0
        print(self.scores)
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    # NOTE(review): nesting reconstructed from flattened source; everything
    # below needs self.loss and is therefore kept under `if trainable`.
    if trainable:
        with tf.name_scope("loss"):
            print(self.scores, self.input_y, "llllllllllllllll")
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(
                losses) + self.l2_reg_lambda * self.l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() yields None for untracked variables;
                    # compare explicitly instead of truth-testing a TF
                    # variable.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))

        # Logarithmic warm-up of the learning rate, capped at init_lr.
        self.lr = tf.minimum(
            config.init_lr, 0.001 / tf.log(999.) *
            tf.log(tf.cast(self.global_step, tf.float32) + 1))
        self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                          beta1=0.8,
                                          beta2=0.999,
                                          epsilon=1e-7)
        grads = self.opt.compute_gradients(self.loss)
        gradients, variables = zip(*grads)
        capped_grads, _ = tf.clip_by_global_norm(gradients,
                                                 config.grad_clip)
        self.train_op = self.opt.apply_gradients(
            zip(capped_grads, variables), global_step=self.global_step)
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
def pred(self):
    """Assemble the span-prediction graph from the input iterator.

    One ``((questions, lengths), (contexts, lengths), answers)`` element is
    pulled from ``self.iterator``. Both sequences are looked up in
    ``self.embedding`` and passed through ``residual_block`` calls that use
    the same ``Encoder_Residual_Block`` scope (the second call passes
    ``reuse=True``). A dot-product similarity between the two encodings
    drives context-to-query and query-to-context attention, three
    ``Model_Encoder`` passes follow, and two ``conv`` pointers score the
    start and end positions.

    Side effects: sets ``self.questions``, ``self.contexts``,
    ``self.answers``, ``self.enc``, ``self.pred_start``, ``self.pred_end``
    and ``self.preds``. The static shape of most intermediate tensors is
    printed while the graph is being built.
    """

    def report(label, tensor):
        # Build-time trace: the label followed by the tensor's static shape.
        print(label, tensor.get_shape().as_list())

    with tf.variable_scope("embedding_layer"):
        element = self.iterator.get_next()
        (self.questions, q_lengths), (self.contexts, c_lengths), self.answers = element
        # The masks are sized from the stored training maxima, not from the
        # longest sequence in the current batch.
        c_maxlen = self.train_max_context_length
        q_maxlen = self.train_max_question_length
        c_mask = tf.sequence_mask(c_lengths, maxlen=c_maxlen)
        q_mask = tf.sequence_mask(q_lengths, maxlen=q_maxlen)
        q_embedded = tf.nn.embedding_lookup(self.embedding, self.questions)
        c_embedded = tf.nn.embedding_lookup(self.embedding, self.contexts)
        report('question_embeddings', q_embedded)
        report('context_embeddings', c_embedded)

    with tf.variable_scope("embedding_layer"):
        c_encoded = residual_block(
            c_embedded,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=c_mask,
            num_filters=self.lstm_hidden_size,
            num_heads=1,
            seq_len=c_maxlen,
            scope="Encoder_Residual_Block",
            bias=False,
            dropout=1.0 - self.keep_prob)
        report('context_output', c_encoded)
        # Same scope with reuse=True: the question goes through the encoder
        # variables created for the context.
        q_encoded = residual_block(
            q_embedded,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=q_mask,
            num_filters=self.lstm_hidden_size,
            num_heads=1,
            seq_len=q_maxlen,
            scope="Encoder_Residual_Block",
            reuse=True,
            bias=False,
            dropout=1.0 - self.keep_prob)
        report('question_output', q_encoded)

    with tf.variable_scope("attention_layer"):
        # Similarity of every context position with every question position.
        q_transposed = tf.transpose(q_encoded, [0, 2, 1])
        scores = tf.matmul(c_encoded, q_transposed)
        report('similarity_matrix', scores)
        pair_mask = tf.expand_dims(c_mask, 2) & tf.expand_dims(q_mask, 1)
        scores = preprocess_softmax(scores, pair_mask)
        report('similarity_matrix', scores)

        # Context-to-query: softmax over the question axis.
        c2q_weights = tf.nn.softmax(scores, axis=2)
        report('context_to_query_attention_weights', c2q_weights)
        c2q = tf.matmul(c2q_weights, q_encoded)
        report('context_to_query', c2q)

        # Query-to-context: softmax over the per-context-position maximum.
        best_per_position = tf.reduce_max(scores, axis=2)
        report('max_col_similarity', best_per_position)
        q2c_weights = tf.nn.softmax(best_per_position, axis=1)
        report('b', q2c_weights)
        q2c_weights = tf.expand_dims(q2c_weights, 1)
        report('b', q2c_weights)
        q2c = tf.matmul(q2c_weights, c_encoded)
        report('query_to_context', q2c)

        c_times_c2q = c_encoded * c2q
        report('context_output_with_context_to_query', c_times_c2q)
        c_times_q2c = c_encoded * q2c
        report('context_output_with_query_to_context', c_times_q2c)
        attention = tf.concat(
            [c_encoded, c2q, c_times_c2q, c_times_q2c], axis=2)
        report('attention', attention)

    with tf.variable_scope("modeling_layer"):
        projected = conv(attention, self.lstm_hidden_size,
                         name="input_projection")
        self.enc = [projected]
        for i in range(3):
            if not i % 2:
                # Dropout is applied to the input of every second pass.
                self.enc[i] = tf.nn.dropout(self.enc[i], self.keep_prob)
            # The first pass creates the variables; later passes reuse them.
            reuse_flag = None if i == 0 else True
            next_enc = residual_block(
                self.enc[i],
                num_blocks=7,
                num_conv_layers=2,
                kernel_size=5,
                mask=c_mask,
                num_filters=self.lstm_hidden_size,
                num_heads=1,
                seq_len=c_maxlen,
                scope="Model_Encoder",
                bias=False,
                reuse=reuse_flag,
                dropout=1.0 - self.keep_prob)
            self.enc.append(next_enc)
            report('self.enc[i]', self.enc[i])

    with tf.variable_scope("output_layer_start"):
        start_features = tf.concat([self.enc[1], self.enc[2]], axis=-1)
        start_scores = tf.squeeze(
            conv(start_features, 1, bias=False, name="start_pointer"), -1)
        report('pred_start', start_scores)
        self.pred_start = preprocess_softmax(start_scores, c_mask)
        report('self.pred_start', self.pred_start)

    with tf.variable_scope("output_layer_end"):
        end_features = tf.concat([self.enc[1], self.enc[3]], axis=-1)
        end_scores = tf.squeeze(
            conv(end_features, 1, bias=False, name="end_pointer"), -1)
        report('pred_end', end_scores)
        self.pred_end = preprocess_softmax(end_scores, c_mask)
        report('self.pred_end', self.pred_end)

    # One (start, end) index pair per example.
    best_start = tf.argmax(self.pred_start, axis=1)
    best_end = tf.argmax(self.pred_end, axis=1)
    self.preds = tf.transpose([best_start, best_end])
def forward(self):
    """Build the forward graph that scores three alternative answers.

    Stages, each in its own variable scope:

    1. ``Input_Embedding_Layer`` - character embeddings go through the
       shared ``char_conv`` convolution and a max over axis 1, are
       concatenated with word embeddings and passed through the shared
       ``highway`` layer. This is done for the context, the question and
       the three alternatives.
    2. ``Embedding_Encoder_Layer`` - one ``residual_block`` call per
       sequence, all on the ``Encoder_Residual_Block`` scope.
    3. ``Context_to_Query_Attention_Layer`` - trilinear similarity followed
       by context-to-query and query-to-context attention.
    4. ``Model_Encoder_Layer`` - three passes through a 7-block
       ``Model_Encoder`` whose variables are reused after the first pass.
    5. ``question_rnn`` - one GRU cell is run over the question and then,
       starting from the question's final state, over each alternative.
       Each model encoding is weighted by its product with the question
       state and summed over axis 1.
    6. ``Output_Layer`` - ``self.cos_sine`` between each pooled encoding and
       the matching alternative state, softmaxed over the three values.

    Side effects: sets ``self.alternati_emb1..3``,
    ``self.alter_embedding1``, ``self.c2q``, ``self.q2c``, ``self.enc``,
    ``self.gru`` and ``self.logits1``; prints ``self.logits1`` while the
    graph is being built.
    """
    config = self.config
    (N, PL, QL, CL, d, dc, nh, AL1, AL2, AL3) = (
        config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit,
        config.hidden, config.char_dim, config.num_heads,
        self.aletr1_maxlen, self.aletr2_maxlen, self.aletr3_maxlen)

    with tf.variable_scope("Input_Embedding_Layer"):
        # Character ids -> vectors, flattened so that each word is one row
        # of CL characters with dc features.
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.ch),
            [N * PL, CL, dc])
        qh_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.qh),
            [N * QL, CL, dc])
        self.alternati_emb1 = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.alter1h),
            [N * AL1, CL, dc])
        self.alternati_emb2 = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.alter2h),
            [N * AL2, CL, dc])
        self.alternati_emb3 = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.alter3h),
            [N * AL3, CL, dc])

        # Character embeddings use half the dropout rate of word embeddings.
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
        alternati_emb1 = tf.nn.dropout(
            self.alternati_emb1, 1.0 - 0.5 * self.dropout)
        alternati_emb2 = tf.nn.dropout(
            self.alternati_emb2, 1.0 - 0.5 * self.dropout)
        alternati_emb3 = tf.nn.dropout(
            self.alternati_emb3, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder. The first call creates the
        # "char_conv" variables; every later call reuses them.
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)
        alternati_emb1 = conv(alternati_emb1, d, bias=True,
                              activation=tf.nn.relu, kernel_size=5,
                              name="char_conv", reuse=True)
        alternati_emb2 = conv(alternati_emb2, d, bias=True,
                              activation=tf.nn.relu, kernel_size=5,
                              name="char_conv", reuse=True)
        alternati_emb3 = conv(alternati_emb3, d, bias=True,
                              activation=tf.nn.relu, kernel_size=5,
                              name="char_conv", reuse=True)

        # Max over axis 1 of the convolved features (plain max, not k-max).
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        alternati_emb1 = tf.reduce_max(alternati_emb1, axis=1)
        alternati_emb2 = tf.reduce_max(alternati_emb2, axis=1)
        alternati_emb3 = tf.reduce_max(alternati_emb3, axis=1)

        # Back to one row per sequence position.
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
        # The alternatives borrow qh_emb's last dimension; all five tensors
        # come out of the same "char_conv" call with `d` filters.
        alternati_emb1 = tf.reshape(
            alternati_emb1, [N, AL1, qh_emb.shape[-1]])
        alternati_emb2 = tf.reshape(
            alternati_emb2, [N, AL2, qh_emb.shape[-1]])
        alternati_emb3 = tf.reshape(
            alternati_emb3, [N, AL3, qh_emb.shape[-1]])

        # Word embeddings. Only context and question get dropout here; the
        # alternatives are looked up without it.
        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)
        alter_embedding1 = tf.nn.embedding_lookup(self.word_mat, self.alter1)
        alter_embedding2 = tf.nn.embedding_lookup(self.word_mat, self.alter2)
        alter_embedding3 = tf.nn.embedding_lookup(self.word_mat, self.alter3)

        # Concatenate word and character features along the last axis.
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        alter_embedding1 = tf.concat(
            [alter_embedding1, alternati_emb1], axis=2)
        alter_embedding2 = tf.concat(
            [alter_embedding2, alternati_emb2], axis=2)
        alter_embedding3 = tf.concat(
            [alter_embedding3, alternati_emb3], axis=2)

        # Shared highway layer of size d.
        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        # NOTE(review): this stores the *context* highway output under the
        # name alter_embedding1 - looks like a debugging leftover; confirm
        # before relying on the attribute.
        self.alter_embedding1 = c_emb
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)
        alter_embedding1 = highway(alter_embedding1, size=d, scope="highway",
                                   dropout=self.dropout, reuse=True)
        alter_embedding2 = highway(alter_embedding2, size=d, scope="highway",
                                   dropout=self.dropout, reuse=True)
        alter_embedding3 = highway(alter_embedding3, size=d, scope="highway",
                                   dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # Every following call reuses the encoder variables created above.
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,
                           bias=False,
                           dropout=self.dropout)
        # NOTE(review): `alterh1_len` does not follow the `alter2_len` /
        # `alter3_len` naming used below - possibly a typo; confirm against
        # where these attributes are defined.
        alter1 = residual_block(alter_embedding1,
                                num_blocks=1,
                                num_conv_layers=4,
                                kernel_size=7,
                                mask=self.alter1_mask,
                                num_filters=d,
                                num_heads=nh,
                                seq_len=self.alterh1_len,
                                scope="Encoder_Residual_Block",
                                reuse=True,
                                bias=False,
                                dropout=self.dropout)
        alter2 = residual_block(alter_embedding2,
                                num_blocks=1,
                                num_conv_layers=4,
                                kernel_size=7,
                                mask=self.alter2_mask,
                                num_filters=d,
                                num_heads=nh,
                                seq_len=self.alter2_len,
                                scope="Encoder_Residual_Block",
                                reuse=True,
                                bias=False,
                                dropout=self.dropout)
        alter3 = residual_block(alter_embedding3,
                                num_blocks=1,
                                num_conv_layers=4,
                                kernel_size=7,
                                mask=self.alter3_mask,
                                num_filters=d,
                                num_heads=nh,
                                seq_len=self.alter3_len,
                                scope="Encoder_Residual_Block",
                                reuse=True,
                                bias=False,
                                dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        # Softmax over question positions (default last axis).
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        # Softmax over context positions. `axis` replaces the deprecated
        # `dim` alias of tf.nn.softmax.
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope('question_rnn'):
        self.gru = tf.contrib.rnn.GRUCell(d)
        initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
        # Only the final states are used; per-step outputs are discarded.
        _, state = tf.nn.dynamic_rnn(self.gru, q, initial_state=initstate)
        # Each alternative is read by the same cell, starting from the
        # question's final state.
        _, state1 = tf.nn.dynamic_rnn(self.gru, alter1, initial_state=state)
        _, state2 = tf.nn.dynamic_rnn(self.gru, alter2, initial_state=state)
        _, state3 = tf.nn.dynamic_rnn(self.gru, alter3, initial_state=state)

        # Turn the question state into a column so it can be matmul'ed
        # against every position of the model encodings.
        state = tf.expand_dims(state, axis=2)
        weight1 = tf.matmul(self.enc[1], state)
        weight2 = tf.matmul(self.enc[2], state)
        weight3 = tf.matmul(self.enc[3], state)
        # Weighted sum of each encoding over axis 1.
        weight_enc1 = tf.multiply(self.enc[1], weight1)
        weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)
        weight_enc2 = tf.multiply(self.enc[2], weight2)
        weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)
        weight_enc3 = tf.multiply(self.enc[3], weight3)
        weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

    with tf.variable_scope("Output_Layer"):
        # One similarity value per alternative, stacked on axis 1.
        similary1 = tf.expand_dims(
            self.cos_sine(weight_enc1, state1), axis=1)
        similary2 = tf.expand_dims(
            self.cos_sine(weight_enc2, state2), axis=1)
        similary3 = tf.expand_dims(
            self.cos_sine(weight_enc3, state3), axis=1)
        # Despite the name, logits1 holds the softmax output.
        self.logits1 = tf.nn.softmax(
            tf.concat([similary1, similary2, similary3], axis=1))
        print(self.logits1, "lllllllllllllllllllllllllllllllllllll")
def __init__(self, config, batch, word_mat=None, char_mat=None,
             filter_sizes=None, embedding_size=None, num_filters=None,
             trainable=True, l2_reg_lambda=0.0, keep_prob=0.9, graph=None):
    """Build a three-class classifier that combines a CNN and a GRU branch.

    Both branches start from word embeddings concatenated with a
    character-level embedding (the final states of a bidirectional GRU over
    each word's characters). The ``cnn_predict`` branch uses highway,
    ``residual_block`` and ``dot_attention`` layers. The second branch uses
    a stacked GRU encoder plus ``dot_attention``. Each branch ends in
    convolution + max-pool over every size in ``filter_sizes``; the pooled
    features are concatenated and projected to three scores.

    Args:
        config: settings object. Reads ``use_cudnn``, ``batch_size``,
            ``para_limit``, ``ques_limit``, ``char_limit``, ``hidden``,
            ``char_dim``, ``char_hidden``, ``num_heads``, ``keep_prob``,
            ``test_para_limit``, ``test_ques_limit``, ``init_lr`` and
            ``grad_clip``.
        batch: iterator whose ``get_next()`` yields seven tensors when
            ``trainable`` is truthy (including labels and ``qa_id``) and
            five otherwise.
        word_mat: initial value of the word embedding matrix.
        char_mat: initial value of the character embedding matrix.
        filter_sizes: iterable of convolution filter heights.
        embedding_size: accepted for interface compatibility; unused here.
        num_filters: number of filters per filter size.
        trainable: when truthy, labels are read and the loss, accuracy and
            training op are built.
        l2_reg_lambda: weight of the accumulated L2 term in the loss.
        keep_prob: keep probability of the dropout applied to the pooled
            features (stored as ``self.dropout_keep_prob``).
        graph: stored on ``self.graph``; a new ``tf.Graph`` is created when
            ``None``.
    """
    self.config = config
    self.graph = graph if graph is not None else tf.Graph()
    self.trainable = trainable
    gru = cudnn_gru if config.use_cudnn else native_gru
    # NOTE(review): a boolean flag created with trainable=True ends up in
    # the trainable-variables collection; confirm this is intended.
    self.is_train = tf.get_variable(
        "is_train", shape=[], dtype=tf.bool, trainable=True)

    # Truthiness test, consistent with the later `if trainable:` checks, so
    # the label tensors exist whenever the loss section is built.
    if trainable:
        (self.input_x, self.input_x1, self.ch, self.qh, self.input_y,
         self.qa_id, self.alternatives_tokens) = batch.get_next()
    else:
        (self.input_x, self.input_x1, self.ch, self.qh,
         self.alternatives_tokens) = batch.get_next()

    self.dropout_keep_prob = keep_prob
    self.global_step = tf.get_variable(
        'global_step', shape=[], dtype=tf.int32,
        initializer=tf.constant_initializer(0), trainable=False)
    self.dropout = tf.placeholder_with_default(0.5, (), name="dropout")

    # Keeping track of l2 regularization loss (optional)
    l2_loss = tf.constant(0.0)

    # Non-zero ids mark real tokens; lengths are the counts of such ids.
    self.c_mask = tf.cast(self.input_x, tf.bool)
    self.q_mask = tf.cast(self.input_x1, tf.bool)
    self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
    self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)
    # Per-word character counts, flattened to one entry per word.
    self.ch_len = tf.reshape(
        tf.reduce_sum(
            tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2), [-1])
    self.qh_len = tf.reshape(
        tf.reduce_sum(
            tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2), [-1])

    # Embedding layer
    (N, PL, QL, CL, d, dc, dg, nh) = (
        config.batch_size, config.para_limit, config.ques_limit,
        config.char_limit, config.hidden, config.char_dim,
        config.char_hidden, config.num_heads)

    with tf.variable_scope("Input_Embedding_Layer"):
        self.char_mat = tf.get_variable(
            "char_mat",
            initializer=tf.constant(char_mat, dtype=tf.float32),
            trainable=True)
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.ch),
            [N * PL, CL, dc])
        qh_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.qh),
            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # One forward and one backward GRU cell with dg units, shared by
        # the context and question character sequences.
        cell_fw = tf.contrib.rnn.GRUCell(dg)
        cell_bw = tf.contrib.rnn.GRUCell(dg)
        # The final forward/backward states summarize each word's
        # characters; concatenated they have 2 * dg features.
        _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
        ch_emb = tf.concat([state_fw, state_bw], axis=1)
        _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
        qh_emb = tf.concat([state_fw, state_bw], axis=1)
        qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
        ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
        # Original author's idea, kept for reference: pinyin could serve as
        # the character-level signal for Chinese text.
        print(qh_emb, "llllllllllllll")

    with tf.name_scope("embedding"):
        self.W = tf.get_variable(
            "word_mat",
            initializer=tf.constant(word_mat, dtype=tf.float32),
            trainable=True)
        # These repeat the mask computation above with identical inputs.
        self.c_mask = tf.cast(self.input_x, tf.bool)
        self.q_mask = tf.cast(self.input_x1, tf.bool)
        if trainable:
            self.c_maxlen, self.q_maxlen = (
                config.para_limit, config.ques_limit)
        else:
            self.c_maxlen, self.q_maxlen = (
                config.test_para_limit, config.test_ques_limit)
        self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
        self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)
        c_emb = tf.concat([self.embedded_chars, ch_emb], axis=2)
        q_emb = tf.concat([self.embedded_chars1, qh_emb], axis=2)

    with tf.variable_scope("cnn_predict"):
        pooled_outputs = []
        # Shared highway layer of size d.
        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # reuse=True: the question shares the context encoder's variables.
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,
                           bias=False,
                           dropout=self.dropout)
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob,
                               is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)
        print(att, "111111111111111111111111")
        c_emb_expanded_shape = att.get_shape().as_list()
        # Add a trailing channel axis so conv2d can be applied.
        c_emb_expanded = tf.expand_dims(att, -1)
        for filter_size in filter_sizes:
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, c_emb_expanded_shape[-1], 1,
                                num_filters]
                W = tf.Variable(
                    tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                l2_loss += tf.nn.l2_loss(W)
                b = tf.Variable(
                    tf.constant(0.1, shape=[num_filters]), name="b")
                l2_loss += tf.nn.l2_loss(b)
                conv_ouput = tf.nn.conv2d(
                    c_emb_expanded, W, strides=[1, 1, 1, 1],
                    padding="VALID", name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv_ouput, b), name="relu")
                # Maxpooling over the outputs; the window spans every
                # position left by the VALID convolution.
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, c_emb_expanded_shape[1] - filter_size + 1,
                           1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                print(pooled, "222222222222222222222")
                pooled_outputs.append(pooled)
        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat_cnn = tf.reshape(
            self.h_pool, [-1, num_filters_total])

    with tf.variable_scope("encoding"):
        # Three-layer GRU encoder applied to both context and question.
        rnn = gru(num_layers=3, num_units=d, batch_size=N,
                  input_size=c_emb.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob,
                               is_train=self.is_train)
        rnn = gru(num_layers=1, num_units=d, batch_size=N,
                  input_size=qc_att.get_shape().as_list()[-1],
                  keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len)

    # Create a convolution + maxpool layer for each filter size
    input_shape = att.get_shape().as_list()
    print(att, "rrrr")
    att = tf.expand_dims(att, -1)
    print(att, "hhhhhhhhhhhh")
    pooled_outputs = []
    for filter_size in filter_sizes:
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, input_shape[-1], 1, num_filters]
            W = tf.Variable(
                tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            l2_loss += tf.nn.l2_loss(W)
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            l2_loss += tf.nn.l2_loss(b)
            conv_ouput = tf.nn.conv2d(
                att, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv_ouput, b), name="relu")
            # Maxpooling over the outputs.
            # NOTE(review): this window is sized from config.para_limit
            # while the CNN branch above uses the tensor's static length -
            # confirm the two always agree.
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, config.para_limit - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            print(pooled, "3333333333333333333333333")
            pooled_outputs.append(pooled)

    # Combine all the pooled features
    num_filters_total = num_filters * len(filter_sizes)
    self.h_pool = tf.concat(pooled_outputs, 3)
    self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

    # Add dropout
    with tf.name_scope("dropout"):
        self.h_drop_lstm = tf.nn.dropout(
            self.h_pool_flat, self.dropout_keep_prob)
        self.h_drop_cnn = tf.nn.dropout(
            self.h_pool_flat_cnn, self.dropout_keep_prob)
        self.h_drop = tf.concat(
            [self.h_drop_lstm, self.h_drop_cnn], axis=-1)

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        # Two branches are concatenated, hence twice the pooled width.
        W = tf.get_variable(
            "W",
            shape=[num_filters_total * 2, 3],
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    if trainable:
        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            print(self.scores, self.input_y, "llllllllllllllll")
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(
                self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")

        # Learning rate grows with log(global_step + 1) and is capped at
        # config.init_lr.
        self.lr = tf.minimum(
            config.init_lr,
            0.001 / tf.log(999.)
            * tf.log(tf.cast(self.global_step, tf.float32) + 1))
        self.opt = tf.train.AdamOptimizer(
            learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7)
        grads = self.opt.compute_gradients(self.loss)
        gradients, variables = zip(*grads)
        # Clip by global norm before applying.
        capped_grads, _ = tf.clip_by_global_norm(
            gradients, config.grad_clip)
        self.train_op = self.opt.apply_gradients(
            zip(capped_grads, variables), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
def forward(self):
    """Build the span-prediction graph with a no-answer branch in the loss.

    Stages, each in its own variable scope: character + word embedding with
    a shared ``char_conv`` and ``highway``; a shared
    ``Encoder_Residual_Block``; trilinear context-query attention; three
    ``Model_Encoder`` passes; and an output layer producing masked
    start/end logits.

    In the output layer the outer product of the start and end
    distributions is band-limited to ``config.ans_limit``, reduced to
    per-position maxima, and padded on the position axis to
    ``self.MAX_PL`` so it can be multiplied with ``self.weights1`` /
    ``self.weights2`` for the no-answer flag. The loss selects, per
    example, either the flagged sum (where ``self.no_answer > 0``) or the
    softmax cross-entropy against ``self.y1`` / ``self.y2``.

    Side effects: sets ``self.c2q``, ``self.q2c``, ``self.enc``,
    ``self.logits``, ``self.yp1``, ``self.yp2`` and ``self.loss``. When
    ``config.decay`` is not ``None`` it also sets ``self.var_ema`` and
    ``self.assign_vars``. Prints three tensors while the graph is built.
    """
    config = self.config
    # Demo mode runs one example at a time.
    N = config.batch_size if not self.demo else 1
    PL, QL = self.c_maxlen, self.q_maxlen
    CL, d, dc, nh = (config.char_limit, config.hidden, config.char_dim,
                     config.num_heads)

    with tf.variable_scope("Input_Embedding_Layer"):
        # Character ids -> vectors, one row of CL characters per word.
        ch_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.ch),
            [N * PL, CL, dc])
        qh_emb = tf.reshape(
            tf.nn.embedding_lookup(self.char_mat, self.qh),
            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder; the question reuses the
        # "char_conv" variables created for the context.
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Max over axis 1, then back to one row per sequence position.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # Share the weights between passage and question
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.q_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention(
            [c, q], self.c_maxlen, self.q_maxlen,
            input_keep_prob=1.0 - self.dropout)
        # Softmax over question positions (default last axis).
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        # Softmax over context positions, then transposed.
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        conv1 = conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1, bias=False, name="start_pointer")
        start_logits = tf.squeeze(conv1, -1)
        conv2 = conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1, bias=False, name="end_pointer")
        end_logits = tf.squeeze(conv2, -1)

        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]
        logits1, logits2 = self.logits

        # Joint start/end probabilities; the band keeps only pairs with
        # 0 <= end - start <= config.ans_limit.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        reduced1 = tf.reduce_max(outer, axis=2)
        reduced2 = tf.reduce_max(outer, axis=1)

        # Pad the position axis (axis 1) up to MAX_PL. The amount must be
        # derived from the current context length, i.e. dimension 1 of
        # `reduced1`; dimension 0 is the batch size.
        paddings = [[0, 0], [0, self.MAX_PL - tf.shape(reduced1)[1]]]
        reduced1 = tf.pad(reduced1, paddings, "CONSTANT")
        reduced2 = tf.pad(reduced2, paddings, "CONSTANT")
        # The slice leaves the values unchanged but requests the [N, MAX_PL]
        # extent with constant sizes ahead of the matmuls below.
        reduced1 = tf.slice(reduced1, [0, 0], [N, self.MAX_PL])
        reduced2 = tf.slice(reduced2, [0, 0], [N, self.MAX_PL])

        # no answer flag: (no_answer, answer_exist)
        # TODO add additional layer
        # TODO dimension between reduced and weight
        na_flag1 = tf.cast(
            tf.argmax(tf.matmul(reduced1, self.weights1), axis=1),
            tf.float32)
        na_flag2 = tf.cast(
            tf.argmax(tf.matmul(reduced2, self.weights2), axis=1),
            tf.float32)

        self.yp1 = tf.argmax(reduced1, axis=1)
        self.yp2 = tf.argmax(reduced2, axis=1)

        # Build-time debug output of the tensors used in the loss.
        print(tf.reduce_sum(reduced1, axis=1))
        print(tf.multiply(na_flag1, tf.reduce_sum(reduced1, axis=1)))
        print(
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                       labels=self.y1))

        # Per example: flagged probability mass when the example is marked
        # as having no answer, cross-entropy otherwise.
        losses = tf.where(
            self.no_answer > 0,
            tf.multiply(na_flag1, tf.reduce_sum(reduced1, axis=1)),
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                       labels=self.y1))
        losses2 = tf.where(
            self.no_answer > 0,
            tf.multiply(na_flag2, tf.reduce_sum(reduced2, axis=1)),
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2,
                                                       labels=self.y2))
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Evaluating the loss also runs the moving-average update.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() returns None for variables without a shadow
                # copy; test for that explicitly instead of relying on the
                # truth value of a variable object.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def build_model(self):
    """Build the full graph: embeddings, encoders, attention, pointers and losses.

    Reads the input tensors and hyper-parameters stored on ``self``
    (``contw_input``, ``contc_input``, ``quesw_input``, ``quesc_input``,
    ``c_mask``, ``q_mask``, ``y_start``, ``y_end`` ...) and sets:

    * ``self.logits1`` / ``self.logits2`` -- masked start / end logits.
    * ``self.output1`` / ``self.output2`` -- arg-max start / end positions.
    * ``self.loss`` -- the training loss (plus optional top-k, diversity,
      L2 terms and an EMA control dependency).
    * ``self.assign_vars`` -- ops copying EMA shadow values into variables
      (only when ``self.decay`` is not None).

    Raises:
        ValueError: if ``self.use_topk`` is set and ``self.topk_loss`` is
            neither ``'f1'`` nor ``'em'``.
    """
    # NOTE(review): the block nesting below was reconstructed from the code's
    # syntax; confirm that the variable scopes match existing checkpoints.
    PL, QL, CL, d, dc, nh = (self.c_maxlen, self.q_maxlen, self.char_limit,
                             self.filters, self.char_dim, self.num_heads)

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.contc_input), [-1, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.quesc_input), [-1, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Max over axis 1 of the conv output, then restore the sequence axis.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        ch_emb = tf.reshape(ch_emb, [-1, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [-1, QL, ch_emb.shape[-1]])

        c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.contw_input),
                              1.0 - self.dropout)
        q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.quesw_input),
                              1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        if self.use_elmo:
            c_emb = tf.concat([c_emb, self.cont_elmo], axis=-1)
            q_emb = tf.concat([q_emb, self.ques_elmo], axis=-1)

        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.cont_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # reuse=True: the question encoder shares the scope (and weights) above.
        q = residual_block(q_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.q_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.ques_len,
                           scope="Encoder_Residual_Block",
                           reuse=True,
                           bias=False,
                           dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        # `axis=` replaces the deprecated `dim=` keyword (already used below).
        S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        c2q = tf.matmul(S_, q)
        q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, c2q, c * c2q, c * q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        attention_inputs = tf.concat(attention_outputs, axis=-1)
        enc = [conv(attention_inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                enc[i] = tf.nn.dropout(enc[i], 1.0 - self.dropout)
            # The same "Model_Encoder" scope is reused for passes 2 and 3.
            enc.append(residual_block(enc[i],
                                      num_blocks=7,
                                      num_conv_layers=2,
                                      kernel_size=5,
                                      mask=self.c_mask,
                                      num_filters=d,
                                      num_heads=nh,
                                      seq_len=self.cont_len,
                                      scope="Model_Encoder",
                                      bias=False,
                                      reuse=True if i > 0 else None,
                                      dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.concat([enc[1], enc[2]], axis=-1)
        end_logits = tf.concat([enc[1], enc[3]], axis=-1)
        if self.use_elmo:
            start_logits = tf.concat((start_logits, self.cont_elmo), axis=-1)
            end_logits = tf.concat((end_logits, self.cont_elmo), axis=-1)

        start_logits = tf.squeeze(conv(start_logits, 1, bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(conv(end_logits, 1, bias=False, name="end_pointer"), -1)

        # 2.0 Dataset
        # unanswer_bias = tf.get_variable("unanswer_bias", [1],
        #                                 regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
        #                                 initializer=tf.zeros_initializer())
        # unanswer_bias = tf.reshape(tf.tile(unanswer_bias, [self.batch_size]), [-1, 1])
        # self.logits1 = tf.concat((unanswer_bias, mask_logits(start_logits, mask=self.c_mask)), axis=-1)
        # self.logits2 = tf.concat((unanswer_bias, mask_logits(end_logits, mask=self.c_mask)), axis=-1)

        self.logits1 = mask_logits(start_logits, mask=self.c_mask)
        self.logits2 = mask_logits(end_logits, mask=self.c_mask)

        start_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits1, labels=self.y_start)
        end_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits2, labels=self.y_end)
        self.loss = tf.reduce_mean(start_loss + end_loss)

        # output: joint start/end probabilities, restricted to spans with
        # 0 <= end - start <= ans_limit by the band mask.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, self.ans_limit)
        self.output1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.output2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

    if self.use_topk:
        with tf.variable_scope("Topk_Layer"):
            top_size = 3
            outer = tf.reshape(outer, [self.batch_size, -1])
            outer_inds = tf.nn.top_k(outer, top_size).indices  # [N,top_size]
            # Recover (start, end) from the flattened index.
            self.yp1 = outer_inds // tf.shape(self.logits1)[-1]
            self.yp2 = outer_inds % tf.shape(self.logits2)[-1]

            def sen_mask(tensor):
                """Map each (start, end) pair in `tensor` to a 0/1 span mask."""
                def sen_mask_(a, b, filters):
                    # Rows [0, a) are zeros, [a, b) ones, [b, length) zeros.
                    try:
                        mata = tf.zeros([a, filters], tf.int32)
                    except Exception:
                        # Narrowed from a bare `except:` so KeyboardInterrupt /
                        # SystemExit are no longer swallowed here.
                        mata = []
                    matb = tf.ones([b - a, filters], tf.int32)
                    matc = tf.zeros([tf.shape(self.logits1)[-1] - b, filters], tf.int32)
                    mat = tf.concat((mata, matb, matc), axis=0)
                    return mat
                return tf.map_fn(lambda x: sen_mask_(x[0], x[1], self.filters), tensor)

            self.yp3 = self.yp2 + 1
            self.yp1 = tf.expand_dims(self.yp1, -1)
            self.yp2 = tf.expand_dims(self.yp2, -1)
            self.yp3 = tf.expand_dims(self.yp3, -1)
            self.y_mask = tf.concat([self.yp1, self.yp3], axis=-1)
            self.y_mask = tf.map_fn(lambda x: sen_mask(x), self.y_mask)

            # answer
            c = tf.tile(tf.expand_dims(c2q, 1), [1, top_size, 1, 1])
            c_topk = tf.multiply(tf.cast(self.y_mask, tf.float32), c)
            W1 = tf.get_variable("W1", initializer=tf.ones([1, 1, 1, self.filters]))
            W1 = tf.tile(W1, [self.batch_size, top_size, 1, 1])
            alpha1 = tf.nn.softmax(tf.matmul(W1, c_topk, transpose_b=True), axis=2)
            answer = tf.matmul(alpha1, c_topk)  # [32,top_size,1,128]

            # question
            W2 = tf.get_variable("W2", initializer=tf.ones([1, 1, self.filters]))
            W2 = tf.tile(W2, [self.batch_size, 1, 1])
            alpha2 = tf.nn.softmax(tf.matmul(W2, q, transpose_b=True), axis=1)
            ques = tf.matmul(alpha2, q)
            ques = tf.tile(tf.expand_dims(ques, 1), [1, top_size, 1, 1])  # [32,top_size,1,128]

            # question & answer
            W3 = tf.get_variable("W3", initializer=tf.ones([1, 1, self.filters, self.filters]))
            W3 = tf.tile(W3, [self.batch_size, top_size, 1, 1])
            y_topk_logits = tf.nn.sigmoid(
                tf.matmul(ques, tf.matmul(W3, answer, transpose_b=True)))  # [32,top_size,1,1]
            y_topk_logits = tf.squeeze(y_topk_logits)  # [32,top_size]

            self.yp1 = tf.squeeze(self.yp1)
            self.yp2 = tf.squeeze(self.yp2)
            coeff1_topk = tf.one_hot(self.yp1, self.c_maxlen, axis=-1)  # [32,top_size,400] one-hot
            coeff2_topk = tf.one_hot(self.yp2, self.c_maxlen, axis=-1)
            # [0,1,0,0,0][0,0,0,1,0]->[0,1,1,1,1]-[0,0,0,1,1]->[0,1,1,0,0]+[0,0,0,1,0]->[0,1,1,1,0]
            coeff1_topk_cumsum = tf.cumsum(coeff1_topk, axis=-1)
            coeff2_topk_cumsum = tf.cumsum(coeff2_topk, axis=-1)
            self.y_d = coeff1_topk_cumsum - coeff2_topk_cumsum + coeff2_topk  # [32, top_size, 400]

            def clip_for_sigmoid(output):
                """Clip probabilities away from 0/1 and convert them to logits."""
                _epsilon = tf.convert_to_tensor(1e-7, dtype=output.dtype.base_dtype)
                output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
                output = tf.log(output / (1 - output))
                return output

            if self.topk_loss == 'f1':
                # f1 loss
                y_start_ind = tf.cumsum(self.y_start, axis=-1)
                y_end_ind = tf.cumsum(self.y_end, axis=-1)
                y_gtd = y_start_ind - y_end_ind + self.y_end  # [32, 400]

                def cal_num_same(y_pred, y_truth):  # [top_size, 400] [400,]
                    def cal_num_same_(y_pred_, y_truth):  # [400,] [400,]
                        return tf.reduce_sum(
                            tf.cast(tf.logical_and(tf.cast(y_pred_, tf.bool),
                                                   tf.cast(y_truth, tf.bool)),
                                    tf.float32),
                            axis=-1)
                    # Two identical outputs so the result structure matches the
                    # two-element input list handed to the outer tf.map_fn.
                    return [tf.map_fn(lambda x: cal_num_same_(x, y_truth), y_pred),
                            tf.map_fn(lambda x: cal_num_same_(x, y_truth), y_pred)]

                num_same = tf.map_fn(lambda x: cal_num_same(x[0], x[1]),
                                     [self.y_d, y_gtd])[0]  # [32, top_size]
                y_precision = num_same / (
                    tf.cast(tf.reduce_sum(self.y_d, axis=-1), tf.float32) + 1e-8)  # [32, top_size]
                y_recall = num_same / tf.expand_dims(
                    tf.cast(tf.reduce_sum(y_gtd, axis=-1), tf.float32) + 1e-8,
                    axis=-1)  # [32, top_size]
                y_f1 = (2.0 * y_precision * y_recall) / (
                    tf.cast(y_precision + y_recall, tf.float32) + 1e-8)  # [32, top_size]
                topk_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=clip_for_sigmoid(y_topk_logits), labels=y_f1))
            elif self.topk_loss == 'em':
                # em loss
                start_em = tf.equal(
                    tf.cast(tf.expand_dims(tf.argmax(self.y_start, axis=-1), axis=1), tf.int32),
                    tf.cast(self.yp1, tf.int32))  # [32, top_size]
                end_em = tf.equal(
                    tf.cast(tf.expand_dims(tf.argmax(self.y_end, axis=-1), axis=1), tf.int32),
                    tf.cast(self.yp2, tf.int32))  # [32, top_size]
                y_em = tf.cast(tf.logical_and(start_em, end_em), tf.float32)  # [32, top_size]
                topk_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=clip_for_sigmoid(y_topk_logits), labels=y_em))
            else:
                # Previously this fell through and failed later with a
                # NameError on `topk_loss`; fail early with a clear message.
                raise ValueError(
                    "Unsupported topk_loss %r; expected 'f1' or 'em'." % (self.topk_loss,))

            # final loss
            self.Lambda1 = tf.get_variable("Lambda1", initializer=tf.constant([0.9]),
                                           trainable=False)
            self.loss = tf.reduce_mean(
                self.Lambda1 * (start_loss + end_loss) + (1 - self.Lambda1) * topk_loss)

            # output
            outer_topk = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                                   tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
            outer_topk = tf.matrix_band_part(outer_topk, 0, self.ans_limit)
            self.output1 = tf.argmax(tf.reduce_max(outer_topk, axis=2), axis=1)
            self.output2 = tf.argmax(tf.reduce_max(outer_topk, axis=1), axis=1)

            # diversity loss
            if self.diversity_loss:
                self.Lambda2 = tf.get_variable("Lambda2", initializer=tf.constant([0.1]),
                                               trainable=False)
                diversity_loss = tf.reduce_mean(
                    tf.reduce_prod(self.y_d, axis=1), axis=-1)  # [32,top_size,400]->[32,400]->[32,]
                self.loss = self.loss + tf.reduce_mean(self.Lambda2 * diversity_loss)

    if self.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
        self.loss += l2_loss

    if self.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Evaluating self.loss now also triggers the EMA update.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            # NOTE(review): these assign ops are created inside the
            # control-dependency context, so they also depend on ema_op --
            # confirm this is intended.
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def forward(self):
    """Build the QANet forward graph, span prediction and training loss.

    Reads inputs and hyper-parameters from ``self`` / ``self.config`` and sets:

    * ``self.c2q`` / ``self.q2c`` -- context-query attention outputs.
    * ``self.enc`` -- list with the input projection and three encoder passes.
    * ``self.logits`` -- ``[start_logits, end_logits]`` after masking.
    * ``self.yp1`` / ``self.yp2`` -- predicted start / end positions.
    * ``self.loss`` -- cross-entropy loss (plus optional L2 and EMA hook).
    * ``self.shadow_vars`` / ``self.global_vars`` / ``self.assign_vars`` --
      EMA bookkeeping, only when ``config.decay`` is not None.

    Shape comments below are the original author's notes for the default
    configuration; they are not enforced by the code.
    """
    # NOTE(review): the block nesting below was reconstructed from the code's
    # syntax; confirm that the variable scopes match existing checkpoints.
    config = self.config
    N = config.batch_size if not self.demo else 1
    PL = self.c_maxlen
    QL = self.q_maxlen
    CL = config.char_limit  # 16
    d = config.hidden  # 96
    dc = config.char_dim  # 64
    nh = config.num_heads  # 1

    with tf.variable_scope("Input_Embedding_Layer"):
        # self.ch : (N, c_maxlen, 16)
        # self.qh : (N, q_maxlen, 16)
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])  # (N*c_maxlen, 16, 64)
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])  # (N*q_maxlen, 16, 64)
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF style conv-highway encoder: conv over chars in each word in a
        # batch of passages
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5,
                      name="char_conv", reuse=None)  # (N*c_maxlen, 16-5+1, 96)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5,
                      name="char_conv", reuse=True)  # (N*q_maxlen, 16-5+1, 96)

        ch_emb = tf.reduce_max(ch_emb, axis=1)  # (N*c_maxlen, 96)
        qh_emb = tf.reduce_max(qh_emb, axis=1)  # (N*q_maxlen, 96)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])  # (N, c_maxlen, 96)
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])  # (N, q_maxlen, 96)

        # self.c : (N, c_maxlen)
        # self.q : (N, q_maxlen)
        c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c),
                              1.0 - self.dropout)  # (N, c_maxlen, 300)
        q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q),
                              1.0 - self.dropout)  # (N, q_maxlen, 300)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)  # (N, c_maxlen, 396)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)  # (N, q_maxlen, 396)

        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout,
                        reuse=None)  # (N, c_maxlen, 96)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout,
                        reuse=True)  # (N, q_maxlen, 96)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        # Original author's outline of one encoder block:
        #   -> positional encoding
        #   -> layer_normalization
        #   -> depth-wise separable convolution
        #   -> self attention
        #   -> feed forward network
        # In the paper: The total number of encoder blocks is 1
        # (N, c_maxlen, 96)
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # (N, q_maxlen, 96)
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # tf.tile replicates its input multiples[i] times along dimension i.
        #
        # Original author's shape notes:
        #   c: (N, c_maxlen, d)            q: (N, q_maxlen, d)
        #   C: (N, c_maxlen, q_maxlen, d)  Q: (N, c_maxlen, q_maxlen, d)
        #   S: (N, c_maxlen, q_maxlen)
        #   mask_q: (N, 1, q_maxlen)       mask_c: (N, c_maxlen, 1)
        #   S_: (N, c_maxlen, q_maxlen)    S_T: (N, q_maxlen, c_maxlen)
        #   self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
        #   self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)
        C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q]
        if config.q2c:
            attention_outputs.append(c * self.q2c)

    with tf.variable_scope("Model_Encoder_Layer"):
        # Paper: The layer parameters are the same as the Embedding Encoder
        # Layer except that convolution layer number is 2 within a block and
        # the total number of blocks is 7
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]  # d=hidden=96
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            # The same "Model_Encoder" scope is reused for passes 2 and 3.
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        # tf.matrix_band_part: copy a tensor setting everything outside a
        # central band in each innermost matrix to zero.
        #
        # Original author's shape notes:
        #   self.enc[i]: (N, c_maxlen, d)
        #   start_logits, end_logits, logits1, logits2: (N, c_maxlen)
        #   outer: (N, c_maxlen, c_maxlen)
        #   yp1, yp2, losses, losses2: (N,)
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                 1,
                 bias=False,
                 name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                 1,
                 bias=False,
                 name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]

        logits1, logits2 = self.logits

        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

        # find max-score span
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        # Keeps only entries with 0 <= end - start <= 15.
        # NOTE(review): 15 is hard-coded here -- confirm whether a config
        # value (e.g. an answer-length limit) should be used instead.
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        # DEBUG
        self.debug_ops.extend([
            self.enc[1], start_logits, end_logits, logits1, logits2, outer,
            self.yp1, self.yp2, losses, losses2, self.loss
        ])

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Evaluating self.loss now also triggers the EMA update.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            # NOTE(review): the ops below are created inside the
            # control-dependency context, so they also depend on ema_op --
            # confirm this is intended.
            self.shadow_vars = []
            self.global_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() returns None for variables without a shadow copy;
                # compare against None instead of truth-testing a variable,
                # which can raise in graph mode.
                if v is not None:
                    self.shadow_vars.append(v)
                    self.global_vars.append(var)
            self.assign_vars = [
                tf.assign(g, v)
                for g, v in zip(self.global_vars, self.shadow_vars)
            ]
def forward(self):
    """Build the QANet forward graph, span prediction and training loss.

    ``config.type`` selects the embedding path: ``"all"`` combines word
    embeddings with a character-CNN, ``'char'`` looks ``self.c`` / ``self.q``
    up directly in ``self.char_mat``.

    Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
    ``self.yp1``, ``self.yp2``, ``self.loss`` and, when ``config.decay`` is
    not None, ``self.var_ema`` and ``self.assign_vars``.

    Raises:
        ValueError: if ``config.type`` is neither ``"all"`` nor ``'char'``.
    """
    # NOTE(review): the block nesting below was reconstructed from the code's
    # syntax; confirm that the variable scopes match existing checkpoints.
    config = self.config
    N, PL, QL, CL, d, dc, nh = (config.batch_size if not self.demo else 1,
                                self.c_maxlen, self.q_maxlen,
                                config.char_limit, config.hidden,
                                config.tw_char_dim, config.num_heads)

    with tf.variable_scope("Input_Embedding_Layer"):
        if config.type == "all":
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])
            qh_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.qh),
                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                          kernel_size=5, name="char_conv", reuse=None)
            qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                          kernel_size=5, name="char_conv", reuse=True)

            # Max over axis 1 of the conv output, then restore the batch and
            # sequence axes.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=True)
        elif config.type == 'char':
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.char_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.char_mat, self.q),
                1.0 - self.dropout)

            c_emb = highway(c_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=True)
        else:
            # Previously an unknown type left c_emb/q_emb unbound and failed
            # below with a NameError; fail early with a clear message.
            raise ValueError(
                "Unsupported config.type %r; expected 'all' or 'char'."
                % (config.type,))

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
        # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
        # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
        S = optimized_trilinear_for_attention([c, q],
                                              self.c_maxlen,
                                              self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            # The same "Model_Encoder" scope is reused for passes 2 and 3.
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                 1,
                 bias=False,
                 name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                 1,
                 bias=False,
                 name="end_pointer"), -1)
        # guess : mask the padding part pad in the end of the passage
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]

        logits1, logits2 = self.logits

        # Joint start/end probabilities; the band mask keeps only spans with
        # 0 <= end - start <= config.ans_limit.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                            labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Evaluating self.loss now also triggers the EMA update.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            # NOTE(review): these assign ops are created inside the
            # control-dependency context, so they also depend on ema_op --
            # confirm this is intended.
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() returns None for variables without a shadow copy;
                # compare against None instead of truth-testing a variable,
                # which can raise in graph mode.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))
def forward(self):
    """Build the QANet forward graph, span prediction and training loss.

    Local shorthand used throughout:
        N:  batch size (1 in demo mode)
        PL: maximum passage length
        QL: maximum question length
        CL: maximum number of characters per word
        d:  number of output channels (``config.hidden``)
        dc: character embedding dimension
        nh: number of self-attention heads

    Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
    ``self.yp1``, ``self.yp2``, ``self.loss`` and, when ``config.decay`` is
    not None, ``self.var_ema`` and ``self.assign_vars``.

    Shape comments below are the original author's notes; they are not
    enforced by the code.
    """
    # NOTE(review): the block nesting below was reconstructed from the code's
    # syntax; confirm that the variable scopes match existing checkpoints.
    config = self.config
    N, PL, QL, CL, d, dc, nh = (config.batch_size if not self.demo else 1,
                                self.c_maxlen, self.q_maxlen,
                                config.char_limit, config.hidden,
                                config.char_dim, config.num_heads)

    # Embedding layer: concatenate word vectors and character vectors.
    with tf.variable_scope("Input_Embedding_Layer"):
        # Character embedding:
        # 1. Look up an embedding for every character of every word.
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # 2. Encode each word's character matrix into a vector with a conv.
        # Convolution:
        # ch_emb_shape = [N * PL, CL-5+1, d], qh_emb_shape = [N * QL, CL-5+1, d]
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)

        # Max-over-time pooling
        # ch_emb_shape = [N * PL, d], qh_emb_shape = [N * QL, d]
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        # ch_emb_shape = [N, PL, d], qh_emb_shape = [N, QL, d]
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

        # Word embedding (original note: taken from GloVe).
        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)

        # Concatenate word vectors and character vectors.
        # c_emb_size = [batch, n_c, c_emb + ch_emb]
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        # q_emb_size = [batch, n_q, c_emb + ch_emb]
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        # Pass each through the (shared) highway network.
        # c_emb_size = [batch, n_c, d]
        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        # q_emb_size = [batch, n_q, d]
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    # Stacked Embedding Encoder Block: 1 encoder block with 4 conv layers,
    # kernel size 7 and d filters.
    with tf.variable_scope("Embedding_Encoder_Layer"):
        # c_size = [batch, n_c, d]
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # q_size = [batch, n_q, d]
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # share the encoder block weights between passage and question
            bias=False,
            dropout=self.dropout)

    # Context-Query attention.
    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
        # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
        # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
        # S_size = [batch, n_c, n_q], q_size = [batch, n_q, d], c_size = [batch, n_c, d]
        S = optimized_trilinear_for_attention([c, q],
                                              self.c_maxlen,
                                              self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        # Softmax along the n_q axis.
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q), dim=-1)
        mask_c = tf.expand_dims(self.c_mask, 2)
        # Softmax along the n_c axis.
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        # c2q_size = [batch, n_c, d]
        self.c2q = tf.matmul(S_, q)
        # q2c_size = [batch, n_c, d]
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        # attention_size = [4, batch, n_c, d]
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    # Stacked Model Encoder Blocks: 7 encoder blocks, 2 conv layers each,
    # d filters.
    with tf.variable_scope("Model_Encoder_Layer"):
        # Concatenate c, self.c2q, c * self.c2q, c * self.q2c along channels.
        # input_shape = [batch, n_c, 4d]
        inputs = tf.concat(attention_outputs, axis=-1)
        # self.enc[i]_shape = [batch, n_c, d]
        self.enc = [conv(inputs, d, name="input_projection")]
        # Three passes through the stacked model encoder.
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(
                    self.enc[i],
                    num_blocks=7,
                    num_conv_layers=2,
                    kernel_size=5,
                    mask=self.c_mask,
                    num_filters=d,
                    num_heads=nh,
                    seq_len=self.c_len,
                    scope="Model_Encoder",
                    bias=False,
                    reuse=True if i > 0 else None,  # all passes share one set of weights
                    dropout=self.dropout))

    # Output layer.
    with tf.variable_scope("Output_Layer"):
        # Concatenate the first and second encoder outputs along channels,
        # project to one channel and squeeze that channel away.
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                 1,
                 bias=False,
                 name="start_pointer"), -1)
        # Same with the first and third encoder outputs.
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                 1,
                 bias=False,
                 name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]

        logits1, logits2 = self.logits

        # outer_shape = [batch, n_c, n_c]
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        # Keep entries with 0 <= column - row <= ans_limit (end not before
        # start, span no longer than ans_limit); everything else is zeroed.
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        # Row index of the maximum: the start position.
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        # Column index of the maximum: the end position.
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                         labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                          labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    # L2 regularization
    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Ops passed to control_dependencies run before ops created inside
        # the `with` block, so evaluating self.loss triggers the EMA update.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            # NOTE(review): these assign ops are created inside the
            # control-dependency context, so they also depend on ema_op --
            # confirm this is intended.
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() returns None for variables without a shadow copy;
                # compare against None instead of truth-testing a variable,
                # which can raise in graph mode.
                if v is not None:
                    self.assign_vars.append(tf.assign(var, v))