def build_model2(self):
    # embedding
    # [batch,len,embed]
    # pretrained_word_embeddings = np.load(f'{curr_dir}/pretrain_emb_300.npy')
    pretrained_word_embeddings = None
    s1_embed, _ = embedding(tf.expand_dims(self.s1, -1), conf.vocab_size, conf.embed_size, name='embedding', pretrain_embedding=pretrained_word_embeddings)
    s1_mask = mask_nonpad_from_embedding(s1_embed)  # [batch,len] 1 for nonpad; 0 for pad
    s1_seqlen = tf.cast(tf.reduce_sum(s1_mask, axis=-1), tf.int32)  # [batch]

    # encoder
    encoder_input = s1_embed
    encoder_input = tf.layers.dropout(encoder_input, rate=self.dropout_rate)  # dropout
    idcnn_net = IDCNN()
    encoder_output = idcnn_net(encoder_input)
    encoder_output = tf.layers.dropout(encoder_output, rate=self.dropout_rate)  # dropout

    # logits
    cls_logits = tf.layers.dense(encoder_output, conf.label_size, activation=None, use_bias=True, name='cls_logits')  # [batch,length,label]

    # crf
    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(cls_logits, self.ner_label, tf.to_int32(s1_seqlen))
    crf_loss = tf.reduce_mean(-log_likelihood)

    # loss
    self.loss = crf_loss

    # crf decode
    ner_pred, ner_prob = tf.contrib.crf.crf_decode(cls_logits, transition_params, tf.to_int32(s1_seqlen))
    self.ner_prob = tf.identity(ner_prob, name='ner_prob')  # [batch]
    self.ner_pred = tf.identity(ner_pred, name='ner_pred')  # [batch,len]

    with tf.name_scope('accuracy'):
        ner_acc = tf.cast(tf.equal(self.ner_pred, self.ner_label), tf.float32) * s1_mask  # [batch,len]
        ner_acc = tf.reduce_sum(ner_acc, axis=-1)  # [batch]
        ner_acc = ner_acc / tf.cast(s1_seqlen, tf.float32)  # [batch]
        self.accuracy = tf.reduce_mean(ner_acc, axis=0)  # scalar

    self.global_step = tf.train.get_or_create_global_step()
    self.optimizer = tf.train.AdamOptimizer(learning_rate=conf.lr)
    self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
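# The helper mask_nonpad_from_embedding is used throughout these models but is not defined in
# this section. A minimal sketch of one plausible implementation, assuming the <pad> id is 0 and
# its embedding row is kept at zero (as the "tf.not_equal(self.s2, 0)" loss weights suggest):
# a position counts as nonpad iff its embedding vector has a nonzero absolute sum. This is an
# assumption about the local helper, not its actual code.
import tensorflow as tf  # TF 1.x assumed

def mask_nonpad_from_embedding_sketch(embed):
    """embed: [batch, length, embed_size] -> mask: [batch, length], 1. for nonpad, 0. for pad."""
    return tf.to_float(tf.not_equal(tf.reduce_sum(tf.abs(embed), axis=-1), 0.))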
def build_model2(self):
    # pretrained_word_embeddings = np.load(f'{curr_dir}/pretrain_emb_300.npy')
    pretrained_word_embeddings = None
    s1_embed, _ = embedding(tf.expand_dims(self.s1, -1), conf.vocab_size, conf.embed_size, pretrain_embedding=pretrained_word_embeddings)

    """ encoder """
    # encoder_input  # [batch,length,embed]
    encoder_input = s1_embed
    encoder_valid_mask = mask_nonpad_from_embedding(encoder_input)  # [batch,length] 1 for nonpad; 0 for pad
    encoder_input = add_timing_signal_1d(encoder_input)  # add position embedding
    encoder_input = tf.layers.dropout(encoder_input, rate=self.dropout_rate)  # dropout

    encoder_output = transformer_encoder(encoder_input, encoder_valid_mask,
                                          hidden_size=conf.hidden_size,
                                          filter_size=conf.hidden_size * 4,
                                          num_heads=conf.num_heads,
                                          num_encoder_layers=conf.num_encoder_layers,
                                          dropout=self.dropout_rate,
                                          attention_dropout=self.dropout_rate,
                                          relu_dropout=self.dropout_rate,
                                          )

    # attention pooling, e.g. HAN
    attn_output = multi_head_attention_base_pool(encoder_output, encoder_valid_mask, conf.embed_size, conf.embed_size, 6,
                                                 name='sent_pool', reuse=tf.AUTO_REUSE, dropout=self.dropout_rate, attn_v=None)

    self.logits = tf.layers.dense(attn_output, conf.label_size, use_bias=True, name='logits')  # [batch,cls]

    self.one_hot_target = tf.one_hot(self.target, conf.label_size)
    smooth_one_hot_target = label_smoothing(self.one_hot_target)
    self.loss = softmax_cross_entropy(self.logits, smooth_one_hot_target)

    self.y_prob = tf.nn.softmax(self.logits)
    self.y_prob = tf.identity(self.y_prob, name='y_prob')

    with tf.name_scope("accuracy"):
        self.correct = tf.equal(tf.argmax(self.logits, -1, output_type=tf.int32), self.target)
        self.accuracy = tf.reduce_mean(tf.cast(self.correct, tf.float32))

    self.global_step = tf.train.get_or_create_global_step()
    self.optimizer = tf.train.AdamOptimizer(learning_rate=conf.lr)
    self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
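# label_smoothing and softmax_cross_entropy are project helpers not shown here. A minimal sketch
# of the standard formulation they presumably follow (an assumption, not the repository's code):
# mix each one-hot row with the uniform distribution, then take the mean softmax cross-entropy
# against the smoothed targets.
import tensorflow as tf  # TF 1.x assumed

def label_smoothing_sketch(one_hot, epsilon=0.1):
    """one_hot: [batch, num_classes] -> smoothed targets of the same shape."""
    num_classes = tf.cast(tf.shape(one_hot)[-1], tf.float32)
    return (1.0 - epsilon) * one_hot + epsilon / num_classes

def softmax_cross_entropy_sketch(logits, smoothed_targets):
    """Mean cross-entropy over the batch; both inputs are [batch, num_classes]."""
    xent = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=smoothed_targets)
    return tf.reduce_mean(xent)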
def build_model1(self):
    self.uttn_enc_hidden_size = 256
    self.ctx_enc_hidden_size = 256
    batch_size, num_turns, length = shape_list(self.multi_s1)

    # embedding
    # [batch,turn,len,embed]
    multi_s1_embed, _ = embedding(tf.expand_dims(self.multi_s1, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=conf.pretrain_emb)
    # [batch,len,embed]
    s2_embed, _ = embedding(tf.expand_dims(self.s2, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=conf.pretrain_emb)

    # uttn encoder
    uttn_input = tf.reshape(multi_s1_embed, [-1, length, conf.embed_size])  # [batch*turn,len,embed]
    uttn_mask = mask_nonpad_from_embedding(uttn_input)  # [batch*turn,len] 1 for nonpad; 0 for pad
    uttn_seqlen = tf.cast(tf.reduce_sum(uttn_mask, axis=-1), tf.int32)  # [batch*turn]

    # uttn-gru
    self.encoder_uttn_rnn = RNN(cell_name='GRUCell', name='uttn_enc', hidden_size=self.uttn_enc_hidden_size, dropout_rate=self.dropout_rate)
    _, uttn_embed = self.encoder_uttn_rnn(uttn_input, uttn_seqlen)  # [batch*turn,hid]
    uttn_embed = tf.reshape(uttn_embed, [batch_size, num_turns, self.uttn_enc_hidden_size])  # [batch,turn,hid]
    # from here on the turn axis plays the role of a length axis

    # ctx encoder
    ctx_mask = mask_nonpad_from_embedding(uttn_embed)  # [batch,turn] 1 for nonpad; 0 for pad
    ctx_seqlen = tf.cast(tf.reduce_sum(ctx_mask, axis=-1), tf.int32)  # [batch]

    # ctx-gru
    self.encoder_ctx_rnn = RNN(cell_name='GRUCell', name='ctx_enc', hidden_size=self.ctx_enc_hidden_size, dropout_rate=self.dropout_rate)
    _, ctx_embed = self.encoder_ctx_rnn(uttn_embed, ctx_seqlen)  # [batch,hid]

    # rnn decoder train (no attention)
    s2_mask = mask_nonpad_from_embedding(s2_embed)  # [batch,len] 1 for nonpad; 0 for pad
    s2_seqlen = tf.cast(tf.reduce_sum(s2_mask, axis=-1), tf.int32)  # [batch]

    decoder_input = shift_right(s2_embed)  # shift right; <pad> serves as the start symbol
    decoder_input = tf.layers.dropout(decoder_input, rate=self.dropout_rate)  # dropout

    # concatenate ctx onto the decoder input
    decoder_ctx = tf.tile(tf.expand_dims(ctx_embed, axis=1), [1, shape_list(decoder_input)[1], 1])  # [batch,len,hid]
    decoder_input = tf.concat([decoder_input, decoder_ctx], axis=2)

    self.decoder_rnn = RNN(cell_name='GRUCell', name='dec', hidden_size=conf.embed_size, dropout_rate=self.dropout_rate)
    decoder_output, decoder_state = self.decoder_rnn(decoder_input, s2_seqlen)

    logits = proj_logits(decoder_output, conf.embed_size, conf.vocab_size, name='share_embedding')

    onehot_s2 = tf.one_hot(self.s2, depth=conf.vocab_size)  # [batch,len,vocab]
    xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=onehot_s2)  # [batch,len]
    weights = tf.to_float(tf.not_equal(self.s2, 0))  # [batch,len] 1 for nonpad; 0 for pad
    loss_num = xentropy * weights  # [batch,len]
    loss_den = weights  # [batch,len]
    loss = tf.reduce_sum(loss_num) / tf.reduce_sum(loss_den)  # scalar
    self.loss = loss

    # rnn decoder infer (no attention)
    # everything placed in cache is expanded to batch * beam inside symbols_to_logits_fn later
    cache = {'state': self.decoder_rnn.cell.zero_state(batch_size, tf.float32),  # [batch,hid]
             'ctx': ctx_embed,  # [batch,hid]
             }

    def symbols_to_logits_fn(ids, i, cache):
        # ids [batch,length]
        pred_target = ids[:, -1:]  # [batch,1] take the last token
        target_embed, _ = embedding(tf.expand_dims(pred_target, axis=-1), conf.vocab_size, conf.embed_size, 'share_embedding')  # [batch,1,embed]
        decoder_input = tf.squeeze(target_embed, axis=1)  # [batch,embed]
        # concatenate ctx onto the input
        decoder_input = tf.concat([decoder_input, cache['ctx']], axis=-1)
        # run rnn
        decoder_output, cache['state'] = self.decoder_rnn.one_step(decoder_input, cache['state'])
        logits = proj_logits(decoder_output, conf.embed_size, conf.vocab_size, name='share_embedding')
        return logits, cache

    initial_ids = tf.zeros([batch_size], dtype=tf.int32)  # <pad> serves as <sos>

    def greedy_search_wrapper():
        """ Greedy Search """
        decoded_ids, scores = greedy_search(
            symbols_to_logits_fn,
            initial_ids,
            conf.max_decode_len,
            cache=cache,
            eos_id=conf.eos_id,
        )
        return decoded_ids, scores

    def beam_search_wrapper():
        """ Beam Search """
        decoded_ids, scores = beam_search(  # [batch,beam,len] [batch,beam]
            symbols_to_logits_fn,
            initial_ids,
            conf.beam_size,
            conf.max_decode_len,
            conf.vocab_size,
            alpha=0,
            states=cache,
            eos_id=conf.eos_id,
        )
        return decoded_ids, scores

    decoded_ids, scores = tf.cond(tf.equal(conf.beam_size, 1), greedy_search_wrapper, beam_search_wrapper)

    self.decoded_ids = tf.identity(decoded_ids, name='decoded_ids')  # [batch,beam/1,len]
    self.scores = tf.identity(scores, name='scores')  # [batch,beam/1]

    self.global_step = tf.train.get_or_create_global_step()
    self.optimizer = tf.train.AdamOptimizer(learning_rate=conf.lr)
    self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
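# shift_right is the usual teacher-forcing shift: drop the last target step and prepend a zero
# (<pad>) vector so that position t only sees tokens before t. A minimal sketch, assuming the
# shift is applied to the embedded target sequence as in the calls above (an assumption about
# the local helper):
import tensorflow as tf  # TF 1.x assumed

def shift_right_sketch(embedded_targets):
    """embedded_targets: [batch, length, embed] -> same shape, shifted right by one step."""
    return tf.pad(embedded_targets, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]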
def build_model3(self):
    assert not conf.hidden_size % 2  # must be divisible by 2
    self.uttn_enc_hidden_size = conf.hidden_size // 2
    batch_size, num_turns, length = shape_list(self.multi_s1)

    # embedding
    # [batch,turn,len,embed]
    multi_s1_embed, _ = embedding(tf.expand_dims(self.multi_s1, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=conf.pretrain_emb)
    # [batch,len,embed]
    s2_embed, _ = embedding(tf.expand_dims(self.s2, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=conf.pretrain_emb)

    # uttn encoder
    uttn_input = tf.reshape(multi_s1_embed, [-1, length, conf.embed_size])  # [batch*turn,len,embed]
    uttn_mask = mask_nonpad_from_embedding(uttn_input)  # [batch*turn,len] 1 for nonpad; 0 for pad
    uttn_seqlen = tf.cast(tf.reduce_sum(uttn_mask, axis=-1), tf.int32)  # [batch*turn]

    # uttn-gru
    self.encoder_uttn_rnn = Bi_RNN(cell_name='GRUCell', name='uttn_enc', hidden_size=self.uttn_enc_hidden_size, dropout_rate=self.dropout_rate)
    _, uttn_embed = self.encoder_uttn_rnn(uttn_input, uttn_seqlen)  # [batch*turn,len,2hid] [batch*turn,2hid]
    uttn_embed = tf.reshape(uttn_embed, [batch_size, num_turns, self.uttn_enc_hidden_size * 2])  # [batch,turn,2hid]
    # from here on the turn axis plays the role of a length axis

    # transformer ctx encoder
    encoder_valid_mask = mask_nonpad_from_embedding(uttn_embed)  # [batch,turn] 1 for nonpad; 0 for pad
    encoder_input = add_timing_signal_1d(uttn_embed)  # add position embedding
    encoder_input = tf.layers.dropout(encoder_input, rate=self.dropout_rate)  # dropout

    encoder_output = transformer_encoder(encoder_input, encoder_valid_mask,
                                          hidden_size=conf.hidden_size,
                                          filter_size=conf.hidden_size * 4,
                                          num_heads=conf.num_heads,
                                          num_encoder_layers=conf.num_encoder_layers,
                                          dropout=self.dropout_rate,
                                          attention_dropout=self.dropout_rate,
                                          relu_dropout=self.dropout_rate,
                                          )

    # transformer decoder
    decoder_input = s2_embed
    decoder_valid_mask = mask_nonpad_from_embedding(decoder_input)  # [batch,len] 1 for nonpad; 0 for pad
    decoder_input = shift_right(decoder_input)  # shift right; <pad> serves as the start symbol
    decoder_input = add_timing_signal_1d(decoder_input)
    decoder_input = tf.layers.dropout(decoder_input, rate=self.dropout_rate)  # dropout

    decoder_output = transformer_decoder(decoder_input, encoder_output, decoder_valid_mask, encoder_valid_mask,
                                          cache=None,
                                          hidden_size=conf.hidden_size,
                                          filter_size=conf.hidden_size * 4,
                                          num_heads=conf.num_heads,
                                          num_decoder_layers=conf.num_decoder_layers,
                                          dropout=self.dropout_rate,
                                          attention_dropout=self.dropout_rate,
                                          relu_dropout=self.dropout_rate,
                                          )

    logits = proj_logits(decoder_output, conf.embed_size, conf.vocab_size, name='share_embedding')

    onehot_s2 = tf.one_hot(self.s2, depth=conf.vocab_size)  # [batch,len,vocab]
    xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=onehot_s2)  # [batch,len]
    weights = tf.to_float(tf.not_equal(self.s2, 0))  # [batch,len] 1 for nonpad; 0 for pad
    loss_num = xentropy * weights  # [batch,len]
    loss_den = weights  # [batch,len]
    loss = tf.reduce_sum(loss_num) / tf.reduce_sum(loss_den)  # scalar
    self.loss = loss

    # transformer decoder infer
    # everything placed in cache is expanded to batch * beam inside symbols_to_logits_fn later
    # initialize the cache
    cache = {
        'layer_%d' % layer: {
            # cache the k,v already computed in earlier decode steps
            'k': split_heads(tf.zeros([batch_size, 0, conf.embed_size]), conf.num_heads),
            'v': split_heads(tf.zeros([batch_size, 0, conf.embed_size]), conf.num_heads)
        } for layer in range(conf.num_decoder_layers)
    }
    # every decoder layer attends over the encoder's top-layer hidden states, so each layer has its own cacheable k,v
    for layer in range(conf.num_decoder_layers):
        layer_name = 'layer_%d' % layer
        with tf.variable_scope('decoder/%s/encdec_attention/multihead_attention' % layer_name):
            k_encdec = tf.layers.dense(encoder_output, conf.embed_size, use_bias=False, name='k', reuse=tf.AUTO_REUSE)
            k_encdec = split_heads(k_encdec, conf.num_heads)
            v_encdec = tf.layers.dense(encoder_output, conf.embed_size, use_bias=False, name='v', reuse=tf.AUTO_REUSE)
            v_encdec = split_heads(v_encdec, conf.num_heads)
        cache[layer_name]['k_encdec'] = k_encdec
        cache[layer_name]['v_encdec'] = v_encdec
    cache['encoder_output'] = encoder_output
    cache['encoder_mask'] = encoder_valid_mask

    # position embedding
    position_embedding = get_timing_signal_1d(conf.max_decode_len, conf.embed_size)  # +eos [1,length+1,embed]

    def symbols_to_logits_fn(ids, i, cache):
        ids = ids[:, -1:]  # [batch,1] take the last token
        target_embed, _ = embedding(tf.expand_dims(ids, axis=-1), conf.vocab_size, conf.embed_size, 'share_embedding', reuse=True)  # [batch,1,hidden]
        decoder_input = target_embed + position_embedding[:, i:i + 1, :]  # [batch,1,hidden]

        encoder_output = cache['encoder_output']
        encoder_mask = cache['encoder_mask']

        with tf.variable_scope('', reuse=tf.AUTO_REUSE):
            decoder_output = transformer_decoder(decoder_input, encoder_output, None, encoder_mask,
                                                 cache=cache,  # note: inference must use the cache
                                                 hidden_size=conf.embed_size,
                                                 filter_size=conf.embed_size * 4,
                                                 num_heads=6,
                                                 num_decoder_layers=6,
                                                 )
        logits = proj_logits(decoder_output, conf.embed_size, conf.vocab_size, name='share_embedding')  # [batch,1,vocab]
        ret = tf.squeeze(logits, axis=1)  # [batch,vocab]
        return ret, cache

    initial_ids = tf.zeros([batch_size], dtype=tf.int32)  # <pad> serves as <sos>

    def greedy_search_wrapper():
        """ Greedy Search """
        decoded_ids, scores = greedy_search(
            symbols_to_logits_fn,
            initial_ids,
            conf.max_decode_len,
            cache=cache,
            eos_id=conf.eos_id,
        )
        return decoded_ids, scores

    def beam_search_wrapper():
        """ Beam Search """
        decoded_ids, scores = beam_search(  # [batch,beam,len] [batch,beam]
            symbols_to_logits_fn,
            initial_ids,
            conf.beam_size,
            conf.max_decode_len,
            conf.vocab_size,
            alpha=0,
            states=cache,
            eos_id=conf.eos_id,
        )
        return decoded_ids, scores

    decoded_ids, scores = tf.cond(tf.equal(conf.beam_size, 1), greedy_search_wrapper, beam_search_wrapper)

    self.decoded_ids = tf.identity(decoded_ids, name='decoded_ids')  # [batch,beam/1,len]
    self.scores = tf.identity(scores, name='scores')  # [batch,beam/1]

    self.global_step = tf.train.get_or_create_global_step()
    self.optimizer = tf.train.AdamOptimizer(learning_rate=conf.lr)
    self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
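# split_heads pre-shapes the cached k/v tensors for multi-head attention. A minimal sketch of
# the conventional layout change it presumably performs (an assumption about the local helper):
# [batch, length, hidden] -> [batch, num_heads, length, hidden // num_heads].
import tensorflow as tf  # TF 1.x assumed

def split_heads_sketch(x, num_heads):
    batch = tf.shape(x)[0]
    length = tf.shape(x)[1]
    hidden = x.get_shape().as_list()[-1]       # static hidden size required
    x = tf.reshape(x, [batch, length, num_heads, hidden // num_heads])
    return tf.transpose(x, [0, 2, 1, 3])       # [batch, heads, length, depth]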
def build_model2(self):
    self.uttn_enc_hidden_size = 256
    self.ctx_enc_hidden_size = 256
    batch_size, num_turns, length = shape_list(self.multi_s1)

    # embedding
    # [batch,turn,len,embed]
    multi_s1_embed, _ = embedding(tf.expand_dims(self.multi_s1, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=conf.pretrain_emb)
    # [batch,len,embed]
    s2_embed, _ = embedding(tf.expand_dims(self.s2, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=conf.pretrain_emb)

    # uttn encoder
    uttn_input = tf.reshape(multi_s1_embed, [-1, length, conf.embed_size])  # [batch*turn,len,embed]
    uttn_mask = mask_nonpad_from_embedding(uttn_input)  # [batch*turn,len] 1 for nonpad; 0 for pad
    uttn_seqlen = tf.cast(tf.reduce_sum(uttn_mask, axis=-1), tf.int32)  # [batch*turn]

    # uttn-gru
    self.encoder_uttn_rnn = Bi_RNN(cell_name='GRUCell', name='uttn_enc', hidden_size=self.uttn_enc_hidden_size, dropout_rate=self.dropout_rate)
    uttn_repre, uttn_embed = self.encoder_uttn_rnn(uttn_input, uttn_seqlen)  # [batch*turn,len,2hid] [batch*turn,2hid]
    uttn_embed = tf.reshape(uttn_embed, [batch_size, num_turns, self.uttn_enc_hidden_size * 2])  # [batch,turn,2hid]
    uttn_repre = tf.reshape(uttn_repre, [batch_size, num_turns, length, self.uttn_enc_hidden_size * 2])  # [batch,turn,len,2hid]

    # ctx encoder
    ctx_mask = mask_nonpad_from_embedding(uttn_embed)  # [batch,turn] 1 for nonpad; 0 for pad
    ctx_seqlen = tf.cast(tf.reduce_sum(ctx_mask, axis=-1), tf.int32)  # [batch]

    # reverse turn
    uttn_repre = tf.reverse_sequence(uttn_repre, seq_lengths=ctx_seqlen, seq_axis=1, batch_axis=0)

    # ctx-gru
    self.encoder_ctx_rnn = RNN(cell_name='GRUCell', name='ctx_enc', hidden_size=self.ctx_enc_hidden_size, dropout_rate=self.dropout_rate)
    init_ctx_encoder_state = self.encoder_ctx_rnn.cell.zero_state(batch_size, tf.float32)

    # rnn decoder train
    s2_mask = mask_nonpad_from_embedding(s2_embed)  # [batch,len] 1 for nonpad; 0 for pad
    s2_seqlen = tf.cast(tf.reduce_sum(s2_mask, axis=-1), tf.int32)  # [batch]

    decoder_input = shift_right(s2_embed)  # shift right; <pad> serves as the start symbol
    decoder_input = tf.layers.dropout(decoder_input, rate=self.dropout_rate)  # dropout

    self.decoder_rnn = RNN(cell_name='GRUCell', name='dec', hidden_size=conf.embed_size, dropout_rate=self.dropout_rate)
    init_decoder_state = self.decoder_rnn.cell.zero_state(batch_size, tf.float32)

    # two nested loops -_-
    # the outer loop runs over the decoder steps (time_step)
    def loop_condition(time_step, *_):
        return tf.less(time_step, tf.reduce_max(s2_seqlen))

    def loop_body(time_step, dec_rnn_state, dec_rnn_output):
        # compute the decoder ctx
        # word level attention
        # for each decode step, the inner loop runs over the turns of the ctx rnn (turn_step) and produces the ctx sequence
        def inner_loop_condition(turn_step, *_):
            return tf.less(turn_step, tf.reduce_max(ctx_seqlen))

        def inner_loop_body(turn_step, ctx_rnn_state, ctx_rnn_output):
            # recursively compute the per-turn sentence ctx from si and ctx_init_state
            q_antecedent = tf.concat([ctx_rnn_state, dec_rnn_state], axis=-1)  # [batch,h]
            q_antecedent = tf.tile(tf.expand_dims(q_antecedent, 1), [1, length, 1])  # [batch,len,h]
            # extract the turn_step-th turn of each batch element
            # sent_repre [batch,turn,len,h]
            q_antecedent = tf.concat([uttn_repre[:, turn_step, :, :], q_antecedent], -1)  # [batch,len,h]
            uttn_mask_in_turn = tf.reshape(uttn_mask, [batch_size, num_turns, length])[:, turn_step, :]  # [batch,len]

            # word-level-attn
            h = tf.layers.dense(q_antecedent, 128, activation=tf.nn.tanh, use_bias=True, name='word_level_attn/layer1')
            energy = tf.layers.dense(h, 1, use_bias=True, name='word_level_attn/layer2')  # [batch,len,1]
            energy = tf.squeeze(energy, -1) + (1. - uttn_mask_in_turn) * -1e9
            alpha = tf.nn.softmax(energy)  # [batch,len]
            r_in_turn = tf.reduce_sum(tf.expand_dims(alpha, -1) * uttn_repre[:, turn_step, :, :], 1)  # [batch,h]

            ctx_rnn_output_, ctx_rnn_state = self.encoder_ctx_rnn.one_step(r_in_turn, ctx_rnn_state)

            # append
            ctx_rnn_output = tf.concat([ctx_rnn_output, tf.expand_dims(ctx_rnn_output_, 1)], 1)  # [batch,turn,h]
            return turn_step + 1, ctx_rnn_state, ctx_rnn_output

        # start inner loop
        final_turn_step, final_state, ctx_rnn_output = tf.while_loop(
            inner_loop_condition,
            inner_loop_body,
            loop_vars=[tf.constant(0, dtype=tf.int32), init_ctx_encoder_state, tf.zeros([batch_size, 0, self.ctx_enc_hidden_size])],
            shape_invariants=[
                tf.TensorShape([]),
                nest.map_structure(get_state_shape_invariants, init_ctx_encoder_state),
                tf.TensorShape([None, None, self.ctx_enc_hidden_size]),
            ])
        # ctx_rnn_output  # [batch,turn,h]
        # dec_rnn_state  # [batch,h]

        # ctx-level-attn
        # q_antecedent = tf.tile(tf.expand_dims(dec_rnn_state, axis=1), [1, num_turns, 1])  # [batch,turn,h]
        # use only as many turns as the current batch actually has, not the fixed turn count
        q_antecedent = tf.tile(tf.expand_dims(dec_rnn_state, axis=1), [1, shape_list(ctx_rnn_output)[1], 1])  # [batch,turn,h]
        q_antecedent = tf.concat([q_antecedent, ctx_rnn_output], 2)  # [batch,turn,h]
        h = tf.layers.dense(q_antecedent, 128, activation=tf.nn.tanh, use_bias=True, name='ctx_level_attn/layer1')
        energy = tf.layers.dense(h, 1, use_bias=True, name='ctx_level_attn/layer2')  # [batch,turn,1]
        energy = tf.squeeze(energy, -1) + (1. - ctx_mask) * -1e9  # [batch,turn]
        alpha = tf.nn.softmax(energy)  # [batch,turn]
        ctx_input_in_dec = tf.reduce_sum(tf.expand_dims(alpha, -1) * ctx_rnn_output, 1)  # [batch,h]

        dec_rnn_input = tf.concat([ctx_input_in_dec, decoder_input[:, time_step, :]], -1)  # [batch,h]
        dec_rnn_output_, dec_rnn_state = self.decoder_rnn.one_step(dec_rnn_input, dec_rnn_state)
        dec_rnn_output = tf.concat([dec_rnn_output, tf.expand_dims(dec_rnn_output_, 1)], 1)
        return time_step + 1, dec_rnn_state, dec_rnn_output

    # start outer loop
    final_time_step, final_state, dec_rnn_output = tf.while_loop(
        loop_condition,
        loop_body,
        loop_vars=[tf.constant(0, dtype=tf.int32), init_decoder_state, tf.zeros([batch_size, 0, conf.embed_size])],
        shape_invariants=[
            tf.TensorShape([]),
            nest.map_structure(get_state_shape_invariants, init_decoder_state),
            tf.TensorShape([None, None, conf.embed_size]),
        ])
    decoder_output = dec_rnn_output

    logits = proj_logits(decoder_output, conf.embed_size, conf.vocab_size, name='share_embedding')

    onehot_s2 = tf.one_hot(self.s2, depth=conf.vocab_size)  # [batch,len,vocab]
    xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=onehot_s2)  # [batch,len]
    weights = tf.to_float(tf.not_equal(self.s2, 0))  # [batch,len] 1 for nonpad; 0 for pad
    loss_num = xentropy * weights  # [batch,len]
    loss_den = weights  # [batch,len]
    loss = tf.reduce_sum(loss_num) / tf.reduce_sum(loss_den)  # scalar
    self.loss = loss

    # rnn decoder infer
    # everything placed in cache is expanded to batch * beam inside symbols_to_logits_fn later
    cache = {'dec_rnn_state': self.decoder_rnn.cell.zero_state(batch_size, tf.float32),  # [batch,hid]
             'ctx_rnn_state': self.encoder_ctx_rnn.cell.zero_state(batch_size, tf.float32),  # [batch,hid]
             'uttn_repre': uttn_repre  # [batch,turn,len,2hid]
             }

    def symbols_to_logits_fn(ids, i, cache):
        # ids [batch,length]
        pred_target = ids[:, -1:]  # [batch,1] take the last token
        target_embed, _ = embedding(tf.expand_dims(pred_target, axis=-1), conf.vocab_size, conf.embed_size, 'share_embedding')  # [batch,1,embed]
        decoder_input = tf.squeeze(target_embed, axis=1)  # [batch,embed]
        dec_rnn_state = cache['dec_rnn_state']

        with tf.variable_scope('', reuse=tf.AUTO_REUSE):
            # for each decode step, the inner loop runs over the turns of the ctx rnn (turn_step) and produces the ctx sequence
            def inner_loop_condition(turn_step, *_):
                return tf.less(turn_step, tf.reduce_max(ctx_seqlen))

            def inner_loop_body(turn_step, ctx_rnn_state, ctx_rnn_output):
                # recursively compute the per-turn sentence ctx from si and ctx_init_state
                q_antecedent = tf.concat([ctx_rnn_state, dec_rnn_state], axis=-1)  # [batch,h]
                q_antecedent = tf.tile(tf.expand_dims(q_antecedent, 1), [1, length, 1])  # [batch,len,h]
                # extract the turn_step-th turn of each batch element
                # sent_repre [batch,turn,len,h]
                q_antecedent = tf.concat([cache['uttn_repre'][:, turn_step, :, :], q_antecedent], -1)  # [batch,len,h]
                uttn_mask_in_turn = tf.reshape(uttn_mask, [batch_size, num_turns, length])[:, turn_step, :]  # [batch,len]

                # word-level-attn
                h = tf.layers.dense(q_antecedent, 128, activation=tf.nn.tanh, use_bias=True, name='word_level_attn/layer1')
                energy = tf.layers.dense(h, 1, use_bias=True, name='word_level_attn/layer2')  # [batch,len,1]
                energy = tf.squeeze(energy, -1) + (1. - uttn_mask_in_turn) * -1e9
                alpha = tf.nn.softmax(energy)  # [batch,len]
                r_in_turn = tf.reduce_sum(tf.expand_dims(alpha, -1) * cache['uttn_repre'][:, turn_step, :, :], 1)  # [batch,h]

                ctx_rnn_output_, ctx_rnn_state = self.encoder_ctx_rnn.one_step(r_in_turn, ctx_rnn_state)

                # append
                ctx_rnn_output = tf.concat([ctx_rnn_output, tf.expand_dims(ctx_rnn_output_, 1)], 1)  # [batch,turn,h]
                return turn_step + 1, ctx_rnn_state, ctx_rnn_output

            # start inner loop
            final_turn_step, final_state, ctx_rnn_output = tf.while_loop(
                inner_loop_condition,
                inner_loop_body,
                loop_vars=[tf.constant(0, dtype=tf.int32), cache['ctx_rnn_state'],
                           tf.zeros([shape_list(cache['ctx_rnn_state'])[0], 0, self.ctx_enc_hidden_size])],
                shape_invariants=[
                    tf.TensorShape([]),
                    nest.map_structure(get_state_shape_invariants, init_ctx_encoder_state),
                    tf.TensorShape([None, None, self.ctx_enc_hidden_size]),
                ])
            # ctx_rnn_output  # [batch,turn,h]
            # dec_rnn_state  # [batch,h]

            # ctx-level-attn
            # q_antecedent = tf.tile(tf.expand_dims(dec_rnn_state, axis=1), [1, num_turns, 1])  # [batch,turn,h]
            # use only as many turns as the current batch actually has, not the fixed turn count
            q_antecedent = tf.tile(tf.expand_dims(dec_rnn_state, axis=1), [1, shape_list(ctx_rnn_output)[1], 1])  # [batch,turn,h]
            q_antecedent = tf.concat([q_antecedent, ctx_rnn_output], 2)  # [batch,turn,h]
            h = tf.layers.dense(q_antecedent, 128, activation=tf.nn.tanh, use_bias=True, name='ctx_level_attn/layer1')
            energy = tf.layers.dense(h, 1, use_bias=True, name='ctx_level_attn/layer2')  # [batch,turn,1]
            energy = tf.squeeze(energy, -1) + (1. - ctx_mask) * -1e9  # [batch,turn]
            alpha = tf.nn.softmax(energy)  # [batch,turn]
            ctx_input_in_dec = tf.reduce_sum(tf.expand_dims(alpha, -1) * ctx_rnn_output, 1)  # [batch,h]

            dec_rnn_input = tf.concat([ctx_input_in_dec, decoder_input], -1)  # [batch,h]
            dec_rnn_output_, dec_rnn_state = self.decoder_rnn.one_step(dec_rnn_input, dec_rnn_state)
            cache['dec_rnn_state'] = dec_rnn_state

        logits = proj_logits(dec_rnn_output_, conf.embed_size, conf.vocab_size, name='share_embedding')
        return logits, cache

    initial_ids = tf.zeros([batch_size], dtype=tf.int32)  # <pad> serves as <sos>

    def greedy_search_wrapper():
        """ Greedy Search """
        decoded_ids, scores = greedy_search(
            symbols_to_logits_fn,
            initial_ids,
            conf.max_decode_len,
            cache=cache,
            eos_id=conf.eos_id,
        )
        return decoded_ids, scores

    def beam_search_wrapper():
        """ Beam Search """
        decoded_ids, scores = beam_search(  # [batch,beam,len] [batch,beam]
            symbols_to_logits_fn,
            initial_ids,
            conf.beam_size,
            conf.max_decode_len,
            conf.vocab_size,
            alpha=0,
            states=cache,
            eos_id=conf.eos_id,
        )
        return decoded_ids, scores

    decoded_ids, scores = tf.cond(tf.equal(conf.beam_size, 1), greedy_search_wrapper, beam_search_wrapper)

    self.decoded_ids = tf.identity(decoded_ids, name='decoded_ids')  # [batch,beam/1,len]
    self.scores = tf.identity(scores, name='scores')  # [batch,beam/1]

    self.global_step = tf.train.get_or_create_global_step()
    self.optimizer = tf.train.AdamOptimizer(learning_rate=conf.lr)
    self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
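# get_state_shape_invariants supplies the shape_invariants entries for RNN states inside
# tf.while_loop above. A minimal sketch, assuming it simply relaxes every dimension except the
# last (the fixed hidden size), so the loop accepts states whose leading dimensions vary
# (an assumption about the local helper; nest.map_structure applies it per state tensor):
import tensorflow as tf  # TF 1.x assumed

def get_state_shape_invariants_sketch(tensor):
    shape = tensor.shape.as_list()
    return tf.TensorShape([None] * (len(shape) - 1) + [shape[-1]])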
def build_model1(self):
    # embedding
    # [batch,len,embed]
    s1_embed, _ = embedding(tf.expand_dims(self.s1, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=conf.pretrain_emb)
    s2_embed, _ = embedding(tf.expand_dims(self.s2, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=conf.pretrain_emb)

    s1_input_mask = mask_nonpad_from_embedding(s1_embed)  # [batch,len1] 1 for nonpad; 0 for pad
    s2_input_mask = mask_nonpad_from_embedding(s2_embed)  # [batch,len2] 1 for nonpad; 0 for pad
    s1_seq_len = tf.cast(tf.reduce_sum(s1_input_mask, axis=-1), tf.int32)  # [batch]
    s2_seq_len = tf.cast(tf.reduce_sum(s2_input_mask, axis=-1), tf.int32)  # [batch]

    # bilstm sent encoder
    self.bilstm_encoder1 = Bi_RNN(cell_name='LSTMCell', hidden_size=conf.birnn_hidden_size, dropout_rate=self.dropout_rate)
    s1_bar, _ = self.bilstm_encoder1(s1_embed, s1_seq_len)  # [batch,len1,2hid]
    s2_bar, _ = self.bilstm_encoder1(s2_embed, s2_seq_len)  # [batch,len2,2hid]

    # local inference
    with tf.variable_scope('local_inference'):
        # dot-product attention
        attention_logits = tf.matmul(s1_bar, tf.transpose(s2_bar, [0, 2, 1]))  # [batch,len1,len2]
        # note: the attention needs the pad mask: pad_mask * -inf + logits
        attention_s1 = tf.nn.softmax(attention_logits + tf.expand_dims((1. - s2_input_mask) * -1e9, 1))  # [batch,len1,len2]
        attention_s2 = tf.nn.softmax(tf.transpose(attention_logits, [0, 2, 1]) + tf.expand_dims((1. - s1_input_mask) * -1e9, 1))  # [batch,len2,len1]

        s1_hat = tf.matmul(attention_s1, s2_bar)  # [batch,len1,2hid]
        s2_hat = tf.matmul(attention_s2, s1_bar)  # [batch,len2,2hid]

        s1_diff = s1_bar - s1_hat
        s1_mul = s1_bar * s1_hat
        s2_diff = s2_bar - s2_hat
        s2_mul = s2_bar * s2_hat

        m_s1 = tf.concat([s1_bar, s1_hat, s1_diff, s1_mul], axis=2)  # [batch,len1,8hid]
        m_s2 = tf.concat([s2_bar, s2_hat, s2_diff, s2_mul], axis=2)  # [batch,len2,8hid]

    # inference composition
    with tf.variable_scope('composition'):
        self.bilstm_encoder2 = Bi_RNN(cell_name='LSTMCell', hidden_size=conf.birnn_hidden_size, dropout_rate=self.dropout_rate)
        v_s1, _ = self.bilstm_encoder2(m_s1, s1_seq_len)  # [batch,len1,2hid]
        v_s2, _ = self.bilstm_encoder2(m_s2, s2_seq_len)  # [batch,len2,2hid]

        # average pooling
        # pad vectors must first be zeroed out
        v_s1 = v_s1 * tf.expand_dims(s1_input_mask, -1)  # [batch,len1,2hid]
        v_s2 = v_s2 * tf.expand_dims(s2_input_mask, -1)  # [batch,len2,2hid]
        v_s1_avg = tf.reduce_sum(v_s1, axis=1) / tf.cast(tf.expand_dims(s1_seq_len, -1), tf.float32)  # [batch,2hid]
        v_s2_avg = tf.reduce_sum(v_s2, axis=1) / tf.cast(tf.expand_dims(s2_seq_len, -1), tf.float32)  # [batch,2hid]

        # max pooling
        # pad vectors must first be pushed to a very negative value
        v_s1_max = tf.reduce_max(v_s1 + tf.expand_dims((1. - s1_input_mask) * -1e9, -1), axis=1)  # [batch,2hid]
        v_s2_max = tf.reduce_max(v_s2 + tf.expand_dims((1. - s2_input_mask) * -1e9, -1), axis=1)  # [batch,2hid]

        v = tf.concat([v_s1_avg, v_s1_max, v_s2_avg, v_s2_max], axis=-1)  # [batch,8hid]

    with tf.variable_scope('ffn'):
        h_ = tf.layers.dropout(v, rate=self.dropout_rate)
        h = tf.layers.dense(h_, 256, activation=tf.nn.relu, kernel_initializer=tf.random_normal_initializer(0.0, 0.1))  # [batch,256]
        o_ = tf.layers.dropout(h, rate=self.dropout_rate)
        o = tf.layers.dense(o_, 1, kernel_initializer=tf.random_normal_initializer(0.0, 0.1))  # [batch,1]
        self.logits = tf.squeeze(o, -1)  # [batch]

    # loss
    with tf.name_scope('loss'):
        loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(self.target, tf.float32), logits=self.logits)  # [batch]
        loss = tf.reduce_mean(loss, -1)  # scalar

        l2_reg = conf.l2_reg
        reg_weights = [var for var in tf.trainable_variables() if ('w' in var.name) or ('kernel' in var.name)]
        l2_loss = tf.add_n([tf.nn.l2_loss(w) for w in reg_weights]) * l2_reg
        loss += l2_loss
        self.loss = loss

    self.y_prob = tf.nn.sigmoid(self.logits)
    self.y_prob = tf.identity(self.y_prob, name='y_prob')

    with tf.name_scope("accuracy"):
        self.correct = tf.equal(tf.cast(tf.greater_equal(self.y_prob, 0.5), tf.int32), self.target)
        self.accuracy = tf.reduce_mean(tf.cast(self.correct, tf.float32))

    self.global_step = tf.train.get_or_create_global_step()
    self.optimizer = tf.train.AdamOptimizer(learning_rate=conf.lr)
    self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
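# The masked average/max pooling in the composition block above is a reusable pattern. A small
# sketch of the same computation isolated as a helper (a restatement of the code above for
# clarity, not a new project API; names are hypothetical):
import tensorflow as tf  # TF 1.x assumed

def masked_avg_max_pool(seq, mask):
    """seq: [batch, length, dim]; mask: [batch, length] with 1. for nonpad, 0. for pad."""
    seq_len = tf.reduce_sum(mask, axis=-1, keepdims=True)                    # [batch,1]
    avg = tf.reduce_sum(seq * tf.expand_dims(mask, -1), axis=1) / seq_len    # zero out pad, then average
    neg_inf = tf.expand_dims(1. - mask, -1) * -1e9
    mx = tf.reduce_max(seq + neg_inf, axis=1)                                # push pad to -inf, then max
    return avg, mx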
def build_model3(self):
    # biGRU encoder + bah_attn + GRU decoder
    # embedding
    # [batch,len,embed]
    # pretrained_word_embeddings = np.load(f'{curr_dir}/pretrain_emb_300.npy')
    pretrained_word_embeddings = None
    s1_embed, _ = embedding(tf.expand_dims(self.s1, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=pretrained_word_embeddings)
    s1_mask = mask_nonpad_from_embedding(s1_embed)  # [batch,len] 1 for nonpad; 0 for pad
    s1_seqlen = tf.cast(tf.reduce_sum(s1_mask, axis=-1), tf.int32)  # [batch]

    # encoder
    encoder_input = s1_embed
    encoder_input = tf.layers.dropout(encoder_input, rate=self.dropout_rate)  # dropout

    with tf.variable_scope('birnn_encoder'):
        self.bilstm_encoder1 = Bi_RNN(cell_name='GRUCell', hidden_size=conf.hidden_size, dropout_rate=self.dropout_rate)
        encoder_output, _ = self.bilstm_encoder1(encoder_input, s1_seqlen)  # [batch,len,2hid]

    batch_size = shape_list(encoder_input)[0]

    # decoder
    decoder_rnn = getattr(tf.nn.rnn_cell, 'GRUCell')(conf.hidden_size)  # GRUCell/LSTMCell
    encdec_atten = EncDecAttention(encoder_output, s1_seqlen, conf.hidden_size)

    s2_embed, _ = embedding(tf.expand_dims(self.s2, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=pretrained_word_embeddings)
    s2_mask = mask_nonpad_from_embedding(s2_embed)  # [batch,len] 1 for nonpad; 0 for pad
    s2_seqlen = tf.cast(tf.reduce_sum(s2_mask, -1), tf.int32)  # [batch]

    decoder_input = s2_embed
    decoder_input = shift_right(decoder_input)  # shift right; <pad> serves as the start symbol
    decoder_input = tf.layers.dropout(decoder_input, rate=self.dropout_rate)  # dropout

    init_decoder_state = decoder_rnn.zero_state(batch_size, tf.float32)
    # init_decoder_state = tf.nn.rnn_cell.LSTMStateTuple(c, h)

    time_step = tf.constant(0, dtype=tf.int32)
    rnn_output = tf.zeros([batch_size, 0, conf.hidden_size])
    context_output = tf.zeros([batch_size, 0, conf.hidden_size * 2])  # attention contexts

    def loop_condition(time_step, *_):
        return tf.less(time_step, tf.reduce_max(s2_seqlen))

    def loop_body(time_step, prev_rnn_state, rnn_output, context_output):
        # attention
        s = prev_rnn_state if isinstance(decoder_rnn, tf.nn.rnn_cell.GRUCell) else prev_rnn_state.h
        context = encdec_atten(s)  # [batch,hidden]
        context_output = tf.concat([context_output, tf.expand_dims(context, axis=1)], axis=1)

        # construct rnn input
        rnn_input = tf.concat([decoder_input[:, time_step, :], context], axis=-1)  # [batch,hidden+] use attention
        # rnn_input = decoder_input[:, time_step, :]  # [batch,hidden] without attention

        # run rnn
        current_output, rnn_state = decoder_rnn(rnn_input, prev_rnn_state)

        # append to the output bucket along the length dim
        rnn_output = tf.concat([rnn_output, tf.expand_dims(current_output, axis=1)], axis=1)
        return time_step + 1, rnn_state, rnn_output, context_output

    # start loop
    final_time_step, final_state, rnn_output, context_output = tf.while_loop(
        loop_condition,
        loop_body,
        loop_vars=[time_step, init_decoder_state, rnn_output, context_output],
        shape_invariants=[
            tf.TensorShape([]),
            nest.map_structure(get_state_shape_invariants, init_decoder_state),
            tf.TensorShape([None, None, conf.hidden_size]),
            tf.TensorShape([None, None, conf.hidden_size * 2]),
        ])

    # body_output = tf.concat([rnn_output, context_output], axis=-1)
    # body_output = tf.layers.dense(body_output, self.hidden_size, activation=tf.nn.tanh, use_bias=True, name='body_output_layer')
    decoder_output = rnn_output

    logits = proj_logits(decoder_output, conf.embed_size, conf.vocab_size, name='share_embedding')
    # logits = proj_logits(encoder_output[:,:,:300], conf.embed_size, conf.vocab_size, name='share_embedding')

    onehot_s2 = tf.one_hot(self.s2, depth=conf.vocab_size)  # [batch,len,vocab]
    xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=onehot_s2)  # [batch,len]
    weights = tf.to_float(tf.not_equal(self.s2, 0))  # [batch,len] 1 for nonpad; 0 for pad
    loss_num = xentropy * weights  # [batch,len]
    loss_den = weights  # [batch,len]
    loss = tf.reduce_sum(loss_num) / tf.reduce_sum(loss_den)  # scalar
    self.loss = loss
    # self.sent_loss = tf.reduce_sum(loss_num, -1) / tf.reduce_sum(loss_den, -1)  # [batch]

    # decoder infer
    cache = {'state': decoder_rnn.zero_state(batch_size, tf.float32)}

    def symbols_to_logits_fn(ids, i, cache):
        # ids [batch,length]
        pred_target = ids[:, -1:]  # take the last token [batch,1]
        embed_target, _ = embedding(tf.expand_dims(pred_target, axis=-1), conf.vocab_size, conf.embed_size, 'share_embedding')  # [batch,1,embed]
        decoder_input = tf.squeeze(embed_target, axis=1)  # [batch,embed]

        # if using attention
        s = cache['state'] if isinstance(decoder_rnn, tf.nn.rnn_cell.GRUCell) else cache['state'].h
        context = encdec_atten(s, beam_size=conf.beam_size)  # [batch,hidden]
        decoder_input = tf.concat([decoder_input, context], axis=-1)  # [batch,hidden+]

        # run rnn
        # with tf.variable_scope('rnn', reuse=tf.AUTO_REUSE):
        decoder_output, cache['state'] = decoder_rnn(decoder_input, cache['state'])

        logits = proj_logits(decoder_output, conf.hidden_size, conf.vocab_size, name='share_embedding')
        return logits, cache

    initial_ids = tf.zeros([batch_size], dtype=tf.int32)  # <pad> serves as <sos>

    def greedy_search_wrapper():
        """ Greedy Search """
        decoded_ids, scores = greedy_search(
            symbols_to_logits_fn,
            initial_ids,
            max_decode_len=conf.max_decode_len,
            cache=cache,
            eos_id=conf.eos_id,
        )
        return decoded_ids, scores

    def beam_search_wrapper():
        """ Beam Search """
        decoded_ids, scores = beam_search(  # [batch,beam,len] [batch,beam]
            symbols_to_logits_fn,
            initial_ids,
            beam_size=conf.beam_size,
            max_decode_len=conf.max_decode_len,
            vocab_size=conf.vocab_size,
            states=cache,
            eos_id=conf.eos_id,
            gamma=conf.gamma,
            num_group=conf.num_group,
            top_k=conf.top_k,
        )
        return decoded_ids, scores

    decoded_ids, scores = tf.cond(tf.equal(conf.beam_size, 1), greedy_search_wrapper, beam_search_wrapper)

    self.decoded_ids = tf.identity(decoded_ids, name='decoded_ids')  # [batch,beam/1,len]
    self.scores = tf.identity(scores, name='scores')  # [batch,beam/1]

    self.global_step = tf.train.get_or_create_global_step()
    self.optimizer = tf.train.AdamOptimizer(learning_rate=conf.lr)
    self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
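# EncDecAttention is the project's Bahdanau-style attention over the encoder outputs, used above
# as encdec_atten(s). A minimal sketch of the additive scoring it presumably performs (an
# assumption about the local class; the real one also accepts beam_size for memory tiling at
# inference, which is omitted here):
import tensorflow as tf  # TF 1.x assumed

class EncDecAttentionSketch(object):
    def __init__(self, memory, memory_seqlen, hidden_size, name='encdec_attn'):
        self.memory = memory                                   # [batch, len, mem_dim]
        self.mask = tf.sequence_mask(memory_seqlen, tf.shape(memory)[1], dtype=tf.float32)  # [batch,len]
        self.hidden_size = hidden_size
        self.name = name

    def __call__(self, query):                                 # query: [batch, q_dim]
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            keys = tf.layers.dense(self.memory, self.hidden_size, name='keys')      # [batch,len,h]
            q = tf.layers.dense(query, self.hidden_size, name='query')              # [batch,h]
            energy = tf.layers.dense(tf.nn.tanh(keys + tf.expand_dims(q, 1)), 1, name='energy')
            energy = tf.squeeze(energy, -1) + (1. - self.mask) * -1e9               # mask pad positions
            alpha = tf.nn.softmax(energy)                                           # [batch,len]
            return tf.reduce_sum(tf.expand_dims(alpha, -1) * self.memory, axis=1)   # [batch,mem_dim]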
def build_model2(self):
    # biGRU encoder + bah_attn + GRU decoder
    # embedding
    # [batch,len,embed]
    # pretrained_word_embeddings = np.load(f'{curr_dir}/pretrain_emb_300.npy')
    pretrained_word_embeddings = None
    s1_embed, _ = embedding(tf.expand_dims(self.s1, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=pretrained_word_embeddings)
    s1_mask = mask_nonpad_from_embedding(s1_embed)  # [batch,len] 1 for nonpad; 0 for pad
    s1_seqlen = tf.cast(tf.reduce_sum(s1_mask, axis=-1), tf.int32)  # [batch]

    # encoder
    encoder_input = s1_embed
    encoder_input = tf.layers.dropout(encoder_input, rate=self.dropout_rate)  # dropout

    with tf.variable_scope('birnn_encoder'):
        self.bilstm_encoder1 = Bi_RNN(cell_name='GRUCell', hidden_size=conf.hidden_size, dropout_rate=self.dropout_rate)
        encoder_output, _ = self.bilstm_encoder1(encoder_input, s1_seqlen)  # [batch,len,2hid]

    # decoder
    s2_embed, _ = embedding(tf.expand_dims(self.s2, -1), conf.vocab_size, conf.embed_size, name='share_embedding', pretrain_embedding=pretrained_word_embeddings)
    s2_mask = mask_nonpad_from_embedding(s2_embed)  # [batch,len] 1 for nonpad; 0 for pad
    s2_seqlen = tf.cast(tf.reduce_sum(s2_mask, -1), tf.int32)  # [batch]

    decoder_input = s2_embed
    decoder_input = shift_right(decoder_input)  # shift right; <pad> serves as the start symbol
    decoder_input = tf.layers.dropout(decoder_input, rate=self.dropout_rate)  # dropout

    decoder_rnn = tf.nn.rnn_cell.DropoutWrapper(getattr(tf.nn.rnn_cell, 'GRUCell')(conf.hidden_size),  # GRUCell/LSTMCell
                                                input_keep_prob=1.0 - self.dropout_rate)

    attention_mechanism = getattr(tf.contrib.seq2seq, 'BahdanauAttention')(
        conf.hidden_size,
        encoder_output,
        memory_sequence_length=s1_seqlen,
        name='BahdanauAttention',
    )
    cell = tf.contrib.seq2seq.AttentionWrapper(decoder_rnn,
                                               attention_mechanism,
                                               output_attention=False,
                                               name='attention_wrapper',
                                               )
    with tf.variable_scope('decoder'):
        decoder_output, _ = tf.nn.dynamic_rnn(
            cell,
            decoder_input,
            s2_seqlen,
            initial_state=None,  # defaults to a zero-vector initial state
            dtype=tf.float32,
            time_major=False
        )  # the default scope is rnn, e.g. decoder/rnn/kernel

    logits = proj_logits(decoder_output, conf.embed_size, conf.vocab_size, name='share_embedding')
    # logits = proj_logits(encoder_output[:,:,:300], conf.embed_size, conf.vocab_size, name='share_embedding')

    onehot_s2 = tf.one_hot(self.s2, depth=conf.vocab_size)  # [batch,len,vocab]
    xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=onehot_s2)  # [batch,len]
    weights = tf.to_float(tf.not_equal(self.s2, 0))  # [batch,len] 1 for nonpad; 0 for pad
    loss_num = xentropy * weights  # [batch,len]
    loss_den = weights  # [batch,len]
    loss = tf.reduce_sum(loss_num) / tf.reduce_sum(loss_den)  # scalar
    self.loss = loss
    # self.sent_loss = tf.reduce_sum(loss_num, -1) / tf.reduce_sum(loss_den, -1)  # [batch]

    """ decoder infer """
    batch_size = shape_list(encoder_output)[0]
    last_dim = shape_list(encoder_output)[-1]
    tile_encoder_output = tf.tile(tf.expand_dims(encoder_output, 1), [1, conf.beam_size, 1, 1])
    tile_encoder_output = tf.reshape(tile_encoder_output, [batch_size * conf.beam_size, -1, last_dim])
    tile_s1_seqlen = tf.tile(tf.expand_dims(s1_seqlen, 1), [1, conf.beam_size])
    tile_s1_seqlen_flat = tf.reshape(tile_s1_seqlen, [-1])

    # tf.contrib.seq2seq.BahdanauAttention has to tile the memory by beam_size at construction time,
    # so a second attention mechanism dedicated to inference is built here over the beam_size-tiled
    # memory, sharing parameters with the training one.
    # Verification still fails, though: reuse=True cannot be used, raising
    # "Variable memory_layer_1/kernel does not exist, or was not created with tf.get_variable()",
    # so this path is not recommended.
    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
        attention_mechanism_decoder = getattr(tf.contrib.seq2seq, 'BahdanauAttention')(
            conf.hidden_size,
            tile_encoder_output,
            memory_sequence_length=tile_s1_seqlen_flat,
            name='BahdanauAttention',
        )
        cell_decoder = tf.contrib.seq2seq.AttentionWrapper(decoder_rnn,
                                                           attention_mechanism_decoder,
                                                           output_attention=False,
                                                           name='attention_wrapper',
                                                           )
    # the batch size is checked internally against encoder_output, so it must be multiplied by beam_size
    initial_state = cell_decoder.zero_state(batch_size * conf.beam_size, tf.float32)

    # initialize the cache
    # decide what can live in cache: cached values are expanded and merged during beam_search, so they need tensor rank > 1
    cache = {
        'cell_state': initial_state.cell_state,
        'attention': initial_state.attention,
        'alignments': initial_state.alignments,
        'attention_state': initial_state.attention_state,
    }
    unable_cache = {
        'alignment_history': initial_state.alignment_history,
        # 'time': initial_state.time
    }
    # shrink the cache back to batch first; beam_search will expand/merge/gather so the state ends up batch*beam
    cache = nest.map_structure(lambda s: s[:batch_size], cache)

    def symbols_to_logits_fn(ids, i, cache):
        nonlocal unable_cache
        ids = ids[:, -1:]
        target = tf.expand_dims(ids, axis=-1)  # [batch,1,1]
        embedding_target, _ = embedding(target, conf.vocab_size, conf.hidden_size, 'share_embedding', reuse=True)
        cell_input = tf.squeeze(embedding_target, axis=1)  # [batch,hid]

        # merge cache and unable_cache back into a full state
        state = cell_decoder.zero_state(batch_size * conf.beam_size, tf.float32).clone(
            cell_state=cache['cell_state'],
            attention=cache['attention'],
            alignments=cache['alignments'],
            attention_state=cache['attention_state'],
            alignment_history=unable_cache['alignment_history'],
            # time=unable_cache['time'],
            time=tf.convert_to_tensor(i, dtype=tf.int32),
        )
        with tf.variable_scope('decoder/rnn', reuse=tf.AUTO_REUSE):
            output, state = cell_decoder(cell_input, state)

        # split the state back into cache and unable_cache
        cache['cell_state'] = state.cell_state
        cache['attention'] = state.attention
        cache['alignments'] = state.alignments
        cache['attention_state'] = state.attention_state
        unable_cache['alignment_history'] = state.alignment_history
        # unable_cache['time'] = state.time
        body_output = output  # [batch,hidden]

        logits = proj_logits(body_output, conf.embed_size, conf.vocab_size, name='share_embedding')
        return logits, cache

    initial_ids = tf.zeros([batch_size], dtype=tf.int32)  # <pad> serves as <sos>

    def greedy_search_wrapper():
        """ Greedy Search """
        decoded_ids, scores = greedy_search(
            symbols_to_logits_fn,
            initial_ids,
            max_decode_len=conf.max_decode_len,
            cache=cache,
            eos_id=conf.eos_id,
        )
        return decoded_ids, scores

    def beam_search_wrapper():
        """ Beam Search """
        decoded_ids, scores = beam_search(  # [batch,beam,len] [batch,beam]
            symbols_to_logits_fn,
            initial_ids,
            conf.beam_size,
            conf.max_decode_len,
            conf.vocab_size,
            states=cache,
            eos_id=conf.eos_id,
            gamma=conf.gamma,
            num_group=conf.num_group,
            top_k=conf.top_k,
        )
        return decoded_ids, scores

    decoded_ids, scores = tf.cond(tf.equal(conf.beam_size, 1), greedy_search_wrapper, beam_search_wrapper)

    self.decoded_ids = tf.identity(decoded_ids, name='decoded_ids')  # [batch,beam/1,len]
    self.scores = tf.identity(scores, name='scores')  # [batch,beam/1]

    self.global_step = tf.train.get_or_create_global_step()
    self.optimizer = tf.train.AdamOptimizer(learning_rate=conf.lr)
    self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
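# As the comments above note, everything stored in `cache` must be expandable from [batch, ...]
# to [batch * beam, ...] during beam search. A minimal sketch of that expand step for a generic
# tensor (an assumption about how the search utilities tile state, not their actual code):
import tensorflow as tf  # TF 1.x assumed

def expand_to_beam_size_sketch(tensor, beam_size):
    """[batch, ...] -> [batch * beam_size, ...] by repeating each batch entry beam_size times."""
    tensor = tf.expand_dims(tensor, axis=1)                        # [batch, 1, ...]
    tile_dims = [1, beam_size] + [1] * (tensor.shape.ndims - 2)
    tensor = tf.tile(tensor, tile_dims)                            # [batch, beam, ...]
    new_shape = tf.concat([[-1], tf.shape(tensor)[2:]], axis=0)
    return tf.reshape(tensor, new_shape)                           # [batch*beam, ...]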
def build_model1(self):
    # embedding
    # [batch,len,embed]
    s1_embed, _ = embedding(tf.expand_dims(self.s1, -1), conf.vocab_size, conf.embed_size, name='embedding', pretrain_embedding=conf.pretrain_emb)
    s1_mask = mask_nonpad_from_embedding(s1_embed)  # [batch,len] 1 for nonpad; 0 for pad
    s1_seqlen = tf.cast(tf.reduce_sum(s1_mask, axis=-1), tf.int32)  # [batch]

    # encoder
    encoder_input = s1_embed
    encoder_input = tf.layers.dropout(encoder_input, rate=self.dropout_rate)  # dropout

    with tf.variable_scope('rnn_1'):
        self.bilstm_encoder1 = Bi_RNN(cell_name='LSTMCell', hidden_size=conf.birnn_hidden_size, dropout_rate=self.dropout_rate)
        encoder_output, _ = self.bilstm_encoder1(encoder_input, s1_seqlen)  # [batch,len,2hid]

    with tf.variable_scope('rnn_2'):
        self.bilstm_encoder2 = Bi_RNN(cell_name='LSTMCell', hidden_size=conf.birnn_hidden_size, dropout_rate=self.dropout_rate)
        encoder_output, _ = self.bilstm_encoder2(encoder_output, s1_seqlen)  # [batch,len,2hid]

    # logits
    cls_logits = tf.layers.dense(encoder_output, conf.label_size, activation=None, use_bias=True, name='cls_logits')  # [batch,length,label]

    # crf
    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(cls_logits, self.ner_label, tf.to_int32(s1_seqlen))
    crf_loss = tf.reduce_mean(-log_likelihood)

    # loss
    self.loss = crf_loss

    # crf decode
    ner_pred, ner_prob = tf.contrib.crf.crf_decode(cls_logits, transition_params, tf.to_int32(s1_seqlen))
    self.ner_prob = tf.identity(ner_prob, name='ner_prob')  # [batch]
    self.ner_pred = tf.identity(ner_pred, name='ner_pred')  # [batch,len]

    with tf.name_scope('accuracy'):
        ner_acc = tf.cast(tf.equal(self.ner_pred, self.ner_label), tf.float32) * s1_mask  # [batch,len]
        ner_acc = tf.reduce_sum(ner_acc, axis=-1)  # [batch]
        ner_acc = ner_acc / tf.cast(s1_seqlen, tf.float32)  # [batch]
        self.accuracy = tf.reduce_mean(ner_acc, axis=0)  # scalar

    self.global_step = tf.train.get_or_create_global_step()
    self.optimizer = tf.train.AdamOptimizer(learning_rate=conf.lr)
    self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
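# A sketch of how the exported NER tensors above would typically be consumed at inference time.
# The feed/fetch attributes follow the model code (model.s1, model.ner_pred, model.ner_prob);
# treating dropout_rate as a feedable placeholder is an assumption about the surrounding class.
import numpy as np
import tensorflow as tf  # TF 1.x assumed

def predict_ner_sketch(sess, model, token_ids):
    """token_ids: np.int32 [batch, max_len], padded with 0; returns (tags [batch, max_len], CRF scores [batch])."""
    feed = {model.s1: token_ids}
    if isinstance(getattr(model, 'dropout_rate', None), tf.Tensor):
        feed[model.dropout_rate] = 0.0  # disable dropout at inference if it is a placeholder
    pred, prob = sess.run([model.ner_pred, model.ner_prob], feed_dict=feed)
    return pred, prob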