def selector_fn(outputs):
    if scope_name is not None:
        with variable_scope.variable_scope("%s/selector" % scope_name, reuse=reuse) as scope:
            selector_logits = layers.linear(outputs, 1, scope=scope)
    else:
        with variable_scope.variable_scope("selector", reuse=reuse) as scope:
            selector_logits = layers.linear(outputs, 1, scope=scope)
    return selector_logits
def prepare_attention(attention_states,
                      attention_option,
                      num_units,
                      imem=None,
                      output_alignments=False,
                      reuse=False):
    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(
            attention_states, num_units, biases_initializer=None, scope=scope)
    attention_values = attention_states
    if imem is not None:
        if type(imem) is tuple:
            with variable_scope.variable_scope("imem_graph", reuse=reuse) as scope:
                attention_keys2, attention_states2 = array_ops.split(
                    layers.linear(imem[0], num_units * 2,
                                  biases_initializer=None, scope=scope),
                    [num_units, num_units], axis=2)
            with variable_scope.variable_scope("imem_triple", reuse=reuse) as scope:
                attention_keys3, attention_states3 = array_ops.split(
                    layers.linear(imem[1], num_units * 2,
                                  biases_initializer=None, scope=scope),
                    [num_units, num_units], axis=3)
            attention_keys = (attention_keys, attention_keys2, attention_keys3)
            attention_values = (attention_states, attention_states2, attention_states3)
        else:
            with variable_scope.variable_scope("imem", reuse=reuse) as scope:
                attention_keys2, attention_states2 = array_ops.split(
                    layers.linear(imem, num_units * 2,
                                  biases_initializer=None, scope=scope),
                    [num_units, num_units], axis=2)
            attention_keys = (attention_keys, attention_keys2)
            attention_values = (attention_states, attention_states2)

    # Attention score function
    if imem is None:
        attention_score_fn = _create_attention_score_fn("attention_score", num_units,
                                                        attention_option, reuse)
    else:
        attention_score_fn = (_create_attention_score_fn("attention_score", num_units,
                                                         attention_option, reuse),
                              _create_attention_score_fn("imem_score", num_units,
                                                         "luong", reuse,
                                                         output_alignments=output_alignments))
    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn("attention_construct",
                                                            num_units,
                                                            attention_score_fn,
                                                            reuse)
    return (attention_keys, attention_values, attention_score_fn,
            attention_construct_fn)
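# A minimal NumPy sketch (not part of the source) of how the imem branches
# above derive keys and values: one linear map to 2*num_units, then a split
# along the last axis. The shapes and names here are illustrative assumptions.
import numpy as np

batch, triple_num, triple_len, units = 2, 3, 4, 5
projected = np.random.randn(batch, triple_num, triple_len, 2 * units)  # output of layers.linear(imem[1], num_units*2)

# array_ops.split(projected, [num_units, num_units], axis=3) is equivalent to:
keys3 = projected[..., :units]    # [batch, triple_num, triple_len, units]
values3 = projected[..., units:]  # [batch, triple_num, triple_len, units]
assert keys3.shape == (batch, triple_num, triple_len, units)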
def output_fn(outputs):
    if scope_name is not None:
        with variable_scope.variable_scope("%s/output_projection" % scope_name, reuse=reuse) as scope:
            output_logits = layers.linear(outputs, num_symbols, scope=scope)
    else:
        with variable_scope.variable_scope("output_projection", reuse=reuse) as scope:
            output_logits = layers.linear(outputs, num_symbols, scope=scope)
    return output_logits
def project_fn(input):
    output = layers.linear(input, vocabulary_count, scope="projection_layer")
    softmaxed_probability = tf.nn.softmax(output)  # [batch_size, decoder_len, vocabulary_count]
    return softmaxed_probability
def condition_tensor(tensor, conditioning):
    """Condition the value of a tensor.

    Conditioning scheme based on https://arxiv.org/abs/1609.03499.

    Args:
      tensor: A minibatch tensor to be conditioned.
      conditioning: A minibatch Tensor to condition on. Must be 2D, with first
        dimension the same as `tensor`.

    Returns:
      `tensor` conditioned on `conditioning`.

    Raises:
      ValueError: If the non-batch dimensions of `tensor` aren't fully defined.
      ValueError: If `conditioning` isn't at least 2D.
      ValueError: If the batch dimensions of the input Tensors don't match.
    """
    tensor.shape[1:].assert_is_fully_defined()
    num_features = tensor.shape[1:].num_elements()

    if conditioning.shape.ndims < 2:
        raise ValueError('conditioning must be at least 2D, but saw shape: %s' %
                         conditioning.shape)

    mapped_conditioning = layers.linear(
        layers.flatten(conditioning), num_features)
    if not mapped_conditioning.shape.is_compatible_with(tensor.shape):
        mapped_conditioning = array_ops.reshape(
            mapped_conditioning, _get_shape(tensor))
    return tensor + mapped_conditioning
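# Illustrative NumPy sketch (not part of the source) of the conditioning
# arithmetic in condition_tensor: flatten the conditioning, map it linearly to
# the feature count of `tensor`, then add. W and b stand in for the variables
# that layers.linear would create.
import numpy as np

batch, num_features, cond_dim = 4, 6, 3
tensor = np.random.randn(batch, num_features)
conditioning = np.random.randn(batch, cond_dim)
W = np.random.randn(cond_dim, num_features)
b = np.zeros(num_features)

mapped = conditioning @ W + b   # layers.linear(flattened conditioning, num_features)
conditioned = tensor + mapped   # same shape as `tensor`
assert conditioned.shape == tensor.shape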
def sequence_loss(outputs,  # decoder outputs
                  targets,  # labels
                  masks):   # mask over the labels
    with tf.variable_scope('decoder_rnn'):
        # Predictions
        logits = layers.linear(outputs, num_symbols, scope=name)  # [batch_size, decoder_len, num_symbols]
        logits = tf.reshape(logits, [-1, num_symbols])  # [batch_size*decoder_len, num_symbols]
        # Labels
        local_labels = tf.reshape(targets, [-1])  # [batch_size*decoder_len]
        local_masks = tf.reshape(masks, [-1])  # [batch_size*decoder_len]
        # Compute the loss
        local_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=local_labels, logits=logits)  # [batch_size*decoder_len]
        # Positions beyond the sequence length contribute no loss
        local_loss = local_loss * local_masks
        loss = tf.reduce_sum(local_loss)  # total loss over the sequences (scalar)
        total_size = tf.reduce_sum(local_masks)  # total length of the sequences
        total_size += 1e-12  # avoid division by zero
        return loss, loss / total_size  # total loss and average per-token loss for the batch
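# Hedged NumPy sketch (not from the source) of the masked-loss arithmetic in
# sequence_loss: per-token cross-entropy, zeroed at padded positions, summed,
# then divided by the number of real tokens.
import numpy as np

per_token_loss = np.array([2.3, 1.1, 0.7, 4.0])  # hypothetical cross-entropy values
mask = np.array([1.0, 1.0, 1.0, 0.0])            # last position is padding

loss = np.sum(per_token_loss * mask)             # 4.1: padding contributes nothing
avg_loss = loss / (np.sum(mask) + 1e-12)         # ~1.3667 per real token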
def construct_fn(attention_query, attention_keys, attention_values):
    context = attention_score_fn(attention_query, attention_keys, attention_values)
    concat_input = array_ops.concat([attention_query, context], 1)
    attention = layers.linear(
        concat_input, num_units, biases_initializer=None, scope=scope)
    return attention
def loss_fn(decoder_output, label_id, mask):
    '''
    :param decoder_output: [batch_size, decoder_len, num_units]
    :param label_id: [batch_size, decoder_len]
    :param mask: [batch_size, decoder_len]
    :return:
    '''
    with tf.variable_scope("decoder_rnn"):
        projected_output = layers.linear(
            decoder_output, vocabulary_count,
            scope="projection_layer")  # [batch_size, decoder_len, vocabulary_count]
        logits = tf.reshape(
            projected_output,
            [-1, vocabulary_count])  # 2-D: [batch_size*decoder_len, vocabulary_count]
        labels = tf.reshape(label_id, [-1])  # [batch_size*decoder_len]
        label_mask = tf.reshape(mask, [-1])  # [batch_size*decoder_len]
        '''
        logits is the raw output of the network's output layer, with shape
        [batch_size, num_classes]. labels is a 1-D vector of length batch_size
        whose entries lie in [0, num_classes); each value is the class of the
        corresponding sample in the batch.
        tf.nn.sparse_softmax_cross_entropy_with_logits first applies softmax to
        logits and then computes the cross-entropy against labels, so the
        logits passed in must not be softmaxed beforehand.
        '''
        local_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits)  # [batch_size*decoder_len]
        total_size = tf.reduce_sum(label_mask)  # total (non-padding) length of the batch's responses
        total_size += 1e-12  # avoid division by zero
        loss = tf.reduce_sum(local_loss * label_mask)  # total loss over the batch's responses
        avg_loss = loss / total_size  # average loss per token
        return loss, avg_loss
def prepare_attention(
        encoder_output,  # encoder outputs [batch_size, encoder_len, num_units]
        num_units,
        attention_option="bahdanau",
        output_alignments=False,
        reuse=False):
    # Build the attention keys and values from the encoder outputs
    with tf.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(encoder_output,
                                       num_units,
                                       biases_initializer=None,
                                       scope=scope)
    attention_values = encoder_output
    attention_score_fn = create_attention_score_fn(
        num_units,
        attention_option=attention_option,
        output_alignments=output_alignments,
        reuse=reuse)
    attention_construct_fn = create_attention_construct_fn(num_units,
                                                           attention_score_fn,
                                                           reuse=reuse)
    return (attention_keys, attention_values, attention_score_fn,
            attention_construct_fn)
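# Rough NumPy sketch (an assumption, not the repository's code) of the two
# attention_option choices the score function can implement:
#   "luong":    score = keys . query            (multiplicative)
#   "bahdanau": score = v . tanh(keys + query)  (additive; see attention() below)
import numpy as np

batch, enc_len, units = 2, 5, 8
keys = np.random.randn(batch, enc_len, units)
values = np.random.randn(batch, enc_len, units)
query = np.random.randn(batch, units)
v = np.random.randn(units)

luong_scores = np.einsum('beu,bu->be', keys, query)                      # [batch, enc_len]
bahdanau_scores = np.sum(v * np.tanh(keys + query[:, None, :]), axis=2)  # [batch, enc_len]

# Either score is softmaxed into alignments and used to pool the values.
alignments = np.exp(luong_scores) / np.exp(luong_scores).sum(1, keepdims=True)
context = np.einsum('be,beu->bu', alignments, values)                    # [batch, units]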
def construct_fn(attention_query, attention_keys, attention_values):
    alignments = None
    if type(attention_score_fn) is tuple:
        context0 = attention_score_fn[0](attention_query, attention_keys[0],
                                         attention_values[0])
        if len(attention_keys) == 2:
            context1 = attention_score_fn[1](attention_query, attention_keys[1],
                                             attention_values[1])
        elif len(attention_keys) == 3:
            context1 = attention_score_fn[1](attention_query, attention_keys[1:],
                                             attention_values[1:])
        if type(context1) is tuple:
            if len(context1) == 2:
                context1, alignments = context1
                concat_input = array_ops.concat(
                    [attention_query, context0, context1], 1)
            elif len(context1) == 3:
                context1, context2, alignments = context1
                concat_input = array_ops.concat(
                    [attention_query, context0, context1, context2], 1)
        else:
            concat_input = array_ops.concat(
                [attention_query, context0, context1], 1)
    else:
        context = attention_score_fn(attention_query, attention_keys,
                                     attention_values)
        concat_input = array_ops.concat([attention_query, context], 1)
    attention = layers.linear(
        concat_input, num_units, biases_initializer=None, scope=scope)
    if alignments is None:
        return attention
    else:
        return attention, alignments
def construct_fn(attention_query, attention_keys, attention_values):
    context = attention_score_fn(attention_query, attention_keys, attention_values)
    concat_input = array_ops.concat([attention_query, context], 1)
    attention = layers.linear(
        concat_input, num_units, biases_initializer=None, scope=scope)
    return attention
def total_loss(outputs, targets, masks, alignments, triples_embedding,
               use_entities, entity_targets):
    '''
    outputs: [batch_size, decoder_len, num_units]
    targets: [batch_size, decoder_len]
    masks: [batch_size, decoder_len]
    alignments: [batch_size, decoder_len, triple_num, triple_len] attention weights
    triples_embedding: [batch_size, triple_num, triple_len, 3*num_trans_units]
    use_entities: [batch_size, decoder_len]; 1 marks the response steps that use a triple
    entity_targets: [batch_size, decoder_len, triple_num, triple_len]; 1 marks which
        triple of which graph each step of each batch uses
    '''
    batch_size = tf.shape(outputs)[0]
    local_masks = tf.reshape(masks, [-1])

    logits = layers.linear(outputs, num_symbols, scope='decoder_rnn/%s' % name)  # [batch_size, decoder_len, num_symbols]
    one_hot_targets = tf.one_hot(targets, num_symbols)  # [batch_size, decoder_len, num_symbols]
    # Probability assigned to the target word at each step
    word_prob = tf.reduce_sum(tf.nn.softmax(logits) * one_hot_targets, axis=2)  # [batch_size, decoder_len]
    # Gating coefficient for choosing a triple, gamma_t in the paper
    selector = tf.squeeze(tf.sigmoid(layers.linear(outputs, 1, scope='decoder_rnn/selector')))  # [batch_size, decoder_len]
    # Probability the triple attention assigns to the triple actually used at each step
    triple_prob = tf.reduce_sum(alignments * entity_targets, axis=[2, 3])  # [batch_size, decoder_len]
    ppx_prob = word_prob * (1 - use_entities) + triple_prob * use_entities  # [batch_size, decoder_len]
    # P(y_t) in the paper
    final_prob = word_prob * (1 - selector) * (1 - use_entities) + triple_prob * selector * use_entities  # [batch_size, decoder_len]

    final_loss = tf.reduce_sum(tf.reshape(-tf.log(1e-12 + final_prob), [-1]) * local_masks)  # scalar
    ppx_loss = tf.reduce_sum(tf.reshape(-tf.log(1e-12 + ppx_prob), [-1]) * local_masks)  # scalar
    # ppx of each sample in the batch
    sentence_ppx = tf.reduce_sum(
        tf.reshape(tf.reshape(-tf.log(1e-12 + ppx_prob), [-1]) * local_masks,
                   [batch_size, -1]), axis=1)  # [batch_size]
    selector_loss = tf.reduce_sum(
        tf.reshape(-tf.log(1e-12 + selector * use_entities + (1 - selector) * (1 - use_entities)),
                   [-1]) * local_masks)
    loss = final_loss + selector_loss
    total_size = tf.reduce_sum(local_masks)
    total_size += 1e-12  # to avoid division by 0 for all-0 weights
    # Average loss per word, average ppx per word, per-sentence ppx
    return loss / total_size, ppx_loss / total_size, sentence_ppx / tf.reduce_sum(masks, axis=1)
def total_loss(outputs,           # [batch_size, decoder_len, num_units]
               targets,           # [batch_size, decoder_len]
               masks,             # [batch_size, decoder_len]
               alignments,        # [batch_size, decoder_len, triple_num, triple_len]
               triples_embedding,
               use_entities,      # [batch_size, decoder_len]; 1 marks the response steps that use a triple
               entity_targets):   # [batch_size, decoder_len, triple_num, triple_len]; 1 marks which triple of which graph each step of each batch uses
    batch_size = tf.shape(outputs)[0]
    local_masks = tf.reshape(masks, [-1])  # [batch_size*decoder_len]

    logits = layers.linear(outputs, num_symbols, scope='decoder_rnn/%s' % name)  # [batch_size, decoder_len, num_symbols]
    one_hot_targets = tf.one_hot(targets, num_symbols)  # [batch_size, decoder_len, num_symbols]
    # Probability of predicting the correct word at each step
    word_prob = tf.reduce_sum(tf.nn.softmax(logits) * one_hot_targets, axis=2)  # [batch_size, decoder_len]
    # Predicted probability of using an entity word at each step
    selector = tf.squeeze(tf.sigmoid(layers.linear(outputs, 1, scope='decoder_rnn/selector')))  # [batch_size, decoder_len]
    # Attention on the triple actually used at each step
    triple_prob = tf.reduce_sum(alignments * entity_targets, axis=[2, 3])  # [batch_size, decoder_len]
    # Probability of being correct at each step
    ppx_prob = word_prob * (1 - use_entities) + triple_prob * use_entities  # [batch_size, decoder_len]
    # Same probability, with the selector's choice factored in
    final_prob = word_prob * (1 - selector) * (1 - use_entities) + triple_prob * selector * use_entities  # [batch_size, decoder_len]
    # Loss including the selector's choice
    final_loss = tf.reduce_sum(tf.reshape(-tf.log(1e-12 + final_prob), [-1]) * local_masks)
    # ppx without the selector
    ppx_loss = tf.reduce_sum(tf.reshape(-tf.log(1e-12 + ppx_prob), [-1]) * local_masks)
    # ppx of each sample in the batch
    sentence_ppx = tf.reduce_sum(
        tf.reshape(tf.reshape(-tf.log(1e-12 + ppx_prob), [-1]) * local_masks,
                   [batch_size, -1]), axis=1)  # [batch_size]
    # Selector loss
    selector_loss = tf.reduce_sum(
        tf.reshape(-tf.log(1e-12 + selector * use_entities + (1 - selector) * (1 - use_entities)),
                   [-1]) * local_masks)
    loss = final_loss + selector_loss
    total_size = tf.reduce_sum(local_masks)
    total_size += 1e-12  # avoid division by zero
    # Average loss per word, average ppx per word, per-sample per-word ppx [batch_size]
    return loss / total_size, ppx_loss / total_size, sentence_ppx / tf.reduce_sum(masks, axis=1)
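# Worked NumPy sketch (illustrative, not from the source) of the gated
# probability used above: the selector interpolates between the vocabulary
# word probability and the triple-attention probability, and use_entities
# picks which term can be non-zero at each step.
import numpy as np

word_prob = np.array([0.6, 0.2])     # P(target word) at two steps
triple_prob = np.array([0.1, 0.7])   # attention mass on the gold triple
selector = np.array([0.1, 0.9])      # gamma_t, the predicted entity-word gate
use_entities = np.array([0.0, 1.0])  # step 2 is labeled as an entity step

final_prob = (word_prob * (1 - selector) * (1 - use_entities)
              + triple_prob * selector * use_entities)  # [0.54, 0.63]
final_loss = -np.log(1e-12 + final_prob).sum()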
def prepare_multistep_attention(encoder_states,
                                decoder_reprs,
                                kd_states1,
                                kd_states2,
                                attention_option,
                                num_units,
                                reuse=False):
    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attn_keys", reuse=reuse) as scope:
        attention_keys1 = layers.linear(encoder_states, num_units,
                                        biases_initializer=None, scope=scope)
        attention_values1 = encoder_states
        # Attention scoring function
        attention_score_fn1 = _create_attention_score_fn(
            "attn_score", num_units, attention_option, reuse)
    with variable_scope.variable_scope("attn_reprs", reuse=reuse) as scope:
        if decoder_reprs is not None:
            attention_keys2 = layers.linear(decoder_reprs, num_units,
                                            biases_initializer=None, scope=scope)
        else:
            attention_keys2 = None
        attention_values2 = decoder_reprs
        # Attention scoring function
        attention_score_fn2 = _create_attention_score_fn(
            "attn_score", num_units, attention_option, reuse)
    attention_keys = (attention_keys1, attention_keys2)
    if kd_states1 is not None and kd_states2 is not None:
        attention_values = (attention_values1, attention_values2, kd_states1, kd_states2)
    else:
        attention_values = (attention_values1, attention_values2, None, None)
    attention_score_fn = (attention_score_fn1, attention_score_fn2)
    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn(
        "attn_construct_multi", num_units, attention_score_fn, reuse)
    return attention_keys, attention_values, attention_construct_fn
def inference_fn(inference_output):
    with tf.variable_scope("decoder_rnn"):
        inference_softmaxed_probability = tf.nn.softmax(
            layers.linear(inference_output, vocabulary_count,
                          scope="projection_layer")
        )  # softmaxed vocabulary probabilities [batch_size, decoder_len, vocabulary_count]
        inference_maximum_likelihood_id = tf.argmax(
            inference_softmaxed_probability, axis=2)
        return inference_maximum_likelihood_id
def construct_fn(attention_query, attention_keys, attention_values):
    context, alignments = attention_score_fn(attention_query, attention_keys,
                                             attention_values)
    concat_input = array_ops.concat([attention_query, context], 1)
    concat_input = array_ops.reshape(concat_input, [-1, 1024])
    attention = layers.linear(concat_input, num_units,
                              biases_initializer=None, scope=scope)
    return attention, alignments
def project_fn(input):
    '''
    If input is [batch_size, num_units], create a weight matrix
    W: [num_units, vocabulary_count] and compute
    output = input * W, giving [batch_size, vocabulary_count].
    If input is [batch_size, decoder_len, num_units], then output is
    [batch_size, decoder_len, vocabulary_count].
    '''
    output = layers.linear(
        input, vocabulary_count,
        scope="projection_layer")  # [batch_size, vocabulary_count]
    softmaxed_probability = tf.nn.softmax(output)  # [batch_size, vocabulary_count]
    return softmaxed_probability
def construct_fn(attention_query, attention_keys, attention_values):
    '''Concatenate the several attention contexts.

    Returns:
        attention: [batch_size, num_units] output of a linear layer applied to
            the concatenated attention contexts
        alignments: attention weights
    '''
    alignments = None
    # If there is a static graph or triples
    if type(attention_score_fn) is tuple:
        # Attention over the encoder outputs at each step
        context0 = attention_score_fn[0](attention_query, attention_keys[0],
                                         attention_values[0])
        # Static graph only
        if len(attention_keys) == 2:
            context1 = attention_score_fn[1](attention_query, attention_keys[1],
                                             attention_values[1])
        # Both a static graph and triples
        elif len(attention_keys) == 3:
            context1 = attention_score_fn[1](attention_query, attention_keys[1:],
                                             attention_values[1:])
        if type(context1) is tuple:
            # Static graph only, with alignments requested
            if len(context1) == 2:
                context1, alignments = context1
                concat_input = array_ops.concat(
                    [attention_query, context0, context1], 1)
            # Both a static graph and triples
            elif len(context1) == 3:
                context1, context2, alignments = context1
                concat_input = array_ops.concat(
                    [attention_query, context0, context1, context2], 1)
        else:
            # Static graph without triples, and no graph alignments requested
            concat_input = array_ops.concat(
                [attention_query, context0, context1], 1)
    # Neither a static graph nor triples
    else:
        context = attention_score_fn(attention_query, attention_keys,
                                     attention_values)
        concat_input = array_ops.concat([attention_query, context], 1)
    # Apply a linear layer to the concatenated contexts [batch_size, num_units]
    attention = layers.linear(concat_input, num_units,
                              biases_initializer=None, scope=scope)
    if alignments is None:
        return attention
    else:
        return attention, alignments
def total_loss(outputs, targets, masks, alignments, triples_embedding,
               use_entities, entity_targets):
    local_masks = tf.reshape(masks, [-1])

    logits = layers.linear(outputs, num_symbols, scope='decoder_rnn/%s' % name)
    one_hot_targets = tf.one_hot(targets, num_symbols)
    word_prob = tf.reduce_sum(tf.nn.softmax(logits) * one_hot_targets, axis=2)
    selector = tf.squeeze(tf.sigmoid(layers.linear(outputs, 1, scope='decoder_rnn/selector')))
    triple_prob = tf.reduce_sum(alignments * entity_targets, axis=[2])
    # Hard selector derived from total attention mass (computed but unused below)
    cast_selector = tf.cast(
        tf.reduce_sum(alignments, axis=2) > tf.reduce_sum(tf.nn.softmax(logits), axis=2),
        tf.float32)
    final_prob = word_prob * (1 - selector) + triple_prob * selector
    ppx_prob = word_prob * (1 - use_entities) + triple_prob * use_entities
    final_loss = tf.reshape(-tf.log(1e-12 + final_prob), [-1]) * local_masks
    ppx_loss = tf.reshape(-tf.log(1e-12 + ppx_prob), [-1]) * local_masks
    sentence_ppx = tf.reduce_sum(-tf.log(1e-12 + ppx_prob) * masks, axis=1)
    loss = tf.reduce_sum(final_loss)
    #loss = tf.Print(loss, ['use_entity', tf.reduce_min(use_entities), tf.reduce_max(use_entities), 'triple_prob', tf.reduce_min(triple_prob), 'word_prob', tf.reduce_min(word_prob), 'final_prob', tf.reduce_min(final_prob), 'final_loss', tf.reduce_min(final_loss)], summarize=1e6)
    total_size = tf.reduce_sum(local_masks)
    total_size += 1e-12  # to avoid division by 0 for all-0 weights
    return loss / total_size, tf.reduce_sum(ppx_loss) / total_size, sentence_ppx / tf.reduce_sum(masks, axis=1)
def total_loss(outputs, targets, masks, alignments, triples_embedding,
               use_entities, entity_targets):
    batch_size = tf.shape(outputs)[0]
    local_masks = tf.reshape(masks, [-1])

    logits = layers.linear(outputs, num_symbols, scope='decoder_rnn/%s' % name)
    one_hot_targets = tf.one_hot(targets, num_symbols)
    word_prob = tf.reduce_sum(tf.nn.softmax(logits) * one_hot_targets, axis=2)
    selector = tf.squeeze(tf.sigmoid(layers.linear(outputs, 1, scope='decoder_rnn/selector')))
    triple_prob = tf.reduce_sum(alignments * entity_targets, axis=[2, 3])
    ppx_prob = word_prob * (1 - use_entities) + triple_prob * use_entities
    final_prob = word_prob * (1 - selector) * (1 - use_entities) + triple_prob * selector * use_entities
    final_loss = tf.reduce_sum(tf.reshape(-tf.log(1e-12 + final_prob), [-1]) * local_masks)
    ppx_loss = tf.reduce_sum(tf.reshape(-tf.log(1e-12 + ppx_prob), [-1]) * local_masks)
    sentence_ppx = tf.reduce_sum(
        tf.reshape(tf.reshape(-tf.log(1e-12 + ppx_prob), [-1]) * local_masks,
                   [batch_size, -1]), axis=1)
    selector_loss = tf.reduce_sum(
        tf.reshape(-tf.log(1e-12 + selector * use_entities + (1 - selector) * (1 - use_entities)),
                   [-1]) * local_masks)
    loss = final_loss + selector_loss
    total_size = tf.reduce_sum(local_masks)
    total_size += 1e-12  # to avoid division by 0 for all-0 weights
    return loss / total_size, ppx_loss / total_size, sentence_ppx / tf.reduce_sum(masks, axis=1)
def construct_fn(attention_query, attention_keys, attention_values):
    '''Concatenate the computed contexts.

    Returns:
        attention: [batch_size, num_units] concatenation of the attention
            contexts, followed by one linear layer
        alignments: attention weights
    '''
    alignments = None
    # If there are graph vectors and triple embeddings
    if type(attention_score_fn) is tuple:
        # Context computed with Bahdanau attention
        context0 = attention_score_fn[0](attention_query, attention_keys[0],
                                         attention_values[0])
        # Training does not take this branch
        if len(attention_keys) == 2:
            context1 = attention_score_fn[1](attention_query, attention_keys[1],
                                             attention_values[1])
        # If there are graph vectors and triple embeddings
        elif len(attention_keys) == 3:
            context1 = attention_score_fn[1](attention_query, attention_keys[1:],
                                             attention_values[1:])
        if type(context1) is tuple:
            if len(context1) == 2:
                context1, alignments = context1
                concat_input = array_ops.concat(
                    [attention_query, context0, context1], 1)
            elif len(context1) == 3:
                # Training takes this branch
                context1, context2, alignments = context1
                concat_input = array_ops.concat(
                    [attention_query, context0, context1, context2], 1
                )  # [batch_size, num_units*4]: decoder output, context, graph context, triple context
        else:
            concat_input = array_ops.concat(
                [attention_query, context0, context1], 1)
    else:
        context = attention_score_fn(attention_query, attention_keys,
                                     attention_values)
        concat_input = array_ops.concat([attention_query, context], 1)
    # One linear layer over the concatenation, mapping the last dimension to num_units
    attention = layers.linear(concat_input, num_units,
                              biases_initializer=None, scope=scope)  # [batch_size, num_units]
    if alignments is None:
        return attention
    else:
        return attention, alignments
def construct_fn(attention_query, attention_keys, attention_values):
    if isinstance(attention_score_fn, tuple):
        # multi-step decoding
        attention_score_fn1, attention_score_fn2 = attention_score_fn
        attention_keys1, attention_keys2 = attention_keys
        attention_values1, decoder_reprs, kd_states1, kd_states2 = attention_values
        context1 = attention_score_fn1(attention_query, attention_keys1,
                                       attention_values1)
        if kd_states1 is None or kd_states2 is None:
            context2 = attention_score_fn2(attention_query, attention_keys2,
                                           decoder_reprs)
            concat_input = array_ops.concat(
                [attention_query, context1, context2], 1)
        else:
            if decoder_reprs is None:
                print("concat=3")
                concat_input = array_ops.concat(
                    [attention_query, context1, kd_states1, kd_states2], 1)
            else:
                print("concat=4")
                context2 = attention_score_fn2(attention_query, attention_keys2,
                                               decoder_reprs)
                concat_input = array_ops.concat(
                    [attention_query, context1, context2, kd_states1, kd_states2], 1)
    else:
        # only one step decoding
        if isinstance(attention_values, tuple):
            attention_values1, kd_state = attention_values
            context1 = attention_score_fn(attention_query, attention_keys,
                                          attention_values1)
            concat_input = array_ops.concat(
                [attention_query, context1, kd_state], 1)
        else:
            context = attention_score_fn(attention_query, attention_keys,
                                         attention_values)
            concat_input = array_ops.concat([attention_query, context], 1)
    attention = layers.linear(concat_input, num_units,
                              biases_initializer=None, scope=scope)
    return attention
def sequence_loss(outputs, targets, masks):
    with variable_scope.variable_scope('decoder_rnn'):
        logits = layers.linear(outputs, num_symbols, scope=name)  # [batch_size, decoder_len, num_symbols]
        logits = tf.reshape(logits, [-1, num_symbols])  # [batch_size*decoder_len, num_symbols]
        local_labels = tf.reshape(targets, [-1])  # [batch_size*decoder_len]
        local_masks = tf.reshape(masks, [-1])  # [batch_size*decoder_len]

        local_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=local_labels, logits=logits)
        local_loss = local_loss * local_masks  # positions beyond the sequence length contribute no loss

        loss = tf.reduce_sum(local_loss)  # total loss over the sequences
        total_size = tf.reduce_sum(local_masks)  # total length of the sequences
        total_size += 1e-12  # avoid division by zero
        return loss / total_size  # average per-token loss
def construct_fn(attention_query, attention_keys, attention_values):
    context, scores, alignments = attention_score_fn(
        attention_query, attention_keys, attention_values)
    concat_input = array_ops.concat([attention_query, context], 1)
    # NOTICE: the scope passed here is defined outside construct_fn, so the
    # variables are unaffected by whatever scope is active at call time: they
    # live under seq2seq/main/decode/attention_construct/weights in
    # rnn_decoder.py, not seq2seq/main/decode/rnn/loop_function/weights as in
    # beam_decoder.py.
    attention = layers.linear(concat_input, num_units,
                              biases_initializer=None, scope=scope)
    # Mark the scope for reuse so this function is safe to call inside a loop.
    # If it were only used in dynamic decode this would be unnecessary, since
    # the linear layer above would be built exactly once.
    # http://stackoverflow.com/questions/38545362/tensorflow-variable-scope-reuse-if-variable-exists
    scope.reuse_variables()
    return attention, scores, alignments
def sequence_loss(outputs, targets, masks):
    with variable_scope.variable_scope('decoder_rnn'):
        logits = layers.linear(outputs, num_symbols, scope=name)
        logits = tf.reshape(logits, [-1, num_symbols])
        local_labels = tf.reshape(targets, [-1])
        local_masks = tf.reshape(masks, [-1])

        local_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=local_labels, logits=logits)
        local_loss = local_loss * local_masks

        loss = tf.reduce_sum(local_loss)
        total_size = tf.reduce_sum(local_masks)
        total_size += 1e-12  # to avoid division by 0 for all-0 weights
        return loss / total_size
def attention_construct_fn(
        query,    # decoder output [batch_size, num_units]
        keys,     # [batch_size, encoder_len, num_units]
        values):  # [batch_size, encoder_len, num_units]
    alignments = None
    context = attention_score_fn(query, keys, values)
    if type(context) is tuple:
        context, alignments = context
    concat_input = tf.concat([query, context], axis=1)
    attention = layers.linear(concat_input, num_units,
                              biases_initializer=None,
                              scope=scope)  # [batch_size, num_units]
    if alignments is None:
        return attention  # [batch_size, num_units]
    else:
        return attention, alignments
def loss_fn(decoder_output, label_id, mask):
    with tf.variable_scope("decoder_rnn"):
        # tf.nn.sparse_softmax_cross_entropy_with_logits applies the softmax
        # itself, so the projection output must not be softmaxed first.
        projected_output = layers.linear(decoder_output, vocabulary_count,
                                         scope="projection_layer")
        logits = tf.reshape(
            projected_output,
            [-1, vocabulary_count])  # [batch_size*decoder_len, vocabulary_count]
        labels = tf.reshape(label_id, [-1])  # [batch_size*decoder_len]
        label_mask = tf.reshape(mask, [-1])  # [batch_size*decoder_len]
        local_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits)  # [batch_size*decoder_len]
        total_size = tf.reduce_sum(label_mask)
        total_size += 1e-12
        loss = tf.reduce_sum(local_loss * label_mask)
        avg_loss = loss / total_size
        return loss, avg_loss
def prepare_attention(attention_states, attention_option, num_units, reuse=False):
    """Prepare keys/values/functions for attention.

    Args:
      attention_states: hidden states to attend over.
      attention_option: how to compute attention, either "luong" or "bahdanau".
      num_units: hidden state dimension.
      reuse: whether to reuse variable scope.

    Returns:
      attention_keys: to be compared with target states.
      attention_values: to be used to construct context vectors.
      attention_score_fn: to compute similarity between key and target states.
      attention_construct_fn: to build attention states.
    """
    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(attention_states, num_units,
                                       biases_initializer=None, scope=scope)
        # No scope reuse is needed here: attention_keys is computed once and
        # then reused, unlike attention_construct_fn, which is called at every
        # step of the decoding loop.
    attention_values = attention_states

    # Attention score function
    attention_score_fn = _create_attention_score_fn("attention_score", num_units,
                                                    attention_option, reuse)
    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn(
        "attention_construct", num_units, attention_score_fn, reuse)
    return (attention_keys, attention_values, attention_score_fn,
            attention_construct_fn)
def prepare_attention(attention_states, attention_len, attention_option,
                      num_units, reuse=False):
    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(
            attention_states, num_units, biases_initializer=None, scope=scope)
    attention_values = attention_states

    # Attention score function
    attention_score_fn = _create_attention_score_fn(
        "attention_score", attention_len, num_units, attention_option, reuse)
    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn(
        "attention_construct", num_units, attention_score_fn, reuse)
    return (attention_keys, attention_values, attention_score_fn,
            attention_construct_fn)
def prepare_attention(attention_states, attention_option, num_units, reuse=False):
    """Prepare keys/values/functions for attention.

    Args:
      attention_states: hidden states to attend over.
      attention_option: how to compute attention, either "luong" or "bahdanau".
      num_units: hidden state dimension.
      reuse: whether to reuse variable scope.

    Returns:
      attention_keys: to be compared with target states.
      attention_values: to be used to construct context vectors.
      attention_score_fn: to compute similarity between key and target states.
      attention_construct_fn: to build attention states.
    """
    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(attention_states, num_units,
                                       biases_initializer=None, scope=scope)
        #attention_keys = tf.Print(attention_keys, ["attention_keys", tf.shape(attention_states), tf.shape(attention_keys), attention_keys])
    attention_values = attention_states

    # Attention score function
    attention_score_fn = _create_attention_score_fn("attention_score", num_units,
                                                    attention_option, reuse)
    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn(
        "attention_construct", num_units, attention_score_fn, reuse)
    return (attention_keys, attention_values, attention_score_fn,
            attention_construct_fn)
def prepare_attention(attention_states, attention_option, num_units, reuse=False):
    """Prepare keys/values/functions for attention.

    Args:
      attention_states: hidden states to attend over.
      attention_option: how to compute attention, either "luong" or "bahdanau".
      num_units: hidden state dimension.
      reuse: whether to reuse variable scope.

    Returns:
      attention_keys: to be compared with target states.
      attention_values: to be used to construct context vectors.
      attention_score_fn: to compute similarity between key and target states.
      attention_construct_fn: to build attention states.
    """
    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(
            attention_states, num_units, biases_initializer=None, scope=scope)
    attention_values = attention_states

    # Attention score function
    attention_score_fn = _create_attention_score_fn("attention_score", num_units,
                                                    attention_option, reuse)
    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn(
        "attention_construct", num_units, attention_score_fn, reuse)
    return (attention_keys, attention_values, attention_score_fn,
            attention_construct_fn)
def attention(query):
    """Point on hidden using hidden_features and query."""
    with vs.variable_scope("Attention"):
        v = vs.get_variable("AttnV", [num_units])
        # [batch_size, num_units] -> [batch_size, num_units]
        processed_query = layers.linear(query, num_units, scope="query_layer")
        #processed_query = layers_core.dense(query, num_units, use_bias=False, name="query_layer")
        # -> [batch_size, 1, num_units]
        processed_query = tf.expand_dims(processed_query, 1)
        # [batch_size, attn_length, num_units] + [batch_size, 1, num_units]
        #   -> [batch_size, attn_length, num_units]
        # reduce_sum -> [batch_size, attn_length]
        scores = math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query), [2])
        if feed_prev:
            alignments = nn_ops.softmax(scores)
            # -> [batch_size, attn_length, 1]
            alignments = array_ops.expand_dims(alignments, 2)
            #context_vector = math_ops.reduce_sum(alignments * values, [1])
            #return scores, alignments, context_vector
            return scores, alignments
        else:
            return scores
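# Tiny NumPy check (not part of the source) of the additive score computed in
# attention(): broadcasting the processed query over attn_length and reducing
# over the unit axis yields one score per encoder position.
import numpy as np

batch, attn_length, units = 1, 3, 4
keys = np.random.randn(batch, attn_length, units)
processed_query = np.random.randn(batch, 1, units)  # after expand_dims
v = np.random.randn(units)

scores = np.sum(v * np.tanh(keys + processed_query), axis=2)  # [batch, attn_length]
assert scores.shape == (batch, attn_length)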