Example #1
 def selector_fn(outputs):
     if scope_name is not None:
         with variable_scope.variable_scope("%s/selector" % scope_name,
                                            reuse=reuse) as scope:
             selector_logits = layers.linear(outputs, 1, scope=scope)
     else:
         with variable_scope.variable_scope("selector",
                                            reuse=reuse) as scope:
             selector_logits = layers.linear(outputs, 1, scope=scope)
     return selector_logits
Example #2
def prepare_attention(attention_states,
                      attention_option,
                      num_units,
                      imem=None,
                      output_alignments=False,
                      reuse=False):
    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(
            attention_states, num_units, biases_initializer=None, scope=scope)
        attention_values = attention_states

    if imem is not None:
        if type(imem) is tuple:
            with variable_scope.variable_scope("imem_graph", reuse=reuse) as scope:
                attention_keys2, attention_states2 = array_ops.split(layers.linear(
                    imem[0], num_units*2, biases_initializer=None, scope=scope), [num_units, num_units], axis=2)
            with variable_scope.variable_scope("imem_triple", reuse=reuse) as scope:
                attention_keys3, attention_states3 = array_ops.split(layers.linear(
                    imem[1], num_units*2, biases_initializer=None, scope=scope), [num_units, num_units], axis=3)
            attention_keys = (attention_keys, attention_keys2, attention_keys3)
            attention_values = (attention_states, attention_states2, attention_states3)
        else:
            with variable_scope.variable_scope("imem", reuse=reuse) as scope:
                attention_keys2, attention_states2 = array_ops.split(layers.linear(
                    imem, num_units*2, biases_initializer=None, scope=scope), [num_units, num_units], axis=2)
                attention_keys = (attention_keys, attention_keys2)
                attention_values = (attention_states, attention_states2)

    # Attention score function
    if imem is None:
        attention_score_fn = _create_attention_score_fn("attention_score", num_units,
                                                            attention_option, reuse)
    else:
        attention_score_fn = (_create_attention_score_fn("attention_score", num_units,
                                                            attention_option, reuse),
                            _create_attention_score_fn("imem_score", num_units,
                                                            "luong", reuse, output_alignments=output_alignments))

    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn("attention_construct",
                                  num_units,
                                  attention_score_fn,
                                  reuse)

    return (attention_keys, attention_values, attention_score_fn,
                    attention_construct_fn)
Example #3
 def output_fn(outputs):
     if scope_name is not None:
         with variable_scope.variable_scope("%s/output_projection" %
                                            scope_name,
                                            reuse=reuse) as scope:
             output_logits = layers.linear(outputs,
                                           num_symbols,
                                           scope=scope)
     else:
         with variable_scope.variable_scope("output_projection",
                                            reuse=reuse) as scope:
             output_logits = layers.linear(outputs,
                                           num_symbols,
                                           scope=scope)
     return output_logits
Example #4
 def project_fn(input):
     output = layers.linear(input,
                            vocabulary_count,
                            scope="projection_layer")
     softmaxed_probability = tf.nn.softmax(
         output)  # batch_size*decoder_len*vocabulary_count
     return softmaxed_probability
Example #5
def condition_tensor(tensor, conditioning):
  """Condition the value of a tensor.

  Conditioning scheme based on https://arxiv.org/abs/1609.03499.

  Args:
    tensor: A minibatch tensor to be conditioned.
    conditioning: A minibatch Tensor to condition on. Must be 2D, with first
      dimension the same as `tensor`.

  Returns:
    `tensor` conditioned on `conditioning`.

  Raises:
    ValueError: If the non-batch dimensions of `tensor` aren't fully defined.
    ValueError: If `conditioning` isn't at least 2D.
    ValueError: If the batch dimensions of the input Tensors don't match.
  """
  tensor.shape[1:].assert_is_fully_defined()
  num_features = tensor.shape[1:].num_elements()
  if conditioning.shape.ndims < 2:
    raise ValueError('conditioning must be at least 2D, but saw shape: %s'
                     % conditioning.shape)

  mapped_conditioning = layers.linear(
      layers.flatten(conditioning), num_features)
  if not mapped_conditioning.shape.is_compatible_with(tensor.shape):
    mapped_conditioning = array_ops.reshape(
        mapped_conditioning, _get_shape(tensor))
  return tensor + mapped_conditioning
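The docstring above describes the conditioning scheme of arXiv:1609.03499: project the conditioning vector to the feature size of `tensor` and add it element-wise. A minimal sketch of that idea in plain TF 1.x ops (shapes and variable names here are illustrative, not taken from the example):

import tensorflow as tf

batch, num_features, cond_dim = 8, 64, 10
features = tf.placeholder(tf.float32, [batch, num_features])      # minibatch to be conditioned
conditioning = tf.placeholder(tf.float32, [batch, cond_dim])      # e.g. label embeddings

with tf.variable_scope("conditioning_projection"):
    w = tf.get_variable("w", [cond_dim, num_features])
    mapped = tf.matmul(conditioning, w)                            # [batch, num_features]

conditioned = features + mapped                                    # same shape as the input features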
Example #6
 def sequence_loss(
         outputs,  # decoder outputs
         targets,  # target labels
         masks):  # mask over the targets
     with tf.variable_scope('decoder_rnn'):
         # predictions
         logits = layers.linear(
             outputs, num_symbols,
             scope=name)  # [batch_size, decoder_len, num_symbols]
         logits = tf.reshape(
             logits,
             [-1, num_symbols])  # [batch_size*decoder_len, num_symbols]
         # labels
         local_labels = tf.reshape(targets,
                                   [-1])  # [batch_size*decoder_len]
         local_masks = tf.reshape(masks, [-1])  # [batch_size*decoder_len]
         # compute the loss
         local_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
             labels=local_labels, logits=logits)  # [batch_size*decoder_len]
         # positions beyond the sequence length contribute no loss
         local_loss = local_loss * local_masks
         loss = tf.reduce_sum(local_loss)  # total loss over the sequence (scalar)
         total_size = tf.reduce_sum(local_masks)  # total number of unmasked tokens
         total_size += 1e-12  # avoid dividing by a total length of 0
         return loss, loss / total_size  # total loss and average per-token loss for the batch
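For context, the `masks` tensor consumed by losses like this one is usually built from the target lengths. A small sketch with illustrative names (not part of the example above):

import tensorflow as tf

decoder_len = 10
target_lengths = tf.placeholder(tf.int32, [None])   # true length of each target sequence
masks = tf.cast(tf.sequence_mask(target_lengths, decoder_len), tf.float32)
# masks[i, t] is 1.0 for t < target_lengths[i] and 0.0 afterwards, so padded
# positions contribute nothing to the summed loss or to total_size above.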
Example #7
 def construct_fn(attention_query, attention_keys, attention_values):
   context = attention_score_fn(attention_query, attention_keys,
                                attention_values)
   concat_input = array_ops.concat([attention_query, context], 1)
   attention = layers.linear(
       concat_input, num_units, biases_initializer=None, scope=scope)
   return attention
Example #8
 def loss_fn(decoder_output, label_id, mask):
     '''
     :param decoder_output: [batch_size, decoder_len, num_units]
     :param label_id: [batch_size, decoder_len]
     :param mask: [batch_size, decoder_len]
     :return:
     '''
     with tf.variable_scope("decoder_rnn"):
         # Despite its name, this holds the raw (un-softmaxed) projection output.
         softmaxed_probability = layers.linear(
             decoder_output, vocabulary_count, scope="projection_layer"
         )  # [batch_size, decoder_len, vocabulary_count]
         logits = tf.reshape(
             softmaxed_probability,
             [-1, vocabulary_count
              ])  # 2-D: [batch_size*decoder_len, vocabulary_count]
         labels = tf.reshape(label_id, [-1])  # [batch_size*decoder_len]
         label_mask = tf.reshape(mask, [-1])  # [batch_size*decoder_len]
         '''
         logits is the raw output of the network's final layer, shape [batch_size, num_classes].
         labels is a 1-D vector of length batch_size whose entries lie in [0, num_classes); each value is the class of the corresponding sample in the batch.
         tf.nn.sparse_softmax_cross_entropy_with_logits first applies softmax to the logits and then computes the cross-entropy against the labels,
         so the logits passed in should NOT be softmaxed beforehand.
         '''
         local_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
             labels=labels, logits=logits)  # [batch_size*decoder_len]
         total_size = tf.reduce_sum(
             label_mask)  # total length of the batch's responses (excluding padding)
         total_size += 1e-12  # avoid a total length of 0
         loss = tf.reduce_sum(local_loss * label_mask)  # total loss over the batch's responses, padding masked out
         avg_loss = loss / total_size  # average loss per token
         return loss, avg_loss
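To underline the docstring above: tf.nn.sparse_softmax_cross_entropy_with_logits applies the softmax internally, so feeding it already-softmaxed values would softmax twice and distort the loss. A standalone NumPy check of the same arithmetic (not from the example):

import numpy as np

logits = np.array([2.0, 1.0, 0.1])            # raw class scores
label = 0

softmax = np.exp(logits) / np.exp(logits).sum()
loss = -np.log(softmax[label])                # ~0.417, what the TF op computes from raw logits

double = np.exp(softmax) / np.exp(softmax).sum()
wrong = -np.log(double[label])                # ~0.802, the value after an accidental double softmax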
Example #9
def prepare_attention(
        encoder_output,  # encoder outputs [batch_size, encoder_len, num_units]
        num_units,
        attention_option="bahdanau",
        output_alignments=False,
        reuse=False):
    # Build the attention keys and values from the encoder outputs
    with tf.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(encoder_output,
                                       num_units,
                                       biases_initializer=None,
                                       scope=scope)
        attention_values = encoder_output

    attention_score_fn = create_attention_score_fn(
        num_units,
        attention_option=attention_option,
        output_alignments=output_alignments,
        reuse=reuse)
    attention_construct_fn = create_attention_construct_fn(num_units,
                                                           attention_score_fn,
                                                           reuse=reuse)

    return (attention_keys, attention_values, attention_score_fn,
            attention_construct_fn)
Example #10
 def construct_fn(attention_query, attention_keys, attention_values):
     alignments = None
     if type(attention_score_fn) is tuple:
         context0 = attention_score_fn[0](attention_query, attention_keys[0],
                                                                  attention_values[0])
         if len(attention_keys) == 2:
             context1 = attention_score_fn[1](attention_query, attention_keys[1],
                                                                      attention_values[1])
         elif len(attention_keys) == 3:
             context1 = attention_score_fn[1](attention_query, attention_keys[1:],
                     attention_values[1:])
         if type(context1) is tuple:
             if len(context1) == 2:
                 context1, alignments = context1
                 concat_input = array_ops.concat([attention_query, context0, context1], 1)
             elif len(context1) == 3:
                 context1, context2, alignments = context1
                 concat_input = array_ops.concat([attention_query, context0, context1, context2], 1)
         else:
             concat_input = array_ops.concat([attention_query, context0, context1], 1)
     else:
         context = attention_score_fn(attention_query, attention_keys,
                                                                  attention_values)
         concat_input = array_ops.concat([attention_query, context], 1)
     attention = layers.linear(
             concat_input, num_units, biases_initializer=None, scope=scope)
     if alignments is None:
         return attention
     else:
         return attention, alignments
Example #11
def condition_tensor(tensor, conditioning):
    """Condition the value of a tensor.

  Conditioning scheme based on https://arxiv.org/abs/1609.03499.

  Args:
    tensor: A minibatch tensor to be conditioned.
    conditioning: A minibatch Tensor to condition on. Must be 2D, with first
      dimension the same as `tensor`.

  Returns:
    `tensor` conditioned on `conditioning`.

  Raises:
    ValueError: If the non-batch dimensions of `tensor` aren't fully defined.
    ValueError: If `conditioning` isn't at least 2D.
    ValueError: If the batch dimensions of the input Tensors don't match.
  """
    tensor.shape[1:].assert_is_fully_defined()
    num_features = tensor.shape[1:].num_elements()
    if conditioning.shape.ndims < 2:
        raise ValueError(
            'conditioning must be at least 2D, but saw shape: %s' %
            conditioning.shape)

    mapped_conditioning = layers.linear(layers.flatten(conditioning),
                                        num_features)
    if not mapped_conditioning.shape.is_compatible_with(tensor.shape):
        mapped_conditioning = array_ops.reshape(mapped_conditioning,
                                                _get_shape(tensor))
    return tensor + mapped_conditioning
Example #12
 def construct_fn(attention_query, attention_keys, attention_values):
   context = attention_score_fn(attention_query, attention_keys,
                                attention_values)
   concat_input = array_ops.concat([attention_query, context], 1)
   attention = layers.linear(
       concat_input, num_units, biases_initializer=None, scope=scope)
   return attention
Example #13
    def total_loss(outputs, targets, masks, alignments, triples_embedding, use_entities, entity_targets):
        '''
        outputs: batch_size * decoder_len * num_units
        targets: batch_size * decoder_len
        masks: batch_size * decoder_len
        alignments: [batch_size, decoder_len, triple_num, triple_len] attention coefficients
        triples_embedding: [batch_size, triple_num, triple_len, 3*num_trans_units]
        use_entities: batch_size * decoder_len, marks with 1 the response time steps that use a triple
        entity_targets: [batch_size, decoder_len, triple_num, triple_len], marks with 1 which triple of which graph is used at each time step of each batch element
        '''
        batch_size = tf.shape(outputs)[0]
        local_masks = tf.reshape(masks, [-1])
        
        logits = layers.linear(outputs, num_symbols, scope='decoder_rnn/%s' % name)  # batch_size * decoder_len * num_symbols
        one_hot_targets = tf.one_hot(targets, num_symbols)  # batch_size * decoder_len * num_symbols

        # probability that the predicted word at each step equals the target
        word_prob = tf.reduce_sum(tf.nn.softmax(logits) * one_hot_targets, axis=2)  # batch_size * decoder_len

        # coefficient weighting the triple-selection probability, gamma_t in the paper
        selector = tf.squeeze(tf.sigmoid(layers.linear(outputs, 1, scope='decoder_rnn/selector')))  # batch_size * decoder_len

        # probability mass the triple attention assigns at each step to the triple actually used
        triple_prob = tf.reduce_sum(alignments * entity_targets, axis=[2, 3])  # batch_size * decoder_len

        #
        ppx_prob = word_prob * (1 - use_entities) + triple_prob * use_entities  # batch_size * decoder_len

        # P(y_t) in the paper
        final_prob = word_prob * (1 - selector) * (1 - use_entities) + triple_prob * selector * use_entities  # batch_size * decoder_len
        final_loss = tf.reduce_sum(tf.reshape( - tf.log(1e-12 + final_prob), [-1]) * local_masks)  # value

        #
        ppx_loss = tf.reduce_sum(tf.reshape( - tf.log(1e-12 + ppx_prob), [-1]) * local_masks)  # value
        # per-sentence ppx within the batch
        sentence_ppx = tf.reduce_sum(tf.reshape(tf.reshape( - tf.log(1e-12 + ppx_prob), [-1]) * local_masks, [batch_size, -1]), axis=1)  # [batch_size]

        selector_loss = tf.reduce_sum(tf.reshape( - tf.log(1e-12 + selector * use_entities + (1 - selector) * (1 - use_entities)), [-1]) * local_masks)
            
        loss = final_loss + selector_loss
        total_size = tf.reduce_sum(local_masks)
        total_size += 1e-12  # to avoid division by 0 for all-0 weights

        # average loss per token, average ppx per token, average ppx per sentence
        return loss / total_size, ppx_loss / total_size, sentence_ppx / tf.reduce_sum(masks, axis=1)
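Reading the code above against its comments, the per-step probability it optimizes can be written out as follows (gamma_t is `selector`, q_t is `use_entities`, m_t is the mask; this is an interpretation of the code, not text from the original repository):

    P(y_t) = (1 - gamma_t) * (1 - q_t) * P_word(y_t) + gamma_t * q_t * P_triple(y_t)

    loss = sum_t m_t * ( -log(1e-12 + P(y_t)) - log(1e-12 + gamma_t * q_t + (1 - gamma_t) * (1 - q_t)) )

where the second term is the selector's binary cross-entropy against q_t, and the returned value divides this sum by the total mask count.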
Example #14
    def total_loss(outputs,  # [batch_size, decoder_len, num_units]
                   targets,  # [batch_size, decoder_len]
                   masks,  # [batch_size, decoder_len]
                   alignments,  # [batch_size, decoder_len, triple_num, triple_len]
                   triples_embedding,
                   use_entities,  # [batch_size, decoder_len] marks with 1 the response time steps that use a triple
                   entity_targets):  # [batch_size, decoder_len, triple_num, triple_len] marks with 1 which triple of which graph is used at each time step of each batch element

        batch_size = tf.shape(outputs)[0]
        local_masks = tf.reshape(masks, [-1])  # [batch_size*decoder_len]
        
        logits = layers.linear(outputs, num_symbols, scope='decoder_rnn/%s' % name)  # [batch_size, decoder_len, num_symbols]
        one_hot_targets = tf.one_hot(targets, num_symbols)  # [batch_size, decoder_len, num_symbols]

        # probability that the word predicted at each step is correct
        word_prob = tf.reduce_sum(tf.nn.softmax(logits) * one_hot_targets, axis=2)  # [batch_size, decoder_len]

        # predicted probability of using an entity word at each step
        selector = tf.squeeze(tf.sigmoid(layers.linear(outputs, 1, scope='decoder_rnn/selector')))  # [batch_size, decoder_len]

        # attention at each step on the triple actually used
        triple_prob = tf.reduce_sum(alignments * entity_targets, axis=[2, 3])  # [batch_size, decoder_len]

        # probability of being correct at each step
        ppx_prob = word_prob * (1 - use_entities) + triple_prob * use_entities  # [batch_size, decoder_len]
        # additionally weighted by the probability that the selector chooses correctly
        final_prob = word_prob * (1 - selector) * (1 - use_entities) + triple_prob * selector * use_entities  # [batch_size, decoder_len]

        # loss with the selector's choice folded in
        final_loss = tf.reduce_sum(tf.reshape(- tf.log(1e-12 + final_prob), [-1]) * local_masks)
        # ppx without the selector
        ppx_loss = tf.reduce_sum(tf.reshape( - tf.log(1e-12 + ppx_prob), [-1]) * local_masks)

        # per-sentence ppx within the batch
        sentence_ppx = tf.reduce_sum(tf.reshape(tf.reshape(-tf.log(1e-12 + ppx_prob), [-1]) * local_masks, [batch_size, -1]), axis=1)  # [batch_size]
        # selector loss
        selector_loss = tf.reduce_sum(tf.reshape(-tf.log(1e-12 + selector * use_entities + (1 - selector) * (1 - use_entities)), [-1]) * local_masks)  # [batch_size]
            
        loss = final_loss + selector_loss
        total_size = tf.reduce_sum(local_masks)
        total_size += 1e-12

        # average loss per token, average ppx per token, per-token ppx for each sample [batch_size]
        return loss / total_size, ppx_loss / total_size, sentence_ppx / tf.reduce_sum(masks, axis=1)
Example #15
def prepare_multistep_attention(encoder_states,
                                decoder_reprs,
                                kd_states1,
                                kd_states2,
                                attention_option,
                                num_units,
                                reuse=False):
    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attn_keys", reuse=reuse) as scope:
        attention_keys1 = layers.linear(encoder_states,
                                        num_units,
                                        biases_initializer=None,
                                        scope=scope)
        attention_values1 = encoder_states
        # Attention scoring function
        attention_score_fn1 = _create_attention_score_fn(
            "attn_score", num_units, attention_option, reuse)

    with variable_scope.variable_scope("attn_reprs", reuse=reuse) as scope:
        if decoder_reprs is not None:
            attention_keys2 = layers.linear(decoder_reprs,
                                            num_units,
                                            biases_initializer=None,
                                            scope=scope)
        else:
            attention_keys2 = None
        attention_values2 = decoder_reprs
        # Attention scoring function
        attention_score_fn2 = _create_attention_score_fn(
            "attn_score", num_units, attention_option, reuse)

    attention_keys = (attention_keys1, attention_keys2)
    if kd_states1 is not None and kd_states2 is not None:
        attention_values = (attention_values1, attention_values2, kd_states1,
                            kd_states2)
    else:
        attention_values = (attention_values1, attention_values2, None, None)
    attention_score_fn = (attention_score_fn1, attention_score_fn2)

    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn(
        "attn_construct_multi", num_units, attention_score_fn, reuse)

    return attention_keys, attention_values, attention_construct_fn
Example #16
 def inference_fn(inference_output):
     with tf.variable_scope("decoder_rnn"):
         inference_softmaxed_probability = tf.nn.softmax(
             layers.linear(inference_output,
                           vocabulary_count,
                           scope="projection_layer")
         )  # softmaxed probabilities over the vocabulary [batch_size, decoder_len, vocabulary_count]
         inference_maximum_likelihood_id = tf.argmax(
             inference_softmaxed_probability, axis=2)
         return inference_maximum_likelihood_id
Example #17
        def construct_fn(attention_query, attention_keys, attention_values):
            context, alignments = attention_score_fn(attention_query,
                                                     attention_keys,
                                                     attention_values)

            concat_input = array_ops.concat([attention_query, context], 1)
            concat_input = array_ops.reshape(concat_input, [-1, 1024])
            attention = layers.linear(concat_input,
                                      num_units,
                                      biases_initializer=None,
                                      scope=scope)
            return attention, alignments
Example #18
 def project_fn(input):
     '''
     If input is [batch_size, num_units], a weight matrix W of shape [num_units, vocabulary_count] is created and
     output = input * W has shape [batch_size, vocabulary_count].
     If input is [batch_size, decoder_len, num_units], then output is [batch_size, decoder_len, vocabulary_count].
     '''
     output = layers.linear(
         input, vocabulary_count,
         scope="projection_layer")  # batch_size * vocabulary_count
     softmaxed_probability = tf.nn.softmax(
         output)  # batch_size * vocabulary_count
     return softmaxed_probability
Example #19
        def construct_fn(attention_query, attention_keys, attention_values):
            '''Concatenate the attention contexts.
            Returns:
                attention: [batch_size, num_units] output obtained by concatenating the attention contexts and applying one linear transformation
                alignments: attention coefficients
            '''
            alignments = None
            # if there is a static graph or triples
            if type(attention_score_fn) is tuple:
                # attention over the encoder outputs at each step
                context0 = attention_score_fn[0](attention_query,
                                                 attention_keys[0],
                                                 attention_values[0])
                # if there is only a static graph
                if len(attention_keys) == 2:
                    context1 = attention_score_fn[1](attention_query,
                                                     attention_keys[1],
                                                     attention_values[1])
                # if there are both a static graph and triples
                elif len(attention_keys) == 3:
                    context1 = attention_score_fn[1](attention_query,
                                                     attention_keys[1:],
                                                     attention_values[1:])

                if type(context1) is tuple:
                    # only a static graph, and alignments are requested
                    if len(context1) == 2:
                        context1, alignments = context1
                        concat_input = array_ops.concat(
                            [attention_query, context0, context1], 1)
                    # both a static graph and triples
                    elif len(context1) == 3:
                        context1, context2, alignments = context1
                        concat_input = array_ops.concat(
                            [attention_query, context0, context1, context2], 1)
                else:  # a static graph with no triples, and no graph alignments requested
                    concat_input = array_ops.concat(
                        [attention_query, context0, context1], 1)
            # if there is no static graph and no triples
            else:
                context = attention_score_fn(attention_query, attention_keys,
                                             attention_values)
                concat_input = array_ops.concat([attention_query, context], 1)
            # apply a linear transformation to the concatenated attention [batch_size, num_units]
            attention = layers.linear(concat_input,
                                      num_units,
                                      biases_initializer=None,
                                      scope=scope)
            if alignments is None:
                return attention
            else:
                return attention, alignments
Example #20
    def total_loss(outputs, targets, masks, alignments, triples_embedding, use_entities, entity_targets):
        local_masks = tf.reshape(masks, [-1])
        
        logits = layers.linear(outputs, num_symbols, scope='decoder_rnn/%s' % name)
        one_hot_targets = tf.one_hot(targets, num_symbols)
        word_prob = tf.reduce_sum(tf.nn.softmax(logits) * one_hot_targets, axis=2)
        selector = tf.squeeze(tf.sigmoid(layers.linear(outputs, 1, scope='decoder_rnn/selector')))

        triple_prob = tf.reduce_sum(alignments * entity_targets, axis=[2])
        cast_selector = tf.cast(tf.reduce_sum(alignments, axis=2) > tf.reduce_sum(tf.nn.softmax(logits), axis=2), tf.float32)
        final_prob = word_prob * (1 - selector) + triple_prob * selector 
        ppx_prob = word_prob * (1 - use_entities) + triple_prob * use_entities
        final_loss = tf.reshape( - tf.log(1e-12 + final_prob), [-1]) * local_masks
        ppx_loss = tf.reshape( - tf.log(1e-12 + ppx_prob), [-1]) * local_masks
        sentence_ppx = tf.reduce_sum( - tf.log(1e-12 + ppx_prob) * masks, axis=1)
            
        loss = tf.reduce_sum(final_loss)
        #loss = tf.Print(loss, ['use_entity', tf.reduce_min(use_entities), tf.reduce_max(use_entities), 'triple_prob',tf.reduce_min(triple_prob), 'word_prob', tf.reduce_min(word_prob), 'final_prob', tf.reduce_min(final_prob), 'final_loss', tf.reduce_min(final_loss)], summarize=1e6)
        total_size = tf.reduce_sum(local_masks)
        total_size += 1e-12 # to avoid division by 0 for all-0 weights
        
        return loss / total_size, tf.reduce_sum(ppx_loss) / total_size, sentence_ppx / tf.reduce_sum(masks, axis=1)
Example #21
    def total_loss(outputs, targets, masks, alignments, triples_embedding, use_entities, entity_targets):
        batch_size = tf.shape(outputs)[0]
        local_masks = tf.reshape(masks, [-1])
        
        logits = layers.linear(outputs, num_symbols, scope='decoder_rnn/%s' % name)
        one_hot_targets = tf.one_hot(targets, num_symbols)
        word_prob = tf.reduce_sum(tf.nn.softmax(logits) * one_hot_targets, axis=2)
        selector = tf.squeeze(tf.sigmoid(layers.linear(outputs, 1, scope='decoder_rnn/selector')))

        triple_prob = tf.reduce_sum(alignments * entity_targets, axis=[2, 3])
        ppx_prob = word_prob * (1 - use_entities) + triple_prob * use_entities
        final_prob = word_prob * (1 - selector) * (1 - use_entities) + triple_prob * selector * use_entities
        final_loss = tf.reduce_sum(tf.reshape( - tf.log(1e-12 + final_prob), [-1]) * local_masks)
        ppx_loss = tf.reduce_sum(tf.reshape( - tf.log(1e-12 + ppx_prob), [-1]) * local_masks)
        sentence_ppx = tf.reduce_sum(tf.reshape(tf.reshape( - tf.log(1e-12 + ppx_prob), [-1]) * local_masks, [batch_size, -1]), axis=1)
        selector_loss = tf.reduce_sum(tf.reshape( - tf.log(1e-12 + selector * use_entities + (1 - selector) * (1 - use_entities)), [-1]) * local_masks)
            
        loss = final_loss + selector_loss
        total_size = tf.reduce_sum(local_masks)
        total_size += 1e-12 # to avoid division by 0 for all-0 weights
        
        return loss / total_size, ppx_loss / total_size, sentence_ppx / tf.reduce_sum(masks, axis=1)
Example #22
        def construct_fn(attention_query, attention_keys, attention_values):
            '''Concatenate the computed contexts.

            Returns:
                attention: [batch_size, num_units] concatenation of the attention contexts, followed by one linear transformation
                alignments: attention coefficients
            '''
            alignments = None
            # if there are graph vectors and triple embeddings
            if type(attention_score_fn) is tuple:
                # compute the attention context with bahdanau scoring
                context0 = attention_score_fn[0](attention_query,
                                                 attention_keys[0],
                                                 attention_values[0])
                # training does not take this branch
                if len(attention_keys) == 2:
                    context1 = attention_score_fn[1](attention_query,
                                                     attention_keys[1],
                                                     attention_values[1])
                # if there are both graph vectors and triple embeddings
                elif len(attention_keys) == 3:
                    context1 = attention_score_fn[1](attention_query,
                                                     attention_keys[1:],
                                                     attention_values[1:])

                if type(context1) is tuple:
                    if len(context1) == 2:
                        context1, alignments = context1
                        concat_input = array_ops.concat(
                            [attention_query, context0, context1], 1)
                    elif len(context1) == 3:  # the branch taken during training
                        context1, context2, alignments = context1
                        concat_input = array_ops.concat(
                            [attention_query, context0, context1, context2], 1
                        )  # [batch_size, num_units*4] concatenation of decoder output, context, graph context, and triple context
                else:
                    concat_input = array_ops.concat(
                        [attention_query, context0, context1], 1)
            else:
                context = attention_score_fn(attention_query, attention_keys,
                                             attention_values)
                concat_input = array_ops.concat([attention_query, context], 1)
            # apply a linear transformation to the concatenation, mapping the last dimension to num_units
            attention = layers.linear(concat_input,
                                      num_units,
                                      biases_initializer=None,
                                      scope=scope)  # [batch_size, num_units]
            if alignments is None:
                return attention
            else:
                return attention, alignments
Example #23
        def construct_fn(attention_query, attention_keys, attention_values):
            if isinstance(attention_score_fn, tuple):  # multi-step decoding
                attention_score_fn1, attention_score_fn2 = attention_score_fn
                attention_keys1, attention_keys2 = attention_keys
                attention_values1, decoder_reprs, kd_states1, kd_states2 = attention_values
                context1 = attention_score_fn1(attention_query,
                                               attention_keys1,
                                               attention_values1)
                if kd_states1 is None or kd_states2 is None:
                    context2 = attention_score_fn2(attention_query,
                                                   attention_keys2,
                                                   decoder_reprs)
                    concat_input = array_ops.concat(
                        [attention_query, context1, context2], 1)
                else:
                    if decoder_reprs is None:
                        print("concat=3")
                        concat_input = array_ops.concat([
                            attention_query, context1, kd_states1, kd_states2
                        ], 1)
                    else:
                        print("concat=4")
                        context2 = attention_score_fn2(attention_query,
                                                       attention_keys2,
                                                       decoder_reprs)
                        concat_input = array_ops.concat([
                            attention_query, context1, context2, kd_states1,
                            kd_states2
                        ], 1)
            else:  # only one step decoding
                if isinstance(attention_values, tuple):
                    attention_values1, kd_state = attention_values
                    context1 = attention_score_fn(attention_query,
                                                  attention_keys,
                                                  attention_values1)
                    concat_input = array_ops.concat(
                        [attention_query, context1, kd_state], 1)
                else:
                    context = attention_score_fn(attention_query,
                                                 attention_keys,
                                                 attention_values)
                    concat_input = array_ops.concat([attention_query, context],
                                                    1)

            attention = layers.linear(concat_input,
                                      num_units,
                                      biases_initializer=None,
                                      scope=scope)
            return attention
Example #24
 def sequence_loss(outputs, targets, masks):
     with variable_scope.variable_scope('decoder_rnn'):
         logits = layers.linear(outputs, num_symbols, scope=name)  # [batch_size, decoder_len, num_symbols]
         logits = tf.reshape(logits, [-1, num_symbols])  # [batch_size*decoder_len, num_symbols]
         local_labels = tf.reshape(targets, [-1])  # [batch_size*decoder_len]
         local_masks = tf.reshape(masks, [-1])  # [batch_size*decoder_len]
         
         local_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=local_labels, logits=logits)
         local_loss = local_loss * local_masks  # positions beyond the sequence length contribute no loss
         
         loss = tf.reduce_sum(local_loss)  # total loss over the sequence
         total_size = tf.reduce_sum(local_masks)  # total number of unmasked tokens
         total_size += 1e-12  # avoid a total length of 0
         
         return loss / total_size  # return the average loss
Example #25
 def construct_fn(attention_query, attention_keys, attention_values):
     context, scores, alignments = attention_score_fn(
         attention_query, attention_keys, attention_values)
     concat_input = array_ops.concat([attention_query, context], 1)
     # NOTE: the scope passed here is defined outside construct_fn, so the variables are not affected
     # by the scope in effect at call time: they live under e.g. seq2seq/main/decode/attention_construct/weights
     # in rnn_decoder.py rather than seq2seq/main/decode/rnn/loop_function/weights in beam_decoder.py.
     attention = layers.linear(concat_input,
                               num_units,
                               biases_initializer=None,
                               scope=scope)
     # Reuse makes this safe to call repeatedly inside a loop; if it is only used in dynamic decode,
     # reuse is unnecessary, since the linear above would then be called only once.
     # Creating the variables on the first call and then enabling reuse is a simple, safe pattern:
     # http://stackoverflow.com/questions/38545362/tensorflow-variable-scope-reuse-if-variable-exists
     scope.reuse_variables()
     return attention, scores, alignments
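The comments above are about calling the same linear projection on every step of a decoding loop: without reuse, the second call would try to create the variables a second time and fail. A minimal sketch of the reuse pattern in plain TF 1.x ops (illustrative names, not the module's code):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 8])

with tf.variable_scope("attention_construct") as scope:
    w = tf.get_variable("weights", [8, 4])                  # created on the first call
    y1 = tf.matmul(x, w)
    scope.reuse_variables()                                  # later get_variable calls reuse it
    y2 = tf.matmul(x, tf.get_variable("weights", [8, 4]))    # same variable, no new parameters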
Example #26
 def sequence_loss(outputs, targets, masks):
     with variable_scope.variable_scope('decoder_rnn'):
         logits = layers.linear(outputs, num_symbols, scope=name)
         logits = tf.reshape(logits, [-1, num_symbols])
         local_labels = tf.reshape(targets, [-1])
         local_masks = tf.reshape(masks, [-1])
         
         local_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=local_labels, logits=logits)
         local_loss = local_loss * local_masks
         
         loss = tf.reduce_sum(local_loss)
         total_size = tf.reduce_sum(local_masks)
         total_size += 1e-12 # to avoid division by 0 for all-0 weights
         
         return loss / total_size
Example #27
 def attention_construct_fn(
     query,  # decoder output [batch_size, num_units]
     keys,  # [batch_size, encoder_len, num_units]
     values):  # [batch_size, encoder_len, num_units]
     alignments = None
     context = attention_score_fn(query, keys, values)
     if type(context) is tuple:
         context, alignments = context
     concat_input = tf.concat([query, context], axis=1)
     attention = layers.linear(concat_input,
                               num_units,
                               biases_initializer=None,
                               scope=scope)  # [batch_size, num_units]
     if alignments is None:
         return attention  # [batch_size, num_units]
     else:
         return attention, alignments
Example #28
 def loss_fn(decoder_output, label_id, mask):
     with tf.variable_scope("decoder_rnn"):
         # sparse_softmax_cross_entropy_with_logits applies softmax itself,
         # so the projection output is passed in as raw logits (no tf.nn.softmax here).
         projection = layers.linear(decoder_output,
                                    vocabulary_count,
                                    scope="projection_layer")
         logits = tf.reshape(
             projection,
             [-1, vocabulary_count
              ])  # [batch_size*decoder_len, vocabulary_count]
         labels = tf.reshape(label_id, [-1])  # [batch_size*decoder_len]
         label_mask = tf.reshape(mask, [-1])  # [batch_size*decoder_len]
         local_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
             labels=labels, logits=logits)  # [batch_size*decoder_len]
         total_size = tf.reduce_sum(label_mask)
         total_size += 1e-12
         loss = tf.reduce_sum(local_loss * label_mask)
         avg_loss = loss / total_size
         return loss, avg_loss
Example #29
def prepare_attention(attention_states,
                      attention_option,
                      num_units,
                      reuse=False):
    """Prepare keys/values/functions for attention.

  Args:
    attention_states: hidden states to attend over.
    attention_option: how to compute attention, either "luong" or "bahdanau".
    num_units: hidden state dimension.
    reuse: whether to reuse variable scope.

  Returns:
    attention_keys: to be compared with target states.
    attention_values: to be used to construct context vectors.
    attention_score_fn: to compute similarity between key and target states.
    attention_construct_fn: to build attention states.
  """

    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(attention_states,
                                       num_units,
                                       biases_initializer=None,
                                       scope=scope)
        # No scope reuse is needed here: attention_keys is computed once and reused as a tensor,
        # so this linear is never called again.
        # attention_construct_fn is different, since that function is called at every step of the decoding loop.

    attention_values = attention_states

    # Attention score function
    attention_score_fn = _create_attention_score_fn("attention_score",
                                                    num_units,
                                                    attention_option, reuse)

    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn(
        "attention_construct", num_units, attention_score_fn, reuse)

    return (attention_keys, attention_values, attention_score_fn,
            attention_construct_fn)
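Given the docstring above, a hedged sketch of how the four returned objects are typically wired into a decoder step; it assumes prepare_attention and its private helpers are importable, and all names and shapes below are illustrative:

import tensorflow as tf

num_units = 128
encoder_outputs = tf.placeholder(tf.float32, [None, None, num_units])  # [batch, encoder_len, units]
cell_output = tf.placeholder(tf.float32, [None, num_units])            # one decoder step's output

keys, values, score_fn, construct_fn = prepare_attention(
    encoder_outputs, "bahdanau", num_units)

attention = construct_fn(cell_output, keys, values)   # [batch_size, num_units], fed to the next step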
Example #30
def prepare_attention(attention_states,
                      attention_len,
                      attention_option,
                      num_units,
                      reuse=False):
    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(
            attention_states, num_units, biases_initializer=None, scope=scope)
    attention_values = attention_states

    # Attention score function
    attention_score_fn = _create_attention_score_fn(
        "attention_score", attention_len, num_units, attention_option, reuse)

    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn(
        "attention_construct", num_units, attention_score_fn, reuse)

    return (attention_keys, attention_values, attention_score_fn,
            attention_construct_fn)
Example #31
def prepare_attention(attention_states,
                      attention_option,
                      num_units,
                      reuse=False):
    """Prepare keys/values/functions for attention.

  Args:
    attention_states: hidden states to attend over.
    attention_option: how to compute attention, either "luong" or "bahdanau".
    num_units: hidden state dimension.
    reuse: whether to reuse variable scope.

  Returns:
    attention_keys: to be compared with target states.
    attention_values: to be used to construct context vectors.
    attention_score_fn: to compute similarity between key and target states.
    attention_construct_fn: to build attention states.
  """

    # Prepare attention keys / values from attention_states
    with variable_scope.variable_scope("attention_keys", reuse=reuse) as scope:
        attention_keys = layers.linear(attention_states,
                                       num_units,
                                       biases_initializer=None,
                                       scope=scope)
        #attention_keys = tf.Print(attention_keys, ["attention_keys", tf.shape(attention_states), tf.shape(attention_keys), attention_keys])
    attention_values = attention_states

    # Attention score function
    attention_score_fn = _create_attention_score_fn("attention_score",
                                                    num_units,
                                                    attention_option, reuse)

    # Attention construction function
    attention_construct_fn = _create_attention_construct_fn(
        "attention_construct", num_units, attention_score_fn, reuse)

    return (attention_keys, attention_values, attention_score_fn,
            attention_construct_fn)
Example #32
def prepare_attention(attention_states,
                      attention_option,
                      num_units,
                      reuse=False):
  """Prepare keys/values/functions for attention.

  Args:
    attention_states: hidden states to attend over.
    attention_option: how to compute attention, either "luong" or "bahdanau".
    num_units: hidden state dimension.
    reuse: whether to reuse variable scope.

  Returns:
    attention_keys: to be compared with target states.
    attention_values: to be used to construct context vectors.
    attention_score_fn: to compute similarity between key and target states.
    attention_construct_fn: to build attention states.
  """

  # Prepare attention keys / values from attention_states
  with variable_scope.variable_scope("attention_keys", reuse=reuse) as scope:
    attention_keys = layers.linear(
        attention_states, num_units, biases_initializer=None, scope=scope)
  attention_values = attention_states

  # Attention score function
  attention_score_fn = _create_attention_score_fn("attention_score", num_units,
                                                  attention_option, reuse)

  # Attention construction function
  attention_construct_fn = _create_attention_construct_fn("attention_construct",
                                                          num_units,
                                                          attention_score_fn,
                                                          reuse)

  return (attention_keys, attention_values, attention_score_fn,
          attention_construct_fn)
Example #33
    def attention(query):
      """Point on hidden using hidden_features and query."""
      with vs.variable_scope("Attention"):
        v = vs.get_variable("AttnV", [num_units])
        #[batch_size, num_units] -> [batch_size, num_units]
        processed_query =  layers.linear(query, num_units, scope="query_layer")
        #processed_query =  layers_core.dense(query, num_units, use_bias=False, name="query_layer")
        #->[batch_size, 1, num_units]
        processed_query = tf.expand_dims(processed_query, 1)
        #[batch_size, attn_length, num_units] + [batch_size, 1, num_units] -> [batch_size, attn_length, num_units] 
        #reduce_sum -> [batch_size, attn_length]
        scores = math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query), [2])
        
        if feed_prev:
          alignments = nn_ops.softmax(scores)
          #-> [batch_size, attn_length, 1]
          alignments = array_ops.expand_dims(alignments, 2)
        
          #context_vector = math_ops.reduce_sum(alignments * values, [1])

          #return scores, alignments, context_vector
          return scores, alignments
        else:
          return scores
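For reference, the score computed above is the additive (Bahdanau) form: project the query, add it to the pre-projected keys, squash with tanh, and take a dot product with the learned vector v. A standalone shape sketch in plain TF 1.x ops (dimensions are illustrative):

import tensorflow as tf

batch_size, attn_length, num_units = 4, 7, 16
keys = tf.placeholder(tf.float32, [batch_size, attn_length, num_units])  # pre-projected encoder states
query = tf.placeholder(tf.float32, [batch_size, num_units])              # current decoder state

v = tf.get_variable("AttnV", [num_units])
wq = tf.get_variable("query_layer", [num_units, num_units])

processed_query = tf.expand_dims(tf.matmul(query, wq), 1)                 # [batch_size, 1, num_units]
scores = tf.reduce_sum(v * tf.tanh(keys + processed_query), [2])          # [batch_size, attn_length]
alignments = tf.nn.softmax(scores)                                        # attention weight per source position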