Example No. 1
def tf_reshape_box(true_xy_A: tf.Tensor, true_wh_A: tf.Tensor,
                   p_xy_A: tf.Tensor, p_wh_A: tf.Tensor, layer: int,
                   helper: Helper) -> tuple:
    """ reshape the xywh to [?,h,w,anchor_nums,true_box_nums,2]
        NOTE  must use obj mask in atrue xywh !
    Parameters
    ----------
    true_xy_A : tf.Tensor
        shape will be [true_box_nums,2]

    true_wh_A : tf.Tensor
        shape will be [true_box_nums,2]

    p_xy_A : tf.Tensor
        shape will be [?,h,w,anchor_nums,2]

    p_wh_A : tf.Tensor
        shape will be [?,h,w,anchor_nums,2]

    layer : int

    helper : Helper


    Returns
    -------
    tuple
        true_cent, true_box_wh, pred_cent, pred_box_wh
    """
    with tf.name_scope('reshape_box_%d' % layer):
        true_cent = true_xy_A[tf.newaxis, tf.newaxis, tf.newaxis, tf.newaxis,
                              ...]
        true_box_wh = true_wh_A[tf.newaxis, tf.newaxis, tf.newaxis, tf.newaxis,
                                ...]

        true_cent = tf.tile(true_cent, [
            helper.batch_size, helper.out_hw[layer][0],
            helper.out_hw[layer][1], helper.anchor_number, 1, 1
        ])
        true_box_wh = tf.tile(true_box_wh, [
            helper.batch_size, helper.out_hw[layer][0],
            helper.out_hw[layer][1], helper.anchor_number, 1, 1
        ])

        pred_cent = p_xy_A[..., tf.newaxis, :]
        pred_box_wh = p_wh_A[..., tf.newaxis, :]
        pred_cent = tf.tile(pred_cent, [1, 1, 1, 1, tf.shape(true_xy_A)[0], 1])
        pred_box_wh = tf.tile(
            pred_box_wh, [1, 1, 1, 1, tf.shape(true_wh_A)[0], 1])

    return true_cent, true_box_wh, pred_cent, pred_box_wh
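For a sense of the shapes involved, here is a minimal usage sketch, assuming the snippet above has tensorflow imported as tf and its Helper type available; the SimpleNamespace is a hypothetical stand-in for Helper (batch_size, out_hw and anchor_number are the only fields the function reads), and all sizes are made up:

import tensorflow as tf
from types import SimpleNamespace

# Hypothetical stand-in for Helper: only the attributes used above are provided.
helper = SimpleNamespace(batch_size=2, out_hw=[(13, 13)], anchor_number=3)

true_xy = tf.random.uniform([5, 2])           # 5 ground-truth boxes, (x, y)
true_wh = tf.random.uniform([5, 2])
p_xy = tf.random.uniform([2, 13, 13, 3, 2])   # per-cell, per-anchor predictions
p_wh = tf.random.uniform([2, 13, 13, 3, 2])

true_cent, true_box_wh, pred_cent, pred_box_wh = tf_reshape_box(
    true_xy, true_wh, p_xy, p_wh, 0, helper)
# All four now have shape (2, 13, 13, 3, 5, 2), so a pairwise IoU can be taken
# between every predicted box and every ground-truth box.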
Example No. 2
def split_targets(y_true: Tensor, y_pred: Tensor,
                  method: Method) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """
    Split concatenated hard targets / logits and hard predictions / soft predictions.

    :param y_true: tensor with the true labels.
    :param y_pred: tensor with the predicted labels.
    :param method: the method used to transfer the knowledge.
    :return: the concatenated logits, soft predictions, hard targets and hard predictions
    (teacher_logits, student_output, y_true, y_pred).
    """
    # Here we get the split point, which is half of the prediction dimension.
    # This works because the network's output contains the predicted values
    # concatenated with the predicted logits, which always have the same dimension.
    split_point = cast(divide(shape(y_true)[1], 2), int32)
    # Get hard labels and logits.
    y_true, teacher_logits = y_true[:, :split_point], y_true[:, split_point:]

    if method == Method.DISTILLATION or method == Method.PKT_PLUS_DISTILLATION:
        y_pred, student_output = y_pred[:, :split_point], y_pred[:,
                                                                 split_point:]
    else:
        student_output = identity(y_pred)

    return teacher_logits, student_output, y_true, y_pred
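For context, the concatenated layout that split_targets expects can be built directly. A small sketch, assuming the bare shape/divide/cast/identity names in the function are TensorFlow ops imported at module level and that Method.DISTILLATION exists as used above; all sizes are made up:

import tensorflow as tf

num_classes = 10
hard_targets = tf.one_hot([3, 7], num_classes)               # (2, 10)
teacher_logits = tf.random.normal([2, num_classes])          # same width as the targets
y_true = tf.concat([hard_targets, teacher_logits], axis=1)   # (2, 20)

student_probs = tf.nn.softmax(tf.random.normal([2, num_classes]))
student_logits = tf.random.normal([2, num_classes])
y_pred = tf.concat([student_probs, student_logits], axis=1)  # same 2x-width layout

logits, student_out, hard_true, hard_pred = split_targets(
    y_true, y_pred, Method.DISTILLATION)                     # each comes back as (2, 10)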
Example No. 3
def create_label(click_position, num_labels=10):
    num_rows = shape(click_position)[0]
    row_idx = expand_dims(range(num_rows), axis=1)
    idx = concatenate([row_idx, cast(click_position, int32)], axis=1)
    labels = SparseTensor(indices=cast(idx, int64),
                          values=ones([num_rows]),
                          dense_shape=[num_rows, num_labels])
    return ones([num_rows, num_labels]) - to_dense(labels)
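create_label therefore returns a [num_rows, num_labels] matrix of ones with a zero at each row's clicked position. A quick equivalent in plain tf.* calls (the bare shape/range/ones/... names above are assumed to be the TensorFlow ops imported at module level):

import tensorflow as tf

clicks = tf.constant([[2], [0]])                         # clicked position per row
mask = 1.0 - tf.one_hot(tf.squeeze(clicks, axis=1), depth=4)
# [[1. 1. 0. 1.]
#  [0. 1. 1. 1.]]  -- same result as create_label(clicks, num_labels=4)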
Example No. 4
def noise_label(labels):
    id = range(shape(labels['click_position'])[0])
    idx = concatenate(
        [expand_dims(cast(id, int64), axis=1), labels['click_position']],
        axis=1)
    clicked_item = gather_nd(labels['reco'], idx)
    return cast(equal(expand_dims(clicked_item, axis=1), labels['reco']),
                float32)
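noise_label looks up the item that was actually clicked and flags every slot of reco that holds the same item id. A worked example with made-up ids, using explicit tf.* calls for the gather step:

import tensorflow as tf

labels = {
    'click_position': tf.constant([[1], [2]], dtype=tf.int64),   # [batch, 1]
    'reco': tf.constant([[5, 7, 5], [9, 8, 9]], dtype=tf.int64),
}
row_ids = tf.constant([[0], [1]], dtype=tf.int64)
clicked = tf.gather_nd(labels['reco'],
                       tf.concat([row_ids, labels['click_position']], axis=1))
# clicked == [7, 9]; comparing against reco as in noise_label gives
# [[0. 1. 0.]
#  [1. 0. 1.]]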
Example No. 5
def test_ficken(self):
    labels = {'click_position': [1, 2], 'reco': [[0, 1, 2], [2, 1, 0]]}
    id = range(shape(labels['click_position'])[0])
    idx = concatenate([
        expand_dims(cast(id, int64), axis=1),
        expand_dims(cast(labels['click_position'], int64), axis=1)
    ],
                      axis=1)
    clicked_item = gather_nd(labels['reco'], idx)
    with self.test_session():
        print(clicked_item.eval())
def attention_decoder(decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      output_size=None,
                      num_heads=1,
                      loop_function=None,
                      dtype=None,
                      scope=None,
                      initial_state_attention=False):
  """RNN decoder with attention for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element
        of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        First, we run the cell on a combination of the input and previous
        attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      of attention_states are not set, or input size cannot be inferred
      from the input.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if attention_states.get_shape()[2].value is None:
    raise ValueError("Shape[2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with variable_scope.variable_scope(
      scope or "attention_decoder", dtype=dtype) as scope:
    dtype = scope.dtype

    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    if attn_length is None:
      attn_length = shape(attention_states)[1]
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_%d" % a,
                                      [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(
          variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

    state = initial_state

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      if nest.is_sequence(query):  # If the query is a tuple, flatten it.
        query_list = nest.flatten(query)
        for q in query_list:  # Check that ndims == 2 if specified.
          ndims = q.get_shape().ndims
          if ndims:
            assert ndims == 2
        query = array_ops.concat(1, query_list)
      for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a):
          y = linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
          a = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])
    if initial_state_attention:
      attns = attention(initial_state)
    for i, inp in enumerate(decoder_inputs):
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          inp = loop_function(prev, i)
      # Merge input and previous attentions into one vector of the right size.
      input_size = inp.get_shape().with_rank(2)[1]
      if input_size.value is None:
        raise ValueError("Could not infer input size from input: %s" % inp.name)
      x = linear([inp] + attns, input_size, True)
      # Run the RNN.
      cell_output, state = cell(x, state)
      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)

      with variable_scope.variable_scope("AttnOutputProjection"):
        output = linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        prev = output
      outputs.append(output)

  return outputs, state
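The attention formula in the docstring, new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)), can be written out on its own. Below is a minimal single-head sketch in plain TensorFlow 2 ops; the shapes and variable names are illustrative, not the ones created inside the decoder above:

import tensorflow as tf

batch, attn_length, attn_size = 4, 7, 32

attention_states = tf.random.normal([batch, attn_length, attn_size])
query = tf.random.normal([batch, attn_size])                # decoder state s_t

W = tf.Variable(tf.random.normal([attn_size, attn_size]))   # applied to encoder states
U = tf.Variable(tf.random.normal([attn_size, attn_size]))   # applied to the query
v = tf.Variable(tf.random.normal([attn_size]))

# u_t = v^T tanh(W h_i + U s_t): one score per encoder position.
scores = tf.reduce_sum(
    v * tf.tanh(tf.einsum('bld,de->ble', attention_states, W)
                + tf.expand_dims(query @ U, 1)),
    axis=-1)                                                 # [batch, attn_length]
weights = tf.nn.softmax(scores)                              # the attention mask
context = tf.reduce_sum(tf.expand_dims(weights, -1) * attention_states, axis=1)
# The decoder above then feeds this context vector into the output projection.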
Example No. 7
def kv_attention_decoder(cell,
                         decoder_inputs,
                         kb_inputs,
                         kb_mask_inputs,
                         initial_state,
                         attention_states,
                         num_decoder_symbols,
                         embedding_size,
                         output_size,
                         output_projection=None,
                         feed_previous=False,
                         attn_type="linear",
                         enc_attn=False,
                         enc_query=False,
                         scope=None,
                         dtype=None):
    """
    Run decoding, which includes attention over both the encoder states and the KB
    :param cell:
    :param decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs)
    :param kb_inputs: Tensor containing the KB to be used for decoding
    :param kb_mask_inputs: Tensor containing KB masks used to zero out PAD embeddings in the KB
    :param initial_state: Initial encoder state fed into the decoder
    :param attention_states: Embedded encoder attention states (batch_size, attn_length, attn_size)
    :param num_decoder_symbols: Vocab size for decoding
    :param embedding_size: Size of embedding vector
    :param output_size: Size of output vectors
    :param output_projection:
    :param feed_previous:
    :param scope:
    :param dtype:
    :return:
    """
    if output_projection is not None:
        proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with(
            [num_decoder_symbols])

    with variable_scope.variable_scope(scope or "kb_attention_decoder",
                                       dtype=dtype) as scope:
        embedding = variable_scope.get_variable(
            "embedding", [num_decoder_symbols, embedding_size])
        loop_function = _extract_argmax_and_embed(
            embedding, output_projection) if feed_previous else None
        emb_inp = [
            embedding_ops.embedding_lookup(embedding, i)
            for i in decoder_inputs
        ]
        # Needed for reshaping.
        batch_size = array_ops.shape(decoder_inputs[0])[0]
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value
        # To calculate W1 * h_t we use a 1-by-1 convolution, need to
        # reshape before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        # Size of query vectors for attention.
        attention_vec_size = attn_size

        if attn_type == "linear" or attn_type == "two-mlp":
            k = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attention_vec_size])
            hidden_features.append(
                nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(variable_scope.get_variable("AttnV",
                                                 [attention_vec_size]))

        # Initialize mask embedding table
        np_mask = np.array([[0.] * embedding_size, [1.] * embedding_size])
        embedding_mask = variable_scope.get_variable(
            "embedding_mask", [2, embedding_size],
            initializer=tf.constant_initializer(np_mask),
            trainable=False)
        embedded_kb_mask_batch = tf.nn.embedding_lookup(
            embedding_mask, kb_mask_inputs)
        # Mask for zeroing out attns over PAD tokens
        kb_attn_mask = tf.cast(kb_mask_inputs[:, :, 0, 0], tf.float32)
        # Embed kb
        embedded_kb_batch = tf.nn.embedding_lookup(embedding, kb_inputs)
        embedded_kb_batch = embedded_kb_batch * embedded_kb_mask_batch

        embedded_kb_batch = math_ops.reduce_sum(embedded_kb_batch, [3])
        # Split into value, type tensors
        num_triples = embedded_kb_batch.get_shape()[1].value

        embedded_kb_key = embedded_kb_batch[:, :, :2, :]
        # Summing head + relation
        embedded_kb_key = math_ops.reduce_sum(embedded_kb_key, [2])

        # Dim: (?, num_triples,)
        value_idx = kb_inputs[:, :, 3, 0]

        # Query will usually be of (batch_size, rnn_size)
        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            # Results of attention reads will be stored here.
            ds = []
            # Will store masks over encoder context
            attn_masks = []
            # Store attention logits
            attn_logits = []
            # If the query is a tuple (LSTMStateTuple), flatten it.
            if nest.is_sequence(query):
                query_list = nest.flatten(query)
                # Check that ndims == 2 if specified.
                for q in query_list:
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, axis=1)
            with variable_scope.variable_scope("Attention"):
                if attn_type == "linear":
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v[0] * math_ops.tanh(hidden_features[0] + y), [2, 3])
                elif attn_type == "bilinear":
                    query = tf.tile(tf.expand_dims(query, 1),
                                    [1, attn_length, 1])
                    query = batch_linear(query, attn_size, bias=True)
                    hid = tf.squeeze(hidden, [2])
                    s = tf.reduce_sum(tf.matmul(query, hid), [2])
                else:
                    # Two layer MLP
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    layer1 = math_ops.tanh(hidden_features[0] + y)
                    k2 = variable_scope.get_variable(
                        "AttnW_1", [1, 1, attn_size, attention_vec_size])
                    layer2 = nn_ops.conv2d(layer1, k2, [1, 1, 1, 1], "SAME")
                    s = math_ops.reduce_sum(v[0] * math_ops.tanh(layer2),
                                            [2, 3])

                a = nn_ops.softmax(s)
                attn_masks.append(a)
                attn_logits.append(s)
                # Now calculate the attention-weighted vector d.
                # Hidden is encoder hidden states
                d = math_ops.reduce_sum(
                    array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                    [1, 2])
                ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds, attn_masks, attn_logits

        def attention_kb_triple(query):
            """
            Compute attention over kb triples given decoder hidden state as a query
            :param query:
            :return:
            """
            # Expand dims so can concatenate with embedded_key
            with variable_scope.variable_scope("Attention_KB_Triple"):
                if attn_type == "two-mlp":
                    query = tf.expand_dims(query, [1])
                    with variable_scope.variable_scope("KB_key_W1"):
                        key_layer_1 = batch_linear(embedded_kb_key,
                                                   attention_vec_size,
                                                   bias=True)

                    with variable_scope.variable_scope("Query_W1"):
                        query_layer_1 = batch_linear(query,
                                                     attention_vec_size,
                                                     bias=True)

                    layer_1 = math_ops.tanh(key_layer_1 + query_layer_1)
                    with variable_scope.variable_scope("KB_Query_W2"):
                        layer_2 = batch_linear(layer_1,
                                               attention_vec_size,
                                               bias=True)

                    layer_2 = math_ops.tanh(layer_2)
                    with variable_scope.variable_scope("KB_Query_W3"):
                        layer_3 = batch_linear(layer_2, 1, bias=True)

                    layer_3_logits = tf.squeeze(layer_3, [2])
                    layer_3 = nn_ops.softmax(layer_3_logits)

                    return layer_3, layer_3_logits
                elif attn_type == "linear":
                    query = tf.expand_dims(query, [1])
                    with variable_scope.variable_scope("KB_key_W1"):
                        key_layer_1 = batch_linear(embedded_kb_key,
                                                   attention_vec_size,
                                                   bias=True)

                    with variable_scope.variable_scope("Query_W1"):
                        query_layer_1 = batch_linear(query,
                                                     attention_vec_size,
                                                     bias=True)

                    layer_1 = math_ops.tanh(key_layer_1 + query_layer_1)
                    with variable_scope.variable_scope("KB_Query_W2"):
                        layer_2 = batch_linear(layer_1, 1, bias=True)

                    layer_2_logits = tf.squeeze(layer_2, [2])
                    layer_2 = nn_ops.softmax(layer_2_logits)
                    return layer_2, layer_2_logits

        state = initial_state
        outputs = []
        switch_outputs = []
        attn_kb_outputs = []
        prev = None
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)]
        first_indices = tf.tile(tf.expand_dims(tf.range(batch_size), dim=1),
                                [1, num_triples])
        # Use encoding of query
        if enc_query:
            encoder_q = array_ops.concat([state.c, state.h], axis=1)
            attn_kb, attn_kb_logits = attention_kb_triple(encoder_q)
        # Ensure the second shape of attention vectors is set.
        for a in attns:
            a.set_shape([None, attn_size])

        for i, inp in enumerate(emb_inp):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of
            # the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)

            if enc_attn:
                # Use encoder attention as well
                x = linear([inp] + attns, input_size, True)
            else:
                x = linear([inp], input_size, True)

            # Run the RNN.
            cell_output, state = cell(x, state)
            # If the query is a tuple (LSTMStateTuple), flatten it.
            if nest.is_sequence(state):
                query_list = nest.flatten(state)
                # Check that ndims == 2 if specified.
                for q in query_list:
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                concat_state = array_ops.concat(query_list, axis=1)

            if enc_attn:
                attns, attn_masks, attn_logits = attention(state)

            if not enc_query:
                attn_kb, attn_kb_logits = attention_kb_triple(concat_state)

            attn_kb_logits = attn_kb_logits * kb_attn_mask
            # Gather values from KB
            gather_indices = tf.stack([first_indices, value_idx], axis=2)
            updated_p = tf.scatter_nd(gather_indices, attn_kb_logits,
                                      [batch_size, num_decoder_symbols])
            attn_kb_outputs.append(attn_kb_logits)

            with variable_scope.variable_scope("AttnOutputProjection"):
                if enc_attn:
                    output = linear([cell_output] + attns, output_size, True)
                else:
                    output = linear([cell_output], output_size, True)
            # Simply add output logits and attn kb logits
            output = updated_p + output
            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, attn_kb_outputs, switch_outputs
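The step that folds the KB attention back into the vocabulary logits hinges on tf.scatter_nd over (row, value-index) pairs, exactly as in the loop above. Here is that step in isolation, with made-up sizes:

import tensorflow as tf

batch_size, num_triples, vocab_size = 2, 3, 10

# One attention logit per KB triple, and the vocab id each triple's value maps to.
attn_kb_logits = tf.constant([[0.5, 1.0, 0.2],
                              [2.0, 0.0, 0.3]])
value_idx = tf.constant([[4, 7, 4],
                         [1, 9, 2]])

first_indices = tf.tile(tf.expand_dims(tf.range(batch_size), axis=1),
                        [1, num_triples])                  # row index per triple
gather_indices = tf.stack([first_indices, value_idx], axis=2)
updated_p = tf.scatter_nd(gather_indices, attn_kb_logits,
                          [batch_size, vocab_size])
# updated_p is [batch_size, vocab_size]; duplicate vocab ids accumulate, and it is
# simply added onto the RNN output logits, as in the decoder above.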
def attention_decoder(decoder_inputs,  # T * [batch_size, input_size]
                      initial_state,  # [batch_size, cell.states]
                      attention_states,  # [batch_size, attn_length , attn_size]
                      cell,
                      output_size=None,
                      num_heads=1,
                      loop_function=None,
                      dtype=None,
                      scope=None,
                      initial_state_attention=False):
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(
            scope or "attention_decoder", dtype=dtype) as scope:
        dtype = scope.dtype

        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value
        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        # W_{1}*h_{i} is computed with a 1x1 convolution; the result has shape [batch_size, attn_length, 1, attention_vec_size]
        hidden = array_ops.reshape(
            attention_states, [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for a in range(num_heads):
            k = variable_scope.get_variable("AttnW_%d" % a,
                                            [1, 1, attn_size, attention_vec_size])
            hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

        state = initial_state
        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, 1)
            # W_{2}*d_{t} is computed via the linear mapping function `linear` below
            for a in range(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a):
                    # query corresponds to the current decoder hidden state d_t
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    # compute u_t
                    s = math_ops.reduce_sum(
                        v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                    a = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
                 for _ in range(num_heads)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp, inp_symbol = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" % inp.name)
            x = linear([inp] + attns, input_size, True)
            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                                   reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + attns, output_size, True)
            if loop_function is not None:
                prev = output
            outputs.append(output)

        return outputs, state
Example No. 9
def attention_decoder(initial_state,
                      attention_states,
                      cell,
                      vocab_size,
                      time_steps,
                      batch_size,
                      output_size=None,
                      loop_function=None,
                      dtype=None,
                      scope=None):
  if attention_states.get_shape()[2].value is None:
    raise ValueError("Shape[2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with variable_scope.variable_scope(
      scope or "attention_decoder", dtype=dtype) as scope:
    dtype = scope.dtype

    attn_length = attention_states.get_shape()[1].value
    if attn_length is None:
      attn_length = shape(attention_states)[1]
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    attention_vec_size = attn_size  # Size of query vectors for attention.
    k = variable_scope.get_variable("AttnW",
                                    [1, 1, attn_size, attention_vec_size])
    hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
    v = variable_scope.get_variable("AttnV", [attention_vec_size])

    state = initial_state

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      if nest.is_sequence(query):  # If the query is a tuple, flatten it.
        query_list = nest.flatten(query)
        for q in query_list:  # Check that ndims == 2 if specified.
          ndims = q.get_shape().ndims
          if ndims:
            assert ndims == 2
        query = array_ops.concat(1, query_list)
      with variable_scope.variable_scope("Attention_0"):
        y = linear(query, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(
            v * math_ops.tanh(hidden_features + y), [2, 3])
        a = nn_ops.softmax(s)
        # Now calculate the attention-weighted vector d.
        d = math_ops.reduce_sum(
            array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
            [1, 2])
        ds = array_ops.reshape(d, [-1, attn_size])
      return ds

    prev = array_ops.zeros([batch_size,output_size])
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attn = array_ops.zeros(batch_attn_size, dtype=dtype)
    attn.set_shape([None, attn_size])

    def cond(time_step, prev_o_t, prev_softmax_input, state_c, state_h, outputs):
      return time_step < time_steps

    def body(time_step, prev_o_t, prev_softmax_input, state_c, state_h, outputs):
      state = tf.nn.rnn_cell.LSTMStateTuple(state_c,state_h)
      with variable_scope.variable_scope("loop_function", reuse=True):
        inp = loop_function(prev_softmax_input, time_step)

      input_size = inp.get_shape().with_rank(2)[1]
      if input_size.value is None:
        raise ValueError("Could not infer input size from input: %s" % inp.name)
      x = tf.concat(1,[inp,prev_o_t])
      # Run the RNN.
      cell_output, state = cell(x, state)
      # Run the attention mechanism.
      attn = attention(state)

      with variable_scope.variable_scope("AttnOutputProjection"):
        output = math_ops.tanh(linear([cell_output, attn], output_size, False))
        with variable_scope.variable_scope("FinalSoftmax"):
          softmax_input = linear(output,vocab_size,False)

      new_outputs = tf.concat(1, [outputs,tf.expand_dims(softmax_input,1)])
      return (time_step + tf.constant(1, dtype=tf.int32),\
              output, softmax_input, state.c, state.h, new_outputs)

    time_step = tf.constant(0, dtype=tf.int32)
    shape_invariants = [time_step.get_shape(),\
                        prev.get_shape(),\
                        tf.TensorShape([batch_size, vocab_size]),\
                        tf.TensorShape([batch_size,512]),\
                        tf.TensorShape([batch_size,512]),\
                        tf.TensorShape([batch_size, None, vocab_size])]

    # The START keyword is index 0
    init_word = np.zeros([batch_size, vocab_size])

    loop_vars = [time_step,\
                 prev,\
                 tf.constant(init_word, dtype=tf.float32),\
                 initial_state.c,initial_state.h,\
                 tf.zeros([batch_size,1,vocab_size])] # we just need to feed an empty matrix
                                                      # to start off the while loop since you can
                                                      # only concat matrices that agree on all but
                                                      # one dimension. Below, we remove that initial
                                                      # filler index

    outputs = tf.while_loop(cond, body, loop_vars, shape_invariants)

  return outputs[-1][:,1:], tf.nn.rnn_cell.LSTMStateTuple(outputs[-3],outputs[-2])
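The pattern used here, growing the outputs tensor by one time step per iteration and declaring its time dimension as None in shape_invariants, can be reproduced on its own. A minimal sketch with made-up sizes (not the decoder's actual loop variables):

import tensorflow as tf

batch_size, vocab_size, time_steps = 2, 5, 4

def cond(t, outputs):
    return t < time_steps

def body(t, outputs):
    step_logits = tf.random.normal([batch_size, 1, vocab_size])  # one decode step
    return t + 1, tf.concat([outputs, step_logits], axis=1)

t0 = tf.constant(0)
outputs0 = tf.zeros([batch_size, 1, vocab_size])   # filler slice, removed afterwards

_, outputs = tf.while_loop(
    cond, body, [t0, outputs0],
    shape_invariants=[t0.get_shape(),
                      tf.TensorShape([batch_size, None, vocab_size])])

outputs = outputs[:, 1:]                           # drop the filler first step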
Example No. 10
def attention_decoder(decoder_inputs,
                      encoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      sent_decoder_inputs,
                      sent_encoder_inputs,
                      sent_initial_state,
                      sent_attention_states,
                      sent_cell,
                      dec_timesteps,
                      mode_train=True,
                      switch=None,
                      word_weights=None,
                      output_size=None,
                      num_heads=1,
                      loop_function=None,
                      sent_loop_function=None,
                      dtype=None,
                      scope=None,
                      initial_state_attention=False):
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if attention_states.get_shape()[2].value is None:
    raise ValueError("Shape[2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with variable_scope.variable_scope(
      scope or "attention_decoder", dtype=dtype) as scope:
    dtype = scope.dtype
    with variable_scope.variable_scope("word_attn") as attn_scope:

      batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
      attn_length = attention_states.get_shape()[1].value
      if attn_length is None:
        attn_length = shape(attention_states)[1]
      attn_size = attention_states.get_shape()[2].value

      # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
      hidden = array_ops.reshape(
          attention_states, [-1, attn_length, 1, attn_size])
      hidden_features = []
      v = []
      attention_vec_size = attn_size  # Size of query vectors for attention.
      for a in xrange(num_heads):
        k = variable_scope.get_variable("AttnW_%d" % a,
                                        [1, 1, attn_size, attention_vec_size])
        hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
        v.append(
            variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

   
    
      def attention(query,coverage=None):
        """Put attention masks on hidden using hidden_features and query."""
        ds = []  # Results of attention reads will be stored here.
        if nest.is_sequence(query):  # If the query is a tuple, flatten it.
          query_list = nest.flatten(query)
          for q in query_list:  # Check that ndims == 2 if specified.
            ndims = q.get_shape().ndims
            if ndims:
              assert ndims == 2
          query = array_ops.concat(1, query_list)
        for a in xrange(num_heads):
          with variable_scope.variable_scope("Attention_%d" % a):
            y = linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
	
            s = math_ops.reduce_sum(
                  v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])

            atn = nn_ops.softmax(s)
  
            d = math_ops.reduce_sum(
                array_ops.reshape(atn, [-1, attn_length, 1, 1]) * hidden,
                [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
        return ds,s,atn 

      outputs = []
      #prev = None
      batch_attn_size = array_ops.pack([batch_size, attn_size])
      attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
               for _ in xrange(num_heads)]

      for a in attns:  # Ensure the second shape of attention vectors is set.
        a.set_shape([None, attn_size])
      if initial_state_attention:
        attns,ss,soft_ss = attention(initial_state)

    with variable_scope.variable_scope("sent_attn") as sent_attn_scope:

      #batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
      sent_attn_length = sent_attention_states.get_shape()[1].value
      if sent_attn_length is None:
        sent_attn_length = shape(sent_attention_states)[1]
      sent_attn_size = sent_attention_states.get_shape()[2].value


      sent_hidden = array_ops.reshape(
          sent_attention_states, [-1, sent_attn_length, 1, sent_attn_size])
      sent_hidden_features = []
      sent_v = []
      sent_attention_vec_size = sent_attn_size  # Size of query vectors for attention.
      for a in xrange(num_heads):
        sent_k = variable_scope.get_variable("sent_AttnW_%d" % a,
                                        [1, 1, sent_attn_size, sent_attention_vec_size])
        sent_hidden_features.append(nn_ops.conv2d(sent_hidden, sent_k, [1, 1, 1, 1], "SAME"))
        sent_v.append(
            variable_scope.get_variable("sent_AttnV_%d" % a, [sent_attention_vec_size]))

    

      def sent_attention(query, sent_coverage=None):
        
        ds = []  # Results of attention reads will be stored here.
        if nest.is_sequence(query):  # If the query is a tuple, flatten it.
          query_list = nest.flatten(query)
          for q in query_list:  # Check that ndims == 2 if specified.
            ndims = q.get_shape().ndims
            if ndims:
              assert ndims == 2
          query = array_ops.concat(1, query_list)
        for a in xrange(num_heads):
          with variable_scope.variable_scope("sent_Attention_%d" % a):
            y = linear(query, sent_attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, sent_attention_vec_size])
  
            s = math_ops.reduce_sum(
                  v[a] * math_ops.tanh(sent_hidden_features[a] + y), [2, 3])

            atn = nn_ops.softmax(s)
      #      sent_coverage = array_ops.expand_dims(array_ops.expand_dims(atn,2),2)

            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(atn, [-1, sent_attn_length, 1, 1]) * sent_hidden,
                [1, 2])
            ds.append(array_ops.reshape(d, [-1, sent_attn_size]))
        return ds,s,atn  #,sent_coverage

      outputs = []
      sent_outputs = []
      soft_outputs = []
      soft_sent_outputs = []
      
      sent_batch_attn_size = array_ops.pack([batch_size, sent_attn_size])
      sent_attns = [array_ops.zeros(sent_batch_attn_size, dtype=dtype)
               for _ in xrange(num_heads)]

      for a in sent_attns:  # Ensure the second shape of attention vectors is set.
        a.set_shape([None, sent_attn_size])
      if initial_state_attention:
        sent_attns,sent_ss,soft_sent_ss = sent_attention(sent_initial_state)


 
    hidden_words=[]
    hidden_sents=[]

    s_w=[]
    d_k = variable_scope.get_variable("switch_w",
                                    [1, 1, attn_size, attention_vec_size])
    T_k = variable_scope.get_variable("switch_s" ,
                                    [1, 1, sent_attn_size, sent_attention_vec_size])
    hidden_words=nn_ops.conv2d(hidden, d_k, [1, 1, 1, 1], "SAME")
    hidden_sents=nn_ops.conv2d(sent_hidden, T_k, [1, 1, 1, 1], "SAME")
 

    def switch_pos(st_w,st_s,h_w,h_s):
      with variable_scope.variable_scope("switch_w"):

        y_w=linear(st_w,2, True)

      with variable_scope.variable_scope("switch_s"):
        y_s=linear(st_s,2, True)
      with variable_scope.variable_scope("switch_hw"):
        y_hw=linear(h_w,2, True)
      with variable_scope.variable_scope("switch_hs"):
        y_hs=linear(h_s,2, True)

      s_b=y_s+y_hs+math_ops.tanh(y_w+y_hw)
      s_b=array_ops.reshape(s_b,[-1,2])
      switch_pb=nn_ops.softmax (s_b)
      return s_b ,switch_pb

    sent_state = sent_initial_state
    state = initial_state
    
    
    sent_prev = None
    prev=None


    
    switch_outputs=[]
    switch_softmax=[]
   
    for i in xrange(dec_timesteps):
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
 
      
      sb,switch_prob=switch_pos(state,sent_state,attns,sent_attns)
      switch_outputs.append(sb)
      switch_softmax.append(switch_prob)
   
      inp=decoder_inputs[i]
      
      sent_inp=sent_decoder_inputs[i]
      if mode_train is not True and prev is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          if loop_function is not None:
            inp = loop_function(prev, encoder_inputs)
            sent_inp = sent_loop_function(sent_prev, sent_encoder_inputs)
 
      input_size = inp.get_shape().with_rank(2)[1]
      sent_input_size = sent_inp.get_shape().with_rank(2)[1]
      if input_size.value is None:
        raise ValueError("Could not infer input size from input: %s" % inp.name)


      sent_switch=(switch_prob[:,1])
      word_switch=switch_prob[:,0] #ss(j-1)
 
 
      with variable_scope.variable_scope("word_stpes"):
        x=linear(array_ops.concat(2,[[inp],attns,math_ops.tanh(sent_attns)])[0],input_size,True)
        cell_output, state = cell(x, state)

      ##########Sentence decoder##################################


      with variable_scope.variable_scope("sent_steps"):

        sent_x=linear(array_ops.concat(2,[[sent_inp],sent_attns,math_ops.tanh(attns)])[0],sent_input_size,True)
        
        sent_cell_output,sent_state=sent_cell(sent_x,sent_state)
 
      with variable_scope.variable_scope(sent_attn_scope,reuse=True):
        sent_attns,sent_ss ,soft_sent_ss= sent_attention(sent_state)
      with variable_scope.variable_scope(attn_scope,reuse=True):
        attns, ss, soft_ss = attention(state)

      soft_ssout=soft_ss *array_ops.reshape([word_switch],[-1,1])
      soft_sent_ssout=soft_sent_ss *array_ops.reshape([sent_switch],[-1,1])

      prev=ss
      sent_prev=sent_ss
        
   
      outputs.append(soft_ss)
      soft_outputs.append(soft_ssout)

     
      sent_outputs.append(soft_sent_ss)
      soft_sent_outputs.append(soft_sent_ssout)

  
  return outputs, state, sent_outputs, sent_state,switch_outputs,switch_softmax,soft_outputs,soft_sent_outputs #,coverage
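The switch_pos gate above reduces to a two-way softmax over the two decoder states and the two attention contexts. A compact sketch of the same combination, using Keras Dense layers in place of the linear helper (an illustrative substitution, not the original code):

import tensorflow as tf

# Illustrative stand-ins for the four linear projections in switch_pos.
proj_w, proj_s = tf.keras.layers.Dense(2), tf.keras.layers.Dense(2)
proj_hw, proj_hs = tf.keras.layers.Dense(2), tf.keras.layers.Dense(2)

def switch_probability(word_state, sent_state, word_ctx, sent_ctx):
    # Each argument: [batch, dim]. Combined exactly as in switch_pos:
    # s_b = y_s + y_hs + tanh(y_w + y_hw), then a softmax over the 2 classes.
    logits = (proj_s(sent_state) + proj_hs(sent_ctx)
              + tf.tanh(proj_w(word_state) + proj_hw(word_ctx)))
    return tf.nn.softmax(logits)  # column 0 is read as the word switch,
                                  # column 1 as the sentence switch in the loop above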
Example No. 11
def lookup_positives(scores, click_position):
    num_rows = shape(scores)[0]
    row_idx = expand_dims(range(num_rows), axis=1)
    idx = concatenate([row_idx, cast(click_position, int32)], axis=1)
    return gather_nd(scores, idx)
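lookup_positives pairs each row with its clicked column and extracts that single score; for example, with explicit tf.* calls:

import tensorflow as tf

scores = tf.constant([[0.1, 0.7, 0.2],
                      [0.4, 0.3, 0.3]])
click_position = tf.constant([[1], [0]])        # shape [num_rows, 1]
idx = tf.concat([tf.expand_dims(tf.range(2), axis=1),
                 tf.cast(click_position, tf.int32)], axis=1)
tf.gather_nd(scores, idx)                       # -> [0.7, 0.4]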
Example No. 12
def to_one_hot(scores):
    return one_hot(argmax(scores, axis=1), shape(scores)[1])
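to_one_hot simply snaps each score row to a one-hot vector at its argmax; equivalently, in explicit tf.* calls:

import tensorflow as tf

scores = tf.constant([[0.1, 0.7, 0.2],
                      [0.6, 0.3, 0.1]])
tf.one_hot(tf.argmax(scores, axis=1), tf.shape(scores)[1])
# [[0. 1. 0.]
#  [1. 0. 0.]]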