Example #1
 def testEmptyInput(self):
   with self.test_session():
     x = array_ops.placeholder(dtypes.float32, shape=[0, 3])
     self.assertEqual(0, array_ops.size(x).eval())
     # reshape would raise if logits is empty
     with self.assertRaises(errors_impl.InvalidArgumentError):
       nn_ops.softmax(x, axis=0).eval()
Example #2
 def testDimTooLarge(self):
   with self.test_session():
     # Use placeholder to make sure we get runtime error instead of shape
     # inference error.
     dim = array_ops.placeholder_with_default(100, shape=[])
     with self.assertRaises(errors_impl.InvalidArgumentError):
       nn_ops.softmax([1., 2., 3., 4.], axis=dim).eval()
Example #3
 def testEmptyInput(self):
   with self.test_session():
     x = constant_op.constant([[]], shape=[0, 3])
     self.assertEqual(0, array_ops.size(x).eval())
     # reshape would raise if logits is empty
     with self.assertRaises(errors_impl.InvalidArgumentError):
       nn_ops.softmax(x, dim=0).eval()
Example #4
 def testSoftmaxAxes(self):
   arr = np.linspace(0., 1, 12).reshape(3, 4)
   x_neg_axis = nn_ops.softmax(arr, axis=-2)
   y_pos_axis = nn_ops.softmax(arr, axis=0)
   z_gt_axis = nn_ops.softmax(arr, axis=4)
   x_neg_axis_tf = self.evaluate(x_neg_axis)
   y_pos_axis_tf = self.evaluate(y_pos_axis)
   z_gt_axis_tf = self.evaluate(z_gt_axis)
   eps = 1e-3
   self.assertAllClose(x_neg_axis_tf, y_pos_axis_tf, eps)
   self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps)
Example #5
 def testSoftmax(self):
   x_shape = [5, 10]
   x_np = np.random.randn(*x_shape).astype(np.float32)
   y_np = self._softmax(x_np)
   x_tf = constant_op.constant(x_np)
   y_tf = nn_ops.softmax(x_tf)
   y_tf_last_dim = nn_ops.softmax(x_tf, 1)
   y_tf_np = self.evaluate(y_tf)
   y_tf_last_dim_np = self.evaluate(y_tf_last_dim)
   eps = 1e-3
   self.assertAllClose(y_tf_np, y_np, eps)
   self.assertAllClose(y_tf_last_dim_np, y_np, eps)
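The `_softmax` reference helper these tests compare against is not shown on this page; a minimal NumPy sketch of such a reference (assuming softmax over the last axis, as the tests do; `_np_softmax` is a stand-in name) could look like:

import numpy as np

def _np_softmax(x):
    # Subtract the per-row max for numerical stability, then normalize.
    e = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e / np.sum(e, axis=-1, keepdims=True)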
Example #6
def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
  """Gradient function for SoftmaxCrossEntropyWithLogits."""
  # grad_loss is the backprop for cost, and we multiply it with the gradients
  # (which is output[1])
  # grad_grad is the backprop for softmax gradient.
  # There is no gradient for the labels
  #
  # Second derivative is just softmax derivative w.r.t. logits.
  softmax_grad = op.outputs[1]
  grad = _BroadcastMul(grad_loss, softmax_grad)

  def IsZero(g):
    # Some introspection to check if the gradient is feeding zeros
    if g.op.type in ("ZerosLike", "Zeros"):
      return True
    const_fill_value = tensor_util.constant_value(g)
    return const_fill_value is not None and (const_fill_value == 0).all()

  if not IsZero(grad_grad):
    logits = op.inputs[0]
    softmax = nn_ops.softmax(logits)

    grad += ((grad_grad - array_ops.squeeze(
        math_ops.matmul(grad_grad[:, None, :],
                        softmax[:, :, None]), axis=1)) * softmax)

  return grad, None
Example #7
def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
  """Gradient function for SoftmaxCrossEntropyWithLogits."""
  # grad_loss is the backprop for cost, and we multiply it with the gradients
  # (which is output[1])
  # grad_grad is the backprop for softmax gradient.
  #
  # Second derivative is just softmax derivative w.r.t. logits.
  softmax_grad = op.outputs[1]
  grad = _BroadcastMul(grad_loss, softmax_grad)

  def IsZero(g):
    # Some introspection to check if the gradient is feeding zeros
    if context.executing_eagerly():
      # TODO(apassos) add an efficient way to detect eager zeros here.
      return False
    if g.op.type in ("ZerosLike", "Zeros"):
      return True
    const_fill_value = tensor_util.constant_value(g)
    return const_fill_value is not None and (const_fill_value == 0).all()

  logits = op.inputs[0]
  if grad_grad is not None and not IsZero(grad_grad):
    softmax = nn_ops.softmax(logits)

    grad += ((grad_grad - array_ops.squeeze(
        math_ops.matmul(
            array_ops.expand_dims(grad_grad, 1),
            array_ops.expand_dims(softmax, 2)),
        axis=1)) * softmax)

  return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
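For intuition about the cached `softmax_grad` in `op.outputs[1]`: the gradient of softmax cross-entropy with respect to the logits is softmax(logits) - labels. A small NumPy finite-difference sketch of that identity (illustrative only, not TensorFlow code):

import numpy as np

def np_softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

logits = np.array([[1.0, 2.0, 0.5]])
labels = np.array([[0.0, 1.0, 0.0]])

# Analytic gradient of -sum(labels * log_softmax(logits)) w.r.t. logits.
analytic = np_softmax(logits) - labels

# Central finite differences of the same loss.
loss = lambda z: -np.sum(labels * np.log(np_softmax(z)))
eps = 1e-6
numeric = np.zeros_like(logits)
for i in range(logits.shape[1]):
    d = np.zeros_like(logits)
    d[0, i] = eps
    numeric[0, i] = (loss(logits + d) - loss(logits - d)) / (2 * eps)

assert np.allclose(analytic, numeric, atol=1e-5)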
Example #8
  def testMeanMultivariate(self):
    with self.test_session() as sess:
      for batch_shape in ((), (2,), (2, 3)):
        dist = make_multivariate_mixture(
            batch_shape=batch_shape, num_components=2, event_shape=(4,),
            use_static_graph=self.use_static_graph)
        mean = dist.mean()
        self.assertEqual(batch_shape + (4,), mean.get_shape())

        cat_probs = nn_ops.softmax(dist.cat.logits)
        dist_means = [d.mean() for d in dist.components]

        mean_value, cat_probs_value, dist_means_value = sess.run(
            [mean, cat_probs, dist_means])
        self.assertEqual(batch_shape + (4,), mean_value.shape)

        cat_probs_value = _swap_first_last_axes(cat_probs_value)

        # Add a new innermost dimension for broadcasting to mvn vector shape
        cat_probs_value = [np.expand_dims(c_p, -1) for c_p in cat_probs_value]

        true_mean = sum(
            [c_p * m for (c_p, m) in zip(cat_probs_value, dist_means_value)])

        self.assertAllClose(true_mean, mean_value)
Example #9
 def attention(query):
   """Put attention masks on hidden using hidden_features and query."""
   ds = []  # Results of attention reads will be stored here.
   if nest.is_sequence(query):  # If the query is a tuple, flatten it.
     query_list = nest.flatten(query)
     for q in query_list:  # Check that ndims == 2 if specified.
       ndims = q.get_shape().ndims
       if ndims:
         assert ndims == 2
     query = array_ops.concat(1, query_list)
   for i in xrange(num_heads):
     with variable_scope.variable_scope("Attention_%d" % i):                  
       y = linear(query, attention_vec_size, True)
       y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = math_ops.reduce_sum(
           v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
       # multiply with source mask, then do softmax
       if src_mask is not None:
         s = s * src_mask
       a = nn_ops.softmax(s)
       # Now calculate the attention-weighted vector d.
       d = math_ops.reduce_sum(
           array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])                  
       ds.append(array_ops.reshape(d, [-1, attn_size]))
   return ds            
Example #10
  def _forward(self, x):
    # Pad the last dim with a zeros vector. We need this because it lets us
    # infer the scale in the inverse function.
    y = array_ops.expand_dims(x, dim=-1) if self._static_event_ndims == 0 else x
    ndims = (y.get_shape().ndims if y.get_shape().ndims is not None
             else array_ops.rank(y))
    y = array_ops.pad(y,
                      paddings=array_ops.concat(
                          (array_ops.zeros(
                              (ndims - 1, 2), dtype=dtypes.int32), [[0, 1]]),
                          0))

    # Set shape hints.
    if x.get_shape().ndims is not None:
      shape = x.get_shape().as_list()
      if self._static_event_ndims == 0:
        shape += [2]
      elif shape[-1] is not None:
        shape[-1] += 1
      shape = tensor_shape.TensorShape(shape)
      y.get_shape().assert_is_compatible_with(shape)
      y.set_shape(shape)

    # Since we only support event_ndims in [0, 1] and we do padding, we always
    # reduce over the last dimension, i.e., dim=-1 (which is the default).
    return nn_ops.softmax(y)
Example #11
  def testProbScalarMultivariate(self):
    with self.test_session() as sess:
      dist = make_multivariate_mixture(
          batch_shape=[], num_components=2, event_shape=[3],
          use_static_graph=self.use_static_graph)
      for x in [
          np.array(
              [[-1.0, 0.0, 1.0], [0.5, 1.0, -0.3]], dtype=np.float32), np.array(
                  [-1.0, 0.0, 1.0], dtype=np.float32),
          np.random.randn(2, 2, 3).astype(np.float32)
      ]:
        p_x = dist.prob(x)

        self.assertEqual(x.shape[:-1], p_x.get_shape())

        cat_probs = nn_ops.softmax([dist.cat.logits])[0]
        dist_probs = [d.prob(x) for d in dist.components]

        p_x_value, cat_probs_value, dist_probs_value = sess.run(
            [p_x, cat_probs, dist_probs])

        self.assertEqual(x.shape[:-1], p_x_value.shape)

        total_prob = sum(c_p_value * d_p_value
                         for (c_p_value, d_p_value
                             ) in zip(cat_probs_value, dist_probs_value))

        self.assertAllClose(total_prob, p_x_value)
Example #12
  def testProbBatchMultivariate(self):
    with self.test_session() as sess:
      dist = make_multivariate_mixture(
          batch_shape=[2, 3], num_components=2, event_shape=[4],
          use_static_graph=self.use_static_graph)

      for x in [
          np.random.randn(2, 3, 4).astype(np.float32),
          np.random.randn(4, 2, 3, 4).astype(np.float32)
      ]:
        p_x = dist.prob(x)
        self.assertEqual(x.shape[:-1], p_x.get_shape())

        cat_probs = nn_ops.softmax(dist.cat.logits)
        dist_probs = [d.prob(x) for d in dist.components]

        p_x_value, cat_probs_value, dist_probs_value = sess.run(
            [p_x, cat_probs, dist_probs])
        self.assertEqual(x.shape[:-1], p_x_value.shape)

        cat_probs_value = _swap_first_last_axes(cat_probs_value)
        total_prob = sum(c_p_value * d_p_value
                         for (c_p_value, d_p_value
                             ) in zip(cat_probs_value, dist_probs_value))

        self.assertAllClose(total_prob, p_x_value)
Example #13
  def testSmallNetwork(self):
    image = array_ops.placeholder(dtypes.float32, shape=[1, 28, 28, 1])
    label = array_ops.placeholder(dtypes.float32, shape=[1, 10])
    w = variables.Variable(
        random_ops.truncated_normal([5, 5, 1, 32], stddev=0.1))
    b = variables.Variable(random_ops.truncated_normal([32], stddev=0.1))
    conv = nn_ops.conv2d(image, w, strides=[1, 1, 1, 1], padding="SAME")
    h_conv = nn_ops.relu(conv + b)
    h_conv_flat = array_ops.reshape(h_conv, [1, -1])

    w_fc = variables.Variable(
        random_ops.truncated_normal([25088, 10], stddev=0.1))
    b_fc = variables.Variable(random_ops.truncated_normal([10], stddev=0.1))
    y_conv = nn_ops.softmax(math_ops.matmul(h_conv_flat, w_fc) + b_fc)

    cross_entropy = math_ops.reduce_mean(-math_ops.reduce_sum(
        label * math_ops.log(y_conv), reduction_indices=[1]))
    _ = adam.AdamOptimizer(1e-4).minimize(cross_entropy)

    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
    report = cost_analyzer.GenerateCostReport(mg)

    self.assertTrue(b"MatMul" in report)
    self.assertTrue(b"ApplyAdam" in report)
    self.assertTrue(b"Conv2D" in report)
    self.assertTrue(b"Conv2DBackpropInput" in report)
    self.assertTrue(b"Conv2DBackpropFilter" in report)
    self.assertTrue(b"Softmax" in report)

    # Also print the report to make it easier to debug
    print("{}".format(report))
Example #14
 def attention(query, use_attention=False):
   """Put attention masks on hidden using hidden_features and query."""
   attn_weights = []
   ds = []  # Results of attention reads will be stored here.
   for i in xrange(num_heads):
     with variable_scope.variable_scope("Attention_%d" % i):
       y = rnn_cell._linear(query, attention_vec_size, True)
       y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = math_ops.reduce_sum(
           v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
       if use_attention is False: # apply mean pooling
           weights = tf.tile(sequence_length, tf.stack([attn_length]))
           weights = array_ops.reshape(weights, tf.shape(s))
           a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(weights)
           # a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(tf.shape(s)[1])
       else:
         a = nn_ops.softmax(s)
       attn_weights.append(a)
       # Now calculate the attention-weighted vector d.
       d = math_ops.reduce_sum(
           array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])
       ds.append(array_ops.reshape(d, [-1, attn_size]))
   return attn_weights, ds
Example #15
  def _forward(self, x):
    y = x
    # Pad the event_ndims with a zeros vector. We need this because it lets
    # us infer the scale in the inverse function.
    if self._static_event_ndims == 0:
      y = array_ops.expand_dims(y, dim=-1)
      zeros = array_ops.zeros_like(y)
    else:
      shape = array_ops.concat(0, (array_ops.shape(x)[:-1], [1]))
      zeros = array_ops.zeros(shape, dtype=y.dtype)
    y = array_ops.concat(array_ops.rank(y)-1, (y, zeros))

    # Set shape hints.
    if x.get_shape().ndims is not None:
      shape = x.get_shape().as_list()
      if self._static_event_ndims == 0:
        shape += [2]
      elif shape[-1] is not None:
        shape[-1] += 1
      shape = tensor_shape.TensorShape(shape)
      y.get_shape().assert_is_compatible_with(shape)
      y.set_shape(shape)

    # Since we only support event_ndims in [0, 1] and we do padding, we always
    # reduce over the last dimension, i.e., dim=-1 (which is the default).
    return nn_ops.softmax(y)
Example #16
def sequence_softmax(inputs, noutput, scope=None, name=None, linear_name=None):
  """Run a softmax layer over all the time steps of an input sequence.

  Args:
    inputs: (length, batch_size, depth) tensor
    noutput: output depth
    scope: optional scope name
    name: optional name for output tensor
    linear_name: name for linear (pre-softmax) output

  Returns:
    A tensor of size (length, batch_size, noutput).

  """
  length, _, ninputs = _shape(inputs)
  inputs_u = array_ops.unstack(inputs)
  output_u = []
  with variable_scope.variable_scope(scope, "SequenceSoftmax", [inputs]):
    initial_w = random_ops.truncated_normal([0 + ninputs, noutput], stddev=0.1)
    initial_b = constant_op.constant(0.1, shape=[noutput])
    w = variables.model_variable("weights", initializer=initial_w)
    b = variables.model_variable("biases", initializer=initial_b)
    for i in xrange(length):
      with variable_scope.variable_scope(scope, "SequenceSoftmaxStep",
                                         [inputs_u[i]]):
        # TODO(tmb) consider using slim.fully_connected(...,
        # activation_fn=tf.nn.softmax)
        linear = nn_ops.xw_plus_b(inputs_u[i], w, b, name=linear_name)
        output = nn_ops.softmax(linear)
        output_u += [output]
    outputs = array_ops.stack(output_u, name=name)
  return outputs
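A hypothetical usage sketch for `sequence_softmax` (the placeholder shape is an assumption chosen for illustration; it relies on the same TF 1.x modules imported by the snippet above):

# inputs: (length, batch_size, depth) -> outputs: (length, batch_size, noutput),
# one softmax distribution per timestep and batch element.
inputs = array_ops.placeholder(dtypes.float32, shape=[20, 8, 64])
probs = sequence_softmax(inputs, noutput=10)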
Example #17
  def testEntropyGradient(self):
    with self.cached_session() as sess:
      logits = constant_op.constant([[1., 2., 3.], [2., 5., 1.]])

      probabilities = nn_ops.softmax(logits)
      log_probabilities = nn_ops.log_softmax(logits)
      true_entropy = - math_ops.reduce_sum(
          probabilities * log_probabilities, axis=-1)

      categorical_distribution = categorical.Categorical(probs=probabilities)
      categorical_entropy = categorical_distribution.entropy()

      # works
      true_entropy_g = gradients_impl.gradients(true_entropy, [logits])
      categorical_entropy_g = gradients_impl.gradients(
          categorical_entropy, [logits])

      res = sess.run({"true_entropy": true_entropy,
                      "categorical_entropy": categorical_entropy,
                      "true_entropy_g": true_entropy_g,
                      "categorical_entropy_g": categorical_entropy_g})
      self.assertAllClose(res["true_entropy"],
                          res["categorical_entropy"])
      self.assertAllClose(res["true_entropy_g"],
                          res["categorical_entropy_g"])
Example #18
 def _entropy(self):
   logits_2d = array_ops.reshape(
       self.logits, array_ops.pack([-1, self.num_classes]))
   histogram_2d = nn_ops.softmax(logits_2d)
   ret = array_ops.reshape(
       nn_ops.softmax_cross_entropy_with_logits(logits_2d, histogram_2d),
       self.batch_shape())
   ret.set_shape(self.get_batch_shape())
   return ret
Example #19
 def testShapeInference(self):
     op = nn_ops.softmax(
         [
             [[1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 3.0, 4.0]],
             [[2.0, 3.0, 4.0, 5.0], [6.0, 7.0, 8.0, 9.0]],
             [[5.0, 4.0, 3.0, 2.0], [1.0, 2.0, 3.0, 4.0]],
         ]
     )
     self.assertEqual([3, 2, 4], op.get_shape())
Example #20
     def body(i, prev_c, prev_h, actions, log_probs):
      # pylint: disable=g-long-lambda
      signal = control_flow_ops.cond(
          math_ops.equal(i, 0),
          lambda: array_ops.tile(device_go_embedding,
                                 [self.hparams.num_children, 1]),
          lambda: embedding_ops.embedding_lookup(device_embeddings,
                                                 actions.read(i - 1))
      )
      if self.hparams.keep_prob is not None:
        signal = nn_ops.dropout(signal, self.hparams.keep_prob)
      next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)
      query = math_ops.matmul(next_h, attn_w_2)
      query = array_ops.reshape(
          query, [self.hparams.num_children, 1, self.hparams.hidden_size])
      query = math_ops.tanh(query + attn_mem)
      query = array_ops.reshape(query, [
          self.hparams.num_children * self.num_groups, self.hparams.hidden_size
      ])
      query = math_ops.matmul(query, attn_v)
      query = array_ops.reshape(query,
                                [self.hparams.num_children, self.num_groups])
      query = nn_ops.softmax(query)
      query = array_ops.reshape(query,
                                [self.hparams.num_children, self.num_groups, 1])
      query = math_ops.reduce_sum(attn_mem * query, axis=1)
      query = array_ops.concat([next_h, query], axis=1)
      logits = math_ops.matmul(query, device_softmax)
      logits /= self.hparams.temperature
      if self.hparams.tanh_constant > 0:
        logits = math_ops.tanh(logits) * self.hparams.tanh_constant
      if self.hparams.logits_std_noise > 0:
        num_in_logits = math_ops.cast(
            array_ops.size(logits), dtype=dtypes.float32)
        avg_norm = math_ops.divide(
            linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
        logits_noise = random_ops.random_normal(
            array_ops.shape(logits),
            stddev=self.hparams.logits_std_noise * avg_norm)
        logits = control_flow_ops.cond(
            self.global_step > self.hparams.stop_noise_step, lambda: logits,
            lambda: logits + logits_noise)

      if mode == "sample":
        next_y = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
      elif mode == "greedy":
        next_y = math_ops.argmax(logits, 1)
      elif mode == "target":
        next_y = array_ops.slice(y, [0, i], [-1, 1])
      else:
        raise NotImplementedError
      next_y = math_ops.to_int32(next_y)
      next_y = array_ops.reshape(next_y, [self.hparams.num_children])
      actions = actions.write(i, next_y)
      log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=next_y)
      return i + 1, next_c, next_h, actions, log_probs
Example #21
 def testGradient(self, x_shape):
   x_np = np.random.randn(*x_shape).astype(np.float64)
   with self.cached_session():
     x_tf = constant_op.constant(x_np)
     y_tf = nn_ops.softmax(x_tf)
     err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
                                                   x_shape)
   eps = 2e-8
   self.assertLess(err, eps)
Example #22
 def _entropy(self):
     if self.logits.get_shape().ndims == 2:
         logits_2d = self.logits
     else:
         logits_2d = array_ops.reshape(self.logits, [-1, self.num_classes])
     histogram_2d = nn_ops.softmax(logits_2d)
     ret = array_ops.reshape(nn_ops.softmax_cross_entropy_with_logits(logits_2d, histogram_2d), self.batch_shape())
     ret.set_shape(self.get_batch_shape())
     return ret
Example #23
 def testSoftmax(self):
   x_shape = [5, 10]
   x_np = np.random.randn(*x_shape).astype(np.float32)
   y_np = self._softmax(x_np)
   with self.test_session():
     x_tf = constant_op.constant(x_np)
     y_tf = nn_ops.softmax(x_tf)
     y_tf_np = y_tf.eval()
   eps = 1e-3
   self.assertAllClose(y_tf_np, y_np, eps)
Example #24
 def entropy(self, name="sample"):
   with ops.name_scope(self.name):
     with ops.op_scope([], name):
       logits_2d = array_ops.reshape(
           self.logits, array_ops.pack([-1, self.num_classes]))
       histogram_2d = nn_ops.softmax(logits_2d)
       ret = array_ops.reshape(
           nn_ops.softmax_cross_entropy_with_logits(logits_2d, histogram_2d),
           self.batch_shape())
       ret.set_shape(self.get_batch_shape())
       return ret
Example #25
def _output_with_attention(cell_output, output_size, decoder_hidden, attn_size,
                           projection_attention_f, initializer=None, output_form=OUTPUT_CONCAT):
    """

    Parameters
    ----------
    decoder_hidden
    attn_size
    projection_attention_f
    initializer
    step_num

    Returns
    -------

    """
    assert initializer is not None

    with vs.variable_scope("AttnOutputProjection", initializer=initializer):

        with vs.variable_scope("output_attention", initializer=initializer):

            s = projection_attention_f(decoder_hidden, attn_size)

            # beta will be (?, timesteps)
            beta = nn_ops.softmax(s)

            shape = decoder_hidden.get_shape()
            timesteps = shape[1].value
            b = array_ops.reshape(beta, [-1, timesteps, 1, 1])

            # b  and decoder_hidden will be (?, timesteps, 1, 1)
            d = math_ops.reduce_sum(b * decoder_hidden, [1, 2])

            # d is (?, decoder_size)
            # ds is (?, decoder_size)
            ds = tf.reshape(d, [-1, attn_size])

            _ = tf.histogram_summary('attention_context', ds)

        # output = cells.linear([cell_output] + [ds], output_size, True)

        if output_form == OUTPUT_SPLIT:
            output = _output_form_split(cell_output, ds, output_size, initializer=initializer)

        elif output_form == OUTPUT_SINGLE:
            output = _output_form_single(ds, output_size, initializer=initializer)

        else:
            output = _output_form_concat(cell_output, ds, output_size, initializer=initializer)

        output = tf.tanh(output)

    return output
Example #26
  def _forward(self, x):
    # Pad the last dim with a zeros vector. We need this because it lets us
    # infer the scale in the inverse function.
    y = distribution_util.pad(x, axis=-1, back=True)

    # Set shape hints.
    if x.shape.ndims is not None:
      shape = x.shape[:-1].concatenate(x.shape[-1] + 1)
      y.shape.assert_is_compatible_with(shape)
      y.set_shape(shape)

    return nn_ops.softmax(y)
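A small NumPy sketch (illustrative, not part of the bijector code) of why padding the last dimension with a zero makes the softmax invertible: the padded logit pins the normalizer, so the original values can be recovered from the output probabilities.

import numpy as np

x = np.array([0.3, -1.2, 2.0])
y = np.concatenate([x, [0.0]])       # pad the last dim with a zero
p = np.exp(y) / np.exp(y).sum()      # softmax over the padded vector

# Because the padded logit is 0, log(p) - log(p[-1]) recovers x exactly;
# this is the "scale" the inverse transform can infer.
x_recovered = np.log(p[:-1]) - np.log(p[-1])
assert np.allclose(x, x_recovered)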
Example #27
def global_attention(decoder_hidden_state, hidden_attn, initializer, window_size=10,
                     content_function=vinyals_kaiser, dtype=tf.float32):

    """Put global attention on hidden using decoder hidden states and the hidden states of encoder (hidden_attn).

    Parameters
    ----------
    decoder_hidden_state : 2-D Tensor
        Tensor representing the current hidden state of the decoder (output of the recurrent layers).
        Shape is (?, decoder_size).
    hidden_attn : 4-D Tensor
        Tensor representing the hidden states of the encoder (output of the recurrent layers). It has
        shape (?, timesteps, 1, decoder_size) so it is possible to apply a 1-D convolution to calculate
        the attention score more efficiently.
    initializer : function
        Function to use when initializing variables within the variables context.
    window_size : int
        Size of each side of the window to use when applying local attention. Not relevant to global
        attention. Defaults to 10.
    content_function : function
        Content function to score the decoder hidden states and encoder hidden states to extract their
        weights. Defaults to 'vinyals_kaiser'.
    dtype : tensorflow dtype
        Type of the tensors. Defaults to tf.float32.

    Returns
    -------
    ds : 2-D Tensor
        Tensor representing the context vector generated after scoring the encoder and decoder hidden
        states. Has shape (?, decoder_size), i.e., one context vector per batch sample.

    """
    assert content_function is not None

    attention_vec_size = hidden_attn.get_shape()[3].value
    attn_length = hidden_attn.get_shape()[1].value

    with vs.variable_scope("AttentionGlobal", initializer=initializer):

        # apply content function to score the hidden states from the encoder
        s = content_function(hidden_attn, decoder_hidden_state)

        alpha = nn_ops.softmax(s)

        _ = tf.histogram_summary('global_alpha_weights', alpha)

        # Now calculate the attention-weighted vector d.
        d = math_ops.reduce_sum(array_ops.reshape(alpha, [-1, attn_length, 1, 1]) * hidden_attn, [1, 2])
        ds = array_ops.reshape(d, [-1, attention_vec_size])

    _ = tf.histogram_summary('global_attention_context', ds)

    return ds
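A NumPy sketch of the shape bookkeeping described in the docstring (values chosen purely for illustration): softmax weights over the timesteps are broadcast against the encoder states and summed into one context vector per batch sample.

import numpy as np

batch, timesteps, size = 2, 5, 4
hidden_attn = np.random.randn(batch, timesteps, 1, size)   # encoder states
s = np.random.randn(batch, timesteps)                      # attention scores

alpha = np.exp(s - s.max(axis=1, keepdims=True))
alpha /= alpha.sum(axis=1, keepdims=True)                  # softmax over timesteps
d = (alpha.reshape(batch, timesteps, 1, 1) * hidden_attn).sum(axis=(1, 2))
ds = d.reshape(batch, size)                                # one context vector per sample
assert ds.shape == (batch, size)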
Example #28
 def _entropy(self):
   if self.logits.get_shape().ndims == 2:
     logits_2d = self.logits
   else:
     logits_2d = array_ops.reshape(self.logits, [-1, self.event_size])
   histogram_2d = nn_ops.softmax(logits_2d)
   ret = array_ops.reshape(
       nn_ops.softmax_cross_entropy_with_logits(labels=histogram_2d,
                                                logits=logits_2d),
       self.batch_shape_tensor())
   ret.set_shape(self.batch_shape)
   return ret
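The trick in `_entropy` above: the cross entropy of a distribution with itself is its entropy, so feeding softmax(logits) back in as the labels yields -sum(p * log p). A quick NumPy check (illustrative only):

import numpy as np

logits = np.array([[1.0, 2.0, 3.0], [2.0, 5.0, 1.0]])
log_softmax = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
p = np.exp(log_softmax)

entropy = -np.sum(p * np.log(p), axis=-1)          # -sum p log p
cross_entropy = -np.sum(p * log_softmax, axis=-1)  # cross entropy with labels = p
assert np.allclose(entropy, cross_entropy)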
Example #29
def classifier_score(images, classifier_fn, num_batches=1):
  """Classifier score for evaluating a conditional generative model.

  This is based on the Inception Score, but for an arbitrary classifier.

  This technique is described in detail in https://arxiv.org/abs/1606.03498. In
  summary, this function calculates

  exp( E[ KL(p(y|x) || p(y)) ] )

  which captures how different the network's classification prediction is from
  the prior distribution over classes.

  Args:
    images: Images to calculate the classifier score for.
    classifier_fn: A function that takes images and produces logits based on a
      classifier.
    num_batches: Number of batches to split `images` into in order to
      efficiently run them through the classifier network.

  Returns:
    The classifier score. A floating-point scalar.
  """
  generated_images_list = array_ops.split(
      images, num_or_size_splits=num_batches)

  # Compute the classifier splits using the memory-efficient `map_fn`.
  logits = functional_ops.map_fn(
      fn=classifier_fn,
      elems=array_ops.stack(generated_images_list),
      parallel_iterations=1,
      back_prop=False,
      swap_memory=True,
      name='RunClassifier')
  logits = array_ops.concat(array_ops.unstack(logits), 0)
  logits.shape.assert_has_rank(2)

  # Use maximum precision for best results.
  logits_dtype = logits.dtype
  if logits_dtype != dtypes.float64:
    logits = math_ops.cast(logits, dtypes.float64)

  p = nn_ops.softmax(logits)
  q = math_ops.reduce_mean(p, axis=0)
  kl = _kl_divergence(p, logits, q)
  kl.shape.assert_has_rank(1)
  log_score = math_ops.reduce_mean(kl)
  final_score = math_ops.exp(log_score)

  if logits_dtype != dtypes.float64:
    final_score = math_ops.cast(final_score, logits_dtype)
  return final_score
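The formula exp(E[KL(p(y|x) || p(y))]) from the docstring, written out as a NumPy sketch (`classifier_score_np` is a hypothetical helper for illustration, not part of the library):

import numpy as np

def classifier_score_np(logits):
    logits = logits - logits.max(axis=1, keepdims=True)
    p = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # p(y|x) per image
    q = p.mean(axis=0)                                              # marginal p(y)
    kl = (p * (np.log(p) - np.log(q))).sum(axis=1)                  # KL(p(y|x) || p(y))
    return np.exp(kl.mean())

print(classifier_score_np(np.random.randn(16, 10)))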
Example #30
  def testLargeDims(self):
    # Make sure that we properly handle large inputs. See
    # https://github.com/tensorflow/tensorflow/issues/4425 for details
    for dims in [129, 256]:
      ones = np.random.rand(dims, dims).astype(np.float32)
      np_softmax = self._npSoftmax(ones)

      for use_gpu in [True, False]:
        with self.test_session(use_gpu=use_gpu) as sess:
          x = array_ops.placeholder(dtypes.float32)
          y = nn_ops.softmax(x)
          tf_softmax = sess.run(y, feed_dict={x: ones})
        self.assertAllClose(tf_softmax, np_softmax)
Example #31
    def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
        """在 dynamic_rnn_decoder 中用于推导的解码器函数

        这个解码器函数和 attention_decoder_fn_train 中的 decoder_fn 最大的区别是,next_cell_input
        是如何计算的。在解码器函数中,我们通过在解码器输出的特征维度上使用一个 argmax 来计算下一个输入。
        这是一种 greedy-search 的方式。(Bahdanau et al., 2014) & (Sutskever et al., 2014) 使用 beam-search。

        Args:
            time: 反映当前时间步的正整型常量                     positive integer constant reflecting the current timestep.
            cell_state: RNNCell 的状态                          state of RNNCell.
            cell_input: dynamic_rnn_decoder 提供的输入          input provided by `dynamic_rnn_decoder`.
            cell_output: RNNCell的输出                          output of RNNCell.
            context_state: dynamic_rnn_decoder 提供的上下文状态  context state provided by `dynamic_rnn_decoder`.
        Returns:
            一个元组 (done, next state, next input, emit output, next context state)
            其中:
            done: 一个指示哪个句子已经达到 end_of_sequence_id 的布尔向量。
            被 dynamic_rnn_decoder 用来提早停止。当 time>maximum_length 时,
            一个所有元素都为 true 的布尔向量被返回。
            next state: `cell_state`, 这个解码器函数不修改给定的状态。
            next input: cell_output 的 argmax 的嵌入被用作 next_input
            emit output: 如果 output_fn is None,所提供的 cell_output 被返回。
                否则被用来在计算 next_input 和返回 cell_output 之前更新 cell_output。
            next context state: `context_state`, 这个解码器函数不修改给定的上下文状态。
                当使用,例如,beam search 时,上下文状态能够被修改。
        Raises:
            ValueError: if cell_input is not None.
        """
        with ops.name_scope(
                name, "attention_decoder_fn_inference",
            [time, cell_state, cell_input, cell_output, context_state]):
            if cell_input is not None:
                raise ValueError(
                    "Expected cell_input to be None, but saw: %s" % cell_input)
            if cell_output is None:
                # invariant that this is time == 0
                next_input_id = array_ops.ones([
                    batch_size,
                ], dtype=dtype) * (start_of_sequence_id)
                done = array_ops.zeros([
                    batch_size,
                ], dtype=dtypes.bool)
                cell_state = encoder_state
                cell_output = array_ops.zeros([num_decoder_symbols],
                                              dtype=dtypes.float32)
                cell_input = array_ops.gather(embeddings, next_input_id)

                # init attention
                attention = _init_attention(encoder_state)
                # init context state
                log_beam_probs = tensor_array_ops.TensorArray(
                    dtype=dtypes.float32,
                    tensor_array_name="log_beam_probs",
                    size=maximum_length,
                    dynamic_size=True,
                    infer_shape=False)
                beam_parents = tensor_array_ops.TensorArray(
                    dtype=dtypes.int32,
                    tensor_array_name="beam_parents",
                    size=maximum_length,
                    dynamic_size=True,
                    infer_shape=False)
                beam_symbols = tensor_array_ops.TensorArray(
                    dtype=dtypes.int32,
                    tensor_array_name="beam_symbols",
                    size=maximum_length,
                    dynamic_size=True,
                    infer_shape=False)
                result_probs = tensor_array_ops.TensorArray(
                    dtype=dtypes.float32,
                    tensor_array_name="result_probs",
                    size=maximum_length,
                    dynamic_size=True,
                    infer_shape=False)
                result_parents = tensor_array_ops.TensorArray(
                    dtype=dtypes.int32,
                    tensor_array_name="result_parents",
                    size=maximum_length,
                    dynamic_size=True,
                    infer_shape=False)
                result_symbols = tensor_array_ops.TensorArray(
                    dtype=dtypes.int32,
                    tensor_array_name="result_symbols",
                    size=maximum_length,
                    dynamic_size=True,
                    infer_shape=False)
                context_state = (log_beam_probs, beam_parents, beam_symbols,
                                 result_probs, result_parents, result_symbols)
            else:
                # construct attention
                attention = attention_construct_fn(cell_output, attention_keys,
                                                   attention_values)
                cell_output = attention

                # beam search decoder
                (log_beam_probs, beam_parents, beam_symbols, result_probs,
                 result_parents, result_symbols) = context_state

                cell_output = output_fn(cell_output)  # logits
                cell_output = nn_ops.softmax(cell_output)

                cell_output = array_ops.split(cell_output,
                                              [2, num_decoder_symbols - 2],
                                              1)[1]

                tmp_output = array_ops.gather(
                    cell_output,
                    math_ops.range(origin_batch) * beam_size)

                probs = control_flow_ops.cond(
                    math_ops.equal(time, ops.convert_to_tensor(1, dtype)),
                    lambda: math_ops.log(tmp_output + ops.convert_to_tensor(
                        1e-20, dtypes.float32)),
                    lambda: math_ops.log(cell_output + ops.convert_to_tensor(
                        1e-20, dtypes.float32)) + array_ops.reshape(
                            log_beam_probs.read(time - 2), [-1, 1]))

                probs = array_ops.reshape(probs, [origin_batch, -1])
                best_probs, indices = nn_ops.top_k(probs, beam_size * 2)
                #indices = array_ops.reshape(indices, [-1])
                indices_flatten = array_ops.reshape(indices, [
                    -1
                ]) + array_ops.reshape(
                    array_ops.concat([
                        array_ops.reshape(
                            math_ops.range(origin_batch) *
                            ((num_decoder_symbols - 2) * beam_size), [-1, 1])
                    ] * (beam_size * 2), 1), [origin_batch * beam_size * 2])
                best_probs_flatten = array_ops.reshape(best_probs, [-1])

                symbols = indices_flatten % (num_decoder_symbols - 2)
                symbols = symbols + 2
                parents = indices_flatten // (num_decoder_symbols - 2)

                probs_wo_eos = best_probs + 1e5 * math_ops.cast(
                    math_ops.cast(
                        (indices % (num_decoder_symbols - 2) + 2) -
                        end_of_sequence_id, dtypes.bool), dtypes.float32)

                best_probs_wo_eos, indices_wo_eos = nn_ops.top_k(
                    probs_wo_eos, beam_size)

                indices_wo_eos = array_ops.reshape(
                    indices_wo_eos, [-1]) + array_ops.reshape(
                        array_ops.concat([
                            array_ops.reshape(
                                math_ops.range(origin_batch) *
                                (beam_size * 2), [-1, 1])
                        ] * beam_size, 1), [origin_batch * beam_size])

                _probs = array_ops.gather(best_probs_flatten, indices_wo_eos)
                _symbols = array_ops.gather(symbols, indices_wo_eos)
                _parents = array_ops.gather(parents, indices_wo_eos)

                log_beam_probs = log_beam_probs.write(time - 1, _probs)
                beam_symbols = beam_symbols.write(time - 1, _symbols)
                beam_parents = beam_parents.write(time - 1, _parents)
                result_probs = result_probs.write(time - 1, best_probs_flatten)
                result_symbols = result_symbols.write(time - 1, symbols)
                result_parents = result_parents.write(time - 1, parents)

                next_input_id = array_ops.reshape(_symbols, [batch_size])

                state_size = int(cell_state[0].get_shape().with_rank(2)[1])
                attn_size = int(attention.get_shape().with_rank(2)[1])
                state = []
                for j in cell_state:
                    state.append(
                        array_ops.reshape(array_ops.gather(j, _parents),
                                          [-1, state_size]))
                cell_state = tuple(state)
                attention = array_ops.reshape(
                    array_ops.gather(attention, _parents), [-1, attn_size])

                done = math_ops.equal(next_input_id, end_of_sequence_id)
                cell_input = array_ops.gather(embeddings, next_input_id)

            # combine cell_input and attention
            next_input = array_ops.concat([cell_input, attention], 1)

            # if time > maxlen, return all true vector
            done = control_flow_ops.cond(
                math_ops.greater(time, maximum_length),
                lambda: array_ops.ones([
                    batch_size,
                ], dtype=dtypes.bool),
                lambda: array_ops.zeros([
                    batch_size,
                ], dtype=dtypes.bool))
            return (done, cell_state, next_input, cell_output,
                    (log_beam_probs, beam_parents, beam_symbols, result_probs,
                     result_parents, result_symbols))  #context_state)
Example #32
        def attention(decoder_state, temporal_e, coverage=None):
            """Calculate the context vector and attention distribution from the decoder state.

      Args:
        decoder_state: state of the decoder
        temporal_e: store previous attentions for temporal attention mechanism
        coverage: Optional. Previous timestep's coverage vector, shape (batch_size, max_enc_steps, 1, 1).

      Returns:
        context_vector: weighted sum of _enc_states
        attn_dist: attention distribution
        coverage: new coverage vector. shape (batch_size, max_enc_steps, 1, 1)
        masked_e: store the attention score for temporal attention mechanism.
      """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                decoder_features = linear(
                    decoder_state, attention_vec_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, attention_vec_size)

                # We can't have coverage with matrix attention
                if not _hps.matrix_attention and use_coverage and coverage is not None:  # non-first step of coverage
                    # Multiply coverage vector by w_c to get coverage_features.
                    coverage_features = nn_ops.conv2d(
                        coverage, w_c, [1, 1, 1, 1], "SAME"
                    )  # c has shape (batch_size, max_enc_steps, 1, attention_vec_size)
                    # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                    e_not_masked = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features +
                                          coverage_features),
                        [2, 3])  # shape (batch_size,max_enc_steps)
                    masked_e = nn_ops.softmax(
                        e_not_masked
                    ) * enc_padding_mask  # (batch_size, max_enc_steps)
                    masked_sums = tf.reduce_sum(masked_e,
                                                axis=1)  # shape (batch_size)
                    masked_e = masked_e / tf.reshape(masked_sums, [-1, 1])
                    # Equation 3 in
                    if _hps.use_temporal_attention:
                        try:
                            len_temporal_e = temporal_e.get_shape()[0].value
                        except:
                            len_temporal_e = 0
                        if len_temporal_e == 0:
                            attn_dist = masked_e
                        else:
                            masked_sums = tf.reduce_sum(
                                temporal_e, axis=0
                            ) + 1e-10  # if it's zero due to masking we set it to a small value
                            attn_dist = masked_e / masked_sums  # (batch_size, max_enc_steps)
                    else:
                        attn_dist = masked_e
                    masked_attn_sums = tf.reduce_sum(attn_dist, axis=1)
                    attn_dist = attn_dist / tf.reshape(masked_attn_sums,
                                                       [-1, 1])  # re-normalize
                    # Update coverage vector
                    coverage += array_ops.reshape(attn_dist,
                                                  [batch_size, -1, 1, 1])
                else:
                    if _hps.matrix_attention:
                        # Calculate h_d * W_attn * h_i, equation 2 in https://arxiv.org/pdf/1705.04304.pdf
                        _dec_attn = tf.unstack(
                            tf.matmul(
                                tf.squeeze(decoder_features, axis=[1, 2]),
                                w_attn),
                            axis=0)  # batch_size * (attention_vec_size)
                        _enc_states_lst = tf.unstack(
                            tf.squeeze(_enc_states, axis=2), axis=0
                        )  # batch_size * (max_enc_steps, attention_vec_size)

                        e_not_masked = tf.squeeze(
                            tf.stack([
                                tf.matmul(tf.reshape(_dec, [1, -1]),
                                          tf.transpose(_enc)) for _dec, _enc in
                                zip(_dec_attn, _enc_states_lst)
                            ]),
                            axis=1)  # (batch_size, max_enc_steps)
                        masked_e = tf.exp(
                            e_not_masked *
                            enc_padding_mask)  # (batch_size, max_enc_steps)
                    else:
                        # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                        e_not_masked = math_ops.reduce_sum(
                            v *
                            math_ops.tanh(encoder_features + decoder_features),
                            [2, 3])  # calculate e, (batch_size, max_enc_steps)
                        masked_e = nn_ops.softmax(
                            e_not_masked
                        ) * enc_padding_mask  # (batch_size, max_enc_steps)
                        masked_sums = tf.reduce_sum(
                            masked_e, axis=1)  # shape (batch_size)
                        masked_e = masked_e / tf.reshape(masked_sums, [-1, 1])
                    if _hps.use_temporal_attention:
                        try:
                            len_temporal_e = temporal_e.get_shape()[0].value
                        except:
                            len_temporal_e = 0
                        if len_temporal_e == 0:
                            attn_dist = masked_e
                        else:
                            masked_sums = tf.reduce_sum(
                                temporal_e, axis=0
                            ) + 1e-10  # if it's zero due to masking we set it to a small value
                            attn_dist = masked_e / masked_sums  # (batch_size, max_enc_steps)
                    else:
                        attn_dist = masked_e
                    # Calculate attention distribution
                    masked_attn_sums = tf.reduce_sum(attn_dist, axis=1)
                    attn_dist = attn_dist / tf.reshape(masked_attn_sums,
                                                       [-1, 1])  # re-normalize

                    if use_coverage:  # first step of training
                        coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2),
                                                  2)  # initialize coverage

                # Calculate the context vector from attn_dist and _enc_states
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    _enc_states, [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist, coverage, masked_e
Example #33
 def masked_attention(e):
     """Take softmax of e then apply enc_padding_mask and re-normalize"""
     attn_dist = nn_ops.softmax(e)  # take softmax. shape (batch_size, attn_length)
     attn_dist *= enc_padding_mask  # apply mask
     masked_sums = tf.reduce_sum(attn_dist, axis=1)  # shape (batch_size)
     return attn_dist / tf.reshape(masked_sums, [-1, 1])  # re-normalize
Example #34
        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, 1)
            for a in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a,
                                                   dtype=dtype):
                    attention_vec_size = attn_size  # Size of query vectors for attention.
                    # to calculate wp * ht
                    v_p = variable_scope.get_variable("AttnV_p%d" % a,
                                                      [attention_vec_size])
                    qiu = linear(query, attention_vec_size, True)
                    qiu = array_ops.reshape(qiu,
                                            [-1, 1, 1, attention_vec_size])
                    tan_v = math_ops.reduce_sum(v_p * math_ops.tanh(qiu),
                                                [2, 3])
                    # print(tan_v.get_shape())
                    pt_sig = math_ops.sigmoid(tan_v)
                    # print(pt_sig.get_shape())
                    p = attn_length * pt_sig
                    # print(p.get_shape())
                    # p_t = (array_ops.reshape(p, [-1, attn_length]))
                    p_t = math_ops.cast(p, dtype=dtypes.int32)
                    p_t = math_ops.cast(p_t, dtype=dtypes.float32)
                    # print(p_t.get_shape())
                    # print(4)

                    # To calculate W1 * hi we use a 1-by-1 convolution, need to reshape before.
                    hidden = array_ops.reshape(attention_states,
                                               [-1, attn_length, 1, attn_size])
                    k = variable_scope.get_variable(
                        "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
                    hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1],
                                                    "SAME")
                    v = variable_scope.get_variable("AttnV_%d" % a,
                                                    [attention_vec_size])

                with variable_scope.variable_scope("Attention_l_%d" % a,
                                                   dtype=dtype):
                    # w2 * ht
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v * math_ops.tanh(hidden_features + y), [2, 3])
                    ai = nn_ops.softmax(s)
                    ai = tf.reshape(ai, [-1, attn_length, 1])
                    # print(5,ai.get_shape())

                    # do the p_t part
                    extent = tf.ones([1, attn_length], dtype=dtypes.float32)
                    p_t = p_t * extent
                    p_t = tf.reshape(p_t, [-1, attn_length, 1])
                    # print (p_t.get_shape())

                    pos = [i for i in xrange(attn_length)]
                    pos = tf.reshape(pos, [attn_length, 1])
                    pos = math_ops.cast(pos, dtype=dtypes.float32)
                    # print((p_t-pos).get_shape(),"jing")

                    value = math_ops.square(p_t - pos) * 2 / (attn_local_D *
                                                              attn_local_D)
                    pre = math_ops.exp(math_ops.negative(value))
                    # print(pre.get_shape(),"qiu")
                    ai = ai * pre

                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(ai, [-1, attn_length, 1, 1]) *
                        hidden, [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds
Example #35
        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, 1)
            for a in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a,
                                                   dtype=dtype):
                    attention_vec_size = attn_size  # Size of query vectors for attention.
                    # to calculate wp * ht
                    v_p = variable_scope.get_variable("AttnV_p%d" % a,
                                                      [attention_vec_size])
                    qiu = linear(query, attention_vec_size, True)
                    qiu = array_ops.reshape(
                        qiu, [batch_size, 1, 1, attention_vec_size])
                    tan_v = math_ops.reduce_sum(v_p * math_ops.tanh(qiu),
                                                [2, 3])
                    # print(tan_v.get_shape())
                    pt_sig = math_ops.sigmoid(tan_v)
                    # print(pt_sig.get_shape())
                    p = attn_length * pt_sig
                    # print(p.get_shape())
                    # p_t = (array_ops.reshape(p, [-1, attn_length]))
                    p_t = math_ops.cast(p, dtype=dtypes.int32)
                    p_t = math_ops.cast(p_t, dtype=dtypes.float32)
                    # print(p_t.get_shape())
                    # print(4)
                    # p_t=tf.convert_to_tensor(p_t)

                    #print(p_t.shape, attention_states.shape)

                    # set a window
                    p_t = array_ops.reshape(p_t, [
                        batch_size,
                    ])
                    attention_states_windows = []
                    D = attn_local_D
                    for i in range(attention_states.shape[0]):
                        x = tf.constant(D, dtype=dtypes.float32)
                        y = math_ops.cast(p_t[i], dtype=dtypes.float32)
                        z = tf.constant(attn_length, dtype=dtypes.float32)

                        def f1():
                            return tf.constant(
                                0, dtype=dtypes.int32), math_ops.cast(
                                    D - p_t[i], dtype=dtypes.int32)

                        def f2():
                            return math_ops.cast(
                                p_t[i] - D, dtype=dtypes.int32), tf.constant(
                                    0, dtype=dtypes.int32)

                        def f3():
                            return tf.constant(
                                attn_length,
                                dtype=dtypes.int32), math_ops.cast(
                                    p_t[i] + D + 1 - attn_length,
                                    dtype=dtypes.int32)

                        def f4():
                            return math_ops.cast(
                                p_t[i] + D + 1,
                                dtype=dtypes.int32), tf.constant(
                                    0, dtype=dtypes.int32)

                        begin, pre_num = tf.cond(tf.less(x, y), f2, f1)
                        end, last_num = tf.cond(tf.less(y + D + 1, z), f4, f3)

                        d = tf.constant(attn_fixed_length, dtype=dtypes.int32)
                        #num = tf.cond(tf.less(end - begin, d), f5, f6)
                        pre_tmp = tf.zeros([pre_num, attention_vec_size],
                                           dtype=dtypes.float32)
                        last_tmp = tf.zeros([last_num, attention_vec_size],
                                            dtype=dtypes.float32)
                        #tmp = tf.zeros([num, attention_vec_size], dtype=dtypes.float32)
                        attention_states_window = math_ops.cast(
                            attention_states[i][begin:end],
                            dtype=dtypes.float32)
                        attention_states_window = tf.concat(
                            [pre_tmp, attention_states_window], 0)
                        attention_states_window = tf.concat(
                            [attention_states_window, last_tmp], 0)
                        attention_states_window = tf.expand_dims(
                            attention_states_window, 0)
                        attention_states_windows.append(
                            attention_states_window)

                    attention_states_windows = tf.concat(
                        attention_states_windows, 0)
                    attention_states_windows = array_ops.reshape(
                        attention_states_windows,
                        [batch_size, attn_fixed_length, attention_vec_size])
                    # print(attention_states_windows.shape)

                    # To calculate W1 * hi we use a 1-by-1 convolution, need to reshape before.
                    hidden = array_ops.reshape(
                        attention_states_windows,
                        [batch_size, attn_fixed_length, 1, attn_size])
                    k = variable_scope.get_variable(
                        "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
                    hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1],
                                                    "SAME")
                    v = variable_scope.get_variable("AttnV_%d" % a,
                                                    [attention_vec_size])

                with variable_scope.variable_scope("Attention_l_%d" % a,
                                                   dtype=dtype):
                    # w2 * ht
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(
                        y, [batch_size, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v * math_ops.tanh(hidden_features + y), [2, 3])
                    ai = nn_ops.softmax(s)
                    ai = tf.reshape(ai, [batch_size, attn_fixed_length, 1])
                    # print(5,ai.get_shape())

                    # do the p_t part
                    p_t = array_ops.reshape(p_t, [batch_size, 1])
                    extent = tf.ones([1, attn_fixed_length],
                                     dtype=dtypes.float32)
                    p_t = p_t * extent
                    p_t = tf.reshape(p_t, [batch_size, attn_fixed_length, 1])
                    # print (p_t.get_shape())

                    pos = [i for i in xrange(attn_fixed_length)]
                    pos = tf.reshape(pos, [attn_fixed_length, 1])
                    pos = math_ops.cast(pos, dtype=dtypes.float32)
                    # print((p_t-pos).get_shape(),"jing")

                    value = math_ops.square(p_t - pos) * 2 / (D * D)
                    pre = math_ops.exp(math_ops.negative(value))
                    # print(pre.get_shape(),"qiu")
                    ai = ai * pre

                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(
                            ai, [batch_size, attn_fixed_length, 1, 1]) *
                        hidden, [1, 2])
                    ds.append(array_ops.reshape(d, [batch_size, attn_size]))
            return ds
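The snippet above weights the attention distribution by a Gaussian window centered on the predicted position p_t (as in local attention). A minimal NumPy sketch of that weighting, with purely illustrative values for D, p_t and the number of source positions:

import numpy as np

D = 2.0                                   # window half-width (illustrative)
p_t = 5.0                                 # predicted centre position (illustrative)
pos = np.arange(10, dtype=np.float32)     # source positions 0..9

# same formula as above: exp(-(p_t - pos)^2 * 2 / D^2)
weight = np.exp(-np.square(p_t - pos) * 2.0 / (D * D))
# positions near p_t keep most of their attention weight; distant ones decay toward 0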
Пример #36
0
 def masked_attention(e, enc_padding_mask):
   attn_dist = nn_ops.softmax(e)          # softmax over the raw attention scores
   attn_dist *= enc_padding_mask          # zero out attention on padding positions
   attn_dist += 1e-10                     # avoid division by zero below
   masked_sums = tf.reduce_sum(attn_dist, axis=1)
   return attn_dist / tf.reshape(masked_sums, [-1, 1])  # re-normalize so each row sums to 1
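A tiny usage sketch for the masking pattern above (illustrative values; the mask is assumed to be 1.0 on real tokens and 0.0 on padding, and masked_attention is assumed to be in scope):

import tensorflow as tf

e = tf.constant([[2.0, 1.0, 0.0, 0.0]])                 # raw scores; the last two positions are padding
enc_padding_mask = tf.constant([[1.0, 1.0, 0.0, 0.0]])
attn_dist = masked_attention(e, enc_padding_mask)        # probability mass only on the two real tokens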
Пример #37
0
  def inference_graph(self, data, data_spec=None):
    """Returns the op that performs inference on a batch of data."""

    return nn_ops.softmax(self._base_inference(data, data_spec=data_spec))
Пример #38
0
 def f(x):
     assert x.dtype == dtypes.float32
     with backprop.GradientTape() as tape:
         tape.watch(x)
         y = nn_ops.softmax(x)
     return tape.gradient(y, x)
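A quick way to exercise the function above, assuming eager execution and that f is in scope; since the softmax outputs sum to 1, the gradient of their sum is identically zero:

from tensorflow.python.framework import constant_op, dtypes

x = constant_op.constant([1., 2., 3.], dtype=dtypes.float32)
g = f(x)  # gradient of sum(softmax(x)) w.r.t. x -- expected to be all zeros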
Пример #39
0
 def testDimTooLarge(self):
     with self.test_session():
         with self.assertRaises(errors_impl.InvalidArgumentError):
             nn_ops.softmax([1., 2., 3., 4.], dim=100).eval()
Пример #40
0
 def testInvalidAxis(self):
     # Test case for GitHub issue 22793.
     with self.cached_session():
         ones = array_ops.ones(shape=[2, 3])
         with self.assertRaises(errors_impl.InvalidArgumentError):
             nn_ops.softmax(ones, axis=2).eval()
Пример #41
0
  def head_pieces(head, mem_size, num_shifts=3, axis=1):
    '''
    Several activation functions are applied to the output of the LSTM or
    feed-forward controller; this method performs the necessary operations
    to produce the shift vector, interpolation gate, sharpening factor, key,
    and beta for the read/write operations. It also produces the add and
    erase vectors for modifying the memory matrix. This method is used
    outside of the class as well, which is why it's static.

    Arguments:
      head - Tensor of the raw output of the controller network.
      mem_size - Tuple of integers stating the size of the memory (NxM).
      num_shifts - Integer that determines the magnitude and direction of
        possible shifts for the read and write heads.
      axis - The axis of 'head' where splitting should occur. This is used
        for instances when 'head' is a rank 2 or rank 3 tensor. The default
        value is 1.
        (This should be eliminated in favor of splitting on the last axis
        of the tensor... it can probably be changed to '-1' without problems.)
    '''
    num_slots, num_bits = mem_size
    _ = num_slots
    #center = int(num_shifts/2.)
    shift_bias = np.zeros(num_shifts)
    #shift_bias[center] = 2.5 # Temporarily commented out for regression
                              # testing with NP implementation.
    #print(write_head_raw.get_shape(), read_head_raw.get_shape())

    # Number of elements in the read/write heads, respectively.
    splits = [num_bits+num_shifts+3, 3*num_bits+num_shifts+3]
    read_head_raw, write_head_raw = array_ops.split(head, splits,
                                                    axis=axis)

    write_splits = [num_bits, num_shifts, 1, 1, 1, num_bits, num_bits]
    read_splits = [num_bits, num_shifts, 1, 1, 1]
    write_pieces = array_ops.split(write_head_raw, write_splits, axis=axis)
    read_pieces = array_ops.split(read_head_raw, read_splits, axis=axis)

    key_w, shift_w, gamma_w, beta_w, g_w, add_w, erase_w = write_pieces

    # Multiple operations are applied to the pieces of the write head,
    # see the original paper or this project's writeup for the breakdown.
    shift_w = nn_ops.softmax(shift_w + shift_bias)
    gamma_w = gen_math_ops.minimum(nn_ops.softplus(gamma_w) + 1, 21.)
    beta_w = nn_ops.softplus(beta_w)
    g_w = math_ops.sigmoid(g_w)
    add_w = math_ops.sigmoid(add_w)
    erase_w = math_ops.sigmoid(erase_w)

    key_r, shift_r, gamma_r, beta_r, g_r = read_pieces

    # Operations applied to the pieces of the read head.
    shift_r = nn_ops.softmax(shift_r + shift_bias)
    gamma_r = gen_math_ops.minimum(nn_ops.softplus(gamma_r) + 1, 21.)
    beta_r = nn_ops.softplus(beta_r)
    g_r = math_ops.sigmoid(g_r)

    write_head = (key_w, shift_w, gamma_w, beta_w, g_w, add_w, erase_w)
    read_head = (key_r, shift_r, gamma_r, beta_r, g_r)

    return write_head, read_head
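A quick sanity check of the split sizes used above (the memory width and shift count below are illustrative, not taken from the source):

num_bits, num_shifts = 20, 3                       # illustrative memory width M and shift count
read_len = num_bits + num_shifts + 3               # key, shift, gamma, beta, g        -> 26
write_len = 3 * num_bits + num_shifts + 3          # ... plus add and erase vectors    -> 66
assert read_len == sum([num_bits, num_shifts, 1, 1, 1])
assert write_len == sum([num_bits, num_shifts, 1, 1, 1, num_bits, num_bits])
# so `head` is expected to carry read_len + write_len = 92 units along `axis`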
Пример #42
0
    def _attention(output_sequence, output_target, mode):
        """ Get context vector based on attention and
        weighted output target by context vector derived by attention mechanism.

        :param output_sequence: tensor shaped (batch, attention window, hidden unit num)
        :param output_target: tensor shaped (batch, hidden unit num)
        :param mode: None for basic one, `kvp` for key-value-predict attention
        :return: new output, context vector derived by attention mechanism
        """

        n_window, n_hidden = output_sequence.shape.as_list()[1:]

        if mode is None or mode == "k":  # basic attention
            os_k = os_v = os_p = output_sequence
            ot_k = ot_v = ot_p = output_target
        elif mode == "kv":  # key-value attention
            os_k, os_v = array_ops.split(value=output_sequence,
                                         num_or_size_splits=2,
                                         axis=2)
            ot_k, ot_v = array_ops.split(value=output_target,
                                         num_or_size_splits=2,
                                         axis=1)
            ot_p, os_p = ot_v, os_v
            if n_hidden % 2 != 0:
                raise ValueError("for `kv` mode, `n_hidden` should be even.")
            n_hidden = int(n_hidden / 2)
        elif mode == "kvp":  # key-value-prediction attention
            os_k, os_v, os_p = array_ops.split(value=output_sequence,
                                               num_or_size_splits=3,
                                               axis=2)
            ot_k, ot_v, ot_p = array_ops.split(value=output_target,
                                               num_or_size_splits=3,
                                               axis=1)
            if n_hidden % 3 != 0:
                raise ValueError(
                    "for `kvp` mode, `n_hidden` should be able to be divided by 3."
                )
            n_hidden = int(n_hidden / 3)
        else:
            raise ValueError("unknown mode")

        with vs.variable_scope("context_vector"):
            a = []  # alpha of attention mechanism
            w_h = vs.get_variable("w_h", shape=[n_hidden,
                                                n_hidden])  # weight for target
            w_y = vs.get_variable("w_y",
                                  shape=[n_hidden,
                                         n_hidden])  # weight for sequence
            w = vs.get_variable("w", shape=[n_hidden,
                                            1])  # weight for attention

        logit_h = math_ops.matmul(ot_k, w_h)  # (batch, hidden)

        for n_w in range(n_window):
            logit_y = math_ops.matmul(os_k[:, n_w, :], w_y)  # (batch, hidden)
            logit = logit_h + logit_y
            m = math_ops.tanh(logit)  # M of attention mechanism
            a.append(math_ops.matmul(m, w))  # (batch, 1)

        a = nn_ops.softmax(array_ops.stack(a, axis=1))  # (batch, window, 1)
        r = math_ops.reduce_sum(os_v * a,
                                axis=1)  # context vector (batch, hidden)

        with vs.variable_scope(
                "weighted_output"):  # derive attention weighted output
            w_h = vs.get_variable("w_h",
                                  shape=[n_hidden, n_hidden
                                         ])  # weight for original target
            w_r = vs.get_variable("w_r", shape=[n_hidden, n_hidden
                                                ])  # weight for context vector

        logit = math_ops.matmul(ot_p, w_h) + math_ops.matmul(r, w_r)
        output = math_ops.tanh(logit)
        # new output (batch, hidden or hidden/2 (kv) or hidden/3 (kvp))
        return output, r, n_hidden
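A small shape sketch for the `kvp` split described in the docstring above (sizes are hypothetical; `n_hidden` must be divisible by 3):

import numpy as np

batch, window, n_hidden = 4, 7, 12                 # illustrative sizes
output_sequence = np.zeros([batch, window, n_hidden], dtype=np.float32)
output_target = np.zeros([batch, n_hidden], dtype=np.float32)

os_k, os_v, os_p = np.split(output_sequence, 3, axis=2)   # key / value / predict parts
ot_k, ot_v, ot_p = np.split(output_target, 3, axis=1)
assert os_k.shape == (batch, window, n_hidden // 3) and ot_k.shape == (batch, n_hidden // 3)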
Пример #43
0
    def create_model(self):
        keep_prob = tf.placeholder(name='keep_prob', dtype=tf.float32)
        answer = tf.placeholder(dtype=tf.int64, shape=[None], name="answer")
        query = tf.placeholder(dtype=tf.int64,
                               shape=[None, self.dataset.query_max_len],
                               name="query")
        document = tf.placeholder(dtype=tf.int64,
                                  shape=[None, self.dataset.doc_max_len],
                                  name="document")
        alterative = tf.placeholder(dtype=tf.int64,
                                    shape=[None, 3, self.dataset.alt_max_len],
                                    name="alternative")
        if self.args.use_char_embedding:
            q_input_char = tf.placeholder(dtype=tf.int32,
                                          shape=[
                                              None, self.dataset.query_max_len,
                                              self.dataset.q_char_len
                                          ],
                                          name='query_char')
            d_input_char = tf.placeholder(dtype=tf.int32,
                                          shape=[
                                              None, self.dataset.doc_max_len,
                                              self.dataset.d_char_len
                                          ],
                                          name='document_char')
            doc_char_length = tf.reduce_sum(tf.sign(tf.abs(d_input_char)),
                                            axis=-1)
            query_char_length = tf.reduce_sum(tf.sign(tf.abs(q_input_char)),
                                              axis=-1)
            doc_char_mask = tf.sequence_mask(doc_char_length,
                                             maxlen=self.dataset.d_char_len,
                                             dtype=tf.float32)
            query_char_mask = tf.sequence_mask(query_char_length,
                                               maxlen=self.dataset.q_char_len,
                                               dtype=tf.float32)

        doc_length = tf.reduce_sum(tf.sign(tf.abs(document)), axis=-1)
        query_length = tf.reduce_sum(tf.sign(tf.abs(query)), axis=-1)
        alt_length = tf.reduce_sum(tf.sign(tf.abs(alterative)), axis=-1)

        alt_mask = tf.sequence_mask(alt_length,
                                    maxlen=self.dataset.alt_max_len,
                                    dtype=tf.float32)
        init_embedding = tf.constant(self.embedding_matrix,
                                     dtype=tf.float32,
                                     name="embedding_init")
        embedding_matrix = tf.get_variable("embedding_matrix",
                                           initializer=init_embedding,
                                           dtype=tf.float32,
                                           trainable=False)
        # embedding_matrix = tf.get_variable("embedding_matrix", shape = [self.dataset.word2id_size, self.args.embedding_dim], dtype = tf.float32)

        if self.args.rnn_type.lower() == 'modified':
            CELL = ModifiedRNNCell
        elif self.args.rnn_type.lower() == 'lstm':
            CELL = LSTMCell
        elif self.args.rnn_type.lower() == 'gru':
            CELL = GRUCell
        elif self.args.rnn_type.lower() == 'vanilla':
            CELL = VanillaRNNCell
        elif self.args.rnn_type.lower() == 'indrnn':
            CELL = IndRNNCell
        else:
            raise NotImplementedError(
                "No rnn_type named : %s implemented. Check." %
                self.args.rnn_type)

        if self.args.activation == 'sigmoid':
            activation = math_ops.sigmoid
        elif self.args.activation == 'relu':
            activation = nn_ops.relu
        elif self.args.activation == 'tanh':
            activation = math_ops.tanh
        elif self.args.activation == 'log':
            activation = math_ops.log
        elif self.args.activation == 'sin':
            activation = math_ops.sin
        elif self.args.activation == 'none':
            activation = lambda yy: yy
        else:
            raise NotImplementedError(
                "No activation named : %s implemented. Check." %
                self.args.activation)

        if self.args.use_char_embedding:
            char_embedding = tf.get_variable(name='char_embdding_matrix',
                                             shape=[
                                                 self.dataset.char2id_size,
                                                 self.args.char_embedding_dim
                                             ],
                                             dtype=tf.float32,
                                             trainable=True)
            q_char_embed = tf.nn.embedding_lookup(char_embedding, q_input_char)
            d_char_embed = tf.nn.embedding_lookup(char_embedding, d_input_char)
            q_char_embed = tf.nn.dropout(tf.reduce_max(q_char_embed, -1),
                                         keep_prob=keep_prob)
            d_char_embed = tf.nn.dropout(tf.reduce_max(d_char_embed, -1),
                                         keep_prob=keep_prob)
            with tf.variable_scope('char_rnn', reuse=tf.AUTO_REUSE) as scp:
                # q_char_embed = tf.reshape(q_char_embed, [-1, self.dataset.query_max_len * self.dataset.q_char_len, self.args.char_embedding_dim])
                # d_char_embed = tf.reshape(d_char_embed, [-1, self.dataset.doc_max_len * self.dataset.d_char_len, self.args.char_embedding_dim])

                char_rnn_f = MultiRNNCell(cells=[
                    DropoutWrapper(CELL(num_units=self.args.char_hidden_size,
                                        activation=activation),
                                   output_keep_prob=keep_prob)
                ])
                char_rnn_b = MultiRNNCell(cells=[
                    DropoutWrapper(CELL(num_units=self.args.char_hidden_size,
                                        activation=activation),
                                   output_keep_prob=keep_prob)
                ])

                d_char_embed_out, _ = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=char_rnn_f,
                    cell_bw=char_rnn_b,
                    inputs=d_char_embed,
                    sequence_length=tf.reduce_sum(
                        tf.sign(tf.abs(doc_char_length)), -1),
                    initial_state_bw=None,
                    dtype="float32",
                    parallel_iterations=None,
                    swap_memory=True,
                    time_major=False,
                    scope='char_rnn')
                q_char_embed_out, _ = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=char_rnn_f,
                    cell_bw=char_rnn_b,
                    inputs=q_char_embed,
                    sequence_length=tf.reduce_sum(
                        tf.sign(tf.abs(query_char_length)), -1),
                    initial_state_bw=None,
                    dtype="float32",
                    parallel_iterations=None,
                    swap_memory=True,
                    time_major=False,
                    scope='char_rnn')

            # with tf.variable_scope('char_conv', reuse = tf.AUTO_REUSE) as scp:
            #     # q_char_embed = tf.transpose(q_char_embed, perm = [0, 2, 3, 1])  # [batch, height, width, channels]
            #     filter = tf.get_variable('q_filter_w',
            #                              shape = [1, 5, self.args.char_hidden_size,
            #                                       32])  # [filter_height, filter_width, in_channels, out_channels]
            #     cnned_char = tf.nn.conv2d(q_char_embed, filter, strides = [1, 1, 1, 1], padding = 'VALID', use_cudnn_on_gpu = True, data_format = "NHWC",
            #                               name = None)  # [B, (char_len-filter_size/stride), (word_len-filter_size/stride), d_len]
            #
            #     q_char_embed_out = tf.nn.max_pool(cnned_char, ksize = [1, 1, 5, 1], strides = [1, 1, 1, 1], padding = 'VALID',
            #                                       data_format = "NHWC",
            #                                       name = None)
            #
            #     char_out_size = q_char_embed_out.get_shape().as_list()[-1] * q_char_embed_out.get_shape().as_list()[-2]
            #     q_char_embed_out = tf.reshape(q_char_embed_out,
            #                                   shape = [-1, self.dataset.query_max_len, char_out_size])
            #
            #     # d_char_embed = tf.transpose(d_char_embed, perm = [0, 2, 3, 1])  # [batch, height, width, channels]
            #     filter = tf.get_variable('d_filter_w',
            #                              shape = [1, 5, self.args.char_hidden_size,
            #                                       32])  # [filter_height, filter_width, in_channels, out_channels]
            #     cnned_char = tf.nn.conv2d(d_char_embed, filter, strides = [1, 1, 1, 1], padding = 'VALID', use_cudnn_on_gpu = True,
            #                               data_format = "NHWC",
            #                               name = None)  # [B, (char_len-filter_size/stride), (word_len-filter_size/stride), d_len]
            #
            #     d_char_embed_out = tf.nn.max_pool(cnned_char, ksize = [1, 1, 5, 1], strides = [1, 1, 1, 1], padding = 'VALID',
            #                                       data_format = "NHWC",
            #                                       name = None)
            #     char_out_size = d_char_embed_out.get_shape().as_list()[-1] * d_char_embed_out.get_shape().as_list()[-2]
            #     d_char_embed_out = tf.reshape(d_char_embed_out,
            #                                   shape = [-1, self.dataset.doc_max_len, char_out_size])
            #
            #     # d_char_embed_out = tf.reshape(d_char_embed_out, shape = [-1, self.dataset.doc_max_len, char_out_size])
            d_char_out = tf.concat(d_char_embed_out, -1)
            q_char_out = tf.concat(q_char_embed_out, -1)
            # d_char_out = tf.reduce_max(d_char_embed * tf.expand_dims(doc_char_mask, -1), -1)
            # q_char_out = tf.reduce_max(q_char_embed * tf.expand_dims(query_char_mask, -1), -1)

        with tf.variable_scope("query_encoder") as scp:
            query_embed = tf.nn.embedding_lookup(embedding_matrix,
                                                 query,
                                                 max_norm=1.)
            if self.args.use_char_embedding:
                query_embed = tf.concat([query_embed, q_char_out], -1)

            query_inputs = tf.nn.relu(query_embed)
            query_last_states_concat = list()
            query_outputs_concat = list()
            for i in range(self.args.num_layers):
                query_inputs = tf.nn.relu(query_inputs)
                cell_fw = MultiRNNCell([
                    CELL(num_units=self.args.hidden_size,
                         activation=activation,
                         name='rnn_fw_%d' % i)
                ])
                cell_bw = MultiRNNCell([
                    CELL(num_units=self.args.hidden_size,
                         activation=activation,
                         name='rnn_bw_%d' % i)
                ])

                query_outputs, query_last_states = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    inputs=query_inputs,
                    sequence_length=query_length,
                    initial_state_fw=None,
                    initial_state_bw=None,
                    dtype=tf.float32,
                    parallel_iterations=None,
                    swap_memory=True,
                    time_major=False,
                    scope=None)
                query_output_con = tf.concat(query_outputs, -1)
                # self_att_w = tf.get_variable('self_att_w_%d' % i, shape = [query_output_con.get_shape()[-1], 1])
                # att = nn_ops.softmax(tf.squeeze(special_math_ops.einsum('bij,jk->bik', query_output_con, self_att_w), -1), -1)
                # query_output_con = query_output_con * tf.expand_dims(att, -1)

                query_last_states_concat.extend(query_last_states)
                query_outputs_concat.extend(query_outputs)
                query_inputs = tf.concat([query_embed, query_output_con], -1)

            query_outputs = tf.concat(query_outputs_concat, axis=-1)
            query_last_states = tf.concat(query_last_states_concat, axis=-1)
            query_last_states = tf.reshape(
                query_last_states,
                shape=[
                    -1,
                    query_last_states.get_shape()[0] *
                    query_last_states.get_shape()[2]
                ])
            query_outputs_dropped = tf.nn.dropout(query_outputs,
                                                  keep_prob=keep_prob)
            query_last_states_dropped = query_last_states
            query_outputs_max = math_ops.reduce_max(query_outputs_dropped,
                                                    axis=-2)

            query_encoded = query_outputs_max
            query_encoded = tf.nn.dropout(query_encoded, keep_prob=keep_prob)

        with tf.variable_scope('doc_encoder') as scp:
            doc_embed = tf.nn.embedding_lookup(embedding_matrix,
                                               document,
                                               max_norm=1.)
            if self.args.use_char_embedding:
                doc_embed = tf.concat([doc_embed, d_char_out], -1)
            qry_encoded_dupli = tf.tile(
                tf.expand_dims(query_encoded, 1),
                multiples=[1, self.dataset.doc_max_len, 1])
            # doc_embed = tf.nn.dropout(tf.concat([doc_embed, qry_encoded_dupli], -1), keep_prob = keep_prob)

            doc_inputs = tf.nn.dropout(tf.concat(
                [doc_embed, qry_encoded_dupli], -1),
                                       keep_prob=keep_prob)
            doc_outputs_concat = list()
            doc_last_states_concat = list()
            for i in range(self.args.num_layers):
                doc_inputs = nn_ops.relu(doc_inputs)
                cell_fw = MultiRNNCell([
                    CELL(num_units=self.args.hidden_size,
                         activation=activation,
                         name='rnn_fw_%d' % i)
                ])
                cell_bw = MultiRNNCell([
                    CELL(num_units=self.args.hidden_size,
                         activation=activation,
                         name='rnn_bw_%d' % i)
                ])
                doc_outputs, doc_last_states = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    inputs=doc_inputs,
                    sequence_length=doc_length,
                    initial_state_fw=None,
                    initial_state_bw=None,
                    dtype=tf.float32,
                    parallel_iterations=None,
                    swap_memory=True,
                    time_major=False,
                    scope=None)

                doc_output_con = tf.concat(doc_outputs, -1)
                # self attention
                # self_att_w = tf.get_variable('self_att_w_%d' % i, shape = [doc_output_con.get_shape()[-1], 1])
                # att = nn_ops.softmax(tf.squeeze(special_math_ops.einsum('bij,jk->bik', doc_output_con, self_att_w), -1), -1)
                # doc_output_con = tf.concat(doc_outputs, -1) * tf.expand_dims(att, -1)

                # AOA atted
                att = tf.nn.softmax(
                    tf.squeeze(
                        special_math_ops.einsum(
                            'bij,bjk->bik', tf.concat(doc_outputs, -1),
                            tf.expand_dims(
                                tf.concat([
                                    query_last_states_concat[2 * i][0],
                                    query_last_states_concat[2 * i + 1][0]
                                ], -1), -1)), -1), -1)
                doc_output_con = tf.concat(doc_outputs, -1) * tf.expand_dims(
                    att, -1)

                doc_outputs_concat.extend(doc_outputs)
                doc_last_states_concat.extend(doc_last_states)
                doc_inputs = tf.concat([doc_embed, doc_output_con], -1)

            # ELMo s^{task}_j
            # doc_outputs_concat = [tf.expand_dims(dd, 1) for dd in doc_outputs_concat]
            # layer_norm_w = tf.get_variable(name = "layer_norm_w", shape = [self.args.num_layers * 2, 1, 1])
            # layer_norm_w = tf.nn.softmax(layer_norm_w)
            # doc_outputs = tf.concat(doc_outputs_concat, axis = 1) * layer_norm_w
            # doc_outputs = tf.reshape(doc_outputs, shape = [-1, doc_outputs.get_shape()[2], doc_outputs.get_shape()[1] * doc_outputs.get_shape()[-1]])
            doc_outputs = tf.concat(doc_outputs_concat, axis=-1)
            doc_last_states = tf.concat(doc_last_states_concat, axis=-1)
            doc_last_states = tf.reshape(doc_last_states,
                                         shape=[
                                             -1,
                                             doc_last_states.get_shape()[0] *
                                             doc_last_states.get_shape()[2]
                                         ])
            doc_last_states_dropped = tf.nn.dropout(doc_last_states,
                                                    keep_prob=keep_prob)

            doc_encoded = doc_outputs
        with tf.variable_scope("attention") as scp:
            bi_att_w = tf.get_variable('bi_att_w',
                                       shape=[
                                           doc_encoded.get_shape()[-1],
                                           query_encoded.get_shape()[-1]
                                       ])
            doc_out_query_last_att = nn_ops.softmax(tf.squeeze(
                math_ops.matmul(
                    special_math_ops.einsum('bij,jk->bik', doc_encoded,
                                            bi_att_w),
                    tf.expand_dims(query_encoded, axis=-1)), -1),
                                                    axis=-1)
            # AOA
            # att = nn_ops.softmax(tf.reduce_sum(tf.einsum('bij,bjk->bik', doc_encoded, tf.transpose(query_outputs, perm = [0, 2, 1])), -1), -1)
            # vanilla
            att = doc_out_query_last_att
            doc_atted = doc_encoded * tf.expand_dims(att, -1)  # B * D * 2H
            doc_atted_max = math_ops.reduce_max(doc_atted, axis=-2)

        with tf.variable_scope("alt_encoder", reuse=tf.AUTO_REUSE) as scp:
            alter_embed = embedding_ops.embedding_lookup(embedding_matrix,
                                                         alterative,
                                                         max_norm=1.)
            # alter_embed_sumed = tf.reduce_max(alter_embed * tf.expand_dims(alt_mask, -1), axis = -2)
            # alter_w = tf.get_variable('alter_w', shape = [self.args.embedding_dim, doc_atted.get_shape()[-1]])
            # alter_b = tf.get_variable('alter_b', shape = [doc_atted.get_shape()[-1]])
            # alter_embed_wxb = special_math_ops.einsum('bij,jk->bik', alter_embed_sumed, alter_w) + alter_b
            # # alter_embed_wxb = alter_embed_wxb * tf.expand_dims(alt_mask, -1)
            # # B * 3 * 2H
            # alter_encoded = alter_embed_wxb
            # alter_encoded = tf.transpose(alter_encoded, perm = [0, 2, 1])
            #
            num_layers = self.args.num_layers
            alt_last_states_concat = list()
            alt_outputs_concat = list()
            for j in range(3):
                alt_last_states_concat_tmp = list()
                alt_outputs_concat_tmp = list()
                alter_input = alter_embed[:, j]
                for i in range(num_layers):
                    alter_input = tf.nn.relu(alter_input)
                    cell_fw = MultiRNNCell([
                        CELL(num_units=self.args.hidden_size,
                             activation=activation,
                             name='rnn_fw_%d' % i)
                    ])
                    cell_bw = MultiRNNCell([
                        CELL(num_units=self.args.hidden_size,
                             activation=activation,
                             name='rnn_bw_%d' % i)
                    ])

                    alter_outputs, alter_last_states = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=cell_fw,
                        cell_bw=cell_bw,
                        inputs=alter_input,
                        sequence_length=alt_length[:, j],
                        initial_state_fw=None,
                        initial_state_bw=None,
                        dtype=tf.float32,
                        parallel_iterations=None,
                        swap_memory=True,
                        time_major=False,
                        scope=None)
                    alt_last_states_concat_tmp.extend(alter_last_states)
                    alt_outputs_concat_tmp.extend(alter_outputs)
                    alter_input = tf.concat(
                        [alter_embed[:, j],
                         tf.concat(alter_outputs, -1)], -1)
                alt_last_states_concat.append(
                    tf.concat(alt_last_states_concat_tmp, -1))
                alt_outputs_concat.append(tf.concat(alt_outputs_concat_tmp,
                                                    -1))
            alter_encoded = tf.transpose(
                tf.concat(alt_last_states_concat, 0),
                perm=[1, 2, 0])  # tf.stack(alt_outputs_concat, 1)
        with tf.variable_scope("classify") as scp:

            # max pooled
            result = tf.squeeze(
                math_ops.matmul(tf.expand_dims(doc_atted_max, -2),
                                alter_encoded), -2)
            result = result + tf.squeeze(
                math_ops.matmul(
                    tf.expand_dims(tf.reduce_max(doc_encoded, 1), 1),
                    alter_encoded), 1)
            embed_w = tf.get_variable('embed_w',
                                      shape=[
                                          doc_embed.get_shape()[-1],
                                          alter_encoded.get_shape()[-2]
                                      ])
            result = result + tf.squeeze(
                tf.matmul(
                    tf.expand_dims(
                        tf.reduce_max(
                            tf.einsum('bij,jk->bik', doc_embed, embed_w), 1),
                        1), alter_encoded))

            # sumed
            # result = tf.reduce_sum(math_ops.matmul(doc_atted, alter_encoded), 1)
            # result = result + tf.reduce_sum(math_ops.matmul(doc_encoded, alter_encoded), 1)
            # embed_w = tf.get_variable('embed_w', shape = [doc_embed.get_shape()[-1], alter_encoded.get_shape()[-2]])
            # result = result + tf.reduce_sum(tf.matmul(special_math_ops.einsum('bij,jk-bik', doc_embed, embed_w), alter_encoded), 1)

            # last hidden state
            # result = tf.squeeze(math_ops.matmul(tf.expand_dims(doc_atted_max, -2), alter_encoded), -2)
            # result = result + tf.squeeze(math_ops.matmul(tf.expand_dims(doc_last_states_dropped, 1), alter_encoded), 1)
            # embed_w = tf.get_variable('embed_w', shape = [doc_embed.get_shape()[-1], alter_encoded.get_shape()[-2]])
            # result = result + tf.squeeze(tf.matmul(tf.expand_dims(tf.matmul(tf.reduce_max(doc_embed, 1), embed_w), 1), alter_encoded))

            # result = tf.reduce_sum(special_math_ops.einsum('bij,bjk->bik', doc_atted, alter_encoded), 1)

        self.correct_prediction = tf.reduce_sum(
            tf.cast(tf.equal(tf.argmax(result, -1), answer), tf.int32))

        self.loss = tf.reduce_mean(
            nn_ops.sparse_softmax_cross_entropy_with_logits(logits=result,
                                                            labels=answer))

        self.accuracy = self.correct_prediction / tf.shape(document)[0]

        self.prediction = tf.argmax(result, -1)

        self.merged_summary = tf.summary.merge_all()
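A stripped-down NumPy sketch of the bilinear attention computed in the "attention" scope above (shapes are illustrative; this is not the full model):

import numpy as np

batch, doc_len, doc_dim, q_dim = 2, 5, 8, 6                    # illustrative sizes
doc_encoded = np.random.randn(batch, doc_len, doc_dim).astype(np.float32)
query_encoded = np.random.randn(batch, q_dim).astype(np.float32)
bi_att_w = np.random.randn(doc_dim, q_dim).astype(np.float32)

scores = np.einsum('bij,jk->bik', doc_encoded, bi_att_w)       # B x D x q_dim
scores = np.squeeze(scores @ query_encoded[:, :, None], -1)    # B x D
att = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)   # softmax over document positions
doc_atted = doc_encoded * att[:, :, None]                      # B x D x doc_dim, as in the "attention" scope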
Пример #44
0
  def predict_proba(self, data, data_spec=None):
    inference_result = self.inference_graph(data, data_spec=data_spec)

    probabilities = nn_ops.softmax(inference_result, name="probabilities")

    return probabilities
Пример #45
0
    def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
        """处理每个时间步输出并准备下个时间步输入的函数
        """
        with ops.name_scope(
                name, "attention_decoder_fn_inference",
            [time, cell_state, cell_input, cell_output, context_state]):
            # There is no input at inference time
            if cell_input is not None:
                raise ValueError(
                    "Expected cell_input to be None, but saw: %s" % cell_input)
            # time=0
            if cell_output is None:
                # Input for the next step
                next_input_id = array_ops.ones([
                    batch_size,
                ], dtype=dtype) * (start_of_sequence_id
                                   )  # [batch_size] start_of_sequence_id
                # Whether decoding has finished
                done = array_ops.zeros([
                    batch_size,
                ], dtype=dtypes.bool)  # [batch_size] False
                # Initialize the decoder state
                cell_state = encoder_state
                # Decoder output before time step 0
                cell_output = array_ops.zeros(
                    [num_decoder_symbols],
                    dtype=dtypes.float32)  # [num_decoder_symbols]
                # Convert the next input id into its embedding
                word_input = array_ops.gather(
                    embeddings, next_input_id)  # [batch_size, num_embed_units]

                # The decoder input concatenates the triple used at this step
                # naf_triple_id = array_ops.zeros([batch_size, 2], dtype=dtype)  # [batch_size, 2] 0
                # imem[1]: [encoder_batch_size, triple_num*triple_len, 3*num_trans_units] triple embeddings
                # triple_input = array_ops.gather_nd(imem[1], naf_triple_id)  # [batch_size, 3*num_trans_units]
                # cell_input = array_ops.concat([word_input, triple_input], axis=1)  # [batch_size, num_embed_units+3*num_trans_units]
                cell_input = word_input

                # Initialize attention
                attention = _init_attention(encoder_state)
                if imem is not None:  # if entity and word embeddings were passed in
                    context_state = tensor_array_ops.TensorArray(
                        dtype=dtypes.int32,
                        tensor_array_name="output_ids_ta",
                        size=maximum_length,
                        dynamic_size=True,
                        infer_shape=False)
            # time >= 1
            else:
                # Construct attention
                attention = attention_construct_fn(cell_output, attention_keys,
                                                   attention_values)
                if type(attention) is tuple:  # alignments were returned as well
                    attention, alignment = attention
                    cell_output = attention
                    alignment = tf.reshape(
                        alignment, [batch_size, -1]
                    )  # [batch_size, triple_num*triple_len] or [batch_size, decoder_len]
                    selector = selector_fn(cell_output)  # probability of choosing an entity word
                    logit = output_fn(
                        cell_output
                    )  # [batch_size, num_decoder_symbols] pre-softmax predictions
                    word_prob = nn_ops.softmax(logit) * (
                        1 - selector
                    )  # [batch_size, num_decoder_symbols] probability of choosing a generated word
                    entity_prob = alignment * selector  # probability of choosing an entity word [batch_size, triple_num*triple_len] or [batch_size, decoder_len]

                    # [batch_size, 1] whether this step picks a generated word
                    # 1. tf.reduce_max(word_prob, 1): [batch_size] highest generated-word probability
                    # 2. tf.reduce_max(entity_prob, 1): [batch_size] highest entity-word probability
                    # 3. greater: [batch_size] whether the generated-word probability exceeds the entity-word probability
                    # 4. cast: [batch_size] convert the bool values to float
                    # 5. reshape(cast): [batch_size, 1] 1 if a generated word is used, 0 otherwise
                    mask = array_ops.reshape(
                        math_ops.cast(math_ops.greater(
                            tf.reduce_max(word_prob, 1),
                            tf.reduce_max(entity_prob, 1)),
                                      dtype=dtypes.float32), [-1, 1])

                    # [batch_size, num_embed_units] embedding of the input at the current time step
                    # 1. cast(math_ops.argmax(word_prob, 1)): [batch_size] index of the most probable generated word
                    # 2. gather: [batch_size, num_embed_units] the chosen generated word
                    # 3. mask * gather: [batch_size, num_embed_units] the generated word actually used
                    # 4. reshape(range(batch_size)): [batch_size, 1]
                    # 5. reshape(cast(argmax(entity_prob, 1))): [batch_size, 1] index of the most probable entity word
                    # 6. concat: [batch_size, 2] results of steps 4 and 5 concatenated along dimension 1
                    # 7. imem[0]: [batch_size, triple_num*triple_len, num_embed_units]
                    # 8. gather_nd: [batch_size, num_embed_units] the chosen entity word
                    # 9. (1-mask) * gather_nd: the entity word actually used
                    # 10. mask*gather + (1-mask)*gather_nd: [batch_size, num_embed_units] embedding of the input at the current time step
                    word_input = mask * array_ops.gather(embeddings, math_ops.cast(math_ops.argmax(word_prob, 1), dtype=dtype)) + \
                                 (1-mask)*array_ops.gather_nd(imem[0], array_ops.concat([array_ops.reshape(math_ops.range(batch_size, dtype=dtype), [-1, 1]),
                                                                                         array_ops.reshape(math_ops.cast(math_ops.argmax(entity_prob, 1), dtype=dtype), [-1, 1])],
                                                                                     axis=1))

                    # [batch_size, 2] indices of the entity word chosen at the current time step
                    # 1. reshape(range(batch_size)): [batch_size, 1]
                    # 2. cast(1-mask): [batch_size, 1] mask for choosing an entity word
                    # 3. reshape(argmax(alignment, 1)): [batch_size, 1] index of the chosen entity word
                    # 4. cast(1-mask) * reshape(argmax(alignment, 1)): [batch_size, 1] entity-word index if an entity word was chosen, otherwise 0
                    # 5. concat: [batch_size, 2] along the second dimension, the first element is the batch index and the second is the entity index
                    # indices = array_ops.concat([array_ops.reshape(math_ops.range(batch_size, dtype=dtype), [-1, 1]),
                    #                             math_ops.cast(1-mask, dtype=dtype) *
                    #                             tf.reshape(math_ops.cast(math_ops.argmax(alignment, 1), dtype=dtype), [-1, 1])],
                    #                            axis=1)
                    # imem[1]: [encoder_batch_size, triple_num*triple_len, 3*num_trans_units] triple embeddings
                    # the triple embedding that is used
                    # triple_input = array_ops.gather_nd(imem[1], indices)  # [batch_size, 3*num_trans_units]
                    # the embedding of the current word concatenated with the embedding of the triple used
                    # cell_input = array_ops.concat([word_input, triple_input], axis=1)  # [batch_size, num_embed_units+3*num_trans_units]
                    cell_input = word_input

                    mask = array_ops.reshape(math_ops.cast(mask, dtype=dtype),
                                             [-1])  # [batch_size] mask for choosing a generated word

                    # Word id fed in at the current time step: positive if it is a generated word, negative if it is an entity word
                    # argmax(word_prob, 1): [batch_size] generated-word index
                    # mask - 1: [batch_size] 0 if a generated word is taken, -1 if an entity word is taken
                    # argmax(entity_prob, 1): [batch_size] entity-word index
                    # input_id: [batch_size] positive id for a generated word, negative id for an entity word
                    input_id = mask * math_ops.cast(math_ops.argmax(word_prob, 1), dtype=dtype) + \
                               (mask - 1) * math_ops.cast(math_ops.argmax(entity_prob, 1), dtype=dtype)

                    # Write input_id into the TensorArray
                    context_state = context_state.write(time - 1, input_id)
                    # Check whether the sentence has finished
                    done = array_ops.reshape(
                        math_ops.equal(input_id, end_of_sequence_id), [-1])
                    cell_output = logit  # [batch_size, num_decoder_symbols] pre-softmax predictions
                else:  # case where no alignments are returned
                    cell_output = attention

                    cell_output = output_fn(
                        cell_output
                    )  # [batch_size, num_decoder_symbols] pre-softmax predictions
                    # [batch_size] index of the most probable generated word
                    next_input_id = math_ops.cast(math_ops.argmax(
                        cell_output, 1),
                                                  dtype=dtype)
                    # Check whether the sentence has finished
                    done = math_ops.equal(next_input_id, end_of_sequence_id)
                    # Cell input for the next time step
                    cell_input = array_ops.gather(
                        embeddings,
                        next_input_id)  # [batch_size, num_embed_units]

            # Input for the next time step, with attention appended
            next_input = array_ops.concat([cell_input, attention], 1)

            # If time > maximum_length, return an all-True vector; otherwise return done
            done = control_flow_ops.cond(
                math_ops.greater(time, maximum_length),
                lambda: array_ops.ones([
                    batch_size,
                ], dtype=dtypes.bool), lambda: done)
            return (done, cell_state, next_input, cell_output, context_state)
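A minimal NumPy sketch of the generate-vs-copy gating performed above (all values illustrative; the real code works on batched tensors and tracks signed input ids):

import numpy as np

vocab_logits = np.array([[0.2, 1.5, 0.3]], dtype=np.float32)   # one example, 3 vocabulary words
alignment = np.array([[0.7, 0.3]], dtype=np.float32)           # attention over 2 entity words
selector = 0.4                                                 # probability of copying an entity word

word_prob = np.exp(vocab_logits) / np.exp(vocab_logits).sum(-1, keepdims=True) * (1 - selector)
entity_prob = alignment * selector
use_word = word_prob.max(-1) > entity_prob.max(-1)             # True -> emit a vocabulary word
# as in the snippet, the emitted id is positive for vocabulary words and negative for entity words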
Пример #46
0
    def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
        with ops.name_scope(
                name, "attention_decoder_fn_inference",
                [time, cell_state, cell_input, cell_output, context_state]):
            if cell_input is not None:
                raise ValueError("Expected cell_input to be None, but saw: %s" %
                                                 cell_input)
            if cell_output is None:
                # invariant that this is time == 0
                next_input_id = array_ops.ones(
                        [batch_size,], dtype=dtype) * (start_of_sequence_id)
                done = array_ops.zeros([batch_size,], dtype=dtypes.bool)
                cell_state = encoder_state
                cell_output = array_ops.zeros(
                        [num_decoder_symbols], dtype=dtypes.float32)
                cell_input = array_ops.gather(embeddings, next_input_id)

                # init attention
                attention = _init_attention(encoder_state)
                # init context state
                log_beam_probs = tensor_array_ops.TensorArray(dtype=dtypes.float32, tensor_array_name="log_beam_probs", size=maximum_length, dynamic_size=True, infer_shape=False)
                beam_parents = tensor_array_ops.TensorArray(dtype=dtypes.int32, tensor_array_name="beam_parents", size=maximum_length, dynamic_size=True, infer_shape=False)
                beam_symbols = tensor_array_ops.TensorArray(dtype=dtypes.int32, tensor_array_name="beam_symbols", size=maximum_length, dynamic_size=True, infer_shape=False)
                result_probs = tensor_array_ops.TensorArray(dtype=dtypes.float32, tensor_array_name="result_probs", size=maximum_length, dynamic_size=True, infer_shape=False)
                result_parents = tensor_array_ops.TensorArray(dtype=dtypes.int32, tensor_array_name="result_parents", size=maximum_length, dynamic_size=True, infer_shape=False)
                result_symbols = tensor_array_ops.TensorArray(dtype=dtypes.int32, tensor_array_name="result_symbols", size=maximum_length, dynamic_size=True, infer_shape=False)
                context_state = (log_beam_probs, beam_parents, beam_symbols, result_probs, result_parents, result_symbols)
            else:
                # construct attention
                attention = attention_construct_fn(cell_output, attention_keys,
                        attention_values)
                cell_output = attention

                # beam search decoder
                (log_beam_probs, beam_parents, beam_symbols, result_probs, result_parents, result_symbols) = context_state
                
                cell_output = output_fn(cell_output)    # logits
                cell_output = nn_ops.softmax(cell_output)
                

                cell_output = array_ops.split(cell_output, [2, num_decoder_symbols-2], 1)[1]

                tmp_output = array_ops.gather(cell_output, math_ops.range(origin_batch)*beam_size)

                probs = control_flow_ops.cond(
                        math_ops.equal(time, ops.convert_to_tensor(1, dtype)),
                        lambda: math_ops.log(tmp_output+ops.convert_to_tensor(1e-20, dtypes.float32)),
                        lambda: math_ops.log(cell_output+ops.convert_to_tensor(1e-20, dtypes.float32)) + array_ops.reshape(log_beam_probs.read(time-2), [-1, 1]))

                probs = array_ops.reshape(probs, [origin_batch, -1])
                best_probs, indices = nn_ops.top_k(probs, beam_size * 2)
                #indices = array_ops.reshape(indices, [-1])
                indices_flatten = array_ops.reshape(indices, [-1]) + array_ops.reshape(array_ops.concat([array_ops.reshape(math_ops.range(origin_batch)*((num_decoder_symbols-2)*beam_size), [-1, 1])]*(beam_size*2), 1), [origin_batch*beam_size*2])
                best_probs_flatten = array_ops.reshape(best_probs, [-1])

                symbols = indices_flatten % (num_decoder_symbols - 2)
                symbols = symbols + 2
                parents = indices_flatten // (num_decoder_symbols - 2)

                probs_wo_eos = best_probs + 1e5*math_ops.cast(math_ops.cast((indices%(num_decoder_symbols-2)+2)-end_of_sequence_id, dtypes.bool), dtypes.float32)
                
                best_probs_wo_eos, indices_wo_eos = nn_ops.top_k(probs_wo_eos, beam_size)

                indices_wo_eos = array_ops.reshape(indices_wo_eos, [-1]) + array_ops.reshape(array_ops.concat([array_ops.reshape(math_ops.range(origin_batch)*(beam_size*2), [-1, 1])]*beam_size, 1), [origin_batch*beam_size])

                _probs = array_ops.gather(best_probs_flatten, indices_wo_eos)
                _symbols = array_ops.gather(symbols, indices_wo_eos)
                _parents = array_ops.gather(parents, indices_wo_eos)


                log_beam_probs = log_beam_probs.write(time-1, _probs)
                beam_symbols = beam_symbols.write(time-1, _symbols)
                beam_parents = beam_parents.write(time-1, _parents)
                result_probs = result_probs.write(time-1, best_probs_flatten)
                result_symbols = result_symbols.write(time-1, symbols)
                result_parents = result_parents.write(time-1, parents)


                next_input_id = array_ops.reshape(_symbols, [batch_size])

                state_size = int(cell_state[0].get_shape().with_rank(2)[1])
                attn_size = int(attention.get_shape().with_rank(2)[1])
                state = []
                for j in cell_state:
                    state.append(array_ops.reshape(array_ops.gather(j, _parents), [-1, state_size]))
                cell_state = tuple(state)
                attention = array_ops.reshape(array_ops.gather(attention, _parents), [-1, attn_size])

                done = math_ops.equal(next_input_id, end_of_sequence_id)
                cell_input = array_ops.gather(embeddings, next_input_id)

            # combine cell_input and attention
            next_input = array_ops.concat([cell_input, attention], 1)

            # if time > maxlen, return all true vector
            done = control_flow_ops.cond(
                    math_ops.greater(time, maximum_length),
                    lambda: array_ops.ones([batch_size,], dtype=dtypes.bool),
                    lambda: array_ops.zeros([batch_size,], dtype=dtypes.bool))
            return (done, cell_state, next_input, cell_output, (log_beam_probs, beam_parents, beam_symbols, result_probs, result_parents, result_symbols))#context_state)
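A small sketch of one beam-expansion step in the style above (plain NumPy, illustrative sizes; the real code also gathers cell states and attention by parent and handles EOS separately):

import numpy as np

beam_size, vocab = 2, 5
prev_log_probs = np.array([-0.1, -0.7], dtype=np.float32)      # accumulated score per beam
step_probs = np.random.rand(beam_size, vocab).astype(np.float32)
step_probs /= step_probs.sum(-1, keepdims=True)                # softmax output per beam

scores = np.log(step_probs + 1e-20) + prev_log_probs[:, None]  # accumulate log-probabilities
flat = scores.reshape(-1)
top = np.argsort(flat)[::-1][:beam_size]                       # top_k over beams * vocab
symbols = top % vocab                                          # chosen token for each surviving beam
parents = top // vocab                                         # which old beam it extends
new_log_probs = flat[top]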
Пример #47
0
def attention_decoder(decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      batch_size,
                      state_size,
                      decoder_inputs_positions=None,
                      decoder_inputs_maps=None,
                      output_size=None,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None):
    """RNN decoder with attention for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. Embedded inputs.
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    batch_size: need to clarify batch size explicitly since env_state is updated one sample by one sample.
    state_size: size of environment state.
    decoder_inputs_positions: a list of 2D Tensors of shape [batch_size, 3],
       indicating intial positions of each example in a map. Default None.
    decoder_inputs_maps: a 1D Tensor of length batch_size indicating the map. Default None.
    output_size: size of the output vectors; if None, we use cell.output_size.
    loop_function: if not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
      [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (which is either i-th decoder_inputs or
      loop_function(output {i-1}, i)) as follows. 
      First, we run the cell on the current decoder input or feed from previous output:
        cur_output, new_state = cell(input, prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(h_t^T * attention_states).
      Thus, the context vector:
        cont_vec = weighted_sum_of(attention_states), weighted by (new_attn),
      and then we calculate the attended output:
        attn_output = tanh(W1*current_output + W2*cont_vec + W3*env_state).
      The final output for prediction:
        output = softmax(W*attn_output).
        This "output" should be a 1D Tensor of shape [num_symbols].
        Every item of the output is the probability of predicting that symbol at the next step.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or shapes
      of attention_states are not set.
  """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with vs.variable_scope(scope or "attention_decoder"):
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        mapIdx = array_ops.pack([map3.map_grid, map3.map_jelly,
                                 map3.map_one])  #map

        attention_vec_size = attn_size  # size of query
        states = [initial_state]
        # current position and environment
        position, env = None, None

        hidden = array_ops.reshape(
            attention_states,
            [-1, attn_length, 1, attn_size])  # reshape for later computation

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                # Attention mask is a softmax of h_in^T*decoder_hidden.
                dec_hid = array_ops.tile(
                    query,
                    [1, attn_length
                     ])  # replicate query for element-wise multiplication
                dec_hid = array_ops.reshape(
                    dec_hid, [-1, attn_length, attention_vec_size])
                attn_weight = nn_ops.softmax(
                    math_ops.reduce_sum(
                        attention_states * dec_hid,
                        [2
                         ]))  # attn weights for every hidden states in encoder
                # Now calculate the attention-weighted vector (context vector) cc.
                cc = math_ops.reduce_sum(
                    array_ops.reshape(attn_weight, [-1, attn_length, 1, 1]) *
                    hidden, [1, 2])
                # attented hidden state
                with vs.variable_scope("AttnW1"):
                    term1 = rnn_cell.linear(query, attn_size, False)
                with vs.variable_scope("AttnW2"):
                    term2 = rnn_cell.linear(cc, attn_size, False)
                # environment representation
                if env:  # 2D Tensor of shape [batch_size, env_size]
                    with vs.variable_scope("Environment"):
                        term3 = rnn_cell.linear(math_ops.to_float(env),
                                                attn_size, False)
                    h_attn = math_ops.tanh(term1 + term2 + term3)
                else:
                    h_attn = math_ops.tanh(term1 + term2)
            return h_attn, attn_weight

        def updateEnv(_position, _step, _mapNo):
            """ Update env_state according to current position and step.
      Args:
      position: a 2D Tensor of shape [batch_size, 3].
      step: a 2D Tensor of shape [batch_size, 1], where
      0 --> no action, 1 --> move forward 1 step, 2 --> turn right, 3 --> turn left, 4 --> turn back.
      mapNo: a 1D int32 Tensor of length batch_size.
      
      Returns:
      env: a 2D Tensor of shape [batch_size, env_size]
        environment state after taking the step based on the position.
      position: a 2D Tensor of shape [batch_size, 3]
        new position after taking the step based on the position.
      """
            if not _mapNo:
                raise ValueError(" Invalid argument mapNo in updateEnv! ")
            if not _position:
                raise ValueError(" Invalid argument position in updateEnv! ")
            new_env = []
            new_pos = []
            # if step == None, take no step and return the environment representations of each position.
            if not _step:
                new_pos = _position
                for j in xrange(batch_size):
                    vec = array_ops.slice(
                        mapIdx,
                        array_ops.pack([
                            _mapNo[j], _position[j, 0], _position[j, 1],
                            _position[j, 2], 0
                        ]), [1, 1, 1, 1, state_size])
                    new_env.append(array_ops.squeeze(vec))
                new_env = array_ops.reshape(array_ops.pack(new_env),
                                            [batch_size, state_size])
                return new_pos, new_env

            else:

                def f_move(ppos):  # move forward 1 step
                    return control_flow_ops.cond(
                        math_ops.equal(ppos[2], 0), lambda: array_ops.pack(
                            [ppos[0], ppos[1] - 1, ppos[2]]),
                        lambda: control_flow_ops.cond(
                            math_ops.equal(ppos[2], 1), lambda: array_ops.pack(
                                [ppos[0] + 1, ppos[1], ppos[2]]), lambda:
                            control_flow_ops.cond(
                                math_ops.equal(ppos[2], 2), lambda: array_ops.
                                pack([ppos[0], ppos[1] + 1, ppos[2]]
                                     ), lambda: array_ops.pack(
                                         [ppos[0] - 1, ppos[1], ppos[2]]))))

                def f_right(ppos):  # turn right
                    return control_flow_ops.cond(
                        math_ops.equal(ppos[2], 0),
                        lambda: array_ops.pack([ppos[0], ppos[1], 1]),
                        lambda: control_flow_ops.cond(
                            math_ops.equal(ppos[2], 1), lambda: array_ops.pack(
                                [ppos[0], ppos[1], 2]), lambda:
                            control_flow_ops.cond(
                                math_ops.equal(ppos[2], 2), lambda: array_ops.
                                pack([ppos[0], ppos[1], 3]), lambda: array_ops.
                                pack([ppos[0], ppos[1], 0]))))

                def f_left(ppos):  # turn left
                    return control_flow_ops.cond(
                        math_ops.equal(ppos[2], 0),
                        lambda: array_ops.pack([ppos[0], ppos[1], 3]),
                        lambda: control_flow_ops.cond(
                            math_ops.equal(ppos[2], 1), lambda: array_ops.pack(
                                [ppos[0], ppos[1], 0]), lambda:
                            control_flow_ops.cond(
                                math_ops.equal(ppos[2], 2), lambda: array_ops.
                                pack([ppos[0], ppos[1], 1]), lambda: array_ops.
                                pack([ppos[0], ppos[1], 2]))))

                def f_back(ppos):  # turn back
                    return control_flow_ops.cond(
                        math_ops.equal(ppos[2], 0),
                        lambda: array_ops.pack([ppos[0], ppos[1], 2]),
                        lambda: control_flow_ops.cond(
                            math_ops.equal(ppos[2], 1), lambda: array_ops.pack(
                                [ppos[0], ppos[1], 3]), lambda:
                            control_flow_ops.cond(
                                math_ops.equal(ppos[2], 2), lambda: array_ops.
                                pack([ppos[0], ppos[1], 0]), lambda: array_ops.
                                pack([ppos[0], ppos[1], 1]))))

                def ffn4(sstep, ppos):
                    return control_flow_ops.cond(
                        math_ops.equal(sstep, data_utils.turnBack_ID),
                        lambda: f_back(ppos), lambda: _position[j, :])

                def ffn3(sstep, ppos):
                    return control_flow_ops.cond(
                        math_ops.equal(sstep, data_utils.turnLeft_ID),
                        lambda: f_left(ppos), lambda: ffn4(sstep, ppos))

                def ffn2(sstep, ppos):
                    return control_flow_ops.cond(
                        math_ops.equal(sstep, data_utils.turnRight_ID),
                        lambda: f_right(ppos), lambda: ffn3(sstep, ppos))

                def ffn1(sstep, ppos):
                    return control_flow_ops.cond(
                        math_ops.equal(sstep, data_utils.moveAct_ID),
                        lambda: f_move(ppos), lambda: ffn2(sstep, ppos))

                for j in xrange(batch_size):
                    #update position
                    temp_pos = control_flow_ops.cond(
                        math_ops.equal(_step[j], data_utils.noAct_ID),
                        lambda: _position[j, :],
                        lambda: ffn1(_step[j], _position[j, :]))
                    new_pos.append(
                        control_flow_ops.cond(
                            math_ops.logical_or(
                                math_ops.greater(temp_pos[0], 24),
                                math_ops.logical_or(
                                    math_ops.greater(temp_pos[1], 24),
                                    math_ops.logical_or(
                                        math_ops.less(temp_pos[0], 0),
                                        math_ops.less(temp_pos[1], 0)))),
                            lambda: _position[j, :], lambda: temp_pos))
                    # new_pos.append(temp_pos)

                    # update env
                    new_env.append(
                        array_ops.reshape(
                            array_ops.slice(
                                mapIdx,
                                array_ops.pack([
                                    _mapNo[j], new_pos[-1][0], new_pos[-1][1],
                                    new_pos[-1][2], 0
                                ]), [1, 1, 1, 1, state_size]), [state_size]))

                new_pos = array_ops.pack(new_pos)
                new_env = array_ops.pack(new_env)
                return new_pos, new_env
                # return new_pos, None

        outputs = []
        attentions = []
        environments = []
        positions = []
        prev = None

        # print(" Action info: no act=%d, move=%d, turn left=%d, turn right=%d, turn back=%d" %
        #   (data_utils.noAct_ID, data_utils.moveAct_ID, data_utils.turnLeft_ID, data_utils.turnRight_ID, data_utils.turnBack_ID))

        if decoder_inputs_positions and decoder_inputs_maps and batch_size:
            position = decoder_inputs_positions[
                0]  # 2d tensor of shape [batch_size, 3]
            _, env = updateEnv(position, None, decoder_inputs_maps)
        for i in xrange(len(decoder_inputs)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]

            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with vs.variable_scope("loop_function", reuse=True):
                    inp = array_ops.stop_gradient(loop_function(prev, i))

            # Run the RNN.
            cur_output, new_state = cell(inp, states[-1])
            cur_output = array_ops.reshape(cur_output, [batch_size, attn_size])
            states.append(new_state)

            # Run the attention mechanism.
            h_attn, attn_weight = attention(cur_output)
            attentions.append(attn_weight)

            with vs.variable_scope("AttnOutputProjection"):
                output = rnn_cell.linear(h_attn, output_size, False)

            if loop_function is not None:
                # We do not propagate gradients over the loop function.
                prev = array_ops.stop_gradient(output)

            if decoder_inputs_positions and decoder_inputs_maps and position:

                # update pos and env
                if loop_function:
                    step = math_ops.argmax(
                        nn_ops.softmax(prev),
                        1)  # step is a list (len=batch_size) of int32 number
                    position, env = updateEnv(position, step,
                                              decoder_inputs_maps)
                else:
                    if i < len(decoder_inputs_positions) - 1:
                        position = decoder_inputs_positions[i + 1]
                    _, env = updateEnv(position, None, decoder_inputs_maps)

            outputs.append(output)
            environments.append(env)
            positions.append(position)

    return outputs, states, attentions, environments, positions
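
A minimal NumPy sketch of the attention step used above (softmax over dot-product scores, followed by an attention-weighted context vector). The shapes below are illustrative only; this is a stand-in for the decoder's actual tensors, not the example's code:

# Hedged sketch: dot-product attention weights and context vector, mirroring
# softmax(reduce_sum(attention_states * dec_hid, [2])) from the example.
import numpy as np

batch_size, attn_length, attn_size = 2, 5, 4   # illustrative shapes only

attention_states = np.random.randn(batch_size, attn_length, attn_size)
dec_hid = np.random.randn(batch_size, 1, attn_size)        # broadcast over time

scores = np.sum(attention_states * dec_hid, axis=2)        # (batch, attn_length)
scores -= scores.max(axis=1, keepdims=True)                # numerical stability
attn_weight = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)

# context vector: attention-weighted sum of the encoder states
cc = np.sum(attn_weight[:, :, None] * attention_states, axis=1)  # (batch, attn_size)
assert cc.shape == (batch_size, attn_size)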
Пример #48
0
def local_attention(decoder_hidden_state,
                    hidden_attn,
                    initializer,
                    window_size=10,
                    content_function=vinyals_kaiser,
                    dtype=tf.float32):
    """Put local attention on hidden using decoder hidden states and the hidden states of encoder (hidden_attn).

    Parameters
    ----------
    decoder_hidden_state : 2-D Tensor
        Tensor representing the current hidden state of the decoder (output of the recurrent layers).
        Shape is (?, decoder_size).
    hidden_attn : 4-D Tensor
        Tensor representing the hidden states of the encoder (output of the recurrent layers). It has
        shape (?, timesteps, 1, decoder_size) so it is possible to apply a 1-D convolution to calculate
        the attention score more efficiently.
    initializer : function
        Function to use when initializing variables within the variables context.
    window_size : int
        Size of each side of the window to use when applying local attention. Not relevant to global
        attention. Defaults to 10.
    content_function : function
        Content function to score the decoder hidden states and encoder hidden states to extract their
        weights. Defaults to 'vinyals_kaiser'.
    dtype : tensorflow dtype
        Type of tensors. Defaults to tf.float32.

    Returns
    -------
    ds : 2-D Tensor
        Tensor representing the context vector generated after scoring the encoder and decoder hidden
        states. Has shape (?, decoder_size), i.e., one context vector per batch sample.

    """
    assert content_function is not None
    sigma = window_size / 2
    # note: the Gaussian in Luong et al. (2015) uses 2 * sigma**2 in the
    # denominator; kept as sigma**2 here to match the original snippet.
    denominator = sigma**2

    attention_vec_size = hidden_attn.get_shape()[3].value
    attn_length = hidden_attn.get_shape()[1].value

    batch_size = array_ops.shape(hidden_attn)[0]

    with vs.variable_scope("AttentionLocal", initializer=initializer):

        # apply content function to score the hidden states from the encoder
        s = content_function(hidden_attn, decoder_hidden_state)

        with vs.variable_scope("WindowPrediction", initializer=initializer):
            ht = cells.linear([decoder_hidden_state], attention_vec_size, True)

        # get the parameters (vp)
        vp = vs.get_variable("AttnVp_%d" % 0, [attention_vec_size],
                             initializer=initializer)

        # tanh(Wp*ht)
        tanh = math_ops.tanh(ht)
        # S * sigmoid(vp * tanh(Wp*ht))  - this is going to return a number
        # for each sentence in the batch - i.e., a tensor of shape batch x 1
        S = attn_length
        pt = math_ops.reduce_sum((vp * tanh), [2, 3])
        pt = math_ops.sigmoid(pt) * S

        # now we get only the integer part of the values
        pt = tf.floor(pt)

        _ = tf.histogram_summary('local_window_predictions', pt)

        # we now create a tensor containing the indices representing each position
        # of the sentence - i.e., if the sentence contain 5 tokens and batch_size is 3,
        # the resulting tensor will be:
        # [[0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]]
        #
        # batch_size is a dynamic tensor here, so build the index matrix with
        # tf ops (tiling a [1, attn_length] row batch_size times) instead of
        # repeating a Python list.
        indices = math_ops.cast(tf.range(attn_length), dtype)
        idx = tf.tile(tf.reshape(indices, [1, attn_length]),
                      tf.pack([batch_size, 1]))

        # here we calculate the boundaries of the attention window based on the predicted positions
        low = pt - window_size + 1  # we add one because the floor op already generates the first position
        high = pt + window_size

        # here we check our positions against the boundaries
        mlow = tf.to_float(idx < low)
        mhigh = tf.to_float(idx > high)

        # now we combine both into a pre-mask that has 0s and 1s switched
        # i.e, at this point, True == 0 and False == 1
        m = mlow + mhigh  # batch_size

        # here we switch the 0s to 1s and the 1s to 0s
        # we correct the values so True == 1 and False == 0
        mask = tf.to_float(tf.equal(m, 0.0))

        # here we switch off all the values that fall outside the window
        # first we switch off those in the truncated normal
        alpha = s * mask
        masked_soft = nn_ops.softmax(alpha)

        _ = tf.histogram_summary('local_alpha_weights', alpha)

        # here we calculate the 'truncated normal distribution'
        numerator = -tf.pow((idx - pt), tf.convert_to_tensor(2, dtype=dtype))
        div = tf.truediv(numerator, denominator)
        e = math_ops.exp(div)  # result of the truncated normal distribution

        at = masked_soft * e

        # Now calculate the attention-weighted vector d.
        d = math_ops.reduce_sum(
            array_ops.reshape(at, [-1, attn_length, 1, 1]) * hidden_attn,
            [1, 2])
        ds = array_ops.reshape(d, [-1, attention_vec_size])

    _ = tf.histogram_summary('local_attention_context', ds)

    return ds
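
A small NumPy sketch of the windowing logic above, assuming the content scores `s` and the predicted centres `pt` are already available; the Gaussian uses exp(-(idx - pt)**2 / sigma**2) exactly as in the snippet, and every value is made up for illustration:

# Hedged sketch of local attention windowing (illustrative values, not the example's tensors).
import numpy as np

batch_size, attn_length, window_size = 3, 7, 2
sigma = window_size / 2
s = np.random.randn(batch_size, attn_length)               # content scores

# predicted window centres, one per batch item (stand-in for S * sigmoid(vp * tanh(Wp*ht)))
pt = np.floor(np.random.uniform(0, attn_length, size=(batch_size, 1)))

idx = np.tile(np.arange(attn_length, dtype=np.float64), (batch_size, 1))
low, high = pt - window_size + 1, pt + window_size
mask = ((idx >= low) & (idx <= high)).astype(np.float64)    # 1 inside the window

alpha = s * mask
masked_soft = np.exp(alpha - alpha.max(1, keepdims=True))
masked_soft /= masked_soft.sum(1, keepdims=True)

gauss = np.exp(-((idx - pt) ** 2) / sigma ** 2)             # truncated-normal weighting
at = masked_soft * gauss                                    # final local weights
print(at.shape)                                             # (batch_size, attn_length)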
Пример #49
0
    def _add_seq2seq(self):
        """Add the whole sequence-to-sequence model to the graph."""
        hps = self._hps
        vsize = self._vocab.size()  # size of the vocabulary
        # with tf.variable_scope('image_encoder'):
        self.reshaped_pix = tf.reshape(self._side_batch, [-1, 32, 64, 3])
        with slim.arg_scope(resnet_arg_scope()):
            net, end_points = resnet_v1_152(self.reshaped_pix,
                                            is_training=FLAGS.mode == 'train')
            # feat1 = end_points['resnet_v1_152/block4']
        pic_encoded = end_points['global_pool']
        # self.end_points = end_points
        # self.net = net

        with tf.variable_scope('seq2seq'):
            # Some initializers
            self.rand_unif_init = tf.random_uniform_initializer(
                -hps.rand_unif_init_mag, hps.rand_unif_init_mag, seed=123)
            self.trunc_norm_init = tf.truncated_normal_initializer(
                stddev=hps.trunc_norm_init_std)

            # Add embedding matrix (shared by the encoder and decoder inputs)
            with tf.variable_scope('embedding'):
                embedding = tf.get_variable('embedding', [vsize, hps.emb_dim],
                                            dtype=tf.float32,
                                            initializer=self.trunc_norm_init)
                emb_enc_inputs = tf.nn.embedding_lookup(
                    embedding, self._enc_batch
                )  # tensor with shape (batch_size, max_enc_steps, emb_size)
                emb_dec_inputs = [
                    tf.nn.embedding_lookup(embedding, x)
                    for x in tf.unstack(self._dec_batch, axis=1)
                ]  # list length max_dec_steps containing shape (batch_size, emb_size)
            pic_encoded = tf.reshape(
                tf.squeeze(pic_encoded),
                [FLAGS.batch_size, FLAGS.max_side_steps, -1])
            emb_side_inputs = tf.layers.dense(pic_encoded, FLAGS.emb_dim * 2)
            # Add the encoder.
            enc_outputs, fw_st, bw_st = self._add_encoder(
                emb_enc_inputs, self._enc_lens)
            # batch_size * pic_num * emb_dim
            new_emb_side_inputs = tf.reshape(emb_side_inputs, [
                FLAGS.batch_size * int(FLAGS.max_side_steps / 5), 5,
                FLAGS.hidden_dim * 2
            ])
            # (batch_size*pic_num/5) * 5 * emb_dim

            side_states = self._add_side_rnn_encoder(
                new_emb_side_inputs, 5 * tf.ones(
                    (new_emb_side_inputs.get_shape()[0]), dtype=tf.int32))
            self._side_inputs = tf.reshape(
                side_states, [FLAGS.batch_size, -1, FLAGS.hidden_dim * 2])
            self._enc_states = enc_outputs

            # Our encoder is bidirectional and our decoder is unidirectional so we need to reduce the final encoder hidden state to the right size to be the initial decoder hidden state
            self._dec_in_state = self._reduce_states(fw_st, bw_st)
            self._last_state = tf.concat(self._dec_in_state, -1)

            with tf.variable_scope('interaction'):
                change_side_states = tf.transpose(self._side_inputs, [0, 2, 1])
                self._change_side_states = change_side_states
                attn_matrix = tf.matmul(self._enc_states, change_side_states)
                # batch_size * enc_len * side_len
                self._video_aware_enc_states = tf.matmul(
                    attn_matrix, self._side_inputs)
                self._news_aware_side_states = tf.matmul(
                    tf.transpose(attn_matrix, [0, 2, 1]), self._enc_states)
                gate = tf.layers.dense(self._last_state,
                                       1,
                                       activation=tf.nn.sigmoid)
                gate = tf.expand_dims(tf.tile(gate, [1, FLAGS.hidden_dim * 2]),
                                      1)
                ones = np.ones([FLAGS.batch_size, 1, FLAGS.hidden_dim * 2])
                self._enc_states = gate * self._enc_states + (
                    ones - gate) * self._video_aware_enc_states

            # Add the decoder.
            with tf.variable_scope('decoder'):
                decoder_outputs, self._dec_out_state, self.attn_dists, self.p_gens, self.coverage = self._add_decoder(
                    emb_dec_inputs)
                # attn_seg, attn_side = self.pic_attention(emb_side_inputs)
                # self._attn_side = attn_side

            # Add the output projection to obtain the vocabulary distribution
            with tf.variable_scope('output_projection'):
                w = tf.get_variable('w', [hps.hidden_dim, vsize],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)
                w_t = tf.transpose(w)
                v = tf.get_variable('v', [vsize],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)
                vocab_scores = [
                ]  # vocab_scores is the vocabulary distribution before applying softmax. Each entry on the list corresponds to one decoder step
                for i, output in enumerate(decoder_outputs):
                    if i > 0:
                        tf.get_variable_scope().reuse_variables()
                    vocab_scores.append(tf.nn.xw_plus_b(
                        output, w, v))  # apply the linear layer

                vocab_dists = [
                    tf.nn.softmax(s) for s in vocab_scores
                ]  # The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays. The words are in the order they appear in the vocabulary file.

            # For pointer-generator model, calc final distribution from copy distribution and vocabulary distribution
            if FLAGS.pointer_gen:
                final_dists = self._calc_final_dist(vocab_dists,
                                                    self.attn_dists)
            else:  # final distribution is just vocabulary distribution
                final_dists = vocab_dists

            if hps.mode in ['train', 'eval']:
                # Calculate the loss
                with tf.variable_scope('loss'):
                    if FLAGS.pointer_gen:
                        # Calculate the loss per step
                        # This is fiddly; we use tf.gather_nd to pick out the probabilities of the gold target words
                        loss_per_step = [
                        ]  # will be list length max_dec_steps containing shape (batch_size)
                        batch_nums = tf.range(
                            0, limit=hps.batch_size)  # shape (batch_size)
                        for dec_step, dist in enumerate(final_dists):
                            targets = self._target_batch[:,
                                                         dec_step]  # The indices of the target words. shape (batch_size)
                            indices = tf.stack((batch_nums, targets),
                                               axis=1)  # shape (batch_size, 2)
                            gold_probs = tf.gather_nd(
                                dist, indices
                            )  # shape (batch_size). prob of correct words on this step
                            losses = -tf.log(gold_probs + 1e-10)
                            loss_per_step.append(losses)

                        # Apply dec_padding_mask and get loss
                        self._loss = _mask_and_avg(loss_per_step,
                                                   self._dec_padding_mask)

                    else:  # baseline model
                        self._loss = tf.contrib.seq2seq.sequence_loss(
                            tf.stack(vocab_scores, axis=1), self._target_batch,
                            self._dec_padding_mask
                        )  # this applies softmax internally

                    tf.summary.scalar('loss', self._loss)

                    # Calculate coverage loss from the attention distributions
                    if hps.coverage:
                        with tf.variable_scope('coverage_loss'):
                            self._coverage_loss = _coverage_loss(
                                self.attn_dists, self._dec_padding_mask)
                            tf.summary.scalar('coverage_loss',
                                              self._coverage_loss)
                        self._total_loss = self._loss + hps.cov_loss_wt * self._coverage_loss
                        tf.summary.scalar('total_loss', self._total_loss)

                # with tf.variable_scope('pic_loss'):
                #     self._loss_pic = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=attn_side,
                #                                                                        labels=self._dec_pic_target))
                #     # self._loss_unified = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=attn_side,
                #     #                                                                    labels=attn_seg))
                # self._all_loss = self._loss_pic
                # self._all_loss = self._loss

        with tf.variable_scope('side'):
            emb_side_inputs = tf.nn.l2_normalize(emb_side_inputs, dim=-1)

            # self-attention
            side_outputs, sfw_st, sbw_st = self._add_side_encoder(
                self._side_inputs, self._side_lens)
            conditional_vec = tf.expand_dims(self._last_state, 1)
            conditional_weight = tf.layers.dense(
                tf.multiply(conditional_vec, side_outputs), 1)
            self._cond_side_states = tf.multiply(side_outputs,
                                                 conditional_weight)

            s_gate = tf.layers.dense(self._last_state,
                                     1,
                                     activation=tf.nn.sigmoid)
            s_gate = tf.expand_dims(s_gate, 1)
            s_ones = np.ones_like(s_gate)
            self._side_states = s_gate * self._news_aware_side_states + (
                s_ones - s_gate) * self._cond_side_states

            fusion_gate = tf.layers.dense(self._last_state,
                                          1,
                                          activation=tf.nn.sigmoid)
            fusion_gate = tf.expand_dims(
                tf.tile(fusion_gate, [1, FLAGS.hidden_dim * 2]), 1)
            fusion_ones = tf.ones_like(fusion_gate)
            side_states = tf.nn.l2_normalize(tf.reshape(
                tf.tile(tf.expand_dims(self._side_states, 1), [1, 5, 1, 1]),
                [FLAGS.batch_size, -1, FLAGS.hidden_dim * 2]),
                                             dim=-1)
            fusion_side = fusion_gate * emb_side_inputs + (
                fusion_ones - fusion_gate) * side_states

            attn_side = tf.squeeze(
                tf.layers.dense(
                    fusion_side,
                    1,
                    kernel_initializer=tf.contrib.layers.xavier_initializer()))
            attn_side = nn_ops.softmax(attn_side)
            self.attn_side = attn_side

            # last_state = tf.nn.l2_normalize(tf.tile(tf.expand_dims(self._last_state, 1), [1, 10, 1]), dim=-1)
            # emb_side_inputs = tf.nn.l2_normalize(emb_side_inputs, dim=-1)
            # attn_side = tf.squeeze(tf.layers.dense(tf.concat([last_state, emb_side_inputs], -1), 1, activation=tf.nn.sigmoid, kernel_initializer=tf.contrib.layers.xavier_initializer()))
            # self.attn_side = attn_side

            with tf.variable_scope('pic_loss'):
                # self._loss_pic = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=attn_side,
                #                                                                         labels=self._dec_pic_target))
                # self._loss_pic = pairwise_hinge_loss(logits=attn_side, labels=self._dec_pic_target)
                self._loss_pic = pairwise_hinge_loss(logits=attn_side,
                                                     labels=tf.one_hot(
                                                         self._dec_pic_target,
                                                         FLAGS.max_side_steps))
        if hps.mode in ['train', 'eval']:
            self._all_loss = self._loss + self._loss_pic

        if hps.mode == "decode" or hps.mode == 'auto_decode':
            # We run decode beam search mode one decoder step at a time
            assert len(
                final_dists
            ) == 1  # final_dists is a singleton list containing shape (batch_size, extended_vsize)
            final_dists = final_dists[0]
            topk_probs, self._topk_ids = tf.nn.top_k(
                final_dists, hps.batch_size * 2
            )  # take the k largest probs. note batch_size=beam_size in decode mode
            self._topk_log_probs = tf.log(topk_probs)
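
A hedged NumPy sketch of the pointer-generator loss step above: index the gold-word probabilities out of one step's final distribution (the NumPy analogue of `tf.gather_nd(dist, indices)`) and take the negative log. Shapes and values are illustrative only:

# Hedged sketch: negative log-likelihood of the gold words for one decoder step.
import numpy as np

batch_size, vsize = 4, 10
dist = np.random.dirichlet(np.ones(vsize), size=batch_size)   # one step's final_dist
targets = np.random.randint(0, vsize, size=batch_size)        # gold word ids for this step

gold_probs = dist[np.arange(batch_size), targets]             # ~ tf.gather_nd(dist, indices)
losses = -np.log(gold_probs + 1e-10)                          # shape (batch_size,)
print(losses.mean())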
Пример #50
0
        def attention_score_fn(query, keys, values):
            """计算注意力分数和value的加权和
            Args:
                query: [batch_size, num_units] 上个时间步的输出
                keys: 不是元组时: [batch_size, encoder_len, num_unit]
                    是元组时: (graph_keys, triples_keys)
                               graph_keys: [batch_size, triple_num, num_unit] 静态图的key
                               triples_keys: [encoder_batch_size, triple_num, triple_len, num_unit] 三元组的key
                values: 不是元组时: [batch_size, encoder_len, num_units]
                    是元组时: (graph_values, triples_values)
                               graph_values: [batch_size, triple_num, num_unit] 静态图的value
                               triples_values: [encoder_batch_size, triple_num, triple_len, num_unit] 三元组的value
            """
            triple_keys, triple_values = None, None

            # When keys is a tuple (graph_keys, triples_keys):
            #   keys become the static-graph keys       [batch_size, triple_num, num_units]
            #   triple_keys are the triple keys         [batch_size, triple_num, triple_len, num_units]
            #   values become the static-graph values   [batch_size, triple_num, num_units]
            #   triple_values are the triple values     [batch_size, triple_num, triple_len, num_units]
            if type(keys) is tuple:
                keys, triple_keys = keys
                values, triple_values = values

            # If keys is not a tuple, they are the per-step encoder outputs [batch_size, encoder_len, num_units],
            # so either way the keys can be treated uniformly as [batch_size, attention_length, num_units].
            # These two branches score attention over the encoder outputs or over the static graph,
            # but they are not used for attention over the triples.
            if attention_option == "bahdanau":
                query = math_ops.matmul(
                    query, query_w)  # 给query做一个线性变化 [batch_size, num_units]
                query = array_ops.reshape(
                    query, [-1, 1, num_units])  # [batch_size, 1, num_units]
                # reduce_sum(score_v*tanh(keys+query), [2])
                scores = _attn_add_fun(
                    score_v, keys,
                    query)  # 注意力分数 [batch_size, attention_length]
            elif attention_option == "luong":  #
                query = array_ops.reshape(
                    query, [-1, 1, num_units])  # [batch_size, 1, num_units]
                # reduce_sum(keys*query, [2])
                scores = _attn_mul_fun(
                    keys, query)  # 注意力分数 [batch_size, attention_length]
            else:
                raise ValueError("Unknown attention option %s!" %
                                 attention_option)

            # alignments: attention scores after softmax [batch_size, attention_length]
            # TODO(thangluong): not normalize over padding positions.
            alignments = nn_ops.softmax(scores)

            # Attention-weighted sum of the encoder outputs (or of the static graph).
            new_alignments = array_ops.expand_dims(
                alignments, 2)  # [batch_size, attention_length, 1]
            context_vector = math_ops.reduce_sum(
                new_alignments * values, [1])  # [batch_size, num_units]
            context_vector.set_shape([None, num_units])

            # Attention over the dynamic graph (the triples).
            if triple_values is not None:
                # triple_keys: [batch_size, triple_num, triple_len, num_units]
                # Luong-style attention scores for every triple [batch_size, triple_num, triple_len]
                triple_scores = math_ops.reduce_sum(
                    triple_keys *
                    array_ops.reshape(query, [-1, 1, 1, num_units]), [3])
                triple_alignments = nn_ops.softmax(
                    triple_scores)  # [batch_size, triple_num, triple_len]
                # attention-weighted sum of the triple values [batch_size, triple_num, num_units]
                context_triples = math_ops.reduce_sum(
                    array_ops.expand_dims(triple_alignments, 3) *
                    triple_values, [2])
                # attention-weighted sum over the dynamic graph [batch_size, num_units]
                context_graph_triples = math_ops.reduce_sum(
                    new_alignments * context_triples, [1])
                context_graph_triples.set_shape([None, num_units])

                # attention over the static graph * attention over the triples
                # = effective attention over each individual triple
                final_alignments = new_alignments * triple_alignments  # [batch_size, triple_num, triple_len]
                return context_vector, context_graph_triples, final_alignments
            else:
                if output_alignments:
                    return context_vector, alignments
                else:
                    return context_vector
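
For reference, a NumPy sketch contrasting the two scoring branches above; `query_w` and `score_v` stand in for the learned parameters and are random here, so this only illustrates the shapes involved:

# Hedged sketch of additive ("bahdanau") vs multiplicative ("luong") attention scores.
import numpy as np

batch_size, attention_length, num_units = 2, 6, 8
keys = np.random.randn(batch_size, attention_length, num_units)
query = np.random.randn(batch_size, num_units)
query_w = np.random.randn(num_units, num_units)   # illustrative parameter
score_v = np.random.randn(num_units)              # illustrative parameter

# "bahdanau": reduce_sum(score_v * tanh(keys + W*query), [2])
q = (query @ query_w).reshape(batch_size, 1, num_units)
scores_bahdanau = np.sum(score_v * np.tanh(keys + q), axis=2)

# "luong": reduce_sum(keys * query, [2])
scores_luong = np.sum(keys * query.reshape(batch_size, 1, num_units), axis=2)

print(scores_bahdanau.shape, scores_luong.shape)   # both (batch_size, attention_length)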
Пример #51
0
    def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
        with ops.name_scope(
                name, "attention_decoder_fn_inference",
                [time, cell_state, cell_input, cell_output, context_state]):
            if cell_input is not None:
                raise ValueError("Expected cell_input to be None, but saw: %s" %
                                 cell_input)
            if cell_output is None:
                # invariant that this is time == 0
                next_input_id = array_ops.ones(
                    [batch_size], dtype=dtype) * start_of_sequence_id
                done = array_ops.zeros([batch_size], dtype=dtypes.bool)
                cell_state = encoder_state
                cell_output = array_ops.zeros(
                    [num_decoder_symbols], dtype=dtypes.float32)
                word_input = array_ops.gather(embeddings, next_input_id)
                naf_triple_id = array_ops.zeros([batch_size, 2], dtype=dtype)
                triple_input = array_ops.gather_nd(imem[1], naf_triple_id)
                cell_input = array_ops.concat([word_input, triple_input], axis=1)

                # init attention
                attention = _init_attention(encoder_state)
                if imem is not None:
                    context_state = tensor_array_ops.TensorArray(
                        dtype=dtypes.int32,
                        tensor_array_name="output_ids_ta",
                        size=maximum_length,
                        dynamic_size=True,
                        infer_shape=False)
            else:
                # construct attention
                attention = attention_construct_fn(cell_output, attention_keys,
                                                   attention_values)
                if type(attention) is tuple:
                    attention, alignment = attention
                    cell_output = attention
                    alignment = tf.reshape(alignment, [batch_size, -1])
                    selector = selector_fn(cell_output)
                    logit = output_fn(cell_output)
                    word_prob = nn_ops.softmax(logit) * (1 - selector)
                    entity_prob = alignment * selector
                    mask = array_ops.reshape(math_ops.cast(math_ops.greater(tf.reduce_max(word_prob, 1), tf.reduce_max(entity_prob, 1)), dtype=dtypes.float32), [-1,1])
                    word_input = mask * array_ops.gather(embeddings, math_ops.cast(math_ops.argmax(word_prob, 1), dtype=dtype)) + (1 - mask) * array_ops.gather_nd(imem[0], array_ops.concat([array_ops.reshape(math_ops.range(batch_size, dtype=dtype), [-1,1]), array_ops.reshape(math_ops.cast(math_ops.argmax(entity_prob, 1), dtype=dtype), [-1,1])], axis=1))
                    indices = array_ops.concat([array_ops.reshape(math_ops.range(batch_size, dtype=dtype), [-1,1]), math_ops.cast(1-mask, dtype=dtype) * tf.reshape(math_ops.cast(math_ops.argmax(alignment, 1), dtype=dtype), [-1, 1])], axis=1)
                    triple_input = array_ops.gather_nd(imem[1], indices)
                    cell_input = array_ops.concat([word_input, triple_input], axis=1)
                    mask = array_ops.reshape(math_ops.cast(mask, dtype=dtype), [-1])
                    input_id = mask * math_ops.cast(math_ops.argmax(word_prob, 1), dtype=dtype) + (mask - 1) * math_ops.cast(math_ops.argmax(entity_prob, 1), dtype=dtype)
                    context_state = context_state.write(time-1, input_id)
                    done = array_ops.reshape(math_ops.equal(input_id, end_of_sequence_id), [-1])
                    cell_output = logit

                else:
                    cell_output = attention

                    # argmax decoder
                    cell_output = output_fn(cell_output)    # logits
                    next_input_id = math_ops.cast(
                            math_ops.argmax(cell_output, 1), dtype=dtype)
                    done = math_ops.equal(next_input_id, end_of_sequence_id)
                    cell_input = array_ops.gather(embeddings, next_input_id)

            # combine cell_input and attention
            next_input = array_ops.concat([cell_input, attention], 1)

            # if time > maxlen, return all true vector
            done = control_flow_ops.cond(
                    math_ops.greater(time, maximum_length),
                    lambda: array_ops.ones([batch_size,], dtype=dtypes.bool),
                    lambda: done)
            return (done, cell_state, next_input, cell_output, context_state)
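
A hedged NumPy sketch of the generate-vs-copy choice in the branch above: the selector splits probability mass between the vocabulary distribution and the entity alignment, and the sign trick `mask * word_id + (mask - 1) * entity_id` marks copied entities with negative ids, mirroring the snippet. Everything here is a random stand-in:

# Hedged sketch of choosing between a generated word and a copied entity.
import numpy as np

batch_size, vocab, n_entities = 3, 12, 5
selector = np.random.uniform(size=(batch_size, 1))            # probability of copying
word_prob = np.random.dirichlet(np.ones(vocab), batch_size) * (1 - selector)
entity_prob = np.random.dirichlet(np.ones(n_entities), batch_size) * selector

mask = (word_prob.max(1) > entity_prob.max(1)).astype(np.float64)   # 1 = generate a word
input_id = mask * word_prob.argmax(1) + (mask - 1) * entity_prob.argmax(1)
print(input_id)   # negative entries mark copied entities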
Пример #52
0
    def attention(decoder_state, coverage=None, num_words_section=None, step=None):
      """Calculate the context vector and attention distribution from the decoder state.

      Args:
        decoder_state: state of the decoder
        coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1).
        num_words_section: number of words in each section (only needed for hierarchical attention)
        [batch_size, num_sections] -- assumes number of sections in the batch is equal (TODO: check sanity)
        step: index of the current decoder step (needed for section attention)

      Returns:
        context_vector: weighted sum of encoder_states
        attn_dist: attention distribution
        coverage: new coverage vector. shape (batch_size, attn_len, 1, 1)
      """
      with variable_scope.variable_scope("Attention"):
        # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
        # (W_s s_t) + b_att is decoder_features; s_t = decoder_state
        decoder_features = linear(decoder_state, attention_vec_size, True) # shape (batch_size, attention_vec_size)
        decoder_features = tf.expand_dims(tf.expand_dims(decoder_features, 1), 1) # reshape to (batch_size, 1, 1, attention_vec_size)

        def masked_attention(e, enc_padding_mask):
          """Take softmax of e, then apply enc_padding_mask and re-normalize."""
          if enc_section_padding_mask is not None:
            enc_padding_mask = tf.reshape(enc_section_padding_mask, [batch_size, -1])
            enc_padding_mask = tf.cast(enc_padding_mask, tf.float32)
          attn_dist = nn_ops.softmax(e) # take softmax. shape (batch_size, attn_length)
          attn_dist *= enc_padding_mask # apply mask
          masked_sums = tf.reduce_sum(attn_dist, axis=1) # shape (batch_size)
          return attn_dist / tf.reshape(masked_sums, [-1, 1]) # re-normalize

        if use_coverage and coverage is not None: # non-first step of coverage
          if not hier:
            # TODO: add coverage on sections 
            # Multiply coverage vector by w_c to get coverage_features.
            coverage_features = nn_ops.conv2d(coverage, w_c, [1, 1, 1, 1], "SAME") # c has shape (batch_size, seq_len, 1, attention_vec_size)
  
            # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
            e = math_ops.reduce_sum(v * math_ops.tanh(encoder_features + decoder_features + coverage_features), [2, 3])  # shape (batch_size,seq_len)
  
            # Take softmax of e to get the attention distribution
            attn_dist = masked_attention(e, enc_padding_mask)
  
            # Update coverage vector
            coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) # shape=(batch_size, seq_len,1,1)
          else:
            with tf.variable_scope("attention_words_sections"):
              coverage_features = nn_ops.conv2d(coverage, w_c, [1, 1, 1, 1], "SAME") # c has shape (batch_size, seq_len, 1, attention_vec_size)
              e = math_ops.reduce_sum(v * math_ops.tanh(encoder_features + decoder_features + encoder_section_features + coverage_features), [2, 3])  # shape (batch_size,seq_len)
              attn_dist = masked_attention(e, enc_padding_mask)
              coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) # shape=(batch_size, seq_len,1,1)
        else:
          # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
          if hier:
            with tf.variable_scope("attention_words_sections"):
              e = math_ops.reduce_sum(v * math_ops.tanh(encoder_features + decoder_features + encoder_section_features), [2, 3]) #[batch_size x seq_len]
            if enc_padding_mask is not None:
              attn_dist = masked_attention(e, enc_padding_mask)
            else:
              attn_dist = nn_ops.softmax(e) # shape (batch_size, seq_len)
          else:
            e = math_ops.reduce_sum(v * math_ops.tanh(encoder_features + decoder_features), [2, 3]) # calculate e
            # Take softmax of e to get the attention distribution
            if enc_padding_mask is not None:
              attn_dist = masked_attention(e, enc_padding_mask)
            else:
              attn_dist = nn_ops.softmax(e) # shape (batch_size, seq_len)
          if use_coverage: # first step of training
            coverage = tf.expand_dims(tf.expand_dims(attn_dist,2),2) # initialize coverage

        # Calculate the context vector from attn_dist and encoder_states
        # encoder_states = [batch, seq_len, 1, encoder_output_size], attn_dist = [batch, seq_len, 1, 1]
        context_vector = math_ops.reduce_sum(array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) * encoder_states, [1, 2]) # shape (batch_size, enc_output_size).
        context_vector = array_ops.reshape(context_vector, [-1, enc_output_size])

        return context_vector, attn_dist, coverage
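
A minimal sketch of `masked_attention` above, in NumPy with made-up shapes: softmax the scores, zero out padded positions, then re-normalize so each row sums to one:

# Hedged sketch of masked attention re-normalization (illustrative only).
import numpy as np

batch_size, attn_length = 2, 6
e = np.random.randn(batch_size, attn_length)                  # raw attention scores
enc_padding_mask = np.array([[1, 1, 1, 1, 0, 0],
                             [1, 1, 1, 0, 0, 0]], dtype=np.float64)

attn_dist = np.exp(e - e.max(1, keepdims=True))
attn_dist /= attn_dist.sum(1, keepdims=True)                  # softmax
attn_dist *= enc_padding_mask                                  # mask out padding
attn_dist /= attn_dist.sum(1, keepdims=True)                  # re-normalize
assert np.allclose(attn_dist.sum(1), 1.0)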
Пример #53
0
 def testShapeInference(self):
     op = nn_ops.softmax([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
                          [[2., 3., 4., 5.], [6., 7., 8., 9.]],
                          [[5., 4., 3., 2.], [1., 2., 3., 4.]]])
     self.assertEqual([3, 2, 4], op.get_shape())
Пример #54
0
def _test_softmax(data):
    """ One iteration of softmax """
    with tf.Graph().as_default():
        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
        out = nn_ops.softmax(in_data)
        compare_tflite_with_tvm(data, 'Placeholder:0', [in_data], [out])
Пример #55
0
        def loop_fn(loop_time, cell_output, cell_state, loop_state):
            if cell_output is None:  # time == 0
                final_dist = None
                emit_output = final_dist  # == None for time == 0
                next_cell_state = initial_state  # encoder last states
                coverage = (array_ops.zeros([batch_size, attn_size])
                            if prev_coverage is None else prev_coverage)

                # context vector will initially be zeros
                # Ensure the second shape of attention vectors is set.
                context_vector = array_ops.zeros([batch_size, attn_size])
                context_vector.set_shape([None, attn_size])

                if initial_state_attention:
                    with variable_scope.variable_scope(
                            scope.Attention, reuse=tf.AUTO_REUSE):
                        # true in decode mode
                        # Re-calculate the context vector from the previous
                        # step so that we can pass it through a linear layer
                        # with this step's input to get a modified version of
                        # the input in decode mode, this is what updates the
                        # coverage vector
                        context_vector, _, coverage = _compute_attention(
                            cell_output=next_cell_state[-1].h,
                            coverage=coverage)

                # all TensorArrays for recoding sequences
                outputs_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                alignments_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                p_gens_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                coverages_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                sampled_tokens_history = tensor_array_ops.TensorArray(
                    dtype=tf.int32, size=0, dynamic_size=True)
                
                # mostly used in debugging
                logits_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                vocab_dists_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                final_dists_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                
            else:
                # normal workflow:
                # decoder_inputs = input_kernel(inputs; context)
                # cell_output, states = cell(decoder_inputs, states)
                # context, att_dist, coverage = attention(states, coverage)
                # p_gen = pgen_kernel(...)
                # cell_outputs = output_kernel(cell_output, context)

                # since raw-rnn encapsulates cell call
                # we do this:
                # context, att_dist, coverage = attention(states, coverage)
                # p_gen = pgen_kernel(...)
                # cell_outputs = output_kernel(cell_output, context)
                # next_inputs = input_kernel(inputs; context) --> changed
                # Run the attention mechanism.

                # no change
                next_cell_state = cell_state

                # get the cell state of last layer's cell
                last_layer_state = cell_state[-1]

                # cell_input is cell inputs
                (sampled_tokens_history,
                 outputs_history, alignments_history, p_gens_history,
                 coverages_history, logits_history, vocab_dists_history,
                 final_dists_history, coverage, cell_input) = loop_state

                # Run the attention mechanism.
                with variable_scope.variable_scope(
                        scope.Attention, reuse=tf.AUTO_REUSE):
                    # reuse=initial_state_attention or i > 0
                    # or scope.Attention.reuse):
                    context_vector, attn_dist, coverage = _compute_attention(
                        cell_output=cell_output, coverage=coverage)
                    
                    # Concatenate the cell_output (= decoder state)
                    # and the context vector, and pass them through
                    # a linear layer. This is V[s_t, h*_t] + b in the paper
                    attention_output = output_kernel(
                        array_ops.concat([cell_output, context_vector], -1))

                    # update attention and cell_outputs
                    outputs_history = outputs_history.write(
                        loop_time - 1, attention_output)
                    alignments_history = alignments_history.write(
                        loop_time - 1, attn_dist)
                    coverages_history = coverages_history.write(
                        loop_time - 1, coverage)

                # Calculate p_gen
                if pointer_gen:
                    with variable_scope.variable_scope(scope.Pointer):
                        p_gen = pgen_kernel(array_ops.concat([
                            context_vector, last_layer_state.c,
                            last_layer_state.h, cell_input], -1))
                        # update p_gens_history distributions
                        p_gens_history = p_gens_history.write(
                            loop_time - 1, p_gen)

                # reuse variables
                # probably not necessary
                # [scope.Decoder[i].reuse_variables()
                #     for i in range(len(scope.Decoder))]
                # scope.Attention.reuse_variables()
                # scope.Pointer.reuse_variables()

                # distribution
                logits = logits_kernel(attention_output)
                vocab_dist = nn_ops.softmax(logits)
                
                final_dist = _calc_final_dist(
                    vocab_dist=vocab_dist,
                    attn_dist=attn_dist,
                    p_gen=p_gen,
                    batch_size=batch_size,
                    vocab_size=vocab_size,
                    num_source_OOVs=num_source_OOVs,
                    enc_batch_extended_vocab=enc_batch_extended_vocab)

                # raw_rnn requires `emit_output` to have same
                # shape with cell.output_size
                # thus we have to output attention_output
                # but not the final_distribution
                emit_output = attention_output

                # save these for debugging
                logits_history = logits_history.write(
                    loop_time - 1, logits)
                vocab_dists_history = vocab_dists_history.write(
                    loop_time - 1, vocab_dist)
                final_dists_history = final_dists_history.write(
                    loop_time - 1, final_dist)

            # generic
            elements_finished = (loop_time >= sequence_length)
            finished = math_ops.reduce_all(elements_finished)

            if reinforce and not initial_state_attention:
                # see Google's code
                # elements_finished = tf.logical_or(
                #     tf.equal(chosen_outputs, misc.BF_EOS_INT),
                #     loop_time >= global_config.timestep_limit)
                # they have this logical_or to stop
                # generation when sampled STOP
                # I am ignoring this for now, but probably
                # look back on this later?

                # also, Google used prev_elements_finished
                # but I used elements_finished, is that correct?

                if cell_output is None:  # time == 0
                    # when time == 0, use start_tokens
                    tf.logging.info("Running RLModel")
                    chosen_outputs = start_tokens
                else:
                    def _multinomial_sample(probs):
                        # tf.multinomial only samples from
                        # logits (unnormalized probability)
                        # here we only have normalized probability
                        # thus we use distributions.Categorical
                        dist = categorical.Categorical(probs=probs)

                        # use argmax during debugging
                        if not debug_mode:
                            sampled_tokens = dist.sample()
                        else:
                            sampled_tokens = dist.mode()

                        # since final_dist = vocab_dist + copy_dist
                        # sampled_tokens can have index out-of vocab_dist
                        # in this case we cast them into UNK
                        UNKs = array_ops.ones_like(sampled_tokens) * UNK_token
                        # indices >= vocab_size point into the copy-extended vocab
                        sampled_tokens = array_ops.where(
                            math_ops.greater_equal(sampled_tokens, vocab_size),
                            UNKs, sampled_tokens, name="sampled_tokens")

                        return sampled_tokens

                    # otherwise, do the sampling in sequence_length
                    chosen_outputs = tf.to_int32(array_ops.where(
                        elements_finished,
                        array_ops.zeros([batch_size], dtype=tf.int32),
                        _multinomial_sample(final_dist)))

                    sampled_tokens_history = sampled_tokens_history.write(
                        loop_time - 1, chosen_outputs)

                next_input = array_ops.gather(embeddings, chosen_outputs)
            else:
                next_input = control_flow_ops.cond(
                    finished,
                    lambda: array_ops.zeros(
                        [batch_size, input_size], dtype=tf.float32),
                    lambda: inputs_ta.read(loop_time))

            with variable_scope.variable_scope(scope.Attention):
                # next inputs = input_kernel(inp; context)
                next_cell_input = input_kernel(
                    array_ops.concat([next_input, context_vector], -1))

            next_loop_state = (
                sampled_tokens_history,
                outputs_history, alignments_history, p_gens_history,
                coverages_history, logits_history, vocab_dists_history,
                final_dists_history, coverage, next_cell_input)
            
            return (elements_finished, next_cell_input, next_cell_state,
                    emit_output, next_loop_state)
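
The call to `_calc_final_dist` above is not shown in this snippet, so as an assumption the sketch below uses the standard pointer-generator mixing: scale the vocabulary distribution by p_gen, scale the attention distribution by (1 - p_gen), and scatter-add the copy probabilities into an extended vocabulary. NumPy, illustrative shapes only:

# Hedged sketch (assumed standard pointer-generator mixing, not the example's
# _calc_final_dist): final = p_gen * vocab_dist over the extended vocab
# plus (1 - p_gen) * copy probabilities from the attention distribution.
import numpy as np

batch_size, vocab_size, enc_len, num_oov = 2, 8, 5, 2
extended_vsize = vocab_size + num_oov

vocab_dist = np.random.dirichlet(np.ones(vocab_size), batch_size)
attn_dist = np.random.dirichlet(np.ones(enc_len), batch_size)
p_gen = np.random.uniform(size=(batch_size, 1))
enc_batch_extended_vocab = np.random.randint(0, extended_vsize, (batch_size, enc_len))

final_dist = np.zeros((batch_size, extended_vsize))
final_dist[:, :vocab_size] = p_gen * vocab_dist
for b in range(batch_size):                       # scatter-add the copy probabilities
    for t in range(enc_len):
        final_dist[b, enc_batch_extended_vocab[b, t]] += (1 - p_gen[b, 0]) * attn_dist[b, t]

assert np.allclose(final_dist.sum(1), 1.0)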
Пример #56
0
 def XentLossGrad(logits, labels, dloss):
     dlogits = array_ops.reshape(
         dloss, [-1, 1]) * (nn_ops.softmax(logits) - labels)
     dlabels = array_ops.zeros_like(labels)
     # Takes exp(dlogits) to differentiate it from the "correct" gradient.
     return math_ops.exp(dlogits), dlabels
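
A quick NumPy check of the identity the gradient above builds on: for the cross-entropy of a softmax, the derivative with respect to the logits is softmax(logits) - labels (before the deliberate exp() tweak in the test). Random values, central finite differences:

# Hedged numerical check that d(xent)/d(logits) = softmax(logits) - labels.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(-1, keepdims=True))
    return e / e.sum(-1, keepdims=True)

def xent(logits, labels):
    return -np.sum(labels * np.log(softmax(logits)), axis=-1)

logits = np.random.randn(3, 5)
labels = np.eye(5)[np.random.randint(0, 5, size=3)]           # one-hot targets

analytic = softmax(logits) - labels
numeric = np.zeros_like(logits)
eps = 1e-6
for i in range(logits.shape[0]):
    for j in range(logits.shape[1]):
        d = np.zeros_like(logits)
        d[i, j] = eps
        numeric[i, j] = (xent(logits + d, labels)[i] - xent(logits - d, labels)[i]) / (2 * eps)

assert np.allclose(analytic, numeric, atol=1e-4)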
Пример #57
0
 def MnistForward(w, b, x):
     return nn_ops.softmax(math_ops.matmul(x, w) + b)
Пример #58
0
        def intra_decoder_attention(decoder_state, outputs):
            """Calculate the context vector and attention distribution from the decoder state.

      Args:
        decoder_state: state of the decoder
        outputs: list of decoder states for implementing intra-decoder mechanism, len(decoder_states) * (batch_size, hidden_dim)
      Returns:
        context_decoder_vector: weighted sum of _dec_states
        decoder_attn_dist: intra-decoder attention distribution
      """
            attention_dec_vec_size = attn_dec_size = decoder_state.c.get_shape(
            )[1].value  # hidden_dim
            try:
                len_dec_states = outputs.get_shape()[0].value
            except:
                len_dec_states = 0
            _decoder_states = tf.expand_dims(
                tf.reshape(outputs, [batch_size, -1, attn_dec_size]), axis=2
            )  # now is shape (batch_size,len(decoder_states), 1, attn_size)
            _prev_decoder_features = nn_ops.conv2d(
                _decoder_states, W_h_d, [1, 1, 1, 1], "SAME"
            )  # shape (batch_size,len(decoder_states),1,attention_vec_size)
            with variable_scope.variable_scope("DecoderAttention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                try:
                    decoder_features = linear(
                        decoder_state, attention_dec_vec_size,
                        True)  # shape (batch_size, attention_vec_size)
                    decoder_features = tf.expand_dims(
                        tf.expand_dims(decoder_features, 1), 1
                    )  # reshape to (batch_size, 1, 1, attention_dec_vec_size)
                    # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                    if _hps.matrix_attention:
                        # Calculate h_d * W_attn * h_d, equation 6 in https://arxiv.org/pdf/1705.04304.pdf
                        _dec_attn = tf.matmul(
                            tf.squeeze(decoder_features),
                            w_dec_attn)  # (batch_size, decoder_attn_size)
                        _dec_states_lst = tf.unstack(
                            tf.reshape(_prev_decoder_features,
                                       [batch_size, -1, decoder_attn_size])
                        )  # batch_size * (len(decoder_states), decoder_attn_size)
                        e_not_masked = tf.reshape(
                            tf.stack([
                                tf.matmul(_dec_attn, tf.transpose(k))
                                for k in _dec_states_lst
                            ]), [batch_size, -1
                                 ])  # (batch_size, len(decoder_states))
                        masked_e = tf.exp(
                            e_not_masked * dec_padding_mask[:, :len_dec_states]
                        )  # (batch_size, len(decoder_states))
                    else:
                        # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                        e_not_masked = math_ops.reduce_sum(
                            v_d * math_ops.tanh(_prev_decoder_features +
                                                decoder_features),
                            [
                                2, 3
                            ])  # calculate e, (batch_size,len(decoder_states))
                        masked_e = nn_ops.softmax(
                            e_not_masked
                        ) * dec_padding_mask[:, :
                                             len_dec_states]  # (batch_size,len(decoder_states))
                    if len_dec_states <= 1:
                        masked_e = array_ops.ones(
                            [batch_size,
                             1])  # first step is filled with equal values
                    # masked_sums must be computed on every step, not only the first one
                    masked_sums = tf.reshape(
                        tf.reduce_sum(masked_e, axis=1), [-1, 1]
                    )  # (batch_size, 1); if it's zero due to masking we set it to a small value
                    decoder_attn_dist = masked_e / masked_sums  # (batch_size, len(decoder_states))
                    context_decoder_vector = math_ops.reduce_sum(
                        array_ops.reshape(decoder_attn_dist,
                                          [batch_size, -1, 1, 1]) *
                        _decoder_states, [1, 2])  # (batch_size, attn_size)
                    context_decoder_vector = array_ops.reshape(
                        context_decoder_vector,
                        [-1, attn_dec_size])  # (batch_size, attn_size)
                except:
                    return array_ops.zeros(
                        [batch_size,
                         decoder_attn_size]), array_ops.zeros([batch_size, 0])
            return context_decoder_vector, decoder_attn_dist
Пример #59
0
 def XentLoss(logits, labels):
     return math_ops.reduce_sum(
         labels * math_ops.log(nn_ops.softmax(logits)), 1)
Пример #60
0
def _output_with_attention(cell_output,
                           output_size,
                           decoder_hidden,
                           attn_size,
                           projection_attention_f,
                           initializer=None,
                           output_form=OUTPUT_CONCAT):
    """

    Parameters
    ----------
    decoder_hidden
    attn_size
    projection_attention_f
    initializer
    step_num

    Returns
    -------

    """
    assert initializer is not None

    with vs.variable_scope("AttnOutputProjection", initializer=initializer):

        with vs.variable_scope("output_attention", initializer=initializer):

            s = projection_attention_f(decoder_hidden, attn_size)

            # beta will be (?, timesteps)
            beta = nn_ops.softmax(s)

            shape = decoder_hidden.get_shape()
            timesteps = shape[1].value
            b = array_ops.reshape(beta, [-1, timesteps, 1, 1])

            # b is (?, timesteps, 1, 1) and broadcasts against decoder_hidden
            d = math_ops.reduce_sum(b * decoder_hidden, [1, 2])

            # d is (?, decoder_size)
            # ds is (?, decoder_size)
            ds = tf.reshape(d, [-1, attn_size])

            _ = tf.histogram_summary('attention_context', ds)

        # output = cells.linear([cell_output] + [ds], output_size, True)

        if output_form == OUTPUT_SPLIT:
            output = _output_form_split(cell_output,
                                        ds,
                                        output_size,
                                        initializer=initializer)

        elif output_form == OUTPUT_SINGLE:
            output = _output_form_single(ds,
                                         output_size,
                                         initializer=initializer)

        else:
            output = _output_form_concat(cell_output,
                                         ds,
                                         output_size,
                                         initializer=initializer)

        output = tf.tanh(output)

    return output
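
A hedged NumPy sketch of the output-attention block above, with `projection_attention_f` replaced by random scores since its definition is outside this snippet: softmax over the timesteps, weight the decoder hidden states, and reshape the context to (?, attn_size):

# Hedged sketch of the attention-weighted output context (illustrative only;
# projection_attention_f is replaced by random scores here).
import numpy as np

batch_size, timesteps, attn_size = 2, 6, 4
decoder_hidden = np.random.randn(batch_size, timesteps, 1, attn_size)

s = np.random.randn(batch_size, timesteps)           # stand-in for projection_attention_f(...)
beta = np.exp(s - s.max(1, keepdims=True))
beta /= beta.sum(1, keepdims=True)                    # softmax over timesteps

b = beta.reshape(batch_size, timesteps, 1, 1)
d = np.sum(b * decoder_hidden, axis=(1, 2))           # (batch_size, attn_size)
ds = d.reshape(-1, attn_size)
print(ds.shape)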