Example No. 1
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops


# `monotonic_attention` is assumed to be in scope; it is the function exposed
# publicly as `tf.contrib.seq2seq.monotonic_attention`.
def _monotonic_probability_fn(score, previous_alignments, sigmoid_noise, mode,
                              seed=None):
    """Attention probability function for monotonic attention.
    Takes in unnormalized attention scores, adds pre-sigmoid noise to encourage
    the model to make discrete attention decisions, passes them through a sigmoid
    to obtain "choosing" probabilities, and then calls monotonic_attention to
    obtain the attention distribution.  For more information, see
    Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck,
    "Online and Linear-Time Attention by Enforcing Monotonic Alignments."
    ICML 2017.  https://arxiv.org/abs/1704.00784
    Args:
        score: Unnormalized attention scores, shape
            `[batch_size, alignments_size]`
        previous_alignments: Previous attention distribution, shape
            `[batch_size, alignments_size]`
        sigmoid_noise: Standard deviation of pre-sigmoid noise.  Setting this
            larger than 0 will encourage the model to produce large attention
            scores, effectively making the choosing probabilities discrete and
            the resulting attention distribution one-hot.  It should be set to 0
            at test-time, and when hard attention is not desired.
        mode: How to compute the attention distribution.  Must be one of
            'recursive', 'parallel', or 'hard'.  See the docstring for
            `tf.contrib.seq2seq.monotonic_attention` for more information.
        seed: (optional) Random seed for pre-sigmoid noise.
    Returns:
        A `[batch_size, alignments_size]`-shape tensor corresponding to the
        resulting attention distribution.
    """
    # Optionally add pre-sigmoid noise to the scores
    if sigmoid_noise > 0:
        noise = random_ops.random_normal(array_ops.shape(score), dtype=score.dtype,
                                         seed=seed)
        score += sigmoid_noise*noise
    # Compute "choosing" probabilities from the attention scores
    if mode == "hard":
        # When mode is hard, use a hard sigmoid
        p_choose_i = math_ops.cast(score > 0, score.dtype)
    else:
        p_choose_i = math_ops.sigmoid(score)
    # Convert from choosing probabilities to attention distribution
    return monotonic_attention(p_choose_i, previous_alignments, mode)
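
A minimal usage sketch (assuming TensorFlow 1.x): it binds the hyperparameters with `functools.partial`, roughly the way an attention mechanism would supply a probability function, and assumes the bare `monotonic_attention` call above resolves to `tf.contrib.seq2seq.monotonic_attention`. The batch size, scores, and initial alignments below are purely illustrative.

import functools

import numpy as np
import tensorflow as tf

# Assumed binding for the bare `monotonic_attention` call inside the function.
monotonic_attention = tf.contrib.seq2seq.monotonic_attention

batch_size, alignments_size = 2, 5
score = tf.constant(np.random.randn(batch_size, alignments_size), tf.float32)
# Start with all of the attention mass on the first input position.
previous_alignments = tf.one_hot([0] * batch_size, alignments_size)

# Bind the training-time hyperparameters once, then call like a probability_fn.
probability_fn = functools.partial(
    _monotonic_probability_fn, sigmoid_noise=1.0, mode="parallel", seed=0)
alignments = probability_fn(score, previous_alignments)

with tf.Session() as sess:
    print(sess.run(alignments))  # each row sums to at most 1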

Example No. 2

    # Test method from a tf.test.TestCase subclass; assumes, e.g.:
    #   import numpy as np
    #   from tensorflow.python.framework import constant_op
    #   from tensorflow.contrib.seq2seq.python.ops import attention_wrapper as wrapper
    def test_monotonic_attention(self):
        def monotonic_attention_explicit(p_choose_i, previous_attention):
            """Explicitly compute monotonic attention distribution using numpy."""
            # Base case for recurrence relation
            out = [previous_attention[0]]
            # Explicitly follow the recurrence relation
            for j in range(1, p_choose_i.shape[0]):
                out.append((1 - p_choose_i[j - 1]) * out[j - 1] +
                           previous_attention[j])
            return p_choose_i * np.array(out)

        # Generate a random batch of choosing probabilities for seq. len. 20
        p_choose_i = np.random.uniform(size=(10, 20)).astype(np.float32)
        # Generate random previous attention distributions
        previous_attention = np.random.uniform(
            size=(10, 20)).astype(np.float32)
        previous_attention /= previous_attention.sum(axis=1).reshape((-1, 1))

        # Create the output to test against
        explicit_output = np.array([
            monotonic_attention_explicit(p, a)
            for p, a in zip(p_choose_i, previous_attention)
        ])

        # Compute the output with the TensorFlow implementation ('recursive' mode)
        with self.test_session():
            recursive_output = wrapper.monotonic_attention(
                p_choose_i, previous_attention, 'recursive').eval()

        self.assertEqual(recursive_output.ndim, explicit_output.ndim)
        for x, y in zip(recursive_output.shape, explicit_output.shape):
            self.assertEqual(x, y)
        for x, y in zip(recursive_output.flatten(), explicit_output.flatten()):
            # Use assertAlmostEqual for the actual values due to floating point
            self.assertAlmostEqual(x, y, places=5)

        # Generate new p_choose_i for parallel, which is unstable when p_choose_i[n]
        # is close to 1
        p_choose_i = np.random.uniform(0, 0.9,
                                       size=(10, 20)).astype(np.float32)

        # Create new output to test against
        explicit_output = np.array([
            monotonic_attention_explicit(p, a)
            for p, a in zip(p_choose_i, previous_attention)
        ])

        # Compute the output with the TensorFlow implementation ('parallel' mode)
        with self.test_session():
            parallel_output = wrapper.monotonic_attention(
                p_choose_i, previous_attention, 'parallel').eval()

        self.assertEqual(parallel_output.ndim, explicit_output.ndim)
        for x, y in zip(parallel_output.shape, explicit_output.shape):
            self.assertEqual(x, y)
        for x, y in zip(parallel_output.flatten(), explicit_output.flatten()):
            # Use assertAlmostEqual for the actual values due to floating point
            self.assertAlmostEqual(x, y, places=5)

        # Now, test hard mode, where probabilities must be 0 or 1
        p_choose_i = np.random.choice(np.array([0, 1], np.float32), (10, 20))
        previous_attention = np.zeros((10, 20), np.float32)
        # Randomly choose input sequence indices at each timestep
        random_idx = np.random.randint(0, previous_attention.shape[1],
                                       previous_attention.shape[0])
        previous_attention[np.arange(previous_attention.shape[0]),
                           random_idx] = 1

        # Create the output to test against
        explicit_output = np.array([
            monotonic_attention_explicit(p, a)
            for p, a in zip(p_choose_i, previous_attention)
        ])

        # Compute the output with the TensorFlow implementation ('hard' mode)
        with self.test_session():
            hard_output = wrapper.monotonic_attention(
                # TensorFlow is unhappy when these are not wrapped as tf.constant
                constant_op.constant(p_choose_i),
                constant_op.constant(previous_attention),
                'hard').eval()

        self.assertEqual(hard_output.ndim, explicit_output.ndim)
        for x, y in zip(hard_output.shape, explicit_output.shape):
            self.assertEqual(x, y)
        for x, y in zip(hard_output.flatten(), explicit_output.flatten()):
            # Use assertAlmostEqual for the actual values due to floating point
            self.assertAlmostEqual(x, y, places=5)

        # Now, test recursively computing attention distributions vs. sampling
        def sample(p_choose_i):
            """Generate a sequence of emit-ingest decisions from p_choose_i."""
            output = np.zeros(p_choose_i.shape)
            t_im1 = 0
            for i in range(p_choose_i.shape[0]):
                for j in range(t_im1, p_choose_i.shape[1]):
                    if np.random.uniform() <= p_choose_i[i, j]:
                        output[i, j] = 1
                        t_im1 = j
                        break
                else:
                    t_im1 = p_choose_i.shape[1]
            return output

        # Now, the first axis is output timestep and second is input timestep
        p_choose_i = np.random.uniform(size=(4, 5)).astype(np.float32)
        # Generate the average of a bunch of samples
        n_samples = 100000
        sampled_output = np.mean(
            [sample(p_choose_i) for _ in range(n_samples)], axis=0)

        # Create initial previous_attention base case
        recursive_output = [
            np.array([1] + [0] * (p_choose_i.shape[1] - 1), np.float32)
        ]
        # Compute the recursive attention distributions one output step at a time
        with self.test_session():
            for j in range(p_choose_i.shape[0]):
                # Compute attention distribution for this output time step
                recursive_output.append(
                    wrapper.monotonic_attention(
                        # newaxis is for adding the expected batch dimension
                        p_choose_i[j][np.newaxis],
                        recursive_output[-1][np.newaxis],
                        'recursive').eval()[0])
            # Stack the distributions together and drop the base case
            recursive_output = np.array(recursive_output[1:])

        self.assertEqual(recursive_output.ndim, sampled_output.ndim)
        for x, y in zip(recursive_output.shape, sampled_output.shape):
            self.assertEqual(x, y)
        for x, y in zip(recursive_output.flatten(), sampled_output.flatten()):
            # Use a very forgiving threshold since we are sampling
            self.assertAlmostEqual(x, y, places=2)
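
The 'parallel' mode exercised above is the same recurrence unrolled with cumulative sums and products. The following plain-NumPy sketch (single unbatched example, with a hypothetical `exclusive_cumprod` helper defined here; it is not the tf.contrib implementation) shows that unrolled form and why the test keeps p_choose_i away from 1 for this mode: the exclusive cumulative product of (1 - p_choose_i) collapses toward zero and is divided out.

import numpy as np

def exclusive_cumprod(x):
    """Cumulative product shifted right by one: [1, x0, x0*x1, ...]."""
    return np.cumprod(np.concatenate(([1.0], x[:-1])))

def monotonic_attention_parallel(p_choose_i, previous_attention, eps=1e-10):
    """Closed form of the recurrence above for a single example."""
    cumprod_1mp = exclusive_cumprod(1 - p_choose_i)
    # Dividing by cumprod_1mp is what becomes unstable as p_choose_i -> 1,
    # since the exclusive cumulative product underflows toward zero.
    return p_choose_i * cumprod_1mp * np.cumsum(
        previous_attention / np.clip(cumprod_1mp, eps, 1.0))

p = np.random.uniform(0, 0.9, size=20).astype(np.float32)
a = np.random.uniform(size=20).astype(np.float32)
a /= a.sum()
# Should agree with monotonic_attention_explicit(p, a) above to within
# floating-point error.
print(monotonic_attention_parallel(p, a))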