def _monotonic_probability_fn(score, previous_alignments, sigmoid_noise, mode,
                              seed=None):
  """Compute a monotonic attention distribution from raw attention scores.

  Unnormalized scores are (optionally) perturbed with pre-sigmoid Gaussian
  noise, squashed into per-position "choosing" probabilities, and then fed to
  `monotonic_attention` to produce the final attention distribution.

  See: Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas
  Eck, "Online and Linear-Time Attention by Enforcing Monotonic Alignments."
  ICML 2017.  https://arxiv.org/abs/1704.00784

  Args:
    score: Unnormalized attention scores, shape
      `[batch_size, alignments_size]`.
    previous_alignments: Previous attention distribution, shape
      `[batch_size, alignments_size]`.
    sigmoid_noise: Standard deviation of the pre-sigmoid noise.  Values
      larger than 0 push the model toward large-magnitude scores, making the
      choosing probabilities effectively discrete and the resulting attention
      one-hot.  Set to 0 at test time or whenever hard attention is not
      desired.
    mode: How to compute the attention distribution; one of 'recursive',
      'parallel', or 'hard'.  See the docstring of
      `tf.contrib.seq2seq.monotonic_attention` for details.
    seed: (optional) Random seed for the pre-sigmoid noise.

  Returns:
    A `[batch_size, alignments_size]` tensor holding the resulting attention
    distribution.
  """
  # Perturb the scores before the sigmoid; this nudges training toward
  # discrete (near-0/1) choosing probabilities.  Skipped entirely when the
  # noise level is zero.
  if sigmoid_noise > 0:
    pre_sigmoid_noise = random_ops.random_normal(
        array_ops.shape(score), dtype=score.dtype, seed=seed)
    score = score + sigmoid_noise * pre_sigmoid_noise
  # 'hard' mode thresholds at zero (a hard sigmoid); every other mode uses
  # the ordinary logistic sigmoid.
  p_choose_i = (math_ops.cast(score > 0, score.dtype) if mode == "hard"
                else math_ops.sigmoid(score))
  # Turn the per-position choosing probabilities into a full attention
  # distribution.
  return monotonic_attention(p_choose_i, previous_alignments, mode)
def test_monotonic_attention(self):
  """Check `wrapper.monotonic_attention` against a numpy reference.

  Covers all three computation modes ('recursive', 'parallel', 'hard')
  against an explicit numpy recurrence, then checks that the recursive mode
  matches a Monte-Carlo estimate obtained by sampling emit/ingest decisions.
  """

  def monotonic_attention_explicit(p_choose_i, previous_attention):
    """Explicitly compute monotonic attention distribution using numpy."""
    # Base case for recurrence relation
    out = [previous_attention[0]]
    # Explicitly follow the recurrence relation
    for j in range(1, p_choose_i.shape[0]):
      out.append((1 - p_choose_i[j - 1]) * out[j - 1] + previous_attention[j])
    return p_choose_i * np.array(out)

  # Generate a random batch of choosing probabilities for seq. len. 20
  p_choose_i = np.random.uniform(size=(10, 20)).astype(np.float32)
  # Generate random previous attention distributions
  previous_attention = np.random.uniform(size=(10, 20)).astype(np.float32)
  # Normalize rows so each is a valid probability distribution.
  previous_attention /= previous_attention.sum(axis=1).reshape((-1, 1))
  # Create the output to test against
  explicit_output = np.array([
      monotonic_attention_explicit(p, a)
      for p, a in zip(p_choose_i, previous_attention)
  ])
  # Compute output with TensorFlow function, for both calculation types
  with self.test_session():
    recursive_output = wrapper.monotonic_attention(
        p_choose_i, previous_attention, 'recursive').eval()
  # Compare rank, shape, and values against the numpy reference.
  self.assertEqual(recursive_output.ndim, explicit_output.ndim)
  for x, y in zip(recursive_output.shape, explicit_output.shape):
    self.assertEqual(x, y)
  for x, y in zip(recursive_output.flatten(), explicit_output.flatten()):
    # Use assertAlmostEqual for the actual values due to floating point
    self.assertAlmostEqual(x, y, places=5)

  # Generate new p_choose_i for parallel, which is unstable when p_choose_i[n]
  # is close to 1
  p_choose_i = np.random.uniform(0, 0.9, size=(10, 20)).astype(np.float32)
  # Create new output to test against
  # NOTE: previous_attention is deliberately reused from the section above.
  explicit_output = np.array([
      monotonic_attention_explicit(p, a)
      for p, a in zip(p_choose_i, previous_attention)
  ])
  # Compute output with TensorFlow function, for both calculation types
  with self.test_session():
    parallel_output = wrapper.monotonic_attention(
        p_choose_i, previous_attention, 'parallel').eval()
  self.assertEqual(parallel_output.ndim, explicit_output.ndim)
  for x, y in zip(parallel_output.shape, explicit_output.shape):
    self.assertEqual(x, y)
  for x, y in zip(parallel_output.flatten(), explicit_output.flatten()):
    # Use assertAlmostEqual for the actual values due to floating point
    self.assertAlmostEqual(x, y, places=5)

  # Now, test hard mode, where probabilities must be 0 or 1
  p_choose_i = np.random.choice(np.array([0, 1], np.float32), (10, 20))
  previous_attention = np.zeros((10, 20), np.float32)
  # Randomly choose input sequence indices at each timestep
  random_idx = np.random.randint(0, previous_attention.shape[1],
                                 previous_attention.shape[0])
  # One-hot previous attention: one chosen input index per batch entry.
  previous_attention[np.arange(previous_attention.shape[0]), random_idx] = 1
  # Create the output to test against
  explicit_output = np.array([
      monotonic_attention_explicit(p, a)
      for p, a in zip(p_choose_i, previous_attention)
  ])
  # Compute output with TensorFlow function, for both calculation types
  with self.test_session():
    hard_output = wrapper.monotonic_attention(
        # TensorFlow is unhappy when these are not wrapped as tf.constant
        constant_op.constant(p_choose_i),
        constant_op.constant(previous_attention), 'hard').eval()
  self.assertEqual(hard_output.ndim, explicit_output.ndim)
  for x, y in zip(hard_output.shape, explicit_output.shape):
    self.assertEqual(x, y)
  for x, y in zip(hard_output.flatten(), explicit_output.flatten()):
    # Use assertAlmostEqual for the actual values due to floating point
    self.assertAlmostEqual(x, y, places=5)

  # Now, test recursively computing attention distributions vs. sampling
  def sample(p_choose_i):
    """Generate a sequence of emit-ingest decisions from p_choose_i."""
    output = np.zeros(p_choose_i.shape)
    # t_im1 is the input index attended to at the previous output step.
    t_im1 = 0
    for i in range(p_choose_i.shape[0]):
      for j in range(t_im1, p_choose_i.shape[1]):
        if np.random.uniform() <= p_choose_i[i, j]:
          output[i, j] = 1
          t_im1 = j
          break
      else:
        # Never emitted: all remaining input was ingested.
        t_im1 = p_choose_i.shape[1]
    return output

  # Now, the first axis is output timestep and second is input timestep
  p_choose_i = np.random.uniform(size=(4, 5)).astype(np.float32)
  # Generate the average of a bunch of samples
  n_samples = 100000
  sampled_output = np.mean(
      [sample(p_choose_i) for _ in range(n_samples)], axis=0)
  # Create initial previous_attention base case
  recursive_output = [
      np.array([1] + [0] * (p_choose_i.shape[1] - 1), np.float32)
  ]
  # Compute output with TensorFlow function, for both calculation types
  with self.test_session():
    for j in range(p_choose_i.shape[0]):
      # Compute attention distribution for this output time step
      recursive_output.append(
          wrapper.monotonic_attention(
              # newaxis is for adding the expected batch dimension
              p_choose_i[j][np.newaxis],
              recursive_output[-1][np.newaxis], 'recursive').eval()[0])
  # Stack together distributions; remove basecase
  recursive_output = np.array(recursive_output[1:])
  self.assertEqual(recursive_output.ndim, sampled_output.ndim)
  for x, y in zip(recursive_output.shape, sampled_output.shape):
    self.assertEqual(x, y)
  for x, y in zip(recursive_output.flatten(), sampled_output.flatten()):
    # Use a very forgiving threshold since we are sampling
    self.assertAlmostEqual(x, y, places=2)
def test_monotonic_attention(self):
  """Validate `wrapper.monotonic_attention` in all three modes.

  Each mode ('recursive', 'parallel', 'hard') is compared against an
  explicit numpy implementation of the monotonic-attention recurrence; the
  recursive mode is additionally compared against a Monte-Carlo estimate
  built from sampled emit/ingest decisions.
  """

  def explicit_monotonic(choose_probs, prev_attn):
    """Reference monotonic attention via the explicit numpy recurrence."""
    # Base case of the recurrence.
    accum = [prev_attn[0]]
    # Unroll the recurrence one input timestep at a time.
    for idx in range(1, choose_probs.shape[0]):
      accum.append(
          (1 - choose_probs[idx - 1])*accum[idx - 1] + prev_attn[idx])
    return choose_probs*np.array(accum)

  def check_against(actual, expected, places):
    """Assert matching rank, shape, and (approximately) values."""
    self.assertEqual(actual.ndim, expected.ndim)
    for dim_actual, dim_expected in zip(actual.shape, expected.shape):
      self.assertEqual(dim_actual, dim_expected)
    for val_actual, val_expected in zip(actual.flatten(),
                                        expected.flatten()):
      # Floating point values, so compare approximately.
      self.assertAlmostEqual(val_actual, val_expected, places=places)

  # Random batch of choosing probabilities for sequence length 20.
  p_choose_i = np.random.uniform(size=(10, 20)).astype(np.float32)
  # Random previous attention distributions, row-normalized to sum to 1.
  previous_attention = np.random.uniform(size=(10, 20)).astype(np.float32)
  previous_attention /= previous_attention.sum(axis=1).reshape((-1, 1))
  # Ground truth from the explicit recurrence, one batch entry at a time.
  explicit_output = np.array(
      [explicit_monotonic(p, a)
       for p, a in zip(p_choose_i, previous_attention)])
  with self.test_session():
    recursive_output = wrapper.monotonic_attention(
        p_choose_i, previous_attention, 'recursive').eval()
    check_against(recursive_output, explicit_output, 5)

  # 'parallel' mode is unstable when p_choose_i[n] is close to 1, so draw
  # fresh probabilities bounded away from 1 (previous_attention is reused).
  p_choose_i = np.random.uniform(0, 0.9, size=(10, 20)).astype(np.float32)
  explicit_output = np.array(
      [explicit_monotonic(p, a)
       for p, a in zip(p_choose_i, previous_attention)])
  with self.test_session():
    parallel_output = wrapper.monotonic_attention(
        p_choose_i, previous_attention, 'parallel').eval()
    check_against(parallel_output, explicit_output, 5)

  # 'hard' mode requires every probability to be exactly 0 or 1.
  p_choose_i = np.random.choice(np.array([0, 1], np.float32), (10, 20))
  previous_attention = np.zeros((10, 20), np.float32)
  # One-hot previous attention at a randomly chosen input index per row.
  random_idx = np.random.randint(0, previous_attention.shape[1],
                                 previous_attention.shape[0])
  previous_attention[np.arange(previous_attention.shape[0]), random_idx] = 1
  explicit_output = np.array(
      [explicit_monotonic(p, a)
       for p, a in zip(p_choose_i, previous_attention)])
  with self.test_session():
    hard_output = wrapper.monotonic_attention(
        # TensorFlow is unhappy when these are not wrapped as tf.constant
        constant_op.constant(p_choose_i),
        constant_op.constant(previous_attention), 'hard').eval()
    check_against(hard_output, explicit_output, 5)

  # Finally, compare recursively computed distributions with sampling.
  def sample(choose_probs):
    """Generate a sequence of emit-ingest decisions from choose_probs."""
    decisions = np.zeros(choose_probs.shape)
    # Input index attended to at the previous output step.
    prev_pos = 0
    for out_t in range(choose_probs.shape[0]):
      for in_t in range(prev_pos, choose_probs.shape[1]):
        if np.random.uniform() <= choose_probs[out_t, in_t]:
          decisions[out_t, in_t] = 1
          prev_pos = in_t
          break
      else:
        # No emission: the whole remaining input was ingested.
        prev_pos = choose_probs.shape[1]
    return decisions

  # Here the first axis is output timestep, the second is input timestep.
  p_choose_i = np.random.uniform(size=(4, 5)).astype(np.float32)
  # Monte-Carlo estimate of the attention distribution.
  n_samples = 100000
  sampled_output = np.mean(
      [sample(p_choose_i) for _ in range(n_samples)], axis=0)
  # Base case: all attention starts on the first input timestep.
  recursive_output = [np.array([1] + [0]*(p_choose_i.shape[1] - 1),
                               np.float32)]
  with self.test_session():
    for out_t in range(p_choose_i.shape[0]):
      # Attention distribution for this output timestep, fed the previous
      # step's distribution.
      recursive_output.append(wrapper.monotonic_attention(
          # newaxis adds the batch dimension the op expects
          p_choose_i[out_t][np.newaxis],
          recursive_output[-1][np.newaxis], 'recursive').eval()[0])
    # Drop the base case and stack the per-step distributions.
    recursive_output = np.array(recursive_output[1:])
    # Very forgiving tolerance, since the target is a sampled estimate.
    check_against(recursive_output, sampled_output, 2)