def testScan_MultiOutputMismatchedInitializer(self):
    """A single-valued initializer must be rejected when fn returns a pair."""
    values = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    scalar_init = np.array(1.0)

    def pair_fn(acc, _):
        # Two outputs per step, while the initializer holds only one value.
        return (acc, -acc)

    expected_message = "two structures don't have the same nested structure"
    with self.assertRaisesRegexp(ValueError, expected_message):
        functional_ops.scan(pair_fn, values, scalar_init)
def testScan_Simple(self):
    """Checks scan's running product, with and without an initializer."""
    with self.test_session():
        data = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
        seed = constant_op.constant(2.0, name="v")

        def product_step(acc, item):
            return math_ops.mul(acc, item)

        # Without an initializer the first element seeds the accumulator.
        unseeded = functional_ops.scan(product_step, data)
        self.assertAllEqual([1., 2., 6., 24., 120., 720.], unseeded.eval())

        seeded = functional_ops.scan(product_step, data, initializer=seed)
        self.assertAllEqual([2., 4., 12., 48., 240., 1440.], seeded.eval())
def testScan_Simple(self):
    """Scan with multiply yields cumulative products; a seed scales them."""
    inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
    start = constant_op.constant(2.0, name="v")

    def times(acc, item):
        return math_ops.multiply(acc, item)

    without_init = functional_ops.scan(times, inputs)
    self.assertAllEqual(
        [1., 2., 6., 24., 120., 720.], self.evaluate(without_init))

    with_init = functional_ops.scan(times, inputs, initializer=start)
    self.assertAllEqual(
        [2., 4., 12., 48., 240., 1440.], self.evaluate(with_init))
def testScan_Simple(self):
    """Runs the cumulative-product scan for the unseeded and seeded cases."""
    with self.test_session():
        elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
        v = constant_op.constant(2.0, name="v")

        # (extra keyword arguments for scan, expected cumulative products)
        cases = (
            ({}, [1., 2., 6., 24., 120., 720.]),
            ({"initializer": v}, [2., 4., 12., 48., 240., 1440.]),
        )
        for scan_kwargs, expected in cases:
            product = functional_ops.scan(
                lambda a, x: math_ops.mul(a, x), elems, **scan_kwargs)
            self.assertAllEqual(expected, product.eval())
def testScan_Reverse(self):
    """Checks the cumulative product when scanning from the last element."""
    with self.test_session():
        data = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
        seed = constant_op.constant(2.0, name="v")

        def product_step(acc, item):
            return math_ops.multiply(acc, item)

        backwards = functional_ops.scan(product_step, data, reverse=True)
        self.assertAllEqual(
            [720., 720., 360., 120., 30., 6.], self.evaluate(backwards))

        seeded_backwards = functional_ops.scan(
            product_step, data, initializer=seed, reverse=True)
        self.assertAllEqual(
            [1440., 1440., 720., 240., 60., 12.],
            self.evaluate(seeded_backwards))
def testScanVaryingShape(self):
    """Smoke-tests scan and its gradients when the scanned size is unknown."""
    with self.cached_session() as sess:
        inp = array_ops.placeholder(dtype=dtypes.float32, shape=[None, 2])
        inp_t = array_ops.transpose(inp)

        def running_sum(acc, row):
            return acc + row

        # Dimension 0 of `inp` has unknown (None) size.
        scanned = functional_ops.scan(running_sum, inp)
        # Dimension 0 of the transpose has static size 2.
        scanned_t = functional_ops.scan(running_sum, inp_t, infer_shape=False)

        # Gradients must be constructible for both variants.
        grad = gradients_impl.gradients(scanned, [inp])[0]
        grad_t = gradients_impl.gradients(scanned_t, [inp_t])[0]

        # Only verifies that everything evaluates without error.
        fetches = [scanned, scanned_t, grad, grad_t]
        sess.run(fetches, feed_dict={inp: [[1.0, 2.0]]})
def hiddens(self, input_idxes):
    """Scan `self.step_fun` over the embedded inputs.

    Expects `input_idxes` to be laid out as TIMESTEPS * BATCH_SIZE.
    """
    # Embed the encoded input sentences.
    embedded = self.embedding(input_idxes)
    # Axis 1 of the index tensor is used as the batch size.
    n_sequences = tf.shape(input_idxes)[1]
    start_state = self.rnn_cell.zero_state(n_sequences)
    return functional_ops.scan(
        self.step_fun, embedded, initializer=start_state)
def power_sums_tensor(array_size, power_matrix, multiplier):
    r"""Pre-computes S[N] = \sum_{i=0}^{N-1} A^i B (A^i)^T for N=0..(array_size + 1).

    Args:
      array_size: The number of non-trivial sums to pre-compute.
      power_matrix: The matrix "A" in the formula above.
      multiplier: The matrix "B" in the formula above.

    Returns:
      A Tensor stacking the sums along its first axis:
        S[0] is the zero matrix
        S[1] is B
        S[2] is A B A^T + B
        ...and so on
    """
    array_size = math_ops.cast(array_size, dtypes.int32)
    power_matrix = ops.convert_to_tensor(power_matrix)

    # Identity with the same dtype and static shape as A; this is A^0.
    matrix_dim = array_ops.shape(power_matrix)[0]
    identity_like_power_matrix = linalg_ops.eye(
        matrix_dim, dtype=power_matrix.dtype)
    identity_like_power_matrix.set_shape(power_matrix.get_shape())

    def _next_power(previous_power, _):
        return math_ops.matmul(previous_power, power_matrix)

    step_indices = math_ops.range(array_size - 1)
    transition_powers = functional_ops.scan(
        _next_power, step_indices, initializer=identity_like_power_matrix)

    first_term = array_ops.expand_dims(multiplier, 0)
    later_terms = math_ops.matmul(
        batch_times_matrix(transition_powers, multiplier),
        transition_powers,
        adjoint_b=True)
    summed = math_ops.cumsum(array_ops.concat([first_term, later_terms], 0))

    # Prepend the empty sum S[0].
    zero_term = array_ops.expand_dims(array_ops.zeros_like(multiplier), 0)
    return array_ops.concat([zero_term, summed], 0)
def ctc_label_dense_to_sparse(labels, label_lengths):
    """Converts dense CTC labels into a `tf.SparseTensor`.

    Args:
      labels: dense CTC labels; its shape is read as (batches, max labels).
      label_lengths: per-row label lengths, scanned along their first axis.

    Returns:
      A `tf.SparseTensor` holding the entries of `labels` selected by the
      length mask, with int64 indices and dense shape.
    """
    label_shape = tf.shape(labels)
    num_batches_tns = tf.pack([label_shape[0]])
    max_num_labels_tns = tf.pack([label_shape[1]])

    def positions_within_length(_, length):
        return tf.range(label_shape[1]) < length

    all_false = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
    # One mask row per entry of label_lengths; the scan carries no real state.
    dense_mask = functional_ops.scan(
        positions_within_length, label_lengths,
        initializer=all_false, parallel_iterations=1)

    # Column index of every cell, masked down to the valid ones.
    tiled_columns = tf.tile(tf.range(0, label_shape[1]), num_batches_tns)
    label_array = tf.reshape(tiled_columns, label_shape)
    label_ind = tf.boolean_mask(label_array, dense_mask)

    # Row index of every cell: tile, reshape to the reversed shape, transpose.
    tiled_rows = tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns)
    reversed_shape = tf.reverse(label_shape, [True])
    batch_array = tf.transpose(tf.reshape(tiled_rows, reversed_shape))
    batch_ind = tf.boolean_mask(batch_array, dense_mask)

    stacked = tf.reshape(tf.concat(0, [batch_ind, label_ind]), [2, -1])
    indices = tf.transpose(stacked)
    vals_sparse = tf.gather_nd(labels, indices)
    return tf.SparseTensor(
        tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
def testScan_MultiInputSingleOutput(self):
    """Scans over a pair of inputs while carrying a single accumulator."""
    values = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    start = np.array(1.0)

    def step(acc, pair):
        # (elem + 1) + (-elem) == 1, so each step multiplies by one.
        return acc * (pair[0] + pair[1])

    result = functional_ops.scan(step, (values + 1, -values), start)
    self.assertAllEqual([1.0] * 6, self.evaluate(result))
def testScan_MultiInputSameTypeOutput(self):
    """Pairwise running sums over a pair of inputs match np.cumsum."""
    values = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])

    def add_pairs(acc, item):
        return (acc[0] + item[0], acc[1] + item[1])

    result = functional_ops.scan(add_pairs, (values, -values))
    positive_sums, negative_sums = self.evaluate(result)
    self.assertAllEqual(np.cumsum(values), positive_sums)
    self.assertAllEqual(np.cumsum(-values), negative_sums)
def power_sums_tensor(array_size, power_matrix, multiplier):
    r"""Computes \sum_{i=0}^{N-1} A^i B (A^i)^T for N=0..(array_size + 1).

    Args:
      array_size: The number of non-trivial sums to pre-compute.
      power_matrix: The "A" matrix above.
      multiplier: The "B" matrix above.

    Returns:
      A Tensor with S[N] = \sum_{i=0}^{N-1} A^i B (A^i)^T
        S[0] is the zero matrix
        S[1] is B
        S[2] is A B A^T + B
        ...and so on
    """
    array_size = math_ops.cast(array_size, dtypes.int32)
    power_matrix = ops.convert_to_tensor(power_matrix)

    eye_dim = array_ops.shape(power_matrix)[0]
    identity_like_power_matrix = linalg_ops.eye(
        eye_dim, dtype=power_matrix.dtype)
    static_shape = ops.convert_to_tensor(power_matrix).get_shape()
    identity_like_power_matrix.set_shape(static_shape)

    # Successive powers of A, one entry per scan step.
    transition_powers = functional_ops.scan(
        lambda previous_power, _: math_ops.matmul(previous_power, power_matrix),
        math_ops.range(array_size - 1),
        initializer=identity_like_power_matrix)

    # Terms of the sum: B first, then (power B) power^H for each scanned power.
    terms = [
        array_ops.expand_dims(multiplier, 0),
        math_ops.matmul(
            batch_times_matrix(transition_powers, multiplier),
            transition_powers,
            adjoint_b=True),
    ]
    summed = math_ops.cumsum(array_ops.concat(terms, 0))

    leading_zero = array_ops.expand_dims(array_ops.zeros_like(multiplier), 0)
    return array_ops.concat([leading_zero, summed], 0)
def testScanEmptyTensor(self):
    """Scanning zero elements yields a leading dimension of size 0."""
    with self.test_session():
        no_steps = math_ops.range(0)
        seed = array_ops.ones([2, 4])
        scanned = functional_ops.scan(
            lambda acc, _: acc, no_steps, initializer=seed)

        static_shape = scanned.get_shape()
        self.assertAllEqual([0, 2, 4], static_shape)
        # The static shape must agree with what evaluation produces.
        self.assertAllEqual(scanned.get_shape(), scanned.eval().shape)
def ctc_label_dense_to_sparse(self,labels, label_lengths):
    """Converts CTC labels from dense to sparse.

    # Arguments
        labels: dense CTC labels.
        label_lengths: length of the labels.

    # Returns
        A sparse tensor representation of the labels.
    """
    label_shape = tf.shape(labels)
    num_batches_tns = tf.stack([label_shape[0]])
    max_num_labels_tns = tf.stack([label_shape[1]])

    def mask_row(_, length):
        # Row vector of positions compared against the broadcast length.
        positions = tf.expand_dims(tf.range(label_shape[1]), 0)
        return positions < tf.fill(max_num_labels_tns, length)

    all_false = tf.cast(tf.fill([1, label_shape[1]], 0), tf.bool)
    mask_3d = functional_ops.scan(
        mask_row, label_lengths, initializer=all_false, parallel_iterations=1)
    # Drop the singleton axis introduced by expand_dims.
    dense_mask = mask_3d[:, 0, :]

    tiled_columns = tf.tile(tf.range(label_shape[1]), num_batches_tns)
    label_array = tf.reshape(tiled_columns, label_shape)
    label_ind = tf.boolean_mask(label_array, dense_mask)

    tiled_rows = tf.tile(tf.range(label_shape[0]), max_num_labels_tns)
    reversed_shape = self.reverse(label_shape, 0)
    batch_array = tf.transpose(tf.reshape(tiled_rows, reversed_shape))
    batch_ind = tf.boolean_mask(batch_array, dense_mask)

    joined = self.concatenate([batch_ind, label_ind], axis=0)
    indices = tf.transpose(tf.reshape(joined, [2, -1]))

    vals_sparse = tf.gather_nd(labels, indices)
    return tf.SparseTensor(
        tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
def integrate(self, evol_func, y0, time_grid):
    """Scan the step function built from `evol_func` along `time_grid`.

    Each scan element pairs a grid point (all but the last) with the distance
    to the next grid point. The initial value `y0` is prepended to the scanned
    values along axis 0 before returning.
    """
    step_sizes = time_grid[1:] - time_grid[:-1]
    step = self._make_scan_func(evol_func)
    trajectory = functional_ops.scan(step, (time_grid[:-1], step_sizes), y0)
    return array_ops.concat([[y0], trajectory], axis=0)
def testScan_SingleInputMultiOutput(self):
    """Carries a pair of accumulators over a single input sequence."""
    values = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    start = (np.array(1.0), np.array(-1.0))

    def step(acc, item):
        # The second accumulator flips sign on every step.
        return (acc[0] * item, -acc[1] * item)

    result = functional_ops.scan(step, values, start)
    products, alternating = self.evaluate(result)
    self.assertAllEqual([1.0, 2.0, 6.0, 24.0, 120.0, 720.0], products)
    self.assertAllEqual([1.0, -2.0, 6.0, -24.0, 120.0, -720.0], alternating)
def testScanUnknownShape(self):
    """With shapeless placeholders the scan output has no static dims."""
    unknown_input = array_ops.placeholder(dtypes.float32)
    unknown_init = array_ops.placeholder(dtypes.float32)

    def passthrough(_, current_input):
        return current_input

    scanned = functional_ops.scan(
        passthrough, unknown_input, initializer=unknown_init)
    self.assertIs(None, scanned.get_shape().dims)
def _compute_predictions(self, init = None):
    """Build the HMM-style state recursion and the prediction read-out.

    Creates the transition and emission parameters, scans
    `self._rnn_step_fw` over `self.inputs`, and multiplies the resulting
    states by a fixed identity matrix to obtain the predictions.

    Side effects: assigns `self.W_trans`, `self.W_emit`,
    `self.W_emit_summary` and `self.emission_similarity`, and prints the
    shape of `self.W_trans`.

    NOTE(review): the source this was recovered from had lost its
    indentation; the nesting of the variable scopes below is the most
    plausible reading and should be confirmed against the original file.

    Args:
        init: optional initial state for the scan. When None, a uniform
            vector `ones([hidden_layer_size]) / hidden_layer_size` is used.

    Returns:
        A `(states, predictions)` tuple.

    Raises:
        ValueError: if `self.emission_init` is given but its shape is not
            `(self.hidden_layer_size, self.input_size)`.
    """
    with tf.variable_scope('states'):
        with tf.variable_scope("HMM"):
            with tf.variable_scope("transition"):
                # Learnable scalar mixing weight between the fixed transition
                # matrix and the identity ("skip") matrix.
                skip_prob = tf.get_variable("skip", shape=[1], initializer=tf.constant_initializer(1e-1))
                # Convex combination: (1 - skip) * transition + skip * I.
                self.W_trans = (1-skip_prob) * get_transition_matrix().astype(np.float32) + skip_prob* np.eye(self.hidden_layer_size).astype(np.float32)
                print("W_trans", self.W_trans.get_shape())
            with tf.variable_scope("emission"):
                # NOTE(review): the note below lists the dimensions in the
                # opposite order from the shape actually used for W_emit.
                "W_emit: [self.input_size, self.hidden_layer_size]"
                if self.emission_init is None:
                    # No initial emission matrix supplied: start near zero.
                    self.W_emit = tf.get_variable("W_emit", shape = [self.hidden_layer_size, self.input_size], initializer = tf.random_normal_initializer(0.0, 1e-6))
                else:
                    if not (self.emission_init.shape == (self.hidden_layer_size, self.input_size)):
                        # Print both shapes to make the mismatch easy to diagnose.
                        print("self.emission_init.shape", self.emission_init.shape)
                        print("(self.hidden_layer_size, self.input_size)", (self.hidden_layer_size, self.input_size))
                        raise ValueError("wrong dimensions of `self.emission_init`")
                    # A supplied emission matrix is kept fixed (not trainable).
                    self.W_emit = tf.Variable(self.emission_init.astype(np.float32), name = "W_emit", trainable = False)
                # Image summary of W_emit, reshaped to a single one-channel image.
                self.W_emit_summary = tf.image_summary("W_emit", tf.reshape(self.W_emit, [1,self.hidden_layer_size, self.input_size,1]))
                "idea: impose kernel similarity: maximize(W K W)"
                "[ self.hidden_layer_size, self.nt_in_pore ]"
                emission_in_pore_space = tf.matmul( self.map_hex_to_pore, self.W_emit)
                # Sum of the diagonal of E^T E for the mapped emission matrix E.
                self.emission_similarity = tf.reduce_sum( tf.diag_part( tf.matmul( tf.transpose(emission_in_pore_space),(emission_in_pore_space)) ), name="emission_w_similarity")
        if init is None:
            # Default: uniform initial state whose entries sum to one.
            initial_state = tf.ones([self.hidden_layer_size], name='initial_state')
            initial_state = initial_state/ self.hidden_layer_size
        else:
            initial_state = init
        # Single-step variant kept for debugging:
        # states = self._rnn_step_fw(initial_state[:,0], self.inputs[0,:])
        states = functional_ops.scan(self._rnn_step_fw, tf.identity(self.inputs), initializer=initial_state, name='states')
        states_fw_summary = tf.histogram_summary("states_fw", states)
    with tf.variable_scope('predictions'):
        # TODO: set some explicit initializer (orthogonal initialization).
        "for now, keep identity mapping from hidden states to labels"
        "assume probability interpretation of values: should sum to one"
        W_pred = tf.Variable(np.eye(self.target_size, dtype = np.float32), name="W_pred", trainable=False)
        predictions = tf.matmul(states, W_pred, name='predictions')
        predictions_summary = tf.histogram_summary("predictions", predictions)
        # Open question from the author: do the predictions sum to one?
    return states, predictions
def testScanShape(self):
    """The static shape of a scan result matches its evaluated shape."""
    rows = constant_op.constant([[1, 2, 3], [4, 5, 6]])
    zero_row = constant_op.constant([0, 0, 0])

    def passthrough(_, current_input):
        return current_input

    scanned = functional_ops.scan(passthrough, rows, initializer=zero_row)
    self.assertAllEqual(scanned.get_shape(), self.evaluate(scanned).shape)
def testScan_Reverse(self):
    """Runs the reversed cumulative-product scan, unseeded and seeded."""
    with self.test_session():
        elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
        v = constant_op.constant(2.0, name="v")

        # (extra keyword arguments for scan, expected reversed products)
        cases = (
            ({}, [720., 720., 360., 120., 30., 6.]),
            ({"initializer": v}, [1440., 1440., 720., 240., 60., 12.]),
        )
        for scan_kwargs, expected in cases:
            # pylint: disable=unnecessary-lambda
            reversed_product = functional_ops.scan(
                lambda a, x: math_ops.multiply(a, x),
                elems,
                reverse=True,
                **scan_kwargs)
            # pylint: enable=unnecessary-lambda
            self.assertAllEqual(expected, self.evaluate(reversed_product))
def testScan_Grad(self):
    """Gradient of the seeded cumulative product w.r.t. the seed."""
    with self.test_session():
        data = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
        seed = constant_op.constant(2.0, name="v")

        def product_step(acc, item):
            return math_ops.mul(acc, item)

        products = functional_ops.scan(product_step, data, initializer=seed)
        # 1 + 2 + 6 + 24 + 120 + 720 == 873
        seed_grad = gradients_impl.gradients(products, seed)[0]
        self.assertAllEqual(873.0, seed_grad.eval())
def test_jacobian_scan_shape(self):
    """Jacobian of a scan output w.r.t. its initializer has the joint shape."""
    # Initializer shape: [3, 4]
    init = random_ops.random_uniform([3, 4])
    increments = random_ops.random_uniform([6])

    def add_increment(acc, increment):
        return acc + increment

    # Scan output shape: [6, 3, 4]
    scanned = functional_ops.scan(add_increment, increments, initializer=init)
    jacobian = gradients.jacobian(scanned, init)

    # Output shape followed by input shape.
    self.assertAllEqual([6, 3, 4, 3, 4], jacobian.shape.as_list())
def testScan_Control(self):
    """A scan built under a control dependency on a fed placeholder runs."""
    with self.cached_session() as sess:
        values = array_ops.placeholder(dtypes.float32, shape=[None])
        gate = array_ops.placeholder(dtypes.bool)
        with ops.control_dependencies([gate]):
            products = functional_ops.scan(lambda acc, item: item * acc, values)

        expected = np.array([1.0, 3.0, 9.0])
        actual = sess.run(products, {values: [1, 3, 3], gate: True})
        self.assertAllClose(expected, actual)
def testScan_Grad(self):
    """d(sum of cumulative products)/d(initializer) is 873 for this input."""
    with self.cached_session():
        inputs = constant_op.constant(
            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
        start = constant_op.constant(2.0, name="v")

        def times(acc, item):
            return math_ops.multiply(acc, item)

        scanned = functional_ops.scan(times, inputs, initializer=start)
        grad_wrt_start = gradients_impl.gradients(scanned, start)[0]
        self.assertAllEqual(873.0, self.evaluate(grad_wrt_start))
def testScan_Scoped(self):
    """Scan creates its variable once under the scope and can then reuse it."""
    with self.cached_session() as sess:
        with variable_scope.variable_scope("root") as root_scope:
            data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")

            first = functional_ops.scan(simple_scoped_fn, data)
            # Exactly the one requested variable must exist.
            trainable = variables.trainable_variables()
            self.assertEqual(len(trainable), 1)
            self.assertEqual(
                variables.trainable_variables()[0].name, "root/body/two:0")
            sess.run([variables.global_variables_initializer()])

            expected_first = np.array([1, 6, 18, 44, 98, 208])
            self.assertAllEqual(expected_first, self.evaluate(first))

            # A second scan with reuse enabled must not add a variable.
            root_scope.reuse_variables()
            second = functional_ops.scan(simple_scoped_fn, data, initializer=2)
            self.assertEqual(len(variables.trainable_variables()), 1)
            expected_second = np.array([6, 16, 38, 84, 178, 368])
            self.assertAllEqual(expected_second, self.evaluate(second))
def testScan_Grad(self):
    """Checks the gradient of a seeded product scan with respect to the seed."""
    with self.test_session():
        elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
        v = constant_op.constant(2.0, name="v")

        cumulative = functional_ops.scan(
            lambda running, item: math_ops.mul(running, item),
            elems,
            initializer=v)
        gradient_list = gradients_impl.gradients(cumulative, v)
        self.assertAllEqual(873.0, gradient_list[0].eval())
def testScanGradientWithPartStopGradient(self):
    """Gradients evaluate when one scan output goes through stop_gradient."""
    var_a = variables.Variable(0.0, name="a")
    var_b = variables.Variable(0.0, name="b")
    steps = array_ops.zeros(5)

    def emit_variables(elem_, input_):
        # Ignores both the accumulator and the input.
        return (var_a, var_b)

    out_a, out_b = functional_ops.scan(
        emit_variables, steps, initializer=(0., 0.))
    loss = out_a + array_ops.stop_gradient(out_b)
    grads = gradients_impl.gradients(ys=[loss], xs=[var_a, var_b])

    with self.test_session(use_gpu=True) as sess:
        variables.global_variables_initializer().run()
        sess.run(grads)
def testScanGradientWithPartStopGradient(self):
    """Smoke test: gradients evaluate with stop_gradient on one scan output."""
    a = variables.Variable(0.0, name="a")
    b = variables.Variable(0.0, name="b")
    elems = array_ops.zeros(5)

    scan_outputs = functional_ops.scan(
        lambda elem_, input_: (a, b), elems, initializer=(0., 0.))
    kept, blocked = scan_outputs
    # Only the first output contributes a gradient path.
    loss = kept + array_ops.stop_gradient(blocked)
    grad = gradients_impl.gradients(ys=[loss], xs=[a, b])

    with self.test_session(use_gpu=True) as sess:
        self.evaluate(variables.global_variables_initializer())
        self.evaluate(grad)
def monotonic_attention(p_choose_i, previous_attention, mode):
    """Compute an attention distribution from per-index choosing probabilities.

    Args:
      p_choose_i: Probability of choosing each index; converted to a tensor.
        The code indexes it as `[:, i]` and reads axis 0 as the batch size.
      previous_attention: Attention from the previous step; converted to a
        tensor and combined with `p_choose_i` along axis 1.
      mode: One of
        * "recursive": evaluates the recurrence
          `q[i] = (1 - p_choose_i[i - 1]) * q[i - 1] + previous_attention[i]`,
          `attention[i] = p_choose_i[i] * q[i]` with a scan.
        * "parallel": evaluates the solution of the same recurrence with
          `safe_cumprod` and a cumulative sum.
        * "hard": masks `p_choose_i` by the cumulative sum of
          `previous_attention` and by an exclusive cumulative product of
          `1 - p_choose_i`.

    Returns:
      The attention tensor for the requested mode.

    Raises:
      ValueError: if `mode` is not one of the three names above.
    """
    # Force things to be tensors.
    p_choose_i = ops.convert_to_tensor(p_choose_i, name="p_choose_i")
    previous_attention = ops.convert_to_tensor(
        previous_attention, name="previous_attention")

    if mode == "recursive":
        # Use the static batch size when known, else the symbolic shape.
        batch_size = tensor_shape.dimension_value(
            p_choose_i.shape[0]) or array_ops.shape(p_choose_i)[0]
        # [1, 1 - p_choose_i[0], 1 - p_choose_i[1], ..., 1 - p_choose_i[-2]]
        leading_ones = array_ops.ones((batch_size, 1))
        shifted_1mp_choose_i = array_ops.concat(
            [leading_ones, 1 - p_choose_i[:, :-1]], 1)

        def advance(q_prev, step_inputs):
            keep_prob, carried = step_inputs
            # Reshape reminds TF of the shape between loop iterations.
            return array_ops.reshape(
                keep_prob * q_prev + carried, (batch_size,))

        # The scan runs over axis 0, so both inputs are transposed first.
        scan_inputs = [
            array_ops.transpose(shifted_1mp_choose_i),
            array_ops.transpose(previous_attention),
        ]
        # The recurrence starts from zeros.
        q_start = array_ops.zeros((batch_size,))
        q_scanned = functional_ops.scan(advance, scan_inputs, q_start)
        return p_choose_i * array_ops.transpose(q_scanned)

    if mode == "parallel":
        # safe_cumprod computes cumprod in logspace with numeric checks.
        cumprod_1mp_choose_i = safe_cumprod(
            1 - p_choose_i, axis=1, exclusive=True)
        weighted = p_choose_i * cumprod_1mp_choose_i
        # Clip the cumprod to avoid divide-by-zero.
        clipped = clip_ops.clip_by_value(cumprod_1mp_choose_i, 1e-10, 1.)
        accumulated = math_ops.cumsum(previous_attention / clipped, axis=1)
        return weighted * accumulated

    if mode == "hard":
        # Remove any probabilities before the index chosen last time step.
        p_choose_i = p_choose_i * math_ops.cumsum(previous_attention, axis=1)
        # The exclusive cumprod removes probabilities after the first chosen
        # index, like so:
        #   p_choose_i                              = [0, 0, 0, 1, 1, 0, 1, 1]
        #   cumprod(1 - p_choose_i, exclusive=True) = [1, 1, 1, 1, 0, 0, 0, 0]
        #   product of the two                      = [0, 0, 0, 1, 0, 0, 0, 0]
        not_yet_chosen = math_ops.cumprod(
            1 - p_choose_i, axis=1, exclusive=True)
        return p_choose_i * not_yet_chosen

    raise ValueError("mode must be 'recursive', 'parallel', or 'hard'.")
def loop_body(idx_step, y):
    """Loop body: scan-add a zero tensor, copy the result on CPU, bump the index.

    The incoming `y` is ignored and replaced by the CPU copy.
    """
    zeros_3d = array_ops.zeros([10, 20, 30], dtype=dtypes.float32)
    zero_init = array_ops.zeros([20, 30], dtype=dtypes.float32)
    scanned = functional_ops.scan(
        math_ops.add,
        zeros_3d,
        initializer=zero_init,
        back_prop=False,
        parallel_iterations=1)

    with ops.device('/cpu:0'):
        y = array_ops.identity(scanned)

    return idx_step + 1, y
def loop_body(idx_step, y):
    """Runs a scan over zeros, pins its copy to the CPU, and advances the index.

    The `y` argument is overwritten; only the new value is returned.
    """
    scan_kwargs = dict(
        initializer=array_ops.zeros([20, 30], dtype=dtypes.float32),
        back_prop=False,
        parallel_iterations=1,
    )
    x = functional_ops.scan(
        math_ops.add,
        array_ops.zeros([10, 20, 30], dtype=dtypes.float32),
        **scan_kwargs)

    with ops.device('/cpu:0'):
        y = array_ops.identity(x)

    next_step = idx_step + 1
    return next_step, y
def forward(self, x0, ts):
    """Integrate every initial state in `x0` along its own time grid.

    For each row `x0[i]`, the step function built from `self.model.f` is
    scanned over the consecutive points of `ts[i]`. The initial state is
    prepended to the scanned values, and the result is squeezed and reshaped
    to `[len(ts[i]), self.model.D]`.

    Args:
        x0: array of initial states, one row per trajectory.
        ts: sequence of time grids; `ts[i]` belongs to `x0[i]`.

    Returns:
        A NumPy object array of length `x0.shape[0]` whose i-th entry is the
        tensor holding the i-th trajectory.
    """
    Nt = x0.shape[0]
    # Use the builtin `object`: the `np.object` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    Xs = np.zeros(Nt, dtype=object)
    for i in range(Nt):
        time_grid = ops.convert_to_tensor(
            ts[i], preferred_dtype=float_type, name='t')
        y0 = ops.convert_to_tensor(x0[i, :].reshape((1, -1)), name='y0')
        # Step sizes between consecutive grid points.
        time_delta_grid = time_grid[1:] - time_grid[:-1]
        scan_func = self._make_scan_func(self.model.f)
        y_grid = functional_ops.scan(
            scan_func, (time_grid[:-1], time_delta_grid), y0)
        # Prepend the initial state so the result covers every grid point.
        y_s = array_ops.concat([[y0], y_grid], axis=0)
        Xs[i] = tf.reshape(tf.squeeze(y_s), [len(ts[i]), self.model.D])
    return Xs
def _compute_predictions(self):
    """Compute vanilla-RNN states and their affine read-out.

    Returns:
        A `(states, predictions)` tuple: the result of scanning
        `self._rnn_step` over `self.inputs` from a zero state, and
        `states @ W_pred + b_pred`.
    """
    with tf.variable_scope('states'):
        zero_state = tf.zeros([self.hidden_layer_size], name='initial_state')
        states = functional_ops.scan(
            self._rnn_step, self.inputs, initializer=zero_state, name='states')

    with tf.variable_scope('predictions'):
        weight_shape = [self.hidden_layer_size, self.target_size]
        W_pred = tf.get_variable('W_pred', shape=weight_shape)
        b_pred = tf.get_variable(
            'b_pred',
            shape=[self.target_size],
            initializer=tf.constant_initializer(0.0))
        projected = tf.matmul(states, W_pred)
        predictions = tf.add(projected, b_pred, name='predictions')

    return states, predictions
def forward(self, x0, ts, Nw=1):
    """Integrate `Nw` copies of each initial state on a refined time grid.

    For every trajectory the observation times `ts[i]` are merged with an
    evenly spaced grid from 0 to `max(ts[i])` containing
    `(len(ts[i]) - 1) * self.s + 1` points. The step function built from
    `self.model.f` and `self.model.diffus.g` is scanned over that grid, and
    only the states at the original observation times are kept.

    Args:
        x0: array of initial states, one row per trajectory.
        ts: sequence of observation-time arrays; `ts[i]` belongs to `x0[i]`.
        Nw: number of copies of each initial state to integrate.

    Returns:
        A NumPy object array of length `len(ts)`; entry i is the gathered
        trajectory tensor with its first two axes swapped (perm `[1, 0, 2]`).
    """
    # Use the builtin `object`: the `np.object` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    Xs = np.zeros(len(ts), dtype=object)
    for i, t_obs in enumerate(ts):
        t = np.linspace(0, np.max(t_obs), (len(t_obs) - 1) * self.s + 1)
        # np.unique already returns sorted values, so no separate sort.
        t = np.unique(np.hstack((t, t_obs)))
        # Positions of the observation times inside the merged grid.
        idx = np.where(np.isin(t, t_obs))[0]
        t = np.reshape(t, [-1, 1])
        time_grid = ops.convert_to_tensor(
            t, preferred_dtype=float_type, name='t')
        time_delta_grid = time_grid[1:] - time_grid[:-1]
        # Replicate the initial state Nw times along axis 0.
        y0 = np.repeat(x0[i, :].reshape((1, -1)), Nw, axis=0)
        y0 = ops.convert_to_tensor(y0, name='y0')
        scan_func = self._make_scan_func(self.model.f, self.model.diffus.g)
        y_grid = functional_ops.scan(
            scan_func, (time_grid[:-1], time_delta_grid), y0)
        # Prepend the initial state so index 0 corresponds to t[0].
        ys = array_ops.concat([[y0], y_grid], axis=0)
        Xs[i] = tf.transpose(tf.gather(ys, idx, axis=0), [1, 0, 2])
    return Xs
def forward(self, y0, save_intermediate=False):
    """Integrate `y0` along `self.ts` with the step built from `self.f`.

    Each step receives a row `[t, dt]`, where `t` is taken from the grid
    without its first point and `dt` is the distance from the preceding
    grid point.

    Args:
        y0: initial state; converted to a tensor.
        save_intermediate: when true, every intermediate state is kept.

    Returns:
        `(y_t, y_s)`, where `y_t` is the final state and `y_s` is the stacked
        trajectory including `y0`, or `(y_t, None)` when intermediates are
        not saved.
    """
    grid = ops.convert_to_tensor(self.ts, preferred_dtype=float_type, name='t')
    y0 = ops.convert_to_tensor(y0, name='y0')

    deltas = grid[1:] - grid[:-1]
    step_times = grid[1:]
    time_combined = tf.concat([step_times[:, None], deltas[:, None]], axis=1)
    step = self._make_scan_func(self.f)

    if not save_intermediate:
        # Only the final state is needed, so fold instead of scan.
        return functional_ops.foldl(step, time_combined, y0), None

    trajectory = functional_ops.scan(step, time_combined, y0)
    y_s = array_ops.concat([[y0], trajectory], axis=0)
    return y_s[-1, :, :, :], y_s
def my_ctc_label_dense_to_sparse(labels, label_lengths):
    """Converts CTC labels from dense to sparse.

    Arguments:
        labels: dense CTC labels.
        label_lengths: length of the labels.

    Returns:
        A sparse tensor representation of the labels.
    """
    label_shape = array_ops.shape(labels)
    num_batches_tns = array_ops.stack([label_shape[0]])
    max_num_labels_tns = array_ops.stack([label_shape[1]])

    def mask_row(_, length):
        # Positions smaller than the row's length are valid.
        positions = array_ops.expand_dims(math_ops.range(label_shape[1]), 0)
        return positions < array_ops.fill(max_num_labels_tns, length)

    all_false = math_ops.cast(
        array_ops.fill([1, label_shape[1]], 0), dtypes_module.bool)
    mask_3d = functional_ops.scan(
        mask_row, label_lengths, initializer=all_false, parallel_iterations=1)
    # Drop the singleton axis introduced by expand_dims.
    dense_mask = mask_3d[:, 0, :]

    tiled_columns = array_ops.tile(
        math_ops.range(0, label_shape[1]), num_batches_tns)
    label_array = array_ops.reshape(tiled_columns, label_shape)
    label_ind = array_ops.boolean_mask(label_array, dense_mask)

    tiled_rows = array_ops.tile(
        math_ops.range(0, label_shape[0]), max_num_labels_tns)
    reversed_shape = reverse(label_shape, 0)
    batch_array = array_ops.transpose(
        array_ops.reshape(tiled_rows, reversed_shape))
    batch_ind = array_ops.boolean_mask(batch_array, dense_mask)

    joined = concatenate([batch_ind, label_ind], axis=0)
    indices = array_ops.transpose(array_ops.reshape(joined, [2, -1]))

    vals_sparse = array_ops.gather_nd(labels, indices)
    return sparse_tensor.SparseTensor(
        math_ops.to_int64(indices), vals_sparse, math_ops.to_int64(label_shape))
def testScanFoldl_Nested(self):
    """A foldl nested inside the scan step produces the expected products."""
    outer = constant_op.constant([1.0, 2.0, 3.0, 4.0], name="data")
    halves = constant_op.constant([0.5, 0.5], name="data")

    def fold_step(acc, item):
        # The inner fold multiplies the accumulator by (0.5 * item) twice.
        return functional_ops.foldl(
            lambda b, y: b * y * item, halves, initializer=acc)

    result = functional_ops.scan(fold_step, outer)

    # Step-by-step trace:
    #   t == 0: first element passes through                -> 1
    #   t == 1: acc == 1,    item == 2: 1 * 1 * 1           -> 1
    #   t == 2: acc == 1,    item == 3: 1 * 1.5 * 1.5       -> 2.25
    #   t == 3: acc == 2.25, item == 4: 2.25 * 2 * 2        -> 9
    self.assertAllClose([1., 1., 2.25, 9.], self.evaluate(result))
def ctc_label_dense_to_sparse( self, labels, label_lengths ):
    """Mike Henry's implementation, with some minor modifications.

    Builds, inside `self.G`, a `tf.SparseTensor` from dense CTC `labels`,
    using a mask derived from `label_lengths`.
    """
    with self.G.as_default():
        label_shape = tf.shape(labels)
        num_batches_tns = tf.pack([label_shape[0]])
        max_num_labels_tns = tf.pack([label_shape[1]])

        def mask_row(_, length):
            return tf.expand_dims(tf.range(label_shape[1]), 0) < length

        all_false = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
        mask_3d = functional_ops.scan(
            mask_row, label_lengths,
            initializer=all_false, parallel_iterations=1)
        # Drop the singleton axis introduced by expand_dims.
        dense_mask = mask_3d[:, 0, :]

        tiled_columns = tf.tile(tf.range(0, label_shape[1]), num_batches_tns)
        label_array = tf.reshape(tiled_columns, label_shape)
        label_ind = tf.boolean_mask(label_array, dense_mask)

        tiled_rows = tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns)
        reversed_shape = tf.reverse(label_shape, [True])
        batch_array = tf.transpose(tf.reshape(tiled_rows, reversed_shape))
        batch_ind = tf.boolean_mask(batch_array, dense_mask)

        joined = tf.concat(0, [batch_ind, label_ind])
        indices = tf.transpose(tf.reshape(joined, [2, -1]))

        vals_sparse = tf.gather_nd(labels, indices)
        return tf.SparseTensor(
            tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
def f(y):
    """Scan `y` with a running product seeded by `v` from the outer scope."""
    def _times(acc, item):
        return math_ops.multiply(acc, item)

    return functional_ops.scan(_times, y, initializer=v)
def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
              target_log_prob_fn, proposal_log_prob_fn, event_dims=(),
              name=None):
    """Runs annealed importance sampling (AIS) to estimate normalizing constants.

    Hamiltonian Monte Carlo is used to sample from a sequence of
    distributions that interpolates between an initial "proposal"
    distribution

    `exp(proposal_log_prob_fn(x) - proposal_log_normalizer)`

    and the target distribution

    `exp(target_log_prob_fn(x) - target_log_normalizer)`,

    accumulating importance weights along the way. The product of these
    weights is an unbiased estimate of the ratio of the two normalizing
    constants:

    E[exp(w)] = exp(target_log_normalizer - proposal_log_normalizer).

    Args:
      n_iterations: Integer number of Markov chain updates to run. More
        iterations cost more but anneal more smoothly between the proposal
        and the target, which lowers the variance of the normalizing-constant
        estimator exponentially.
      step_size: Scalar step size or array of step sizes for the leapfrog
        integrator. Broadcasts to the shape of `initial_x`. Larger steps make
        faster progress, but too-large steps make rejection exponentially
        more likely. Where possible, match per-variable step sizes to the
        standard deviations of the target distribution in each variable.
      n_leapfrog_steps: Integer number of leapfrog steps per HMC update.
        Total progress per HMC step is roughly proportional to
        step_size * n_leapfrog_steps.
      initial_x: Tensor of initial state(s) of the Markov chain(s). Must be a
        sample from the proposal distribution, or results will be incorrect.
      target_log_prob_fn: Python callable which takes an argument like
        `initial_x` and returns its (possibly unnormalized) log-density under
        the target distribution.
      proposal_log_prob_fn: Python callable that returns the log density of
        the initial distribution.
      event_dims: List of dimensions that should not be treated as
        independent. This allows multiple chains to be run independently in
        parallel. Default is (), i.e., all dimensions are independent.
      name: Python `str` name prefixed to Ops created by this function.

    Returns:
      ais_weights: Tensor with the estimated weight(s). Has shape matching
        `target_log_prob_fn(initial_x)`.
      chain_states: Tensor with the state(s) of the Markov chain(s) at the
        final iteration. Has shape matching `initial_x`.
      acceptance_probs: Tensor with the acceptance probabilities for the
        final iteration. Has shape matching `target_log_prob_fn(initial_x)`.

    #### Examples:

    ```python
    # Estimating the normalizing constant of a log-gamma distribution:
    def proposal_log_prob(x):
      # Standard normal log-probability. This is properly normalized.
      return tf.reduce_sum(-0.5 * tf.square(x) - 0.5 * np.log(2 * np.pi), 1)
    def target_log_prob(x):
      # Unnormalized log-gamma(2, 3) distribution.
      # True normalizer is (lgamma(2) - 2 * log(3)) * x.shape[1]
      return tf.reduce_sum(2. * x - 3. * tf.exp(x), 1)
    # Run 100 AIS chains in parallel
    initial_x = tf.random_normal([100, 20])
    w, _, _ = hmc.ais_chain(1000, 0.2, 2, initial_x, target_log_prob,
                            proposal_log_prob, event_dims=[1])
    log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100)
    ```

    ```python
    # Estimating the marginal likelihood of a Bayesian regression model:
    base_measure = -0.5 * np.log(2 * np.pi)
    def proposal_log_prob(x):
      # Standard normal log-probability. This is properly normalized.
      return tf.reduce_sum(-0.5 * tf.square(x) + base_measure, 1)
    def regression_log_joint(beta, x, y):
      # This function returns a vector whose ith element is
      # log p(beta[i], y | x). Each row of beta corresponds to the state of
      # an independent Markov chain.
      log_prior = tf.reduce_sum(-0.5 * tf.square(beta) + base_measure, 1)
      means = tf.matmul(beta, x, transpose_b=True)
      log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means) +
                                     base_measure, 1)
      return log_prior + log_likelihood
    def log_joint_partial(beta):
      return regression_log_joint(beta, x, y)
    # Run 100 AIS chains in parallel
    initial_beta = tf.random_normal([100, x.shape[1]])
    w, beta_samples, _ = hmc.ais_chain(1000, 0.1, 2, initial_beta,
                                       log_joint_partial, proposal_log_prob,
                                       event_dims=[1])
    log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100)
    ```
    """
    scope_values = [n_iterations, step_size, n_leapfrog_steps, initial_x]
    with ops.name_scope(name, 'hmc_ais_chain', scope_values):
        non_event_shape = array_ops.shape(target_log_prob_fn(initial_x))

        # Annealing schedule in (0, 1]; the leading 0 is dropped.
        beta_series = math_ops.linspace(0., 1., n_iterations+1)[1:]

        def _anneal_step(carry, beta):
            """One AIS update: accumulate the weight, then take an HMC step."""
            last_x, _, log_weight = carry

            def annealed_log_prob(x):
                # Geometric interpolation between proposal and target.
                return ((1 - beta) * proposal_log_prob_fn(x) +
                        beta * target_log_prob_fn(x))

            increment = (1. / n_iterations) * (
                target_log_prob_fn(last_x) - proposal_log_prob_fn(last_x))
            log_weight = log_weight + increment

            # TODO(b/66917083): There's an opportunity for gradient reuse here.
            updated_x, step_acceptance, _, _ = kernel(
                step_size, n_leapfrog_steps, last_x, annealed_log_prob,
                event_dims)
            return updated_x, step_acceptance, log_weight

        initial_carry = (initial_x,
                         array_ops.zeros(non_event_shape),
                         array_ops.zeros(non_event_shape))
        x, acceptance_probs, w = functional_ops.scan(
            _anneal_step, beta_series, initial_carry)
    return w[-1], x[-1], acceptance_probs[-1]
def sample_chain(
        num_results,
        target_log_prob_fn,
        current_state,
        step_size,
        num_leapfrog_steps,
        num_burnin_steps=0,
        num_steps_between_results=0,
        seed=None,
        current_target_log_prob=None,
        current_grads_target_log_prob=None,
        name=None):
    """Runs multiple iterations of one or more Hamiltonian Monte Carlo chains.

    Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
    algorithm that takes a series of gradient-informed steps to produce a
    Metropolis proposal. This function samples from an HMC Markov chain that
    starts at `current_state` and whose stationary distribution has
    log-unnormalized-density `target_log_prob_fn()`.

    Multiple chains are sampled in parallel: the leftmost dimensions of
    (each) `current_state` (part) are assumed to index independent chains.
    `target_log_prob_fn()` is expected to sum log-probabilities across event
    dimensions (i.e., the rightmost dimensions of each state part), so each
    element of its output is the (possibly unnormalized) log-probability of
    the joint distribution over (all) the current state (parts).

    The `current_state` can be a single `Tensor` or a `list` of `Tensors`
    which collectively represent the current state. When a `list` is given,
    a list of `step_size`s must be given as well.

    Note: `target_log_prob_fn` is called exactly twice.

    Since HMC states are correlated, it is sometimes desirable to produce
    additional intermediate states and then discard them, ending up with a
    set of states with decreased autocorrelation. See [1]. Such "thinning"
    is enabled by setting `num_steps_between_results > 0`: the chain then
    takes `num_steps_between_results` extra steps between the steps that
    make it into the results. The extra steps are never materialized (in
    calls to `sess.run`), and thus do not increase memory requirements.

    [1]: "Statistically efficient thinning of a Markov chain sampler."
         Art B. Owen. April 2017.
         http://statweb.stanford.edu/~owen/reports/bestthinning.pdf

    #### Examples:

    ##### Sample from a diagonal-variance Gaussian.

    ```python
    tfd = tf.contrib.distributions

    def make_likelihood(true_variances):
        return tfd.MultivariateNormalDiag(
            scale_diag=tf.sqrt(true_variances))

    dims = 10
    dtype = np.float32
    true_variances = tf.linspace(dtype(1), dtype(3), dims)
    likelihood = make_likelihood(true_variances)

    states, kernel_results = hmc.sample_chain(
        num_results=1000,
        target_log_prob_fn=likelihood.log_prob,
        current_state=tf.zeros(dims),
        step_size=0.5,
        num_leapfrog_steps=2,
        num_burnin_steps=500)

    # Compute sample stats.
    sample_mean = tf.reduce_mean(states, axis=0)
    sample_var = tf.reduce_mean(
        tf.squared_difference(states, sample_mean),
        axis=0)
    ```

    ##### Sampling from factor-analysis posteriors with known factors.

    I.e.,

    ```none
    for i=1..n:
      w[i] ~ Normal(0, eye(d))            # prior
      x[i] ~ Normal(loc=matmul(w[i], F))  # likelihood
    ```

    where `F` denotes factors.

    ```python
    tfd = tf.contrib.distributions

    def make_prior(dims, dtype):
        return tfd.MultivariateNormalDiag(
            loc=tf.zeros(dims, dtype))

    def make_likelihood(weights, factors):
        return tfd.MultivariateNormalDiag(
            loc=tf.tensordot(weights, factors, axes=[[0], [-1]]))

    # Setup data.
    num_weights = 10
    num_factors = 4
    num_chains = 100
    dtype = np.float32

    prior = make_prior(num_weights, dtype)
    weights = prior.sample(num_chains)
    factors = np.random.randn(num_factors, num_weights).astype(dtype)
    x = make_likelihood(weights, factors).sample(num_chains)

    def target_log_prob(w):
        # Target joint is: `f(w) = p(w, x | factors)`.
        return prior.log_prob(w) + make_likelihood(w, factors).log_prob(x)

    # Get `num_results` samples from `num_chains` independent chains.
    chains_states, kernels_results = hmc.sample_chain(
        num_results=1000,
        target_log_prob_fn=target_log_prob,
        current_state=tf.zeros([num_chains, num_weights], dtype),
        step_size=0.1,
        num_leapfrog_steps=2,
        num_burnin_steps=500)

    # Compute sample stats.
    sample_mean = tf.reduce_mean(chains_states, axis=[0, 1])
    sample_var = tf.reduce_mean(
        tf.squared_difference(chains_states, sample_mean),
        axis=[0, 1])
    ```

    Args:
      num_results: Integer number of Markov chain draws.
      target_log_prob_fn: Python callable which takes an argument like
        `current_state` (or `*current_state` if it's a list) and returns its
        (possibly unnormalized) log-density under the target distribution.
      current_state: `Tensor` or Python `list` of `Tensor`s representing the
        current state(s) of the Markov chain(s). The first `r` dimensions
        index independent chains,
        `r = tf.rank(target_log_prob_fn(*current_state))`.
      step_size: `Tensor` or Python `list` of `Tensor`s representing the step
        size for the leapfrog integrator. Must broadcast with the shape of
        `current_state`. Larger step sizes lead to faster progress, but
        too-large step sizes make rejection exponentially more likely. When
        possible, it's often helpful to match per-variable step sizes to the
        standard deviations of the target distribution in each variable.
      num_leapfrog_steps: Integer number of steps to run the leapfrog
        integrator for. Total progress per HMC step is roughly proportional
        to `step_size * num_leapfrog_steps`.
      num_burnin_steps: Integer number of chain steps to take before starting
        to collect results.
        Default value: 0 (i.e., no burn-in).
      num_steps_between_results: Integer number of chain steps between
        collecting a result. Only one out of every
        `num_steps_between_results + 1` steps is included in the returned
        results. The number of returned chain states is still equal to
        `num_results`.
        Default value: 0 (i.e., no thinning).
      seed: Python integer to seed the random number generator.
      current_target_log_prob: (Optional) `Tensor` representing the value of
        `target_log_prob_fn` at the `current_state`. The only reason to
        specify this argument is to reduce TF graph size.
        Default value: `None` (i.e., compute as needed).
      current_grads_target_log_prob: (Optional) Python list of `Tensor`s
        representing gradient of `target_log_prob` at the `current_state`
        and wrt the `current_state`. Must have same shape as `current_state`.
        The only reason to specify this argument is to reduce TF graph size.
        Default value: `None` (i.e., compute as needed).
      name: Python `str` name prefixed to Ops created by this function.
        Default value: `None` (i.e., "hmc_sample_chain").

    Returns:
      accepted_states: Tensor or Python list of `Tensor`s representing the
        state(s) of the Markov chain(s) at each result step. Has same shape
        as input `current_state` but with a prepended `num_results`-size
        dimension.
      kernel_results: `collections.namedtuple` of internal calculations used
        to advance the chain.
    """
    scope_values = [
        num_results, current_state, step_size, num_leapfrog_steps,
        num_burnin_steps, num_steps_between_results, seed,
        current_target_log_prob, current_grads_target_log_prob,
    ]

    def _as_int32(value, label):
        """Converts `value` to an int32 `Tensor` named `label`."""
        return ops.convert_to_tensor(value, dtype=dtypes.int32, name=label)

    # With an explicit seed, both loops below are restricted to one iteration
    # at a time (presumably to keep the random draws reproducible -- confirm).
    loop_options = {} if seed is None else {"parallel_iterations": 1}

    with ops.name_scope(name, "hmc_sample_chain", scope_values):
        with ops.name_scope("initialize"):
            (current_state,
             step_size,
             current_target_log_prob,
             current_grads_target_log_prob) = _prepare_args(
                 target_log_prob_fn, current_state, step_size,
                 current_target_log_prob, current_grads_target_log_prob)
            num_results = _as_int32(num_results, "num_results")
            num_leapfrog_steps = _as_int32(
                num_leapfrog_steps, "num_leapfrog_steps")
            num_burnin_steps = _as_int32(num_burnin_steps, "num_burnin_steps")
            num_steps_between_results = _as_int32(
                num_steps_between_results, "num_steps_between_results")

        def _run_chain(num_steps, state, results):
            """Advances the chain(s) by `num_steps` kernel applications."""

            def _not_done(step, *_):
                return step < num_steps

            def _advance(step, state_, results_):
                next_step = step + 1
                kernel_outputs = kernel(
                    target_log_prob_fn,
                    state_,
                    step_size,
                    num_leapfrog_steps,
                    seed,
                    results_.current_target_log_prob,
                    results_.current_grads_target_log_prob)
                return [next_step] + list(kernel_outputs)

            loop_outputs = control_flow_ops.while_loop(
                cond=_not_done,
                body=_advance,
                loop_vars=[np.int32(0), state, results],
                **loop_options)
            # Drop the leading step counter; keep state and kernel results.
            return loop_outputs[1:]

        def _scan_body(carried, result_index):
            """`tf.scan` body: produces the next retained draw."""
            state, results = carried
            # The first retained draw is preceded by the burn-in steps; every
            # later one by the thinning steps.
            skipped_steps = array_ops.where(
                math_ops.equal(result_index, 0),
                num_burnin_steps,
                num_steps_between_results)
            return _run_chain(1 + skipped_steps, state, results)

        return functional_ops.scan(
            fn=_scan_body,
            # The element value is only used to detect the first draw.
            elems=math_ops.range(num_results),
            initializer=[
                current_state,
                _make_dummy_kernel_results(
                    current_state,
                    current_target_log_prob,
                    current_grads_target_log_prob),
            ],
            **loop_options)
def testScanEmptyTensor(self):
    """Scanning over zero elements yields a result with a leading 0 dim."""
    with self.cached_session():
        no_elems = math_ops.range(0)
        start = array_ops.ones([2, 4])
        scanned = functional_ops.scan(
            lambda acc, _: acc, no_elems, initializer=start)
        static_shape = scanned.get_shape()
        # Static shape is the initializer shape with a prepended 0.
        self.assertAllEqual([0, 2, 4], static_shape)
        # The evaluated value must agree with the static shape.
        self.assertAllEqual(static_shape, self.evaluate(scanned).shape)
def chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
          target_log_prob_fn, event_dims=(), name=None):
    """Runs multiple iterations of one or more Hamiltonian Monte Carlo chains.

    Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
    algorithm that takes a series of gradient-informed steps to produce a
    Metropolis proposal. This function samples from an HMC Markov chain
    whose initial state is `initial_x` and whose stationary distribution has
    log-density `target_log_prob_fn()`.

    Multiple chains can be updated in parallel. All dimensions of
    `initial_x` not listed in `event_dims` are assumed to be independent and
    are therefore updated independently. The output of
    `target_log_prob_fn()` should sum log-probabilities across all event
    dimensions. Slices along dimensions not in `event_dims` may have
    different target distributions; this is up to `target_log_prob_fn()`.

    This function basically just wraps `hmc.kernel()` in a tf.scan() loop.

    Args:
      n_iterations: Integer number of Markov chain updates to run.
      step_size: Scalar step size or array of step sizes for the leapfrog
        integrator. Broadcasts to the shape of `initial_x`. Larger step
        sizes lead to faster progress, but too-large step sizes make
        rejection exponentially more likely. When possible, it's often
        helpful to match per-variable step sizes to the standard deviations
        of the target distribution in each variable.
      n_leapfrog_steps: Integer number of steps to run the leapfrog
        integrator for. Total progress per HMC step is roughly proportional
        to step_size * n_leapfrog_steps.
      initial_x: Tensor of initial state(s) of the Markov chain(s).
      target_log_prob_fn: Python callable which takes an argument like
        `initial_x` and returns its (possibly unnormalized) log-density
        under the target distribution.
      event_dims: List of dimensions that should not be treated as
        independent. This allows for multiple chains to be run independently
        in parallel. Default is (), i.e., all dimensions are independent.
      name: Python `str` name prefixed to Ops created by this function.

    Returns:
      chain_states: Tensor with the state of the Markov chain at each
        iteration. Has shape
        `[n_iterations, initial_x.shape[0],...,initial_x.shape[-1]]`.
      acceptance_probs: Tensor with the acceptance probabilities for each
        iteration; each per-iteration entry has shape matching
        `target_log_prob_fn(initial_x)`.

    #### Examples:

    ```python
    # Sampling from a standard normal (note `log_joint()` is unnormalized):
    def log_joint(x):
        return tf.reduce_sum(-0.5 * tf.square(x))
    chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10),
                                        log_joint, event_dims=[0])
    # Discard first half of chain as warmup/burn-in
    warmed_up = chain[500:]
    mean_est = tf.reduce_mean(warmed_up, 0)
    var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
    ```

    ```python
    # Sampling from a diagonal-variance Gaussian:
    variances = tf.linspace(1., 3., 10)
    def log_joint(x):
        return tf.reduce_sum(-0.5 / variances * tf.square(x))
    chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10),
                                        log_joint, event_dims=[0])
    # Discard first half of chain as warmup/burn-in
    warmed_up = chain[500:]
    mean_est = tf.reduce_mean(warmed_up, 0)
    var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
    ```

    ```python
    # Sampling from factor-analysis posteriors with known factors W:
    # mu[i, j] ~ Normal(0, 1)
    # x[i] ~ Normal(matmul(mu[i], W), I)
    def log_joint(mu, x, W):
        prior = -0.5 * tf.reduce_sum(tf.square(mu), 1)
        x_mean = tf.matmul(mu, W)
        likelihood = -0.5 * tf.reduce_sum(tf.square(x - x_mean), 1)
        return prior + likelihood
    chain, acceptance_probs = hmc.chain(1000, 0.1, 2,
                                        tf.zeros([x.shape[0], W.shape[0]]),
                                        lambda mu: log_joint(mu, x, W),
                                        event_dims=[1])
    # Discard first half of chain as warmup/burn-in
    warmed_up = chain[500:]
    mean_est = tf.reduce_mean(warmed_up, 0)
    var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
    ```

    ```python
    # Sampling from the posterior of a Bayesian regression model.:

    # Run 100 chains in parallel, each with a different initialization.
    initial_beta = tf.random_normal([100, x.shape[1]])
    chain, acceptance_probs = hmc.chain(1000, 0.1, 10, initial_beta,
                                        log_joint_partial, event_dims=[1])
    # Discard first halves of chains as warmup/burn-in
    warmed_up = chain[500:]
    # Averaging across samples within a chain and across chains
    mean_est = tf.reduce_mean(warmed_up, [0, 1])
    var_est = (tf.reduce_mean(tf.square(warmed_up), [0, 1])
               - tf.square(mean_est))
    ```
    """
    scope_values = [n_iterations, step_size, n_leapfrog_steps, initial_x]
    with ops.name_scope(name, 'hmc_chain', scope_values):
        initial_x = ops.convert_to_tensor(initial_x, name='initial_x')
        non_event_shape = array_ops.shape(target_log_prob_fn(initial_x))

        def body(carried, _):
            """One scan step: applies the HMC kernel to the carried state."""
            # The carried acceptance probabilities are not an input to the
            # kernel; only the state, log-prob and gradient are fed back in.
            last_x, _unused_accept, last_log_prob, last_grad = carried
            next_x, accept_probs, next_log_prob, next_grad = kernel(
                step_size, n_leapfrog_steps, last_x, target_log_prob_fn,
                event_dims, last_log_prob, last_grad)
            return next_x, accept_probs, next_log_prob, next_grad

        potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
        potential, grad = potential_and_grad(initial_x)

        # The scanned-over values are ignored by `body`; they only fix the
        # number of iterations.
        iteration_elems = array_ops.zeros(n_iterations)
        initial_carry = (
            initial_x,
            array_ops.zeros(non_event_shape),
            -potential,
            -grad,
        )
        scanned = functional_ops.scan(body, iteration_elems, initial_carry)
        # Keep only the states and acceptance probabilities.
        return scanned[:2]
def monotonic_attention(p_choose_i, previous_attention, mode):
    """Compute monotonic attention distribution from choosing probabilities.

    Monotonic attention implies that the input sequence is processed in an
    explicitly left-to-right manner when generating the output sequence. In
    addition, once an input sequence element is attended to at a given
    output timestep, elements occurring before it cannot be attended to at
    subsequent output timesteps. This function generates attention
    distributions according to these assumptions. For more information, see
    ``Online and Linear-Time Attention by Enforcing Monotonic Alignments''.

    Args:
      p_choose_i: Probability of choosing input sequence/memory element i.
        Should be of shape (batch_size, input_sequence_length), and should
        all be in the range [0, 1].
      previous_attention: The attention distribution from the previous
        output timestep. Should be of shape
        (batch_size, input_sequence_length). For the first output timestep,
        previous_attention[n] should be [1, 0, 0, ..., 0] for all n in
        [0, ... batch_size - 1].
      mode: How to compute the attention distribution. Must be one of
        'recursive', 'parallel', or 'hard'.
          * 'recursive' uses tf.scan to recursively compute the
            distribution. This is slowest but is exact, general, and does
            not suffer from numerical instabilities.
          * 'parallel' uses parallelized cumulative-sum and
            cumulative-product operations to compute a closed-form solution
            to the recurrence relation defining the attention distribution.
            This makes it more efficient than 'recursive', but it requires
            numerical checks which make the distribution non-exact. This
            can be a problem in particular when input_sequence_length is
            long and/or p_choose_i has entries very close to 0 or 1.
          * 'hard' requires that the probabilities in p_choose_i are all
            either 0 or 1, and subsequently uses a more efficient and exact
            solution.

    Returns:
      A tensor of shape (batch_size, input_sequence_length) representing the
      attention distributions for each sequence in the batch.

    Raises:
      ValueError: mode is not one of 'recursive', 'parallel', 'hard'.
    """
    # Force things to be tensors
    p_choose_i = ops.convert_to_tensor(p_choose_i, name="p_choose_i")
    previous_attention = ops.convert_to_tensor(
        previous_attention, name="previous_attention")
    if mode == "recursive":
        # Use .shape[0].value when it's not None, or fall back on symbolic
        # shape
        batch_size = (
            p_choose_i.shape[0].value or array_ops.shape(p_choose_i)[0])
        # The constant helpers below are created with p_choose_i's dtype
        # rather than the float32 default, so this branch does not fail
        # with a dtype mismatch on non-float32 inputs.
        dtype = p_choose_i.dtype
        # Compute
        # [1, 1 - p_choose_i[0], 1 - p_choose_i[1], ..., 1 - p_choose_i[-2]]
        shifted_1mp_choose_i = array_ops.concat(
            [array_ops.ones((batch_size, 1), dtype=dtype),
             1 - p_choose_i[:, :-1]], 1)
        # Compute attention distribution recursively as
        # q[i] = (1 - p_choose_i[i])*q[i - 1] + previous_attention[i]
        # attention[i] = p_choose_i[i]*q[i]
        attention = p_choose_i*array_ops.transpose(functional_ops.scan(
            # Need to use reshape to remind TF of the shape between loop
            # iterations
            lambda x, yz: array_ops.reshape(yz[0]*x + yz[1], (batch_size,)),
            # Loop variables yz[0] and yz[1]
            [array_ops.transpose(shifted_1mp_choose_i),
             array_ops.transpose(previous_attention)],
            # Initial value of x is just zeros
            array_ops.zeros((batch_size,), dtype=dtype)))
    elif mode == "parallel":
        # safe_cumprod computes cumprod in logspace with numeric checks
        cumprod_1mp_choose_i = safe_cumprod(
            1 - p_choose_i, axis=1, exclusive=True)
        # Compute recurrence relation solution
        attention = p_choose_i*cumprod_1mp_choose_i*math_ops.cumsum(
            previous_attention /
            # Clip cumprod_1mp to avoid divide-by-zero
            clip_ops.clip_by_value(cumprod_1mp_choose_i, 1e-10, 1.), axis=1)
    elif mode == "hard":
        # Remove any probabilities before the index chosen last time step
        p_choose_i *= math_ops.cumsum(previous_attention, axis=1)
        # Now, use exclusive cumprod to remove probabilities after the first
        # chosen index, like so:
        # p_choose_i = [0, 0, 0, 1, 1, 0, 1, 1]
        # cumprod(1 - p_choose_i, exclusive=True) = [1, 1, 1, 1, 0, 0, 0, 0]
        # Product of above: [0, 0, 0, 1, 0, 0, 0, 0]
        attention = p_choose_i*math_ops.cumprod(
            1 - p_choose_i, axis=1, exclusive=True)
    else:
        raise ValueError("mode must be 'recursive', 'parallel', or 'hard'.")
    return attention
def scan():
    """Runs a running-sum scan over `elems`, one iteration at a time.

    `elems` is not a parameter; it is resolved from the surrounding scope.
    """

    def _accumulate(total, item):
        return total + item

    return functional_ops.scan(_accumulate, elems, parallel_iterations=1)