def testTransformerAttentionLayerReference(self):
  depth = 4
  p = layers_with_attention.TransformerAttentionLayer.Params()
  p.name = 'transformer_atten'
  p.source_dim = depth
  p.is_masked = False
  p.num_attention_heads = 2
  p.atten_tpl.params_init = py_utils.WeightInit.Gaussian(0.1, 12345)
  transformer_atten_ref = layers_with_attention.TransformerAttentionLayer(p)

  (query_vec, _, aux_vecs,
   aux_paddings) = self._TransformerSingleSourceInputs(depth)
  ctx_ref, probs_ref = transformer_atten_ref.FPropDefaultTheta(
      query_vec, aux_paddings, aux_vecs)
  expected_ctx, expected_probs = self._ExpectedSingleSourceResults()

  with self.session(use_gpu=True) as sess:
    tf.global_variables_initializer().run()
    actual_ctx_ref, actual_probs_ref = sess.run([ctx_ref, probs_ref])
    tf.logging.info(np.array_repr(actual_ctx_ref))
    tf.logging.info(np.array_repr(actual_probs_ref))
    self.assertAllClose(expected_ctx, actual_ctx_ref)
    self.assertAllClose(expected_probs, actual_probs_ref)

def testTransformerAttentionLayerDeterministicDropout(self):
  with self.session(use_gpu=True) as sess:
    # Needed to generate a seed pair.
    py_utils.ResetStepSeed()
    py_utils.GetOrCreateGlobalStep()

    depth = 4
    p = layers_with_attention.TransformerAttentionLayer.Params()
    p.name = 'transformer_atten'
    p.source_dim = depth
    p.is_masked = False
    p.num_attention_heads = 2
    p.residual_dropout_tpl = layers.DeterministicDropoutLayer.Params()
    p.residual_dropout_prob = 0.1
    transformer_atten = layers_with_attention.TransformerAttentionLayer(p)

    (source_vecs, source_padding, _,
     _) = self._testTransformerAttentionLayerInputs(depth=depth)

    ctx, probs = transformer_atten.FProp(transformer_atten.theta, source_vecs,
                                         source_padding)

    tf.global_variables_initializer().run()
    actual_ctx, actual_probs = sess.run([ctx, probs])

    # pylint: disable=bad-whitespace
    # pyformat: disable
    print(np.array_repr(actual_ctx))
    expected_ctx = np.array([
        [[-1.45762944,  1.5337404 ,  0.34037334, -0.97208667],
         [-1.35992002, -1.06530988,  1.53705895,  2.79370689]],
        [[ 0.00657134,  1.12030125, -1.32564592, -1.73569465],
         [-0.80793667, -0.10877949, -0.80295694,  2.25494242]],
        [[ 1.76956046, -0.50777751, -1.19745886, -1.46751583],
         [-1.79178905, -0.77374339,  1.31586027,  2.98173356]],
        [[-0.85498607, -0.37413225,  1.25707364, -0.50043333],
         [ 1.62276983,  0.50820369, -1.52967572, -2.02076197]],
        [[-0.66754031, -0.68657839, -0.51643699,  1.96581018],
         [-1.4816376 ,  0.89419198, -0.57226259,  1.90177512]]
    ], dtype=np.float32)
    print(np.array_repr(actual_probs))
    expected_probs = np.array([
        [[ 0.21387868,  0.22080734,  0.        ,  0.        ,  0.56531399],
         [ 0.        ,  0.30584112,  0.24723588,  0.44692296,  0.        ]],
        [[ 0.25358215,  0.50932312,  0.        ,  0.        ,  0.23709476],
         [ 0.        ,  0.56834149,  0.2632803 ,  0.16837817,  0.        ]],
        [[ 0.38519409,  0.55454361,  0.        ,  0.        ,  0.06026226],
         [ 0.        ,  0.33708778,  0.21976741,  0.4431448 ,  0.        ]],
        [[ 0.27139962,  0.12790371,  0.        ,  0.        ,  0.60069668],
         [ 0.        ,  0.31849149,  0.28174096,  0.39976761,  0.        ]],
        [[ 0.16272782,  0.15781289,  0.        ,  0.        ,  0.67945927],
         [ 0.        ,  0.55003977,  0.26049581,  0.18946445,  0.        ]]
    ], dtype=np.float32)
    # pyformat: enable
    # pylint: enable=bad-whitespace

    self.assertAllClose(expected_ctx, actual_ctx, rtol=1e-05, atol=1e-05)
    self.assertAllClose(expected_probs, actual_probs, rtol=1e-05, atol=1e-05)

def testTransformerAttentionLayerCase3(self):
  with self.session(use_gpu=True) as sess:
    depth = 4
    p = layers_with_attention.TransformerAttentionLayer.Params()
    p.name = 'transformer_atten'
    p.source_dim = depth
    p.is_masked = False
    p.num_attention_heads = 2
    transformer_atten = layers_with_attention.TransformerAttentionLayer(p)

    (query_vec, _, aux_vecs,
     aux_paddings) = self._testTransformerAttentionLayerInputs(depth=depth)

    ctx, probs = transformer_atten.FPropDefaultTheta(query_vec, aux_paddings,
                                                     aux_vecs)
    tf.global_variables_initializer().run()
    actual_ctx, actual_probs = sess.run([ctx, probs])
    tf.logging.info(np.array_repr(actual_ctx))
    tf.logging.info(np.array_repr(actual_probs))
    # pylint: disable=bad-whitespace
    # pyformat: disable
    expected_ctx = [
        [[-1.42420077, 1.19024372, 1.35146523, 0.85896158],
         [-0.44974625, -1.00108492, 1.63387251, 1.678146]],
        [[0.1134335, 1.97617495, -0.35918081, 0.26396495],
         [-0.19688171, -0.71197301, 0.0659425, 2.5417304]],
        [[1.58169425, 0.81259179, -0.58948535, 0.20254248],
         [-0.84438968, -0.65845209, 1.45584249, 1.87587976]],
        [[-1.01532316, -0.05166581, 2.07901478, 0.97540361],
         [2.08563352, 0.34328598, -0.23240227, -0.19035631]],
        [[-0.53881919, -0.60117185, 0.29170275, 2.6474514],
         [-0.88318163, 0.37149727, -0.16098523, 2.3810885]]
    ]
    expected_probs = [
        [[0.32392544, 0., 0.27218491, 0., 0.19574419, 0., 0.20814547],
         [0., 0.273045, 0., 0.43572819, 0., 0.2912268, 0.]],
        [[0.24094662, 0., 0.23919827, 0., 0.26563686, 0., 0.25421822],
         [0., 0.21680018, 0., 0.33962148, 0., 0.44357836, 0.]],
        [[0.20083594, 0., 0.20683075, 0., 0.28931937, 0., 0.30301392],
         [0., 0.24710922, 0., 0.453915, 0., 0.29897571, 0.]],
        [[0.32845193, 0., 0.26491433, 0., 0.18304622, 0., 0.22358747],
         [0., 0.39426237, 0., 0.19774443, 0., 0.4079932, 0.]],
        [[0.23542665, 0., 0.27910906, 0., 0.30036426, 0., 0.18510005],
         [0., 0.20147586, 0., 0.37759233, 0., 0.42093182, 0.]]
    ]
    # pyformat: enable
    # pylint: enable=bad-whitespace
    self.assertAllClose(expected_ctx, actual_ctx, rtol=1e-05, atol=1e-05)
    self.assertAllClose(expected_probs, actual_probs, rtol=1e-05, atol=1e-05)

def testTransformerAttentionLayerCase1(self):
  with self.session(use_gpu=True) as sess:
    depth = 4
    p = layers_with_attention.TransformerAttentionLayer.Params()
    p.name = 'transformer_atten'
    p.source_dim = depth
    p.is_masked = False
    p.num_attention_heads = 2
    transformer_atten = layers_with_attention.TransformerAttentionLayer(p)

    (source_vecs, source_padding, _,
     _) = self._testTransformerAttentionLayerInputs(depth=depth)

    ctx, probs = transformer_atten.FPropDefaultTheta(source_vecs,
                                                     source_padding)
    tf.global_variables_initializer().run()
    actual_ctx, actual_probs = sess.run([ctx, probs])
    # pylint: disable=bad-whitespace
    # pyformat: disable
    expected_ctx = [
        [[-1.47126436, 1.46579707, 0.39105844, -0.88563323],
         [-1.29514003, -1.08241224, 1.49894714, 2.5935874]],
        [[-0.00313053, 1.17399275, -1.28071034, -1.6311729],
         [-0.77028418, -0.18855178, -0.75814998, 2.19872856]],
        [[1.72851753, -0.40323859, -1.19053328, -1.39761829],
         [-1.72141743, -0.78715289, 1.28404212, 2.78338313]],
        [[-0.8881942, 0.33776048, 1.28791749, -0.45082122],
         [1.4362365, 0.46009994, -1.45436597, -1.90602148]],
        [[-0.51681399, -0.70075679, -0.48352116, 1.93754733],
         [-1.44486678, 0.81801879, -1.03079689, 1.86697066]]
    ]
    expected_probs = [
        [[0.21387868, 0.22080734, 0., 0., 0.56531399],
         [0., 0.30584112, 0.24723588, 0.44692296, 0.]],
        [[0.25358215, 0.50932312, 0., 0., 0.23709476],
         [0., 0.56834149, 0.2632803, 0.16837817, 0.]],
        [[0.38519409, 0.55454361, 0., 0., 0.06026226],
         [0., 0.33708778, 0.21976741, 0.4431448, 0.]],
        [[0.27139962, 0.12790371, 0., 0., 0.60069668],
         [0., 0.31849149, 0.28174096, 0.39976761, 0.]],
        [[0.16272782, 0.15781289, 0., 0., 0.67945927],
         [0., 0.55003977, 0.26049581, 0.18946445, 0.]]
    ]
    # pyformat: enable
    # pylint: enable=bad-whitespace
    self.assertAllClose(expected_ctx, actual_ctx, rtol=1e-05, atol=1e-05)
    self.assertAllClose(expected_probs, actual_probs, rtol=1e-05, atol=1e-05)

def testTransformerAttentionLayerCase2(self):
  with self.session(use_gpu=True) as sess:
    depth = 4
    p = layers_with_attention.TransformerAttentionLayer.Params()
    p.name = 'transformer_atten'
    p.source_dim = depth
    p.is_masked = True
    p.num_attention_heads = 2
    transformer_atten = layers_with_attention.TransformerAttentionLayer(p)

    (source_vecs, source_padding, _,
     _) = self._testTransformerAttentionLayerInputs(depth=depth)

    ctx, probs = transformer_atten.FPropDefaultTheta(source_vecs,
                                                     source_padding)
    tf.global_variables_initializer().run()
    actual_ctx, actual_probs = sess.run([ctx, probs])
    tf.logging.info(np.array_repr(actual_ctx))
    tf.logging.info(np.array_repr(actual_probs))
    # pylint: disable=bad-whitespace
    # pyformat: disable
    expected_ctx = [
        [[-0.14429152, 1.15510106, 1.11930299, -1.19245839],
         [-0.69580591, -0.47006619, 0.82592297, 0.69593251]],
        [[0.24164687, 0.53328454, -1.02119482, -1.49412084],
         [-0.82601064, 0.024203, -1.11880171, 1.80784416]],
        [[1.7644347, -0.53346401, -1.1461122, -1.42797422],
         [-0.95326459, 0.39580142, 0.39262164, 0.67513674]],
        [[-0.28252155, -0.95237327, 2.08757687, -0.21231559],
         [1.4362365, 0.46009994, -1.45436597, -1.90602148]],
        [[-0.51681399, -0.70075679, -0.48352116, 1.93754733],
         [-1.44486678, 0.81801879, -1.03079689, 1.86697066]]
    ]
    expected_probs = [
        [[1., 0., 0., 0., 0.],
         [0.2, 0.2, 0.2, 0.2, 0.2]],
        [[0.3966811, 0.60331887, 0., 0., 0.],
         [0., 1., 0., 0., 0.]],
        [[0.41050252, 0.58949745, 0., 0., 0.],
         [0., 0.5245893, 0.4754107, 0., 0.]],
        [[0.58882225, 0.41117775, 0., 0., 0.],
         [0., 0.31849149, 0.28174096, 0.39976761, 0.]],
        [[0.16272782, 0.15781289, 0., 0., 0.67945927],
         [0., 0.55003977, 0.26049581, 0.18946445, 0.]]
    ]
    # pyformat: enable
    # pylint: enable=bad-whitespace
    self.assertAllClose(expected_ctx, actual_ctx)
    self.assertAllClose(expected_probs, actual_probs)

def testTransformerAttentionLayerStepByStep(self):
  with self.session(use_gpu=True) as sess:
    depth = 4
    p = layers_with_attention.TransformerAttentionLayer.Params()
    p.name = 'transformer_atten'
    p.source_dim = depth
    p.is_masked = True
    p.num_attention_heads = 2
    x_atten = layers_with_attention.TransformerAttentionLayer(p)

    (source_vecs, _, _,
     _) = self._testTransformerAttentionLayerInputs(depth=depth)
    source_padding = tf.zeros([5, 2])

    # Full-sequence forward pass.
    ctx1, probs1 = x_atten.FPropDefaultTheta(source_vecs, source_padding)

    # Recompute the same outputs one step at a time with ExtendStep and an
    # incrementally grown prefix state; the results should match FProp.
    ctx2 = []
    probs2 = []
    cached_source_vecs = tf.zeros([0, 2, 4])
    cached_source_contexts = tf.zeros([0, 2, 4])
    prefix_states = py_utils.NestedMap(
        key=cached_source_vecs, value=cached_source_contexts)
    for i in range(5):
      ctx, probs, prefix_states = x_atten.ExtendStep(x_atten.theta,
                                                     source_vecs[i, :, :],
                                                     prefix_states)
      # Pad the per-step probs to the full target length so they can be
      # compared against the probs from the full-sequence FProp.
      probs_pad = tf.zeros([2, 5 - i - 1])
      padded_probs = tf.concat([probs, probs_pad], 1)
      ctx2.append(ctx)
      probs2.append(padded_probs)

    ctx2 = tf.stack(ctx2)
    probs2 = tf.stack(probs2)

    tf.global_variables_initializer().run()
    ctx1_v, probs1_v, ctx2_v, probs2_v = sess.run(
        [ctx1, probs1, ctx2, probs2])
    tf.logging.info(np.array_repr(ctx1_v))
    tf.logging.info(np.array_repr(probs1_v))
    tf.logging.info(np.array_repr(ctx2_v))
    tf.logging.info(np.array_repr(probs2_v))
    self.assertAllClose(ctx1_v, ctx2_v)
    self.assertAllClose(probs1_v, probs2_v)
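
# The tests above unpack (source_vecs, source_padding, aux_vecs, aux_paddings)
# from self._testTransformerAttentionLayerInputs, which is defined elsewhere in
# this test class. The sketch below is for reference only: the shapes (source
# length 5, aux length 7, batch 2, depth 4) and the padding patterns can be
# read off the expected probabilities above, but the random seed and the exact
# input values are assumptions, so the expected numbers in these tests
# correspond to the real helper, not to this sketch.
def _testTransformerAttentionLayerInputsSketch(self, depth=4,
                                               dtype=tf.float32):
  np.random.seed(12345)  # Arbitrary seed for the sketch; the real helper
                         # fixes its own.
  source_vecs = tf.stack([
      tf.constant(np.random.rand(2, depth), dtype=dtype) for _ in range(5)
  ])
  # [time=5, batch=2]; positions marked 1.0 are padded out of the attention,
  # matching the zero columns in expected_probs of the self-attention tests.
  source_padding = tf.transpose(
      tf.constant([[0, 0, 1, 1, 0], [1, 0, 0, 0, 1]], dtype=dtype))
  aux_source_vecs = tf.stack([
      tf.constant(np.random.rand(2, depth), dtype=dtype) for _ in range(7)
  ])
  # [time=7, batch=2] padding for the auxiliary (cross-attention) source,
  # matching the zero columns in expected_probs of the cross-attention test.
  aux_source_paddings = tf.transpose(
      tf.constant([[0, 1, 0, 1, 0, 1, 0], [1, 0, 1, 0, 1, 0, 1]], dtype=dtype))
  return (source_vecs, source_padding, aux_source_vecs, aux_source_paddings)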