def testTransformerLayerWithInputPackingFProp(self):
  with self.session(use_gpu=True) as sess:
    with tf.variable_scope('transformer_packed_test', reuse=tf.AUTO_REUSE):
      np.random.seed(6348575)
      depth = 4
      p = layers_with_attention.TransformerLayer.Params()
      p.name = 'transformer'
      p.source_dim = depth
      p.is_decoder = True
      p.tr_fflayer_tpl.hidden_dim = 7
      p.tr_atten_tpl.num_attention_heads = 2
      packed_params = p.Copy()
      transformer = layers_with_attention.TransformerLayer(p)
      packed_params.packed_input = True
      transformer_packed = layers_with_attention.TransformerLayer(
          packed_params)

      dtype = tf.float32
      source_vecs = tf.stack([
          tf.constant(np.random.rand(2, depth), dtype=dtype)
          for _ in range(5)
      ])
      source_padding = tf.transpose(
          tf.constant([[0, 0, 0, 0, 1], [0, 0, 0, 0, 0]], dtype=dtype))
      aux_vecs = tf.stack([
          tf.constant(np.random.rand(2, depth), dtype=dtype)
          for _ in range(7)
      ])
      aux_paddings = tf.transpose(
          tf.constant([[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1]],
                      dtype=dtype))

      # Pack the two batch elements into a single row: reshaping the
      # time-major [time, 2, depth] tensor to [time * 2, 1, depth]
      # interleaves the two sequences, so the segment ids alternate 0/1.
      source_vecs_packed = tf.reshape(source_vecs, [-1, 1, depth])
      aux_vecs_packed = tf.reshape(aux_vecs, [-1, 1, depth])
      source_padding_packed = tf.reshape(source_padding, [-1, 1])
      aux_padding_packed = tf.reshape(aux_paddings, [-1, 1])
      source_segment_id = tf.transpose(
          tf.constant([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1]], dtype=tf.float32))
      aux_segment_id = tf.transpose(
          tf.constant([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]],
                      dtype=tf.float32))

      h, _ = transformer.FPropDefaultTheta(
          source_vecs,
          source_padding,
          aux_vecs=aux_vecs,
          aux_paddings=aux_paddings,
          source_segment_id=None,
          aux_segment_id=None)

      h_packed, _ = transformer_packed.FPropDefaultTheta(
          source_vecs_packed,
          source_padding_packed,
          aux_vecs=aux_vecs_packed,
          aux_paddings=aux_padding_packed,
          source_segment_id=source_segment_id,
          aux_segment_id=aux_segment_id)
      h_packed = tf.reshape(h_packed, tf.shape(h))

      tf.global_variables_initializer().run()
      actual_layer, p_layer = sess.run([h, h_packed])
      self.assertAllClose(actual_layer, p_layer)
def testTransformerLayerConstruction(self):
  p = layers_with_attention.TransformerLayer.Params()
  p.name = 'transformer'
  p.source_dim = 4
  p.tr_fflayer_tpl.hidden_dim = 7
  p.tr_atten_tpl.num_attention_heads = 2
  p.is_decoder = True
  _ = layers_with_attention.TransformerLayer(p)
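# The tests below call `_testTransformerAttentionLayerInputs`, a helper
# defined elsewhere in this test class. A minimal sketch of its assumed
# contract (time-major [time, batch, depth] activations plus 0/1 padding
# masks, fixed-seed random values); the seed is elided and the padding
# patterns are inferred from the expected attention probs above, so treat
# this as illustrative rather than the real definition:
#
#   def _testTransformerAttentionLayerInputs(self, depth=3, dtype=tf.float32):
#     np.random.seed(...)  # fixed seed so expected test values stay stable
#     source_vecs = tf.stack([
#         tf.constant(np.random.rand(2, depth), dtype=dtype)
#         for _ in range(5)
#     ])
#     source_padding = tf.transpose(
#         tf.constant([[0, 0, 1, 1, 0], [1, 0, 0, 0, 1]], dtype=dtype))
#     aux_vecs = tf.stack([
#         tf.constant(np.random.rand(2, depth), dtype=dtype)
#         for _ in range(7)
#     ])
#     aux_paddings = tf.transpose(
#         tf.constant([[0, 1, 0, 1, 0, 1, 0], [1, 0, 1, 0, 1, 0, 1]],
#                     dtype=dtype))
#     return source_vecs, source_padding, aux_vecs, aux_paddings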
def testTransformerLayerFProp(self):
  with self.session(use_gpu=True) as sess:
    np.random.seed(6348575)
    depth = 4
    p = layers_with_attention.TransformerLayer.Params()
    p.name = 'transformer'
    p.source_dim = depth
    p.has_aux_atten = True
    p.mask_self_atten = True
    p.tr_fflayer_tpl.hidden_dim = 7
    p.tr_atten_tpl.num_attention_heads = 2
    transformer = layers_with_attention.TransformerLayer(p)

    (source_vecs, source_padding, aux_vecs,
     aux_paddings) = self._testTransformerAttentionLayerInputs(depth=depth)

    h, probs = transformer.FPropDefaultTheta(
        source_vecs,
        source_padding,
        aux_vecs=aux_vecs,
        aux_paddings=aux_paddings)

    tf.global_variables_initializer().run()
    actual_layer_output, actual_prob_output = sess.run([h, probs])
    tf.logging.info(np.array_repr(actual_layer_output))
    tf.logging.info(np.array_repr(actual_prob_output))
    # pylint: disable=bad-whitespace
    # pyformat: disable
    expected_layer_output = [
        [[0.68134278, 0.74287307, 0.04602078, 1.99463582],
         [0.20382279, -1.50973201, 1.33421206, 0.53317755]],
        [[2.46715426, 2.84406185, -0.60359633, 0.51742059],
         [1.06444919, -1.45264888, -0.06196141, 0.35242724]],
        [[2.3442452, -0.56243378, -1.1149826, 0.50276589],
         [1.04868603, -1.68515253, 0.3093726, -0.19512933]],
        [[-0.11517292, -1.21290886, 1.31996512, 1.14821553],
         [3.14395714, -1.07060659, 0.27842081, -1.81273639]],
        [[1.39219522, -0.81882864, -0.32732445, 1.36851478],
         [-0.79119539, -0.28148842, 0.29963702, 1.37034667]],
    ]
    expected_prob_output = [
        [[0.21795762, 0., 0.26612395, 0., 0.31251648, 0., 0.20340192],
         [0., 0.2677784, 0., 0.32895881, 0., 0.40326279, 0.]],
        [[0.25721505, 0., 0.24116731, 0., 0.25138181, 0., 0.2502358],
         [0., 0.25691482, 0., 0.31076014, 0., 0.43232504, 0.]],
        [[0.24550268, 0., 0.25128055, 0., 0.25109866, 0., 0.25211811],
         [0., 0.26769161, 0., 0.32481128, 0., 0.40749705, 0.]],
        [[0.22675318, 0., 0.26633731, 0., 0.28919035, 0., 0.21771915],
         [0., 0.35955882, 0., 0.36869824, 0., 0.271743, 0.]],
        [[0.21504655, 0., 0.26958644, 0., 0.30847484, 0., 0.20689213],
         [0., 0.29516917, 0., 0.29359812, 0., 0.41123265, 0.]],
    ]
    # pyformat: enable
    # pylint: enable=bad-whitespace
    self.assertAllClose(expected_layer_output, actual_layer_output)
    self.assertAllClose(expected_prob_output, actual_prob_output)
def testTransformerLayerExtendStep(self):
  with self.session(use_gpu=True) as sess:
    np.random.seed(6348575)
    depth = 4
    p = layers_with_attention.TransformerLayer.Params()
    p.name = 'transformer'
    p.source_dim = depth
    p.has_aux_atten = True
    p.mask_self_atten = True
    p.tr_fflayer_tpl.hidden_dim = 7
    p.tr_atten_tpl.num_attention_heads = 2
    transformer = layers_with_attention.TransformerLayer(p)

    (source_vecs, _, aux_vecs,
     aux_paddings) = self._testTransformerAttentionLayerInputs(depth=depth)
    source_padding = tf.zeros([5, 2])

    h1, probs1 = transformer.FPropDefaultTheta(
        source_vecs,
        source_padding,
        aux_vecs=aux_vecs,
        aux_paddings=aux_paddings)

    # Feed the same inputs one step at a time through ExtendStep, carrying
    # the cached key/value prefix states forward, and check that the
    # incremental outputs match the full FProp.
    h2 = []
    probs2 = []
    cached_source_vecs = tf.zeros([0, 2, 4])
    cached_source_contexts = tf.zeros([0, 2, 4])
    prefix_states = py_utils.NestedMap(
        key=cached_source_vecs, value=cached_source_contexts)
    for i in range(5):
      h, probs, prefix_states = transformer.ExtendStep(
          transformer.theta, source_vecs[i, :, :], prefix_states, aux_vecs,
          aux_paddings)
      h2.append(h)
      probs2.append(probs)

    h2 = tf.stack(h2)
    probs2 = tf.concat(probs2, 0)

    tf.global_variables_initializer().run()
    h1_v, probs1_v, h2_v, probs2_v = sess.run([h1, probs1, h2, probs2])
    self.assertAllClose(h1_v, h2_v)
    self.assertAllClose(probs1_v, probs2_v)