def testEvolvedTransformerDecoderLayerConstruction(self):
  p = layers_with_attention.EvolvedTransformerDecoderLayer.Params()
  p.name = 'evolved_transformer_decoder'
  p.source_dim = 16
  p.transformer_tpl.tr_atten_tpl.num_attention_heads = 2
  p.has_aux_atten = True
  p.mask_self_atten = True
  _ = layers_with_attention.EvolvedTransformerDecoderLayer(p)
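# NOTE: The two tests below call self._testTransformerAttentionLayerInputs,
# which is defined elsewhere in this test class and is not part of this
# excerpt. The sketch below is an assumption, not the original helper: it is
# renamed with a "Sketch" suffix and reconstructs inputs with the shapes the
# tests imply (source_vecs [5, 2, depth], source_padding [5, 2], aux_vecs
# [7, 2, depth], aux_paddings [7, 2]; the alternating aux padding pattern
# mirrors the zero columns in expected_prob_output). The seed and random
# values here are illustrative only, so this sketch would not reproduce the
# golden outputs asserted below.
def _testTransformerAttentionLayerInputsSketch(self, depth=4,
                                               dtype=tf.float32):
  """Sketch (assumed, not original): deterministic decoder test inputs."""
  np.random.seed(505837249)  # Any fixed seed; the original's may differ.
  # [time=5, batch=2, depth] decoder-side source activations.
  source_vecs = tf.stack(
      [tf.constant(np.random.rand(2, depth), dtype=dtype) for _ in range(5)])
  # [time=5, batch=2] source paddings (1.0 marks a padded position).
  source_padding = tf.transpose(
      tf.constant([[0, 0, 1, 1, 0], [1, 0, 0, 0, 1]], dtype=dtype))
  # [time=7, batch=2, depth] auxiliary (encoder-side) activations.
  aux_source_vecs = tf.stack(
      [tf.constant(np.random.rand(2, depth), dtype=dtype) for _ in range(7)])
  # [time=7, batch=2] aux paddings; this pattern matches the zero columns in
  # expected_prob_output in the FProp test below.
  aux_source_paddings = tf.transpose(
      tf.constant([[0, 1, 0, 1, 0, 1, 0], [1, 0, 1, 0, 1, 0, 1]],
                  dtype=dtype))
  return (source_vecs, source_padding, aux_source_vecs, aux_source_paddings)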
def testEvolvedTransformerDecoderLayerFProp(self):
  """Checks FProp against golden layer outputs and attention probs."""
  with self.session(use_gpu=True) as sess:
    np.random.seed(6348575)
    depth = 4
    p = layers_with_attention.EvolvedTransformerDecoderLayer.Params()
    p.name = 'evolved_transformer_decoder'
    p.source_dim = depth
    p.has_aux_atten = True
    p.mask_self_atten = True
    p.tr_double_heads_atten_tpl.num_attention_heads = 2
    p.tr_atten_tpl.num_attention_heads = 2
    p.transformer_tpl.tr_atten_tpl.num_attention_heads = 2
    transformer = layers_with_attention.EvolvedTransformerDecoderLayer(p)

    (source_vecs, source_padding, aux_vecs,
     aux_paddings) = self._testTransformerAttentionLayerInputs(depth=depth)

    h, probs = transformer.FPropDefaultTheta(
        source_vecs,
        source_padding,
        aux_vecs=aux_vecs,
        aux_paddings=aux_paddings)

    tf.global_variables_initializer().run()
    actual_layer_output, actual_prob_output = sess.run([h, probs])
    tf.logging.info(np.array_repr(actual_layer_output))
    tf.logging.info(np.array_repr(actual_prob_output))
    # Golden values; the zero columns in expected_prob_output correspond to
    # padded positions in aux_paddings.
    # pylint: disable=bad-whitespace
    # pyformat: disable
    expected_layer_output = [
        [[-2.15843987,  0.54941475,  1.01636434,  0.13751736],
         [-1.31648636, -0.9490751 ,  0.87473369,  0.5825901 ]],
        [[-0.48339468,  2.73935509, -0.7249794 ,  0.38313258],
         [-1.10127831, -1.39807224,  0.34523556,  0.42135555]],
        [[ 0.55578727,  0.45714682, -0.5104562 , -1.37361968],
         [-1.25782788, -1.21873033,  0.93250239,  0.03656423]],
        [[-1.52875996, -0.97135425,  1.28484297,  0.32869172],
         [ 0.20500244,  2.30189896,  0.24345911, -0.75997925]],
        [[-1.27760804, -1.51032686,  0.2560831 ,  0.66362542],
         [-1.63565814, -0.27384362, -0.42035246,  1.58936501]]]
    expected_prob_output = [
        [[ 0.28604817, 0., 0.24327257, 0., 0.26117378, 0., 0.20950545],
         [ 0., 0.26642066, 0., 0.38120884, 0., 0.3523705 , 0.]],
        [[ 0.24503553, 0., 0.24042624, 0., 0.2301898 , 0., 0.28434837],
         [ 0., 0.27049744, 0., 0.36453664, 0., 0.36496598, 0.]],
        [[ 0.25672671, 0., 0.2508592 , 0., 0.25038037, 0., 0.24203378],
         [ 0., 0.27020746, 0., 0.38153058, 0., 0.34826195, 0.]],
        [[ 0.27227223, 0., 0.25547835, 0., 0.27728963, 0., 0.19495982],
         [ 0., 0.34053475, 0., 0.35592028, 0., 0.30354494, 0.]],
        [[ 0.23994856, 0., 0.25427216, 0., 0.26202756, 0., 0.24375173],
         [ 0., 0.30927902, 0., 0.32368731, 0., 0.36703369, 0.]]]
    # pyformat: enable
    # pylint: enable=bad-whitespace
    self.assertAllClose(expected_layer_output, actual_layer_output)
    self.assertAllClose(expected_prob_output, actual_prob_output)
def testEvolvedTransformerDecoderLayerExtendStep(self):
  """Verifies step-by-step ExtendStep matches whole-sequence FProp."""
  with self.session(use_gpu=True) as sess:
    np.random.seed(6348575)
    depth = 4
    p = layers_with_attention.EvolvedTransformerDecoderLayer.Params()
    p.name = 'evolved_transformer_decoder'
    p.source_dim = depth
    p.has_aux_atten = True
    p.mask_self_atten = True
    p.tr_double_heads_atten_tpl.num_attention_heads = 2
    p.tr_atten_tpl.num_attention_heads = 2
    p.transformer_tpl.tr_atten_tpl.num_attention_heads = 2
    et_decoder = layers_with_attention.EvolvedTransformerDecoderLayer(p)

    (source_vecs, _, aux_vecs,
     aux_paddings) = self._testTransformerAttentionLayerInputs(depth=depth)
    # Use all-zero source paddings so FProp and the per-step ExtendStep loop
    # below see identical inputs.
    source_padding = tf.zeros([5, 2])

    h1, probs1 = et_decoder.FPropDefaultTheta(
        source_vecs,
        source_padding,
        aux_vecs=aux_vecs,
        aux_paddings=aux_paddings)

    h2 = []
    probs2 = []

    # Zero-length initial caches for the layer's three recurrent sub-states;
    # the leading time dimension starts at 0 and is extended as steps are
    # processed.
    double_head_attention_states = py_utils.NestedMap(
        key=tf.zeros([0, 2, 4]), value=tf.zeros([0, 2, 4]))
    transformer_layer_states = py_utils.NestedMap(
        key=tf.zeros([0, 2, 4]), value=tf.zeros([0, 2, 4]))
    branched_convs_input = tf.zeros([0, 2, 4])

    prefix_states = py_utils.NestedMap(
        double_head_attention_states=double_head_attention_states,
        transformer_layer_states=transformer_layer_states,
        branched_convs_input=branched_convs_input)

    for i in range(5):
      h, probs, prefix_states = et_decoder.ExtendStep(et_decoder.theta,
                                                      source_vecs[i, :, :],
                                                      prefix_states, aux_vecs,
                                                      aux_paddings)
      h2.append(h)
      probs2.append(probs)

    h2 = tf.stack(h2)
    probs2 = tf.concat(probs2, 0)

    tf.global_variables_initializer().run()
    h1_v, probs1_v, h2_v, probs2_v = sess.run([h1, probs1, h2, probs2])
    self.assertAllClose(h1_v, h2_v)
    self.assertAllClose(probs1_v, probs2_v)
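# The empty prefix_states construction in the test above is specific to
# EvolvedTransformerDecoderLayer's three recurrent sub-states. The helper
# below is a hypothetical sketch (not part of the original file) that
# gathers that pattern in one place; batch_size=2 and dim=4 reproduce the
# shapes used in the test above.
def _EmptyEvolvedTransformerPrefixStatesSketch(batch_size, dim):
  """Sketch (assumed): zero-length prefix states for incremental decoding."""

  def _ZeroAttenState():
    # [time=0, batch, dim] key/value caches; each ExtendStep call appends
    # one step along the time dimension.
    return py_utils.NestedMap(
        key=tf.zeros([0, batch_size, dim]),
        value=tf.zeros([0, batch_size, dim]))

  # The two attention states must be distinct NestedMap objects so that
  # per-step updates to one do not alias the other.
  return py_utils.NestedMap(
      double_head_attention_states=_ZeroAttenState(),
      transformer_layer_states=_ZeroAttenState(),
      branched_convs_input=tf.zeros([0, batch_size, dim]))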