def test_openai_gpt2():
    # Parity test: the local (TF) GPT-2 LM, restored from the OpenAI 117M
    # checkpoint, must reproduce HuggingFace GPT2Model's hidden states.
    from transformers import GPT2Model, GPT2Tokenizer
    input_text = "Here is some text to encode"
    # Reference implementation: HuggingFace's PyTorch GPT-2.
    pt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    pt_model = GPT2Model.from_pretrained("gpt2", return_dict=True)
    pt_outputs = pt_model(**pt_tokenizer([input_text], return_tensors="pt"))
    # Build the LM task/model with the matching GPT-2 117M hyper-parameters
    # and restore the converted OpenAI checkpoint.
    task = build_task({
        "class": "lm",
        "params": {
            "data_pipeline.class": "GPT2DataPipeline",
            "max_len": 50,
            "begin_of_sentence": "eos"
        }
    })
    model_cfgs = get_hyper_parameters("gpt2_117m")
    model = task.build_model(model_cfgs)
    restore_checkpoint_if_possible_v2(model, "117M", model_name="OpenAIGPT2")
    input_ids = task._data_pipeline.process(input_text)
    tf_inputs = {
        "trg_input": tf.convert_to_tensor([input_ids], tf.int64),
        "trg_length": tf.convert_to_tensor([len(input_ids)], tf.int64)
    }
    _, gen_init = model.get_symbols_to_logits_fn(tf_inputs,
                                                 is_training=False,
                                                 is_inference=False)
    tf_outputs = model.get_decoder_output(
        gen_init["decoder_input"],
        cache=gen_init["decoder_internal_cache"],
        is_training=False)
    # Compare all but the last TF position — presumably the decoder input is
    # shifted so the final step has no HF counterpart; confirm against the
    # data pipeline if this ever drifts.
    assert_equal_numpy(pt_outputs.last_hidden_state.detach().numpy(),
                       tf_outputs[:, :-1].numpy(), 5e-4)
def test_multiheadself_attention():
    """Multi-head self-attention with a bias term: PyTorch must match TF."""
    length_q = 4
    num_heads = 2
    num_units = 4
    dropout_rate = 0.
    output_depth = 3
    tf_att_layer = MultiHeadSelfAttention(num_heads=num_heads,
                                          num_units=num_units,
                                          output_depth=output_depth,
                                          attention_dropout_rate=dropout_rate)
    pt_att_layer = PTMultiHeadSelfAttention(input_depth=num_units,
                                            num_heads=num_heads,
                                            num_units=num_units,
                                            output_depth=output_depth,
                                            attention_dropout_rate=dropout_rate)
    query_np = numpy.random.rand(1, length_q, num_units)
    bias_np = numpy.random.rand(1, length_q)
    tf_query = tf.convert_to_tensor(query_np, dtype=tf.float32)
    tf_bias = tf.convert_to_tensor(bias_np, dtype=tf.float32)
    pt_query = torch.FloatTensor(query_np)
    pt_bias = torch.FloatTensor(bias_np)
    # Run each layer once so its variables/parameters get created.
    _ = tf_att_layer(tf_query)
    _ = pt_att_layer(pt_query)
    # Mirror the TF projection weights into the PyTorch twin.
    sublayer_pairs = ((pt_att_layer._qkv_transform_layer,
                       tf_att_layer._qkv_transform_layer),
                      (pt_att_layer._output_transform_layer,
                       tf_att_layer._output_transform_layer))
    for pt_sub, tf_sub in sublayer_pairs:
        pt_sub._kernel.data = torch.FloatTensor(tf_sub._kernel.numpy())
        pt_sub._bias.data = torch.FloatTensor(tf_sub._bias.numpy())
    assert_equal_numpy(
        tf_att_layer(tf_query, bias=tf_bias, is_training=False).numpy(),
        pt_att_layer(pt_query, bias=pt_bias,
                     is_training=False).detach().numpy())
def _copy_subsampler_weights(pt_layer, tf_layer, layer_norm=True):
    """Copy the TF audio-subsampling weights into the PyTorch twin.

    TF conv kernels are laid out (H, W, in, out) while PyTorch Conv2d expects
    (out, in, H, W) — hence the (3, 2, 0, 1) transpose. Dense kernels are
    (in, out) in TF vs. (out, in) in PyTorch, so a plain transpose suffices.

    Args:
      pt_layer: the built PyTorch AudioConvSubsamplingLayer.
      tf_layer: the built TF AudioConvSubsamplingLayer.
      layer_norm: whether the layers were constructed with layer norm, in
        which case gamma/beta are copied into weight/bias as well.
    """
    for pt_conv, tf_conv in ((pt_layer._conv_layer1, tf_layer._conv_layers[0]),
                             (pt_layer._conv_layer2, tf_layer._conv_layers[1])):
        pt_conv.weight.data = torch.FloatTensor(
            tf_conv.kernel.numpy().transpose((3, 2, 0, 1)))
        pt_conv.bias.data = torch.FloatTensor(tf_conv.bias.numpy())
    if layer_norm:
        for pt_norm, tf_norm in ((pt_layer._norm_layer1,
                                  tf_layer._norm_layers[0]),
                                 (pt_layer._norm_layer2,
                                  tf_layer._norm_layers[1])):
            pt_norm.weight.data = torch.FloatTensor(tf_norm.gamma.numpy())
            pt_norm.bias.data = torch.FloatTensor(tf_norm.beta.numpy())
    pt_layer._dense_layer.weight.data = torch.FloatTensor(
        tf_layer._dense_layer.kernel.numpy().transpose())
    pt_layer._dense_layer.bias.data = torch.FloatTensor(
        tf_layer._dense_layer.bias.numpy())


def test_subsampler():
    """Audio conv-subsampling: TF and PyTorch agree, with and without LN."""
    inp = numpy.random.rand(1, 19, 80, 1)
    pt_inp = torch.FloatTensor(inp)
    tf_inp = tf.convert_to_tensor(inp, tf.float32)
    # with layer norm
    tf_layer = TFAudioConvSubsamplingLayer(40)
    pt_layer = AudioConvSubsamplingLayer(40, input_dimension=80)
    _ = tf_layer(tf_inp)  # build variables
    _ = pt_layer(pt_inp)
    _copy_subsampler_weights(pt_layer, tf_layer, layer_norm=True)
    assert_equal_numpy(
        pt_layer(pt_inp).detach().numpy(),
        tf_layer(tf_inp).numpy(), 5e-5)
    # without layer norm
    tf_layer = TFAudioConvSubsamplingLayer(40, layer_norm=False)
    pt_layer = AudioConvSubsamplingLayer(40, input_dimension=80,
                                         layer_norm=False)
    _ = tf_layer(tf_inp)
    _ = pt_layer(pt_inp)
    _copy_subsampler_weights(pt_layer, tf_layer, layer_norm=False)
    assert_equal_numpy(
        pt_layer(pt_inp).detach().numpy(),
        tf_layer(tf_inp).numpy(), 1e-6)
def test_emb():
    """Shared-weight word embedding: TF and PyTorch agree in both modes."""
    emb_dim = 5
    vocab_size = 10
    tf_emb = WordEmbeddingSharedWeights(emb_dim, vocab_size, True)
    pt_emb = PTWordEmbeddingSharedWeights(emb_dim, vocab_size, True)
    # Random token ids (lookup mode) and random features (linear mode).
    ids_2d = numpy.random.randint(0, 9, [2, 5])
    ids_1d = numpy.random.randint(0, 9, [3])
    logits_2d = numpy.random.rand(2, 5)
    logits_3d = numpy.random.rand(2, 4, 5)
    tf_ids_2d = tf.convert_to_tensor(ids_2d, tf.int32)
    tf_ids_1d = tf.convert_to_tensor(ids_1d, tf.int32)
    tf_logits_2d = tf.convert_to_tensor(logits_2d, tf.float32)
    tf_logits_3d = tf.convert_to_tensor(logits_3d, tf.float32)
    pt_ids_2d = torch.IntTensor(ids_2d)
    pt_ids_1d = torch.IntTensor(ids_1d)
    pt_logits_2d = torch.FloatTensor(logits_2d)
    pt_logits_3d = torch.FloatTensor(logits_3d)
    # One linear-mode call builds the variables of each implementation.
    _ = tf_emb(tf_logits_2d, mode="linear")
    _ = pt_emb(pt_logits_2d, mode="linear")
    # Share TF's embedding table and output bias with the PyTorch copy.
    pt_emb._shared_weights.data = torch.Tensor(tf_emb._shared_weights.numpy())
    pt_emb._bias.data = torch.Tensor(tf_emb._bias.numpy())
    # Linear (output-projection) mode, 2-D and 3-D inputs.
    assert_equal_numpy(
        tf_emb(tf_logits_2d, mode="linear").numpy(),
        pt_emb(pt_logits_2d, mode="linear").detach().numpy())
    assert_equal_numpy(
        tf_emb(tf_logits_3d, mode="linear").numpy(),
        pt_emb(pt_logits_3d, mode="linear").detach().numpy())
    # Default (embedding lookup) mode, 2-D and 1-D id tensors.
    assert_equal_numpy(
        tf_emb(tf_ids_2d).numpy(),
        pt_emb(pt_ids_2d).detach().numpy())
    assert_equal_numpy(
        tf_emb(tf_ids_1d).numpy(),
        pt_emb(pt_ids_1d).detach().numpy())
def test_ffn():
    """Position-wise feed-forward network: PyTorch must match TF exactly."""
    sample = numpy.random.rand(3, 5)
    tf_inp = tf.convert_to_tensor(sample, tf.float32)
    pt_inp = torch.FloatTensor(sample)
    tf_ffn = TransformerFFN(7, 11, 0.1)
    tf_out = tf_ffn(tf_inp, is_training=False)
    pt_ffn = PTTransformerFFN(5, 7, 11, 0.1)
    _ = pt_ffn(pt_inp, is_training=False)  # build parameters
    # TF dense kernels are (in, out); PyTorch Linear wants (out, in).
    for pt_dense, tf_conv in ((pt_ffn._dense1, tf_ffn._conv1),
                              (pt_ffn._dense2, tf_ffn._conv2)):
        pt_dense.weight.data = torch.FloatTensor(
            tf_conv.kernel.numpy().transpose([1, 0]))
        pt_dense.bias.data = torch.FloatTensor(tf_conv.bias.numpy())
    pt_out = pt_ffn(pt_inp, is_training=False)
    assert_equal_numpy(pt_out.detach().numpy(), tf_out.numpy())
def test_multihead_dense():
    """Multi-head dense layer: TF and PyTorch agree in both transform modes."""
    num_heads = 3
    # Case 1 — non-output transform: project inputs into per-head sub-spaces.
    output_size = (6, 12)
    input_size = 6
    data = numpy.random.randn(2, 3, input_size)
    pt_inp = torch.FloatTensor(data)
    tf_inp = tf.convert_to_tensor(data, dtype=tf.float32)
    tf_non_out_layer = MultiHeadDenseLayer(output_size,
                                           num_heads,
                                           use_bias=True,
                                           is_output_transform=False,
                                           name="nonoutput_transform")
    pt_non_out_layer = PTMultiHeadDenseLayer(input_size,
                                             output_size,
                                             num_heads,
                                             use_bias=True,
                                             is_output_transform=False)
    _ = pt_non_out_layer(pt_inp)  # build parameters
    tf_out = tf_non_out_layer(tf_inp)
    pt_non_out_layer._kernel.data = torch.FloatTensor(
        tf_non_out_layer._kernel.numpy())
    pt_non_out_layer._bias.data = torch.FloatTensor(
        tf_non_out_layer._bias.numpy())
    # The non-output transform returns one tensor per output size.
    for expected, got in zip(tf_out, pt_non_out_layer(pt_inp)):
        assert_equal_numpy(expected.numpy(), got.detach().numpy())
    # Case 2 — output transform: merge per-head features back together.
    num_inputs_per_head = 5
    output_size = 6
    data = numpy.random.randn(1, 2, num_heads, num_inputs_per_head)
    tf_inp = tf.convert_to_tensor(data)
    pt_inp = torch.FloatTensor(data)
    tf_out_layer = MultiHeadDenseLayer(output_size,
                                       num_heads,
                                       use_bias=True,
                                       is_output_transform=True,
                                       name="output_transform")
    pt_out_layer = PTMultiHeadDenseLayer(num_heads * num_inputs_per_head,
                                         output_size,
                                         num_heads,
                                         use_bias=True,
                                         is_output_transform=True)
    tf_out = tf_out_layer(tf_inp)
    _ = pt_out_layer(pt_inp)  # build parameters
    pt_out_layer._kernel.data = torch.FloatTensor(tf_out_layer._kernel.numpy())
    pt_out_layer._bias.data = torch.FloatTensor(tf_out_layer._bias.numpy())
    assert_equal_numpy(tf_out.numpy(), pt_out_layer(pt_inp).detach().numpy())
def test_incremental_encode():
    # Streaming/incremental encoding of a monotonic-attention encoder must
    # reproduce the one-shot full-sequence encoder outputs.
    max_time = 5
    inputs = tf.random.normal([1, max_time, 8])
    inputs_padding = tf.convert_to_tensor([[
        0.,
        0.,
        0.,
        0.,
        0.,
    ]], dtype=tf.float32)
    encoder = TransformerEncoder(
        num_layers=2,
        hidden_size=8,
        num_attention_heads=2,
        filter_size=20,
        attention_monotonic=True,
    )
    encoder_outputs = encoder(inputs, inputs_padding, is_training=False)
    # Incremental pass over the entire sequence in one call.
    incremental_encoder_outputs, _ = encoder.incremental_encode(inputs, {},
                                                                time=0)
    assert_equal_numpy(encoder_outputs.numpy(),
                       incremental_encoder_outputs.numpy(), 1e-5)
    # Chunked passes — steps [0:2], step 2, steps [3:] — threading the cache.
    incremental_encoder_outputs0, cache = encoder.incremental_encode(
        inputs[:, :2], {}, time=0)
    # NOTE(review): inputs[:, 2] drops the time axis (rank 2) while the
    # surrounding chunks are rank-3 slices — presumably incremental_encode
    # accepts a single-step input this way; confirm against its signature.
    incremental_encoder_outputs1, cache = encoder.incremental_encode(
        inputs[:, 2], cache, time=2)
    incremental_encoder_outputs2, cache = encoder.incremental_encode(
        inputs[:, 3:], cache, time=3)
    assert_equal_numpy(
        encoder_outputs.numpy(),
        tf.concat([
            incremental_encoder_outputs0, incremental_encoder_outputs1,
            incremental_encoder_outputs2
        ], axis=1), 1e-5)
def test_position_embedding():
    # Sinusoidal position embeddings: the raw timing-signal tables and the
    # embedding wrappers must agree between TF and PyTorch.
    tf_postbl = PositionEmbeddingWrapper.add_sinusoids_timing_signal(
        tf.zeros([1, 10, 10]), None)
    pt_postbl = PTPositionEmbeddingWrapper.add_sinusoids_timing_signal(
        torch.zeros(1, 10, 10), None)
    assert_equal_numpy(tf_postbl.numpy(), pt_postbl.detach().numpy())
    emb_dim = 5
    vocab_size = 10
    tf_emb = WordEmbeddingSharedWeights(emb_dim, vocab_size, False)
    pt_emb = PTWordEmbeddingSharedWeights(emb_dim, vocab_size, False)
    # Random ids for lookup mode, random features for linear mode.
    inp_2d = numpy.random.randint(0, 9, [2, 5])
    inp_1d = numpy.random.randint(0, 9, [
        3,
    ])
    logits_2d = numpy.random.rand(2, 5)
    logits_3d = numpy.random.rand(2, 4, 5)
    tf_inp_2d = tf.convert_to_tensor(inp_2d, tf.int32)
    tf_inp_1d = tf.convert_to_tensor(inp_1d, tf.int32)
    tf_logits_2d = tf.convert_to_tensor(logits_2d, tf.float32)
    tf_logits_3d = tf.convert_to_tensor(logits_3d, tf.float32)
    pt_inp_2d = torch.IntTensor(inp_2d)
    pt_inp_1d = torch.IntTensor(inp_1d)
    pt_logits_2d = torch.FloatTensor(logits_2d)
    pt_logits_3d = torch.FloatTensor(logits_3d)
    # Build both embeddings, then share TF's table with the PyTorch copy.
    _ = tf_emb(tf_logits_2d, mode="linear")
    _ = pt_emb(pt_logits_2d, mode="linear")
    pt_emb._shared_weights.data = torch.Tensor(tf_emb._shared_weights.numpy())
    tf_posemb = PositionEmbeddingWrapper("sinusoids", tf_emb)
    pt_posemb = PTPositionEmbeddingWrapper("sinusoids", pt_emb)
    # Linear mode with 2-D and 3-D inputs.
    assert_equal_numpy(
        tf_posemb(tf_logits_2d, mode="linear").numpy(),
        pt_posemb(pt_logits_2d, mode="linear").detach().numpy())
    assert_equal_numpy(
        tf_posemb(tf_logits_3d, mode="linear").numpy(),
        pt_posemb(pt_logits_3d, mode="linear").detach().numpy())
    # Lookup mode; the 1-D case also exercises the explicit `time` offset.
    assert_equal_numpy(
        tf_posemb(tf_inp_2d).numpy(),
        pt_posemb(pt_inp_2d).detach().numpy())
    assert_equal_numpy(
        tf_posemb(tf_inp_1d, time=5).numpy(),
        pt_posemb(pt_inp_1d, time=5).detach().numpy())
def test_multiheadself_attention_under_dec():
    # Self-attention in decoder mode: a single-step query attending over a
    # pre-filled two-step key/value cache must match between TF and PyTorch.
    num_heads = 2
    num_units = 4
    dropout_rate = 0.
    output_depth = 3
    tf_att_layer = MultiHeadSelfAttention(num_heads=num_heads,
                                          num_units=num_units,
                                          output_depth=output_depth,
                                          attention_dropout_rate=dropout_rate)
    pt_att_layer = PTMultiHeadSelfAttention(input_depth=num_units,
                                            num_heads=num_heads,
                                            num_units=num_units,
                                            output_depth=output_depth,
                                            attention_dropout_rate=dropout_rate)
    query = numpy.random.rand(1, 1, num_units)
    tf_query = tf.convert_to_tensor(query, dtype=tf.float32)
    pt_query = torch.FloatTensor(query)
    # build layer
    _ = tf_att_layer(tf_query)
    _ = pt_att_layer(pt_query)
    # Mirror the TF qkv/output projection weights into the PyTorch layer.
    pt_att_layer._qkv_transform_layer._kernel.data = torch.FloatTensor(
        tf_att_layer._qkv_transform_layer._kernel.numpy())
    pt_att_layer._qkv_transform_layer._bias.data = torch.FloatTensor(
        tf_att_layer._qkv_transform_layer._bias.numpy())
    pt_att_layer._output_transform_layer._kernel.data = torch.FloatTensor(
        tf_att_layer._output_transform_layer._kernel.numpy())
    pt_att_layer._output_transform_layer._bias.data = torch.FloatTensor(
        tf_att_layer._output_transform_layer._bias.numpy())
    # Fixed cache holding two previous decoding steps; reshaped to
    # [batch, time, num_heads, num_units // num_heads] for both frameworks.
    cache = {
        "keys":
            numpy.array([[[-0.46546218, -1.0054358, 0.42906007, -1.6854379],
                          [1.078194, 1.1625745, -0.25033495, -1.980812]]]),
        "values":
            numpy.array([[[-1.2360295, 0.69050753, -1.8204833, 0.23788007],
                          [2.3751693, -1.8772833, -0.2574517, 1.3010416]]]),
    }
    tf_cache = {
        "keys": tf.reshape(
            tf.convert_to_tensor(cache["keys"], dtype=tf.float32),
            [1, 2, num_heads, num_units // num_heads]),
        "values": tf.reshape(
            tf.convert_to_tensor(cache["values"], dtype=tf.float32),
            [1, 2, num_heads, num_units // num_heads])
    }
    pt_cache = {
        "keys": torch.reshape(
            torch.FloatTensor(cache["keys"]),
            [1, 2, num_heads, num_units // num_heads]),
        "values": torch.reshape(
            torch.FloatTensor(cache["values"]),
            [1, 2, num_heads, num_units // num_heads])
    }
    assert_equal_numpy(
        tf_att_layer(tf_query, cache=tf_cache, is_training=False).numpy(),
        pt_att_layer(pt_query, cache=pt_cache,
                     is_training=False).detach().numpy())
def test_prepost():
    """Pre/post-processing wrapper around an identity layer: TF vs. PyTorch."""

    def layer(x, *args, **kwargs):
        # Identity stand-in: ignore extra arguments, return input unchanged.
        _ = args
        _ = kwargs
        return x

    tf_prepost_layer = PrePostProcessingWrapper(layer,
                                                dropout_rate=0.1,
                                                name="lpp")
    pt_prepost_layer = PTPrePostProcessingWrapper(layer,
                                                  norm_shape=3,
                                                  dropout_rate=0.1)
    sample = numpy.array([[1, 2, 3.]])
    tf_inp = tf.convert_to_tensor(sample, tf.float32)
    pt_inp = torch.FloatTensor(sample)
    tf_out = tf_prepost_layer(tf_inp, is_training=False)
    _ = pt_prepost_layer(pt_inp, is_training=False)  # build parameters
    # Mirror the TF layer-norm gamma/beta into the PyTorch norm layer.
    pt_prepost_layer._norm_layer.weight.data = torch.FloatTensor(
        tf_prepost_layer._norm_layer.gamma.numpy())
    pt_prepost_layer._norm_layer.bias.data = torch.FloatTensor(
        tf_prepost_layer._norm_layer.beta.numpy())
    assert_equal_numpy(
        tf_out.numpy(),
        pt_prepost_layer(pt_inp, is_training=False).detach().numpy())
def test_transformer_decoder_prenorm():
    # One-layer pre-norm Transformer decoder: the PyTorch port must reproduce
    # the TF reference both in full-sequence mode and in single-step inference
    # mode (including the self-attention decoding caches it leaves behind).
    dmodel = 4
    num_layers = 1
    num_self_attention_heads = 2
    hidden_size = dmodel
    filter_size = 16
    self_attention_dropout_rate = 0.1
    ffn_dropout_rate = 0.1
    layer_postprocess_dropout_rate = 0.1
    tf_decoder = TFTransformerDecoder(
        num_layers=num_layers,
        num_attention_heads=num_self_attention_heads,
        hidden_size=hidden_size,
        filter_size=filter_size,
        attention_dropout_rate=self_attention_dropout_rate,
        ffn_dropout_rate=ffn_dropout_rate,
        layer_postprocess_dropout_rate=layer_postprocess_dropout_rate)
    pt_decoder = TransformerDecoder(
        num_layers=num_layers,
        num_attention_heads=num_self_attention_heads,
        hidden_size=hidden_size,
        filter_size=filter_size,
        attention_dropout_rate=self_attention_dropout_rate,
        ffn_dropout_rate=ffn_dropout_rate,
        layer_postprocess_dropout_rate=layer_postprocess_dropout_rate)
    # Fixed encoder outputs (batch=2, len=4, dim=4) and decoder inputs so the
    # comparison is deterministic.
    inputs = [[[-0.37282175, 0.62301564, -2.0221813, -0.00875833],
               [0.31516594, -1.117763, -1.0697726, 0.80373234],
               [-0.717022, 0.3300997, -0.44306225, 1.550383],
               [-1.5516962, 0.6025011, 1.8262954, 0.42469704]],
              [[-0.98617625, 2.2856202, -1.3063533, 0.4174998],
               [1.5724765, 1.2201295, 1.1479746, 0.7810888],
               [0.8343642, -1.073388, 1.2718492, -0.7290778],
               [-1.4126722, 1.8000795, -2.118672, -0.1366007]]]
    input_padding = [[0, 0, 0, 0], [0, 0, 1., 1.]]
    decoder_input = [
        [[8.6675537e-01, 2.2135425e-01, 1.4054185e+00, -4.2268831e-01],
         [1.9606155e+00, -1.8318410e+00, -1.8158482e+00, -3.7030798e-01],
         [-1.1357157e-03, 5.5629879e-01, 6.6107117e-02, -1.7330967e+00]],
        [[-1.1870812e+00, -5.4499257e-01, -8.6622888e-01, -7.4098641e-01],
         [2.2233427e-01, 5.3582352e-01, 3.0567116e-01, 1.0201423e-01],
         [-1.8053315e+00, 7.2125041e-01, 1.0072237e+00, -2.0333264e+00]]
    ]
    tf_inp = tf.convert_to_tensor(inputs, dtype=tf.float32)
    pt_inp = torch.FloatTensor(inputs)
    tf_inppad = tf.convert_to_tensor(input_padding, dtype=tf.float32)
    pt_inppad = torch.FloatTensor(input_padding)
    tf_decinp = tf.convert_to_tensor(decoder_input, dtype=tf.float32)
    pt_decinp = torch.FloatTensor(decoder_input)
    # One forward pass per decoder to build variables/parameters.
    tf_cache = tf_decoder.create_decoding_internal_cache(tf_inp,
                                                         tf_inppad,
                                                         is_inference=False)
    _ = tf_decoder(tf_decinp, tf_cache, is_training=False)
    pt_cache = pt_decoder.create_decoding_internal_cache(pt_inp,
                                                         pt_inppad,
                                                         is_inference=False)
    _ = pt_decoder(pt_decinp, pt_cache, is_training=False)
    # Copy every TF weight into the PyTorch decoder. PyTorch layer layout:
    # _stacking_layers[0][0] = self-attention, [0][1] = cross-attention,
    # [0][2] = FFN, each wrapped with its own pre-norm layer.
    pt_decoder._output_norm_layer.weight.data = torch.FloatTensor(
        tf_decoder._output_norm_layer.gamma.numpy())
    pt_decoder._output_norm_layer.bias.data = torch.FloatTensor(
        tf_decoder._output_norm_layer.beta.numpy())
    pt_decoder._stacking_layers[0][
        0]._layer._qkv_transform_layer._kernel.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._selfatt_layer._layer.
            _qkv_transform_layer._kernel.numpy())
    pt_decoder._stacking_layers[0][
        0]._layer._qkv_transform_layer._bias.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._selfatt_layer._layer.
            _qkv_transform_layer._bias.numpy())
    pt_decoder._stacking_layers[0][
        0]._layer._output_transform_layer._kernel.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._selfatt_layer._layer.
            _output_transform_layer._kernel.numpy())
    pt_decoder._stacking_layers[0][
        0]._layer._output_transform_layer._bias.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._selfatt_layer._layer.
            _output_transform_layer._bias.numpy())
    pt_decoder._stacking_layers[0][
        0]._norm_layer.weight.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._selfatt_layer._norm_layer.gamma.
            numpy())
    pt_decoder._stacking_layers[0][0]._norm_layer.bias.data = torch.FloatTensor(
        tf_decoder._stacking_layers[0]._selfatt_layer._norm_layer.beta.numpy())
    # Cross-attention projections (separate q and kv transforms).
    pt_decoder._stacking_layers[0][
        1]._layer._q_transform_layer._kernel.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._crossatt_layer._layer.
            _q_transform_layer._kernel.numpy())
    pt_decoder._stacking_layers[0][
        1]._layer._q_transform_layer._bias.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._crossatt_layer._layer.
            _q_transform_layer._bias.numpy())
    pt_decoder._stacking_layers[0][
        1]._layer._kv_transform_layer._kernel.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._crossatt_layer._layer.
            _kv_transform_layer._kernel.numpy())
    pt_decoder._stacking_layers[0][
        1]._layer._kv_transform_layer._bias.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._crossatt_layer._layer.
            _kv_transform_layer._bias.numpy())
    pt_decoder._stacking_layers[0][
        1]._layer._output_transform_layer._kernel.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._crossatt_layer._layer.
            _output_transform_layer._kernel.numpy())
    pt_decoder._stacking_layers[0][
        1]._layer._output_transform_layer._bias.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._crossatt_layer._layer.
            _output_transform_layer._bias.numpy())
    pt_decoder._stacking_layers[0][
        1]._norm_layer.weight.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._crossatt_layer._norm_layer.gamma.
            numpy())
    pt_decoder._stacking_layers[0][
        1]._norm_layer.bias.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._crossatt_layer._norm_layer.beta.
            numpy())
    # FFN weights; TF (in, out) kernels are transposed for PyTorch Linear.
    pt_decoder._stacking_layers[0][
        2]._layer._dense1.weight.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._ffn_layer._layer._conv1.kernel.
            numpy().transpose([1, 0]))
    pt_decoder._stacking_layers[0][
        2]._layer._dense1.bias.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._ffn_layer._layer._conv1.bias.numpy(
            ))
    pt_decoder._stacking_layers[0][
        2]._layer._dense2.weight.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._ffn_layer._layer._conv2.kernel.
            numpy().transpose([1, 0]))
    pt_decoder._stacking_layers[0][
        2]._layer._dense2.bias.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._ffn_layer._layer._conv2.bias.numpy(
            ))
    pt_decoder._stacking_layers[0][
        2]._norm_layer.weight.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._ffn_layer._norm_layer.gamma.numpy(
            ))
    pt_decoder._stacking_layers[0][
        2]._norm_layer.bias.data = torch.FloatTensor(
            tf_decoder._stacking_layers[0]._ffn_layer._norm_layer.beta.numpy())
    # Full-sequence decoding must agree.
    assert_equal_numpy(
        tf_decoder(tf_decinp, tf_cache, is_training=False).numpy(),
        pt_decoder(pt_decinp, pt_cache, is_training=False).detach().numpy(),
        5e-5)
    # for inference
    tf_cache = tf_decoder.create_decoding_internal_cache(tf_inp,
                                                         tf_inppad,
                                                         is_inference=True)
    pt_cache = pt_decoder.create_decoding_internal_cache(pt_inp,
                                                         pt_inppad,
                                                         is_inference=True)
    decoder_input = [[
        1.9606155e+00, -1.8318410e+00, -1.8158482e+00, -3.7030798e-01
    ], [-1.1357157e-03, 5.5629879e-01, 6.6107117e-02, -1.7330967e+00]]
    tf_decinp = tf.convert_to_tensor(decoder_input, tf.float32)
    pt_decinp = torch.FloatTensor(decoder_input)
    # Single-step decoding must agree ...
    assert_equal_numpy(
        tf_decoder(tf_decinp, tf_cache, is_training=False).numpy(),
        pt_decoder(pt_decinp, pt_cache, is_training=False).detach().numpy(),
        5e-5)
    # ... and must leave identical self-attention key/value caches behind.
    assert_equal_numpy(
        tf_cache["decoding_states"]["layer_0"]["self_attention"]
        ["keys"].numpy(), pt_cache["decoding_states"]["layer_0"]
        ["self_attention"]["keys"].detach().numpy(), 5e-5)
    assert_equal_numpy(
        tf_cache["decoding_states"]["layer_0"]["self_attention"]
        ["values"].numpy(), pt_cache["decoding_states"]["layer_0"]
        ["self_attention"]["values"].detach().numpy(), 5e-5)
def test_transformer_encoder_prenorm():
    # batch_size = 2
    # max_len = 4
    # One-layer pre-norm Transformer encoder: the PyTorch port must reproduce
    # the TF reference on a fixed batch after weight mirroring.
    dmodel = 4
    num_layers = 1
    num_self_attention_heads = 2
    hidden_size = dmodel
    filter_size = 16
    self_attention_dropout_rate = 0.1
    ffn_dropout_rate = 0.1
    layer_postprocess_dropout_rate = 0.1
    tf_encoder = TFTransformerEncoder(
        num_layers=num_layers,
        num_attention_heads=num_self_attention_heads,
        hidden_size=hidden_size,
        filter_size=filter_size,
        attention_dropout_rate=self_attention_dropout_rate,
        ffn_dropout_rate=ffn_dropout_rate,
        layer_postprocess_dropout_rate=layer_postprocess_dropout_rate)
    pt_encoder = TransformerEncoder(
        num_layers=num_layers,
        num_attention_heads=num_self_attention_heads,
        hidden_size=hidden_size,
        filter_size=filter_size,
        attention_dropout_rate=self_attention_dropout_rate,
        ffn_dropout_rate=ffn_dropout_rate,
        layer_postprocess_dropout_rate=layer_postprocess_dropout_rate)
    inputs = [[[-0.37282175, 0.62301564, -2.0221813, -0.00875833],
               [0.31516594, -1.117763, -1.0697726, 0.80373234],
               [-0.717022, 0.3300997, -0.44306225, 1.550383],
               [-1.5516962, 0.6025011, 1.8262954, 0.42469704]],
              [[-0.98617625, 2.2856202, -1.3063533, 0.4174998],
               [1.5724765, 1.2201295, 1.1479746, 0.7810888],
               [0.8343642, -1.073388, 1.2718492, -0.7290778],
               [-1.4126722, 1.8000795, -2.118672, -0.1366007]]]
    input_padding = [[0, 0, 0, 0], [0, 0, 1., 1.]]
    tf_inp = tf.convert_to_tensor(inputs, dtype=tf.float32)
    pt_inp = torch.FloatTensor(inputs)
    tf_inppad = tf.convert_to_tensor(input_padding, dtype=tf.float32)
    pt_inppad = torch.FloatTensor(input_padding)
    # Build variables/parameters with one forward pass each.
    _ = tf_encoder(tf_inp, tf_inppad, is_training=False)
    _ = pt_encoder(pt_inp, pt_inppad, is_training=False)
    # Copy TF weights into the PyTorch encoder. Layout on both sides:
    # _stacking_layers[0][0] = self-attention wrapper, [0][1] = FFN wrapper.
    pt_encoder._output_norm_layer.weight.data = torch.FloatTensor(
        tf_encoder._output_norm_layer.gamma.numpy())
    pt_encoder._output_norm_layer.bias.data = torch.FloatTensor(
        tf_encoder._output_norm_layer.beta.numpy())
    pt_encoder._stacking_layers[0][
        0]._layer._qkv_transform_layer._kernel.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0]
            [0]._layer._qkv_transform_layer._kernel.numpy())
    pt_encoder._stacking_layers[0][
        0]._layer._qkv_transform_layer._bias.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0]
            [0]._layer._qkv_transform_layer._bias.numpy())
    pt_encoder._stacking_layers[0][
        0]._layer._output_transform_layer._kernel.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0]
            [0]._layer._output_transform_layer._kernel.numpy())
    pt_encoder._stacking_layers[0][
        0]._layer._output_transform_layer._bias.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0]
            [0]._layer._output_transform_layer._bias.numpy())
    # FFN weights; TF (in, out) kernels are transposed for PyTorch Linear.
    pt_encoder._stacking_layers[0][
        1]._layer._dense1.weight.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0]
            [1]._layer._conv1.kernel.numpy().transpose([1, 0]))
    pt_encoder._stacking_layers[0][
        1]._layer._dense1.bias.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0][1]._layer._conv1.bias.numpy())
    pt_encoder._stacking_layers[0][
        1]._layer._dense2.weight.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0]
            [1]._layer._conv2.kernel.numpy().transpose([1, 0]))
    pt_encoder._stacking_layers[0][
        1]._layer._dense2.bias.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0][1]._layer._conv2.bias.numpy())
    # Pre-norm gamma/beta for both sub-layers.
    pt_encoder._stacking_layers[0][
        0]._norm_layer.weight.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0][0]._norm_layer.gamma.numpy())
    pt_encoder._stacking_layers[0][
        0]._norm_layer.bias.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0][0]._norm_layer.beta.numpy())
    pt_encoder._stacking_layers[0][
        1]._norm_layer.weight.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0][1]._norm_layer.gamma.numpy())
    pt_encoder._stacking_layers[0][
        1]._norm_layer.bias.data = torch.FloatTensor(
            tf_encoder._stacking_layers[0][1]._norm_layer.beta.numpy())
    assert_equal_numpy(
        tf_encoder(tf_inp, tf_inppad, is_training=False).numpy(),
        pt_encoder(pt_inp, pt_inppad, is_training=False).detach().numpy(),
        5e-6)
def test_transformer_encoder():
    # batch_size = 2
    # max_len = 4
    # Regression test: with all four trainable kernels pinned to recorded
    # constants, the 1-layer TransformerEncoder must reproduce a golden
    # output tensor exactly.
    dmodel = 4
    num_layers = 1
    num_self_attention_heads = 2
    hidden_size = dmodel
    filter_size = 16
    self_attention_dropout_rate = 0.1
    ffn_dropout_rate = 0.1
    layer_postprocess_dropout_rate = 0.1
    encoder = TransformerEncoder(
        num_layers=num_layers,
        num_attention_heads=num_self_attention_heads,
        hidden_size=hidden_size,
        filter_size=filter_size,
        attention_dropout_rate=self_attention_dropout_rate,
        ffn_dropout_rate=ffn_dropout_rate,
        layer_postprocess_dropout_rate=layer_postprocess_dropout_rate)
    inputs = tf.convert_to_tensor(
        [[[-0.37282175, 0.62301564, -2.0221813, -0.00875833],
          [0.31516594, -1.117763, -1.0697726, 0.80373234],
          [-0.717022, 0.3300997, -0.44306225, 1.550383],
          [-1.5516962, 0.6025011, 1.8262954, 0.42469704]],
         [[-0.98617625, 2.2856202, -1.3063533, 0.4174998],
          [1.5724765, 1.2201295, 1.1479746, 0.7810888],
          [0.8343642, -1.073388, 1.2718492, -0.7290778],
          [-1.4126722, 1.8000795, -2.118672, -0.1366007]]],
        dtype=tf.float32)
    input_padding = tf.convert_to_tensor([[0, 0, 0, 0], [0, 0, 1., 1.]],
                                         dtype=tf.float32)
    # Build variables, then overwrite the kernels (matched by variable name)
    # so the output no longer depends on random initialization.
    _ = encoder(inputs, input_padding, is_training=False)
    for w in encoder.trainable_weights:
        if "layer_0/self_attention_prepost_wrapper/self_attention/output_transform/kernel" in w.name:
            tf.compat.v1.assign(
                w,
                tf.convert_to_tensor(
                    [[-0.04742211, -0.42928827, -0.54485893, -0.7514334],
                     [0.3391741, 0.61141425, -0.23809844, 0.27043575],
                     [-0.7315594, 0.8002729, -0.2958873, 0.698168],
                     [-0.59683925, -0.38270262, -0.59893274, -0.4040773]],
                    dtype=tf.float32))
        elif "layer_0/self_attention_prepost_wrapper/self_attention/qkv_transform/kernel" in w.name:
            tf.compat.v1.assign(
                w,
                tf.convert_to_tensor(
                    [[
                        0.5907243, -0.5555184, 0.5612393, -0.2724994,
                        0.23405826, 0.38096863, -0.02200276, -0.26264596,
                        0.36556423, 0.10351193, -0.1946517, 0.60423344
                    ],
                     [
                         0.16057128, -0.4464733, 0.32041794, -0.30858415,
                         0.26626736, 0.579398, -0.19076341, 0.1072132,
                         -0.43820834, 0.05253071, 0.08801651, -0.4995584
                     ],
                     [
                         -0.48593724, 0.1275987, 0.15794194, -0.4632662,
                         0.54038125, -0.45666856, -0.16076824, 0.43855423,
                         0.32468224, -0.1931965, -0.42853987, 0.2411524
                     ],
                     [
                         -0.32923162, -0.06395793, 0.33392805, -0.46701026,
                         -0.06507087, -0.61020637, 0.545703, -0.23786944,
                         -0.2854141, -0.1698403, -0.1244911, 0.40745395
                     ]],
                    dtype=tf.float32))
        elif "layer_0/ffn_prepost_wrapper/ffn/dense1/kernel" in w.name:
            tf.compat.v1.assign(
                w,
                tf.convert_to_tensor(
                    [[
                        -0.14616564, 0.30248666, 0.5319947, 0.5002098,
                        0.2705282, -0.21612385, -0.3336154, 0.03436899,
                        0.26958936, 0.26834202, 0.0843057, -0.50728637,
                        0.19995207, -0.3930181, -0.4985036, 0.33232063
                    ],
                     [
                         -0.04522616, -0.20491397, -0.19712418, 0.18106508,
                         0.33636385, 0.4030161, -0.30252987, 0.11853886,
                         0.2238034, 0.3744824, -0.28127617, -0.03388816,
                         0.32239246, -0.25639355, 0.02382994, 0.34818083
                     ],
                     [
                         0.4456296, -0.48834273, -0.26576972, 0.28717202,
                         0.02354515, -0.2434513, -0.26277977, -0.05434859,
                         0.09830189, 0.08207488, -0.28704825, -0.19418713,
                         0.47731507, 0.14538354, -0.3832153, -0.5143249
                     ],
                     [
                         0.33276683, -0.248025, -0.13612089, -0.15473047,
                         0.33012676, -0.39191568, -0.32679468, 0.52579904,
                         -0.17942387, -0.39317977, 0.13891649, -0.17397407,
                         -0.19002154, 0.05117792, 0.34706026, 0.11179692
                     ]],
                    dtype=tf.float32))
        elif "layer_0/ffn_prepost_wrapper/ffn/dense2/kernel" in w.name:
            tf.compat.v1.assign(
                w,
                tf.convert_to_tensor(
                    [[0.18234771, 0.23902518, 0.4304248, -0.05616844],
                     [-0.01435661, 0.11098373, 0.5370636, -0.5271752],
                     [-0.3239155, 0.5083337, 0.43396413, -0.47642848],
                     [0.31562793, -0.04991594, 0.530545, -0.51263183],
                     [0.10357869, 0.2883237, 0.16929054, 0.18414849],
                     [-0.30361128, -0.2045235, 0.05544132, 0.22116774],
                     [0.05548936, -0.11504656, 0.13726586, -0.13652831],
                     [0.5011635, 0.45315623, -0.35243145, 0.17173672],
                     [-0.52015716, 0.42873853, -0.09965438, -0.45107275],
                     [0.00233686, 0.2797522, 0.2702785, 0.33721972],
                     [0.10216439, -0.14768293, -0.5122431, -0.3882924],
                     [-0.44032216, -0.09983957, -0.41019306, -0.26434696],
                     [0.50977015, -0.18238857, 0.54663074, 0.05787665],
                     [0.3197481, -0.45845133, -0.14075449, -0.33339915],
                     [0.10717738, 0.28995162, 0.47179937, 0.01342988],
                     [0.37111026, -0.31352338, 0.37098122, 0.3895113]],
                    dtype=tf.float32))
    # Golden output recorded from a known-good run with the pinned weights.
    assert_equal_numpy(
        encoder(inputs, input_padding, is_training=False).numpy(),
        numpy.array([[[-0.2709918, 0.95230484, -1.5212451, 0.83993214],
                      [0.7688386, -0.69726187, -1.2441225, 1.1725458],
                      [-1.1408244, 0.57164305, -0.76654106, 1.3357224],
                      [-1.5286305, 0.23827001, 1.267273, 0.02308742]],
                     [[-1.0156152, 1.4036102, -0.8733843, 0.48538923],
                      [-0.60578734, 0.23574206, 1.5095922, -1.1395471],
                      [0.53838307, -0.7913252, 1.3617758, -1.1088338],
                      [-0.8927619, 1.3975127, -1.001557, 0.49680638]]]))
def test_st():
    """Parity test: copy TF speech_transformer weights into the PT model and
    check both produce (numerically) identical outputs on the same fake batch.

    Builds a 1-layer-encoder / 1-layer-decoder toy speech transformer in both
    frameworks, mirrors every TF variable into the PT model (handling the
    kernel-layout differences between the frameworks), then compares the full
    forward passes within a 5e-6 tolerance.
    """

    def copy_transform(pt_layer, tf_layer):
        # Attention projection layers share the same kernel layout in TF/PT,
        # so no transpose is needed.
        pt_layer._kernel.data = torch.FloatTensor(tf_layer._kernel.numpy())
        pt_layer._bias.data = torch.FloatTensor(tf_layer._bias.numpy())

    def copy_dense(pt_layer, tf_layer):
        # TF dense/conv1d kernels are (in, out); PT linear weights are (out, in).
        pt_layer.weight.data = torch.FloatTensor(
            tf_layer.kernel.numpy().transpose([1, 0]))
        pt_layer.bias.data = torch.FloatTensor(tf_layer.bias.numpy())

    def copy_norm(pt_layer, tf_layer):
        # TF LayerNorm stores gamma/beta; PT stores weight/bias.
        pt_layer.weight.data = torch.FloatTensor(tf_layer.gamma.numpy())
        pt_layer.bias.data = torch.FloatTensor(tf_layer.beta.numpy())

    def copy_selfatt(pt_att, tf_att):
        copy_transform(pt_att._qkv_transform_layer, tf_att._qkv_transform_layer)
        copy_transform(pt_att._output_transform_layer,
                       tf_att._output_transform_layer)

    def copy_crossatt(pt_att, tf_att):
        copy_transform(pt_att._q_transform_layer, tf_att._q_transform_layer)
        copy_transform(pt_att._kv_transform_layer, tf_att._kv_transform_layer)
        copy_transform(pt_att._output_transform_layer,
                       tf_att._output_transform_layer)

    def copy_ffn(pt_ffn, tf_ffn):
        copy_dense(pt_ffn._dense1, tf_ffn._conv1)
        copy_dense(pt_ffn._dense2, tf_ffn._conv2)

    params = copy.deepcopy(
        get_hyper_parameters("speech_transformer_toy")["model.params"])
    # Strip modality overrides and shrink to a single layer per side so the
    # comparison stays fast and easy to debug.
    for key in ("modality.source.dim", "modality.target.dim",
                "modality.source.timing", "modality.target.timing"):
        params[key] = None
    params["encoder.num_layers"] = 1
    params["decoder.num_layers"] = 1
    src_vocab_meta = dict(audio_feature_dim=80, audio_feature_channels=1)
    trg_vocab_meta = dict(vocab_size=5, eos_id=4, bos_id=3, unk_id=2)
    fake_audio = numpy.random.rand(1, 11, 80, 1)
    pt_inps = {
        "src": torch.FloatTensor(fake_audio),
        "src_length": torch.LongTensor([11]),
        "trg_input": torch.LongTensor([[3, 0, 1]]),
    }
    tf_inps = {
        "src": tf.convert_to_tensor(fake_audio, tf.float32),
        "src_length": tf.convert_to_tensor([11], tf.int32),
        "trg_input": tf.convert_to_tensor([[3, 0, 1]], tf.int32),
    }
    pt_model: SpeechTransformer = build_model(
        {
            "model.class": "speech_transformer",
            "params": params
        },
        src_meta=src_vocab_meta,
        trg_meta=trg_vocab_meta)
    tf_model: TFSpeechTransformer = build_tf_model(
        {
            "model.class": "speech_transformer",
            "params": params
        },
        src_meta=src_vocab_meta,
        trg_meta=trg_vocab_meta)

    # Source modality: conv front-end + per-conv layer norms + output dense.
    pt_emb = pt_model._src_modality.embedding_layer
    tf_emb = tf_model._src_modality.embedding_layer
    for pt_conv, tf_conv in ((pt_emb._conv_layer1, tf_emb._conv_layers[0]),
                             (pt_emb._conv_layer2, tf_emb._conv_layers[1])):
        # TF conv kernels are HWIO; PT conv weights are OIHW.
        pt_conv.weight.data = torch.FloatTensor(
            tf_conv.kernel.numpy().transpose((3, 2, 0, 1)))
        pt_conv.bias.data = torch.FloatTensor(tf_conv.bias.numpy())
    copy_norm(pt_emb._norm_layer1, tf_emb._norm_layers[0])
    copy_norm(pt_emb._norm_layer2, tf_emb._norm_layers[1])
    copy_dense(pt_emb._dense_layer, tf_emb._dense_layer)

    # Target modality: shared embedding table and output softmax bias.
    pt_model._trg_modality.embedding_layer._shared_weights.data = torch.FloatTensor(
        tf_model._trg_modality.embedding_layer._shared_weights.numpy())
    pt_model._trg_modality.embedding_layer._bias.data = torch.FloatTensor(
        tf_model._trg_modality.embedding_layer._bias.numpy())

    # Encoder (1 layer): self-attention then FFN, each with its own norm.
    pt_enc_layer = pt_model._encoder._stacking_layers[0]
    tf_enc_layer = tf_model._encoder._stacking_layers[0]
    copy_norm(pt_model._encoder._output_norm_layer,
              tf_model._encoder._output_norm_layer)
    copy_selfatt(pt_enc_layer[0]._layer, tf_enc_layer._selfatt_layer._layer)
    copy_norm(pt_enc_layer[0]._norm_layer, tf_enc_layer._selfatt_layer._norm_layer)
    copy_ffn(pt_enc_layer[1]._layer, tf_enc_layer._ffn_layer._layer)
    copy_norm(pt_enc_layer[1]._norm_layer, tf_enc_layer._ffn_layer._norm_layer)

    # Decoder (1 layer): self-attention, cross-attention, then FFN.
    pt_dec_layer = pt_model._decoder._stacking_layers[0]
    tf_dec_layer = tf_model._decoder._stacking_layers[0]
    copy_norm(pt_model._decoder._output_norm_layer,
              tf_model._decoder._output_norm_layer)
    copy_selfatt(pt_dec_layer[0]._layer, tf_dec_layer._selfatt_layer._layer)
    copy_norm(pt_dec_layer[0]._norm_layer, tf_dec_layer._selfatt_layer._norm_layer)
    copy_crossatt(pt_dec_layer[1]._layer, tf_dec_layer._crossatt_layer._layer)
    copy_norm(pt_dec_layer[1]._norm_layer, tf_dec_layer._crossatt_layer._norm_layer)
    copy_ffn(pt_dec_layer[2]._layer, tf_dec_layer._ffn_layer._layer)
    copy_norm(pt_dec_layer[2]._norm_layer, tf_dec_layer._ffn_layer._norm_layer)

    assert_equal_numpy(
        tf_model(tf_inps, is_training=False).numpy(),
        pt_model(pt_inps, is_training=False).detach().numpy(), 5e-6)
def test_lower_triangle_attention_bias():
    """The TF and PT causal (lower-triangle) attention biases must agree."""
    tf_bias = lower_triangle_attention_bias(5)
    pt_bias = pt_lower_triangle_attention_bias(5)
    assert_equal_numpy(tf_bias.numpy(), pt_bias.detach().numpy())
def test_seq2seq():
    """Parity test: copy TF transformer weights into the PT model and check
    both produce (numerically) identical outputs on the same fake batch.

    Builds a 1-layer-encoder / 1-layer-decoder toy transformer in both
    frameworks, mirrors every TF variable into the PT model (handling the
    kernel-layout differences between the frameworks), then compares the full
    forward passes within a 5e-6 tolerance.
    """

    def copy_transform(pt_layer, tf_layer):
        # Attention projection layers share the same kernel layout in TF/PT,
        # so no transpose is needed.
        pt_layer._kernel.data = torch.FloatTensor(tf_layer._kernel.numpy())
        pt_layer._bias.data = torch.FloatTensor(tf_layer._bias.numpy())

    def copy_dense(pt_layer, tf_layer):
        # TF dense/conv1d kernels are (in, out); PT linear weights are (out, in).
        pt_layer.weight.data = torch.FloatTensor(
            tf_layer.kernel.numpy().transpose([1, 0]))
        pt_layer.bias.data = torch.FloatTensor(tf_layer.bias.numpy())

    def copy_norm(pt_layer, tf_layer):
        # TF LayerNorm stores gamma/beta; PT stores weight/bias.
        pt_layer.weight.data = torch.FloatTensor(tf_layer.gamma.numpy())
        pt_layer.bias.data = torch.FloatTensor(tf_layer.beta.numpy())

    def copy_selfatt(pt_att, tf_att):
        copy_transform(pt_att._qkv_transform_layer, tf_att._qkv_transform_layer)
        copy_transform(pt_att._output_transform_layer,
                       tf_att._output_transform_layer)

    def copy_crossatt(pt_att, tf_att):
        copy_transform(pt_att._q_transform_layer, tf_att._q_transform_layer)
        copy_transform(pt_att._kv_transform_layer, tf_att._kv_transform_layer)
        copy_transform(pt_att._output_transform_layer,
                       tf_att._output_transform_layer)

    def copy_ffn(pt_ffn, tf_ffn):
        copy_dense(pt_ffn._dense1, tf_ffn._conv1)
        copy_dense(pt_ffn._dense2, tf_ffn._conv2)

    params = copy.deepcopy(
        get_hyper_parameters("transformer_toy")["model.params"])
    # Strip modality overrides and shrink to a single layer per side so the
    # comparison stays fast and easy to debug.
    for key in ("modality.source.dim", "modality.target.dim",
                "modality.source.timing", "modality.target.timing"):
        params[key] = None
    params["encoder.num_layers"] = 1
    params["decoder.num_layers"] = 1
    src_vocab_meta = dict(vocab_size=8, eos_id=7, bos_id=6, unk_id=5)
    trg_vocab_meta = dict(vocab_size=5, eos_id=4, bos_id=3, unk_id=2)
    pt_inps = {
        "src": torch.LongTensor([[0, 1, 1, 7], [1, 7, 7, 7]]),
        "src_padding": torch.FloatTensor([[0, 0, 0, 0.], [0, 0, 1, 1.]]),
        "trg_input": torch.LongTensor([[3, 0, 1], [3, 2, 4]]),
        "trg": torch.LongTensor([[0, 1, 4], [2, 4, 4]]),
        "trg_padding": torch.FloatTensor([[0, 0, 0.], [0, 0, 1.]]),
    }
    tf_inps = {
        "src": tf.convert_to_tensor([[0, 1, 1, 7], [1, 7, 7, 7]], tf.int64),
        "src_padding": tf.convert_to_tensor([[0, 0, 0, 0.], [0, 0, 1, 1.]],
                                            tf.float32),
        "trg_input": tf.convert_to_tensor([[3, 0, 1], [3, 2, 4]], tf.int32),
        "trg": tf.convert_to_tensor([[0, 1, 4], [2, 4, 4]], tf.int32),
        "trg_padding": tf.convert_to_tensor([[0, 0, 0.], [0, 0, 1.]],
                                            tf.float32),
    }
    pt_model: Transformer = build_pt_model(
        {
            "model.class": "transformer",
            "params": params
        },
        src_meta=src_vocab_meta,
        trg_meta=trg_vocab_meta)
    tf_model: TFTransformer = build_model(
        {
            "model.class": "transformer",
            "params": params
        },
        src_meta=src_vocab_meta,
        trg_meta=trg_vocab_meta)

    # Embedding tables and the target-side output softmax bias.
    pt_model._src_modality.embedding_layer._shared_weights.data = torch.FloatTensor(
        tf_model._src_modality.embedding_layer._shared_weights.numpy())
    pt_model._trg_modality.embedding_layer._shared_weights.data = torch.FloatTensor(
        tf_model._trg_modality.embedding_layer._shared_weights.numpy())
    pt_model._trg_modality.embedding_layer._bias.data = torch.FloatTensor(
        tf_model._trg_modality.embedding_layer._bias.numpy())

    # Encoder (1 layer): self-attention then FFN, each with its own norm.
    pt_enc_layer = pt_model._encoder._stacking_layers[0]
    tf_enc_layer = tf_model._encoder._stacking_layers[0]
    copy_norm(pt_model._encoder._output_norm_layer,
              tf_model._encoder._output_norm_layer)
    copy_selfatt(pt_enc_layer[0]._layer, tf_enc_layer[0]._layer)
    copy_norm(pt_enc_layer[0]._norm_layer, tf_enc_layer[0]._norm_layer)
    copy_ffn(pt_enc_layer[1]._layer, tf_enc_layer[1]._layer)
    copy_norm(pt_enc_layer[1]._norm_layer, tf_enc_layer[1]._norm_layer)

    # Decoder (1 layer): self-attention, cross-attention, then FFN.
    pt_dec_layer = pt_model._decoder._stacking_layers[0]
    tf_dec_layer = tf_model._decoder._stacking_layers[0]
    copy_norm(pt_model._decoder._output_norm_layer,
              tf_model._decoder._output_norm_layer)
    copy_selfatt(pt_dec_layer[0]._layer, tf_dec_layer[0]._layer)
    copy_norm(pt_dec_layer[0]._norm_layer, tf_dec_layer[0]._norm_layer)
    copy_crossatt(pt_dec_layer[1]._layer, tf_dec_layer[1]._layer)
    copy_norm(pt_dec_layer[1]._norm_layer, tf_dec_layer[1]._norm_layer)
    copy_ffn(pt_dec_layer[2]._layer, tf_dec_layer[2]._layer)
    copy_norm(pt_dec_layer[2]._norm_layer, tf_dec_layer[2]._norm_layer)

    assert_equal_numpy(
        tf_model(tf_inps, is_training=False).numpy(),
        pt_model(pt_inps, is_training=False).detach().numpy(), 5e-6)