def build_modalities(cls, model_args, src_meta, trg_meta):
    """ Creates the source and target modality layers. """
    # if modality.source.dim is not defined, then use modality.dim as default
    src_dim = model_args["modality.source.dim"] or model_args["modality.dim"]
    # if modality.target.dim is not defined, then use modality.dim as default
    trg_dim = model_args["modality.target.dim"] or model_args["modality.dim"]
    input_name = "input_audio_modality"
    target_name = "target_symbol_modality"
    # creates the target embedding table
    target_modality = cls.build_modality(
        vocab_size=trg_meta["vocab_size"],
        emb_dim=trg_dim,
        name=target_name,
        timing=(model_args["modality.target.timing"] or model_args["modality.timing"]),
        share_embedding_and_softmax_weights=model_args["modality.share_embedding_and_softmax_weights"])
    # creates the source audio modality (2D convolutional subsampling)
    input_modality = AudioConv2dSubsamplingLayer(
        embedding_dim=src_dim,
        kernel_size=model_args["modality.source.kernel_size"],
        strides=model_args["modality.source.strides"],
        channels=model_args["modality.source.channels"],
        layer_norm=model_args["modality.source.layer_norm"],
        name=input_name)
    # optionally wraps the source modality with a position embedding
    src_timing = model_args["modality.source.timing"] or model_args["modality.timing"]
    if src_timing:
        if isinstance(src_timing, str):
            src_timing = {"timing": src_timing}
        elif not isinstance(src_timing, dict):
            raise ValueError("Unknown type of timing params: {}".format(str(src_timing)))
        input_modality = PositionEmbeddingWrapper(
            embedding_layer=input_modality,
            name=input_name + "_posenc_wrapper",
            **src_timing)
    return input_modality, target_modality
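
# --- usage sketch (illustrative, not part of the library) ---
# A minimal sketch of how `build_modalities` could be driven from a flat
# hyper-parameter dict. The "modality.*" keys mirror those read in the method
# body above; `SpeechTransformer` stands in for whichever model class owns the
# classmethod, and the concrete values below are assumptions, not defaults.
_example_model_args = {
    "modality.dim": 256,
    "modality.source.dim": None,     # falls back to modality.dim
    "modality.target.dim": None,     # falls back to modality.dim
    "modality.timing": "sinusoids",
    "modality.source.timing": None,  # falls back to modality.timing
    "modality.target.timing": None,  # falls back to modality.timing
    "modality.share_embedding_and_softmax_weights": True,
    "modality.source.kernel_size": 3,
    "modality.source.strides": 2,
    "modality.source.channels": 256,
    "modality.source.layer_norm": True,
}
# input_modality, target_modality = SpeechTransformer.build_modalities(
#     _example_model_args, src_meta={"audio_feature_dim": 80},
#     trg_meta={"vocab_size": 8000})
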
def build_modality(cls, vocab_size, emb_dim, name, timing=None,
                   share_embedding_and_softmax_weights=False):
    """ Creates a symbol modality (embedding) layer.

    Args:
        vocab_size: An integer, the vocabulary size.
        emb_dim: An integer, the dimension of the embedding.
        name: A string, the layer name.
        timing: A string or a dict of positional embedding parameters.
        share_embedding_and_softmax_weights: Whether to share the embedding table
            and the softmax weights.

    Returns:
        A modality layer.
    """
    modality = WordEmbeddingSharedWeights(
        embedding_dim=emb_dim,
        vocab_size=vocab_size,
        share_softmax_weights=share_embedding_and_softmax_weights,
        name=name)
    # position embedding wrapper
    if timing:
        if isinstance(timing, str):
            timing = {"timing": timing}
        elif not isinstance(timing, dict):
            raise ValueError("Unknown type of timing params: {}".format(str(timing)))
        modality = PositionEmbeddingWrapper(
            embedding_layer=modality,
            name=name + "_posenc_wrapper",
            **timing)
    return modality
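
# --- usage sketch (illustrative, not part of the library) ---
# A minimal sketch of the two `timing` forms `build_modality` accepts: a plain
# string is promoted to {"timing": ...}, while a dict is forwarded verbatim to
# PositionEmbeddingWrapper. `MyModel` is a hypothetical owner of the classmethod
# and the argument values are assumptions.
#
# sinusoidal position encoding, embedding table shared with the softmax:
# modality = MyModel.build_modality(
#     vocab_size=32000, emb_dim=512, name="target_symbol_modality",
#     timing="sinusoids", share_embedding_and_softmax_weights=True)
#
# learned position embeddings, passing extra wrapper arguments via a dict:
# modality = MyModel.build_modality(
#     vocab_size=32000, emb_dim=512, name="target_symbol_modality",
#     timing={"timing": "emb", "max_positions": 512})
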
def test_position_embedding():
    # checks that the TensorFlow and PyTorch sinusoidal position tables match
    tf_postbl = PositionEmbeddingWrapper.add_sinusoids_timing_signal(
        tf.zeros([1, 10, 10]), None)
    pt_postbl = PTPositionEmbeddingWrapper.add_sinusoids_timing_signal(
        torch.zeros(1, 10, 10), None)
    assert_equal_numpy(tf_postbl.numpy(), pt_postbl.detach().numpy())

    emb_dim = 5
    vocab_size = 10
    tf_emb = WordEmbeddingSharedWeights(emb_dim, vocab_size, False)
    pt_emb = PTWordEmbeddingSharedWeights(emb_dim, vocab_size, False)
    inp_2d = numpy.random.randint(0, 9, [2, 5])
    inp_1d = numpy.random.randint(0, 9, [3, ])
    logits_2d = numpy.random.rand(2, 5)
    logits_3d = numpy.random.rand(2, 4, 5)
    tf_inp_2d = tf.convert_to_tensor(inp_2d, tf.int32)
    tf_inp_1d = tf.convert_to_tensor(inp_1d, tf.int32)
    tf_logits_2d = tf.convert_to_tensor(logits_2d, tf.float32)
    tf_logits_3d = tf.convert_to_tensor(logits_3d, tf.float32)
    pt_inp_2d = torch.IntTensor(inp_2d)
    pt_inp_1d = torch.IntTensor(inp_1d)
    pt_logits_2d = torch.FloatTensor(logits_2d)
    pt_logits_3d = torch.FloatTensor(logits_3d)

    # builds the variables, then copies the TF embedding table into the PT layer
    # so that both implementations are compared with identical weights
    _ = tf_emb(tf_logits_2d, mode="linear")
    _ = pt_emb(pt_logits_2d, mode="linear")
    pt_emb._shared_weights.data = torch.Tensor(tf_emb._shared_weights.numpy())

    tf_posemb = PositionEmbeddingWrapper("sinusoids", tf_emb)
    pt_posemb = PTPositionEmbeddingWrapper("sinusoids", pt_emb)
    assert_equal_numpy(tf_posemb(tf_logits_2d, mode="linear").numpy(),
                       pt_posemb(pt_logits_2d, mode="linear").detach().numpy())
    assert_equal_numpy(tf_posemb(tf_logits_3d, mode="linear").numpy(),
                       pt_posemb(pt_logits_3d, mode="linear").detach().numpy())
    assert_equal_numpy(tf_posemb(tf_inp_2d).numpy(),
                       pt_posemb(pt_inp_2d).detach().numpy())
    assert_equal_numpy(tf_posemb(tf_inp_1d, time=5).numpy(),
                       pt_posemb(pt_inp_1d, time=5).detach().numpy())
def new(cls, args: dict, vocab_meta, name=None):
    """ Builds a GPT-2 style (decoder-only) language model.

    Args:
        args: A dict containing all model parameters.
        vocab_meta: A dict containing vocabulary meta data, e.g. eos_id, vocab_size.
        name: The name of the model.

    Returns:
        A GPT2 model.
    """
    embedding = WordEmbeddingSharedWeights(
        embedding_dim=args["hidden_size"],
        vocab_size=vocab_meta["vocab_size"],
        share_softmax_weights=True,
        use_bias=args["softmax_bias"],
        name="embeddings")
    # optionally wraps the embedding with a position embedding
    timing = args["timing"]
    if timing:
        if isinstance(timing, str):
            timing = {"timing": timing}
        elif not isinstance(timing, dict):
            raise ValueError("Unknown type of timing params: {}".format(str(timing)))
        embedding = PositionEmbeddingWrapper(
            embedding_layer=embedding,
            name="posenc_wrapper",
            **timing)
    # a decoder-only Transformer: every layer drops its cross-attention sub-layer
    decoder = TransformerDecoder(
        num_layers=args["num_layers"],
        hidden_size=args["hidden_size"],
        num_attention_heads=args["num_attention_heads"],
        filter_size=args["filter_size"],
        ffn_activation=args["ffn_activation"],
        attention_dropout_rate=args["attention_dropout_rate"],
        attention_type=args["attention_type"],
        ffn_dropout_rate=args["ffn_dropout_rate"],
        layer_postprocess_dropout_rate=args["layer_postprocess_dropout_rate"],
        layer_postprocess_epsilon=args["layer_postprocess_epsilon"],
        no_cross_attn_layer_list=[i for i in range(args["num_layers"])],
        name="decoder")
    model = cls(args, vocab_meta, embedding, decoder, name=name)
    # one dummy forward pass to build the variables
    _ = model({
        "trg": tf.convert_to_tensor([[0, 1, 2, vocab_meta["pad_id"]]], tf.int64),
        "trg_input": tf.convert_to_tensor([[vocab_meta["bos_id"], 1, 2]], tf.int64),
        "trg_length": tf.convert_to_tensor([4], tf.int64)})
    return model
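
# --- usage sketch (illustrative, not part of the library) ---
# A minimal sketch of the `args` dict this `new` method reads. The key names are
# taken directly from the method body above; the values, and the `GPT2` class
# name used in the commented call, are assumptions for illustration only.
_example_args = {
    "hidden_size": 768,
    "softmax_bias": False,
    "timing": {"timing": "emb", "max_positions": 1024},
    "num_layers": 12,
    "num_attention_heads": 12,
    "filter_size": 3072,
    "ffn_activation": "gelu",
    "attention_type": "dot_product",
    "attention_dropout_rate": 0.1,
    "ffn_dropout_rate": 0.1,
    "layer_postprocess_dropout_rate": 0.1,
    "layer_postprocess_epsilon": 1e-6,
}
# model = GPT2.new(_example_args,
#                  vocab_meta={"vocab_size": 50257, "pad_id": 0,
#                              "bos_id": 1, "eos_id": 2})
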
def new(cls, args: dict, vocab_meta, name=None):
    """ Builds a GPT-2 model.

    Args:
        args: A dict containing all model parameters.
        vocab_meta: A dict containing vocabulary meta data, e.g. eos_id, vocab_size.
        name: The name of the model.

    Returns:
        A GPT2 model.
    """
    # builds the token embedding, wrapped with learned position embeddings
    embedding = PositionEmbeddingWrapper(
        embedding_layer=WordEmbeddingSharedWeights(
            embedding_dim=args["hidden_size"],
            vocab_size=vocab_meta["vocab_size"],
            share_softmax_weights=True,
            use_bias=False,
            name="embeddings"),
        name="posenc_wrapper",
        timing="emb",
        max_positions=args["max_position_embeddings"])
    # a decoder-only Transformer without encoder-decoder (cross) attention
    decoder = TransformerDecoder(
        num_layers=args["num_layers"],
        hidden_size=args["hidden_size"],
        num_attention_heads=args["num_attention_heads"],
        filter_size=args["filter_size"],
        ffn_activation=args["ffn_activation"],
        attention_dropout_rate=args["attention_dropout_rate"],
        attention_type=args["attention_type"],
        ffn_dropout_rate=args["ffn_dropout_rate"],
        layer_postprocess_dropout_rate=args["layer_postprocess_dropout_rate"],
        layer_postprocess_epsilon=args["layer_postprocess_epsilon"],
        with_encoder_decoder_attention=False,
        name="decoder")
    model = cls(args, vocab_meta, embedding, decoder, name=name)
    # one dummy forward pass to build the variables
    _ = model({"tokens": tf.convert_to_tensor([[0, 1, 2]], tf.int64)})
    return model
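
# --- usage sketch (illustrative, not part of the library) ---
# The same kind of sketch for this GPT-2 builder. Compared with the variant
# above, the position embedding type and the softmax bias are fixed inside the
# method (learned "emb" timing, no bias), so `args` only needs
# `max_position_embeddings` in addition to the decoder hyper-parameters.
# Values and the `GPT2` class name are assumptions.
_example_gpt2_args = {
    "max_position_embeddings": 1024,
    "hidden_size": 768,
    "num_layers": 12,
    "num_attention_heads": 12,
    "filter_size": 3072,
    "ffn_activation": "gelu",
    "attention_type": "dot_product",
    "attention_dropout_rate": 0.1,
    "ffn_dropout_rate": 0.1,
    "layer_postprocess_dropout_rate": 0.1,
    "layer_postprocess_epsilon": 1e-6,
}
# model = GPT2.new(_example_gpt2_args, vocab_meta={"vocab_size": 50257})
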
def test_position_embedding():
    embedding_layer = WordEmbeddingSharedWeights(embedding_dim=5, vocab_size=10,
                                                 share_softmax_weights=False)
    embedding_layer = PositionEmbeddingWrapper(
        timing="sinusoids", embedding_layer=embedding_layer)
    inputs1d = tf.convert_to_tensor([4, 7, 8], tf.int32)
    inputs2d = tf.convert_to_tensor([[3, 1, 1, 1], [8, 1, 6, 4], [6, 6, 0, 5]], tf.int32)
    _ = embedding_layer(inputs2d)
    assert len(embedding_layer.get_weights()) == 1
    assert "emb/weights" in embedding_layer.trainable_weights[0].name
    embedding_layer.set_weights([
        numpy.array(
            [[-0.22683287, 0.20732224, -0.10953838, 0.15318757, -0.07203472],
             [0.48726183, 0.53683335, 0.38046378, -0.42776877, 0.51263684],
             [-0.20618078, 0.43871957, 0.26764846, 0.57276505, -0.13321346],
             [0.34847826, 0.1998071, 0.48136407, -0.03138721, -0.5397158],
             [-0.31466845, 0.24504018, 0.38156456, -0.03245735, 0.28105468],
             [-0.4769836, -0.2763745, -0.35024986, 0.5304734, -0.2523746],
             [0.13987714, -0.36480358, 0.5633767, 0.04371119, -0.5429846],
             [0.07482189, 0.4224295, 0.5645891, -0.12718052, 0.3637674],
             [0.4379062, 0.11231863, -0.6134181, -0.53932106, -0.5402442],
             [-0.18054467, -0.21964127, -0.14727849, 0.61441237, -0.13402274]])
    ])
    emb_for_2d = embedding_layer(inputs2d)
    emb_for_1d = embedding_layer(inputs1d, time=3)
    assert numpy.sum((emb_for_2d.numpy() - numpy.array(
        [[[0.77922106, 0.4467823, 2.0763628, 0.92981607, -1.2068413],
          [1.9310216, 1.2004958, 1.3910451, 0.04347992, 1.1462909],
          [1.998848, 1.2005959, 0.43459606, 0.04347992, 1.1462909],
          [1.2306706, 1.2006959, -0.13924962, 0.04347986, 1.1462909]],
         [[0.9791881, 0.2511521, -0.37164462, -0.2059586, -1.2080228],
          [1.9310216, 1.2004958, 1.3910451, 0.04347992, 1.1462909],
          [1.2220722, -0.81552565, 0.8436019, 1.0977412, -1.2141505],
          [-0.56250006, 0.5482265, -0.13678819, 0.9274231, 0.62845737]],
         [[0.3127748, -0.8157256, 2.2597487, 1.0977412, -1.2141505],
          [1.1542457, -0.8156256, 1.800051, 1.0977412, -1.2141505],
          [0.4020837, 0.46378663, -0.6610821, 1.3425379, -0.16107452],
          [-0.92544776, -0.6176922, -1.773175, 2.1861746, -0.56432676]]])) ** 2) < 1e-9
    assert numpy.sum((emb_for_1d.numpy() - numpy.array(
        [[-0.56250006, 0.5482265, -0.13678819, 0.9274231, 0.62845737],
         [0.30842686, 0.9448811, 0.27246714, 0.71561563, 0.8134086],
         [1.120308, 0.2514521, -2.361637, -0.20595866, -1.2080228]])) ** 2) < 1e-9
    emb_shared_layer = WordEmbeddingSharedWeights(embedding_dim=5, vocab_size=10,
                                                  share_softmax_weights=True)
    emb_shared_layer = PositionEmbeddingWrapper(
        timing="emb", embedding_layer=emb_shared_layer)
    emb_for_2d = emb_shared_layer(inputs2d)
    logits_for_2d = emb_shared_layer(emb_for_2d, mode="linear")
    assert len(emb_shared_layer.get_weights()) == 3
    for w in emb_shared_layer.trainable_weights:
        if "shared/weights" in w.name:
            weights = w
        elif "shared/bias" in w.name:
            bias = w
    assert numpy.sum(
        (numpy.dot(emb_for_2d.numpy(), numpy.transpose(weights.numpy()))
         + bias.numpy() - logits_for_2d.numpy()) ** 2) < 1e-9