def __init__(self, in_dim, out_dim, N, heads, model_dim, key_dim, value_dim, ff_dim,
             max_len=10000, batch_first=True):
    super().__init__()
    self.name = 'transformer'
    self.batch_first = batch_first
    self.model_dim = model_dim

    # define layers
    # embedding layers
    self.src_embed = nn.Linear(in_dim, model_dim)
    self.tgt_embed = nn.Linear(in_dim, model_dim)
    self.pos_enc = PositionalEncoding(model_dim, max_len)
    # encoder-decoder
    self.encoder = Encoder(N, heads, model_dim, key_dim, value_dim, ff_dim)
    self.decoder = Decoder(N, heads, model_dim, key_dim, value_dim, ff_dim)
    # final output layer
    self.fc = nn.Linear(model_dim, out_dim)

    # xavier initialization
    for p in self.parameters():
        if p.dim() > 1 and p.requires_grad:
            nn.init.xavier_uniform_(p)
def __init__(self, vocab_size, embed_model=None, emb_size=100, hidden_size=128,
             input_dropout_p=0, dropout_p=0, n_layers=1, bidirectional=False,
             rnn_cell=None, rnn_cell_name='gru', variable_lengths=True,
             d_ff=2048, dropout=0.3, N=1):
    super(EncoderRNN, self).__init__(vocab_size, emb_size, hidden_size,
                                     input_dropout_p, dropout_p, n_layers, rnn_cell_name)
    self.variable_lengths = variable_lengths
    self.bidirectional = bidirectional
    if bidirectional:
        self.d_model = 2 * hidden_size
    else:
        self.d_model = hidden_size

    ff = PositionwiseFeedForward(self.d_model, d_ff, dropout)

    if embed_model is None:
        self.embedding = nn.Embedding(vocab_size, emb_size)
    else:
        self.embedding = embed_model

    if rnn_cell is None:
        self.rnn = self.rnn_cell(emb_size, hidden_size, n_layers, batch_first=True,
                                 bidirectional=bidirectional, dropout=dropout_p)
    else:
        self.rnn = rnn_cell

    self.group_attention = GroupAttention(8, self.d_model)
    self.onelayer = Encoder(
        EncoderLayer(self.d_model, deepcopy(self.group_attention), deepcopy(ff), dropout), N)
def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, lr,
             max_seg=3, drop_rate=0.1, padding_idx=0):
    super().__init__()
    self.padding_idx = padding_idx
    self.n_vocab = n_vocab
    self.max_len = max_len

    self.word_emb = keras.layers.Embedding(
        input_dim=n_vocab, output_dim=model_dim,  # [n_vocab, dim]
        embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
    )
    self.segment_emb = keras.layers.Embedding(
        input_dim=max_seg, output_dim=model_dim,  # [max_seg, dim]
        embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
    )
    # NOTE: this Embedding is immediately overwritten by the add_weight matrix below,
    # so only the learned position weight matrix is actually used.
    self.position_emb = keras.layers.Embedding(
        input_dim=max_len, output_dim=model_dim,  # [step, dim]
        embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
    )
    self.position_emb = self.add_weight(
        name="pos", shape=[max_len, model_dim], dtype=tf.float32,
        initializer=keras.initializers.RandomNormal(0., 0.01))
    self.position_space = tf.ones((1, max_len, max_len))

    self.encoder = Encoder(n_head, model_dim, drop_rate, n_layer)
    self.task_mlm = keras.layers.Dense(n_vocab)
    self.task_nsp = keras.layers.Dense(2)

    self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
    self.opt = keras.optimizers.Adam(lr)
def __init__(self, preprocess_config, model_config):
    super(FastSpeech2, self).__init__()
    self.model_config = model_config

    self.encoder = Encoder(model_config)
    self.variance_adaptor = VarianceAdaptor(preprocess_config, model_config)
    self.decoder = Decoder(model_config)
    self.mel_linear = nn.Linear(
        model_config["transformer"]["decoder_hidden"],
        preprocess_config["preprocessing"]["mel"]["n_mel_channels"],
    )
    self.postnet = PostNet()

    self.speaker_emb = None
    if model_config["multi_speaker"]:
        with open(
            os.path.join(
                preprocess_config["path"]["preprocessed_path"], "speakers.json"
            ),
            "r",
        ) as f:
            n_speaker = len(json.load(f))
        self.speaker_emb = nn.Embedding(
            n_speaker,
            model_config["transformer"]["encoder_hidden"],
        )
def make_model(opt, src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab), opt)

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)  # use the in-place variant; xavier_uniform is deprecated
    return model
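# Hypothetical usage sketch for make_model above. The toy vocabulary sizes and the
# reduced layer count are illustrative assumptions; `opt` is project-specific and is
# passed straight through to EncoderDecoder, so None is only a placeholder here.
toy_model = make_model(None, src_vocab=11, tgt_vocab=11, N=2)
n_params = sum(p.numel() for p in toy_model.parameters() if p.requires_grad)
print("trainable parameters:", n_params)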
def test_encoder(self):
    max_seq_len, vocab_size, embed_dim, n_heads, dropout_rate, n_layers = 10, 200, 512, 8, 0.5, 6
    encoder = Encoder(vocab_size, embed_dim, max_seq_len, n_heads, dropout_rate, n_layers)
    batch_size = 5
    x = torch.randint(0, vocab_size, size=(batch_size, max_seq_len))
    assert encoder(x).shape == (batch_size, max_seq_len, embed_dim)
def create_model(seq_len, vocab_size, pad_id, N, d_model, d_ff, h, dropout):
    inp = Input((seq_len,))
    embedding = Embedding(vocab_size, d_model, pad_id)(inp)
    encoding = PositionalEncoding(d_model)(inp)
    net = Add()([embedding, encoding])
    net = Dropout(dropout)(net)
    mask = Lambda(lambda t: create_padding_mask(t, pad_id), name="input_mask")(inp)
    net = Encoder(N=N, d_model=d_model, d_ff=d_ff, h=h, dropout=dropout)([net, mask])
    net = Flatten()(net)
    net = Dense(2, activation="softmax")(net)
    model = Model(inp, net)

    # NOTE: keras optimizers cannot be saved with optimizer state;
    # need to use an optimizer from `tf.train`.
    # NOTE: this seems to be a 1.x thing; in 2.0 all tf.train optimizers are
    # dropped and the keras versions are the only implementations.
    # NOTE: this is not recommended for training; the paper authors describe
    # a variable learning rate schedule that still needs to be implemented.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.98, epsilon=1e-9)
    model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["acc"])
    return model
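# Hypothetical usage sketch for create_model above; the sequence length, vocabulary
# size, and pad id are illustrative assumptions, not values from the original code.
clf_model = create_model(seq_len=128, vocab_size=32000, pad_id=0,
                         N=6, d_model=512, d_ff=2048, h=8, dropout=0.1)
clf_model.summary()  # two-class softmax over the flattened encoder output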
def transformer_pretrain(
    num_layers=4,
    d_model=128,
    num_heads=8,
    dff=256,
    maximum_position_encoding=2048,
):
    inp = Input((None, d_model))
    encoder = Encoder(
        num_layers=num_layers,
        d_model=d_model,
        num_heads=num_heads,
        dff=dff,
        maximum_position_encoding=maximum_position_encoding,
        rate=0.3,
    )
    x = encoder(inp)
    out = Dense(d_model, activation="linear", name="out_pretraining")(x)
    model = Model(inputs=inp, outputs=out)
    opt = Adam(0.0001)
    model.compile(optimizer=opt, loss=mae)
    model.summary()
    return model
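# Hypothetical usage sketch for transformer_pretrain above: build the model with its
# defaults and run a forward pass on a random batch. The batch and step sizes are
# illustrative assumptions, and the Encoder is assumed to accept a single input tensor
# as in the call inside transformer_pretrain.
import numpy as np

pretrain_model = transformer_pretrain()
dummy_batch = np.random.randn(2, 64, 128).astype("float32")  # (batch, steps, d_model)
reconstruction = pretrain_model.predict(dummy_batch)
print(reconstruction.shape)  # expected (2, 64, 128)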
def __init__(self, src_vocab, tgt_vocab, hparams=None):
    super(GraphTransformer, self).__init__()
    self.hparams = dict(GraphTransformer.default_hparams)
    if hparams:
        for k, v in hparams.items():
            if k in self.hparams:
                self.hparams[k] = v

    self.src_vocab = src_vocab
    self.src_vocab_size = len(src_vocab)
    self.tgt_vocab = tgt_vocab
    self.tgt_vocab_size = len(tgt_vocab)
    self.src_seq_len = self.hparams["num_src_tokens"]
    self.tgt_seq_len = self.hparams["num_tgt_tokens"]
    self.biaffine = self.hparams["biaffine"]

    self.encoder = Encoder(num_layers=self.hparams["num_layers"],
                           d_model=self.hparams["d_model"],
                           num_heads=self.hparams["num_heads"],
                           dff=self.hparams["dff"],
                           source_vocab_size=self.src_vocab_size,
                           maximum_position_encoding=self.src_seq_len,
                           rate=self.hparams["dropout"])
    self.decoder = GraphDecoder(num_layers=self.hparams["num_layers"],
                                d_model=self.hparams["d_model"],
                                num_heads=self.hparams["num_heads"],
                                dff=self.hparams["dff"],
                                tgt_vocab=self.tgt_vocab,
                                src_seq_len=self.src_seq_len,
                                maximum_position_encoding=self.tgt_seq_len,
                                rate=self.hparams["dropout"],
                                biaffine=self.biaffine)
def __init__(self, dataset, params):
    super(Rel_time_emb, self).__init__()
    self.dataset = dataset
    self.params = params

    self.create_time_embedds()
    self.time_nl = torch.sin
    self.his_encoder = Encoder(self.params)
def __init__(self, sentence_encoder, hidden, n_layers, n_head, d_k, d_v, d_model, d_inner,
             d_mlp, dropout=0.1):
    super(Model, self).__init__()
    self.PositionEncoder = PositionalEncoding(dropout, hidden * 2)
    self.Transformer = Encoder(n_layers, n_head, d_k, d_v, d_model, d_inner)
    self.Dropoutlayer = nn.Dropout(p=dropout)
    self.Decoderlayer = self.build_decoder(hidden * 2, d_mlp, dropout)
    self.sentence_encoder = sentence_encoder
    self.criterion = nn.CrossEntropyLoss()
def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, lr,
             max_seg=3, drop_rate=0.1, padding_idx=0):
    super().__init__()
    self.padding_idx = padding_idx
    self.n_vocab = n_vocab
    self.max_len = max_len

    # I think a task embedding is not necessary for pretraining,
    # because the aim of all tasks is to train a universal sentence embedding.
    # The body encoder is the same across all tasks, and the output layer defines each task.
    # Fine-tuning replaces the output layer and leaves the body encoder unchanged.
    # self.task_emb = keras.layers.Embedding(
    #     input_dim=n_task, output_dim=model_dim,  # [n_task, dim]
    #     embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
    # )

    self.word_emb = keras.layers.Embedding(
        input_dim=n_vocab, output_dim=model_dim,  # [n_vocab, dim]
        embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
    )
    self.segment_emb = keras.layers.Embedding(
        input_dim=max_seg, output_dim=model_dim,  # [max_seg, dim]
        embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
    )
    # NOTE: this Embedding is immediately overwritten by the add_weight matrix below,
    # so only the learned position weight matrix is actually used.
    self.position_emb = keras.layers.Embedding(
        input_dim=max_len, output_dim=model_dim,  # [step, dim]
        embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
    )
    self.position_emb = self.add_weight(
        name="pos", shape=[max_len, model_dim], dtype=tf.float32,
        initializer=keras.initializers.RandomNormal(0., 0.01))
    self.position_space = tf.ones((1, max_len, max_len))

    self.encoder = Encoder(n_head, model_dim, drop_rate, n_layer)
    self.o_mlm = keras.layers.Dense(n_vocab)
    self.o_nsp = keras.layers.Dense(2)

    self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction="none")
    self.opt = keras.optimizers.Adam(lr)
def __init__(self, vocab, config):
    super(NER_SOFTMAX_CHAR, self).__init__()
    word_emb_matrix = get_word_embd(vocab, config)
    embd_vector = torch.from_numpy(word_emb_matrix).float()

    self.word_embeds = nn.Embedding.from_pretrained(embd_vector, freeze=False)
    self.char_embeds = nn.Embedding(len(vocab.char_to_id), config.char_embd_dim,
                                    padding_idx=Constants.PAD_ID)
    if config.is_caps:
        self.caps_embeds = nn.Embedding(vocab.get_caps_cardinality(),
                                        config.caps_embd_dim, padding_idx=Constants.PAD_ID)

    self.lstm_char = nn.LSTM(self.char_embeds.embedding_dim, config.char_lstm_dim,
                             num_layers=1, bidirectional=True, batch_first=True)

    input_size = self.word_embeds.embedding_dim + config.char_embd_dim * 2
    if config.is_caps:
        input_size += config.caps_embd_dim

    model_dim = 128      # 512
    num_head = 2         # 8
    num_layer = 2        # 6
    dropout_ratio = 0.1
    affine_dim = 256     # 2048

    self.tx_proj = nn.Linear(input_size, model_dim)
    self.lstm = Encoder(num_layer, num_head, dropout_ratio, model_dim, affine_dim)

    self.dropout = nn.Dropout(config.dropout_rate)
    self.hidden_layer = nn.Linear(model_dim, config.word_lstm_dim)
    self.tanh_layer = torch.nn.Tanh()
    self.hidden2tag = nn.Linear(config.word_lstm_dim, len(vocab.id_to_tag))
    self.config = config

    init_lstm_wt(self.lstm_char)
    init_linear_wt(self.hidden_layer)
    init_linear_wt(self.hidden2tag)
    self.char_embeds.weight.data.uniform_(-1., 1.)
    if config.is_caps:
        self.caps_embeds.weight.data.uniform_(-1., 1.)
def __init__(self, in_dim, out_dim, N, heads, embed_dim, model_dim, key_dim, value_dim, ff_dim,
             dropout=0.1, max_len=10000, batch_first=True, pretrained_vec=None):
    super().__init__()
    self.name = 'transformer'
    self.batch_first = batch_first
    self.model_dim = model_dim
    self.embed_dim = embed_dim

    # define layers
    self.embedding = nn.Embedding(in_dim, embed_dim)
    # do not train the embedding layer if a pretrained embedding is provided
    if pretrained_vec is not None:
        self.embedding = nn.Embedding.from_pretrained(pretrained_vec, freeze=True)
    if embed_dim != model_dim:
        self.fc_in = nn.Linear(embed_dim, model_dim)
    self.pos_enc = PositionalEncoding(model_dim, max_len)
    self.encoder = Encoder(N, heads, model_dim, key_dim, value_dim, ff_dim, dropout=dropout)
    # final output layer
    self.fc = nn.Linear(model_dim, out_dim)

    # xavier initialization
    for p in self.parameters():
        if p.dim() > 1 and p.requires_grad:
            nn.init.xavier_uniform_(p)
def transformer_classifier(
    num_layers=4,
    d_model=128,
    num_heads=8,
    dff=256,
    maximum_position_encoding=2048,
    n_classes=16,
):
    inp = Input((None, d_model))
    encoder = Encoder(
        num_layers=num_layers,
        d_model=d_model,
        num_heads=num_heads,
        dff=dff,
        maximum_position_encoding=maximum_position_encoding,
        rate=0.3,
    )
    x = encoder(inp)
    x = Dropout(0.2)(x)
    x = GlobalAvgPool1D()(x)
    x = Dense(4 * n_classes, activation="selu")(x)
    out = Dense(n_classes, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=out)
    opt = Adam(0.00001)
    model.compile(optimizer=opt, loss=custom_binary_crossentropy, metrics=[custom_binary_accuracy])
    model.summary()
    return model
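# Hypothetical usage sketch for transformer_classifier above: one sigmoid score per
# class, so a multi-label threshold of 0.5 is assumed here. The batch and step sizes
# are illustrative assumptions.
import numpy as np

clf = transformer_classifier(n_classes=16)
features = np.random.randn(4, 100, 128).astype("float32")  # (batch, steps, d_model)
probs = clf.predict(features)                               # (4, 16) sigmoid scores
labels = (probs > 0.5).astype(int)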
def setup_self_attn_model():
    import torch.nn as nn
    from transformer import (Encoder, Decoder, Transformer, EncoderLayer, DecoderLayer,
                             SelfAttention, PositionwiseFeedforward)

    device = torch.device('cuda:0')
    pad_idx = DE.vocab.stoi["<pad>"]
    hid_dim = 300
    n_layers = 3
    n_heads = 4
    pf_dim = 512  # 2048
    dropout = 0.1

    input_dim = len(DE.vocab)
    enc = Encoder(input_dim, hid_dim, n_layers, n_heads, pf_dim,
                  EncoderLayer, SelfAttention, PositionwiseFeedforward, dropout, device)
    output_dim = len(EN.vocab)
    dec = Decoder(output_dim, hid_dim, n_layers, n_heads, pf_dim,
                  DecoderLayer, SelfAttention, PositionwiseFeedforward, dropout, device)
    model = Transformer(enc, dec, pad_idx, device)

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # model.load_state_dict(torch.load("weights/bigger_self_attn_weights"))
    train_model(model, num_epochs=100, learning_rate=0.001, weight_decay=0, log_freq=1,
                self_attn_hid_dim=hid_dim)
    torch.save(model.state_dict(), "weights/bigger_self_attn_weights")
    return model
def __init__(self):
    super(EventEncoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=embedding_size)
    self.trajectory_encoder = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(EVENT_SIZE, 3, 1)),
        tf.keras.layers.Conv2D(filters=32, kernel_size=3, strides=(2, 1), padding='same', activation='relu'),
        tf.keras.layers.MaxPool2D(2, strides=(2, 1)),
        tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=(2, 1), padding='same', activation='relu'),
        tf.keras.layers.MaxPool2D(2, strides=(2, 1)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dropout(.5),
        tf.keras.layers.Dense(d_model, activation='relu')
    ])
    self.encoder = Encoder(1, d_model + embedding_size, 4, dff)
def __init__(self, dim, src_n_vocab, n_encod_layer, tgt_n_vocab, n_decode_layer, max_len=512):
    super().__init__()  # assumed: this class is an nn.Module subclass; the original omitted this call
    self.src_emb = EmbeddingWithPositionalEncoding(dim, src_n_vocab, max_len)
    self.tgt_emb = EmbeddingWithLearnedPositionalEncoding(dim, tgt_n_vocab, max_len)

    enc_layer = TransformerLayer(dim, MultiHeadAttention(6, dim, 0.1), None,
                                 nn.Linear(dim, dim), 0.1)
    self.encoder = Encoder(enc_layer, n_encod_layer)

    dec_layer = TransformerLayer(dim, MultiHeadAttention(6, dim, 0.1),
                                 MultiHeadAttention(6, dim, 0.1), nn.Linear(dim, dim), 0.1)
    self.decoder = Decoder(dec_layer, n_decode_layer)

    self.encoder_decoder = EncoderDecoder(self.encoder, self.decoder, self.src_emb, self.tgt_emb)
def build_model(n_tokens, len_limit, batch_size, d_model=256, d_inner_hid=512, n_head=4,
                d_k=64, d_v=64, layers=6, dropout=0.1, active_layers=999):
    d_emb = d_model

    pos_emb = Embedding(len_limit, d_emb, trainable=False,
                        weights=[GetPosEncodingMatrix(len_limit, d_emb)],
                        batch_input_shape=[batch_size, None])
    word_emb = Embedding(n_tokens, d_emb, batch_input_shape=[batch_size, None])

    encoder = Encoder(d_model, d_inner_hid, n_head, d_k, d_v, layers, dropout,
                      word_emb=word_emb, pos_emb=pos_emb)
    target_layer = TimeDistributed(Dense(n_tokens, use_bias=False))

    def get_pos_seq(x):
        mask = K.cast(K.not_equal(x, 0), 'int32')
        pos = K.cumsum(K.ones_like(x, 'int32'), 1)
        return pos * mask

    src_seq = Input(shape=(None,), dtype='int32')
    src_pos = Lambda(get_pos_seq)(src_seq)
    enc_output = encoder(src_seq, src_pos, active_layers=active_layers)
    final_output = target_layer(enc_output)

    model = Model(inputs=src_seq, outputs=final_output)
    return model
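# Hypothetical usage sketch for build_model above; the token count, sequence limit,
# and batch size are illustrative assumptions.
lm = build_model(n_tokens=8000, len_limit=256, batch_size=32)
lm.summary()  # per-position logits of shape (batch, steps, n_tokens)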
def get_model(self):
    inp_exe = Input(shape=(H.executable_size, 3), dtype='int32', name='inp_exe')
    mask = Lambda(lambda x: x[:, :, 0])(inp_exe)
    mask = PaddingMask()(mask)
    print("##############", inp_exe, mask)

    encoding, enc_attention_weights = Encoder(
        num_layers=H.num_layers, d_model=H.d_model, num_heads=H.num_heads,
        d_ff=H.d_ff, vocab_size=H.real_vocab_size,
        dropout_rate=H.dropout_rate)(inp_exe, mask)
    encoding = GlobalAveragePooling1D()(encoding)

    inp_static = Input(shape=(H.static_feature_len,))  # shape must be a tuple, not a bare int
    concatenated_features = concatenate([encoding, inp_static])

    layer_256 = Dense(256, activation="relu")(concatenated_features)
    layer_16 = Dense(16, activation="relu")(layer_256)
    result = Dense(1, activation="sigmoid")(layer_16)

    model = Model(inputs=[inp_exe, inp_static], outputs=result)
    model.summary()
    return model
def BuildModel(vocab_size, encoder_emb, decoder_emb, d_model=512, N=6, d_ff=2048, h=8, dropout=0.1):
    target_vocab = vocab_size
    c = copy.deepcopy
    attention = MultiHeadedAttention(h, d_model)
    feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    encoder_layer = EncoderLayer(d_model, c(attention), c(feed_forward), dropout)
    decoder_layer = DecoderLayer(d_model, c(attention), c(attention), c(feed_forward), dropout)
    encoder = Encoder(encoder_layer, N)
    decoder = Decoder(decoder_layer, N)

    model = EncoderDecoder(
        encoder, decoder,
        nn.Sequential(Embeddings(encoder_emb, d_model), c(position)),
        nn.Sequential(Embeddings(decoder_emb, d_model), c(position)),
        Generator(d_model, target_vocab))

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
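# Hypothetical usage sketch for BuildModel above; the sizes are illustrative
# assumptions. encoder_emb and decoder_emb are passed straight into Embeddings,
# so they are treated here as source/target vocabulary sizes.
seq2seq = BuildModel(vocab_size=10000, encoder_emb=10000, decoder_emb=10000,
                     d_model=256, N=2, h=4)
print(sum(p.numel() for p in seq2seq.parameters() if p.requires_grad), "trainable parameters")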
def __init__(self, model_dim, max_len, num_layer, num_head, n_vocab, lr,
             max_seg=3, drop_rate=0.2, padding_idx=0):
    super().__init__()
    self.padding_idx = padding_idx
    self.n_vocab = n_vocab
    self.max_len = max_len

    self.word_emb = nn.Embedding(n_vocab, model_dim)
    self.word_emb.weight.data.normal_(0, 0.1)
    self.segment_emb = nn.Embedding(num_embeddings=max_seg, embedding_dim=model_dim)
    self.segment_emb.weight.data.normal_(0, 0.1)

    self.position_emb = torch.empty(1, max_len, model_dim)
    nn.init.kaiming_normal_(self.position_emb, mode='fan_out', nonlinearity='relu')
    self.position_emb = nn.Parameter(self.position_emb)

    self.encoder = Encoder(n_head=num_head, emb_dim=model_dim, drop_rate=drop_rate, n_layer=num_layer)
    self.task_mlm = nn.Linear(in_features=model_dim, out_features=n_vocab)
    self.task_nsp = nn.Linear(in_features=model_dim * self.max_len, out_features=2)

    self.opt = optim.Adam(self.parameters(), lr)
def __init__(self):
    super(DualEventModel, self).__init__()
    self.event_encoder = EventEncoder()
    self.dual_event_encoder = Encoder(1, d_model, 4, 256, unordered=False)
    self.output_layer = tf.keras.layers.Dense(2, activation='softmax')
# embedding images
fcnn = ResnetV1_FCNN(img_shape, 20)
em_imgL = fcnn(imgL)
em_imgR = fcnn(imgR)
em_imgs = tf.keras.layers.Concatenate(axis=2)([em_imgL, em_imgR])

# embedding sentence
print("creating transformer encoder")
GloVe_embeddings = np.load("word_embeddings/embedding.npy")
print(GloVe_embeddings.shape)
enc_mask = create_padding_mask(sent)
encoder = Encoder(
    num_layers=4,
    d_model=300,  # also the word embedding dim
    num_heads=12,
    dff=512,
    input_vocab_size=GloVe_embeddings.shape[0],
    embeddings_initializer=Constant(GloVe_embeddings),
)
em_sent = encoder(sent, training=True, mask=enc_mask)

# getting prediction from the Relational Neural Network
print("creating relational network")
relation_matrix = RelationalProduct()([em_sent, em_imgs])
g = ConvolutionalPerceptron(relation_matrix.shape[1:], [256, 256])
em_relations = g(relation_matrix)
relation_out = ReduceMean(axis=-1)(em_relations)
f = Perceptron(relation_out.shape[1], [256, 256])
relation_out = f(relation_out)
pred = Dense(1, activation="sigmoid")(relation_out)
from transformer import Encoder
from LSTMEncoder import EncoderRNN
import torch

te = Encoder.TransformerEncoder(1000, 6)
a = torch.LongTensor([[1, 2, 3, 4, 0, 0],
                      [10, 0, 0, 0, 0, 0],
                      [5, 6, 7, 8, 9, 0]])
# print(te.forward(a, torch.IntTensor([5, 4])).shape)

ls = EncoderRNN(1000, 15, 4096, 300)
print(ls.forward(a, use_prob_vector=False, input_lengths=torch.Tensor([4, 1, 5])).shape)
def __init__(self):
    super(TrajectoryEncoder, self).__init__()
    self.aux_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size)
    self.encoder = Encoder(1, d_model, 4, 256)
def __init__(self):
    super(FrameEncoder, self).__init__()
    self.aux_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size)
    self.encoder = Encoder(1, d_model, 4, 256, unordered=True)
def bst_model(sparse_input_length=1,
              max_seq_length=50,
              vocab_size_dict=None,
              embedding_dim=512,
              dnn_unit_list=[512, 128, 32],
              activation='relu',
              dropout_rate=0.2,
              n_layers=2,
              num_heads=8,
              middle_units=1024,
              training=False):

    # 1. Input layer
    # 1.1 user
    user_id_input_layer = Input(shape=(sparse_input_length,), name="user_id_input_layer")
    gender_input_layer = Input(shape=(sparse_input_length,), name="gender_input_layer")
    age_input_layer = Input(shape=(sparse_input_length,), name="age_input_layer")
    user_click_item_seq_input_layer = Input(shape=(max_seq_length,), name="user_click_item_seq_input_layer")
    user_click_cate_seq_input_layer = Input(shape=(max_seq_length,), name="user_click_cate_seq_input_layer")

    # 1.2 item
    item_input_layer = Input(shape=(sparse_input_length,), name="item_input_layer")
    cate_input_layer = Input(shape=(sparse_input_length,), name="cate_input_layer")

    # 2. Embedding layer
    # 2.1 user
    user_id_embedding_layer = Embedding(vocab_size_dict["user_id"] + 1, embedding_dim,
                                        mask_zero=True, name='user_id_embedding_layer')(user_id_input_layer)
    gender_embedding_layer = Embedding(vocab_size_dict["gender"] + 1, embedding_dim,
                                       mask_zero=True, name='gender_embedding_layer')(gender_input_layer)
    age_embedding_layer = Embedding(vocab_size_dict["age"] + 1, embedding_dim,
                                    mask_zero=True, name='age_embedding_layer')(age_input_layer)

    item_id_embedding = Embedding(vocab_size_dict["item_id"] + 1, embedding_dim,
                                  mask_zero=True, name='item_id_embedding')
    cate_id_embedding = Embedding(vocab_size_dict["cate_id"] + 1, embedding_dim,
                                  mask_zero=True, name='cate_id_embedding')

    user_click_item_seq_embedding_layer = item_id_embedding(user_click_item_seq_input_layer)
    user_click_cate_seq_embedding_layer = cate_id_embedding(user_click_cate_seq_input_layer)

    # 2.2 item
    target_item_embedding_layer = item_id_embedding(item_input_layer)
    target_cate_embedding_layer = cate_id_embedding(cate_input_layer)

    # 3. Concat layer
    # 3.1 user: other features
    other_features_concat_layer = concatenate([user_id_embedding_layer, gender_embedding_layer,
                                               age_embedding_layer], axis=-1)

    # 3.1 user: sequence features
    input_transformer_layer = concatenate([user_click_item_seq_embedding_layer,
                                           user_click_cate_seq_embedding_layer], axis=-1)

    # 3.2 item
    input_din_layer_query = concatenate([target_item_embedding_layer,
                                         target_cate_embedding_layer], axis=-1)

    # 4. Transformer layer
    d_model = input_transformer_layer.shape[-1]
    padding_mask_list = padding_mask(user_click_item_seq_input_layer)
    # print("padding_mask_list.shape: ", padding_mask_list.shape)

    output_transformer_layer = Encoder(n_layers, d_model, num_heads, middle_units,
                                       max_seq_length, training)([input_transformer_layer, padding_mask_list])
    # print("output_transformer_layer.shape: ", output_transformer_layer.shape)

    # 5. Din attention layer
    query = input_din_layer_query
    keys = output_transformer_layer
    vecs = output_transformer_layer
    din_padding_mask_list = din_padding_mask(user_click_item_seq_input_layer)
    # print("din_padding_mask_list.shape: ", din_padding_mask_list.shape)

    output_din_layer = DinAttentionLayer(d_model, middle_units, dropout_rate)(
        [query, keys, vecs, din_padding_mask_list])
    # print("output_din_layer.shape: ", output_din_layer.shape)

    # 6. DNN layer
    input_dnn_layer = concatenate([other_features_concat_layer, output_din_layer], axis=-1)
    input_dnn_layer = tf.squeeze(input=input_dnn_layer, axis=[1])

    for inx in range(len(dnn_unit_list)):
        input_dnn_layer = Dense(dnn_unit_list[inx], activation=activation,
                                name="FC_{0}".format(inx + 1))(input_dnn_layer)
        input_dnn_layer = Dropout(dropout_rate, name="dropout_{0}".format(inx + 1))(input_dnn_layer)

    output = Dense(1, activation='sigmoid', name='Sigmoid_output_layer')(input_dnn_layer)

    # Output model
    inputs_list = [user_id_input_layer, gender_input_layer, age_input_layer,
                   user_click_item_seq_input_layer, user_click_cate_seq_input_layer,
                   item_input_layer, cate_input_layer]

    model = Model(inputs=inputs_list, outputs=output)

    return model
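# Hypothetical usage sketch for bst_model above; the vocabulary sizes and the reduced
# embedding/middle-unit sizes are illustrative assumptions.
vocab_size_dict = {
    "user_id": 10000,
    "gender": 2,
    "age": 100,
    "item_id": 50000,
    "cate_id": 500,
}
bst = bst_model(vocab_size_dict=vocab_size_dict, embedding_dim=64,
                max_seq_length=50, n_layers=2, num_heads=8, middle_units=256)
bst.compile(optimizer="adam", loss="binary_crossentropy", metrics=["AUC"])
bst.summary()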