def __init__(self,
             vocab_size,
             emb_dim=512,
             hidden_size=512,
             n_layers=8,
             n_heads=8,
             padding_idx=0,
             dropout_rate=0.1):
    """Token/positional embeddings followed by a Transformer encoder stack."""
    super(TransformerEncoderModel, self).__init__()
    self.padding_idx = padding_idx
    self.token_embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
    # Learned positional embeddings, capped at a fixed maximum length.
    max_pos_len = 3000
    self.pos_embedding = nn.Embedding(max_pos_len, emb_dim, padding_idx=padding_idx)
    self.dropout = nn.Dropout(p=dropout_rate)
    self.transformer_encoder_layer = nn.TransformerEncoderLayer(
        emb_dim,
        n_heads,
        dim_feedforward=hidden_size * 4,
        dropout=0.1,
        activation='gelu',
        attn_dropout=0.1,
        act_dropout=0)
    self.transformer_encoder = nn.TransformerEncoder(
        self.transformer_encoder_layer, n_layers)
    self.layer_norm = nn.LayerNorm(hidden_size)
    self.apply(self.init_weights)
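# --- Usage sketch (added; not part of the original class) --------------------
# A minimal guess at how the layers above are combined in forward(): token and
# position embeddings are summed, dropped out, encoded, then layer-normed.
# `model` is assumed to be an already constructed TransformerEncoderModel.
import paddle

tokens = paddle.randint(1, 1000, shape=[2, 32])           # [batch, seq_len]
positions = paddle.arange(tokens.shape[1]).unsqueeze(0)   # broadcasts over batch
emb = model.token_embedding(tokens) + model.pos_embedding(positions)
emb = model.dropout(emb)
encoded = model.transformer_encoder(emb)                   # [batch, seq_len, emb_dim]
encoded = model.layer_norm(encoded)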
def __init__(self,
             vocab_size,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=512,
             type_vocab_size=16,
             initializer_range=0.02,
             pad_token_id=0):
    super(BertModel, self).__init__()
    self.pad_token_id = pad_token_id
    self.initializer_range = initializer_range
    self.embeddings = BertEmbeddings(vocab_size, hidden_size,
                                     hidden_dropout_prob,
                                     max_position_embeddings, type_vocab_size)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation=hidden_act,
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.pooler = BertPooler(hidden_size)
    self.apply(self.init_weights)
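# --- Usage sketch (added; not from the source) --------------------------------
# The usual PaddleNLP-style way of driving such a stack: pad positions become a
# large negative additive attention mask of shape [batch, 1, 1, seq_len].
# `model` is assumed to be an already constructed BertModel instance.
import paddle

input_ids = paddle.randint(1, 100, shape=[2, 16])
attention_mask = paddle.unsqueeze(
    (input_ids == model.pad_token_id).astype('float32') * -1e4, axis=[1, 2])
embedding_output = model.embeddings(input_ids)
sequence_output = model.encoder(embedding_output, attention_mask)
pooled_output = model.pooler(sequence_output)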
def __init__(self,
             vocab_size: int = 300,
             max_len: int = 128,
             emb_size: int = 768,
             n_layers: int = 12,
             n_heads: int = 8,
             dropout: float = 0.1,
             pad_idx: int = 0):
    super().__init__()
    self.input_emb = InputEmbedding(max_len=max_len,
                                    vocab_size=vocab_size,
                                    emb_size=emb_size,
                                    pad_idx=pad_idx,
                                    dropout=dropout)
    self.transformers = nn.LayerList([
        nn.TransformerEncoderLayer(d_model=emb_size,
                                   nhead=n_heads,
                                   dim_feedforward=4 * emb_size,
                                   normalize_before=True)
        for _ in range(n_layers)
    ])
    self.max_len = max_len
    self.n_heads = n_heads
    self.pad_idx = pad_idx
    self.dim_output = emb_size
    self.vocab_size = vocab_size
def __init__(self,
             vocab_size,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=514,
             initializer_range=0.02,
             pad_token_id=1):
    super(ErnieMModel, self).__init__()
    self.pad_token_id = pad_token_id
    self.initializer_range = initializer_range
    self.embeddings = ErnieMEmbeddings(vocab_size, hidden_size,
                                       hidden_dropout_prob,
                                       max_position_embeddings)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        dim_feedforward=4 * hidden_size,
        dropout=hidden_dropout_prob,
        activation=hidden_act,
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0,
        normalize_before=False)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.pooler = ErnieMPooler(hidden_size)
    self.apply(self.init_weights)
def __init__(self,
             vocab_size,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=512,
             type_vocab_size=2,
             initializer_range=0.02,
             pad_token_id=0):
    super(ErnieModel, self).__init__()
    self.pad_token_id = pad_token_id
    self.initializer_range = initializer_range
    weight_attr = paddle.ParamAttr(
        initializer=nn.initializer.Normal(mean=0.0,
                                          std=self.initializer_range))
    self.embeddings = ErnieEmbeddings(vocab_size, hidden_size,
                                      hidden_dropout_prob,
                                      max_position_embeddings,
                                      type_vocab_size, pad_token_id,
                                      weight_attr)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation=hidden_act,
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0,
        weight_attr=weight_attr)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.pooler = ErniePooler(hidden_size, weight_attr)
    self.apply(self.init_weights)
def __init__(self, vocab_size, embedding_size, hidden_size,
             num_hidden_layers, num_attention_heads, intermediate_size,
             hidden_act, hidden_dropout_prob, attention_probs_dropout_prob,
             max_position_embeddings, type_vocab_size, initializer_range,
             pad_token_id):
    super(ElectraModel, self).__init__()
    self.pad_token_id = pad_token_id
    self.initializer_range = initializer_range
    self.embeddings = ElectraEmbeddings(vocab_size, embedding_size,
                                        hidden_dropout_prob,
                                        max_position_embeddings,
                                        type_vocab_size)
    if embedding_size != hidden_size:
        self.embeddings_project = nn.Linear(embedding_size, hidden_size)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation=hidden_act,
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.init_weights()
def __init__(self):
    super(TestModel, self).__init__()
    encoder_layer = nn.TransformerEncoderLayer(312,
                                               12,
                                               1024,
                                               dropout=0.1,
                                               activation='gelu',
                                               attn_dropout=0.1,
                                               act_dropout=0)
    self.encoder = nn.TransformerEncoder(encoder_layer, 3)
    self.fc = nn.Linear(312, 3)
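# --- Usage sketch (added; TestModel's forward() is not shown above) ----------
# Exercises only the constructed sublayers: encode a random feature sequence
# and classify from the first position.
import paddle

model = TestModel()
features = paddle.randn([4, 20, 312])   # [batch, seq_len, d_model]
encoded = model.encoder(features)       # self-attention over the sequence
logits = model.fc(encoded[:, 0])        # [batch, 3]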
def __init__(self,
             vocab_size,
             vocab_file,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=512,
             type_vocab_size=2,
             initializer_range=0.02,
             pad_token_id=0,
             do_lower_case=True,
             is_split_into_words=False,
             max_seq_len=128,
             pad_to_max_seq_len=False):
    super(PPMiniLMModel, self).__init__()
    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the "
            "vocabulary from a pretrained model please use "
            "`model = PPMiniLMModel.from_pretrained(PRETRAINED_MODEL_NAME)`"
            .format(vocab_file))
    self.vocab = self.load_vocabulary(vocab_file)
    self.do_lower_case = do_lower_case
    self.max_seq_len = max_seq_len
    self.is_split_into_words = is_split_into_words
    self.pad_token_id = pad_token_id
    self.pad_to_max_seq_len = pad_to_max_seq_len
    self.initializer_range = initializer_range
    weight_attr = paddle.ParamAttr(
        initializer=nn.initializer.TruncatedNormal(
            mean=0.0, std=self.initializer_range))
    self.embeddings = PPMiniLMEmbeddings(vocab_size, hidden_size,
                                         hidden_dropout_prob,
                                         max_position_embeddings,
                                         type_vocab_size, pad_token_id,
                                         weight_attr)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation=hidden_act,
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0,
        weight_attr=weight_attr,
        normalize_before=False)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.pooler = PPMiniLMPooler(hidden_size, weight_attr)
    self.apply(self.init_weights)
def __init__(self,
             d_model=512,
             nhead=8,
             num_encoder_layers=6,
             dim_feedforward=2048,
             dropout=0.1,
             activation="relu",
             attn_dropout=None,
             act_dropout=None,
             normalize_before=False,
             weight_attr=None,
             bias_attr=None):
    """TransformerEncoder"""
    super(TransformerEncoder, self).__init__()
    # Normalize list/tuple attrs to the two-element form expected by
    # nn.TransformerEncoderLayer: [self-attention, feed-forward].
    if isinstance(bias_attr, (list, tuple)):
        if len(bias_attr) == 1:
            encoder_bias_attr = [bias_attr[0]] * 2
        elif len(bias_attr) == 2:
            encoder_bias_attr = bias_attr
        elif len(bias_attr) == 3:
            encoder_bias_attr = [bias_attr[0], bias_attr[-1]]
        else:
            raise ValueError(
                "length of bias_attr should be 1, 2 or 3 when it is a list/tuple"
            )
    else:
        encoder_bias_attr = bias_attr
    if isinstance(weight_attr, (list, tuple)):
        if len(weight_attr) == 1:
            encoder_weight_attr = [weight_attr[0]] * 2
        elif len(weight_attr) == 2:
            encoder_weight_attr = weight_attr
        elif len(weight_attr) == 3:
            encoder_weight_attr = [weight_attr[0], weight_attr[-1]]
        else:
            raise ValueError(
                "length of weight_attr should be 1, 2 or 3 when it is a list/tuple"
            )
    else:
        encoder_weight_attr = weight_attr
    encoder_layer = nn.TransformerEncoderLayer(
        d_model, nhead, dim_feedforward, dropout, activation, attn_dropout,
        act_dropout, normalize_before, encoder_weight_attr,
        encoder_bias_attr)
    encoder_norm = nn.LayerNorm(d_model)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers,
                                         encoder_norm)
    self.d_model = d_model
    self.nhead = nhead
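# --- Usage sketch (added; assumed, since the wrapper's forward() is not shown)
# Shows how a 3-element bias_attr is collapsed to the [self-attention,
# feed-forward] pair that nn.TransformerEncoderLayer expects; the inner
# encoder is called directly here.
import paddle

wrapper = TransformerEncoder(d_model=256,
                             nhead=4,
                             num_encoder_layers=2,
                             bias_attr=[False, False, True])
out = wrapper.encoder(paddle.randn([2, 8, 256]))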
def __init__(self,
             vocab_size,
             embedding_size=128,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=512,
             type_vocab_size=16,
             initializer_range=0.02,
             pad_token_id=0,
             use_content_summary=True,
             content_summary_index=1,
             cls_num=2):
    super(ErnieCtmModel, self).__init__()
    self.pad_token_id = pad_token_id
    self.content_summary_index = content_summary_index
    self.initializer_range = initializer_range
    self.embeddings = ErnieCtmEmbeddings(
        vocab_size,
        embedding_size,
        hidden_dropout_prob=hidden_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        padding_idx=pad_token_id,
        cls_num=cls_num)
    self.embedding_hidden_mapping_in = nn.Linear(embedding_size, hidden_size)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation="gelu",
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0)
    # Swap in the tanh-approximated GELU after construction.
    encoder_layer.activation = nn.GELU(approximate=True)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.pooler = ErnieCtmPooler(hidden_size)
    self.use_content_summary = use_content_summary
    if use_content_summary is True:
        self.feature_fuse = nn.Linear(hidden_size * 2, intermediate_size)
        self.feature_output = nn.Linear(intermediate_size, hidden_size)
    self.apply(self.init_weights)
def __init__(self,
             vocab_size,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act='relu',
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             normalize_before=False,
             max_position_embeddings=513,
             type_vocab_size=4,
             initializer_range=0.02,
             unk_token_id=17963,
             pad_token_id=0,
             bos_token_id=1,
             eos_token_id=3,
             mask_token_id=3):
    super(UNIMOModel, self).__init__()
    self.unk_token_id = unk_token_id
    self.pad_token_id = pad_token_id
    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id
    self.mask_token_id = mask_token_id
    self.initializer_range = initializer_range
    self.embeddings = UNIMOEmbeddings(vocab_size, hidden_size,
                                      hidden_dropout_prob,
                                      max_position_embeddings,
                                      type_vocab_size)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation=hidden_act,
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0,
        normalize_before=normalize_before)
    self.encoder_norm = nn.LayerNorm(hidden_size)
    self.dropout = nn.Dropout(hidden_dropout_prob)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.apply(self.init_weights)
def __init__(self,
             vocab_size=23236,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=512,
             type_vocab_size=2,
             initializer_range=0.02,
             pad_token_id=0,
             pool_act="tanh",
             layer_norm_eps=1e-12,
             glyph_embedding_dim=1728,
             pinyin_map_len=32):
    super(ChineseBertModel, self).__init__()
    self.pad_token_id = pad_token_id
    self.layer_norm_eps = layer_norm_eps
    self.initializer_range = initializer_range
    self.embeddings = FusionBertEmbeddings(
        vocab_size, hidden_size, pad_token_id, type_vocab_size,
        max_position_embeddings, pinyin_map_len, glyph_embedding_dim,
        layer_norm_eps, hidden_dropout_prob)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation=hidden_act,
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.pooler = BertPooler(hidden_size, pool_act)
    self.apply(self.init_weights)
def __init__(self,
             vocab_size,
             embed_tokens=None,
             pad_token_id=0,
             d_model=1280,
             num_encoder_layers=2,
             encoder_attention_heads=32,
             encoder_ffn_dim=5120,
             dropout=0.1,
             activation_function='gelu',
             attention_dropout=0.0,
             activation_dropout=0.0,
             max_position_embeddings=128,
             init_std=0.02,
             scale_embedding=True,
             normalize_before=True):
    super().__init__()
    self.init_std = init_std
    self.pad_token_id = pad_token_id
    if embed_tokens is not None:
        self.embed_tokens = embed_tokens
    else:
        self.embed_tokens = nn.Embedding(num_embeddings=vocab_size,
                                         embedding_dim=d_model,
                                         padding_idx=pad_token_id)
    self.embed_scale = math.sqrt(d_model) if scale_embedding else 1.0
    self.encoder_embed_positions = BlenderbotLearnedPositionalEmbedding(
        num_embeddings=max_position_embeddings, embedding_dim=d_model)
    self.encoder_dropout = nn.Dropout(dropout)
    self.encoder_layernorm = nn.LayerNorm(normalized_shape=d_model)
    encoder_layer = nn.TransformerEncoderLayer(
        d_model=d_model,
        nhead=encoder_attention_heads,
        dim_feedforward=encoder_ffn_dim,
        dropout=dropout,
        activation=activation_function,
        attn_dropout=attention_dropout,
        act_dropout=activation_dropout,
        normalize_before=normalize_before)
    self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer,
                                         num_layers=num_encoder_layers)
    self.apply(self.init_weights)
def __init__(self,
             vocab_size,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             normalize_before=True,
             max_position_embeddings=512,
             type_vocab_size=2,
             initializer_range=0.02,
             unk_token_id=0,
             pad_token_id=0,
             bos_token_id=1,
             eos_token_id=2,
             mask_token_id=30000,
             role_type_size=None):
    super(UnifiedTransformerModel, self).__init__()
    self.unk_token_id = unk_token_id
    self.pad_token_id = pad_token_id
    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id
    self.mask_token_id = mask_token_id
    self.initializer_range = initializer_range
    self.embeddings = UnifiedTransformerEmbeddings(
        vocab_size, hidden_size, hidden_dropout_prob,
        max_position_embeddings, type_vocab_size, role_type_size)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation=hidden_act,
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0,
        normalize_before=normalize_before)
    encoder_norm = nn.LayerNorm(hidden_size)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers,
                                         encoder_norm)
    self.apply(self.init_weights)
def __init__(self,
             vocab_size,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=512,
             type_vocab_size=16,
             initializer_range=0.02,
             pad_token_id=0,
             fit_size=768):
    super(TinyBertModel, self).__init__()
    self.pad_token_id = pad_token_id
    self.initializer_range = initializer_range
    self.embeddings = BertEmbeddings(vocab_size, hidden_size,
                                     hidden_dropout_prob,
                                     max_position_embeddings, type_vocab_size)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation=hidden_act,
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.pooler = BertPooler(hidden_size)
    # fit_dense(s) means a hidden states' transformation from student to teacher.
    # `fit_denses` is used in v2 model, and `fit_dense` is used in other pretraining models.
    self.fit_denses = nn.LayerList([
        nn.Linear(hidden_size, fit_size)
        for i in range(num_hidden_layers + 1)
    ])
    self.fit_dense = nn.Linear(hidden_size, fit_size)
    self.apply(self.init_weights)
def __init__(self,
             vocab_size,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             layer_norm_eps=1e-12,
             max_position_embeddings=512,
             max_2d_position_embeddings=1024,
             type_vocab_size=16,
             initializer_range=0.02,
             pad_token_id=0,
             pool_act="tanh"):
    super(LayoutLMModel, self).__init__()
    self.num_hidden_layers = num_hidden_layers
    self.pad_token_id = pad_token_id
    self.initializer_range = initializer_range
    self.embeddings = LayoutLMEmbeddings(
        vocab_size, hidden_size, hidden_dropout_prob,
        max_position_embeddings, max_2d_position_embeddings,
        layer_norm_eps, pad_token_id, type_vocab_size)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation=hidden_act,
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.pooler = LayoutLMPooler(hidden_size, pool_act)
    self.apply(self.init_weights)
def __init__(self,
             embed_tokens,
             vocab_size,
             pad_token_id=1,
             d_model=768,
             num_encoder_layers=6,
             encoder_attention_heads=12,
             encoder_ffn_dim=3072,
             dropout=0.1,
             activation_function='gelu',
             attention_dropout=0.1,
             activation_dropout=0.1,
             max_position_embeddings=1024,
             init_std=0.02):
    super().__init__()
    self.d_model = d_model
    self.init_std = init_std
    self.pad_token_id = pad_token_id
    if embed_tokens is not None:
        self.embed_tokens = embed_tokens
    else:
        self.embed_tokens = nn.Embedding(vocab_size, d_model, pad_token_id)
    self.encoder_embed_positions = MBartLearnedPositionalEmbedding(
        max_position_embeddings, d_model, pad_token_id)
    self.encoder_dropout = nn.Dropout(dropout)
    self.encoder_layernorm_embedding = nn.LayerNorm(d_model)
    encoder_layer = nn.TransformerEncoderLayer(
        d_model=d_model,
        nhead=encoder_attention_heads,
        dim_feedforward=encoder_ffn_dim,
        dropout=dropout,
        activation=activation_function,
        attn_dropout=attention_dropout,
        act_dropout=activation_dropout,
        normalize_before=True)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers,
                                         nn.LayerNorm(d_model))
    self.apply(self.init_weights)
def __init__(self, vocab_size, type_size, max_position_seq_len, num_layers,
             n_head, hidden_size, attn_dropout, act_dropout):
    super(NSP, self).__init__()
    self.n_head = n_head
    self.hidden_size = hidden_size
    self.word_embedding_layer = nn.Embedding(vocab_size, hidden_size)
    self.sent_embedding_layer = nn.Embedding(type_size, hidden_size)
    self.pos_embedding_layer = nn.Embedding(max_position_seq_len, hidden_size)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        n_head,
        hidden_size * 4,
        act_dropout,
        'gelu',
        attn_dropout,
        act_dropout,
        normalize_before=True)
    encoder_norm = nn.LayerNorm(hidden_size)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_layers,
                                         encoder_norm)
    self.fc1 = nn.Linear(hidden_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, 2)
    self.dropout_layer = nn.Dropout(act_dropout)
    self.tanh_layer = nn.Tanh()
    self.softmax = nn.Softmax()
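# --- Usage sketch (added; NSP's forward() is not shown, padding masks omitted)
# Word, sentence-type, and position embeddings are summed, encoded, and the
# first-token state is scored by the fc1 -> tanh -> dropout -> fc2 head.
# Assumes NSP subclasses paddle.nn.Layer as usual.
import paddle

model = NSP(vocab_size=1000, type_size=2, max_position_seq_len=128,
            num_layers=2, n_head=4, hidden_size=64,
            attn_dropout=0.1, act_dropout=0.1)
token_ids = paddle.randint(1, 1000, shape=[2, 16])
type_ids = paddle.zeros([2, 16], dtype='int64')
pos_ids = paddle.arange(16).unsqueeze(0)
emb = (model.word_embedding_layer(token_ids) +
       model.sent_embedding_layer(type_ids) +
       model.pos_embedding_layer(pos_ids))
hidden = model.encoder(emb)
cls_state = model.tanh_layer(model.fc1(hidden[:, 0]))
probs = model.softmax(model.fc2(model.dropout_layer(cls_state)))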
def __init__(self,
             src_vocab_size,
             trg_vocab_size,
             max_length=256,
             n_layer=6,
             n_head=8,
             d_model=512,
             d_inner_hid=2048,
             dropout=0.1,
             weight_sharing=False,
             bos_id=0,
             eos_id=1,
             waitk=-1):
    super(SimultaneousTransformer, self).__init__()
    self.trg_vocab_size = trg_vocab_size
    self.emb_dim = d_model
    self.bos_id = bos_id
    self.eos_id = eos_id
    self.dropout = dropout
    self.waitk = waitk
    self.n_layer = n_layer
    self.n_head = n_head
    self.d_model = d_model

    self.src_word_embedding = WordEmbedding(
        vocab_size=src_vocab_size, emb_dim=d_model, bos_id=self.bos_id)
    self.src_pos_embedding = PositionalEmbedding(
        emb_dim=d_model, max_length=max_length + 1)
    if weight_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )
        self.trg_word_embedding = self.src_word_embedding
        self.trg_pos_embedding = self.src_pos_embedding
    else:
        self.trg_word_embedding = WordEmbedding(
            vocab_size=trg_vocab_size, emb_dim=d_model, bos_id=self.bos_id)
        self.trg_pos_embedding = PositionalEmbedding(
            emb_dim=d_model, max_length=max_length + 1)

    encoder_layer = nn.TransformerEncoderLayer(
        d_model=d_model,
        nhead=n_head,
        dim_feedforward=d_inner_hid,
        dropout=dropout,
        activation='relu',
        normalize_before=True,
        bias_attr=[False, True])
    encoder_norm = nn.LayerNorm(d_model)
    self.encoder = nn.TransformerEncoder(
        encoder_layer=encoder_layer, num_layers=n_layer, norm=encoder_norm)

    decoder_layer = DecoderLayer(
        d_model=d_model,
        nhead=n_head,
        dim_feedforward=d_inner_hid,
        dropout=dropout,
        activation='relu',
        normalize_before=True,
        bias_attr=[False, False, True])
    decoder_norm = nn.LayerNorm(d_model)
    self.decoder = Decoder(
        decoder_layer=decoder_layer, num_layers=n_layer, norm=decoder_norm)

    if weight_sharing:
        self.linear = lambda x: paddle.matmul(
            x=x,
            y=self.trg_word_embedding.word_embedding.weight,
            transpose_y=True)
    else:
        self.linear = nn.Linear(
            in_features=d_model,
            out_features=trg_vocab_size,
            bias_attr=False)
def __init__(self,
             user_size,
             adgroup_size,
             pid_size,
             cms_segid_size,
             cms_group_size,
             final_gender_size,
             age_level_size,
             pvalue_level_size,
             shopping_level_size,
             occupation_size,
             new_user_class_level_size,
             campaign_size,
             customer_size,
             cate_size,
             brand_size,  # above are the sparse feature vocabulary sizes
             sparse_embed_size=4,
             att_embedding_size=8,
             sess_count=5,
             sess_max_length=10,
             l2_reg_embedding=1e-6):
    super().__init__()
    # feature sizes
    self.user_size = user_size
    self.adgroup_size = adgroup_size
    self.pid_size = pid_size
    self.cms_segid_size = cms_segid_size
    self.cms_group_size = cms_group_size
    self.final_gender_size = final_gender_size
    self.age_level_size = age_level_size
    self.pvalue_level_size = pvalue_level_size
    self.shopping_level_size = shopping_level_size
    self.occupation_size = occupation_size
    self.new_user_class_level_size = new_user_class_level_size
    self.campaign_size = campaign_size
    self.customer_size = customer_size
    self.cate_size = cate_size
    self.brand_size = brand_size
    # sparse embedding size
    self.sparse_embed_size = sparse_embed_size
    # transformer attention embedding size
    self.att_embedding_size = att_embedding_size
    # hyper-parameters (taken from the constructor arguments)
    self.sess_count = sess_count
    self.sess_max_length = sess_max_length

    # sparse embedding layers
    self.userid_embeddings_var = paddle.nn.Embedding(
        self.user_size,
        self.sparse_embed_size,
        sparse=True,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.adgroup_embeddings_var = paddle.nn.Embedding(
        self.adgroup_size,
        self.sparse_embed_size,
        sparse=True,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.pid_embeddings_var = paddle.nn.Embedding(
        self.pid_size,
        self.sparse_embed_size,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.cmsid_embeddings_var = paddle.nn.Embedding(
        self.cms_segid_size,
        self.sparse_embed_size,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.cmsgroup_embeddings_var = paddle.nn.Embedding(
        self.cms_group_size,
        self.sparse_embed_size,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.gender_embeddings_var = paddle.nn.Embedding(
        self.final_gender_size,
        self.sparse_embed_size,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.age_embeddings_var = paddle.nn.Embedding(
        self.age_level_size,
        self.sparse_embed_size,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.pvalue_embeddings_var = paddle.nn.Embedding(
        self.pvalue_level_size,
        self.sparse_embed_size,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.shopping_embeddings_var = paddle.nn.Embedding(
        self.shopping_level_size,
        self.sparse_embed_size,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.occupation_embeddings_var = paddle.nn.Embedding(
        self.occupation_size,
        self.sparse_embed_size,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.new_user_class_level_embeddings_var = paddle.nn.Embedding(
        self.new_user_class_level_size,
        self.sparse_embed_size,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.campaign_embeddings_var = paddle.nn.Embedding(
        self.campaign_size,
        self.sparse_embed_size,
        sparse=True,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.customer_embeddings_var = paddle.nn.Embedding(
        self.customer_size,
        self.sparse_embed_size,
        sparse=True,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.cate_embeddings_var = paddle.nn.Embedding(
        self.cate_size,
        self.sparse_embed_size,
        sparse=True,
        padding_idx=0,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))
    self.brand_embeddings_var = paddle.nn.Embedding(
        self.brand_size,
        self.sparse_embed_size,
        sparse=True,
        padding_idx=0,
        weight_attr=paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(l2_reg_embedding),
            initializer=nn.initializer.Normal(mean=0.0, std=0.0001)))

    # session interest extractor layer
    self.position_encoding = PositionalEncoder(2 * self.sparse_embed_size)
    self.transform = nn.TransformerEncoderLayer(
        d_model=self.att_embedding_size,
        nhead=8,
        dim_feedforward=64,
        weight_attr=self._get_weight_attr(),
        bias_attr=False,
        dropout=0.0)
    # session interest interacting layer
    self.bilstm = nn.LSTM(2 * self.sparse_embed_size,
                          2 * self.sparse_embed_size,
                          num_layers=2,
                          direction='bidirectional')
    # session interest activating layer
    self.transform_actpool = AttentionSequencePoolingLayer(
        weight_normalization=True, name='transform')
    self.lstm_actpool = AttentionSequencePoolingLayer(
        weight_normalization=True, name='lstm')
    # MLP module
    self.mlp = MLP(mlp_hidden_units=[77, 200, 80])