def __init__(self, d_model, n_head, d_k=64, d_v=64, res_dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    self.w_qs = nn.ModuleList(
        [Linear(d_model, d_k, bias=False) for _ in range(n_head)])
    self.w_ks = nn.ModuleList(
        [Linear(d_model, d_k, bias=False) for _ in range(n_head)])
    self.w_vs = nn.ModuleList(
        [Linear(d_model, d_v, bias=False) for _ in range(n_head)])
    self.attention = ScaledDotProductAttention(d_model)
    self.layer_norm = LayerNormalization(d_model)
    self.proj = Linear(n_head * d_v, d_model)
    self.dropout = nn.Dropout(res_dropout)
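# ScaledDotProductAttention is referenced throughout these snippets but not defined
# here. Below is a minimal, self-contained sketch of the usual formulation,
# softmax(Q K^T / sqrt(d)) V with optional masking and attention dropout. The
# constructor argument mirrors the ScaledDotProductAttention(d_model) calls above,
# but the exact class used by these repositories may differ.
import torch
import torch.nn as nn


class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_model, attn_dropout=0.1):
        super(ScaledDotProductAttention, self).__init__()
        self.temper = d_model ** 0.5          # scaling factor sqrt(d_model)
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, attn_mask=None):
        # q, k, v: (batch, len, dim); attention scores: (batch, len_q, len_k)
        attn = torch.bmm(q, k.transpose(1, 2)) / self.temper
        if attn_mask is not None:
            # Positions marked True (e.g. padding) must not receive attention.
            attn = attn.masked_fill(attn_mask, -float('inf'))
        attn = self.dropout(self.softmax(attn))
        output = torch.bmm(attn, v)
        return output, attn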
def __init__(
        self, n_src_vocab, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8,
        d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64,
        dropout=0.1, proj_share_weight=True, embs_share_weight=True):
    super(Transformer, self).__init__()
    self.encoder = Encoder(
        n_src_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
        d_word_vec=d_word_vec, d_model=d_model,
        d_inner_hid=d_inner_hid, dropout=dropout)
    self.decoder = Decoder(
        n_tgt_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
        d_word_vec=d_word_vec, d_model=d_model,
        d_inner_hid=d_inner_hid, dropout=dropout)
    self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
    self.dropout = nn.Dropout(dropout)
    self.padding_bottleneck = PaddingBottleneck()
    # We will store the padding tensor here to find it after a call to forward:
    self.padding = None
    self.padding_amount = self.padding_bottleneck.padding_amount

    assert d_model == d_word_vec, \
        'To facilitate the residual connections, ' \
        'the dimensions of all module outputs shall be the same.'

    if proj_share_weight:
        # Share the weight matrix between tgt word embedding/projection
        assert d_model == d_word_vec
        self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

    if embs_share_weight:
        # Share the weight matrix between src/tgt word embeddings
        # assume the src/tgt word vec sizes are the same
        assert n_src_vocab == n_tgt_vocab, \
            'To share word embedding table, the vocabulary size of src/tgt shall be the same.'
        self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
def __init__(
        self, n_src_vocab, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8,
        d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64,
        dropout=0.1, proj_share_weight=True, embs_share_weight=True):
    super(Transformer, self).__init__()
    self.encoder = Encoder(
        n_src_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
        d_word_vec=d_word_vec, d_model=d_model,
        d_inner_hid=d_inner_hid, dropout=dropout)
    self.decoder = Decoder(
        n_tgt_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
        d_word_vec=d_word_vec, d_model=d_model,
        d_inner_hid=d_inner_hid, dropout=dropout)
    self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
    self.dropout = nn.Dropout(dropout)

    if proj_share_weight:
        # Share the weight matrix between tgt word embedding/projection
        assert d_model == d_word_vec
        self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

    if embs_share_weight:
        # Share the weight matrix between src/tgt word embeddings
        # assume the src/tgt word vec sizes are the same
        assert n_src_vocab == n_tgt_vocab, \
            'To share word embedding table, the vocabulary size of src/tgt shall be the same.'
        self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
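# The proj_share_weight / embs_share_weight branches above tie weights simply by
# assigning one module's nn.Parameter to another. A tiny standalone check with
# made-up sizes (independent of the Encoder/Decoder classes used above) showing
# that the embedding and the output projection really share storage afterwards:
import torch
import torch.nn as nn

emb = nn.Embedding(1000, 512)              # stands in for decoder.tgt_word_emb
proj = nn.Linear(512, 1000, bias=False)    # stands in for tgt_word_proj
proj.weight = emb.weight                   # same Parameter object from now on

assert proj.weight is emb.weight
assert proj.weight.data_ptr() == emb.weight.data_ptr()

# Gradients accumulate into the single shared tensor as well.
loss = proj(emb(torch.tensor([1, 2, 3]))).sum()
loss.backward()
print(emb.weight.grad.shape)               # torch.Size([1000, 512])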
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    """Initialize the multi-head attention module.

    Arguments:
        n_head {int} -- number of attention heads
        d_model {int} -- total model dimension
        d_k {int} -- per-head dimension of the queries and keys
        d_v {int} -- per-head dimension of the values
    """
    super(MultiHeadAttention, self).__init__()
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))

    self.attention = ScaledDotProductAttention(d_model)
    self.layer_norm = LayerNormalization(d_model)
    self.proj = Linear(n_head * d_v, d_model)
    self.dropout = nn.Dropout(dropout)

    init.xavier_normal_(self.w_qs)
    init.xavier_normal_(self.w_ks)
    init.xavier_normal_(self.w_vs)
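# LayerNormalization in these snippets is a custom module; torch.nn.LayerNorm now
# provides equivalent functionality. A minimal sketch of what such a layer
# typically looks like -- a learnable gain/bias around a per-feature
# standardisation -- is given below; the actual class in these repositories may
# differ in naming and epsilon.
import torch
import torch.nn as nn


class LayerNormalization(nn.Module):
    def __init__(self, d_hid, eps=1e-6):
        super(LayerNormalization, self).__init__()
        self.eps = eps
        self.gain = nn.Parameter(torch.ones(d_hid))
        self.bias = nn.Parameter(torch.zeros(d_hid))

    def forward(self, z):
        # Normalise over the last (feature) dimension, then rescale and shift.
        mu = z.mean(dim=-1, keepdim=True)
        sigma = z.std(dim=-1, keepdim=True)
        return self.gain * (z - mu) / (sigma + self.eps) + self.bias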
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1, enc_output=None):
    '''
    :param n_head: number of attention heads
    :param d_model: model (input/output) dimension
    :param d_k: per-head dimension of the queries and keys
    :param d_v: per-head dimension of the values
    :param dropout: dropout rate applied to the output projection
    '''
    super().__init__()
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))

    self.attention = ScaledDotProductAttention()
    self.layer_norm = LayerNormalization(d_model)
    self.proj = Linear(n_head * d_v, d_model)
    self.dropout = nn.Dropout(dropout)

    init.xavier_normal_(self.w_qs)
    init.xavier_normal_(self.w_ks)
    init.xavier_normal_(self.w_vs)
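# With w_qs/w_ks/w_vs stored as (n_head, d_model, d_k|d_v) parameter tensors, the
# per-head projections can be applied with a single batched matrix multiply rather
# than a Python loop over heads. A hedged sketch of a forward pass built that way
# is shown below, meant to pair with the __init__ above; the variable names and
# the exact masking/residual handling are assumptions, not necessarily what the
# surrounding repositories do.
def forward(self, q, k, v, attn_mask=None):
    d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
    residual = q
    mb_size, len_q, d_model = q.size()
    len_k, len_v = k.size(1), v.size(1)

    # Treat the heads as the batch dimension: n_head x (mb_size*len) x d_model
    q_s = q.repeat(n_head, 1, 1).view(n_head, -1, d_model)
    k_s = k.repeat(n_head, 1, 1).view(n_head, -1, d_model)
    v_s = v.repeat(n_head, 1, 1).view(n_head, -1, d_model)

    # One bmm projects every head at once: (n_head*mb_size) x len x d_k|d_v
    q_s = torch.bmm(q_s, self.w_qs).view(-1, len_q, d_k)
    k_s = torch.bmm(k_s, self.w_ks).view(-1, len_k, d_k)
    v_s = torch.bmm(v_s, self.w_vs).view(-1, len_v, d_v)

    # Scaled dot-product attention over the (n_head*mb_size) "batch"
    mask = attn_mask.repeat(n_head, 1, 1) if attn_mask is not None else None
    outputs, attns = self.attention(q_s, k_s, v_s, attn_mask=mask)

    # Concatenate the heads back: mb_size x len_q x (n_head*d_v)
    outputs = torch.cat(torch.split(outputs, mb_size, dim=0), dim=-1)

    # Project, apply residual dropout, add the residual and normalise.
    outputs = self.dropout(self.proj(outputs))
    return self.layer_norm(outputs + residual), attns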
def __init__(self, lda_mat, n_src_dim, encoder_max_len, d_model=256,
             dropout=0.1, contexts=[[0]]):
    super(EncoderTest, self).__init__()
    self.d_model = d_model
    self.dropout = nn.Dropout(dropout)

    self.trans_pos_enc = nn.Embedding(encoder_max_len, d_model,
                                      padding_idx=constants.PAD)
    self.trans_pos_enc.weight.data = position_encoding_init(
        encoder_max_len, d_model)
    self.trans_pos_enc.weight.requires_grad = False

    # project the source to dim of model
    lda_concat_index = [-2, -1, 0, 1, 2]
    self.concat = ConcatLayer(lda_concat_index)
    self.lda_layer = LDALayer(lda_mat)
    self.src_projection = Linear(n_src_dim * len(lda_concat_index),
                                 d_model, bias=False)
    # self.tdnn0 = TDNNLayer(n_src_dim * len(lda_concat_index), d_model,
    #                        contexts[0], dropout=dropout)

    self.tdnn_stack = nn.ModuleList([
        TDNNLayer(d_model, d_model, context, dropout=dropout)
        for context in contexts
    ])
def __init__(self, n_tgt_vocab, n_max_seq, n_layers=4, n_head=8,
             d_word_vec=64, d_model=64, d_inner_hid=200, dropout=0.1):
    super(Decoder, self).__init__()
    n_position = n_max_seq + 1
    self.n_max_seq = n_max_seq
    self.d_model = d_model

    self.position_enc = nn.Embedding(n_position, d_word_vec,
                                     padding_idx=Constants.PAD)
    self.position_enc.weight.data = position_encoding_init(
        n_position, d_word_vec)

    self.tgt_word_emb = nn.Embedding(n_tgt_vocab, d_word_vec,
                                     padding_idx=Constants.PAD)
    self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
    self.dropout = nn.Dropout(dropout)

    self.layer_stack = nn.ModuleList([
        DecoderLayer(d_model, d_inner_hid, n_head)
        for _ in range(n_layers)
    ])
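# position_encoding_init is used throughout these snippets to pre-fill (and often
# freeze) the position-embedding tables. A common sinusoidal implementation,
# following the "Attention Is All You Need" formulation with row 0 reserved for
# the PAD index, is sketched here; the exact helper in these repositories may
# differ slightly.
import numpy as np
import torch


def position_encoding_init(n_position, d_pos_vec):
    """Return an (n_position, d_pos_vec) table of sinusoidal position encodings."""
    position_enc = np.array([
        [pos / np.power(10000, 2 * (j // 2) / d_pos_vec) for j in range(d_pos_vec)]
        if pos != 0 else np.zeros(d_pos_vec)
        for pos in range(n_position)])
    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # even dims: sine
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # odd dims: cosine
    return torch.from_numpy(position_enc).float()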
def __init__(self, n_src_vocab, n_tgt_vocab, n_max_seq, emb_path=None,
             n_layers=6, n_head=6, d_word_vec=300, d_model=300,
             d_inner_hid=500, d_k=50, d_v=50, dropout=0.1,
             proj_share_weight=True, embs_share_weight=True):
    super(Transformer, self).__init__()
    self.encoder = Encoder(n_src_vocab, n_max_seq, emb_path=emb_path,
                           n_layers=n_layers, n_head=n_head,
                           d_word_vec=d_word_vec, d_model=d_model,
                           d_k=d_k, d_v=d_v,
                           d_inner_hid=d_inner_hid, dropout=dropout)
    self.decoder = Decoder(n_tgt_vocab, n_max_seq, emb_path=emb_path,
                           n_layers=n_layers, n_head=n_head,
                           d_word_vec=d_word_vec, d_model=d_model,
                           d_k=d_k, d_v=d_v,
                           d_inner_hid=d_inner_hid, dropout=dropout)
    # Projection from d_model to n_tgt_vocab that produces the final predictions.
    self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
    self.dropout = nn.Dropout(dropout)

    assert d_model == d_word_vec, \
        'To facilitate the residual connections, ' \
        'the dimensions of all module outputs shall be the same.'

    if proj_share_weight:
        # Share the weight matrix between tgt word embedding/projection
        assert d_model == d_word_vec
        self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

    if embs_share_weight:
        # Share the weight matrix between src/tgt word embeddings
        # assume the src/tgt word vec sizes are the same
        assert n_src_vocab == n_tgt_vocab, \
            'To share word embedding table, the vocabulary size of src/tgt shall be the same.'
        self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
def __init__(self, n_tgt_vocab, decoder_max_len, n_layers=2, n_head=3,
             sub_sequence=(-1, 1), d_k=64, d_v=64, en_d_model=256,
             de_d_model=128, d_inner_hid=128, dropout=0.1):
    super(Decoder, self).__init__()
    self.sub = sub_sequence
    self.en_d_model = en_d_model
    self.de_d_model = de_d_model
    self.dropout = nn.Dropout(dropout)

    self.position_enc = nn.Embedding(decoder_max_len, de_d_model,
                                     padding_idx=constants.PAD)
    self.position_enc.weight.data = position_encoding_init(
        decoder_max_len, de_d_model)
    self.position_enc.weight.requires_grad = False

    self.tgt_word_emb = nn.Embedding(n_tgt_vocab, de_d_model,
                                     padding_idx=constants.PAD)
    self.tgt_word_proj = Linear(de_d_model, n_tgt_vocab, bias=False)

    self.layer_stack = nn.ModuleList([
        DecoderLayer(de_d_model, d_inner_hid, n_head, d_k, d_v,
                     dropout=dropout)
        for _ in range(n_layers)
    ])

    # project the encoder output to dim of decoder
    self.enc_dec_projection = Linear(en_d_model, de_d_model, bias=False)
def __init__(self, n_src_dim, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8,
             d_model=512, d_inner_hid=1024, d_k=64, d_v=64, dropout=0.1,
             proj_share_weight=True, embs_share_weight=True):
    super(Transformer, self).__init__()
    self.encoder = Encoder(n_max_seq, n_layers=n_layers, n_head=n_head,
                           d_model=d_model, d_inner_hid=d_inner_hid,
                           dropout=dropout)
    self.decoder = Decoder(n_max_seq, n_layers=n_layers, n_head=n_head,
                           d_model=d_model, d_inner_hid=d_inner_hid,
                           dropout=dropout)
    # project the source to dim of model
    self.src_projection = Linear(n_src_dim, d_model, bias=False)
    self.tgt_word_emb = nn.Embedding(n_tgt_vocab, d_model,
                                     padding_idx=Constants.PAD)
    self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
    self.dropout = nn.Dropout(dropout)
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))

    self.attention = ScaledDotProductAttention(d_model)
    self.layer_norm = LayerNormalization(d_model)
    self.proj = Linear(n_head * d_v, d_model)
    self.dropout = nn.Dropout(dropout)

    init.xavier_normal_(self.w_qs)
    init.xavier_normal_(self.w_ks)
    init.xavier_normal_(self.w_vs)
def __init__(self, n_src_dim, encoder_max_len, n_layers=2, n_head=3,
             sub_sequence=(-1, 1), d_k=64, d_v=64, d_model=256,
             d_inner_hid=256, dropout=0.1):
    super(Encoder, self).__init__()
    self.sub = sub_sequence
    self.d_model = d_model
    self.dropout = nn.Dropout(dropout)

    self.position_enc = nn.Embedding(encoder_max_len, d_model,
                                     padding_idx=constants.PAD)
    self.position_enc.weight.data = position_encoding_init(
        encoder_max_len, d_model)
    self.position_enc.weight.requires_grad = False

    self.trans_pos_enc = nn.Embedding(encoder_max_len, d_model,
                                      padding_idx=constants.PAD)
    self.trans_pos_enc.weight.data = position_encoding_init(
        encoder_max_len, d_model)
    self.trans_pos_enc.weight.requires_grad = False

    # project the source to dim of model
    self.src_projection = Linear(n_src_dim, d_model, bias=False)

    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner_hid, n_head, d_k, d_v,
                     dropout=dropout)
        for _ in range(n_layers)
    ])
def __init__(self, user_size, kernel_size=3, n_layers=1, n_head=1, d_k=32,
             d_v=32, d_word_vec=32, d_model=32, d_inner_hid=32, dropout=0.1,
             finit=0):
    super(Decoder, self).__init__()
    self.d_model = d_model
    self.user_size = user_size

    self.user_emb = nn.Embedding(user_size, d_word_vec,
                                 padding_idx=Constants.PAD)
    self.tgt_user_proj = Linear(d_model, user_size, bias=False)
    self.dropout = nn.Dropout(dropout)
    self.conv = nn.Conv1d(d_model, user_size, kernel_size,
                          padding=kernel_size - 1, bias=True)
    self.padding = kernel_size - 1
    self.finit = finit

    self.layer_stack = nn.ModuleList([
        DecoderLayer(d_model, d_inner_hid, n_head, d_k, d_v,
                     dropout=dropout)
        for _ in range(n_layers)
    ])
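# In the decoder above, nn.Conv1d is built with padding=kernel_size - 1 and the
# same amount is remembered in self.padding. The usual reason for this pattern is
# to keep the convolution causal: pad both ends, then drop the trailing
# kernel_size - 1 frames so no output position sees future inputs. A small
# standalone check of that arithmetic (sizes are made up):
import torch
import torch.nn as nn

kernel_size, d_model, user_size = 3, 32, 100
conv = nn.Conv1d(d_model, user_size, kernel_size, padding=kernel_size - 1)

x = torch.randn(4, d_model, 10)                 # (batch, channels, seq_len)
y = conv(x)                                     # length 10 + 2*(k-1) - (k-1) = 12
y = y[:, :, :-(kernel_size - 1)]                # trim look-ahead -> length 10 again
assert y.size(-1) == x.size(-1)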
def __init__(self, n_src_vocab, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8,
             d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64,
             dropout=0.1, proj_share_weight=True, embs_share_weight=True,
             use_ctx=False):
    super(Transformer, self).__init__()
    self.use_ctx = use_ctx
    self.encoder = Encoder(n_src_vocab, n_max_seq, n_layers=n_layers,
                           n_head=n_head, d_word_vec=d_word_vec,
                           d_model=d_model, d_inner_hid=d_inner_hid,
                           dropout=dropout)
    if use_ctx:
        self.encoder_ctx = Encoder(n_src_vocab, n_max_seq, n_layers=n_layers,
                                   n_head=n_head, d_word_vec=d_word_vec,
                                   d_model=d_model, d_inner_hid=d_inner_hid,
                                   dropout=dropout)
        # Share the word embeddings between the src encoder and the ctx encoder
        self.encoder_ctx.src_word_emb.weight = self.encoder.src_word_emb.weight
    self.decoder = Decoder(n_tgt_vocab, n_max_seq, n_layers=n_layers,
                           n_head=n_head, d_word_vec=d_word_vec,
                           d_model=d_model, d_inner_hid=d_inner_hid,
                           dropout=dropout, use_ctx=use_ctx)
    self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
    self.dropout = nn.Dropout(dropout)

    assert d_model == d_word_vec, \
        'To facilitate the residual connections, the dimensions of all module outputs shall be the same.'

    if proj_share_weight:
        # Share the weight matrix between tgt word embedding/projection
        assert d_model == d_word_vec
        self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

    if embs_share_weight:
        # Share the weight matrix between src/tgt word embeddings
        # assume the src/tgt word vec sizes are the same
        assert n_src_vocab == n_tgt_vocab, \
            'To share word embedding table, the vocabulary size of src/tgt shall be the same.'
        self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight