def __init__(self, d_model, d_ff, dropout=0.1):
    super(PoswiseFeedForwardNet, self).__init__()
    self.relu = nn.ReLU()
    # kernel_size=1 convolutions act as position-wise linear layers
    self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
    self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = LayerNormalization(d_model)
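# A minimal sketch of the corresponding forward pass, assuming the standard
# residual + layer-norm arrangement; the actual method in the repository may
# differ in detail. nn.Conv1d expects (batch, channels, seq_len), hence the
# transposes around the two convolutions.
def forward(self, inputs):
    # inputs: (batch_size, seq_len, d_model)
    residual = inputs
    output = self.relu(self.conv1(inputs.transpose(1, 2)))
    output = self.conv2(output).transpose(1, 2)
    output = self.dropout(output)
    return self.layer_norm(residual + output)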
def __init__(self, d_k, d_v, d_model, n_heads, dropout):
    super(MultiHeadAttention, self).__init__()
    self.n_heads = n_heads
    self.multihead_attn = _MultiHeadAttention(d_k, d_v, d_model, n_heads, dropout)
    # project the concatenated heads (n_heads * d_v) back to d_model
    self.proj = Linear(n_heads * d_v, d_model)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = LayerNormalization(d_model)
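# Hedged sketch of the wrapper's forward pass: _MultiHeadAttention is assumed
# to return the concatenated per-head context plus the attention weights (its
# exact signature is not shown above); the context is projected back to d_model
# and combined with a residual connection and layer normalization.
def forward(self, q, k, v, attn_mask=None):
    residual = q
    context, attn = self.multihead_attn(q, k, v, attn_mask=attn_mask)
    output = self.dropout(self.proj(context))
    return self.layer_norm(residual + output), attn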
def __init__(self, d_k, d_v, d_model, d_ff, n_branches, dropout):
    super(MultiBranchAttention, self).__init__()
    self.d_k = d_k
    self.d_v = d_v
    self.d_model = d_model
    self.d_ff = d_ff
    self.n_branches = n_branches

    self.multihead_attn = _MultiHeadAttention(d_k, d_v, d_model, n_branches, dropout)
    # additional parameters for BranchedAttention
    self.w_o = nn.ModuleList([Linear(d_v, d_model) for _ in range(n_branches)])
    # learned per-branch weights, normalized to sum to one at initialization
    self.w_kp = torch.rand(n_branches)
    self.w_kp = nn.Parameter(self.w_kp / self.w_kp.sum())
    self.w_a = torch.rand(n_branches)
    self.w_a = nn.Parameter(self.w_a / self.w_a.sum())
    self.pos_ffn = nn.ModuleList([
        PoswiseFeedForwardNet(d_model, d_ff // n_branches, dropout)
        for _ in range(n_branches)
    ])
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = LayerNormalization(d_model)

    # xavier_normal cannot be applied to an nn.ModuleList; initialize the
    # weight matrices of each per-branch projection instead
    for proj in self.w_o:
        for p in proj.parameters():
            if p.dim() > 1:
                nn.init.xavier_normal_(p)
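# Hedged sketch of a branched forward pass in the spirit of the Weighted
# Transformer (Ahmed et al.): each branch gets its own output projection and
# feed-forward block, and the learned weights w_kp / w_a scale the branches
# before they are summed. Names, ordering, and the split of the shared
# attention output are assumptions, not the repository's exact code.
def forward(self, q, k, v, attn_mask=None):
    residual = q
    context, attn = self.multihead_attn(q, k, v, attn_mask=attn_mask)
    # split the concatenated heads into one (batch, seq_len, d_v) chunk per branch
    branches = context.split(self.d_v, dim=-1)
    # project each branch to d_model, scaled by the concatenation weights w_kp
    branches = [self.w_kp[i] * self.w_o[i](b) for i, b in enumerate(branches)]
    # per-branch position-wise FFN, scaled by the addition weights w_a
    branches = [self.w_a[i] * self.pos_ffn[i](b) for i, b in enumerate(branches)]
    output = self.dropout(torch.stack(branches).sum(dim=0))
    return self.layer_norm(residual + output), attn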
def __init__(self, d_k, d_v, d_model, d_ff, n_heads, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.dec_self_attn = MultiHeadAttention(d_k, d_v, d_model, n_heads, dropout)
    self.dec_enc_attn = MultiHeadAttention(d_k, d_v, d_model, n_heads, dropout)
    self.pos_ffn = PoswiseFeedForwardNet(d_model, d_ff, dropout)
    self.layer_norm = LayerNormalization(d_model)
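# Hedged sketch of the decoder layer's forward pass: masked self-attention over
# the target, attention over the encoder outputs, then the position-wise FFN.
# The mask argument names are assumptions; the attention wrappers above already
# apply their own residual connection and layer norm internally.
def forward(self, dec_inputs, enc_outputs, self_attn_mask=None, dec_enc_attn_mask=None):
    dec_outputs, dec_self_attn = self.dec_self_attn(
        dec_inputs, dec_inputs, dec_inputs, attn_mask=self_attn_mask)
    dec_outputs, dec_enc_attn = self.dec_enc_attn(
        dec_outputs, enc_outputs, enc_outputs, attn_mask=dec_enc_attn_mask)
    dec_outputs = self.pos_ffn(dec_outputs)
    return dec_outputs, dec_self_attn, dec_enc_attn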
def __init__(self, d_hidden=2048, d_model=512, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.conv1 = nn.Conv1d(d_model, d_hidden, 1)
    self.conv2 = nn.Conv1d(d_hidden, d_model, 1)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = LayerNormalization(d_model)
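# Why kernel_size=1 convolutions: over a (batch, channels, seq_len) layout a
# 1x1 Conv1d applies the same weight matrix independently at every position,
# i.e. it is equivalent to a position-wise nn.Linear. A small self-contained
# check (illustrative shapes only, not part of the repository):
import torch
import torch.nn as nn

x = torch.randn(2, 512, 10)                      # (batch, d_model, seq_len)
conv = nn.Conv1d(512, 2048, kernel_size=1)
lin = nn.Linear(512, 2048)
with torch.no_grad():
    lin.weight.copy_(conv.weight.squeeze(-1))    # (2048, 512, 1) -> (2048, 512)
    lin.bias.copy_(conv.bias)
assert torch.allclose(conv(x), lin(x.transpose(1, 2)).transpose(1, 2), atol=1e-5)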
def __init__(self, n_heads, d_k, d_v, d_model, dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    assert n_heads * d_k == d_model, ('`n_heads` * `d_k` != `d_model`'
                                      ' ({} x {} != {})'.format(n_heads, d_k, d_model))
    self.n_heads = n_heads
    self.d_k = d_k
    self.d_v = d_v
    # one projection matrix per head: (n_heads, d_model, d_k) and (n_heads, d_model, d_v)
    self.w_q = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_k))
    self.w_k = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_k))
    self.w_v = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_v))

    self.attn = ScaledDotProductAttention(d_k, attn_dropout=dropout)
    self.proj = nn.Linear(n_heads * d_v, d_model)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = LayerNormalization(d_model)

    nn.init.xavier_normal_(self.w_q)
    nn.init.xavier_normal_(self.w_k)
    nn.init.xavier_normal_(self.w_v)
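# The (n_heads, d_model, d_k) parameter layout above stores one projection
# matrix per head. A minimal self-contained illustration of that per-head
# projection (the einsum spelling and names are illustrative, not the
# repository's code):
import torch

n_heads, d_model, d_k = 8, 512, 64
batch_size, seq_len = 2, 10
w_q = torch.randn(n_heads, d_model, d_k)
x = torch.randn(batch_size, seq_len, d_model)
# (batch, seq, d_model) x (heads, d_model, d_k) -> (batch, heads, seq, d_k)
q_per_head = torch.einsum('bsd,hdk->bhsk', x, w_q)
print(q_per_head.shape)                          # torch.Size([2, 8, 10, 64])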
def __init__(self, opt):
    super(Transformer, self).__init__()
    self.encoder = Encoder(opt.n_layers, opt.d_k, opt.d_v, opt.d_model, opt.d_ff,
                           opt.n_heads, opt.max_src_seq_len, opt.src_vocab_size,
                           opt.dropout, opt.weighted_model)
    self.decoder = Decoder(opt.n_layers, opt.d_k, opt.d_v, opt.d_model, opt.d_ff,
                           opt.n_heads, opt.max_tgt_seq_len, opt.tgt_vocab_size,
                           opt.dropout, opt.weighted_model)
    self.norm = LayerNormalization(opt.d_model)
    self.tgt_proj = Linear(opt.d_model, opt.tgt_vocab_size)
    self.weighted_model = opt.weighted_model

    if opt.share_proj_weight:
        print('Sharing target embedding and projection..')
        self.tgt_proj.weight = self.decoder.tgt_emb.weight

    if opt.share_embs_weight:
        print('Sharing source and target embedding..')
        assert opt.src_vocab_size == opt.tgt_vocab_size, \
            'To share word embeddings, the vocabulary size of src/tgt should be the same'
        self.encoder.src_emb.weight = self.decoder.tgt_emb.weight
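# Hedged usage sketch: the constructor only reads the `opt` attributes visible
# above, so any object exposing those fields works. The values below are
# illustrative (512/2048/8/6 follow the well-known base Transformer
# configuration); sharing embeddings requires equal source/target vocab sizes.
from types import SimpleNamespace

opt = SimpleNamespace(
    n_layers=6, d_k=64, d_v=64, d_model=512, d_ff=2048, n_heads=8,
    max_src_seq_len=100, max_tgt_seq_len=100,
    src_vocab_size=32000, tgt_vocab_size=32000,
    dropout=0.1, weighted_model=False,
    share_proj_weight=True, share_embs_weight=True,
)
model = Transformer(opt)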