def __init__(self, input_size, n_heads, drop_rate):
    super(TransformerBlock, self).__init__()
    # multi-head attention
    self.attentionMH = MultiHeadedAttention(n_heads, input_size, drop_rate)
    # layer normalization
    self.norm1 = LayerNormalization(input_size)
    self.norm2 = LayerNormalization(input_size)
    # position-wise feed-forward layer
    self.layer_ff = PositionwiseFeedForward(input_size, input_size * 4, input_size, drop_rate)
    self.drop = torch.nn.Dropout(drop_rate)
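The constructor only builds the sub-modules; the way they are wired together in the forward pass is not shown above. The following is a minimal, self-contained sketch of the same encoder-block wiring (self-attention, residual connection, post-layer-norm, position-wise feed-forward), using stock torch.nn modules in place of the MultiHeadedAttention, LayerNormalization and PositionwiseFeedForward classes defined elsewhere in this code; the class name, the GELU activation and the call signatures here are assumptions for illustration, not the original implementation.

import torch

class TransformerBlockSketch(torch.nn.Module):
    """Encoder block: self-attention and feed-forward, each with residual + layer norm."""
    def __init__(self, input_size, n_heads, drop_rate):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(input_size, n_heads,
                                                dropout=drop_rate, batch_first=True)
        self.norm1 = torch.nn.LayerNorm(input_size)
        self.norm2 = torch.nn.LayerNorm(input_size)
        # position-wise feed-forward with the same 4x inner expansion as above
        self.ff = torch.nn.Sequential(
            torch.nn.Linear(input_size, input_size * 4),
            torch.nn.GELU(),
            torch.nn.Linear(input_size * 4, input_size),
        )
        self.drop = torch.nn.Dropout(drop_rate)

    def forward(self, x, key_padding_mask=None):
        # self-attention sub-layer with residual connection and post-layer-norm
        attn_out, _ = self.attn(x, x, x, key_padding_mask=key_padding_mask)
        x = self.norm1(x + self.drop(attn_out))
        # feed-forward sub-layer with residual connection and post-layer-norm
        x = self.norm2(x + self.drop(self.ff(x)))
        return x

# usage: a batch of 2 sequences, length 10, hidden size 64
block = TransformerBlockSketch(input_size=64, n_heads=8, drop_rate=0.1)
out = block(torch.randn(2, 10, 64))   # -> shape (2, 10, 64)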
def __init__(self, input_size, n_heads, drop_rate, device=torch.device("cpu")):
    super().__init__()
    # multi-head attention: masked self-attention and encoder-decoder attention
    self.attnSelf = MultiHeadedAttention_Basic(n_heads, input_size, drop_rate).to(device)
    self.attnEnc = MultiHeadedAttention_Basic(n_heads, input_size, drop_rate).to(device)
    # layer normalization, one per sub-layer
    self.norm1 = LayerNormalization(input_size).to(device)
    self.norm2 = LayerNormalization(input_size).to(device)
    self.norm3 = LayerNormalization(input_size).to(device)
    # position-wise feed-forward layer
    self.pos_ff = PositionwiseFeedForward_Basic(input_size, input_size * 4, input_size, drop_rate).to(device)
    self.drop = torch.nn.Dropout(drop_rate).to(device)
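This variant has three sub-layers rather than two, which is the usual decoder-block layout: masked self-attention over the target sequence, attention over the encoder output, then the feed-forward layer. A minimal self-contained sketch of that wiring, again with stock torch.nn modules standing in for MultiHeadedAttention_Basic, LayerNormalization and PositionwiseFeedForward_Basic (the class name and call signatures are assumptions, not the original code):

class DecoderBlockSketch(torch.nn.Module):
    """Decoder block: masked self-attention, encoder-decoder attention, feed-forward."""
    def __init__(self, input_size, n_heads, drop_rate):
        super().__init__()
        self.attn_self = torch.nn.MultiheadAttention(input_size, n_heads,
                                                     dropout=drop_rate, batch_first=True)
        self.attn_enc = torch.nn.MultiheadAttention(input_size, n_heads,
                                                    dropout=drop_rate, batch_first=True)
        self.norm1 = torch.nn.LayerNorm(input_size)
        self.norm2 = torch.nn.LayerNorm(input_size)
        self.norm3 = torch.nn.LayerNorm(input_size)
        self.ff = torch.nn.Sequential(
            torch.nn.Linear(input_size, input_size * 4),
            torch.nn.GELU(),
            torch.nn.Linear(input_size * 4, input_size),
        )
        self.drop = torch.nn.Dropout(drop_rate)

    def forward(self, x, enc_out, tgt_mask=None):
        # 1) causally masked self-attention over the target sequence
        a, _ = self.attn_self(x, x, x, attn_mask=tgt_mask)
        x = self.norm1(x + self.drop(a))
        # 2) encoder-decoder attention: queries from the decoder, keys/values from the encoder
        a, _ = self.attn_enc(x, enc_out, enc_out)
        x = self.norm2(x + self.drop(a))
        # 3) position-wise feed-forward
        x = self.norm3(x + self.drop(self.ff(x)))
        return x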
def __init__(self, vocab_size, hidden_size, factor_size, device=torch.device("cpu")):
    super().__init__()
    # factorized word embeddings: embed into factor_size, then project up to hidden_size
    self.word_embeddings = torch.nn.Embedding(vocab_size, factor_size)
    self.word_trans = torch.nn.Linear(factor_size, hidden_size)
    # positional embeddings, also kept at factor_size and projected up
    self.position_embeddings = PositionalEmbedding(factor_size, device)
    self.position_trans = torch.nn.Linear(factor_size, hidden_size)
    self.norm = LayerNormalization(hidden_size)
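Keeping the embedding tables at factor_size and projecting up to hidden_size reduces the embedding parameter count from vocab_size * hidden_size to vocab_size * factor_size + factor_size * hidden_size. A minimal self-contained sketch of how such a layer might combine the two embeddings in its forward pass; here a learned position table and a max_len argument stand in for the PositionalEmbedding class used above, and the class name and forward signature are assumptions for illustration:

class FactorizedEmbeddingSketch(torch.nn.Module):
    """Word + positional embeddings at factor_size, projected up to hidden_size, then normalized."""
    def __init__(self, vocab_size, hidden_size, factor_size, max_len=512):
        super().__init__()
        self.word_embeddings = torch.nn.Embedding(vocab_size, factor_size)
        self.word_trans = torch.nn.Linear(factor_size, hidden_size)
        # learned position table stands in for the PositionalEmbedding class used above
        self.position_embeddings = torch.nn.Embedding(max_len, factor_size)
        self.position_trans = torch.nn.Linear(factor_size, hidden_size)
        self.norm = torch.nn.LayerNorm(hidden_size)

    def forward(self, token_ids):
        # token_ids: (batch, seq_len) of token indices
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        w = self.word_trans(self.word_embeddings(token_ids))          # (batch, seq_len, hidden)
        p = self.position_trans(self.position_embeddings(positions))  # (seq_len, hidden), broadcast over batch
        return self.norm(w + p)

# usage: vocabulary of 30k tokens, hidden size 256, factorized embedding size 64
emb = FactorizedEmbeddingSketch(vocab_size=30000, hidden_size=256, factor_size=64)
out = emb(torch.randint(0, 30000, (2, 10)))   # -> shape (2, 10, 256)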