def __init__(self, params):
    super(PositionWiseFeedForward, self).__init__()
    # Mixes the self-attention outputs produced by each head of Multi-Head
    # Attention evenly, without favoring any single head.
    # The same linear transformation is applied at every position,
    # but each layer uses its own parameters.
    # Here this is expressed as two convolutions with kernel size 1.
    # nn.Conv1d expects input of shape (batch size, channels, length).
    self.conv1 = nn.Conv1d(params.hidden_dim, params.feed_forward_dim, kernel_size=1)
    self.conv2 = nn.Conv1d(params.feed_forward_dim, params.hidden_dim, kernel_size=1)
    init_weight(self.conv1)
    init_weight(self.conv2)
    self.dropout = nn.Dropout(params.dropout)
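For context, a minimal sketch of how the matching forward pass could look, given that nn.Conv1d operates over (batch, channels, length); the permute calls and the torch.relu activation are assumptions for illustration, not taken from the original code.

def forward(self, x):
    # x: (batch size, sequence length, hidden_dim)
    # Conv1d works on (batch size, channels, length), so swap the last two axes.
    x = x.permute(0, 2, 1)                      # (batch, hidden_dim, seq_len)
    x = self.dropout(torch.relu(self.conv1(x))) # (batch, feed_forward_dim, seq_len)
    x = self.conv2(x)                           # (batch, hidden_dim, seq_len)
    return x.permute(0, 2, 1)                   # back to (batch, seq_len, hidden_dim)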
def __init__(self, params):
    super(MultiHeadAttention, self).__init__()
    assert params.hidden_dim % params.n_head == 0
    # self.attentions : declare self-attention n_head times (one SelfAttention per head)
    self.attentions = nn.ModuleList(
        [SelfAttention(params) for _ in range(params.n_head)])
    # self.o_w : declare and initialize the output weight matrix
    self.o_w = nn.Linear(params.hidden_dim, params.hidden_dim, bias=False)
    init_weight(self.o_w)
    # self.dropout : declare dropout
    self.dropout = nn.Dropout(params.dropout)
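A possible forward pass for this module, sketched under the assumption that each SelfAttention head takes (query, key, value, mask) and returns a tensor of shape (batch, seq_len, hidden_dim // n_head); the signature and the concatenation step are assumptions, not part of the original code.

def forward(self, query, key, value, mask=None):
    # Run every head, then concatenate the per-head outputs along the feature axis.
    weighted = [attention(query, key, value, mask) for attention in self.attentions]
    weighted = torch.cat(weighted, dim=-1)       # (batch, seq_len, hidden_dim)
    # Final linear projection followed by dropout.
    return self.dropout(self.o_w(weighted))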
def __init__(self, params):
    super(SelfAttention, self).__init__()
    self.hidden_dim = params.hidden_dim
    self.attention_dim = params.hidden_dim // params.n_head
    # Query, key and value projections, each mapping hidden_dim -> attention_dim
    self.q_w = nn.Linear(self.hidden_dim, self.attention_dim, bias=False)
    self.k_w = nn.Linear(self.hidden_dim, self.attention_dim, bias=False)
    self.v_w = nn.Linear(self.hidden_dim, self.attention_dim, bias=False)
    init_weight(self.q_w)
    init_weight(self.k_w)
    init_weight(self.v_w)
    self.dropout = nn.Dropout(params.dropout)
    # Scaling factor sqrt(d_k) used in scaled dot-product attention
    self.scale_factor = torch.sqrt(torch.FloatTensor([self.attention_dim])).to(params.device)
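Since the scale factor is declared here, a brief sketch of the scaled dot-product attention it is used for, softmax(QK^T / sqrt(d_k))V; the forward signature and the masking fill value are assumed for illustration, not taken from the original code.

def forward(self, query, key, value, mask=None):
    # Project inputs to query / key / value of size attention_dim.
    q, k, v = self.q_w(query), self.k_w(key), self.v_w(value)
    # Scaled dot-product scores: Q K^T / sqrt(d_k)
    scores = torch.bmm(q, k.transpose(1, 2)) / self.scale_factor
    if mask is not None:
        # Block attention to masked positions before the softmax.
        scores = scores.masked_fill(mask == 0, -1e10)
    weights = self.dropout(torch.softmax(scores, dim=-1))
    return torch.bmm(weights, v)                 # (batch, seq_len, attention_dim)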