from torch import nn

# MultiHeadAttention and TransformerFeedForward are defined elsewhere in this module.

# Decoder-side block: self-attention, cross-attention over the encoder output,
# and a position-wise feed-forward sublayer, each with its own LayerNorm.
def __init__(self, input_size, n_heads, filter_size, hidden_size, dropout=None) -> None:
    super().__init__()
    self.self_norm = nn.LayerNorm(input_size)
    self.self_attention = MultiHeadAttention(n_heads, [input_size, input_size])
    self.cross_attention = MultiHeadAttention(n_heads, [input_size, input_size])
    self.cross_norm_source = nn.LayerNorm(input_size)
    self.cross_norm_target = nn.LayerNorm(input_size)
    self.feed_forward = TransformerFeedForward(input_size, filter_size, hidden_size, dropout)
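# A minimal sketch of the matching forward pass, assuming pre-norm residual
# sublayers, that MultiHeadAttention is called as attention(queries, keys), and
# that TransformerFeedForward handles its own norm/residual internally. None of
# these call signatures are shown above, so treat this as illustrative only.
def forward(self, target, source):
    # Self-attention over the normalized target, with a residual connection.
    norm_target = self.self_norm(target)
    target = target + self.self_attention(norm_target, norm_target)
    # Cross-attention: the normalized target queries the normalized encoder output.
    target = target + self.cross_attention(
        self.cross_norm_target(target), self.cross_norm_source(source)
    )
    # Position-wise feed-forward sublayer.
    return self.feed_forward(target)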
# Encoder-side block: self-attention plus a position-wise feed-forward sublayer.
# Sizes and module names mirror the decoder block above (nn.LayerNorm and the
# MultiHeadAttention size list both require input_size).
def __init__(self, input_size, n_heads, filter_size, hidden_size, dropout=None) -> None:
    super().__init__()
    self.norm = nn.LayerNorm(input_size)
    self.self_attention = MultiHeadAttention(n_heads, [input_size, input_size])
    self.feed_forward = TransformerFeedForward(input_size, filter_size, hidden_size, dropout)
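# A minimal sketch of the matching forward pass, under the same assumptions as
# the decoder sketch above (pre-norm residual sublayers, attention called as
# attention(queries, keys)); the real signatures are not shown here.
def forward(self, inputs):
    # Self-attention over the normalized inputs, with a residual connection.
    norm_inputs = self.norm(inputs)
    inputs = inputs + self.self_attention(norm_inputs, norm_inputs)
    # Position-wise feed-forward sublayer.
    return self.feed_forward(inputs)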