def __init__(self, d_model, self_attention_layer, feed_forward_layer,
             dropout):
    """Assemble one encoder layer from its sub-modules.

    Args:
        d_model: width of the model's hidden representation.
        self_attention_layer: module performing self-attention.
        feed_forward_layer: position-wise feed-forward module.
        dropout: dropout rate handed to each SublayerConnection.
    """
    super().__init__()
    self.d_model = d_model
    self.self_attention_layer = self_attention_layer
    self.feed_forward_layer = feed_forward_layer
    # Two residual/norm wrappers: one around attention, one around
    # the feed-forward net (presumably applied in that order in
    # forward() — confirm against the forward pass).
    self.sublayer = clones(SublayerConnection(d_model, dropout), 2)
 def __init__(self, header_num, d_model, dropout=0.1):
     super(MultiHeadedAttention, self).__init__()
     assert d_model % header_num == 0
     self.dk = d_model // header_num
     self.header_num = header_num
     self.linear_layers = clones(nn.Linear(d_model, d_model), 4)
     self.atten = None
     self.dropout_layer = nn.Dropout(p=dropout)
 def __init__(self, layer, N):
     super(Encoder, self).__init__()
     self.layers = clones(layer, N)
     self.norm = LayerNorm(layer.d_model)