예제 #1
0
 def __init__(self, num_classes: int = 1000) -> None:
     """Assemble the AlexNet-style trunk, pooling stage, and classifier head.

     Args:
         num_classes: number of output logits (default 1000).
     """
     super(QuantizationAlexNet, self).__init__()
     # Five-conv AlexNet trunk; ReLUs run in place to save memory.
     conv_stack = [
         nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
         nn.Conv2d(64, 192, kernel_size=5, padding=2),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
         nn.Conv2d(192, 384, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.Conv2d(384, 256, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.Conv2d(256, 256, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
     ]
     self.features = nn.Sequential(*conv_stack)
     # Adaptive pooling yields a fixed 6x6 map regardless of input resolution.
     self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
     head = [
         nn.Dropout(),
         nn.Linear(256 * 6 * 6, 4096),
         nn.ReLU(inplace=True),
         nn.Dropout(),
         nn.Linear(4096, 4096),
         nn.ReLU(inplace=True),
         nn.Linear(4096, num_classes),
     ]
     self.classifier = nn.Sequential(*head)
예제 #2
0
    def __init__(
        self,
        n_heads,
        d_model,
        dropout_rate=0.0,
        skip_term_b=False,
        share_qvk_proj=False,
    ):
        """Multi-headed self-attention with relative positional biases.

        Args:
            n_heads: number of attention heads.
            d_model: model width; assumed divisible by ``n_heads``.
            dropout_rate: attention dropout probability.
            skip_term_b: presumably skips term (b) of the Transformer-XL
                score decomposition — confirm against the forward pass.
            share_qvk_proj: if True, a single projection serves Q, K and V.
        """
        super(MultiHeadedSelfAttentionWithRelPos, self).__init__(
            n_heads, d_model, dropout_rate, share_qvk_proj
        )

        self.d_model = d_model
        self.share_qvk_proj = share_qvk_proj
        self.skip_term_b = skip_term_b
        self.nheads = n_heads
        self.d_k = d_model // n_heads  # per-head width

        # One matrix emits Q|K|V concatenated unless the projection is shared.
        proj_width = d_model if self.share_qvk_proj else d_model * 3
        self.qvk_proj = nn.Linear(d_model, proj_width)

        self.pos_proj = nn.Linear(d_model, d_model, bias=False)

        # Learned global content/position bias vectors (u, v in Transformer-XL).
        # NOTE(review): flow.Tensor allocates uninitialized storage — presumably
        # initialized elsewhere; confirm before relying on these values.
        self.posu = nn.Parameter(flow.Tensor(1, 1, n_heads, self.d_k))
        self.posv = nn.Parameter(flow.Tensor(1, 1, n_heads, self.d_k))
예제 #3
0
파일: lenet.py 프로젝트: BBuf/oneflow-cifar
 def __init__(self):
     """Build LeNet layers: two 5x5 convolutions feeding three linear layers."""
     super(LeNet, self).__init__()
     # Two 5x5 convolution stages (3->6->16 channels).
     self.conv1 = nn.Conv2d(3, 6, 5)
     self.conv2 = nn.Conv2d(6, 16, 5)
     # Classifier: flattened 16x5x5 map -> 120 -> 84 -> 10 classes.
     self.fc1 = nn.Linear(16 * 5 * 5, 120)
     self.fc2 = nn.Linear(120, 84)
     self.fc3 = nn.Linear(84, 10)
예제 #4
0
 def __init__(self, num_classes=10):
     """Construct a compact AlexNet variant for small images.

     Args:
         num_classes: number of output classes (default 10).
     """
     super(AlexNet, self).__init__()
     # One layer per line for readability; same modules in the same order.
     self.features = nn.Sequential(
         nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=2),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=2),
         nn.Conv2d(64, 192, kernel_size=3, padding=2),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=2),
         nn.Conv2d(192, 384, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.Conv2d(384, 256, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.Conv2d(256, 256, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
     )
     # NOTE(review): first linear expects a 4096-dim flattened feature vector;
     # verify against the flatten in forward().
     self.fc_layers = nn.Sequential(
         nn.Dropout(0.6),
         nn.Linear(4096, 2048),
         nn.ReLU(inplace=True),
         nn.Dropout(0.6),
         nn.Linear(2048, 2048),
         nn.ReLU(inplace=True),
         nn.Linear(2048, num_classes),
     )
예제 #5
0
 def __init__(
     self,
     in_channels=1,
     out_channels=32,
     input_dim=312,
     hidden_dim=32,
     output_dim=10,
 ):
     """1-D CNN classifier (ser — presumably speech emotion recognition).

     Args:
         in_channels: input channel count.
         out_channels: channels produced by each conv stage.
         input_dim: temporal length of the input (preserved by padding=2, k=5).
         hidden_dim: width of the hidden linear layer.
         output_dim: number of output classes.
     """
     super(cnn1d_ser, self).__init__()
     # Two conv->BN->ReLU->dropout stages, then flatten into a small MLP head.
     stages = [
         nn.Conv1d(in_channels, out_channels, 5, stride=1, padding=2),
         nn.BatchNorm1d(out_channels),
         nn.ReLU(),
         nn.Dropout(0.5),
         nn.Conv1d(out_channels, out_channels, 5, stride=1, padding=2),
         nn.BatchNorm1d(out_channels),
         nn.ReLU(),
         nn.Dropout(0.5),
         nn.Flatten(),
         nn.Linear(input_dim * out_channels, hidden_dim),
         nn.BatchNorm1d(hidden_dim),
         nn.ReLU(),
         nn.Dropout(0.5),
         nn.Linear(hidden_dim, output_dim),
     ]
     self.classifier = nn.Sequential(*stages)
예제 #6
0
파일: bert.py 프로젝트: Oneflow-Inc/models
    def __init__(
        self,
        max_position_embeddings,
        hidden_size,
        nheads,
        dropout=0,
        position_embedding_type="absolute",
        is_decoder=False,
    ):
        """BERT self-attention layer.

        Args:
            max_position_embeddings: max sequence length; used only when the
                position embedding type is relative.
            hidden_size: model width; must be divisible by ``nheads``.
            nheads: number of attention heads.
            dropout: dropout probability applied to attention weights.
            position_embedding_type: "absolute", "relative_key" or
                "relative_key_query".
            is_decoder: whether this layer sits inside a decoder stack.

        Raises:
            ValueError: if ``hidden_size`` is not a multiple of ``nheads``.
        """
        super(BertSelfAttention, self).__init__()
        if hidden_size % nheads != 0:
            raise ValueError(
                f"The hidden size ({hidden_size}) is not a multiple of the number of attention "
                f"heads ({nheads})")

        self.num_attention_heads = nheads
        # Fix: use floor division instead of int(hidden_size / nheads). Float
        # division can lose precision for very large integers; // is exact and
        # divisibility was already validated above.
        self.attention_head_size = hidden_size // nheads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(dropout)
        self.position_embedding_type = position_embedding_type
        if (self.position_embedding_type == "relative_key"
                or self.position_embedding_type == "relative_key_query"):
            self.max_position_embeddings = max_position_embeddings
            # One embedding per clipped relative distance in [-(L-1), L-1].
            self.distance_embedding = nn.Embedding(
                2 * max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = is_decoder
예제 #7
0
    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        """Multi-head attention: Q/K/V projections, scaled-dot core, output FC.

        Args:
            n_head: number of attention heads.
            d_model: input/output model width.
            d_k: per-head key/query width.
            d_v: per-head value width.
            dropout: dropout probability for attention and output.
        """
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        # Same N(0, 2/(d_model + d)) init as before, folded into one loop.
        for proj, fan in ((self.w_qs, d_k), (self.w_ks, d_k), (self.w_vs, d_v)):
            nn.init.normal_(proj.weight, mean=0, std=np.sqrt(2.0 / (d_model + fan)))

        # Scores are scaled by sqrt(d_k) inside the attention core.
        self.attention = ScaledDotProductAttention(
            temperature=np.power(d_k, 0.5), attn_dropout=dropout
        )
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)
예제 #8
0
 def __init__(self):
     """Classic CIFAR-style CNN: two conv+pool stages, then three FC layers."""
     super().__init__()
     self.conv1 = nn.Conv2d(3, 6, 5)
     self.pool = nn.MaxPool2d(2, 2)  # shared 2x2 max-pool, applied after each conv
     self.conv2 = nn.Conv2d(6, 16, 5)
     # 16 channels x 5x5 spatial map, flattened into the first FC layer.
     self.fc1 = nn.Linear(16 * 5 * 5, 120)
     self.fc2 = nn.Linear(120, 84)
     self.fc3 = nn.Linear(84, 10)
예제 #9
0
    def __init__(self, input_size, hidden_size, output_size):
        """Elman-style RNN cell: concatenated input+hidden drives both heads.

        Args:
            input_size: width of the input vector.
            hidden_size: width of the recurrent hidden state.
            output_size: number of output classes.
        """
        super().__init__()

        self.hidden_size = hidden_size

        # Input and hidden state are concatenated before each projection.
        combined = input_size + hidden_size
        self.i2h = nn.Linear(combined, hidden_size)
        self.i2o = nn.Linear(combined, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
예제 #10
0
 def __init__(
     self,
     input_size=784,
     hidden_size1=128,
     hidden_size2=64,
     num_classes=10,
 ):
     """Three-layer MLP (defaults fit flattened 28x28 MNIST: 784->128->64->10).

     Args:
         input_size: flattened input width.
         hidden_size1: width of the first hidden layer.
         hidden_size2: width of the second hidden layer.
         num_classes: number of output classes.
     """
     super(Net, self).__init__()
     self.l1 = nn.Linear(input_size, hidden_size1)
     self.relu1 = nn.ReLU()
     self.l2 = nn.Linear(hidden_size1, hidden_size2)
     self.relu2 = nn.ReLU()
     self.l3 = nn.Linear(hidden_size2, num_classes)
예제 #11
0
 def __init__(self, input_dim, hidden_dim, output_dim, batch_size):
     """LSTM-based classifier head.

     Args:
         input_dim: input feature width fed to the LSTM.
         hidden_dim: LSTM hidden width.
         output_dim: number of output classes.
         batch_size: batch size forwarded to the project LSTM wrapper.
     """
     super(lstm_ser, self).__init__()
     # LSTM encoder followed by a small dropout-regularized MLP head.
     head = [
         LSTM(input_dim, hidden_dim, batch_size),
         nn.Dropout(0.5),
         nn.Linear(hidden_dim, 32),
         nn.ReLU(),
         nn.Linear(32, output_dim),
     ]
     self.classifier = nn.Sequential(*head)
예제 #12
0
파일: ffn.py 프로젝트: Oneflow-Inc/models
    def __init__(self, d_model, d_ff, dropout, activation="relu"):
        """Position-wise feed-forward block.

        Args:
            d_model: input/output width.
            d_ff: inner width (doubled for GLU, which halves it again).
            dropout: dropout probability.
            activation: one of "relu", "gelu", "glu", "tanh", "swish".
        """
        super(PositionwiseFeedForward, self).__init__()
        self.activation = activation

        assert activation in ["relu", "gelu", "glu", "tanh", "swish"]

        # GLU consumes channel pairs, so the first projection is doubled.
        inner = d_ff * 2 if activation == "glu" else d_ff
        self.w_1 = nn.Linear(d_model, inner)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
예제 #13
0
 def __init__(
     self,
     hidden_size: int,
     intermediate_size: int,
     hidden_dropout_prob: float = 0.1,
     hidden_act: str = "relu",
 ) -> None:
     """Transformer FFN sub-block: expand, activate, project back, dropout.

     Args:
         hidden_size: input/output width.
         intermediate_size: expanded inner width.
         hidden_dropout_prob: dropout probability after the output projection.
         hidden_act: activation name; resolved elsewhere (stored as-is here).
     """
     super().__init__()
     self.hidden_act = hidden_act
     self.intermediate = nn.Linear(hidden_size, intermediate_size)
     self.output = nn.Linear(intermediate_size, hidden_size)
     self.dropout = nn.Dropout(hidden_dropout_prob)
예제 #14
0
파일: vit.py 프로젝트: Oneflow-Inc/models
    def __init__(self, in_dim, mlp_dim, out_dim, dropout_rate=0.1):
        """ViT MLP block: fc1 -> GELU -> (dropout) -> fc2 -> (dropout).

        Args:
            in_dim: input width.
            mlp_dim: hidden width.
            out_dim: output width.
            dropout_rate: dropout probability; 0 disables both dropout layers.
        """
        super(MlpBlock, self).__init__()

        self.fc1 = nn.Linear(in_dim, mlp_dim)
        self.fc2 = nn.Linear(mlp_dim, out_dim)
        self.act = nn.GELU()
        # Dropouts are optional; forward() presumably checks them for None.
        enabled = dropout_rate > 0.0
        self.dropout1 = nn.Dropout(dropout_rate) if enabled else None
        self.dropout2 = nn.Dropout(dropout_rate) if enabled else None
예제 #15
0
파일: vit.py 프로젝트: Oneflow-Inc/models
    def __init__(self, in_dim, heads=8, dropout_rate=0.1):
        """Multi-head self-attention block for ViT.

        Args:
            in_dim: token embedding width.
            heads: number of attention heads.
            dropout_rate: attention dropout probability; 0 disables the layer.
        """
        super(SelfAttention, self).__init__()
        self.heads = heads
        self.head_dim = in_dim // heads
        # Stored as sqrt(head_dim); forward() presumably divides scores by it.
        self.scale = self.head_dim ** 0.5

        inner_dim = self.heads * self.head_dim
        self.query = nn.Linear(in_dim, inner_dim)
        self.key = nn.Linear(in_dim, inner_dim)
        self.value = nn.Linear(in_dim, inner_dim)
        self.out = nn.Linear(inner_dim, in_dim)

        self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else None
예제 #16
0
 def __init__(
     self,
     in_features,
     hidden_features=None,
     out_features=None,
     act_layer=nn.GELU,
     drop=0.0,
 ):
     """Generic two-layer MLP used in transformer blocks.

     Args:
         in_features: input width.
         hidden_features: hidden width; falls back to ``in_features`` when
             None (or any falsy value, matching the original ``or`` logic).
         out_features: output width; same fallback rule.
         act_layer: activation class instantiated between the two layers.
         drop: dropout probability.
     """
     super().__init__()
     # `if not x` preserves the original `x or default` falsy semantics.
     if not out_features:
         out_features = in_features
     if not hidden_features:
         hidden_features = in_features
     self.fc1 = nn.Linear(in_features, hidden_features)
     self.act = act_layer()
     self.fc2 = nn.Linear(hidden_features, out_features)
     self.drop = nn.Dropout(drop)
예제 #17
0
파일: model.py 프로젝트: Oneflow-Inc/models
 def __init__(self, num_speakers=2) -> None:
     """Small 1-D CNN speaker classifier.

     Args:
         num_speakers: number of output classes (default 2).
     """
     super(simple_CNN, self).__init__()
     # Three strided Conv1d/BN/ReLU stages aggressively downsample the signal.
     self.convs = nn.Sequential(
         nn.Conv1d(1, 16, 100, stride=10),
         nn.BatchNorm1d(16),
         nn.ReLU(),
         nn.Conv1d(16, 64, 21, stride=10),
         nn.BatchNorm1d(64),
         nn.ReLU(),
         nn.Conv1d(64, 64, 5, stride=5),
         nn.BatchNorm1d(64),
         nn.ReLU(),
     )
     # NOTE(review): 1 * 6 * 64 implies a fixed input length; confirm upstream.
     self.linears = nn.Sequential(
         nn.Linear(1 * 6 * 64, 128),
         nn.Linear(128, num_speakers),
     )
예제 #18
0
    def __init__(
        self,
        spatial_feature_size=7,
        dropout_ratio=0.8,
        num_classes=101,
        with_avg_pool=False,
        temporal_feature_size=1,
        in_channels=2048,
        init_std=0.01,
        fcn_testing=False,
    ):
        """Classification head for video recognition features.

        Args:
            spatial_feature_size: spatial size of the incoming feature map.
            dropout_ratio: dropout probability; 0 disables dropout entirely.
            num_classes: number of target classes.
            with_avg_pool: if True, average-pool features before the FC layer.
            temporal_feature_size: temporal extent covered by the avg-pool.
            in_channels: channel count of the incoming features.
            init_std: std-dev stored for weight initialization done elsewhere.
            fcn_testing: enables fully-convolutional testing mode.
        """
        super(ClsHead, self).__init__()

        self.with_avg_pool = with_avg_pool
        # Fix: dropout_ratio was assigned twice in the original; keep one.
        self.dropout_ratio = dropout_ratio
        self.in_channels = in_channels
        self.temporal_feature_size = temporal_feature_size
        self.spatial_feature_size = spatial_feature_size
        self.init_std = init_std
        self.fcn_testing = fcn_testing
        self.num_classes = num_classes

        if self.dropout_ratio != 0:
            self.dropout = nn.Dropout(p=self.dropout_ratio)
        else:
            self.dropout = None
        # NOTE(review): avg_pool exists only when with_avg_pool is True, so
        # forward() must guard on self.with_avg_pool before using it.
        if self.with_avg_pool:
            self.avg_pool = nn.AvgPool3d(
                (temporal_feature_size, spatial_feature_size,
                 spatial_feature_size))

        self.fc_cls = nn.Linear(in_channels, num_classes)
        self.new_cls = None  # presumably replaced lazily elsewhere — confirm
예제 #19
0
파일: model.py 프로젝트: Oneflow-Inc/models
 def __init__(
     self,
     input_sz,
     output_sz,
     d_model,
     nhead,
     num_encoder_layers,
     num_decoder_layers,
     dim_feedforward,
     dropout,
 ):
     """Seq2seq Transformer with learned embeddings and positional encodings.

     Args:
         input_sz: source vocabulary size.
         output_sz: target vocabulary size.
         d_model: model width.
         nhead: attention heads per layer.
         num_encoder_layers: encoder stack depth.
         num_decoder_layers: decoder stack depth.
         dim_feedforward: FFN inner width.
         dropout: dropout probability shared by all sub-modules.
     """
     super(TransformerModel, self).__init__()
     # batch_first=False: tensors are laid out (seq_len, batch, d_model).
     self.transformer = Transformer(
         d_model=d_model,
         nhead=nhead,
         num_encoder_layers=num_encoder_layers,
         num_decoder_layers=num_decoder_layers,
         dim_feedforward=dim_feedforward,
         dropout=dropout,
         batch_first=False,
     )
     self.softmax = nn.Softmax(dim=2)
     self.linear = nn.Linear(d_model, output_sz)  # project to target vocab
     self.pos_encoder = PositionalEncoding(d_model, dropout)
     self.pos_decoder = PositionalEncoding(d_model, dropout)
     self.src_embedding = Embeddings(input_sz, d_model)
     self.tgt_embedding = Embeddings(output_sz, d_model)
예제 #20
0
    def __init__(
        self, n_heads, d_model, memory_dim, dropout_rate=0.0, share_vk_proj=False
    ):
        """Multi-headed cross-attention over encoder memory.

        Args:
            n_heads: number of attention heads.
            d_model: decoder model width; assumed divisible by ``n_heads``.
            memory_dim: width of the encoder memory being attended to.
            dropout_rate: attention dropout probability.
            share_vk_proj: if True, values and keys share one projection.
        """
        super(MultiHeadedCrossAttention, self).__init__(
            d_model, d_model, enable_output_proj=True, dropout=dropout_rate
        )

        self.d_model = d_model
        self.share_vk_proj = share_vk_proj
        self.nheads = n_heads
        self.d_k = d_model // n_heads  # per-head width

        self.q_proj = nn.Linear(d_model, d_model)
        # One matrix yields V|K concatenated unless the projection is shared.
        vk_width = d_model if self.share_vk_proj else d_model * 2
        self.vk_proj = nn.Linear(memory_dim, vk_width)
예제 #21
0
    def __init__(
        self,
        vocab_size,
        seq_length,
        hidden_size,
        hidden_layers,
        atten_heads,
        intermediate_size,
        hidden_act,
        hidden_dropout_prob,
        attention_probs_dropout_prob,
        max_position_embeddings,
        type_vocab_size,
        initializer_range=0.02,
    ):
        """BERT encoder plus a 2-way token classification head (SQuAD-style).

        All BERT hyper-parameters are forwarded to BertModel unchanged;
        ``initializer_range`` scales the head's normal weight init.
        """
        super().__init__()
        self.bert = BertModel(
            vocab_size,
            seq_length,
            hidden_size,
            hidden_layers,
            atten_heads,
            intermediate_size,
            hidden_act,
            hidden_dropout_prob,
            attention_probs_dropout_prob,
            max_position_embeddings,
            type_vocab_size,
        )
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        # Two logits per token — presumably answer start/end scores.
        self.cls_squad = nn.Linear(hidden_size, 2)

        # Match BERT's init scale on the head weights and zero the bias.
        self.cls_squad.weight.data.normal_(mean=0.0, std=initializer_range)
        self.cls_squad.bias.data.fill_(0)
예제 #22
0
    def __init__(self, cfg):
        """Dual Path Network configured from a stage-description dict.

        Args:
            cfg: dict with keys 'in_planes', 'out_planes', 'num_blocks' and
                'dense_depth', each a 4-element sequence (one entry per stage).
        """
        super(DPN, self).__init__()
        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']

        # 3x3 stem keeps input resolution (stride 1, no bias before BN).
        self.conv1 = nn.Conv2d(
            3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.last_planes = 64
        # Four stages; only the first keeps stride 1.
        self.layer1 = self._make_layer(
            in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
        self.layer2 = self._make_layer(
            in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
        self.layer3 = self._make_layer(
            in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)
        self.layer4 = self._make_layer(
            in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
        # The dense path accumulates (num_blocks+1)*dense_depth extra channels.
        self.linear = nn.Linear(
            out_planes[3] + (num_blocks[3] + 1) * dense_depth[3], 10)
예제 #23
0
    def __init__(self, num_classes: int = 5) -> None:
        """Inception-ResNet-style pose classification network.

        Builds a conv stem, three inception stages (Mixed_5b/6a/7a plus
        block35/17/8 groups) with side-branch convolutions (conv_ls*), and a
        final dense classifier over 1536 pooled features.

        Args:
            num_classes: number of output classes (default 5).
        """
        super(PoseNet, self).__init__()

        # Stem: progressively reduce resolution and grow channels 3 -> 192.
        self.conv2d_1a_3x3 = BasicConv2d(3, 32, kernel_size=3, stride=2)
        self.conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3)
        self.conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1)
        self.MaxPool_3a_3x3 = nn.MaxPool2d(3, stride=2)
        self.conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1)
        self.conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3)
        self.MaxPool_5a_3x3 = nn.MaxPool2d(kernel_size=3, stride=2)  # stem

        # Stage 1: 192 -> 320 channels.
        self.Mixed_5b = self._generate_inception_module(192, 320, 1, Mixed_5b)
        self.block35 = self._generate_inception_module(320, 320, 1, block35)

        # Side branch 1: downsampled copy of the 320-channel stage output.
        self.conv_ls1 = BasicConv2d(320, 320, kernel_size=3, stride=2, padding=1)
        self.MaxPool_3x3_ls1 = nn.MaxPool2d(kernel_size=3, stride=2)

        # Stage 2: 320 -> 1088 channels.
        self.Mixed_6a = self._generate_inception_module(320, 1088, 1, Mixed_6a)
        self.block17 = self._generate_inception_module(1088, 1088, 1, block17)

        # Side branch 2: downsampled copy of the 1088-channel stage output.
        self.conv_ls2 = BasicConv2d(1088, 1088, kernel_size=3, stride=2)

        # Stage 3: 1088 -> 2080 channels.
        self.Mixed_7a = self._generate_inception_module(1088, 2080, 1, Mixed_7a)
        self.block8 = self._generate_inception_module(2080, 2080, 1, block8)

        # 3488 = 320 + 1088 + 2080 — presumably the three branches are
        # concatenated in forward(); confirm against the forward pass.
        self.conv_ls3 = BasicConv2d(3488, 2080, kernel_size=1)
        self.Conv2d_7b_1x1 = BasicConv2d(2080, 1536, kernel_size=1)
        self.AvgPool_1a_8x8 = nn.AvgPool2d(kernel_size=[8, 8])

        self.dense = nn.Linear(1536, num_classes)
        self.relu = nn.ReLU(inplace=True)
예제 #24
0
 def __init__(
     self,
     wide_vocab_size: int,
     deep_vocab_size: int,
     deep_embedding_vec_size: int = 16,
     num_deep_sparse_fields: int = 26,
     num_dense_fields: int = 13,
     hidden_size: int = 1024,
     hidden_units_num: int = 7,
     deep_dropout_rate: float = 0.5,
 ):
     """Wide & Deep recommender: scalar wide embeddings plus a deep MLP tower.

     Args:
         wide_vocab_size: vocabulary of the wide (memorization) side.
         deep_vocab_size: vocabulary of the deep (generalization) side.
         deep_embedding_vec_size: embedding width per deep sparse field.
         num_deep_sparse_fields: number of sparse fields on the deep side.
         num_dense_fields: number of raw dense features.
         hidden_size: width of every hidden Dense layer.
         hidden_units_num: depth of the deep MLP tower.
         deep_dropout_rate: dropout probability inside each Dense layer.
     """
     super(LocalWideAndDeep, self).__init__()
     # Wide side: one scalar weight per sparse feature id.
     self.wide_embedding = Embedding(wide_vocab_size, 1)
     self.deep_embedding = Embedding(deep_vocab_size, deep_embedding_vec_size)
     deep_feature_size = (
         deep_embedding_vec_size * num_deep_sparse_fields + num_dense_fields
     )
     # Tower: fc0 takes the concatenated features, later layers are square.
     tower = OrderedDict(
         (
             f"fc{i}",
             Dense(
                 deep_feature_size if i == 0 else hidden_size,
                 hidden_size,
                 deep_dropout_rate,
             ),
         )
         for i in range(hidden_units_num)
     )
     self.linear_layers = nn.Sequential(tower)
     self.deep_scores = nn.Linear(hidden_size, 1)
     self.sigmoid = nn.Sigmoid()
예제 #25
0
 def __init__(
     self,
     word_emb_dim,
     vocab_size,
     dim_channel,
     kernel_wins,
     dropout_rate,
     num_class,
     max_seq_len,
     training=True,
 ):
     """TextCNN: parallel window-sized convolutions over word embeddings.

     Args:
         word_emb_dim: embedding width per token.
         vocab_size: vocabulary size for the embedding table.
         dim_channel: output channels per convolution.
         kernel_wins: sequence of window sizes (one conv + pool per size).
         dropout_rate: dropout applied before the final FC layer.
         num_class: number of output classes.
         max_seq_len: fixed input length, used to size each max-pool.
         training: stored on the module (see NOTE below).
     """
     super(textCNN, self).__init__()
     self.embed = nn.Embedding(vocab_size, word_emb_dim)
     # One conv per window size, each spanning the full embedding width.
     self.convs = nn.ModuleList(
         nn.Conv2d(1, dim_channel, (w, word_emb_dim)) for w in kernel_wins
     )
     # Matching pools collapse the remaining temporal dimension to 1.
     self.maxpool = nn.ModuleList(
         nn.MaxPool2d((max_seq_len - w + 1, 1), stride=1) for w in kernel_wins
     )
     # Dropout layer
     self.dropout = nn.Dropout(dropout_rate)
     # NOTE(review): shadows nn.Module.training; train()/eval() will overwrite it.
     self.training = training
     # FC layer
     self.fc = nn.Linear(len(kernel_wins) * dim_channel, num_class)
예제 #26
0
 def __init__(
     self,
     c_in,
     c_cond,
     c_h,
     c_out,
     kernel_size,
     n_conv_blocks,
     upsample,
     act,
     sn,
     dropout_rate,
 ):
     """Conditional 1-D convolutional decoder.

     Args:
         c_in: input channel count.
         c_cond: conditioning-vector width feeding the affine layers.
         c_h: hidden channel count.
         c_out: output channel count.
         kernel_size: kernel size of the per-block conv layers.
         n_conv_blocks: number of conv blocks (two convs + two affines each).
         upsample: per-block channel multipliers (length n_conv_blocks).
         act: activation name, resolved through get_act().
         sn: NOTE(review) — accepted but never used; ``f`` below is an
             identity wrapper, presumably a stripped spectral-norm hook.
             Confirm whether spectral norm should be applied when sn is True.
         dropout_rate: dropout probability.
     """
     super(Decoder, self).__init__()
     self.n_conv_blocks = n_conv_blocks
     self.upsample = upsample
     self.act = get_act(act)
     # Identity wrapper; `sn` is ignored (see docstring note).
     f = lambda x: x
     self.in_conv_layer = f(nn.Conv1d(c_in, c_h, kernel_size=1))
     self.first_conv_layers = nn.ModuleList([
         f(nn.Conv1d(c_h, c_h, kernel_size=kernel_size))
         for _ in range(n_conv_blocks)
     ])
     # Second conv of each block widens channels by the block's upsample factor.
     self.second_conv_layers = nn.ModuleList([
         f(nn.Conv1d(c_h, c_h * up, kernel_size=kernel_size))
         for _, up in zip(range(n_conv_blocks), self.upsample)
     ])
     self.norm_layer = nn.InstanceNorm1d(c_h, affine=False)
     # Two affine (scale/shift) projections per block, driven by the condition.
     self.conv_affine_layers = nn.ModuleList(
         [f(nn.Linear(c_cond, c_h * 2)) for _ in range(n_conv_blocks * 2)])
     self.out_conv_layer = f(nn.Conv1d(c_h, c_out, kernel_size=1))
     self.dropout_layer = nn.Dropout(p=dropout_rate)
예제 #27
0
파일: bert.py 프로젝트: Oneflow-Inc/models
 def __init__(self, hidden_size, intermediate_size, activation):
     """BERT intermediate (expansion) layer: dense projection plus activation.

     Args:
         hidden_size: input width.
         intermediate_size: expanded output width.
         activation: a callable, or a string key into the ACT2FN registry.
     """
     super(BertIntermediate, self).__init__()
     self.dense = nn.Linear(hidden_size, intermediate_size)
     # Accept either a registered activation name or a ready-made callable.
     self.intermediate_act_fn = (
         ACT2FN[activation] if isinstance(activation, str) else activation
     )
예제 #28
0
파일: lm.py 프로젝트: Oneflow-Inc/models
    def __init__(self, params):
        """LSTM language model configured from a ``params`` dict.

        Expected keys: vocab_size, share_embedding, smoothing, num_layers,
        hidden_size, dropout.
        """
        super(RecurrentLanguageModel, self).__init__(params)

        self.model_type = "recurrent_lm"
        self.vocab_size = params["vocab_size"]
        self.share_embedding = params["share_embedding"]
        self.smoothing = params["smoothing"]
        self.num_layers = params["num_layers"]
        self.hidden_size = params["hidden_size"]

        hidden = params["hidden_size"]
        self.embedding = nn.Embedding(params["vocab_size"], hidden)
        # Unidirectional, batch-first LSTM stack with inter-layer dropout.
        self.rnn = nn.LSTM(
            input_size=hidden,
            hidden_size=hidden,
            num_layers=params["num_layers"],
            batch_first=True,
            dropout=params["dropout"],
            bidirectional=False,
        )

        self.output_project = nn.Linear(hidden, params["vocab_size"])

        if self.share_embedding:
            # Weight tying requires identical shapes on both matrices.
            assert (
                self.embedding.weight.size() == self.output_project.weight.size()
            )
            self.output_project.weight = self.embedding.weight

        self.crit = LabelSmoothingLoss(
            size=self.vocab_size, smoothing=self.smoothing, padding_idx=PAD
        )
예제 #29
0
파일: dla.py 프로젝트: BBuf/oneflow-cifar
    def __init__(self, block=BasicBlock, num_classes=10):
        """Deep Layer Aggregation network for small (CIFAR-sized) inputs.

        Args:
            block: residual block class used inside each aggregation Tree.
            num_classes: number of output classes.
        """
        super(DLA, self).__init__()

        def conv_bn_relu(c_in, c_out):
            # 3x3 stride-1 conv -> BN -> ReLU(inplace); resolution preserved.
            return nn.Sequential(
                nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1,
                          bias=False),
                nn.BatchNorm2d(c_out),
                nn.ReLU(True),
            )

        self.base = conv_bn_relu(3, 16)
        self.layer1 = conv_bn_relu(16, 16)
        self.layer2 = conv_bn_relu(16, 32)

        # Hierarchical aggregation trees with growing width and stride.
        self.layer3 = Tree(block, 32, 64, level=1, stride=1)
        self.layer4 = Tree(block, 64, 128, level=2, stride=2)
        self.layer5 = Tree(block, 128, 256, level=2, stride=2)
        self.layer6 = Tree(block, 256, 512, level=1, stride=2)
        self.linear = nn.Linear(512, num_classes)
예제 #30
0
    def __init__(
        self,
        vocab_size,
        d_model=256,
        n_heads=4,
        d_ff=2048,
        memory_dim=256,
        n_blocks=6,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        src_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        activation="relu",
        normalize_before=True,
        concat_after=False,
        share_embedding=False,
    ):
        """Transformer decoder stack with absolute positional encoding.

        Args:
            vocab_size: target vocabulary size.
            d_model: model width.
            n_heads: attention heads per layer.
            d_ff: FFN inner width.
            memory_dim: width of the encoder memory attended to.
            n_blocks: number of decoder layers.
            pos_dropout: dropout inside the positional encoding.
            slf_attn_dropout: self-attention dropout.
            src_attn_dropout: cross-attention dropout.
            ffn_dropout: FFN dropout.
            residual_dropout: dropout on residual connections.
            activation: FFN activation name.
            normalize_before: pre-norm if True (adds a final LayerNorm).
            concat_after: concat attention input/output before projecting.
            share_embedding: tie input embedding and output projection weights.
        """
        super(TransformerDecoder, self).__init__()

        self.decoder_type = "transformer"
        self.normalize_before = normalize_before
        self.relative_positional = False  # this decoder uses absolute positions
        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        def make_block():
            # Every decoder layer shares the exact same configuration.
            return TransformerDecoderLayer(
                n_heads,
                d_model,
                d_ff,
                memory_dim,
                slf_attn_dropout,
                src_attn_dropout,
                ffn_dropout,
                residual_dropout,
                normalize_before=normalize_before,
                concat_after=concat_after,
                relative_positional=False,
                activation=activation,
            )

        self.blocks = nn.ModuleList([make_block() for _ in range(n_blocks)])

        # Pre-norm stacks need one final normalization over the last output.
        if self.normalize_before:
            self.after_norm = nn.LayerNorm(d_model)

        self.output_layer = nn.Linear(d_model, vocab_size)

        if share_embedding:
            # Weight tying: shapes must match exactly.
            assert self.embedding.weight.size() == self.output_layer.weight.size()
            self.output_layer.weight = self.embedding.weight
            logger.info("Tie the weights between the embedding and output layer.")