def test_inference_no_head_absolute_embedding(self):
        model = ElectraModel.from_pretrained("google/electra-small-discriminator")
        input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
        attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
        output = model(input_ids, attention_mask=attention_mask)[0]
        expected_shape = torch.Size((1, 11, 256))
        self.assertEqual(output.shape, expected_shape)
        expected_slice = torch.tensor(
            [[[0.4471, 0.6821, -0.3265], [0.4627, 0.5255, -0.3668], [0.4532, 0.3313, -0.4344]]]
        )

        self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
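
For reference, a minimal standalone sketch of the same inference call, using the matching tokenizer instead of hand-written token ids (checkpoint name as in the test above):

import torch
from transformers import ElectraModel, ElectraTokenizer

# encode a sentence and inspect the hidden-state shape
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
model = ElectraModel.from_pretrained("google/electra-small-discriminator")
model.eval()

inputs = tokenizer("ELECTRA replaces masked language modeling with replaced token detection.",
                   return_tensors="pt")
with torch.no_grad():
    hidden_states = model(**inputs)[0]
print(hidden_states.shape)  # (1, seq_len, 256) for the small discriminator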
Example #2
    def __init__(self, config):
        super().__init__(config)

        self.electra = ElectraModel(config)
        self.generator_predictions = ElectraGeneratorPredictions(config)

        self.loss_fct = nn.CrossEntropyLoss(
            reduction='none')  # -100 index = padding token

        self.generator_lm_head = nn.Linear(config.embedding_size,
                                           config.vocab_size)
        self.init_weights()
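
A hedged sketch of how such a generator head is typically applied in a forward pass; the argument names (input_ids, attention_mask, labels) are illustrative and not taken from the original class:

    def forward(self, input_ids, attention_mask=None, labels=None):
        # illustrative sketch only: run the backbone, project to vocabulary logits,
        # then compute an unreduced per-token cross-entropy (padding labelled -100 is ignored)
        hidden_states = self.electra(input_ids, attention_mask=attention_mask)[0]
        prediction_scores = self.generator_lm_head(self.generator_predictions(hidden_states))
        loss = None
        if labels is not None:
            loss = self.loss_fct(prediction_scores.view(-1, prediction_scores.size(-1)),
                                 labels.view(-1))
        return prediction_scores, loss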
Example #3
    def create_and_check_electra_model(
        self,
        config,
        input_ids,
        token_type_ids,
        input_mask,
        sequence_labels,
        token_labels,
        choice_labels,
        fake_token_labels,
    ):
        model = ElectraModel(config=config)
        model.to(torch_device)
        model.eval()
        (sequence_output, ) = model(input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=token_type_ids)
        (sequence_output, ) = model(input_ids, token_type_ids=token_type_ids)
        (sequence_output, ) = model(input_ids)

        result = {
            "sequence_output": sequence_output,
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].size()),
            [self.batch_size, self.seq_length, self.hidden_size])
    def create_and_check_electra_model_as_decoder(
        self,
        config,
        input_ids,
        token_type_ids,
        input_mask,
        sequence_labels,
        token_labels,
        choice_labels,
        encoder_hidden_states,
        encoder_attention_mask,
    ):
        config.add_cross_attention = True
        model = ElectraModel(config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
        )
        result = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            encoder_hidden_states=encoder_hidden_states,
        )
        result = model(input_ids,
                       attention_mask=input_mask,
                       token_type_ids=token_type_ids)
        self.parent.assertEqual(
            result.last_hidden_state.shape,
            (self.batch_size, self.seq_length, self.hidden_size))
Example #5
    def __init__(self, extractor, config, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.extractor = extractor
        self.config = config

        if config["pretrained"] == "electra-base-msmarco":
            self.bert = ElectraModel.from_pretrained(
                "Capreolus/electra-base-msmarco")
        elif config["pretrained"] == "bert-base-msmarco":
            self.bert = BertModel.from_pretrained(
                "Capreolus/bert-base-msmarco")
        elif config["pretrained"] == "bert-base-uncased":
            self.bert = BertModel.from_pretrained("bert-base-uncased")
        else:
            raise ValueError(
                f"unsupported model: {config['pretrained']}; need to ensure correct tokenizers will be used before arbitrary hgf models are supported"
            )

        self.transformer_layer_1 = BertLayer(self.bert.config)
        self.transformer_layer_2 = BertLayer(self.bert.config)
        self.num_passages = extractor.config["numpassages"]
        self.maxseqlen = extractor.config["maxseqlen"]
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)

        if config["aggregation"] == "max":
            raise NotImplementedError()
        elif config["aggregation"] == "avg":
            raise NotImplementedError()
        elif config["aggregation"] == "attn":
            raise NotImplementedError()
        elif config["aggregation"] == "transformer":
            self.aggregation = self.aggregate_using_transformer
            input_embeddings = self.bert.get_input_embeddings()
            # TODO hardcoded CLS token id
            cls_token_id = torch.tensor([[101]])
            self.initial_cls_embedding = input_embeddings(cls_token_id).view(
                1, self.bert.config.hidden_size)
            self.full_position_embeddings = torch.zeros(
                (1, self.num_passages + 1, self.bert.config.hidden_size),
                requires_grad=True,
                dtype=torch.float)
            torch.nn.init.normal_(self.full_position_embeddings,
                                  mean=0.0,
                                  std=0.02)

            self.initial_cls_embedding = nn.Parameter(
                self.initial_cls_embedding, requires_grad=True)
            self.full_position_embeddings = nn.Parameter(
                self.full_position_embeddings, requires_grad=True)
        else:
            raise ValueError(
                f"unknown aggregation type: {self.config['aggregation']}")
Example #6
    def __init__(self):

        super(ElectraClassificationHead, self).__init__()

        electra_base = "google/electra-base-discriminator"
        electra_large = "google/electra-large-discriminator"
        self.electra = ElectraModel.from_pretrained(electra_large)
        self.dense = torch.nn.Linear(self.electra.config.hidden_size,
                                     self.electra.config.hidden_size)
        self.dropout = torch.nn.Dropout(
            self.electra.config.hidden_dropout_prob)
        self.out_proj = torch.nn.Linear(self.electra.config.hidden_size, 1)
        self.gelu = torch.nn.GELU()
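
A hedged sketch of the corresponding forward pass, following the usual ELECTRA classification-head pattern (first-token pooling, dense + GELU, projection); the argument names are illustrative:

    def forward(self, input_ids, attention_mask=None):
        # illustrative sketch only
        hidden_states = self.electra(input_ids, attention_mask=attention_mask)[0]
        x = hidden_states[:, 0]            # first-token ([CLS]) representation
        x = self.dropout(x)
        x = self.gelu(self.dense(x))
        x = self.dropout(x)
        return self.out_proj(x)            # (batch_size, 1)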
    def __init__(self, config, ):
        super(ElectraSpanForNer, self).__init__(config)
        self.num_labels = config.num_labels
        self.soft_label = config.soft_label
        self.BaseModel = ElectraModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.start_fc = PoolerStartLogits(config.hidden_size, self.num_labels)
        if self.soft_label:
            self.end_fc = PoolerEndLogits(config.hidden_size + self.num_labels, self.num_labels)
        else:
            self.end_fc = PoolerEndLogits(config.hidden_size + 1, self.num_labels)
        self.init_weights()
    def __init__(self,
                 config,
                 model_name,
                 only_embedding=True,
                 output_hidden_states=True):
        super(ElectraTokenEmbedder, self).__init__(config)
        self.config = config
        self.only_embedding = only_embedding
        self.model = ElectraModel.from_pretrained(
            model_name, output_hidden_states=output_hidden_states)
        if self.only_embedding:
            self.model = self.model.get_input_embeddings()
            self.model.weight.requires_grad = False
Example #9
class ElectraEncoder(ElectraPreTrainedModel):
    def __init__(self, config):
        super(ElectraEncoder, self).__init__(config)

        self.electra = ElectraModel(config)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.electra.forward(input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids)
        pooled_output = outputs[0][:, 0]  # take the first-token ([CLS]) embedding
        return pooled_output
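
A brief usage sketch, assuming a default (randomly initialized) ElectraConfig just to show the output shape; load pretrained weights for real use:

from transformers import ElectraConfig, ElectraTokenizer

config = ElectraConfig()                       # illustrative: random weights, default sizes
encoder = ElectraEncoder(config)
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")

batch = tokenizer(["hello world"], return_tensors="pt")
pooled = encoder(batch["input_ids"],
                 attention_mask=batch["attention_mask"],
                 token_type_ids=batch["token_type_ids"])
print(pooled.shape)                            # (batch_size, hidden_size)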
Example #10
    def __init__(self, config):
        super().__init__(config)
        self.hidden_size = config.hidden_size
        self.electra = ElectraModel(config)
        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
        self.pooler_activation = nn.Tanh()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.classifier2 = nn.Linear(config.hidden_size, 2)

        self.gru = GRUWithPadding(config)
        #self.attention = MultiHeadAttention(config.hidden_size)
        self.init_weights()
Example #11
    def __init__(self, model_name, config, num_speakers=2):
        super().__init__(config)

        self.num_speakers = num_speakers
        self.electra      = ElectraModel.from_pretrained(model_name)
        self.embeddings   = SpeakerAwareElectraEmbeddings(config, self.num_speakers)
        self.num_labels   = config.num_labels

        self.dense        = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout      = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj     = nn.Linear(config.hidden_size, self.num_labels)
        
        self.gelu = nn.GELU()
        self.init_weights()
Example #12
    def __init__(self, output_size, dropout_rate=0.1, device='cpu'):
        super().__init__()
        self.device = device
        self.dropout = nn.Dropout(p=dropout_rate)
        self.electra = ElectraModel.from_pretrained(
            'google/electra-small-discriminator').to(device)
        self.cls_query = nn.Parameter(
            torch.randn(1, self.electra.config.hidden_size, device=device)
        )  # a learnable vector acting as query for output att
        self.cls_att = AttentionModule(d_model=self.electra.config.hidden_size,
                                       d_k=self.electra.config.hidden_size,
                                       device=device,
                                       dropout=self.dropout)
        self.output = nn.Linear(self.electra.config.hidden_size,
                                output_size).to(device)
Example #13
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.electra = ElectraModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.loss_fct = nn.CrossEntropyLoss()
        self.use_crf = config.use_crf
        if self.use_crf:
            self.crf_layer = Transformer_CRF(
                num_labels=config.num_labels,
                start_label_id=config.label2idx['CLS'])
        else:
            self.crf_layer = None
        self.init_weights()
Example #14
    def __init__(self, config: TrainConfig, logger: logging.Logger):
        super().__init__()
        self.config = config

        self.electra: ElectraModel = ElectraModel.from_pretrained(
            config.pretrained_model_name)
        self.dense = nn.Linear(self.electra.config.hidden_size,
                               self.electra.config.hidden_size)
        self.dropout = nn.Dropout(self.electra.config.hidden_dropout_prob)

        self.bias_classifier = nn.Linear(self.electra.config.hidden_size, 3)
        self.hate_classifier = nn.Linear(self.electra.config.hidden_size, 3)

        self.criterion = nn.CrossEntropyLoss()
        self.learning_rate = config.learning_rate

        self.stream_logger = logger
Example #15
def _get_bert(model_type, model_path_dict):
    if model_type == 'bert':
        config = BertConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = BertModel.from_pretrained(model_path_dict['model'],
                                         config=config)
    elif model_type == 'electra':
        config = ElectraConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = ElectraModel.from_pretrained(model_path_dict['model'],
                                            config=config)
    elif model_type == 'roberta':
        config = RobertaConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = RobertaModel.from_pretrained(model_path_dict['model'],
                                            config=config)
    else:
        raise ValueError(f"unsupported model_type: {model_type}")
    return bert, config
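
A usage sketch with assumed inputs; the entries of model_path_dict are placeholders, not paths from the original project:

# hypothetical paths for illustration; any local or hub ELECTRA checkpoint works
model_path_dict = {
    "config": "google/electra-base-discriminator",
    "model": "google/electra-base-discriminator",
}
bert, config = _get_bert('electra', model_path_dict)
print(type(bert).__name__, config.output_hidden_states)  # ElectraModel True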
Example #16
    def __init__(self, config, bidirectional=True):
        super().__init__(config)
        self.electra = ElectraModel(config)
        feature_dim = config.hidden_size
        if bidirectional:
            feature_dim += config.hidden_size
        self.score = nn.Linear(2 * config.hidden_size, 1)
        self.pooler = nn.Linear(feature_dim, config.hidden_size)
        self.pooler_activation = nn.Tanh()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.classifier2 = nn.Linear(config.hidden_size, 2)
        self.bidirectional = bidirectional
        self.gru1 = GRUWithPadding(config.hidden_size,
                                   bidirectional=bidirectional)

        self.init_weights()
        print("bidirectional is: " + str(bidirectional))
Example #17
    def __init__(
            self,
            intent_class_num,
            entity_class_num,
            default_model_path='monologg/koelectra-small-v2-discriminator',
            pad_token_id=0):
        super(KoELECTRAFineTuner, self).__init__()

        self.intent_class_num = intent_class_num
        self.entity_class_num = entity_class_num
        self.backbone = ElectraModel.from_pretrained(default_model_path)
        self.feature_dim = self.backbone.config.hidden_size
        self.intent_embedding = nn.Linear(self.feature_dim,
                                          self.intent_class_num)
        self.entity_embedding = nn.Linear(self.feature_dim,
                                          self.entity_class_num)
        self.entity_featurizer = CRF(self.entity_class_num, batch_first=True)
        self.pad_token_id = pad_token_id
Example #18
    def __init__(self, hidden_dim, pretrained_model):
        super(UtterancePretrainedModel, self).__init__()
        self._pretrained_model = pretrained_model

        if pretrained_model == "bert":
            self._encoder = BertModel.from_pretrained("bert-base-uncased")
        elif pretrained_model == "roberta":
            self._encoder = RobertaModel.from_pretrained("roberta-base")
        elif pretrained_model == "xlnet":
            self._encoder = XLNetModel.from_pretrained("xlnet-base-cased")
        elif pretrained_model == "albert":
            self._encoder = AlbertModel.from_pretrained("albert-base-v2")
        elif pretrained_model == "electra":
            self._encoder = ElectraModel.from_pretrained("google/electra-base-discriminator")
        else:
            raise ValueError(f"unsupported --pretrained_model value: {pretrained_model}")

        self._linear = nn.Linear(UtterancePretrainedModel.HIDDEN_DIM, hidden_dim)
    def __init__(self, configs):
        super(ElectraClassification, self).__init__()
        self.configs = configs
        self.bert_hiddensize = self.configs["bert_hiddensize"]
        self.dense = self.configs["dense"]
        self.label_nums = self.configs["label_nums"]
        self.dropout = self.configs["dropout"]

        self.electra_model = ElectraModel.from_pretrained(self.configs["path"]["electra_path"])
        # for p in self.bert_model.parameters():
        #     p.requires_grad = True
        # output shape of bert: (batch_size, seqlens, lstm_hiddensize)
        self.classification = torch.nn.Sequential(
            torch.nn.Linear(self.bert_hiddensize, self.dense),
            torch.nn.ReLU(),
            torch.nn.Dropout(self.dropout),
            torch.nn.Linear(self.dense, self.label_nums)
        )
Example #20
    def __init__(
        self,
        backbone,  # None or one of "kobert", "distill_kobert", "koelectra"
        vocab_size: int,
        seq_len: int,
        intent_class_num: int,
        entity_class_num: int,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        pad_token_id: int = 0,
    ):
        super(EmbeddingTransformer, self).__init__()
        self.backbone = backbone
        self.seq_len = seq_len
        self.pad_token_id = pad_token_id

        if backbone is None:
            self.encoder = nn.TransformerEncoder(
                TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                        dropout, activation),
                num_encoder_layers,
                LayerNorm(d_model),
            )
        else:  # pre-defined model architecture use
            if backbone == "kobert":
                self.encoder = get_kobert_model()
            elif backbone == "distill_kobert":
                self.encoder = get_distilkobert_model()
            elif backbone == "koelectra":
                self.encoder = ElectraModel.from_pretrained(
                    "monologg/koelectra-small-v2-discriminator")

            d_model = self.encoder.config.hidden_size

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(self.seq_len, d_model)

        self.intent_feature = nn.Linear(d_model, intent_class_num)
        self.entity_feature = nn.Linear(d_model, entity_class_num)
Example #21
    def __init__(self, config, need_birnn=False, rnn_dim=128):
        super(Electra_BiLSTM_CRF, self).__init__(config)

        self.num_tags = config.num_labels
        self.electra = ElectraModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        out_dim = config.hidden_size
        self.need_birnn = need_birnn

        # if need_birnn is False, skip the BiLSTM layer
        if need_birnn:
            self.birnn = nn.LSTM(config.hidden_size,
                                 rnn_dim,
                                 num_layers=1,
                                 bidirectional=True,
                                 batch_first=True)
            out_dim = rnn_dim * 2

        self.hidden2tag = nn.Linear(out_dim, config.num_labels)
        self.crf = CRF(config.num_labels, batch_first=True)
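
A hedged sketch of a matching forward pass; the CRF calls follow the pytorch-crf API (negative log-likelihood via the module call, decode for inference) and the argument names are illustrative:

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, tags=None):
        # illustrative sketch only
        sequence_output = self.electra(input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids)[0]
        sequence_output = self.dropout(sequence_output)
        if self.need_birnn:
            sequence_output, _ = self.birnn(sequence_output)
        emissions = self.hidden2tag(sequence_output)
        mask = attention_mask.bool() if attention_mask is not None else None
        if tags is not None:
            # pytorch-crf returns a log-likelihood; negate it to use as a loss
            return -self.crf(emissions, tags, mask=mask, reduction='mean')
        return self.crf.decode(emissions, mask=mask)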
Example #22
    def __get_model_and_tokenizer(self):
        model, tokenizer = None, None

        if self.transformer_model == TransformerType.BERT:
            tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            model = BertModel.from_pretrained('bert-base-cased')

        if self.transformer_model == TransformerType.XLNet:
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
            model = XLNetModel.from_pretrained('xlnet-base-cased')

        if self.transformer_model == TransformerType.RoBERTa:
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
            model = RobertaModel.from_pretrained('roberta-base')

        if self.transformer_model == TransformerType.ELECTRA:
            tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
            model = ElectraModel.from_pretrained('google/electra-small-discriminator')

        return model, tokenizer
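
A short follow-up sketch showing one way the returned pair might be used, assuming model and tokenizer came from the helper above: masked mean pooling of the last hidden state into a fixed-size sentence vector (the sentence and the pooling choice are illustrative):

import torch

inputs = tokenizer("ELECTRA is a sample-efficient pre-training method.", return_tensors="pt")
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state                # (1, seq_len, hidden_size)
mask = inputs["attention_mask"].unsqueeze(-1).float()         # (1, seq_len, 1)
sentence_vec = (hidden * mask).sum(dim=1) / mask.sum(dim=1)   # (1, hidden_size)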
Example #23
    def __init__(self, config: dict):
        super(Model, self).__init__()
        self.electra_cfg = ElectraConfig()
        self.electra = ElectraModel.from_pretrained(config["pretrained_dir"] +
                                                    "electra_small.index",
                                                    config=self.electra_cfg,
                                                    from_tf=True)

        self.sentence_encoder = AttentionSentenceEncoder(
            self.electra_cfg.hidden_size, config["sent_head"],
            config["max_sents"] + 1)  # 多一个位置给CLS
        self.img_encoder = SimpleImageEncoder(config["img_input_size"],
                                              config["img_output_size"],
                                              config["img_num"],
                                              dropout=config["dropout"])

        self.output_layer = OutputLayer(
            config["task"],
            self.electra_cfg.hidden_size + config["img_output_size"],
            config["output_size"], config["dropout"])
Example #24
    def __init__(self,
                 config,
                 add_GRU=True,
                 bidirectional=False,
                 word_level=False,
                 add_cls=False,
                 word_and_sent=False):
        super().__init__(config)
        self.electra = ElectraModel(config)
        feature_dim = config.hidden_size
        if bidirectional:
            feature_dim += config.hidden_size
        if word_and_sent:
            feature_dim *= 2
        if add_cls:
            feature_dim += config.hidden_size
        self.pooler = nn.Linear(feature_dim, config.hidden_size)
        self.pooler_activation = nn.Tanh()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.classifier2 = nn.Linear(config.hidden_size, 2)
        self.add_GRU = add_GRU
        self.word_level = word_level
        self.add_cls = add_cls
        self.bidirectional = bidirectional
        self.word_and_sent = word_and_sent
        if self.add_GRU:
            self.gru = GRUWithPadding(config.hidden_size)
            #self.gru = nn.GRU(config.hidden_size,config.hidden_size,num_layers=1,batch_first = True, bidirectional=bidirectional)
        if self.word_and_sent:
            self.gru2 = nn.GRU(config.hidden_size,
                               config.hidden_size,
                               num_layers=1,
                               batch_first=True,
                               bidirectional=bidirectional)
        self.init_weights()
        print("add_GRU is: " + str(add_GRU))
        print("bidirectional is: " + str(bidirectional))
        print("word_level is:" + str(word_level))
        print("add_cls is:" + str(add_cls))
        print("word_and_sent is:" + str(word_and_sent))
Example #25
    def __init__(self,
                 config: ElectraConfig,
                 args: Namespace,
                 bias_label_lst=None,
                 hate_label_lst=None):
        super().__init__(config)
        self.args = args
        self.num_bias_labels = len(
            bias_label_lst) if bias_label_lst is not None else 0
        self.num_hate_labels = len(
            hate_label_lst) if hate_label_lst is not None else 0

        self.electra = ElectraModel(config)
        self.bias_classifier = BiasClassificationHead(config,
                                                      self.num_bias_labels)
        self.hate_classifier = HateClassificationHead(config,
                                                      self.num_hate_labels)

        self.loss_fct = nn.CrossEntropyLoss()

        self.init_weights()
Example #26
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.num_regs = config.num_regs

        self.electra = ElectraModel(config)

        self.classifier = nn.Sequential(
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, config.num_labels),
        )
        self.regressor = nn.Sequential(
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, config.num_regs),
        )
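
A hedged sketch of a forward pass that would pair with these two heads; first-token pooling and the argument names are assumptions, not taken from the original model:

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        # illustrative sketch only: first-token pooling feeding both heads
        sequence_output = self.electra(input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids)[0]
        pooled = sequence_output[:, 0]                 # (batch_size, hidden_size)
        logits = self.classifier(pooled)               # (batch_size, num_labels)
        regs = self.regressor(pooled)                  # (batch_size, num_regs)
        return logits, regs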
Example #27
    def __init__(self, config, num_rnn=1, num_decoupling=1):
        super().__init__(config)

        self.electra = ElectraModel(config)
        self.num_decoupling = num_decoupling

        self.localMHA = nn.ModuleList(
            [MHA(config) for _ in range(num_decoupling)])
        self.globalMHA = nn.ModuleList(
            [MHA(config) for _ in range(num_decoupling)])

        self.fuse1 = FuseLayer(config)
        self.fuse2 = FuseLayer(config)

        self.gru1 = GRUWithPadding(config, num_rnn)

        self.pooler = nn.Linear(2 * config.hidden_size, config.hidden_size)
        self.pooler_activation = nn.Tanh()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        self.init_weights()
Example #28
    def __init__(self, config, bidirectional=False):
        super().__init__(config)
        self.electra = ElectraModel(config)
        feature_dim = config.hidden_size * 4
        if bidirectional:
            feature_dim *= 2

        #Attention Flow Layer
        self.att_weight_c = nn.Linear(config.hidden_size, 1)
        self.att_weight_q = nn.Linear(config.hidden_size, 1)
        self.att_weight_cq = nn.Linear(config.hidden_size, 1)

        self.pooler = nn.Linear(feature_dim, config.hidden_size)
        self.pooler_activation = nn.Tanh()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.classifier2 = nn.Linear(config.hidden_size, 2)
        self.bidirectional = bidirectional
        self.gru1 = GRUWithPadding(config.hidden_size * 4,
                                   bidirectional=bidirectional)

        self.init_weights()
        print("bidirectional is: " + str(bidirectional))
Example #29
    def __init__(
        self,
        config,
        n_layers=2,
        activation='relu',
        beta=100,
    ):
        super(ElectraForConversationalQuestionAnswering, self).__init__(config)
        self.electra = ElectraModel(config)
        hidden_size = config.hidden_size
        self.rational_l = MultiLinearLayer(n_layers, hidden_size, hidden_size,
                                           1, activation)
        self.logits_l = MultiLinearLayer(n_layers, hidden_size, hidden_size, 2,
                                         activation)
        self.unk_l = MultiLinearLayer(n_layers, hidden_size, hidden_size, 1,
                                      activation)
        self.attention_l = MultiLinearLayer(n_layers, hidden_size, hidden_size,
                                            1, activation)
        self.yn_l = MultiLinearLayer(n_layers, hidden_size, hidden_size, 2,
                                     activation)
        self.beta = beta

        self.init_weights()
Example #30
class Pronunciation2Spelling(nn.Module):
    def __init__(self, enc_config, dec_config):
        super(Pronunciation2Spelling, self).__init__()
        self.encoders = ElectraModel(enc_config)
        self.embedding = self.encoders.get_input_embeddings()
        if enc_config.embedding_size != dec_config.hidden_size:
            self.embedding_projection = nn.Linear(enc_config.embedding_size,
                                                  dec_config.hidden_size)
        self.decoders = Decoders(dec_config)
        self.dense = nn.Linear(dec_config.hidden_size,
                               dec_config.trg_vocab_size)

        self.padding_idx = dec_config.padding_idx

    def forward(self, enc_ids, dec_ids):
        dec_embeddings = self.embedding(dec_ids)
        if hasattr(self, 'embedding_projection'):
            dec_embeddings = self.embedding_projection(dec_embeddings)
        enc_outputs = self.encoders(enc_ids).last_hidden_state
        dec_outputs, _, _ = self.decoders(enc_ids, enc_outputs, dec_ids,
                                          dec_embeddings)
        model_output = self.dense(dec_outputs)
        return model_output