Example #1
    def __init__(self,
                 z_dim,
                 input_size,
                 dist_type,
                 image_features_type="global"):
        assert(dist_type in ["normal", "logistic_normal"]), \
                "Distribution not supported: %s"%str(dist_type)
        assert(image_features_type in ["global", "posterior", "local"]), \
                "Image features type not supported: %s"%str(image_features_type)

        super(GlobalFullInferenceNetwork,
              self).__init__(z_dim, input_size, dist_type)

        self.image_features_type = image_features_type
        if image_features_type == 'local':
            attn_type = 'general'
            coverage_attn = False
            # TODO: remove hardcoded hyperparameters
            hidden_size = 500

            # linear layer to project local image features into RNN hidden state space
            self.image_proj = nn.Linear(2048, 500)

            # create attention mechanisms for the local image features
            self.src_image_attn = GlobalAttention(hidden_size,
                                                  coverage=coverage_attn,
                                                  attn_type=attn_type)
            self.tgt_image_attn = GlobalAttention(hidden_size,
                                                  coverage=coverage_attn,
                                                  attn_type=attn_type)
            self.src_dropout = nn.Dropout(0.5)
            self.tgt_dropout = nn.Dropout(0.5)
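
The examples on this page only show constructors. For orientation, here is a minimal sketch of how a GlobalAttention module built this way is typically called during decoding. It follows the OpenNMT-py GlobalAttention interface (the constructor arguments and the memory_lengths keyword); the tensor shapes and dummy values are assumptions for illustration, not code from any example above.

import torch
from onmt.modules import GlobalAttention

batch, src_len, dim = 4, 10, 500

# Build the attention module the same way the constructors above do.
attn = GlobalAttention(dim, attn_type="general", attn_func="softmax")

query = torch.randn(batch, dim)                 # one decoder step: (batch, dim)
memory_bank = torch.randn(batch, src_len, dim)  # encoder outputs: (batch, src_len, dim)
lengths = torch.full((batch,), src_len, dtype=torch.long)

# Returns the attentional hidden state and the alignment (attention)
# distribution over source positions.
attn_h, align = attn(query, memory_bank, memory_lengths=lengths)
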
Example #2
    def __init__(self, rnn_type, bidirectional_encoder, num_layers,
                 hidden_size, attn_type="general", attn_func="softmax",
                 coverage_attn=False, context_gate=None,
                 copy_attn=False, dropout=0.0, embeddings=None,
                 reuse_copy_attn=False, copy_attn_type="general"):
        super(RNNDecoderBase, self).__init__(
            attentional=attn_type != "none" and attn_type is not None)

        self.bidirectional_encoder = bidirectional_encoder
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embeddings = embeddings
        self.dropout = nn.Dropout(dropout)

        # Decoder state
        self.state = {}

        # Build the RNN.
        #LSTM
        self.rnn = self._build_rnn(rnn_type,
                                   input_size=self._input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   dropout=dropout)

        # Set up the context gate.
        self.context_gate = None
        if context_gate is not None:
            self.context_gate = context_gate_factory(
                context_gate, self._input_size,
                hidden_size, hidden_size, hidden_size
            )

        # Set up the standard attention.
        self._coverage = coverage_attn
        if not self.attentional:
            if self._coverage:
                raise ValueError("Cannot use coverage term with no attention.")
            self.attn = None
        else:
            self.attn = GlobalAttention(
                hidden_size, coverage=coverage_attn,
                attn_type=attn_type, attn_func=attn_func
            )

        if copy_attn and not reuse_copy_attn:
            if copy_attn_type == "none" or copy_attn_type is None:
                raise ValueError(
                    "Cannot use copy_attn with copy_attn_type none")
            self.copy_attn = GlobalAttention(
                hidden_size, attn_type=copy_attn_type, attn_func=attn_func
            )
        else:
            self.copy_attn = None

        self._reuse_copy_attn = reuse_copy_attn and copy_attn
        if self._reuse_copy_attn and not self.attentional:
            raise ValueError("Cannot reuse copy attention with no attention.")
Example #3
    def __init__(self,
                 rnn_type,
                 bidirectional_encoder,
                 num_layers,
                 hidden_size,
                 attn_type="general",
                 attn_func="softmax",
                 coverage_attn=False,
                 context_gate=None,
                 copy_attn=False,
                 dropout=0.0,
                 embeddings=None,
                 reuse_copy_attn=False):
        super(RNNDecoderBase, self).__init__()

        self.bidirectional_encoder = bidirectional_encoder
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embeddings = embeddings
        self.dropout = nn.Dropout(dropout)

        # Decoder state
        self.state = {}

        # Build the RNN.
        self.rnn = self._build_rnn(rnn_type,
                                   input_size=self._input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   dropout=dropout)

        # Set up the context gate.
        self.context_gate = None
        if context_gate is not None:
            self.context_gate = context_gate_factory(context_gate,
                                                     self._input_size,
                                                     hidden_size, hidden_size,
                                                     hidden_size)

        # Set up the standard attention.
        self._coverage = coverage_attn
        self.attn = GlobalAttention(hidden_size,
                                    coverage=coverage_attn,
                                    attn_type=attn_type,
                                    attn_func=attn_func)

        if copy_attn and not reuse_copy_attn:
            self.copy_attn = GlobalAttention(hidden_size,
                                             attn_type=attn_type,
                                             attn_func=attn_func)
        else:
            self.copy_attn = None

        self._reuse_copy_attn = reuse_copy_attn and copy_attn
Example #4
    def __init__(self, num_layers, d_model, heads, d_ff, attn_type, copy_attn,
                 self_attn_type, dropout, embeddings, max_relative_positions):
        super(TransformerDecoder, self).__init__()

        self.embeddings = embeddings

        # Decoder State
        self.state = {}

        self.transformer_layers = nn.ModuleList([
            TransformerDecoderLayer(
                d_model,
                heads,
                d_ff,
                dropout,
                self_attn_type=self_attn_type,
                max_relative_positions=max_relative_positions)
            for i in range(num_layers)
        ])

        # previously, there was a GlobalAttention module here for copy
        # attention. But it was never actually used -- the "copy" attention
        # just reuses the context attention.
        self._copy = copy_attn
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.copy_attn = GlobalAttention(768,
                                         attn_type='general',
                                         attn_func='softmax')
Example #5
File: decoder.py  Project: A-Amer/nmt
    def __init__(self,
                 rnn_type,
                 bidirectional_encoder,
                 num_layers,
                 hidden_size,
                 attn_type="general",
                 attn_func="softmax",
                 dropout=0.0,
                 embeddings=None):
        super(RNNDecoderBase, self).__init__(
            attentional=attn_type != "none" and attn_type is not None)

        self.bidirectional_encoder = bidirectional_encoder
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embeddings = embeddings
        self.dropout = nn.Dropout(dropout)

        # Decoder state
        self.state = {}

        # Build the RNN.
        self.rnn = self._build_rnn(rnn_type,
                                   input_size=self._input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   dropout=dropout)

        self.attn = GlobalAttention(hidden_size,
                                    attn_type=attn_type,
                                    attn_func=attn_func)
Example #6
    def __init__(self, num_layers, hidden_size, attn_type, copy_attn,
                 cnn_kernel_width, dropout, embeddings, copy_attn_type):
        super(CNNDecoder, self).__init__()

        self.cnn_kernel_width = cnn_kernel_width
        self.embeddings = embeddings

        # Decoder State
        self.state = {}

        input_size = self.embeddings.embedding_size
        self.linear = nn.Linear(input_size, hidden_size)
        self.conv_layers = nn.ModuleList([
            GatedConv(hidden_size, cnn_kernel_width, dropout, True)
            for i in range(num_layers)
        ])
        self.attn_layers = nn.ModuleList(
            [ConvMultiStepAttention(hidden_size) for i in range(num_layers)])

        # CNNDecoder has its own attention mechanism.
        # Set up a separate copy attention layer if needed.
        assert not copy_attn, "Copy mechanism not yet tested in conv2conv"
        if copy_attn:
            self.copy_attn = GlobalAttention(hidden_size,
                                             attn_type=copy_attn_type)
        else:
            self.copy_attn = None
Example #7
    def __init__(self, opt, mode):
        super(DNC, self).__init__()

        self.input_feed = opt.input_feed if mode == 'decode' else 0

        opt.rnn_size = opt.word_vec_size if mode == 'diag_encode' else opt.rnn_size

        self.layers = opt.layers

        self.rnn_sz = (opt.word_vec_size, None) if self.layers == 1 else (
            opt.rnn_size, opt.word_vec_size)

        use_cuda = len(opt.gpus) > 0
        self.memory = Memory(opt.mem_slots, opt.mem_size,
                             opt.read_heads, opt.batch_size, use_cuda)

        input_sz = 2 * opt.word_vec_size if self.input_feed else opt.word_vec_size

        self.controller = Controller(input_sz, opt.word_vec_size, opt.read_heads, opt.rnn_size,
                                     opt.mem_size, opt.batch_size, opt.dropout,  self.layers)

        self.dropout = nn.Dropout(opt.dropout)
        self.attn = GlobalAttention(
            opt.word_vec_size) if opt.attn and mode == 'decode' else None
Example #8
    def __init__(self, opt, mode, memory, controller):
        super(DNC, self).__init__()

        self.input_feed = opt.input_feed if mode == 'decode' else 0

        output_size = opt.word_vec_size // 2 if mode == 'bla_diag_encode' else opt.word_vec_size

        opt.rnn_size = opt.word_vec_size if mode == 'diag_encode' else opt.rnn_size

        self.rnn_sz = (opt.word_vec_size,
                       None) if opt.layers == 1 else (opt.rnn_size,
                                                      output_size)

        self.layers = opt.layers
        self.net_data = [] if opt.gather_net_data else None

        use_cuda = len(opt.gpus) > 0
        self.memory = memory

        self.controller = controller

        self.dropout = nn.Dropout(opt.dropout)
        self.attn = GlobalAttention(
            opt.word_vec_size
        ) if opt.attn and mode == 'context_decode' else None
Example #9
    def __init__(self, opt):
        super(DNC, self).__init__()

        self.input_feed = opt.input_feed if opt.seq == 'decoder' else 0
        self.rnn_sz = (opt.word_vec_size, None) if opt.layers == 1 else (
            opt.rnn_size, opt.word_vec_size)
        self.layers = opt.layers
        self.net_data = [] if opt.gather_net_data else None

        use_cuda = len(opt.gpus) > 0
        self.memory = Memory(opt.mem_slots, opt.mem_size,
                             opt.read_heads, opt.batch_size, use_cuda)

        input_sz = 2 * opt.word_vec_size if self.input_feed else opt.word_vec_size
        self.controller = Controller(input_sz, opt.word_vec_size, opt.read_heads, opt.rnn_size,
                                     opt.mem_size, opt.batch_size, opt.dropout, opt.layers)

        self.attn = GlobalAttention(
            opt.word_vec_size) if opt.attn and opt.seq == 'decoder' else None
Example #10
    def __init__(self, distinct_tokens, encoder_count, sos, eos, max_len=70):
        super(Speller, self).__init__()
        self.vocab_to_embedding = nn.Embedding(distinct_tokens + 1,
                                               8)  # +1 for padding
        self.to_tokens = nn.Linear(encoder_count * 512, distinct_tokens + 1)
        self.initial_hiddens = nn.Linear(encoder_count * 512, 512)
        self.gru = nn.GRU(input_size=8,
                          hidden_size=512,
                          num_layers=1,
                          batch_first=True,
                          bidirectional=False,
                          dropout=0.15)
        self.attns = [
            GlobalAttention(512, attn_type="general")
            for _ in range(0, encoder_count)
        ]
        for i in range(0, encoder_count):
            setattr(self, "attn_{}".format(i + 1), self.attns[i])
        self.distinct_tokens = distinct_tokens
        self.max_len = max_len
        self.sos = sos
        self.eos = eos
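
A side note on the Speller example above: the list-plus-setattr pattern for registering one attention per encoder can be replaced with nn.ModuleList, which registers submodules automatically. A minimal sketch (the class name is hypothetical):

import torch.nn as nn
from onmt.modules import GlobalAttention

class MultiEncoderAttention(nn.Module):
    def __init__(self, encoder_count, dim=512):
        super().__init__()
        # One GlobalAttention per encoder; ModuleList tracks them as
        # submodules, so their parameters appear in .parameters() and
        # state_dict() without manual setattr calls.
        self.attns = nn.ModuleList(
            GlobalAttention(dim, attn_type="general")
            for _ in range(encoder_count)
        )
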
Example #11
    def __init__(
        self,
        embeddings=None,
        hidden=None,
        encoder_hidden=None,
        num_layers=1,
        teacher_forcing_p=0.0,
        attention=None,
        dropout=None,
        char_encoder_hidden=None,
        char_attention=None,
        word_dropout=None,
        encoder_feeding=False,
        variational=False,
        decoder="LSTM",
        latent_size=None,
    ):

        super(LSTM_Decoder, self).__init__()
        # TODO Use Fairseq attention
        self.embeddings = embeddings
        self.hidden = hidden
        self.embedding_dropout = nn.Dropout(dropout.input)

        self.hidden_state_dropout = nn.Dropout(dropout.output)
        self.word_dropout_p = word_dropout if word_dropout is not None else 0.0
        self.variational = variational
        if self.variational:
            encoder_feeding = True
            self.encoder_hidden_proj = lambda x: x
        else:
            self.encoder_hidden_proj = (
                nn.Linear(encoder_hidden, hidden, bias=False)
                if (encoder_hidden != hidden or encoder_hidden is None)
                else lambda x: x
            )

        self.lstm_decoder = nn.ModuleList()
        self.num_layers = num_layers

        self.input_feeding_size = 0  # latent_size if self.variational else 0

        if decoder == "LSTM":
            decoder_cell = [nn.LSTMCell] * self.num_layers
        elif decoder == "VDM":
            if self.num_layers == 1:
                decoder_cell = [VDM_LSTMCell]
            else:
                decoder_cell = [nn.LSTMCell] * (self.num_layers - 1) + [VDM_LSTMCell]
        for i in range(self.num_layers):
            if i == 0:
                self.lstm_decoder.append(
                    decoder_cell[i](
                        self.embeddings.embedding_dim + self.input_feeding_size,
                        self.hidden,
                    )
                )
            else:
                self.lstm_decoder.append(decoder_cell[i](self.hidden, self.hidden))
        self.teacher_forcing_p = teacher_forcing_p
        self.state = {
            "hidden": [None] * self.num_layers,
            "cell": [None] * self.num_layers,
            "latent": [None] * self.num_layers,
        }
        self.attention = attention
        self.proj_layer = nn.Linear(self.hidden, self.embeddings.num_embeddings)
        if attention is not None:
            self.enc_hidden_att_komp = (
                nn.Linear(encoder_hidden, hidden, bias=False)
                if (encoder_hidden != hidden or encoder_hidden is None)
                else lambda x: x
            )
            self.attention = Attention(
                self.hidden, attn_type=attention, attn_func="softmax",
            )
        self.char_attention = char_attention
        if char_attention is not None:
            self.char_hidden_att_komp = (
                nn.Linear(char_encoder_hidden, hidden, bias=False)
                if (char_encoder_hidden != hidden or char_encoder_hidden is None)
                else lambda x: x
            )
            self.char_attention = GlobalAttention(
                self.hidden, attn_type=attention, attn_func="softmax"
            )
            self.proj_layer = nn.Linear(self.hidden * 2, self.embeddings.num_embeddings)
Example #12
    def __init__(self, rnn_type, bidirectional_encoder, num_layers,
                 hidden_size, attn_type="general", attn_func="softmax",
                 coverage_attn=False, context_gate=None,
                 copy_attn=False, dropout=0.0, embeddings=None,
                 reuse_copy_attn=False, copy_attn_type="general",
                 num_emotion_classes=0, emotion_emb_size=0, 
                 generic_vocab_indices=None, emotion_vocab_indices=None, 
                 eds_type=0, no_clf_loss=False, no_eds_attention=False):
        super(RNNDecoderBase, self).__init__(
            attentional=attn_type != "none" and attn_type is not None)

        self.bidirectional_encoder = bidirectional_encoder
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embeddings = embeddings
        self.dropout = nn.Dropout(dropout)
        self.embedding_size = self.embeddings.embedding_size
        self.vocab_size = self.embeddings.word_vocab_size
        self.eds_type = eds_type
        self.no_clf_loss = no_clf_loss
        self.no_eds_attention = no_eds_attention

        # Emotion embedding
        self.num_emotion_classes = num_emotion_classes
        self.emotion_emb_size = emotion_emb_size
        rnn_input_size = self._input_size
        if num_emotion_classes != 0 and emotion_emb_size != 0:
            self.emo_embedding = nn.Embedding(num_emotion_classes, emotion_emb_size)
            rnn_input_size += emotion_emb_size

        # EDS model
        self.generic_vocab_indices = None # a 1D list
        self.emotion_vocab_indices = None # a 2D list
        if generic_vocab_indices is not None:
            if not self.no_eds_attention:
                rnn_input_size *= 2 # one from word embedding and another from emotion embedding
            
            self.all_vocab_indices = nn.Parameter(torch.arange(0, self.vocab_size, dtype=torch.long), requires_grad=False)
            self.generic_vocab_indices = nn.Parameter(torch.LongTensor(generic_vocab_indices), requires_grad=False)
            self.emotion_vocab_indices = nn.Parameter(torch.LongTensor(emotion_vocab_indices), requires_grad=False)
            self.generic_vocab_size = self.generic_vocab_indices.size(0) 
            self.emotion_vocab_size = self.emotion_vocab_indices.size(1)
            self.num_emotions = self.emotion_vocab_indices.size(0)
            self.alpha = nn.Parameter(torch.zeros(hidden_size))
            self.beta = nn.Parameter(torch.zeros(hidden_size))
            self.gamma = nn.Parameter(torch.zeros(self.embedding_size))
            self.emotion_classifier = nn.Linear(self.embedding_size, self.num_emotions)
            self.generic_mask = nn.Parameter(torch.zeros(self.vocab_size), requires_grad=False)
            self.generic_mask[self.generic_vocab_indices] = 1
            
            other_emotion_indices = []
            flattened_emotion_vocab_indices = [i for e in emotion_vocab_indices for i in e]
            for i in range(len(emotion_vocab_indices)):
                other_emotion_indices.append(list(set(flattened_emotion_vocab_indices).difference(set(emotion_vocab_indices[i]))))
            self.other_emotion_indices = nn.Parameter(torch.LongTensor(other_emotion_indices), requires_grad=False)
            self.all_emotion_indices = nn.Parameter(torch.LongTensor(list(set(flattened_emotion_vocab_indices))), requires_grad=False)
            
            # Frozen copy of the full word-embedding table, shape (vocab, emb_size).
            self.vocab_embedding = nn.Parameter(
                self.embeddings(self.all_vocab_indices.unsqueeze(0).unsqueeze(-1)).squeeze(0),
                requires_grad=False)

        # Decoder state
        self.state = {}

        # Build the RNN.
        self.rnn = self._build_rnn(rnn_type,
                                   input_size=rnn_input_size, # input_size=self._input_size
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   dropout=dropout)

        # Set up the context gate.
        self.context_gate = None
        if context_gate is not None:
            self.context_gate = context_gate_factory(
                context_gate, self._input_size,
                hidden_size, hidden_size, hidden_size
            )

        # Set up the standard attention.
        self._coverage = coverage_attn
        if not self.attentional:
            if self._coverage:
                raise ValueError("Cannot use coverage term with no attention.")
            self.attn = None
        else:
            self.attn = GlobalAttention(
                hidden_size, coverage=coverage_attn,
                attn_type=attn_type, attn_func=attn_func
            )

        if copy_attn and not reuse_copy_attn:
            if copy_attn_type == "none" or copy_attn_type is None:
                raise ValueError(
                    "Cannot use copy_attn with copy_attn_type none")
            self.copy_attn = GlobalAttention(
                hidden_size, attn_type=copy_attn_type, attn_func=attn_func
            )
        else:
            self.copy_attn = None

        self._reuse_copy_attn = reuse_copy_attn and copy_attn
        if self._reuse_copy_attn and not self.attentional:
            raise ValueError("Cannot reuse copy attention with no attention.")
Example #13
    def __init__(self,
                 rnn_type,
                 bidirectional_encoder,
                 num_layers,
                 hidden_size,
                 attn_type="general",
                 attn_func="softmax",
                 coverage_attn=False,
                 context_gate=None,
                 teacher_forcing="teacher",
                 copy_attn=False,
                 dropout=0.0,
                 embeddings=None,
                 reuse_copy_attn=False,
                 copy_attn_type="general"):
        super(RNNDecoderBase, self).__init__(
            attentional=attn_type != "none" and attn_type is not None)

        self.bidirectional_encoder = bidirectional_encoder
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embeddings = embeddings
        self.dropout = nn.Dropout(dropout)
        self.teacher_forcing = teacher_forcing
        # Decoder state
        self.state = {}
        self.lin = nn.Linear(self.hidden_size, 100)  # This line!
        # Build the RNN.
        self.rnn = self._build_rnn(rnn_type,
                                   input_size=self._input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   dropout=dropout)
        self.eval_status = False
        # Set up the context gate.
        self.context_gate = None
        if context_gate is not None:
            self.context_gate = context_gate_factory(context_gate,
                                                     self._input_size,
                                                     hidden_size, hidden_size,
                                                     hidden_size)

        # Set up the standard attention.
        self._coverage = coverage_attn
        if not self.attentional:
            if self._coverage:
                raise ValueError("Cannot use coverage term with no attention.")
            self.attn = None
        else:
            self.attn = GlobalAttention(hidden_size,
                                        coverage=coverage_attn,
                                        attn_type=attn_type,
                                        attn_func=attn_func)

        if copy_attn and not reuse_copy_attn:
            if copy_attn_type == "none" or copy_attn_type is None:
                raise ValueError(
                    "Cannot use copy_attn with copy_attn_type none")
            self.copy_attn = GlobalAttention(hidden_size,
                                             attn_type=copy_attn_type,
                                             attn_func=attn_func)
        else:
            self.copy_attn = None

        self.vocab_size = 0  #Only used by student-forcing, rand, and dist
        self.generator = None  #Only used by student-forcing, rand, and dist

        self._reuse_copy_attn = reuse_copy_attn and copy_attn
        if self._reuse_copy_attn and not self.attentional:
            raise ValueError("Cannot reuse copy attention with no attention.")
Example #14
    def __init__(
        self,
        rnn_type,
        bidirectional_encoder,
        num_layers,
        hidden_size,
        attn_type="general",
        attn_func="softmax",
        coverage_attn=False,
        context_gate=None,
        copy_attn=False,
        dropout=0.0,
        embeddings=None,
        reuse_copy_attn=False,
        copy_attn_type="general",
        target_encoder_type=None,
        detach_target_encoder=False,
    ):
        super(RNNDecoderBase, self).__init__(
            attentional=attn_type != "none" and attn_type is not None)

        self.bidirectional_encoder = bidirectional_encoder
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embeddings = embeddings
        self.dropout = nn.Dropout(dropout)

        # Decoder state
        self.state = {}

        # @memray: hack to change size for target encoding
        self.input_size = self._input_size
        if target_encoder_type == 'none':
            target_encoder_type = None
        if target_encoder_type is not None:
            self.input_size += self.hidden_size

        # Build the RNN.
        self.rnn = self._build_rnn(rnn_type,
                                   input_size=self.input_size,
                                   hidden_size=hidden_size,
                                   num_layers=1,
                                   dropout=dropout)

        # Set up the context gate.
        self.context_gate = None
        if context_gate is not None:
            self.context_gate = context_gate_factory(context_gate,
                                                     self.input_size,
                                                     hidden_size, hidden_size,
                                                     hidden_size)

        # Set up the standard attention.
        self._coverage = coverage_attn
        if not self.attentional:
            if self._coverage:
                raise ValueError("Cannot use coverage term with no attention.")
            self.attn = None
        else:
            self.attn = GlobalAttention(hidden_size,
                                        coverage=coverage_attn,
                                        attn_type=attn_type,
                                        attn_func=attn_func)

        if copy_attn and not reuse_copy_attn:
            if copy_attn_type == "none" or copy_attn_type is None:
                raise ValueError(
                    "Cannot use copy_attn with copy_attn_type none")
            self.copy_attn = GlobalAttention(hidden_size,
                                             attn_type=copy_attn_type,
                                             attn_func=attn_func)
        else:
            self.copy_attn = None

        self._reuse_copy_attn = reuse_copy_attn and copy_attn
        if self._reuse_copy_attn and not self.attentional:
            raise ValueError("Cannot reuse copy attention with no attention.")

        # @memray
        # Build the Target Encoder. Feed its output to the decoder as auxiliary input
        self.target_encoder_type = target_encoder_type
        self.target_encoder = None
        if target_encoder_type == 'rnn':
            self.target_encoder = self._build_rnn(
                "GRU",
                input_size=self.embeddings.embedding_size,
                hidden_size=hidden_size,
                num_layers=1,
                dropout=dropout)
        self.detach_target_encoder = detach_target_encoder
        self.bilinear_layer = nn.Bilinear(in1_features=hidden_size,
                                          in2_features=hidden_size,
                                          out_features=1)