def rnn_decoder(decoder_params):
    decoder_embedding_layer = DropoutEmbeddings(
        ntokens=decoder_params.ntokens,
        emb_size=decoder_params.emb_size,
    )
    if decoder_params.attention:
        # the attention decoder needs double the input_size to accommodate the attention concat
        decoder_rnn = RNNLayers(input_size=decoder_params.emb_size * 2,
                                output_size=decoder_params.emb_size,
                                nhid=decoder_params.nhid,
                                bidir=False,
                                nlayers=decoder_params.nlayers,
                                cell_type="gru")
        projection_layer = AttentionProjection(
            output_size=decoder_params.ntokens,
            input_size=decoder_params.emb_size,
            att_nhid=decoder_params.att_hid,
            tie_encoder=None,
            dropout=0.0)
        decoder = AttentionDecoder(decoder_layer=decoder_rnn,
                                   embedding_layer=decoder_embedding_layer,
                                   projection_layer=projection_layer,
                                   pad_token=1,
                                   eos_token=2,
                                   max_tokens=decoder_params.max_tokens)
    else:
        decoder_rnn = RNNLayers(input_size=decoder_params.emb_size,
                                output_size=decoder_params.emb_size,
                                nhid=decoder_params.nhid,
                                bidir=False,
                                nlayers=decoder_params.nlayers,
                                cell_type="gru")
        projection_layer = Projection(output_size=decoder_params.ntokens,
                                      input_size=decoder_params.emb_size,
                                      dropout=0.0,
                                      tie_encoder=None)
        decoder = Decoder(
            decoder_layer=decoder_rnn,
            projection_layer=projection_layer,
            embedding_layer=decoder_embedding_layer,
            pad_token=0,
            eos_token=1,
            max_tokens=decoder_params.max_tokens,
        )
    decoder = to_gpu(decoder)
    decoder.reset(decoder_params.batch_size)
    return decoder, decoder_params
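# A minimal usage sketch for rnn_decoder (not part of the library itself). The
# values below are illustrative assumptions: decoder_params can be any object
# exposing the attributes the function reads (ntokens, emb_size, attention,
# nhid, nlayers, att_hid, max_tokens, batch_size); a SimpleNamespace is used
# here for brevity.
from types import SimpleNamespace

example_decoder_params = SimpleNamespace(
    ntokens=10000,    # decoder vocabulary size
    emb_size=300,     # embedding dimension
    nhid=512,         # hidden dimension of the GRU layers
    nlayers=2,        # number of stacked GRU layers
    attention=True,   # build an AttentionDecoder instead of a plain Decoder
    att_hid=256,      # hidden dimension of the attention module
    max_tokens=50,    # maximum decoding steps
    batch_size=32,    # used to initialise the decoder hidden state
)
decoder, _ = rnn_decoder(example_decoder_params)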
class Seq2SeqAttention(nn.Module):

    def __init__(self, ntoken: HParam, emb_sz: HParam, nhid: HParam, nlayers: HParam,
                 att_nhid: int, pad_token: int, eos_token: int, max_tokens: int = 50,
                 share_embedding_layer: bool = False, tie_decoder: bool = True,
                 bidir: bool = False, **kwargs):
        """
        Args:
            ntoken (Union[List[int],int]): Number of tokens for the encoder and the decoder
            emb_sz (Union[List[int],int]): Embedding size for the encoder and decoder embeddings
            nhid (Union[List[int],int]): Number of hidden dims for the encoder and the decoder
            nlayers (Union[List[int],int]): Number of layers for the encoder and the decoder
            att_nhid (int): Number of hidden dims for the attention module
            pad_token (int): The index of the token used for padding
            eos_token (int): The index of the token used for eos
            max_tokens (int): The maximum number of steps the decoder iterates before stopping
            share_embedding_layer (bool): if True the encoder and the decoder share their embedding layer
            tie_decoder (bool): if True the decoder ties its output projection to its embedding weights
            bidir (bool): if True use a bidirectional encoder
            **kwargs: Extra arguments that will be passed to the encoder and the decoder
        """
        super().__init__()
        # allow for the same or different parameters between encoder and decoder
        ntoken, emb_sz, nhid, nlayers = get_list(ntoken, 2), get_list(emb_sz, 2), \
            get_list(nhid, 2), get_list(nlayers, 2)
        dropoutd = get_kwarg(kwargs, name="dropoutd", default_value=0.5)  # output dropout
        dropoute = get_kwarg(kwargs, name="dropout_e", default_value=0.1)  # encoder embedding dropout
        dropoute = get_list(dropoute, 2)
        dropouti = get_kwarg(kwargs, name="dropout_i", default_value=0.65)  # input dropout
        dropouti = get_list(dropouti, 2)
        dropouth = get_kwarg(kwargs, name="dropout_h", default_value=0.3)  # RNN output layers dropout
        dropouth = get_list(dropouth, 2)
        wdrop = get_kwarg(kwargs, name="wdrop", default_value=0.5)  # RNN weights dropout
        wdrop = get_list(wdrop, 2)
        cell_type = get_kwarg(kwargs, name="cell_type", default_value="lstm")

        self.nlayers = nlayers
        self.nhid = nhid
        self.emb_sz = emb_sz
        self.pr_force = 1.0

        encoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken[0],
                                                    emb_size=emb_sz[0],
                                                    dropoute=dropoute[0],
                                                    dropouti=dropouti[0])
        encoder_rnn = RNNLayers(
            input_size=emb_sz[0],
            output_size=kwargs.get("output_size", emb_sz[0]),
            nhid=nhid[0],
            bidir=bidir,
            dropouth=dropouth[0],
            wdrop=wdrop[0],
            nlayers=nlayers[0],
            cell_type=cell_type,
        )
        self.encoder = Encoder(embedding_layer=encoder_embedding_layer,
                               encoder_layer=encoder_rnn)

        if share_embedding_layer:
            decoder_embedding_layer = encoder_embedding_layer
        else:
            decoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken[-1],
                                                        emb_size=emb_sz[-1],
                                                        dropoute=dropoute[1],
                                                        dropouti=dropouti[1])
        decoder_rnn = RNNLayers(input_size=kwargs.get("input_size", emb_sz[-1] * 2),
                                output_size=kwargs.get("output_size", emb_sz[-1]),
                                nhid=nhid[-1],
                                bidir=False,
                                dropouth=dropouth[1],
                                wdrop=wdrop[1],
                                nlayers=nlayers[-1],
                                cell_type=cell_type)
        projection_layer = AttentionProjection(
            output_size=ntoken[-1],
            input_size=emb_sz[-1],
            dropout=dropoutd,
            att_nhid=att_nhid,
            tie_encoder=decoder_embedding_layer if tie_decoder else None)
        self.decoder = AttentionDecoder(
            decoder_layer=decoder_rnn,
            projection_layer=projection_layer,
            embedding_layer=decoder_embedding_layer,
            pad_token=pad_token,
            eos_token=eos_token,
            max_tokens=max_tokens,
        )

    def forward(self, *inputs, num_beams=0):
        with torch.set_grad_enabled(self.training):
            encoder_inputs, decoder_inputs = inputs
            # reset the states for the new batch
            bs = encoder_inputs.size(1)
            self.encoder.reset(bs)
            self.decoder.reset(bs)
            outputs = self.encoder(encoder_inputs)
            # as initial state we use the initial decoder state (zeros)
            state = self.decoder.hidden
            assert_dims(outputs, [self.nlayers[0], None, bs, (self.nhid[0], self.emb_sz[0])])
            # pass the encoder outputs as keys to the attention projection layer
            self.decoder.projection_layer.reset(keys=outputs[-1])
            if self.training:
                self.decoder.pr_force = self.pr_force
                nb = 1 if self.pr_force < 1 else 0
            else:
                nb = num_beams
            outputs_dec = self.decoder(decoder_inputs, hidden=state, num_beams=nb)
            predictions = outputs_dec[:decoder_inputs.size(0)] if num_beams == 0 \
                else self.decoder.beam_outputs
        return predictions, [*outputs, *outputs_dec]
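# A minimal usage sketch for Seq2SeqAttention (illustrative values, not a
# definitive configuration; the helper layers above must be importable for
# this to run). Inputs follow the shapes the forward pass expects: token
# tensors of shape [seq_len, batch_size].
import torch

seq2seq = Seq2SeqAttention(ntoken=[10000, 10000],  # encoder / decoder vocab sizes
                           emb_sz=300,
                           nhid=512,
                           nlayers=2,
                           att_nhid=256,
                           pad_token=1,
                           eos_token=2,
                           max_tokens=50)
src_tokens = torch.randint(0, 10000, (20, 32))  # [sl, bs] source token indices
trg_tokens = torch.randint(0, 10000, (15, 32))  # [sl, bs] target tokens (teacher forcing)
predictions, raw_outputs = seq2seq(src_tokens, trg_tokens)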
class HREDAttention(nn.Module):
    """Basic HRED model

    paper: A Hierarchical Latent Variable Encoder-Decoder Model for Generating Dialogues.
           Iulian Vlad Serban et al. 2016a.
    github: https://github.com/julianser/hed-dlg-truncated
    arxiv: http://arxiv.org/abs/1605.06069
    """

    BPTT_MAX_UTTERANCES = 1000

    def __init__(self, ntoken: int, emb_sz: HParam, nhid: HParam, nlayers: HParam,
                 att_nhid: int, pad_token: int, eos_token: int, max_tokens: int = 50,
                 share_embedding_layer: bool = False, tie_decoder: bool = True,
                 bidir: bool = False, **kwargs):
        """
        Args:
            ntoken (int): Number of tokens for the encoder and the decoder
            emb_sz (Union[List[int],int]): Embedding size for the encoder and decoder embeddings
            nhid (Union[List[int],int]): Number of hidden dims for the encoder (first two values) and the decoder
            nlayers (Union[List[int],int]): Number of layers for the encoder and the decoder
            att_nhid (int): Number of hidden dims for the attention module
            pad_token (int): The index of the token used for padding
            eos_token (int): The index of the token used for eos
            max_tokens (int): The maximum number of steps the decoder iterates before stopping
            share_embedding_layer (bool): if True the encoder and the decoder share their embedding layer
            tie_decoder (bool): if True the decoder ties its output projection to its embedding weights
            bidir (bool): if True use a bidirectional encoder
            **kwargs: Extra arguments that will be passed to the encoder and the decoder
        """
        super().__init__()
        # allow for the same or different parameters between encoder and decoder
        ntoken, emb_sz, nhid, nlayers = get_list(ntoken), get_list(emb_sz, 2), \
            get_list(nhid, 3), get_list(nlayers, 3)
        dropoutd = get_kwarg(kwargs, name="dropoutd", default_value=0.5)  # output dropout
        dropoute = get_kwarg(kwargs, name="dropout_e", default_value=0.1)  # encoder embedding dropout
        dropoute = get_list(dropoute, 2)
        dropouti = get_kwarg(kwargs, name="dropout_i", default_value=0.65)  # input dropout
        dropouti = get_list(dropouti, 2)
        dropouth = get_kwarg(kwargs, name="dropout_h", default_value=0.3)  # RNN output layers dropout
        dropouth = get_list(dropouth, 3)
        wdrop = get_kwarg(kwargs, name="wdrop", default_value=0.5)  # RNN weights dropout
        wdrop = get_list(wdrop, 3)

        self.cell_type = "gru"
        self.nt = ntoken[-1]
        self.pr_force = 1.0
        self.nlayers = nlayers

        encoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken[0],
                                                    emb_size=emb_sz[0],
                                                    dropoute=dropoute[0],
                                                    dropouti=dropouti[0])
        encoder_rnn = RNNLayers(
            input_size=emb_sz[0],
            output_size=kwargs.get("output_size_encoder", emb_sz[0]),
            nhid=nhid[0],
            bidir=bidir,
            dropouth=dropouth[0],
            wdrop=wdrop[0],
            nlayers=nlayers[0],
            cell_type=self.cell_type,
        )
        self.query_encoder = Encoder(embedding_layer=encoder_embedding_layer,
                                     encoder_layer=encoder_rnn)
        self.session_encoder = RNNLayers(
            input_size=encoder_rnn.output_size,
            nhid=nhid[1],
            output_size=kwargs.get("output_size", emb_sz[0]),
            nlayers=1,
            bidir=False,
            cell_type=self.cell_type,
            wdrop=wdrop[1],
            dropouth=dropouth[1],
        )

        if share_embedding_layer:
            decoder_embedding_layer = encoder_embedding_layer
        else:
            decoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken[-1],
                                                        emb_size=emb_sz[-1],
                                                        dropoute=dropoute[1],
                                                        dropouti=dropouti[1])
        decoder_rnn = RNNLayers(input_size=kwargs.get("input_size", emb_sz[-1] * 2),
                                output_size=kwargs.get("output_size", emb_sz[-1]),
                                nhid=nhid[-1],
                                bidir=False,
                                dropouth=dropouth[2],
                                wdrop=wdrop[2],
                                nlayers=nlayers[-1],
                                cell_type=self.cell_type)
        projection_layer = AttentionProjection(
            output_size=ntoken[-1],
            input_size=emb_sz[-1],
            dropout=dropoutd,
            att_nhid=att_nhid,
            att_type="SDP",
            tie_encoder=decoder_embedding_layer if tie_decoder else None)
        self.decoder = AttentionDecoder(
            decoder_layer=decoder_rnn,
            projection_layer=projection_layer,
            embedding_layer=decoder_embedding_layer,
            pad_token=pad_token,
            eos_token=eos_token,
            max_tokens=max_tokens,
        )

    def forward(self, *inputs, num_beams=0):
        # encoder_inputs: [cl, sl, bs], decoder_inputs: [sl, bs]
        encoder_inputs, decoder_inputs = assert_dims(inputs, [2, None, None])
        # reset the states for the new batch
        bs = encoder_inputs.size(2)
        self.session_encoder.reset(bs)
        self.decoder.reset(bs)
        query_encoder_outputs = []
        outputs = []
        num_utterances, max_sl, *_ = encoder_inputs.size()
        for index, context in enumerate(encoder_inputs):
            self.query_encoder.reset(bs)
            outputs = self.query_encoder(context)  # context has size [sl, bs]
            # BPTT: if the dialogue is too long, repackage the first half of the outputs
            # to truncate gradient backpropagation and fit the batch into memory
            # (to test before adding back)
            out = repackage_var(outputs[-1][-1]) \
                if max_sl * num_utterances > self.BPTT_MAX_UTTERANCES and index <= num_utterances // 2 \
                else outputs[-1][-1]
            query_encoder_outputs.append(out)  # get the last sl output of the query_encoder
        query_encoder_outputs = torch.stack(query_encoder_outputs, dim=0)  # [cl, bs, nhid]
        session_outputs = self.session_encoder(query_encoder_outputs)
        self.decoder.projection_layer.reset(keys=session_outputs[-1])
        if self.training:
            self.decoder.pr_force = self.pr_force
            nb = 1 if self.pr_force < 1 else 0
        else:
            nb = num_beams
        state = self.decoder.hidden
        outputs_dec = self.decoder(decoder_inputs, hidden=state, num_beams=nb)
        predictions = outputs_dec[-1][:decoder_inputs.size(0)] if num_beams == 0 \
            else self.decoder.beam_outputs
        return predictions, [*outputs, *outputs_dec]
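# A minimal usage sketch for HREDAttention (illustrative values, not a
# definitive configuration; the helper layers above must be importable for
# this to run). The encoder input is a batch of dialogue contexts shaped
# [num_utterances, seq_len, batch_size], and the decoder input is the target
# response shaped [seq_len, batch_size], matching the shapes read in forward.
import torch

hred = HREDAttention(ntoken=10000,
                     emb_sz=300,
                     nhid=512,       # expanded by get_list to query encoder / session encoder / decoder
                     nlayers=2,
                     att_nhid=256,
                     pad_token=1,
                     eos_token=2,
                     max_tokens=50)
context_tokens = torch.randint(0, 10000, (3, 20, 32))  # [cl, sl, bs] dialogue context
response_tokens = torch.randint(0, 10000, (15, 32))    # [sl, bs] target response
predictions, raw_outputs = hred(context_tokens, response_tokens)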