Example #1
    def __init__(self, src_n_tokens, trg_n_tokens, **kwargs):
        super(Seq2SeqRNN, self).__init__(src_n_tokens, trg_n_tokens)

        self.bridge_non_linearity = kwargs["bridge_non_linearity"]
        self.ninp = kwargs["encoder"]["emb_size"]  # for noam optimizer

        self.embed_src = Embed(self.src_n_tokens,
                               kwargs["encoder"]["emb_size"],
                               emb_dropout=kwargs["encoder"]["emb_dropout"],
                               trainable=kwargs["encoder"]["emb_trainable"],
                               layer_norm=kwargs["encoder"]["emb_layer_norm"],
                               padding_idx=kwargs.get("enc_padding_idx", 0))

        self.embed_tgt = Embed(self.trg_n_tokens,
                               kwargs["decoder"]["emb_size"],
                               emb_dropout=kwargs["decoder"]["emb_dropout"],
                               trainable=kwargs["decoder"]["emb_trainable"],
                               layer_norm=kwargs["decoder"]["emb_layer_norm"],
                               padding_idx=kwargs.get("dec_padding_idx", 0))

        self.encoder = RNNEncoder(self.embed_src, **kwargs["encoder"])

        self.decoder = AttSeqDecoder(self.trg_n_tokens, self.embed_tgt,
                                     self.encoder.hidden_size,
                                     **kwargs["decoder"])

        self.bridge = RNNBridge(enc_dim=self.encoder.hidden_size,
                                dec_dim=self.decoder.rnn.hidden_size,
                                dec_layers=self.decoder.rnn.num_layers,
                                dec_type=self.decoder.rnn.mode)
Example #2
File: modules.py Project: amagooda/siatl
    def __init__(self, ntokens, **kwargs):
        super(LangModel, self).__init__()

        ############################################
        # Params
        ############################################
        self.ntokens = ntokens
        self.emb_size = kwargs.get("emb_size", 100)
        self.embed_noise = kwargs.get("embed_noise", .0)
        self.embed_dropout = kwargs.get("embed_dropout", .0)
        self.rnn_size = kwargs.get("rnn_size", 100)
        self.rnn_layers = kwargs.get("rnn_layers", 1)
        self.rnn_dropout = kwargs.get("rnn_dropout", .0)
        self.decode = kwargs.get("decode", False)
        self.tie_weights = kwargs.get("tie_weights", False)
        self.pack = kwargs.get("pack", True)

        ############################################
        # Layers
        ############################################
        self.embed = Embed(ntokens, self.emb_size,
                           noise=self.embed_noise,
                           dropout=self.embed_dropout)

        self.encoder = RNNModule(input_size=self.emb_size,
                                 rnn_size=self.rnn_size,
                                 num_layers=self.rnn_layers,
                                 bidirectional=False,
                                 pack=self.pack)

        self.decoder = nn.Linear(self.rnn_size, ntokens)
        if self.tie_weights:
            self.decoder.weight = self.embed.embedding.weight
            if self.rnn_size != self.emb_size:
                self.down = nn.Linear(self.rnn_size, self.emb_size)
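A standalone plain-PyTorch sketch of the weight-tying arrangement above (it does not use the repo's Embed/RNNModule classes): the output projection shares its weight matrix with the embedding, and an extra Linear layer projects the RNN output down to the embedding size when the two dimensions differ.

import torch
import torch.nn as nn

ntokens, emb_size, rnn_size = 1000, 100, 300
embedding = nn.Embedding(ntokens, emb_size)
decoder = nn.Linear(emb_size, ntokens)
decoder.weight = embedding.weight        # tied: a single shared parameter tensor
down = nn.Linear(rnn_size, emb_size)     # needed only when rnn_size != emb_size

rnn_out = torch.randn(4, 7, rnn_size)    # (batch, seq_len, rnn_size)
logits = decoder(down(rnn_out))          # (batch, seq_len, ntokens)
print(logits.shape)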
Example #3
    def __init__(self, ntokens, nclasses, feat_size, **kwargs):
        super(BaselineConcClassifier, self).__init__()

        ############################################
        # Params
        ############################################
        self.ntokens = ntokens
        self.feat_size = feat_size
        self.embed_finetune = kwargs.get("embed_finetune", False)
        self.emb_size = kwargs.get("emb_size", 100)
        self.embed_noise = kwargs.get("embed_noise", .0)
        self.embed_dropout = kwargs.get("embed_dropout", .0)
        self.rnn_size = kwargs.get("rnn_size", 100)
        self.rnn_layers = kwargs.get("rnn_layers", 1)
        self.rnn_dropout = kwargs.get("rnn_dropout", .0)
        self.pack = kwargs.get("pack", True)
        self.no_rnn = kwargs.get("no_rnn", False)
        self.conc_emb = kwargs.get("conc_emb", False)
        self.conc_rnn = kwargs.get("conc_rnn", False)
        self.conc_out = kwargs.get("conc_out", False)
        self.bidir = kwargs.get("bidir", False)

        ############################################
        # Layers
        ############################################

        self.word_embedding = Embed(num_embeddings=ntokens,
                                    embedding_dim=self.emb_size,
                                    noise=self.embed_noise,
                                    dropout=self.embed_dropout)

        if self.conc_emb:
            rnn_input_size = self.emb_size + self.feat_size
        else:
            rnn_input_size = self.emb_size

        self.rnn = RNNModule(input_size=rnn_input_size,
                             rnn_size=self.rnn_size,
                             num_layers=self.rnn_layers,
                             bidirectional=self.bidir,
                             dropout=self.rnn_dropout,
                             pack=self.pack)

        if self.no_rnn:
            self.attention_size = rnn_input_size
        else:
            if self.conc_rnn:
                self.attention_size = self.rnn.feature_size + self.feat_size
            else:
                self.attention_size = self.rnn.feature_size

        self.attention = SelfAttention(self.attention_size, baseline=True)

        output_input_size = self.attention_size

        self.classes = nn.Linear(output_input_size, nclasses)
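The conc_emb flag controls whether external per-token features are concatenated to the word embeddings before the RNN, which is why rnn_input_size grows by feat_size. A plain-PyTorch sketch of that path with illustrative shapes:

import torch
import torch.nn as nn

batch, seq_len, emb_size, feat_size, rnn_size = 4, 12, 100, 10, 100
embs = torch.randn(batch, seq_len, emb_size)       # word embeddings
feats = torch.randn(batch, seq_len, feat_size)     # per-token features

rnn = nn.LSTM(emb_size + feat_size, rnn_size,
              batch_first=True, bidirectional=True)
outputs, _ = rnn(torch.cat([embs, feats], dim=-1))
print(outputs.shape)  # (batch, seq_len, 2 * rnn_size) for a bidirectional RNN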
Example #4
    def __init__(self,
                 src_n_tokens,
                 trg_n_tokens,
                 emb_size=512,
                 nhead=8,
                 nhid=2048,
                 nlayers=6,
                 dropout=0.1,
                 tie_projections=True,
                 **kwargs):
        super(Seq2SeqTransformer, self).__init__(src_n_tokens, trg_n_tokens,
                                                 **kwargs)

        self.ninp = emb_size

        self.tgt_mask = None
        self.tie_projections = tie_projections

        self.embed_src = Embed(self.src_n_tokens,
                               emb_size,
                               scale=True,
                               padding_idx=kwargs.get("enc_padding_idx", 0))
        self.embed_tgt = Embed(self.trg_n_tokens,
                               emb_size,
                               scale=True,
                               padding_idx=kwargs.get("dec_padding_idx", 0))

        self.encoder = TransformerEncoder(hidden_size=emb_size,
                                          ff_size=nhid,
                                          num_layers=nlayers,
                                          num_heads=nhead,
                                          dropout=dropout,
                                          emb_dropout=dropout)

        self.decoder = TransformerDecoder(num_layers=nlayers,
                                          num_heads=nhead,
                                          hidden_size=emb_size,
                                          ff_size=nhid,
                                          dropout=dropout,
                                          emb_dropout=dropout,
                                          vocab_size=trg_n_tokens)

        self.tie_weights()
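The scale=True flag presumably applies the usual Transformer convention of multiplying token embeddings by sqrt(emb_size) before positional information is added; a standalone sketch of that convention (not the repo's Embed class):

import math
import torch
import torch.nn as nn

emb_size, vocab = 512, 32000
embed = nn.Embedding(vocab, emb_size)
tokens = torch.randint(0, vocab, (2, 10))
x = embed(tokens) * math.sqrt(emb_size)  # scaled token embeddings
print(x.shape)  # (2, 10, 512)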
Example #5
    def __init__(self, ntokens, **kwargs):
        super(SeqReader, self).__init__()

        ############################################
        # Attributes
        ############################################
        self.ntokens = ntokens
        self.emb_size = kwargs.get("emb_size", 100)
        self.embed_noise = kwargs.get("embed_noise", .0)
        self.embed_dropout = kwargs.get("embed_dropout", .0)
        self.rnn_size = kwargs.get("rnn_size", 100)
        self.rnn_layers = kwargs.get("rnn_layers", 1)
        self.rnn_dropout = kwargs.get("rnn_dropout", .0)
        self.rnn_bidirectional = kwargs.get("rnn_bidirectional", False)
        self.decode = kwargs.get("decode", False)
        self.tie_weights = kwargs.get("tie_weights", False)
        self.pack = kwargs.get("pack", True)
        self.countdown = kwargs.get("countdown", False)

        ############################################
        # Layers
        ############################################
        self.embed = Embed(ntokens,
                           self.emb_size,
                           noise=self.embed_noise,
                           dropout=self.embed_dropout)

        self.encoder = RNNModule(input_size=self.emb_size,
                                 rnn_size=self.rnn_size,
                                 num_layers=self.rnn_layers,
                                 bidirectional=self.rnn_bidirectional,
                                 dropout=self.rnn_dropout,
                                 pack=self.pack,
                                 countdown=self.countdown)

        if self.rnn_bidirectional:
            self.rnn_size *= 2

        if self.decode:
            if self.rnn_bidirectional:
                raise ValueError("Can't decode with bidirectional RNNs!")

            if self.tie_weights and self.rnn_size != self.emb_size:
                rnn_out = self.emb_size
                self.down = nn.Linear(self.rnn_size, rnn_out)
            else:
                rnn_out = self.rnn_size

            self.out = nn.Linear(rnn_out, ntokens)

            if self.tie_weights:
                # if self.rnn_size != self.emb_size:
                #     raise ValueError("if `tie_weights` is True,"
                #                      "emb_size has to be equal to rnn_size")
                self.out.weight = self.embed.embedding.weight
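The rnn_size *= 2 line accounts for the fact that a bidirectional RNN concatenates its forward and backward states, doubling the output feature dimension; a minimal demonstration in plain PyTorch:

import torch
import torch.nn as nn

rnn = nn.LSTM(input_size=100, hidden_size=100,
              batch_first=True, bidirectional=True)
x = torch.randn(4, 15, 100)
out, _ = rnn(x)
print(out.shape)  # torch.Size([4, 15, 200])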
Example #6
File: modules.py Project: amagooda/siatl
    def __init__(self, ntokens, nclasses, attention=False, **kwargs):
        super(NaiveClassifier, self).__init__()

        ############################################
        # Params
        ############################################
        self.ntokens = ntokens
        self.emb_size = kwargs.get("emb_size", 100)
        self.embed_noise = kwargs.get("embed_noise", .0)
        self.embed_dropout = kwargs.get("embed_dropout", .0)
        self.bottom_rnn_size = kwargs.get("bottom_rnn_size", 100)
        self.attention_dropout = kwargs.get("attention_dropout", .0)
        self.bottom_rnn_layers = kwargs.get("bottom_rnn_layers", 1)
        self.bottom_rnn_dropout = kwargs.get("bottom_rnn_dropout", .0)
        self.tie_weights = kwargs.get("tie_weights", False)
        self.pack = kwargs.get("pack", True)
        self.att = attention
        self.attention_layers = kwargs.get("attention_layers", 1)

        ############################################
        # Layers
        ############################################
        self.embed = Embed(ntokens, self.emb_size,
                           noise=self.embed_noise,
                           dropout=self.embed_dropout)
        if self.att:
            last = False
        else:
            last = True

        self.bottom_rnn = RNNModule(input_size=self.emb_size,
                                    rnn_size=self.bottom_rnn_size,
                                    num_layers=self.bottom_rnn_layers,
                                    bidirectional=False,
                                    dropout=self.bottom_rnn_dropout,
                                    pack=self.pack,
                                    last=last)
        if self.att:
            self.attention = SelfAttention(attention_size=self.bottom_rnn_size,
                                           dropout=self.attention_dropout)

        self.classes = nn.Linear(self.bottom_rnn_size, nclasses)
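When attention is enabled the RNN presumably returns the full sequence of states (last=False) so a pooling attention can weight them. A generic self-attention pooling sketch (not the repo's SelfAttention class): score each timestep, softmax over the sequence, and take the weighted sum.

import torch
import torch.nn as nn

batch, seq_len, hidden = 4, 12, 100
states = torch.randn(batch, seq_len, hidden)          # RNN outputs
scorer = nn.Linear(hidden, 1)

scores = scorer(states).squeeze(-1)                   # (batch, seq_len)
weights = torch.softmax(scores, dim=-1)
pooled = (weights.unsqueeze(-1) * states).sum(dim=1)  # (batch, hidden)
print(pooled.shape)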
Example #7
    def __init__(self,
                 ntokens,
                 tie_projections=False,
                 out_layer_norm=False,
                 **kwargs):
        super(RNNLM, self).__init__()

        self.tie_projections = tie_projections
        self.out_layer_norm = out_layer_norm

        self.embed = Embed(ntokens,
                           kwargs["emb_size"],
                           emb_dropout=kwargs["emb_dropout"],
                           trainable=kwargs["emb_trainable"],
                           layer_norm=kwargs["emb_layer_norm"],
                           max_norm=kwargs["emb_max_norm"])

        self.encoder = RNNEncoder(self.embed, **kwargs)
        assert not self.encoder.rnn.bidirectional

        enc_dim = self.encoder.hidden_size
        emb_dim = self.embed.embedding_dim

        self.project_down = (self.tie_projections and enc_dim != emb_dim)

        if self.project_down:
            self.W_h = nn.Linear(enc_dim, emb_dim)
            out_dim = emb_dim
        else:
            out_dim = enc_dim

        if self.out_layer_norm:
            self.LN_h = nn.LayerNorm(out_dim, eps=1e-6)

        self.logits = nn.Linear(out_dim, ntokens)

        self.init_weights()
        self.tie_weights()
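A plain-PyTorch sketch of the project-down / layer-norm / tied-logits pipeline above; dimensions are illustrative, and a random tensor stands in for the repo's RNNEncoder states.

import torch
import torch.nn as nn

ntokens, emb_dim, enc_dim = 1000, 128, 512
embed = nn.Embedding(ntokens, emb_dim)
W_h = nn.Linear(enc_dim, emb_dim)      # project_down: enc_dim != emb_dim
LN_h = nn.LayerNorm(emb_dim, eps=1e-6)
logits = nn.Linear(emb_dim, ntokens)
logits.weight = embed.weight           # tie the output projection to the embeddings

h = torch.randn(4, 20, enc_dim)        # stand-in for encoder states
scores = logits(LN_h(W_h(h)))          # (4, 20, ntokens)
print(scores.shape)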
Example #8
    def __init__(self,
                 ntoken,
                 emb_size=512,
                 nhead=8,
                 nhid=2048,
                 nlayers=6,
                 dropout=0.1,
                 tie_projections=True,
                 **kwargs):
        super(TransformerLM, self).__init__()

        self.tie_projections = tie_projections
        self.ninp = emb_size

        self.embed = Embed(ntoken, emb_size, scale=True)
        self.encoder = TransformerEncoder(hidden_size=emb_size,
                                          ff_size=nhid,
                                          num_layers=nlayers,
                                          num_heads=nhead,
                                          dropout=dropout,
                                          emb_dropout=dropout)
        self.logits = nn.Linear(emb_size, ntoken)

        self.tie_weights()
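A Transformer language model additionally needs a causal (subsequent-position) mask so that position i cannot attend to later positions. The snippet does not show how TransformerEncoder builds it, so the construction below is only the standard recipe, not taken from the repo:

import torch

def causal_mask(size: int) -> torch.Tensor:
    # True marks the positions that must be blocked (the strict upper triangle).
    return torch.triu(torch.ones(size, size), diagonal=1).bool()

print(causal_mask(4))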
Example #9
    def __init__(self, ntokens, nclasses, feature_size, **kwargs):
        super(AffectiveAttention, self).__init__()

        ############################################
        # Params
        ############################################
        self.ntokens = ntokens
        self.feat_size = feature_size
        self.attention_type = kwargs["attention_type"]
        self.embed_finetune = kwargs.get("embed_finetune", False)
        self.emb_size = kwargs.get("emb_size", 100)
        self.embed_noise = kwargs.get("embed_noise", .0)
        self.embed_dropout = kwargs.get("embed_dropout", .0)
        self.rnn_size = kwargs.get("rnn_size", 100)
        self.rnn_layers = kwargs.get("rnn_layers", 1)
        self.rnn_dropout = kwargs.get("rnn_dropout", .0)
        self.att_dropout = kwargs.get("attention_dropout", .0)
        self.pack = kwargs.get("pack", True)
        self.no_rnn = kwargs.get("no_rnn", False)
        self.conc_emb = kwargs.get("conc_emb", False)
        self.conc_rnn = kwargs.get("conc_rnn", False)
        self.conc_out = kwargs.get("conc_out", False)
        self.bidir = kwargs.get("bidir", False)

        ############################################
        # Layers
        ############################################
        self.word_embedding = Embed(ntokens,
                                    self.emb_size,
                                    noise=self.embed_noise,
                                    dropout=self.embed_dropout)

        # todo: use features Embedding layer :)
        # self.feat_embedding = nn.Embedding(num_embeddings=num_features,
        #                              embedding_dim=feature_size)

        if self.conc_emb:
            rnn_input_size = self.emb_size + self.feat_size
        else:
            rnn_input_size = self.emb_size

        self.rnn = RNNModule(input_size=rnn_input_size,
                             rnn_size=self.rnn_size,
                             num_layers=self.rnn_layers,
                             bidirectional=False,
                             dropout=self.rnn_dropout,
                             pack=self.pack)

        if self.no_rnn:
            self.attention_size = rnn_input_size
        else:
            if self.conc_rnn:
                self.attention_size = self.rnn.feature_size + self.feat_size
            else:
                self.attention_size = self.rnn.feature_size

        if self.attention_type == "affine":
            self.scale = nn.Linear(feature_size, self.attention_size)
            self.shift = nn.Linear(feature_size, self.attention_size)
            self.attention = SelfAttention(attention_size=self.attention_size,
                                           dropout=self.att_dropout)

        elif self.attention_type == "non_linear_affine":
            self.scale = nn.Linear(feature_size, self.attention_size)
            self.shift = nn.Linear(feature_size, self.attention_size)
            self.tanh = nn.Tanh()
            self.attention = SelfAttention(attention_size=self.attention_size,
                                           dropout=self.att_dropout)

        elif self.attention_type == "concat":
            self.attention = SelfAttention(attention_size=self.attention_size +
                                           feature_size,
                                           dropout=self.att_dropout)

        elif self.attention_type == "gate":
            self.gate = nn.Linear(feature_size, self.attention_size)
            self.sigmoid = nn.Sigmoid()
            self.attention = SelfAttention(attention_size=self.attention_size,
                                           dropout=self.att_dropout)

        else:
            raise ValueError("Unknown attention_type")

        self.classes = nn.Linear(self.attention_size, nclasses)
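The forward pass is not shown, so the sketch below is only one plausible reading of the "affine" branch: the external feature vector produces a per-example scale and shift (a FiLM-style modulation) that condition the RNN states before self-attention.

import torch
import torch.nn as nn

batch, seq_len, hidden, feat_size = 4, 12, 100, 10
states = torch.randn(batch, seq_len, hidden)   # RNN outputs
feats = torch.randn(batch, feat_size)          # external affective features

scale = nn.Linear(feat_size, hidden)
shift = nn.Linear(feat_size, hidden)

gamma = scale(feats).unsqueeze(1)              # (batch, 1, hidden)
beta = shift(feats).unsqueeze(1)
modulated = states * gamma + beta              # conditioned states fed to attention
print(modulated.shape)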
Example #10
    def __init__(self, trg_ntokens, enc_size, **kwargs):
        super(AttSeqDecoder, self).__init__()

        ############################################
        # Attributes
        ############################################
        self.trg_ntokens = trg_ntokens
        emb_size = kwargs.get("emb_size", 100)
        embed_noise = kwargs.get("embed_noise", .0)
        embed_dropout = kwargs.get("embed_dropout", .0)
        rnn_size = kwargs.get("rnn_size", 100)
        rnn_layers = kwargs.get("rnn_layers", 1)
        rnn_dropout = kwargs.get("rnn_dropout", .0)
        tie_weights = kwargs.get("tie_weights", False)
        attention_fn = kwargs.get("attention_fn", "general")
        self.input_feeding = kwargs.get("input_feeding", False)
        self.learn_tau = kwargs.get("learn_tau", False)
        self.length_control = kwargs.get("length_control", False)
        self.gumbel = kwargs.get("gumbel", False)
        self.out_non_linearity = kwargs.get("out_non_linearity", None)
        self.layer_norm = kwargs.get("layer_norm", None)
        self.input_feeding_learnt = kwargs.get("input_feeding_learnt", False)

        ############################################
        # Layers
        ############################################
        self.embed = Embed(trg_ntokens,
                           emb_size,
                           noise=embed_noise,
                           dropout=embed_dropout)

        # the size of the attentional output vector: ho = [h || c]
        if tie_weights:
            self.ho_size = emb_size
        else:
            self.ho_size = rnn_size

        dec_input_size = emb_size
        if self.input_feeding:
            dec_input_size += self.ho_size
        if self.length_control:
            dec_input_size += 2

            # length scaling parameter
            self.W_tick = nn.Parameter(torch.rand(1))

        self.rnn = nn.LSTM(input_size=dec_input_size,
                           hidden_size=rnn_size,
                           num_layers=rnn_layers,
                           batch_first=True)

        self.rnn_dropout = nn.Dropout(rnn_dropout)

        self.attention = Attention(enc_size, rnn_size, method=attention_fn)

        # learnt temperature parameter
        if self.learn_tau:
            self.softplus = nn.Sequential(
                nn.Linear(self.ho_size, 1, bias=False), nn.Softplus())
            self.tau_0 = kwargs.get("tau_0", 1)

        # initial input feeding
        if self.input_feeding_learnt:
            self.Wi = nn.Linear(enc_size, self.ho_size)

        # source context-aware output projection
        self.Wc = nn.Linear(rnn_size + enc_size, self.ho_size)

        # projection layer to the vocabulary
        self.Wo = nn.Linear(self.ho_size, trg_ntokens)

        if self.layer_norm:
            self.norm_ctx = nn.LayerNorm(self.ho_size)

            if self.input_feeding_learnt:
                self.norm_input_feed = nn.LayerNorm(self.ho_size)

        if tie_weights:
            # if rnn_size != emb_size:
            #     raise ValueError("if `tie_weights` is True,"
            #                      "emb_size has to be equal to rnn_size")
            self.Wo.weight = self.embed.embedding.weight
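Input feeding makes the decoder input at each step the concatenation of the current token embedding and the previous attentional vector ho, which is why dec_input_size grows by ho_size when input_feeding is enabled. A minimal shape check:

import torch

batch, emb_size, ho_size = 4, 100, 100
e_i = torch.randn(batch, 1, emb_size)   # current token embedding
ho = torch.zeros(batch, 1, ho_size)     # previous attentional vector (zeros at step 0)
decoder_input = torch.cat([e_i, ho], dim=-1)
print(decoder_input.shape)  # torch.Size([4, 1, 200])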
Example #11
class AttSeqDecoder(nn.Module):
    def __init__(self, trg_ntokens, enc_size, **kwargs):
        super(AttSeqDecoder, self).__init__()

        ############################################
        # Attributes
        ############################################
        self.trg_ntokens = trg_ntokens
        emb_size = kwargs.get("emb_size", 100)
        embed_noise = kwargs.get("embed_noise", .0)
        embed_dropout = kwargs.get("embed_dropout", .0)
        rnn_size = kwargs.get("rnn_size", 100)
        rnn_layers = kwargs.get("rnn_layers", 1)
        rnn_dropout = kwargs.get("rnn_dropout", .0)
        tie_weights = kwargs.get("tie_weights", False)
        attention_fn = kwargs.get("attention_fn", "general")
        self.input_feeding = kwargs.get("input_feeding", False)
        self.learn_tau = kwargs.get("learn_tau", False)
        self.length_control = kwargs.get("length_control", False)
        self.gumbel = kwargs.get("gumbel", False)
        self.out_non_linearity = kwargs.get("out_non_linearity", None)
        self.layer_norm = kwargs.get("layer_norm", None)
        self.input_feeding_learnt = kwargs.get("input_feeding_learnt", False)

        ############################################
        # Layers
        ############################################
        self.embed = Embed(trg_ntokens,
                           emb_size,
                           noise=embed_noise,
                           dropout=embed_dropout)

        # the size of the attentional output vector: ho = [h || c]
        if tie_weights:
            self.ho_size = emb_size
        else:
            self.ho_size = rnn_size

        dec_input_size = emb_size
        if self.input_feeding:
            dec_input_size += self.ho_size
        if self.length_control:
            dec_input_size += 2

            # length scaling parameter
            self.W_tick = nn.Parameter(torch.rand(1))

        self.rnn = nn.LSTM(input_size=dec_input_size,
                           hidden_size=rnn_size,
                           num_layers=rnn_layers,
                           batch_first=True)

        self.rnn_dropout = nn.Dropout(rnn_dropout)

        self.attention = Attention(enc_size, rnn_size, method=attention_fn)

        # learnt temperature parameter
        if self.learn_tau:
            self.softplus = nn.Sequential(
                nn.Linear(self.ho_size, 1, bias=False), nn.Softplus())
            self.tau_0 = kwargs.get("tau_0", 1)

        # initial input feeding
        if self.input_feeding_learnt:
            self.Wi = nn.Linear(enc_size, self.ho_size)

        # source context-aware output projection
        self.Wc = nn.Linear(rnn_size + enc_size, self.ho_size)

        # projection layer to the vocabulary
        self.Wo = nn.Linear(self.ho_size, trg_ntokens)

        if self.layer_norm:
            self.norm_ctx = nn.LayerNorm(self.ho_size)

            if self.input_feeding_learnt:
                self.norm_input_feed = nn.LayerNorm(self.ho_size)

        if tie_weights:
            # if rnn_size != emb_size:
            #     raise ValueError("if `tie_weights` is True,"
            #                      "emb_size has to be equal to rnn_size")
            self.Wo.weight = self.embed.embedding.weight

    @staticmethod
    def _top_hidden(hidden):
        """
        Get the hidden state from the top RNN layer.
        Used as a query for attention mechanisms.
        Args:
            hidden: the RNN hidden state; a (h, c) tuple for LSTMs.

        Returns: the hidden state of the top layer.

        """
        if isinstance(hidden, tuple):
            return hidden[0][-1]
        else:
            return hidden[-1]

    @staticmethod
    def _coin_flip(prob):
        """
        Return the outcome of a biased coin flip.
        Args:
            prob: the probability of True.

        Returns: bool

        """
        return prob > 0 and torch.rand(1).item() < prob

    def get_embedding(self, step, trg, logits, sampling_prob, argmax, hard,
                      tau):
        """
        Get the token embedding for the current timestep. Possible options:
        - select the embedding by a given index
        - sample a token from a probability distribution and embed
        - construct a "fuzzy" embedding, by taking a convex combination of all
        the token embeddings, parameterized by a probability distribution

        Note: In the first step (step==0) select the embedding
        of the actual target word (usually the <sos> token).

        Args:
            step: the i-th timestep
            trg: the true token at the given step
            logits: the unnormalized probability distribution over the tokens
                from the previous timestep.
            sampling_prob: how often to sample a word instead of using
                the gold one (free-running vs. teacher forcing).
            argmax: take the argmax of the distribution
            hard: (Straight-Through Estimator) discretize the probability
                distribution and compute a convex combination
            tau: the softmax temperature of the relaxed distribution

        Returns: the word embedding and its index.

        """
        # if `sample` is True, feed the prediction back to the model
        # instead of the true target word
        sample = sampling_prob == 1 or self._coin_flip(sampling_prob)

        if step > 0 and sample:

            if argmax:  # get the argmax
                maxv, maxi = logits[-1].max(dim=2)
                e_i = self.embed(maxi)
                return e_i, None

            else:  # get the expected embedding, parameterized by the posterior
                if self.gumbel and self.training:
                    dist = gumbel_softmax(logits[-1].squeeze(), tau, hard)
                else:
                    dist = straight_softmax(logits[-1].squeeze(), tau, hard)

                e_i = self.embed.expectation(dist.unsqueeze(1))
                return e_i, dist
        else:

            w_i = trg[:, step].unsqueeze(1)
            e_i = self.embed(w_i)
            return e_i, None

    def _init_input_feed(self, enc_states, lengths):

        batch = enc_states.size(0)
        if self.input_feeding_learnt:

            mean = enc_states.sum(1) / lengths.unsqueeze(1).float()
            ho = self.Wi(mean).squeeze().unsqueeze(1)

            if self.layer_norm:
                ho = self.norm_input_feed(ho)

            if self.out_non_linearity == "relu":
                ho = torch.relu(ho)
            elif self.out_non_linearity == "tanh":
                ho = torch.tanh(ho)
        else:
            ho = torch.zeros((batch, 1, self.ho_size),
                             device=enc_states.device,
                             dtype=enc_states.dtype)

        return ho

    def step(self, embs, enc_outputs, state, enc_lengths, ho=None, tick=None):
        """
        Perform one decoding step.
        1. Construct the input. If input-feeding is used, then the input is the
            concatenation of the current embedding and previous context vector.
        2. Feed the input to the decoder and obtain the contextualized
            token representations.
        3. Generate a context vector. It is a convex combination of the
            states of the encoder, the weights of which are a function of each
            state of the encoder and the current state of the decoder.
        4. Re-weight the decoder's state with the context vector.
        5. Project the context-aware vector to the vocabulary.

        Args:
            embs: the embeddings of the current step's input tokens
            enc_outputs: the outputs (states) of the encoder
            state: the previous hidden state of the decoder
            enc_lengths: the lengths of the source sequences
            ho: the attentional vector of the previous step (input feeding)
            tick: the length-control features for the current step

        Returns:

        """

        # 1. Construct the input
        decoder_input = embs
        if self.input_feeding:
            if ho is None:
                ho = self._init_input_feed(enc_outputs, enc_lengths)
            decoder_input = torch.cat([embs, ho], -1)
        if self.length_control:
            decoder_input = torch.cat([decoder_input, tick], -1)

        # 2. Feed the input to the decoder
        self.rnn.flatten_parameters()
        outputs, state = self.rnn(decoder_input, state)
        outputs = self.rnn_dropout(outputs)

        # 3. Generate the context vector
        query = outputs.squeeze(1)
        contexts, att_scores = self.attention(enc_outputs, query, enc_lengths)
        contexts = contexts.unsqueeze(1)

        # 4. Re-weight the decoder's state with the context vector.
        ho = self.Wc(torch.cat([outputs, contexts], -1))

        if self.layer_norm:
            ho = self.norm_ctx(ho)

        if self.out_non_linearity == "relu":
            ho = torch.relu(ho)
        elif self.out_non_linearity == "tanh":
            ho = torch.tanh(ho)

        # 5. Project the context-aware vector to the vocabulary.
        dec_logits = self.Wo(ho)

        return dec_logits, outputs, state, ho, att_scores

    def forward(self,
                gold_tokens,
                enc_outputs,
                init_hidden,
                enc_lengths,
                sampling_prob=0.0,
                argmax=False,
                hard=False,
                tau=1.0,
                desired_lengths=None,
                word_dropout=0):
        """

        Args:
            gold_tokens: the gold target tokens (used for teacher forcing)
            enc_outputs: the outputs (states) of the encoder
            init_hidden: the initial hidden state of the decoder
            enc_lengths: the lengths of the source sequences
            sampling_prob: how often to feed back the model's own prediction
                instead of the gold token
            argmax: take the argmax of the predicted distribution
            hard: discretize the distribution with the Straight-Through Estimator
            tau: the softmax temperature
            desired_lengths: the target lengths used for length control
            word_dropout: the probability of dropping input tokens

        Returns:
            Note: dists contain one less element than logits, because
            we do not care about sampling from the last timestep as it will not
            be used for sampling another token. The last timestep should
            correspond to the EOS token, and the corresponding logit will be
            used only for computing the NLL loss of the EOS token.

        """

        batch, max_length = gold_tokens.size()

        logits = []
        outputs = []
        attentions = []
        dists = []
        taus = []

        # initial hidden state of the decoder, and initial context
        state = init_hidden
        ho = None
        tick = None

        if self.length_control:
            countdown = length_countdown(desired_lengths).float() * self.W_tick
            ratio = desired_lengths.float() / enc_lengths.float()

        for i in range(max_length):
            # obtain the input word embedding
            e_i, d_i = self.get_embedding(i, gold_tokens, logits,
                                          sampling_prob, argmax, hard, tau)

            if word_dropout > 0 and i > 0:
                e_i, mask = drop_tokens(e_i, word_dropout)

            # the number of remaining tokens
            if self.length_control:
                tick = torch.stack([countdown[:, i], ratio], -1).unsqueeze(1)

            # perform one decoding step
            _logits, outs, state, ho, att = self.step(e_i, enc_outputs, state,
                                                      enc_lengths, ho, tick)

            if self.learn_tau and self.training:
                tau = 1 / (self.softplus(ho.squeeze()) + self.tau_0)
                taus.append(tau)

            logits.append(_logits)
            outputs.append(outs)
            attentions.append(att)

            if i > 0 and sampling_prob == 1 and not argmax:
                dists.append(d_i)

        outputs = torch.cat(outputs, dim=1).contiguous()
        logits = torch.cat(logits, dim=1).contiguous()
        attentions = torch.stack(attentions, dim=1).contiguous()

        if len(dists) > 0:
            dists = torch.stack(dists, dim=1).contiguous()
        else:
            dists = None

        if len(taus) > 0:
            taus = torch.stack(taus, dim=1).squeeze()

        return logits, outputs, state, dists, attentions, taus
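get_embedding's "fuzzy" embedding is the expectation of the embedding matrix under a (Gumbel-)softmax distribution over the vocabulary. The repo uses its own gumbel_softmax/straight_softmax helpers and Embed.expectation; the snippet below is a plain-PyTorch analogue of the same idea.

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab, emb_size, batch = 1000, 100, 4
embedding = nn.Embedding(vocab, emb_size)
logits = torch.randn(batch, vocab)                    # previous-step logits

dist = F.gumbel_softmax(logits, tau=1.0, hard=False)  # relaxed one-hot, (batch, vocab)
e_i = dist @ embedding.weight                         # expected embedding, (batch, emb_size)
print(e_i.shape)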
Example #12
    def __init__(self, ntokens, nclasses, **kwargs):
        super(Classifier, self).__init__()

        ############################################
        # Params
        ############################################
        self.ntokens = ntokens
        self.emb_size = kwargs.get("emb_size", 100)
        self.embed_noise = kwargs.get("embed_noise", .0)
        self.embed_dropout = kwargs.get("embed_dropout", .0)
        self.bottom_rnn_size = kwargs.get("bottom_rnn_size", 100)
        self.bottom_rnn_layers = kwargs.get("bottom_rnn_layers", 1)
        self.bottom_rnn_dropout = kwargs.get("bottom_rnn_dropout", .0)
        self.top_rnn_size = kwargs.get("top_rnn_size", 100)
        self.top_rnn_layers = kwargs.get("top_rnn_layers", 1)
        self.top_rnn_dropout = kwargs.get("top_rnn_dropout", .0)
        self.tie_weights = kwargs.get("tie_weights", False)
        self.pack = kwargs.get("pack", True)
        self.attention_dropout = kwargs.get("attention_dropout", .0)
        self.attention_layers = kwargs.get("attention_layers", 1)
        self.dropout = kwargs.get("dropout", 0.1)
        self.dropouti = kwargs.get("dropouti", 0.1)
        self.dropouth = kwargs.get("dropouth", 0.1)
        self.dropoute = kwargs.get("dropoute", 0.1)
        self.wdrop = kwargs.get("wdrop", 0.0)
        self.att = kwargs.get("has_att", False)
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(self.dropouti)
        self.hdrop = nn.Dropout(self.dropouth)
        self.drop = nn.Dropout(self.dropout)
        self.top_bidir = kwargs.get("top_rnn_bidir", False)
        self.new_lm = kwargs.get("new_lm", False)
        ############################################
        # Layers
        ############################################
        self.embed = Embed(ntokens,
                           self.emb_size,
                           noise=self.embed_noise,
                           dropout=self.embed_dropout)
        if self.att:
            last = False
        else:
            last = True

        self.bottom_rnn = RNNModule(input_size=self.emb_size,
                                    rnn_size=self.bottom_rnn_size,
                                    num_layers=self.bottom_rnn_layers,
                                    bidirectional=False,
                                    dropout=self.bottom_rnn_dropout,
                                    pack=self.pack)
        if self.tie_weights:
            input_top_size = self.emb_size
        else:
            input_top_size = self.bottom_rnn_size

        self.top_rnn = RNNModule(input_size=input_top_size,
                                 rnn_size=self.top_rnn_size,
                                 num_layers=self.top_rnn_layers,
                                 bidirectional=self.top_bidir,
                                 dropout=self.top_rnn_dropout,
                                 pack=self.pack,
                                 last=last)
        if self.att:
            self.attention = SelfAttention(
                attention_size=self.top_rnn.feature_size,
                dropout=self.attention_dropout,
                layers=self.attention_layers)

        self.vocab = nn.Linear(self.bottom_rnn_size, ntokens)
        self.classes = nn.Linear(self.top_rnn.feature_size, nclasses)

        if self.tie_weights:
            self.vocab.weight = self.embed.embedding.weight
            if self.bottom_rnn_size != self.emb_size:
                self.down = nn.Linear(self.bottom_rnn_size, self.emb_size)
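A plain-PyTorch sketch of the two-headed layout above (it does not reproduce the repo's RNNModule, SelfAttention, or the various dropout schemes): the bottom RNN keeps an auxiliary language-modelling head over the vocabulary while the top RNN feeds the classification head; pooling by the last state stands in for the attention/last options.

import torch
import torch.nn as nn

ntokens, nclasses, emb_size, rnn_size = 1000, 3, 100, 100
embed = nn.Embedding(ntokens, emb_size)
bottom_rnn = nn.LSTM(emb_size, rnn_size, batch_first=True)
top_rnn = nn.LSTM(rnn_size, rnn_size, batch_first=True)
vocab = nn.Linear(rnn_size, ntokens)      # auxiliary LM head
classes = nn.Linear(rnn_size, nclasses)   # task head

x = torch.randint(0, ntokens, (4, 12))
bottom_out, _ = bottom_rnn(embed(x))
lm_logits = vocab(bottom_out)             # (4, 12, ntokens)
top_out, _ = top_rnn(bottom_out)
class_logits = classes(top_out[:, -1])    # last state -> (4, nclasses)
print(lm_logits.shape, class_logits.shape)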