def __init__(self, src_n_tokens, trg_n_tokens, **kwargs):
    super(Seq2SeqRNN, self).__init__(src_n_tokens, trg_n_tokens)

    self.bridge_non_linearity = kwargs["bridge_non_linearity"]
    self.ninp = kwargs["encoder"]["emb_size"]  # for noam optimizer

    self.embed_src = Embed(self.src_n_tokens,
                           kwargs["encoder"]["emb_size"],
                           emb_dropout=kwargs["encoder"]["emb_dropout"],
                           trainable=kwargs["encoder"]["emb_trainable"],
                           layer_norm=kwargs["encoder"]["emb_layer_norm"],
                           padding_idx=kwargs.get("enc_padding_idx", 0))
    self.embed_tgt = Embed(self.trg_n_tokens,
                           kwargs["decoder"]["emb_size"],
                           emb_dropout=kwargs["decoder"]["emb_dropout"],
                           trainable=kwargs["decoder"]["emb_trainable"],
                           layer_norm=kwargs["decoder"]["emb_layer_norm"],
                           padding_idx=kwargs.get("dec_padding_idx", 0))

    self.encoder = RNNEncoder(self.embed_src, **kwargs["encoder"])
    self.decoder = AttSeqDecoder(self.trg_n_tokens, self.embed_tgt,
                                 self.encoder.hidden_size,
                                 **kwargs["decoder"])
    self.bridge = RNNBridge(enc_dim=self.encoder.hidden_size,
                            dec_dim=self.decoder.rnn.hidden_size,
                            dec_layers=self.decoder.rnn.num_layers,
                            dec_type=self.decoder.rnn.mode)
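# Usage sketch (illustrative, not from the repo): Seq2SeqRNN expects nested
# "encoder" and "decoder" kwarg dicts plus a top-level "bridge_non_linearity".
# The keys below mirror what __init__ reads; the values are assumptions. Note
# that the two dicts are also **-forwarded to RNNEncoder / AttSeqDecoder, so
# in practice they must also carry those modules' settings (rnn_size, etc.).
_seq2seq_rnn_cfg = {
    "bridge_non_linearity": "tanh",
    "encoder": {"emb_size": 256, "emb_dropout": 0.1,
                "emb_trainable": True, "emb_layer_norm": False},
    "decoder": {"emb_size": 256, "emb_dropout": 0.1,
                "emb_trainable": True, "emb_layer_norm": False},
}
# model = Seq2SeqRNN(src_n_tokens=32000, trg_n_tokens=32000, **_seq2seq_rnn_cfg)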
def __init__(self, ntokens, **kwargs):
    super(LangModel, self).__init__()

    ############################################
    # Params
    ############################################
    self.ntokens = ntokens
    self.emb_size = kwargs.get("emb_size", 100)
    self.embed_noise = kwargs.get("embed_noise", .0)
    self.embed_dropout = kwargs.get("embed_dropout", .0)
    self.rnn_size = kwargs.get("rnn_size", 100)
    self.rnn_layers = kwargs.get("rnn_layers", 1)
    self.rnn_dropout = kwargs.get("rnn_dropout", .0)
    self.decode = kwargs.get("decode", False)
    self.tie_weights = kwargs.get("tie_weights", False)
    self.pack = kwargs.get("pack", True)

    ############################################
    # Layers
    ############################################
    self.embed = Embed(ntokens, self.emb_size,
                       noise=self.embed_noise,
                       dropout=self.embed_dropout)
    self.encoder = RNNModule(input_size=self.emb_size,
                             rnn_size=self.rnn_size,
                             num_layers=self.rnn_layers,
                             bidirectional=False,
                             pack=self.pack)
    self.decoder = nn.Linear(self.rnn_size, ntokens)

    if self.tie_weights:
        self.decoder.weight = self.embed.embedding.weight
        if self.rnn_size != self.emb_size:
            self.down = nn.Linear(self.rnn_size, self.emb_size)
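# A minimal, self-contained sketch (plain PyTorch, not this repo's classes)
# of why `down` exists: a tied output layer must share the embedding matrix,
# i.e. have weight shape (ntokens, emb_size), so RNN states of size rnn_size
# are first projected down to emb_size before decoding.
import torch
import torch.nn as nn

_emb = nn.Embedding(10, 4)       # (ntokens=10, emb_size=4)
_dec = nn.Linear(4, 10)
_dec.weight = _emb.weight        # tying works: both weights are (10, 4)
_down = nn.Linear(6, 4)          # rnn_size=6 -> emb_size=4
_h = torch.randn(2, 6)           # fake RNN states
_logits = _dec(_down(_h))        # (2, 10)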
def __init__(self, ntokens, nclasses, feat_size, **kwargs):
    super(BaselineConcClassifier, self).__init__()

    ############################################
    # Params
    ############################################
    self.ntokens = ntokens
    self.feat_size = feat_size
    self.embed_finetune = kwargs.get("embed_finetune", False)
    self.emb_size = kwargs.get("emb_size", 100)
    self.embed_noise = kwargs.get("embed_noise", .0)
    self.embed_dropout = kwargs.get("embed_dropout", .0)
    self.rnn_size = kwargs.get("rnn_size", 100)
    self.rnn_layers = kwargs.get("rnn_layers", 1)
    self.rnn_dropout = kwargs.get("rnn_dropout", .0)
    self.pack = kwargs.get("pack", True)
    self.no_rnn = kwargs.get("no_rnn", False)
    self.conc_emb = kwargs.get("conc_emb", False)
    self.conc_rnn = kwargs.get("conc_rnn", False)
    self.conc_out = kwargs.get("conc_out", False)
    self.bidir = kwargs.get("bidir", False)

    ############################################
    # Layers
    ############################################
    self.word_embedding = Embed(num_embeddings=ntokens,
                                embedding_dim=self.emb_size,
                                noise=self.embed_noise,
                                dropout=self.embed_dropout)

    if self.conc_emb:
        rnn_input_size = self.emb_size + self.feat_size
    else:
        rnn_input_size = self.emb_size

    self.rnn = RNNModule(input_size=rnn_input_size,
                         rnn_size=self.rnn_size,
                         num_layers=self.rnn_layers,
                         bidirectional=self.bidir,
                         dropout=self.rnn_dropout,
                         pack=self.pack)

    if self.no_rnn:
        self.attention_size = rnn_input_size
    elif self.conc_rnn:
        self.attention_size = self.rnn.feature_size + self.feat_size
    else:
        self.attention_size = self.rnn.feature_size

    self.attention = SelfAttention(self.attention_size, baseline=True)

    output_input_size = self.attention_size
    self.classes = nn.Linear(output_input_size, nclasses)
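# Sketch of what `conc_emb` implies for the forward pass (an inference from
# rnn_input_size = emb_size + feat_size; the forward itself is not shown):
# word embeddings are concatenated with per-token feature vectors along the
# last dimension before entering the RNN.
import torch
_w = torch.randn(2, 5, 100)             # (batch, seq, emb_size)
_f = torch.randn(2, 5, 12)              # (batch, seq, feat_size)
_rnn_in = torch.cat([_w, _f], dim=-1)   # (2, 5, 112) == emb_size + feat_size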
def __init__(self, src_n_tokens, trg_n_tokens,
             emb_size=512, nhead=8, nhid=2048, nlayers=6,
             dropout=0.1, tie_projections=True, **kwargs):
    super(Seq2SeqTransformer, self).__init__(src_n_tokens, trg_n_tokens,
                                             **kwargs)

    self.ninp = emb_size
    self.tgt_mask = None
    self.tie_projections = tie_projections

    self.embed_src = Embed(self.src_n_tokens, emb_size, scale=True,
                           padding_idx=kwargs.get("enc_padding_idx", 0))
    self.embed_tgt = Embed(self.trg_n_tokens, emb_size, scale=True,
                           padding_idx=kwargs.get("dec_padding_idx", 0))

    self.encoder = TransformerEncoder(hidden_size=emb_size,
                                      ff_size=nhid,
                                      num_layers=nlayers,
                                      num_heads=nhead,
                                      dropout=dropout,
                                      emb_dropout=dropout)
    self.decoder = TransformerDecoder(num_layers=nlayers,
                                      num_heads=nhead,
                                      hidden_size=emb_size,
                                      ff_size=nhid,
                                      dropout=dropout,
                                      emb_dropout=dropout,
                                      vocab_size=trg_n_tokens)
    self.tie_weights()
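# Aside: `scale=True` in Embed presumably applies the standard Transformer
# embedding scaling from Vaswani et al. (2017), i.e. multiplying embeddings
# by sqrt(emb_size). A self-contained sketch of that convention (an
# assumption about Embed's behavior, not a quote of its code):
import math
import torch
import torch.nn as nn

_e = nn.Embedding(1000, 512)
_ids = torch.randint(0, 1000, (2, 7))
_scaled = _e(_ids) * math.sqrt(512)     # (2, 7, 512)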
def __init__(self, ntokens, **kwargs):
    super(SeqReader, self).__init__()

    ############################################
    # Attributes
    ############################################
    self.ntokens = ntokens
    self.emb_size = kwargs.get("emb_size", 100)
    self.embed_noise = kwargs.get("embed_noise", .0)
    self.embed_dropout = kwargs.get("embed_dropout", .0)
    self.rnn_size = kwargs.get("rnn_size", 100)
    self.rnn_layers = kwargs.get("rnn_layers", 1)
    self.rnn_dropout = kwargs.get("rnn_dropout", .0)
    self.rnn_bidirectional = kwargs.get("rnn_bidirectional", False)
    self.decode = kwargs.get("decode", False)
    self.tie_weights = kwargs.get("tie_weights", False)
    self.pack = kwargs.get("pack", True)
    self.countdown = kwargs.get("countdown", False)

    ############################################
    # Layers
    ############################################
    self.embed = Embed(ntokens, self.emb_size,
                       noise=self.embed_noise,
                       dropout=self.embed_dropout)
    self.encoder = RNNModule(input_size=self.emb_size,
                             rnn_size=self.rnn_size,
                             num_layers=self.rnn_layers,
                             bidirectional=self.rnn_bidirectional,
                             dropout=self.rnn_dropout,
                             pack=self.pack,
                             countdown=self.countdown)

    if self.rnn_bidirectional:
        self.rnn_size *= 2

    if self.decode:
        if self.rnn_bidirectional:
            raise ValueError("Can't decode with bidirectional RNNs!")

        if self.tie_weights and self.rnn_size != self.emb_size:
            rnn_out = self.emb_size
            self.down = nn.Linear(self.rnn_size, rnn_out)
        else:
            rnn_out = self.rnn_size

        self.out = nn.Linear(rnn_out, ntokens)

        if self.tie_weights:
            # if self.rnn_size != self.emb_size:
            #     raise ValueError("if `tie_weights` is True,"
            #                      "emb_size has to be equal to rnn_size")
            self.out.weight = self.embed.embedding.weight
def __init__(self, ntokens, nclasses, attention=False, **kwargs):
    super(NaiveClassifier, self).__init__()

    ############################################
    # Params
    ############################################
    self.ntokens = ntokens
    self.emb_size = kwargs.get("emb_size", 100)
    self.embed_noise = kwargs.get("embed_noise", .0)
    self.embed_dropout = kwargs.get("embed_dropout", .0)
    self.bottom_rnn_size = kwargs.get("bottom_rnn_size", 100)
    self.attention_dropout = kwargs.get("attention_dropout", .0)
    self.bottom_rnn_layers = kwargs.get("bottom_rnn_layers", 1)
    self.bottom_rnn_dropout = kwargs.get("bottom_rnn_dropout", .0)
    self.tie_weights = kwargs.get("tie_weights", False)
    self.pack = kwargs.get("pack", True)
    self.att = attention
    self.attention_layers = kwargs.get("attention_layers", 1)

    ############################################
    # Layers
    ############################################
    self.embed = Embed(ntokens, self.emb_size,
                       noise=self.embed_noise,
                       dropout=self.embed_dropout)

    last = not self.att

    self.bottom_rnn = RNNModule(input_size=self.emb_size,
                                rnn_size=self.bottom_rnn_size,
                                num_layers=self.bottom_rnn_layers,
                                bidirectional=False,
                                dropout=self.bottom_rnn_dropout,
                                pack=self.pack,
                                last=last)

    if self.att:
        self.attention = SelfAttention(attention_size=self.bottom_rnn_size,
                                       dropout=self.attention_dropout)

    self.classes = nn.Linear(self.bottom_rnn_size, nclasses)
def __init__(self, ntokens, tie_projections=False, out_layer_norm=False,
             **kwargs):
    super(RNNLM, self).__init__()

    self.tie_projections = tie_projections
    self.out_layer_norm = out_layer_norm

    self.embed = Embed(ntokens, kwargs["emb_size"],
                       emb_dropout=kwargs["emb_dropout"],
                       trainable=kwargs["emb_trainable"],
                       layer_norm=kwargs["emb_layer_norm"],
                       max_norm=kwargs["emb_max_norm"])
    self.encoder = RNNEncoder(self.embed, **kwargs)
    assert not self.encoder.rnn.bidirectional

    enc_dim = self.encoder.hidden_size
    emb_dim = self.embed.embedding_dim

    self.project_down = (self.tie_projections and enc_dim != emb_dim)
    if self.project_down:
        self.W_h = nn.Linear(enc_dim, emb_dim)
        out_dim = emb_dim
    else:
        out_dim = enc_dim

    if self.out_layer_norm:
        self.LN_h = nn.LayerNorm(out_dim, eps=1e-6)

    self.logits = nn.Linear(out_dim, ntokens)

    self.init_weights()
    self.tie_weights()
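# Note: init_weights() and tie_weights() are defined elsewhere in the repo.
# A hypothetical sketch of the tying step, consistent with project_down above
# (logits.weight must be (ntokens, emb_dim) to share the embedding matrix):
#
#     def tie_weights(self):
#         if self.tie_projections:
#             self.logits.weight = self.embed.embedding.weight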
def __init__(self, ntoken, emb_size=512, nhead=8, nhid=2048, nlayers=6,
             dropout=0.1, tie_projections=True, **kwargs):
    super(TransformerLM, self).__init__()

    self.tie_projections = tie_projections
    self.ninp = emb_size

    self.embed = Embed(ntoken, emb_size, scale=True)
    self.encoder = TransformerEncoder(hidden_size=emb_size,
                                      ff_size=nhid,
                                      num_layers=nlayers,
                                      num_heads=nhead,
                                      dropout=dropout,
                                      emb_dropout=dropout)
    self.logits = nn.Linear(emb_size, ntoken)
    self.tie_weights()
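# A Transformer LM needs a causal (subsequent-position) mask so position i
# only attends to positions <= i. A standard self-contained sketch of such a
# mask (an assumption about how this encoder is driven, not repo code):
import torch

def _causal_mask(size):
    # True on/below the diagonal -> attendable; upper triangle is masked out
    return torch.tril(torch.ones(size, size, dtype=torch.bool))

# _causal_mask(3) ->
# tensor([[ True, False, False],
#         [ True,  True, False],
#         [ True,  True,  True]])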
def __init__(self, ntokens, nclasses, feature_size, **kwargs):
    super(AffectiveAttention, self).__init__()

    ############################################
    # Params
    ############################################
    self.ntokens = ntokens
    self.feat_size = feature_size
    self.attention_type = kwargs["attention_type"]
    self.embed_finetune = kwargs.get("embed_finetune", False)
    self.emb_size = kwargs.get("emb_size", 100)
    self.embed_noise = kwargs.get("embed_noise", .0)
    self.embed_dropout = kwargs.get("embed_dropout", .0)
    self.rnn_size = kwargs.get("rnn_size", 100)
    self.rnn_layers = kwargs.get("rnn_layers", 1)
    self.rnn_dropout = kwargs.get("rnn_dropout", .0)
    self.att_dropout = kwargs.get("attention_dropout", .0)
    self.pack = kwargs.get("pack", True)
    self.no_rnn = kwargs.get("no_rnn", False)
    self.conc_emb = kwargs.get("conc_emb", False)
    self.conc_rnn = kwargs.get("conc_rnn", False)
    self.conc_out = kwargs.get("conc_out", False)
    self.bidir = kwargs.get("bidir", False)

    ############################################
    # Layers
    ############################################
    self.word_embedding = Embed(ntokens, self.emb_size,
                                noise=self.embed_noise,
                                dropout=self.embed_dropout)

    # todo: use a features Embedding layer :)
    # self.feat_embedding = nn.Embedding(num_embeddings=num_features,
    #                                    embedding_dim=feature_size)

    if self.conc_emb:
        rnn_input_size = self.emb_size + self.feat_size
    else:
        rnn_input_size = self.emb_size

    self.rnn = RNNModule(input_size=rnn_input_size,
                         rnn_size=self.rnn_size,
                         num_layers=self.rnn_layers,
                         bidirectional=False,
                         dropout=self.rnn_dropout,
                         pack=self.pack)

    if self.no_rnn:
        self.attention_size = rnn_input_size
    elif self.conc_rnn:
        self.attention_size = self.rnn.feature_size + self.feat_size
    else:
        self.attention_size = self.rnn.feature_size

    if self.attention_type == "affine":
        self.scale = nn.Linear(feature_size, self.attention_size)
        self.shift = nn.Linear(feature_size, self.attention_size)
        self.attention = SelfAttention(attention_size=self.attention_size,
                                       dropout=self.att_dropout)
    elif self.attention_type == "non_linear_affine":
        self.scale = nn.Linear(feature_size, self.attention_size)
        self.shift = nn.Linear(feature_size, self.attention_size)
        self.tanh = nn.Tanh()
        self.attention = SelfAttention(attention_size=self.attention_size,
                                       dropout=self.att_dropout)
    elif self.attention_type == "concat":
        self.attention = SelfAttention(
            attention_size=self.attention_size + feature_size,
            dropout=self.att_dropout)
    elif self.attention_type == "gate":
        self.gate = nn.Linear(feature_size, self.attention_size)
        self.sigmoid = nn.Sigmoid()
        self.attention = SelfAttention(attention_size=self.attention_size,
                                       dropout=self.att_dropout)
    else:
        raise ValueError("Unknown attention_type")

    self.classes = nn.Linear(self.attention_size, nclasses)
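# Sketch of how such scale/shift conditioning is typically applied
# (FiLM-style; an assumption about the forward pass, which is not shown
# here): the affective features produce a per-dimension scale and shift for
# each token representation before self-attention.
import torch
import torch.nn as nn

_h = torch.randn(2, 5, 100)    # token states (batch, seq, attention_size)
_f = torch.randn(2, 5, 12)     # affective features (batch, seq, feat_size)
_scale, _shift = nn.Linear(12, 100), nn.Linear(12, 100)
_conditioned = _h * _scale(_f) + _shift(_f)     # "affine" branch
# "non_linear_affine" would wrap this in tanh; "gate" would instead
# multiply by sigmoid(gate(_f)).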
class AttSeqDecoder(nn.Module):
    def __init__(self, trg_ntokens, enc_size, **kwargs):
        super(AttSeqDecoder, self).__init__()

        ############################################
        # Attributes
        ############################################
        self.trg_ntokens = trg_ntokens
        emb_size = kwargs.get("emb_size", 100)
        embed_noise = kwargs.get("embed_noise", .0)
        embed_dropout = kwargs.get("embed_dropout", .0)
        rnn_size = kwargs.get("rnn_size", 100)
        rnn_layers = kwargs.get("rnn_layers", 1)
        rnn_dropout = kwargs.get("rnn_dropout", .0)
        tie_weights = kwargs.get("tie_weights", False)
        attention_fn = kwargs.get("attention_fn", "general")
        self.input_feeding = kwargs.get("input_feeding", False)
        self.learn_tau = kwargs.get("learn_tau", False)
        self.length_control = kwargs.get("length_control", False)
        self.gumbel = kwargs.get("gumbel", False)
        self.out_non_linearity = kwargs.get("out_non_linearity", None)
        self.layer_norm = kwargs.get("layer_norm", None)
        self.input_feeding_learnt = kwargs.get("input_feeding_learnt", False)

        ############################################
        # Layers
        ############################################
        self.embed = Embed(trg_ntokens, emb_size,
                           noise=embed_noise,
                           dropout=embed_dropout)

        # the output size of the ho vector: ho = [h || c]
        if tie_weights:
            self.ho_size = emb_size
        else:
            self.ho_size = rnn_size

        dec_input_size = emb_size
        if self.input_feeding:
            dec_input_size += self.ho_size
        if self.length_control:
            dec_input_size += 2
            # length scaling parameter
            self.W_tick = nn.Parameter(torch.rand(1))

        self.rnn = nn.LSTM(input_size=dec_input_size,
                           hidden_size=rnn_size,
                           num_layers=rnn_layers,
                           batch_first=True)
        self.rnn_dropout = nn.Dropout(rnn_dropout)

        self.attention = Attention(enc_size, rnn_size, method=attention_fn)

        # learnt temperature parameter
        if self.learn_tau:
            self.softplus = nn.Sequential(
                nn.Linear(self.ho_size, 1, bias=False),
                nn.Softplus())
            self.tau_0 = kwargs.get("tau_0", 1)

        # initial input feeding
        if self.input_feeding_learnt:
            self.Wi = nn.Linear(enc_size, self.ho_size)

        # source context-aware output projection
        self.Wc = nn.Linear(rnn_size + enc_size, self.ho_size)

        # projection layer to the vocabulary
        self.Wo = nn.Linear(self.ho_size, trg_ntokens)

        if self.layer_norm:
            self.norm_ctx = nn.LayerNorm(self.ho_size)
            if self.input_feeding_learnt:
                self.norm_input_feed = nn.LayerNorm(self.ho_size)

        if tie_weights:
            # if rnn_size != emb_size:
            #     raise ValueError("if `tie_weights` is True,"
            #                      "emb_size has to be equal to rnn_size")
            self.Wo.weight = self.embed.embedding.weight

    @staticmethod
    def _top_hidden(hidden):
        """
        Get the hidden state of the top RNN layer, which is used as the
        query for the attention mechanism.

        Args:
            hidden: the hidden state(s) of the RNN. For an LSTM this is a
                (h, c) tuple; only h is used.

        Returns:
            the hidden state of the top layer.
        """
        if isinstance(hidden, tuple):
            return hidden[0][-1]
        else:
            return hidden[-1]

    @staticmethod
    def _coin_flip(prob):
        """
        Return the outcome of a biased coin flip.

        Args:
            prob: the probability of True.

        Returns:
            bool
        """
        return prob > 0 and torch.rand(1).item() < prob

    def get_embedding(self, step, trg, logits,
                      sampling_prob, argmax, hard, tau):
        """
        Get the token embedding for the current timestep. Possible options:
        - select the embedding by a given index
        - sample a token from a probability distribution and embed it
        - construct a "fuzzy" embedding, by taking a convex combination of
          all the token embeddings, parameterized by a probability
          distribution

        Note:
            In the first step (step==0) select the embedding of the actual
            target word (usually the <sos> token).

        Args:
            step: the i-th timestep
            trg: the true token at the given step
            logits: the unnormalized probability distribution over the
                tokens from the previous timestep.
            sampling_prob: how often to sample a word instead of using the
                gold one (free-running vs. teacher-forcing)
            argmax: take the argmax of the distribution
            hard: (Straight-Through Estimator) discretize the probability
                distribution and compute a convex combination
            tau: the temperature of the softmax

        Returns:
            the word embedding and the distribution it was sampled from
            (None when a concrete token index was embedded).
        """
        # if `sample` is True, then feed the prediction back to the model,
        # instead of the true target word
        sample = sampling_prob == 1 or self._coin_flip(sampling_prob)

        if step > 0 and sample:
            if argmax:
                # feed back the most probable token
                maxv, maxi = logits[-1].max(dim=2)
                e_i = self.embed(maxi)
                return e_i, None
            else:
                # get the expected embedding, parameterized by the posterior
                if self.gumbel and self.training:
                    dist = gumbel_softmax(logits[-1].squeeze(), tau, hard)
                else:
                    dist = straight_softmax(logits[-1].squeeze(), tau, hard)
                e_i = self.embed.expectation(dist.unsqueeze(1))
                return e_i, dist
        else:
            w_i = trg[:, step].unsqueeze(1)
            e_i = self.embed(w_i)
            return e_i, None

    def _init_input_feed(self, enc_states, lengths):
        batch = enc_states.size(0)

        if self.input_feeding_learnt:
            mean = enc_states.sum(1) / lengths.unsqueeze(1).float()
            ho = self.Wi(mean).squeeze().unsqueeze(1)

            if self.layer_norm:
                ho = self.norm_input_feed(ho)

            if self.out_non_linearity == "relu":
                ho = torch.relu(ho)
            elif self.out_non_linearity == "tanh":
                ho = torch.tanh(ho)
        else:
            ho = torch.zeros((batch, 1, self.ho_size),
                             device=enc_states.device,
                             dtype=enc_states.dtype)
        return ho

    def step(self, embs, enc_outputs, state, enc_lengths, ho=None, tick=None):
        """
        Perform one decoding step.

        1. Construct the input. If input-feeding is used, then the input
           is the concatenation of the current embedding and the previous
           context vector.
        2. Feed the input to the decoder and obtain the contextualized
           token representations.
        3. Generate a context vector. It is a convex combination of the
           states of the encoder, the weights of which are a function of
           each state of the encoder and the current state of the decoder.
        4. Re-weight the decoder's state with the context vector.
        5. Project the context-aware vector to the vocabulary.

        Args:
            embs: the embedding of the current input token
            enc_outputs: the outputs (states) of the encoder
            state: the previous state of the decoder's RNN
            enc_lengths: the lengths of the source sequences
            ho: the previous context-aware output vector (input feeding)
            tick: the length-control features

        Returns:
            the logits over the vocabulary, the RNN outputs, the new RNN
            state, the context-aware vector ho, and the attention scores.
        """
        # 1. Construct the input
        decoder_input = embs
        if self.input_feeding:
            if ho is None:
                ho = self._init_input_feed(enc_outputs, enc_lengths)
            decoder_input = torch.cat([embs, ho], -1)

        if self.length_control:
            decoder_input = torch.cat([decoder_input, tick], -1)

        # 2. Feed the input to the decoder
        self.rnn.flatten_parameters()
        outputs, state = self.rnn(decoder_input, state)
        outputs = self.rnn_dropout(outputs)

        # 3. Generate the context vector
        query = outputs.squeeze(1)
        contexts, att_scores = self.attention(enc_outputs, query, enc_lengths)
        contexts = contexts.unsqueeze(1)

        # 4. Re-weight the decoder's state with the context vector
        ho = self.Wc(torch.cat([outputs, contexts], -1))
        if self.layer_norm:
            ho = self.norm_ctx(ho)
        if self.out_non_linearity == "relu":
            ho = torch.relu(ho)
        elif self.out_non_linearity == "tanh":
            ho = torch.tanh(ho)

        # 5. Project the context-aware vector to the vocabulary
        dec_logits = self.Wo(ho)

        return dec_logits, outputs, state, ho, att_scores

    def forward(self, gold_tokens, enc_outputs, init_hidden, enc_lengths,
                sampling_prob=0.0, argmax=False, hard=False, tau=1.0,
                desired_lengths=None, word_dropout=0):
        """
        Args:
            gold_tokens: the gold (true) target tokens
            enc_outputs: the outputs (states) of the encoder
            init_hidden: the initial hidden state of the decoder
            enc_lengths: the lengths of the source sequences
            sampling_prob: how often to sample a word instead of using
                the gold one (scheduled sampling)
            argmax: feed back the argmax of the predicted distribution
            hard: use the Straight-Through Estimator
            tau: the (initial) softmax temperature
            desired_lengths: the desired output lengths (length control)
            word_dropout: the probability of dropping input tokens

        Returns:
            the logits, the decoder outputs, the final state, the sampled
            distributions, the attention scores, and the temperatures.

        Note:
            dists contains one fewer element than logits, because we do
            not care about sampling from the last timestep, as it will not
            be used for sampling another token. The last timestep should
            correspond to the EOS token, and the corresponding logit will
            be used only for computing the NLL loss of the EOS token.
        """
        batch, max_length = gold_tokens.size()

        logits = []
        outputs = []
        attentions = []
        dists = []
        taus = []

        # initial hidden state of the decoder, and initial context
        state = init_hidden
        ho = None
        tick = None

        if self.length_control:
            countdown = length_countdown(desired_lengths).float() * self.W_tick
            ratio = desired_lengths.float() / enc_lengths.float()

        for i in range(max_length):
            # obtain the input word embedding
            e_i, d_i = self.get_embedding(i, gold_tokens, logits,
                                          sampling_prob, argmax, hard, tau)

            if word_dropout > 0 and i > 0:
                e_i, mask = drop_tokens(e_i, word_dropout)

            # the number of remaining tokens
            if self.length_control:
                tick = torch.stack([countdown[:, i], ratio], -1).unsqueeze(1)

            # perform one decoding step
            _logits, outs, state, ho, att = self.step(e_i, enc_outputs,
                                                      state, enc_lengths,
                                                      ho, tick)

            if self.learn_tau and self.training:
                tau = 1 / (self.softplus(ho.squeeze()) + self.tau_0)
                taus.append(tau)

            logits.append(_logits)
            outputs.append(outs)
            attentions.append(att)

            if i > 0 and sampling_prob == 1 and not argmax:
                dists.append(d_i)

        outputs = torch.cat(outputs, dim=1).contiguous()
        logits = torch.cat(logits, dim=1).contiguous()
        attentions = torch.stack(attentions, dim=1).contiguous()

        if len(dists) > 0:
            dists = torch.stack(dists, dim=1).contiguous()
        else:
            dists = None

        if len(taus) > 0:
            taus = torch.stack(taus, dim=1).squeeze()

        return logits, outputs, state, dists, attentions, taus
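# gumbel_softmax and straight_softmax are presumably defined elsewhere in
# the repo. A self-contained sketch of the straight-through Gumbel-softmax
# idea they implement (sample a relaxed one-hot; with hard=True, discretize
# in the forward pass while keeping the soft gradient):
import torch
import torch.nn.functional as F

def gumbel_softmax_sketch(logits, tau=1.0, hard=False, eps=1e-10):
    # Gumbel(0, 1) noise: g = -log(-log(u)), u ~ Uniform(0, 1)
    gumbels = -torch.log(-torch.log(torch.rand_like(logits) + eps) + eps)
    y_soft = F.softmax((logits + gumbels) / tau, dim=-1)
    if hard:
        index = y_soft.argmax(dim=-1, keepdim=True)
        y_hard = torch.zeros_like(y_soft).scatter_(-1, index, 1.0)
        # straight-through: y_hard forward, y_soft's gradient backward
        return y_hard - y_soft.detach() + y_soft
    return y_soft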
def __init__(self, ntokens, nclasses, **kwargs):
    super(Classifier, self).__init__()

    ############################################
    # Params
    ############################################
    self.ntokens = ntokens
    self.emb_size = kwargs.get("emb_size", 100)
    self.embed_noise = kwargs.get("embed_noise", .0)
    self.embed_dropout = kwargs.get("embed_dropout", .0)
    self.bottom_rnn_size = kwargs.get("bottom_rnn_size", 100)
    self.bottom_rnn_layers = kwargs.get("bottom_rnn_layers", 1)
    self.bottom_rnn_dropout = kwargs.get("bottom_rnn_dropout", .0)
    self.top_rnn_size = kwargs.get("top_rnn_size", 100)
    self.top_rnn_layers = kwargs.get("top_rnn_layers", 1)
    self.top_rnn_dropout = kwargs.get("top_rnn_dropout", .0)
    self.tie_weights = kwargs.get("tie_weights", False)
    self.pack = kwargs.get("pack", True)
    self.attention_dropout = kwargs.get("attention_dropout", .0)
    self.attention_layers = kwargs.get("attention_layers", 1)
    self.dropout = kwargs.get("dropout", 0.1)
    self.dropouti = kwargs.get("dropouti", 0.1)
    self.dropouth = kwargs.get("dropouth", 0.1)
    self.dropoute = kwargs.get("dropoute", 0.1)
    self.wdrop = kwargs.get("wdrop", 0.0)
    self.att = kwargs.get("has_att", False)
    self.lockdrop = LockedDropout()
    self.idrop = nn.Dropout(self.dropouti)
    self.hdrop = nn.Dropout(self.dropouth)
    self.drop = nn.Dropout(self.dropout)
    self.top_bidir = kwargs.get("top_rnn_bidir", False)
    self.new_lm = kwargs.get("new_lm", False)

    ############################################
    # Layers
    ############################################
    self.embed = Embed(ntokens, self.emb_size,
                       noise=self.embed_noise,
                       dropout=self.embed_dropout)

    last = not self.att

    self.bottom_rnn = RNNModule(input_size=self.emb_size,
                                rnn_size=self.bottom_rnn_size,
                                num_layers=self.bottom_rnn_layers,
                                bidirectional=False,
                                dropout=self.bottom_rnn_dropout,
                                pack=self.pack)

    if self.tie_weights:
        input_top_size = self.emb_size
    else:
        input_top_size = self.bottom_rnn_size

    self.top_rnn = RNNModule(input_size=input_top_size,
                             rnn_size=self.top_rnn_size,
                             num_layers=self.top_rnn_layers,
                             bidirectional=self.top_bidir,
                             dropout=self.top_rnn_dropout,
                             pack=self.pack,
                             last=last)

    if self.att:
        self.attention = SelfAttention(
            attention_size=self.top_rnn.feature_size,
            dropout=self.attention_dropout,
            layers=self.attention_layers)

    self.vocab = nn.Linear(self.bottom_rnn_size, ntokens)
    self.classes = nn.Linear(self.top_rnn.feature_size, nclasses)

    if self.tie_weights:
        self.vocab.weight = self.embed.embedding.weight
        if self.bottom_rnn_size != self.emb_size:
            self.down = nn.Linear(self.bottom_rnn_size, self.emb_size)
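# Usage sketch (illustrative values; Embed, RNNModule, SelfAttention and
# LockedDropout are this repo's modules, and the forward pass is not shown
# here). The auxiliary `vocab` head keeps a language-modeling output over
# the bottom RNN alongside the classification head:
# clf = Classifier(ntokens=30000, nclasses=3,
#                  emb_size=400, bottom_rnn_size=1000, top_rnn_size=300,
#                  has_att=True, tie_weights=True)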