# Assumed imports for these snippets: torch, torch.nn as nn, and fastai's
# ifnone, SequentialEx and MergeLayer.
def __init__(self, n_heads, d_model, d_head=None, p=0., bias=True, scale=True):
    super().__init__()
    d_head = ifnone(d_head, d_model // n_heads)
    self.n_heads, self.d_head, self.scale = n_heads, d_head, scale
    # One linear projection each for queries, keys and values
    self.q_wgt, self.k_wgt, self.v_wgt = [
        nn.Linear(d_model, n_heads * d_head, bias=bias) for _ in range(3)
    ]
    self.out = nn.Linear(n_heads * d_head, d_model, bias=bias)
    self.drop_att, self.drop_res = nn.Dropout(p), nn.Dropout(p)
    self.ln = nn.LayerNorm(d_model)
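# A minimal sketch (not the original forward) of how these projections are
# typically combined into scaled dot-product attention; tensor shapes and the
# absence of masking are illustrative assumptions.
import torch
import torch.nn.functional as F

def _multi_head_attention_sketch(self, q, k, v):
    bs = q.size(0)
    # Project, then split into heads: (bs, n_heads, seq_len, d_head)
    wq, wk, wv = (w(x).view(bs, -1, self.n_heads, self.d_head).transpose(1, 2)
                  for w, x in zip((self.q_wgt, self.k_wgt, self.v_wgt), (q, k, v)))
    scores = wq @ wk.transpose(-2, -1)
    if self.scale: scores = scores / (self.d_head ** 0.5)
    attn = self.drop_att(F.softmax(scores, dim=-1))
    ctx = (attn @ wv).transpose(1, 2).contiguous().view(bs, -1, self.n_heads * self.d_head)
    # Residual connection plus post-norm, matching the drop_res/ln registered above
    return self.ln(q + self.drop_res(self.out(ctx)))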
def __init__(self, enc_size, dec_size, emb_enc, emb_dec, n_layers=6, n_heads=8,
             d_model=256, d_head=32, d_inner=1024, p=0.1, bias=True, scale=True,
             double_drop=True, pad_idx=1):
    super().__init__()  # was missing: required before registering submodules on an nn.Module
    self.enc_emb = TransformerEmbedding(emb_enc, p)
    self.dec_emb = TransformerEmbedding(emb_dec, 0.)
    args = (n_heads, d_model, d_head, d_inner, p, bias, scale, double_drop)
    self.encoder = nn.ModuleList([EncoderBlock(*args) for _ in range(n_layers)])
    self.decoder = nn.ModuleList([DecoderBlock(*args) for _ in range(n_layers)])
    self.out = nn.Linear(d_model, dec_size)
    self.out.weight = self.dec_emb.embed.weight  # tie output projection to the decoder embedding
    self.pad_idx = pad_idx
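# Hedged sketch of how the two stacks are typically chained in forward();
# get_output_mask and the exact block signatures are assumptions here, not
# taken from the original code.
def _transformer_forward_sketch(self, inp, targ):
    mask_out = get_output_mask(targ, self.pad_idx)      # assumed causal + padding mask helper
    enc, dec = self.enc_emb(inp), self.dec_emb(targ)
    for block in self.encoder: enc = block(enc)
    for block in self.decoder: dec = block(dec, enc, mask_out)
    return self.out(dec)                                # logits over dec_size, tied to dec_emb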
def __init__(self, emb_enc, emb_dec, nh, out_sl, nl=2, bos_idx=0, pad_idx=1):
    super().__init__()
    self.nl, self.nh, self.out_sl, self.pr_force = nl, nh, out_sl, 1
    self.bos_idx, self.pad_idx = bos_idx, pad_idx
    self.emb_enc, self.emb_dec = emb_enc, emb_dec
    self.emb_sz_enc, self.emb_sz_dec = emb_enc.embedding_dim, emb_dec.embedding_dim
    self.voc_sz_dec = emb_dec.num_embeddings
    # Bidirectional GRU encoder, projected down to the decoder embedding size
    self.emb_enc_drop = nn.Dropout(0.15)
    self.gru_enc = nn.GRU(self.emb_sz_enc, nh, num_layers=nl, dropout=0.25,
                          batch_first=True, bidirectional=True)
    self.out_enc = nn.Linear(2 * nh, self.emb_sz_dec, bias=False)
    # Decoder GRU consumes the previous token embedding concatenated with the context vector
    self.gru_dec = nn.GRU(self.emb_sz_dec + 2 * nh, self.emb_sz_dec, num_layers=nl,
                          dropout=0.1, batch_first=True)
    self.out_drop = nn.Dropout(0.35)
    self.out = nn.Linear(self.emb_sz_dec, self.voc_sz_dec)
    self.out.weight.data = self.emb_dec.weight.data  # tie output weights to the decoder embedding
    # Additive attention parameters
    self.enc_att = nn.Linear(2 * nh, self.emb_sz_dec, bias=False)
    self.hid_att = nn.Linear(self.emb_sz_dec, self.emb_sz_dec)
    self.V = self.init_param(self.emb_sz_dec)
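# Minimal sketch of the additive (Bahdanau-style) attention that enc_att,
# hid_att and V support; the original decoder loop may wire this up differently.
import torch
import torch.nn.functional as F

def _additive_attention_sketch(self, dec_hid, enc_out):
    # enc_out: (bs, src_len, 2*nh); dec_hid: (nl, bs, emb_sz_dec)
    enc_att = self.enc_att(enc_out)                    # (bs, src_len, emb_sz_dec)
    hid_att = self.hid_att(dec_hid[-1])                # (bs, emb_sz_dec)
    u = torch.tanh(enc_att + hid_att[:, None])         # broadcast over source positions
    attn_wgts = F.softmax(u @ self.V, dim=1)           # (bs, src_len)
    return (attn_wgts[..., None] * enc_out).sum(1)     # context vector: (bs, 2*nh)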
def __init__(self, emb_enc, emb_dec, hidden_layer_size, max_output_length,
             nl=1, bos_idx=0, pad_idx=1):
    super().__init__()
    self.nl, self.hidden_layer_size, self.max_output_length = nl, hidden_layer_size, max_output_length
    self.bos_idx, self.pad_idx = bos_idx, pad_idx
    self.em_sz_enc = emb_enc.embedding_dim
    self.em_sz_dec = emb_dec.embedding_dim
    self.voc_sz_dec = emb_dec.num_embeddings

    self.emb_enc = emb_enc
    self.emb_enc_drop = nn.Dropout(0.15)
    # Dropout variants kept for reference; with the default nl=1, PyTorch warns
    # that inter-layer dropout has no effect:
    # self.gru_enc = nn.GRU(self.em_sz_enc, hidden_layer_size, num_layers=nl,
    #                       dropout=0.25, batch_first=True)
    self.gru_enc = nn.GRU(self.em_sz_enc, hidden_layer_size, num_layers=nl, batch_first=True)
    self.out_enc = nn.Linear(hidden_layer_size, self.em_sz_dec, bias=False)

    self.emb_dec = emb_dec
    # self.gru_dec = nn.GRU(self.em_sz_dec, self.em_sz_dec, num_layers=nl,
    #                       dropout=0.1, batch_first=True)
    self.gru_dec = nn.GRU(self.em_sz_dec, self.em_sz_dec, num_layers=nl, batch_first=True)
    self.out_drop = nn.Dropout(0.35)
    self.out = nn.Linear(self.em_sz_dec, self.voc_sz_dec)
    self.out.weight.data = self.emb_dec.weight.data  # tie output weights to the decoder embedding
    self.pr_force = 0.  # teacher-forcing probability, off by default
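# Hedged sketch of the greedy decoding loop with scheduled teacher forcing that
# pr_force is meant to drive; _encode and _decode_step are hypothetical helpers,
# not the original methods.
import random
import torch

def _seq2seq_decode_sketch(self, inp, targ=None):
    bs = inp.size(0)
    hid = self._encode(inp)                               # hypothetical encoder helper
    dec_inp = inp.new_full((bs,), self.bos_idx)           # start every sequence with BOS
    outs = []
    for i in range(self.max_output_length):
        hid, out = self._decode_step(dec_inp, hid)        # hypothetical one-token decoder step
        outs.append(out)
        dec_inp = out.argmax(dim=1)                       # greedy choice of next token
        if targ is not None and random.random() < self.pr_force and i < targ.size(1):
            dec_inp = targ[:, i]                          # teacher forcing with probability pr_force
    return torch.stack(outs, dim=1)                       # (bs, max_output_length, voc_sz_dec)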
def feed_forward(d_model, d_ff, ff_p=0., double_drop=True):
    layers = [nn.Linear(d_model, d_ff), nn.ReLU()]
    if double_drop: layers.append(nn.Dropout(ff_p))
    return SequentialEx(*layers, nn.Linear(d_ff, d_model), nn.Dropout(ff_p),
                        MergeLayer(), nn.LayerNorm(d_model))
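# Quick shape check for the position-wise feed-forward block, assuming fastai's
# SequentialEx and MergeLayer are in scope (MergeLayer adds the residual before
# the final LayerNorm).
import torch

ff = feed_forward(d_model=256, d_ff=1024, ff_p=0.1)
x = torch.randn(8, 35, 256)         # (batch, seq_len, d_model)
assert ff(x).shape == x.shape       # the block preserves the model dimension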