def __init__(self, layer1dim, layer2dim, optim):
    super().__init__(optim)
    self.layer1 = Linear(layer1dim)
    self.relu1 = Relu()
    self.bn1 = BatchNorm()
    self.layer2 = Linear(layer2dim)
    self.loss = CrossEntropy()
    self.loss_ = None
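# Hedged reference sketch: the custom Linear / Relu / BatchNorm / CrossEntropy classes
# used above are not shown in this file, so this rebuilds the same two-layer stack with
# torch.nn as a point of comparison only, assuming the forward pass applies the layers
# in the order they are defined and using illustrative (not original) dimensions.
import torch
import torch.nn as nn

in_dim, hidden_dim, out_dim = 784, 128, 10
net = nn.Sequential(
    nn.Linear(in_dim, hidden_dim),
    nn.ReLU(),
    nn.BatchNorm1d(hidden_dim),
    nn.Linear(hidden_dim, out_dim),
)
criterion = nn.CrossEntropyLoss()

x = torch.randn(32, in_dim)
targets = torch.randint(0, out_dim, (32,))
loss = criterion(net(x), targets)   # scalar cross-entropy loss over the batch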
def __init__(self, num_head, num_dim, num_dim_k, num_dim_v, dropout_rate=0.1):
    """
    num_head: number of attention heads
    num_dim: dimensionality of each query and key word vector
    num_dim_k: dimensionality that queries and keys are projected to
    num_dim_v: dimensionality that values are projected to
    """
    super(MultiHeadAttention, self).__init__()
    self.num_head = num_head
    self.num_dim = num_dim
    self.num_dim_k = num_dim_k
    self.num_dim_v = num_dim_v

    # projection parameters w_q, w_k, w_v for all heads, stacked along the head dimension
    self.w_q = nn.Parameter(torch.FloatTensor(num_head, num_dim, num_dim_k))
    self.w_k = nn.Parameter(torch.FloatTensor(num_head, num_dim, num_dim_k))
    self.w_v = nn.Parameter(torch.FloatTensor(num_head, num_dim, num_dim_v))
    nn.init.xavier_normal(self.w_q)
    nn.init.xavier_normal(self.w_k)
    nn.init.xavier_normal(self.w_v)  # was initializing w_k a second time, leaving w_v uninitialized

    self.attention = ScaledDotProductAttention(num_dim)
    self.project = Linear(num_head * num_dim_v, num_dim)
    self.dropout = nn.Dropout(dropout_rate)
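# Hedged sketch of how stacked (num_head, num_dim, num_dim_k) projection parameters like
# self.w_q above are typically applied with a single batched matmul; the actual forward()
# is not shown in this file, so the batching scheme and sizes here are assumptions.
import torch

num_head, num_dim, num_dim_k = 8, 512, 64
batch, seq_len = 2, 10

w_q = torch.empty(num_head, num_dim, num_dim_k)
torch.nn.init.xavier_normal_(w_q)

q = torch.randn(batch, seq_len, num_dim)
# replicate the queries per head, then project every head in one bmm call
q_per_head = q.unsqueeze(0).expand(num_head, batch, seq_len, num_dim)
q_per_head = q_per_head.reshape(num_head, batch * seq_len, num_dim)
q_proj = torch.bmm(q_per_head, w_q)                       # (num_head, batch*seq_len, num_dim_k)
q_proj = q_proj.view(num_head, batch, seq_len, num_dim_k)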
def __init__(self, n_src_vocab, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8,
             d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64,
             dropout=0.1, proj_share_weight=True, embs_share_weight=True):
    super(Transformer, self).__init__()
    self.encoder = Encoder(
        n_src_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
        d_word_vec=d_word_vec, d_model=d_model,
        d_inner_hid=d_inner_hid, dropout=dropout)
    self.decoder = Decoder(
        n_tgt_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
        d_word_vec=d_word_vec, d_model=d_model,
        d_inner_hid=d_inner_hid, dropout=dropout)
    self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
    self.dropout = nn.Dropout(dropout)

    assert d_model == d_word_vec, \
        'To facilitate the residual connections, ' \
        'the dimensions of all module outputs shall be the same.'

    if proj_share_weight:
        # Share the weight matrix between the tgt word embedding and the final projection
        assert d_model == d_word_vec
        self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

    if embs_share_weight:
        # Share the weight matrix between src/tgt word embeddings;
        # this assumes the src/tgt word vector sizes are the same
        assert n_src_vocab == n_tgt_vocab, \
            'To share word embedding tables, the vocabulary sizes of src/tgt shall be the same.'
        self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
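# Hedged sketch of the weight tying done above (proj_share_weight / embs_share_weight):
# an embedding table and an output projection are made to share one weight matrix.
# torch.nn stand-ins and illustrative sizes are used because Encoder/Decoder and the
# custom Linear are not shown in this file.
import torch
import torch.nn as nn

n_vocab, d_model = 1000, 512
tgt_word_emb = nn.Embedding(n_vocab, d_model)
tgt_word_proj = nn.Linear(d_model, n_vocab, bias=False)

# both modules now point at the same (n_vocab, d_model) parameter tensor
tgt_word_proj.weight = tgt_word_emb.weight
assert tgt_word_proj.weight.data_ptr() == tgt_word_emb.weight.data_ptr()

hidden = torch.randn(2, 7, d_model)
logits = tgt_word_proj(hidden)      # (2, 7, n_vocab)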
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    # per-head projection parameters, stacked along the head dimension
    self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))

    self.attention = ScaledDotProductAttention(d_model)
    self.layer_norm = LayerNormalization(d_model)  # was assigned to self.attention, clobbering the attention module
    self.proj = Linear(n_head * d_v, d_model)
    self.dropout = nn.Dropout(dropout)

    init.xavier_normal(self.w_qs)
    init.xavier_normal(self.w_ks)
    init.xavier_normal(self.w_vs)
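# Hedged sketch of how the proj / dropout / layer_norm members above are commonly combined
# in forward(): project the concatenated head outputs, apply dropout, add the residual,
# then normalize. The real forward() is not shown here, and nn.Linear / nn.LayerNorm
# stand in for the custom Linear / LayerNormalization classes, so treat this as an
# assumed pattern rather than this file's actual implementation.
import torch
import torch.nn as nn

n_head, d_model, d_v = 8, 512, 64
batch, seq_len = 2, 10

proj = nn.Linear(n_head * d_v, d_model)
dropout = nn.Dropout(0.1)
layer_norm = nn.LayerNorm(d_model)

residual = torch.randn(batch, seq_len, d_model)            # the module's original input
head_outputs = torch.randn(batch, seq_len, n_head * d_v)   # concatenated per-head outputs
out = layer_norm(dropout(proj(head_outputs)) + residual)   # project, drop, add, normalize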