import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter, init
from torch.nn.modules.utils import _pair


class MatGRUGate(torch.nn.Module):
    """GRU gate for a matrix, similar to the official code.

    Please refer to Section 3.4 of the paper for the formula.
    """

    def __init__(self, rows, cols, activation):
        super().__init__()
        self.activation = activation
        self.W = Parameter(torch.Tensor(rows, rows))
        self.U = Parameter(torch.Tensor(rows, rows))
        self.bias = Parameter(torch.Tensor(rows, cols))
        self.reset_parameters()

    def reset_parameters(self):
        init.xavier_uniform_(self.W)
        init.xavier_uniform_(self.U)
        init.zeros_(self.bias)

    def forward(self, x, hidden):
        out = self.activation(self.W.matmul(x)
                              + self.U.matmul(hidden)
                              + self.bias)
        return out
class mat_GRU_gate(torch.nn.Module):
    def __init__(self, rows, cols, activation):
        super().__init__()
        self.activation = activation
        # The "k" here should be in_feats, which is actually the number of rows.
        self.W = Parameter(torch.Tensor(rows, rows))
        nn.init.orthogonal_(self.W)
        # self.reset_param(self.W)
        self.U = Parameter(torch.Tensor(rows, rows))
        nn.init.orthogonal_(self.U)
        # self.reset_param(self.U)
        self.bias = Parameter(torch.zeros(rows, cols))

    def reset_param(self, t):
        # Initialize based on the number of columns.
        stdv = 1. / math.sqrt(t.size(1))
        t.data.uniform_(-stdv, stdv)

    def forward(self, x, hidden):
        out = self.activation(self.W.matmul(x)
                              + self.U.matmul(hidden)
                              + self.bias)
        return out
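# Illustrative usage of the matrix GRU gates above. This is a minimal sketch:
# the 16x8 hidden-matrix shape and the sigmoid gate activation are assumptions
# made here for demonstration, not values taken from the rest of the codebase.
def _demo_mat_gru_gate():
    rows, cols = 16, 8
    gate = MatGRUGate(rows, cols, torch.nn.Sigmoid())
    x = torch.randn(rows, cols)        # current input matrix
    hidden = torch.randn(rows, cols)   # previous hidden-state matrix
    out = gate(x, hidden)              # gated output, same (rows, cols) shape
    assert out.shape == (rows, cols)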
class LeftSVDLayer(nn.Module):
    """Linear layer applied from the left: y = weight @ x (+ bias)."""

    def __init__(self, ih, oh, dropout=None, bias=True):
        super().__init__()
        self.weight = Parameter(torch.Tensor(oh, ih))
        self.dropout = dropout
        if bias:
            self.bias = Parameter(torch.Tensor(oh, 1))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fin, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1. / math.sqrt(fin / 2.)
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, x):
        y = self.weight.matmul(x)
        if self.bias is not None:
            y = y + self.bias
        if self.dropout is not None:
            # Pass training=self.training so dropout is disabled in eval mode.
            y = F.dropout(y, p=self.dropout, training=self.training)
        return y
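# Minimal usage sketch for LeftSVDLayer: it transforms the row dimension of a
# matrix and leaves the column dimension untouched. The sizes and dropout rate
# below are illustrative assumptions.
def _demo_left_svd_layer():
    layer = LeftSVDLayer(ih=32, oh=64, dropout=0.1)
    x = torch.randn(32, 100)   # (input rows, number of columns)
    y = layer(x)               # weight (64, 32) applied from the left
    assert y.shape == (64, 100)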
class SVDLayer(nn.Module):
    """Two-sided linear layer on matrices: y = w1 @ x @ w2 (+ bias)."""

    def __init__(self, in_size, out_size, bias=True):
        super().__init__()
        ih, iw = _pair(in_size)
        oh, ow = _pair(out_size)
        # w1 is defined transposed so we avoid transposing on every forward call.
        self.w1 = Parameter(torch.Tensor(oh, ih))
        self.w2 = Parameter(torch.Tensor(iw, ow))
        if bias:
            self.bias = Parameter(torch.Tensor(oh, ow))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.w1, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.w2, a=math.sqrt(5))
        if self.bias is not None:
            fin1, _ = nn.init._calculate_fan_in_and_fan_out(self.w1)
            fin2, _ = nn.init._calculate_fan_in_and_fan_out(self.w2)
            bound = 1. / math.sqrt((fin1 + fin2) / 2.)
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, x):
        a = self.w1.matmul(x)
        y = a.matmul(self.w2)
        if self.bias is not None:
            return y + self.bias
        return y

    def __repr__(self):
        oh, ih = self.w1.shape
        iw, ow = self.w2.shape
        return f'SVDLayer ({ih}, {iw}) -> ({oh}, {ow})'
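# Minimal sketch for SVDLayer: an (ih, iw) input matrix is mapped to an
# (oh, ow) output matrix by multiplying on both sides. The concrete sizes are
# illustrative assumptions.
def _demo_svd_layer():
    layer = SVDLayer(in_size=(32, 20), out_size=(64, 10))
    x = torch.randn(32, 20)
    y = layer(x)               # (64, 32) @ (32, 20) @ (20, 10) -> (64, 10)
    assert y.shape == (64, 10)
    print(layer)               # SVDLayer (32, 20) -> (64, 10)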
class coattention(nn.Module):
    """Co-attention that computes high-level summaries from V (d x N) and Q (d x T)."""

    def __init__(self, dim_d):
        super().__init__()
        dim_k = dim_d
        self.W_b = Parameter(torch.Tensor(dim_d, dim_d))
        self.W_v = Parameter(torch.Tensor(dim_k, dim_d))
        self.W_q = Parameter(torch.Tensor(dim_k, dim_d))
        self.w_hv = Parameter(torch.Tensor(1, dim_k))
        self.w_hq = Parameter(torch.Tensor(1, dim_k))
        # torch.Tensor leaves the weights uninitialized, so initialize them here.
        for p in (self.W_b, self.W_v, self.W_q, self.w_hv, self.w_hq):
            nn.init.xavier_uniform_(p)
        self.tanh = nn.Tanh()
        # Attention weights are normalized over the last (sequence) dimension.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, Q, V):
        """
        :param Q: [batch, dim_d, dim_T]
        :param V: [batch, dim_d, dim_N]
        :return: q_hat [batch, dim_d], v_hat [batch, dim_d]
        """
        QT = torch.transpose(Q, 1, 2)
        # Affinity matrix C: [batch, dim_T, dim_N]
        C = self.tanh(QT.matmul(self.W_b.matmul(V)))

        Hv = self.tanh(self.W_v.matmul(V) + self.W_q.matmul(Q).matmul(C))
        av = self.softmax(self.w_hv.matmul(Hv))          # [batch, 1, dim_N]
        v_hat = torch.bmm(av, V.transpose(1, 2)).squeeze(1)

        Hq = self.tanh(self.W_q.matmul(Q)
                       + self.W_v.matmul(V).matmul(torch.transpose(C, 1, 2)))
        aq = self.softmax(self.w_hq.matmul(Hq))          # [batch, 1, dim_T]
        q_hat = torch.bmm(aq, Q.transpose(1, 2)).squeeze(1)

        return q_hat, v_hat
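# Minimal usage sketch for the coattention module; the batch size, feature
# dimension, and sequence lengths below are illustrative assumptions only.
def _demo_coattention():
    batch, dim_d, dim_T, dim_N = 4, 32, 10, 50
    attn = coattention(dim_d)
    Q = torch.randn(batch, dim_d, dim_T)   # e.g. query/question features
    V = torch.randn(batch, dim_d, dim_N)   # e.g. visual/node features
    q_hat, v_hat = attn(Q, V)
    assert q_hat.shape == (batch, dim_d) and v_hat.shape == (batch, dim_d)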