def __init__(self, input_dim1, input_dim2, output_dim, sum_pool=True):
    """Build the fixed count-sketch projections used by the pooling layer.

    Args:
        input_dim1: number of features of the first input; also the length
            of the first pair of hash/sign vectors.
        input_dim2: number of features of the second input; also the length
            of the second pair of hash/sign vectors.
        output_dim: output dimension for compact bilinear pooling.
        sum_pool: (Optional) If True, sum the output along height and width
            dimensions and return output shape [batch_size, output_dim].
            Otherwise return [batch_size, height, width, output_dim].
            Default: True.
    """
    super(CompactBilinearPooling, self).__init__()
    self.input_dim1 = input_dim1
    self.input_dim2 = input_dim2
    self.output_dim = output_dim
    self.sum_pool = sum_pool

    # Step 0: Generate vectors and sketch matrix for tensor count sketch.
    # This is only done once, at construction, and stays fixed afterwards.
    #
    # Each vector is drawn from its own RandomState(seed). That yields the
    # same values as np.random.seed(seed) followed by np.random.randint,
    # but does not reseed the process-wide NumPy generator as a side effect
    # of building the layer.
    # rand_h_*: indices in [0, output_dim); rand_s_*: signs in {-1, +1}.
    self.rand_h_1 = np.random.RandomState(1).randint(
        output_dim, size=input_dim1)
    self.rand_s_1 = 2 * np.random.RandomState(3).randint(
        2, size=input_dim1) - 1
    self.sparse_sketch_matrix1 = _generate_sketch_matrix(
        self.rand_h_1, self.rand_s_1, self.output_dim)

    self.rand_h_2 = np.random.RandomState(5).randint(
        output_dim, size=input_dim2)
    self.rand_s_2 = 2 * np.random.RandomState(7).randint(
        2, size=input_dim2) - 1
    self.sparse_sketch_matrix2 = _generate_sketch_matrix(
        self.rand_h_2, self.rand_s_2, self.output_dim)

    # FFT / inverse-FFT operator instances kept on the module.
    self.f1 = fft.Fft()
    self.f2 = fft.Fft()
    self.invf = fft.Ifft()
def forward(self, bottom1, bottom2):
    """Compute compact bilinear pooling of two feature maps.

    Args:
        bottom1: 1st input, 4D Tensor of shape
            [batch_size, input_dim1, height, width].
        bottom2: 2nd input, 4D Tensor of shape
            [batch_size, input_dim2, height, width].

    Returns:
        Tensor of shape [batch_size, output_dim] when ``self.sum_pool`` is
        true, otherwise [batch_size, height, width, output_dim].
    """
    assert bottom1.size(1) == self.input_dim1 and \
        bottom2.size(1) == self.input_dim2

    batch_size, _, height, width = bottom1.size()

    # Move the feature axis last and flatten every spatial location into a
    # row, so each row can be multiplied by the sketch matrix.
    bottom1_flat = bottom1.permute(0, 2, 3, 1).contiguous().view(
        -1, self.input_dim1)
    bottom2_flat = bottom2.permute(0, 2, 3, 1).contiguous().view(
        -1, self.input_dim2)

    sketch_1 = bottom1_flat.mm(self.sparse_sketch_matrix1)
    sketch_2 = bottom2_flat.mm(self.sparse_sketch_matrix2)

    # The sketches are real, so the imaginary input of each FFT is zero.
    fft1_real, fft1_imag = afft.Fft()(
        sketch_1, Variable(torch.zeros(sketch_1.size())).cuda())
    fft2_real, fft2_imag = afft.Fft()(
        sketch_2, Variable(torch.zeros(sketch_2.size())).cuda())

    # Element-wise complex product:
    #   (a + ib)(c + id) = (ac - bd) + i(ad + bc)
    # The imaginary part needs the cross terms, not ac + bd.
    fft_product_real = fft1_real.mul(fft2_real) - fft1_imag.mul(fft2_imag)
    fft_product_imag = fft1_real.mul(fft2_imag) + fft1_imag.mul(fft2_real)

    # Keep only the first output of the inverse FFT (the real component).
    cbp_flat = afft.Ifft()(fft_product_real, fft_product_imag)[0]

    cbp = cbp_flat.view(batch_size, height, width, self.output_dim)

    if self.sum_pool:
        # Summing dim 1 twice collapses height, then width.
        cbp = cbp.sum(dim=1).sum(dim=1)

    return cbp
def forward(self, bottom1, bottom2, bottom3):
    """Compute compact trilinear pooling of three unbatched feature maps.

    Args:
        bottom1: 1st input, 3D Tensor of shape [input_dim1, height, width].
        bottom2: 2nd input, 3D Tensor of shape [input_dim2, height, width].
        bottom3: 3rd input, 3D Tensor of shape [input_dim3, height, width].

    Returns:
        Tensor of shape [output_dim] when ``self.sum_pool`` is true,
        otherwise [height, width, output_dim].
    """
    assert bottom1.size(0) == self.input_dim1 and \
        bottom2.size(0) == self.input_dim2 and \
        bottom3.size(0) == self.input_dim3

    _, height, width = bottom1.size()

    # Move the feature axis last and flatten every spatial location into a
    # row, so each row can be multiplied by the sketch matrix.
    bottom1_flat = bottom1.permute(1, 2, 0).contiguous().view(
        -1, self.input_dim1)
    bottom2_flat = bottom2.permute(1, 2, 0).contiguous().view(
        -1, self.input_dim2)
    bottom3_flat = bottom3.permute(1, 2, 0).contiguous().view(
        -1, self.input_dim3)

    sketch_1 = bottom1_flat.mm(self.sparse_sketch_matrix1)
    sketch_2 = bottom2_flat.mm(self.sparse_sketch_matrix2)
    sketch_3 = bottom3_flat.mm(self.sparse_sketch_matrix3)

    # The sketches are real, so the imaginary input of each FFT is zero.
    fft1_real, fft1_imag = afft.Fft()(
        sketch_1, Variable(torch.zeros(sketch_1.size())).cuda())
    fft2_real, fft2_imag = afft.Fft()(
        sketch_2, Variable(torch.zeros(sketch_2.size())).cuda())
    fft3_real, fft3_imag = afft.Fft()(
        sketch_3, Variable(torch.zeros(sketch_3.size())).cuda())

    # Complex product of the first two spectra:
    #   (a + ib)(c + id) = (ac - bd) + i(ad + bc)
    fft_product_real = fft1_real.mul(fft2_real) - fft1_imag.mul(fft2_imag)
    fft_product_imag = fft1_real.mul(fft2_imag) + fft1_imag.mul(fft2_real)

    # Multiply that result by the third spectrum, same formula.
    fft_product_real_new = (fft_product_real.mul(fft3_real)
                            - fft_product_imag.mul(fft3_imag))
    fft_product_imag_new = (fft_product_real.mul(fft3_imag)
                            + fft_product_imag.mul(fft3_real))

    # Keep only the first output of the inverse FFT (the real component).
    cbp_flat = afft.Ifft()(fft_product_real_new, fft_product_imag_new)[0]

    cbp = cbp_flat.view(height, width, self.output_dim)

    if self.sum_pool:
        # There is no batch axis here, so height and width are dims 0 and 1.
        # Summing dim 0 twice collapses height, then width, leaving
        # [output_dim]. Summing dim 1 twice would reduce width and
        # output_dim instead.
        cbp = cbp.sum(dim=0).sum(dim=0)

    return cbp
def forward(self, X):
    """
    Given a (mini)batch of triplets X of size M, compute the energies.

    Params:
    -------
    X: int matrix of M x 3, where M is the (mini)batch size
        First column contains index of head entities.
        Second column contains index of relationships.
        Third column contains index of tail entities.

    Returns:
    --------
    f: float tensor with one energy per triplet, obtained by summing
        ``e_ls * ccorr`` over dimension 1.
    """
    fft_op = fft.Fft()
    ifft_op = fft.Ifft()

    # Decompose X into head, relationship, tail
    hs, ls, ts = X[:, 0], X[:, 1], X[:, 2]

    if self.gpu:
        hs = Variable(torch.from_numpy(hs).cuda())
        ls = Variable(torch.from_numpy(ls).cuda())
        ts = Variable(torch.from_numpy(ts).cuda())
    else:
        hs = Variable(torch.from_numpy(hs))
        ls = Variable(torch.from_numpy(ls))
        ts = Variable(torch.from_numpy(ts))

    e_hs_real = self.emb_E(hs)
    e_ts_real = self.emb_E(ts)
    # NOTE(review): relationships are looked up in the same table as the
    # entities (emb_E) - confirm this is intended.
    e_ls = self.emb_E(ls)

    # The embeddings are real, so the imaginary inputs of the FFT are zero.
    # They are placed on the GPU only when the model itself runs there.
    hs_zeros = torch.zeros(e_hs_real.size())
    ts_zeros = torch.zeros(e_ts_real.size())
    if self.gpu:
        hs_zeros = hs_zeros.cuda()
        ts_zeros = ts_zeros.cuda()
    e_hs_imag = Variable(hs_zeros)
    e_ts_imag = Variable(ts_zeros)

    fft_hs = fft_op(e_hs_real, e_hs_imag)
    # Complex conjugate of the head spectrum: negate the imaginary part.
    fft_hs_conj = fft_hs[0], -1 * fft_hs[1]
    fft_ts = fft_op(e_ts_real, e_ts_imag)

    # Element-wise complex product conj(FFT(h)) * FFT(t):
    #   (a + ib)(c + id) = (ac - bd) + i(ad + bc)
    real_fft = fft_hs_conj[0] * fft_ts[0] - fft_hs_conj[1] * fft_ts[1]
    imag_fft = fft_hs_conj[0] * fft_ts[1] + fft_hs_conj[1] * fft_ts[0]

    # Keep only the first output of the inverse FFT (the real component).
    ccorr = ifft_op(real_fft, imag_fft)[0]

    energies = torch.sum(e_ls * ccorr, 1)

    return energies
def test_fft_gradcheck():
    """Assert that autograd's gradcheck passes for ``afft.Fft``.

    The inputs come from ``create_complex_var(5, 10)``.
    """
    complex_inputs = create_complex_var(5, 10)
    fft_op = afft.Fft()
    gradients_agree = torch.autograd.gradcheck(fft_op, complex_inputs)
    assert gradients_agree
def compact_bilinear_pooling_layer(bottom1, bottom2, output_dim,
                                   not_variable=True, sum_pool=False,
                                   rand_h_1=None, rand_s_1=None,
                                   rand_h_2=None, rand_s_2=None,
                                   seed_h_1=1, seed_s_1=3,
                                   seed_h_2=5, seed_s_2=7,
                                   sequential=True, compute_size=128):
    """
    Compute compact bilinear pooling over two bottom inputs.

    Reference:
        Yang Gao, et al. "Compact Bilinear Pooling." in Proceedings of IEEE
        Conference on Computer Vision and Pattern Recognition (2016).
        Akira Fukui, et al. "Multimodal Compact Bilinear Pooling for Visual
        Question Answering and Visual Grounding." arXiv preprint
        arXiv:1606.01847 (2016).

    Args:
        bottom1: 1st input, 4D Tensor of shape
            [batch_size, height, width, input_dim1].
        bottom2: 2nd input, 4D Tensor of shape
            [batch_size, height, width, input_dim2].
        output_dim: output dimension for compact bilinear pooling.
        not_variable: (Optional) If true, run on plain tensors with the
            sparse sketch matrix and the functional ``fft.fft`` /
            ``fft.ifft``. Otherwise use a dense sketch matrix wrapped in a
            ``Variable`` and the ``Fft.Fft`` / ``Fft.Ifft`` operators.
            Default: True.
        sum_pool: (Optional) If True, sum the output along height and width
            dimensions and return output shape [batch_size, output_dim].
            Otherwise return [batch_size, height, width, output_dim].
            Default: False.
        rand_h_1: (Optional) an 1D numpy array containing indices in
            interval `[0, output_dim)`. Automatically generated from
            `seed_h_1` if is None.
        rand_s_1: (Optional) an 1D numpy array of 1 and -1, having the same
            shape as `rand_h_1`. Automatically generated from `seed_s_1`
            if is None.
        rand_h_2: (Optional) an 1D numpy array containing indices in
            interval `[0, output_dim)`. Automatically generated from
            `seed_h_2` if is None.
        rand_s_2: (Optional) an 1D numpy array of 1 and -1, having the same
            shape as `rand_h_2`. Automatically generated from `seed_s_2`
            if is None.
        sequential: (Optional) accepted for interface compatibility; not
            used by this implementation. Default: True.
        compute_size: (Optional) accepted for interface compatibility; not
            used by this implementation. Default: 128.

    Returns:
        Compact bilinear pooled results of shape [batch_size, output_dim] or
        [batch_size, height, width, output_dim], depending on `sum_pool`.
    """
    # The feature dimensions are needed to construct the count sketch
    # matrices.
    input_dim1 = bottom1.size()[-1]
    input_dim2 = bottom2.size()[-1]

    # Step 0: Generate vectors and sketch matrix for tensor count sketch.
    # Each missing vector is drawn from its own RandomState(seed). That
    # yields the same values as np.random.seed(seed) followed by
    # np.random.randint, without reseeding the process-wide NumPy generator
    # on every call.
    if rand_h_1 is None:
        rand_h_1 = np.random.RandomState(seed_h_1).randint(
            output_dim, size=input_dim1)
    if rand_s_1 is None:
        rand_s_1 = 2 * np.random.RandomState(seed_s_1).randint(
            2, size=input_dim1) - 1
    sparse_sketch_matrix1 = _generate_sketch_matrix_pyt(
        rand_h_1, rand_s_1, output_dim)

    if rand_h_2 is None:
        rand_h_2 = np.random.RandomState(seed_h_2).randint(
            output_dim, size=input_dim2)
    if rand_s_2 is None:
        rand_s_2 = 2 * np.random.RandomState(seed_s_2).randint(
            2, size=input_dim2) - 1
    sparse_sketch_matrix2 = _generate_sketch_matrix_pyt(
        rand_h_2, rand_s_2, output_dim)

    # Step 1: Flatten the input tensors and count sketch
    bottom1_flat = bottom1.view(-1, input_dim1).float()
    bottom2_flat = bottom2.view(-1, input_dim2).float()

    # Essentially:
    #   sketch1 = bottom1 * sparse_sketch_matrix
    #   sketch2 = bottom2 * sparse_sketch_matrix
    # On the tensor path the sparse matrix is kept as the left operand:
    #   sketch1 = (sparse_sketch_matrix.T * bottom1.T).T
    #   sketch2 = (sparse_sketch_matrix.T * bottom2.T).T
    if not_variable:
        sketch1 = T.mm(sparse_sketch_matrix1.t().cuda(),
                       bottom1_flat.t()).t()
        sketch2 = T.mm(sparse_sketch_matrix2.t().cuda(),
                       bottom2_flat.t()).t()
    else:
        dense1 = Variable(sparse_sketch_matrix1.to_dense(),
                          requires_grad=True).cuda()
        dense2 = Variable(sparse_sketch_matrix2.to_dense(),
                          requires_grad=True).cuda()
        sketch1 = T.matmul(bottom1_flat, dense1).cuda()
        sketch2 = T.matmul(bottom2_flat, dense2).cuda()

    # Step 2: FFT (the sketches are real, so the imaginary input is zero)
    if not_variable:
        fft1_real, fft1_img = fft.fft(
            sketch1, T.zeros(sketch1.size()).cuda())
        fft2_real, fft2_img = fft.fft(
            sketch2, T.zeros(sketch2.size()).cuda())
    else:
        f = Fft.Fft()
        fft1_real, fft1_img = f(
            sketch1, Variable(T.zeros(sketch1.size())).cuda())
        fft2_real, fft2_img = f(
            sketch2, Variable(T.zeros(sketch2.size())).cuda())

    # Step 3: Elementwise complex product
    #   (a + ib)(c + id) = (ac - bd) + i(ad + bc)
    fft_product_real = fft1_real * fft2_real - fft1_img * fft2_img
    fft_product_img = fft1_real * fft2_img + fft2_real * fft1_img

    # Step 4: Inverse FFT (keep the real output only) and reshape back to
    # [batch_size, height, width, output_dim]
    if not_variable:
        cbp_flat, _ = fft.ifft(fft_product_real, fft_product_img)
    else:
        fi = Fft.Ifft()
        cbp_flat, _ = fi(fft_product_real, fft_product_img)

    output_shape = T.Size([bottom1.size()[0], bottom1.size()[1],
                           bottom1.size()[2], output_dim])
    cbp = cbp_flat.view(output_shape)

    # Step 5: Sum pool over spatial dimensions, if specified
    if sum_pool:
        # After height (dim 1) is summed away, width becomes dim 1, so the
        # second reduction must also use dim=1. Using dim=2 would sum over
        # output_dim and return [batch_size, width].
        cbp = T.sum(T.sum(cbp, dim=1), dim=1)

    return cbp