def forward(self, x):
    """Apply ``IDSCT2Function`` to ``x`` using lazily cached expk tables.

    ``self.expk0`` is tied to ``x.size(-2)`` and ``self.expk1`` to
    ``x.size(-1)``. A table is (re)built through
    ``discrete_spectral_transform.get_expk`` with ``x``'s dtype and device
    whenever it is missing or its ``size(-2)`` differs from the matching
    dimension of ``x``.

    NOTE(review): a change of dtype or device alone does not refresh the
    cached tables; only a size mismatch does.
    """
    num_rows = x.size(-2)
    num_cols = x.size(-1)

    needs_expk0 = self.expk0 is None or self.expk0.size(-2) != num_rows
    if needs_expk0:
        self.expk0 = discrete_spectral_transform.get_expk(
            num_rows, dtype=x.dtype, device=x.device)

    needs_expk1 = self.expk1 is None or self.expk1.size(-2) != num_cols
    if needs_expk1:
        self.expk1 = discrete_spectral_transform.get_expk(
            num_cols, dtype=x.dtype, device=x.device)

    return IDSCT2Function.apply(x, self.expk0, self.expk1)
def eval_runtime():
    """Run every runtime benchmark on one random 1024 x 1024 float64 CUDA tensor.

    Two kinds of twiddle tables are prepared for each dimension and handed
    to the ``eval_*`` benchmark functions: the ``get_expk`` tables
    (2cos(), 2sin()) and the ``get_exact_expk`` tables (cos(), -sin()).
    """
    runs = 100
    M = N = 1024
    # Values are drawn on the CPU first and then copied to the GPU.
    x = torch.empty(M, N, dtype=torch.float64).uniform_(0, 10.0).cuda()
    print("M = {}, N = {}".format(M, N))

    table_options = dict(dtype=x.dtype, device=x.device)
    # 2cos(), 2sin()
    expk0 = discrete_spectral_transform.get_expk(M, **table_options)
    expk1 = discrete_spectral_transform.get_expk(N, **table_options)
    # cos(), -sin()
    expkM = discrete_spectral_transform.get_exact_expk(M, **table_options)
    expkN = discrete_spectral_transform.get_exact_expk(N, **table_options)

    # The FFT baselines only need the input and the repetition count.
    eval_torch_rfft1d(x, runs)
    eval_torch_rfft2d(x, runs)

    # The transform benchmarks all share one argument list.
    transform_args = (x, expk0, expk1, expkM, expkN, runs)
    eval_dct2d(*transform_args)
    eval_idct2d(*transform_args)
    eval_idxt2d(*transform_args)
    eval_others(*transform_args)
def forward(self, x):
    """Apply ``IDCTFunction`` along the last dimension of ``x``.

    ``self.expk`` is built lazily with
    ``discrete_spectral_transform.get_expk`` (using ``x``'s dtype and
    device) and rebuilt whenever its ``size(-2)`` differs from
    ``x.size(-1)``. ``self.algorithm`` is forwarded unchanged.

    NOTE(review): a change of dtype or device alone does not refresh the
    cached table; only a size mismatch does.
    """
    length = x.size(-1)
    cache_is_stale = self.expk is None or self.expk.size(-2) != length
    if cache_is_stale:
        self.expk = discrete_spectral_transform.get_expk(
            length, dtype=x.dtype, device=x.device)
    return IDCTFunction.apply(x, self.expk, self.algorithm)
def eval_runtime():
    """Time inverse 2D transforms on CUDA and print the mean latency per call.

    A batch of ten random 512 x 512 float64 matrices is created on the CPU
    and moved to the GPU. Each benchmark runs ``runs`` iterations over
    ``x[i % 10]`` between two ``torch.cuda.synchronize()`` calls and prints
    the average wall-clock time in milliseconds.

    Only the two ``dct.IDCT2`` benchmarks execute: the ``exit()`` call that
    follows them terminates the process, so the remaining benchmarks are
    currently unreachable.

    Earlier revisions also kept disabled (commented-out) benchmarks here for
    scipy ``fftpack.dct``, ``dct2_2N``, ``dct2_N``, ``dct.DCT2`` ('2N' and
    'N'), ``dct_lee.DCT2``, ``idct2_2N``, ``idxt`` and ``dct.IDXCT``.
    """
    N = 512
    runs = 10
    x = torch.empty(10, N, N, dtype=torch.float64).uniform_(0, 10.0).cuda()
    # perm was only consumed by the disabled dct2_N benchmark; the call is kept.
    perm = discrete_spectral_transform.get_perm(N, dtype=torch.int64, device=x.device)
    expk = discrete_spectral_transform.get_expk(N, dtype=x.dtype, device=x.device)

    def report(message, transform):
        # Synchronize on both sides so the queued GPU work is what gets timed.
        torch.cuda.synchronize()
        started = time.time()
        for run in range(runs):
            transform(x[run % 10])
        torch.cuda.synchronize()
        print(message % ((time.time() - started) / runs * 1000))

    report("IDCT2Function 2N takes %.3f ms",
           dct.IDCT2(expk, expk, algorithm='2N').forward)
    report("IDCT2Function takes %.3f ms",
           dct.IDCT2(expk, expk, algorithm='N').forward)

    # NOTE(review): this terminates the process; everything below is dead code.
    exit()

    report("fft2 takes %.3f ms",
           lambda sample: torch.rfft(sample.view([1, N, N]), signal_ndim=2, onesided=False))

    report("idcct2 takes %.3f ms",
           lambda sample: discrete_spectral_transform.idcct2(sample, expk_0=expk, expk_1=expk))
    report("IDCCT2Function takes %.3f ms", dct.IDCCT2(expk, expk).forward)

    report("idcst2 takes %.3f ms",
           lambda sample: discrete_spectral_transform.idcst2(sample, expk_0=expk, expk_1=expk))
    report("IDCST2Function takes %.3f ms", dct.IDCST2(expk, expk).forward)

    report("idsct2 takes %.3f ms",
           lambda sample: discrete_spectral_transform.idsct2(sample, expk_0=expk, expk_1=expk))
    report("IDSCT2Function takes %.3f ms", dct.IDSCT2(expk, expk).forward)
def forward(
        ctx,
        pos,
        node_size_x,
        node_size_y,
        bin_center_x,
        bin_center_y,
        initial_density_map,
        target_density,
        xl,
        yl,
        xh,
        yh,
        bin_size_x,
        bin_size_y,
        num_movable_nodes,
        num_filler_nodes,
        padding,
        padding_mask,  # same dimensions as density map, with padding regions to be 1
        num_bins_x,
        num_bins_y,
        num_movable_impacted_bins_x,
        num_movable_impacted_bins_y,
        num_filler_impacted_bins_x,
        num_filler_impacted_bins_y,
        perm_M=None,  # permutation
        perm_N=None,  # permutation
        expk_M=None,  # 2*exp(j*pi*k/M)
        expk_N=None,  # 2*exp(j*pi*k/N)
        inv_wu2_plus_wv2_2X=None,  # 2.0/(wu^2 + wv^2)
        wu_by_wu2_plus_wv2_2X=None,  # 2*wu/(wu^2 + wv^2)
        wv_by_wu2_plus_wv2_2X=None,  # 2*wv/(wu^2 + wv^2)
        fast_mode=True  # fast mode will discard some computation
):
    """Build the density map, derive the electric field maps and return the energy.

    The density map comes from the CUDA or C++ ``density_map`` kernel,
    selected by ``pos.is_cuda``. It is divided in place by the bin area and
    transformed with ``dct.dct2``. The two field maps are obtained with
    ``dct.idsct2`` / ``dct.idcst2`` and stored on ``ctx`` as
    ``field_map_x`` / ``field_map_y``, together with the placement
    parameters, presumably for the backward pass.

    Args:
        ctx: autograd context receiving the saved attributes.
        pos: node positions; flattened with ``view(pos.numel())`` before
            being handed to the kernel.
        node_size_x .. num_filler_impacted_bins_y: forwarded unchanged to
            the ``density_map`` kernel (see the inline comment on
            ``padding_mask``).
        perm_M, perm_N: accepted for interface compatibility; not read here.
        expk_M, expk_N: expk tables; each is built with
            ``discrete_spectral_transform.get_expk`` when it is ``None``.
        inv_wu2_plus_wv2_2X, wu_by_wu2_plus_wv2_2X, wv_by_wu2_plus_wv2_2X:
            frequency-domain scale factors; all three are computed here
            when ``inv_wu2_plus_wv2_2X`` is ``None``.
        fast_mode: when true, the potential map is skipped and a zero
            tensor of shape ``[1]`` is returned as a placeholder.

    Returns:
        The energy ``sum(potential_map * density_map)``, or the zero
        placeholder in fast mode.
    """
    # Both backends take the same argument list, so pick one and call once.
    backend = electric_potential_cuda if pos.is_cuda else electric_potential_cpp
    output = backend.density_map(
        pos.view(pos.numel()),
        node_size_x, node_size_y,
        bin_center_x, bin_center_y,
        initial_density_map,
        target_density,
        xl, yl, xh, yh,
        bin_size_x, bin_size_y,
        num_movable_nodes,
        num_filler_nodes,
        padding,
        padding_mask,
        num_bins_x,
        num_bins_y,
        num_movable_impacted_bins_x, num_movable_impacted_bins_y,
        num_filler_impacted_bins_x, num_filler_impacted_bins_y)

    # NOTE(review): an older comment described `output` as a tuple
    # (density_cost, density_map, max_density); the code below treats it as
    # a single tensor holding the density map.
    ctx.node_size_x = node_size_x
    ctx.node_size_y = node_size_y
    ctx.bin_center_x = bin_center_x
    ctx.bin_center_y = bin_center_y
    ctx.target_density = target_density
    ctx.xl = xl
    ctx.yl = yl
    ctx.xh = xh
    ctx.yh = yh
    ctx.bin_size_x = bin_size_x
    ctx.bin_size_y = bin_size_y
    ctx.num_movable_nodes = num_movable_nodes
    ctx.num_filler_nodes = num_filler_nodes
    ctx.padding = padding
    ctx.num_bins_x = num_bins_x
    ctx.num_bins_y = num_bins_y
    ctx.num_movable_impacted_bins_x = num_movable_impacted_bins_x
    ctx.num_movable_impacted_bins_y = num_movable_impacted_bins_y
    ctx.num_filler_impacted_bins_x = num_filler_impacted_bins_x
    ctx.num_filler_impacted_bins_y = num_filler_impacted_bins_y
    ctx.pos = pos

    density_map = output.view([ctx.num_bins_x, ctx.num_bins_y])

    # for DCT
    M = num_bins_x
    N = num_bins_y
    # Each table falls back on its own, so a caller may supply only one of
    # them. The permutation tables are never read here and are not built.
    if expk_M is None:
        expk_M = discrete_spectral_transform.get_expk(
            M, dtype=density_map.dtype, device=density_map.device)
    if expk_N is None:
        expk_N = discrete_spectral_transform.get_expk(
            N, dtype=density_map.dtype, device=density_map.device)

    # wu and wv
    if inv_wu2_plus_wv2_2X is None:
        # NOTE(review): unlike the module-level precomputation, this
        # fallback does not scale wv by bin_size_x / bin_size_y; confirm
        # whether non-square bins are expected on this path.
        wu = torch.arange(M,
                          dtype=density_map.dtype,
                          device=density_map.device).mul(2 * np.pi / M).view([M, 1])
        wv = torch.arange(N,
                          dtype=density_map.dtype,
                          device=density_map.device).mul(2 * np.pi / N).view([1, N])
        wu2_plus_wv2 = wu.pow(2) + wv.pow(2)
        wu2_plus_wv2[0, 0] = 1.0  # avoid zero-division, it will be zeroed out
        inv_wu2_plus_wv2_2X = 2.0 / wu2_plus_wv2
        inv_wu2_plus_wv2_2X[0, 0] = 0.0
        wu_by_wu2_plus_wv2_2X = wu.mul(inv_wu2_plus_wv2_2X)
        wv_by_wu2_plus_wv2_2X = wv.mul(inv_wu2_plus_wv2_2X)

    # compute auv; the density map is divided by the bin area in place
    density_map.mul_(1.0 / (ctx.bin_size_x * ctx.bin_size_y))
    auv = dct.dct2(density_map, expk0=expk_M, expk1=expk_N)
    # halve the first row and the first column (element [0, 0] is halved twice)
    auv[0, :].mul_(0.5)
    auv[:, 0].mul_(0.5)

    # compute field xi
    auv_by_wu2_plus_wv2_wu = auv.mul(wu_by_wu2_plus_wv2_2X)
    auv_by_wu2_plus_wv2_wv = auv.mul(wv_by_wu2_plus_wv2_2X)
    ctx.field_map_x = dct.idsct2(auv_by_wu2_plus_wv2_wu, expk_M, expk_N)
    ctx.field_map_y = dct.idcst2(auv_by_wu2_plus_wv2_wv, expk_M, expk_N)

    # energy = \sum q*phi
    # it takes around 80% of the computation time
    # so I will not always evaluate it
    if fast_mode:
        # dummy for invoking backward propagation
        energy = torch.zeros(1, dtype=pos.dtype, device=pos.device)
    else:
        # compute potential phi
        # auv / (wu**2 + wv**2)
        auv_by_wu2_plus_wv2 = auv.mul(inv_wu2_plus_wv2_2X).mul_(2)
        potential_map = dct.idcct2(auv_by_wu2_plus_wv2, expk_M, expk_N)
        # compute energy
        energy = potential_map.mul_(density_map).sum()

    # Only wait for the GPU when the data lives there; an unconditional
    # synchronize fails on the CPU path of a host without CUDA.
    if pos.is_cuda:
        torch.cuda.synchronize()
    return energy
def forward(self, pos):
    """Evaluate ``ElectricPotentialFunction`` at ``pos``.

    On the first call (``self.initial_density_map is None``) the method
    precomputes and caches on ``self``:

    * the fixed-node density map from the CUDA or C++ ``fixed_density_map``
      kernel (chosen by ``pos.is_cuda``), scaled in place by
      ``self.target_density``;
    * the permutation and expk tables for both bin dimensions;
    * the frequency-domain scale factors, with ``wv`` additionally scaled
      by ``bin_size_x / bin_size_y``.

    Later calls reuse the cached tensors and go straight to the autograd
    function.
    """
    if self.initial_density_map is None:
        first_fixed = self.num_movable_nodes
        last_fixed = self.num_movable_nodes + self.num_terminals

        def impacted_bins(node_sizes, bin_size, num_bins):
            # Bins reached by the largest fixed node, capped at the grid size.
            largest = node_sizes[first_fixed:last_fixed].max()
            return int(((largest + bin_size) / bin_size).ceil().clamp(max=num_bins))

        if self.num_terminals == 0:
            num_fixed_impacted_bins_x = 0
            num_fixed_impacted_bins_y = 0
        else:
            num_fixed_impacted_bins_x = impacted_bins(
                self.node_size_x, self.bin_size_x, self.num_bins_x)
            num_fixed_impacted_bins_y = impacted_bins(
                self.node_size_y, self.bin_size_y, self.num_bins_y)

        # Both backends take the same argument list.
        backend = electric_potential_cuda if pos.is_cuda else electric_potential_cpp
        self.initial_density_map = backend.fixed_density_map(
            pos.view(pos.numel()),
            self.node_size_x, self.node_size_y,
            self.bin_center_x, self.bin_center_y,
            self.xl, self.yl, self.xh, self.yh,
            self.bin_size_x, self.bin_size_y,
            self.num_movable_nodes, self.num_terminals,
            self.num_bins_x, self.num_bins_y,
            num_fixed_impacted_bins_x, num_fixed_impacted_bins_y)
        # scale density of fixed macros
        self.initial_density_map.mul_(self.target_density)

        # permutation and expk tables
        M, N = self.num_bins_x, self.num_bins_y
        self.perm_M = discrete_spectral_transform.get_perm(
            M, dtype=torch.int64, device=pos.device)
        self.perm_N = discrete_spectral_transform.get_perm(
            N, dtype=torch.int64, device=pos.device)
        self.expk_M = discrete_spectral_transform.get_expk(
            M, dtype=pos.dtype, device=pos.device)
        self.expk_N = discrete_spectral_transform.get_expk(
            N, dtype=pos.dtype, device=pos.device)

        # wu and wv
        wu = torch.arange(M, dtype=pos.dtype, device=pos.device)
        wu = wu.mul(2 * np.pi / M).view([M, 1])
        # scale wv because the aspect ratio of a bin may not be 1
        wv = torch.arange(N, dtype=pos.dtype, device=pos.device)
        wv = wv.mul(2 * np.pi / N).view([1, N])
        wv.mul_(self.bin_size_x / self.bin_size_y)

        wu2_plus_wv2 = wu.pow(2) + wv.pow(2)
        wu2_plus_wv2[0, 0] = 1.0  # avoid zero-division, it will be zeroed out
        self.inv_wu2_plus_wv2_2X = 2.0 / wu2_plus_wv2
        self.inv_wu2_plus_wv2_2X[0, 0] = 0.0
        self.wu_by_wu2_plus_wv2_2X = wu.mul(self.inv_wu2_plus_wv2_2X)
        self.wv_by_wu2_plus_wv2_2X = wv.mul(self.inv_wu2_plus_wv2_2X)

    return ElectricPotentialFunction.apply(
        pos,
        self.node_size_x, self.node_size_y,
        self.bin_center_x, self.bin_center_y,
        self.initial_density_map,
        self.target_density,
        self.xl, self.yl, self.xh, self.yh,
        self.bin_size_x, self.bin_size_y,
        self.num_movable_nodes, self.num_filler_nodes,
        self.padding, self.padding_mask,
        self.num_bins_x, self.num_bins_y,
        self.num_movable_impacted_bins_x, self.num_movable_impacted_bins_y,
        self.num_filler_impacted_bins_x, self.num_filler_impacted_bins_y,
        self.perm_M, self.perm_N,
        self.expk_M, self.expk_N,
        self.inv_wu2_plus_wv2_2X,
        self.wu_by_wu2_plus_wv2_2X,
        self.wv_by_wu2_plus_wv2_2X,
        self.fast_mode)
def compare_different_methods(cuda_flag, M=1024, N=1024, dtype=torch.float64):
    """Check that three implementations of the field/potential solve agree.

    A random ``M x N`` density map is pushed through:

    1. ``dct.dct2`` / ``dct.idsct2`` / ``dct.idcst2`` / ``dct.idcct2``,
       which serves as the ground truth;
    2. the ``dct2_fft2`` modules built from the ``get_exact_expk`` tables;
    3. the ``dct`` modules built from the ``get_expk`` tables.

    The spectrum, both field maps, the potential map and the energy of
    approaches 2 and 3 are compared with the ground truth through
    ``np.testing.assert_allclose`` (``rtol=1e-6``, ``atol=1e-5``).

    Args:
        cuda_flag: move the density map (and so every derived tensor) to CUDA.
        M, N: density map dimensions.
        dtype: floating-point dtype of the density map.

    Raises:
        AssertionError: if any compared quantity is out of tolerance.
    """
    density_map = torch.empty(M, N, dtype=dtype).uniform_(0, 10.0)
    if cuda_flag:
        density_map = density_map.cuda()
    device = density_map.device

    expkM = discrete_spectral_transform.get_expk(M, dtype, device)
    expkN = discrete_spectral_transform.get_expk(N, dtype, device)
    exact_expkM = discrete_spectral_transform.get_exact_expk(M, dtype, device)
    exact_expkN = discrete_spectral_transform.get_exact_expk(N, dtype, device)
    print("M = {}, N = {}".format(M, N))

    wu = torch.arange(M, dtype=density_map.dtype, device=device).mul(2 * np.pi / M).view([M, 1])
    wv = torch.arange(N, dtype=density_map.dtype, device=device).mul(2 * np.pi / N).view([1, N])
    wu2_plus_wv2 = wu.pow(2) + wv.pow(2)
    wu2_plus_wv2[0, 0] = 1.0  # avoid zero-division, it will be zeroed out
    inv_wu2_plus_wv2_2X = 2.0 / wu2_plus_wv2
    inv_wu2_plus_wv2_2X[0, 0] = 0.0
    wu_by_wu2_plus_wv2_2X = wu.mul(inv_wu2_plus_wv2_2X)
    wv_by_wu2_plus_wv2_2X = wv.mul(inv_wu2_plus_wv2_2X)

    def wait_for_device():
        if density_map.is_cuda:
            torch.cuda.synchronize()

    def as_numpy(tensor):
        return tensor.data.cpu().numpy()

    # the first approach is used as the ground truth
    auv_golden = dct.dct2(density_map, expk0=expkM, expk1=expkN)
    auv = auv_golden.clone()
    auv[0, :].mul_(0.5)
    auv[:, 0].mul_(0.5)
    auv_by_wu2_plus_wv2_wu = auv.mul(wu_by_wu2_plus_wv2_2X)
    auv_by_wu2_plus_wv2_wv = auv.mul(wv_by_wu2_plus_wv2_2X)
    field_map_x_golden = dct.idsct2(auv_by_wu2_plus_wv2_wu, expkM, expkN)
    field_map_y_golden = dct.idcst2(auv_by_wu2_plus_wv2_wv, expkM, expkN)
    # potential phi: auv / (wu**2 + wv**2)
    auv_by_wu2_plus_wv2 = auv.mul(inv_wu2_plus_wv2_2X).mul_(2)
    potential_map_golden = dct.idcct2(auv_by_wu2_plus_wv2, expkM, expkN)
    energy_golden = potential_map_golden.mul(density_map).sum()
    wait_for_device()

    golden = (auv_golden, field_map_x_golden, field_map_y_golden,
              potential_map_golden, energy_golden)

    def assert_matches_golden(candidate):
        # Order: spectrum, field x, field y, potential, energy.
        for actual, expected in zip(candidate, golden):
            np.testing.assert_allclose(as_numpy(actual), as_numpy(expected),
                                       rtol=1e-6, atol=1e-5)

    # the second approach uses the idxst_idct and idct_idxst
    dct2 = dct2_fft2.DCT2(exact_expkM, exact_expkN)
    idct2 = dct2_fft2.IDCT2(exact_expkM, exact_expkN)
    idct_idxst = dct2_fft2.IDCT_IDXST(exact_expkM, exact_expkN)
    idxst_idct = dct2_fft2.IDXST_IDCT(exact_expkM, exact_expkN)

    inv_wu2_plus_wv2 = 1.0 / wu2_plus_wv2
    inv_wu2_plus_wv2[0, 0] = 0.0
    wu_by_wu2_plus_wv2_half = wu.mul(inv_wu2_plus_wv2).mul_(0.5)
    wv_by_wu2_plus_wv2_half = wv.mul(inv_wu2_plus_wv2).mul_(0.5)

    def solve(forward_dct2, inverse_dct2, inverse_idct_idxst, inverse_idxst_idct):
        # Shared pipeline of approaches two and three.
        spectrum = forward_dct2.forward(density_map)
        scaled_by_wu = spectrum.mul(wu_by_wu2_plus_wv2_half)
        scaled_by_wv = spectrum.mul(wv_by_wu2_plus_wv2_half)
        field_x = inverse_idxst_idct.forward(scaled_by_wu)
        field_y = inverse_idct_idxst.forward(scaled_by_wv)
        potential = inverse_dct2.forward(spectrum.mul(inv_wu2_plus_wv2))
        total_energy = potential.mul(density_map).sum()
        wait_for_device()
        return spectrum, field_x, field_y, potential, total_energy

    assert_matches_golden(solve(dct2, idct2, idct_idxst, idxst_idct))

    # the third approach uses dct.IDXST_IDCT and dct.IDCT_IDXST
    dct2 = dct.DCT2(expkM, expkN)
    idct2 = dct.IDCT2(expkM, expkN)
    idct_idxst = dct.IDCT_IDXST(expkM, expkN)
    idxst_idct = dct.IDXST_IDCT(expkM, expkN)

    assert_matches_golden(solve(dct2, idct2, idct_idxst, idxst_idct))
scipy_auv[0, 1:] *= np.sqrt(2.0) / np.sqrt(M*N) scipy_auv[1:, 0] *= np.sqrt(2.0) / np.sqrt(M*N) scipy_auv[0, 0] *= 1.0 / np.sqrt(M*N) ratio = scipy_auv/auv_map print("scipy_auv/auv_map") print(ratio.min()) print(ratio.max()) print(ratio.mean()) np.testing.assert_allclose(scipy_auv, auv_map, rtol=3e-1) """ density_map = torch.from_numpy(density_map) # for DCT M = density_map.shape[0] N = density_map.shape[1] expk_M = discrete_spectral_transform.get_expk(M, dtype=density_map.dtype, device=density_map.device) expk_N = discrete_spectral_transform.get_expk(N, dtype=density_map.dtype, device=density_map.device) # wu and wv wu = torch.arange(M, dtype=density_map.dtype, device=density_map.device).mul(2*np.pi/M).view([M, 1]) wv = torch.arange(N, dtype=density_map.dtype, device=density_map.device).mul(2*np.pi/N).view([1, N]) wu2_plus_wv2 = wu.pow(2) + wv.pow(2) wu2_plus_wv2[0, 0] = 1.0 # avoid zero-division, it will be zeroed out inv_wu2_plus_wv2_2X = 2.0 / wu2_plus_wv2 inv_wu2_plus_wv2_2X[0, 0] = 0.0 wu_by_wu2_plus_wv2 = wu.mul(inv_wu2_plus_wv2_2X) wv_by_wu2_plus_wv2 = wv.mul(inv_wu2_plus_wv2_2X) # compute auv auv = discrete_spectral_transform.dct2(density_map, expk_M, expk_N) auv[0, :].mul_(0.5) auv[:, 0].mul_(0.5)