示例#1
0
文件: dct.py 项目: leotam/DREAMPlace
 def forward(self, x):
     """Apply ``IDSCT2Function`` to ``x``, refreshing cached factors if needed.

     ``self.expk0`` is rebuilt from ``x.size(-2)`` and ``self.expk1`` from
     ``x.size(-1)`` whenever the cached tensor is missing or its
     second-to-last dimension no longer matches that size.

     Args:
         x: input tensor; its last two dimensions select the factor sizes.

     Returns:
         The result of ``IDSCT2Function.apply(x, self.expk0, self.expk1)``.
     """
     size0 = x.size(-2)
     size1 = x.size(-1)

     cached0 = self.expk0
     if cached0 is None or cached0.size(-2) != size0:
         self.expk0 = discrete_spectral_transform.get_expk(
             size0, dtype=x.dtype, device=x.device)

     cached1 = self.expk1
     if cached1 is None or cached1.size(-2) != size1:
         self.expk1 = discrete_spectral_transform.get_expk(
             size1, dtype=x.dtype, device=x.device)

     return IDSCT2Function.apply(x, self.expk0, self.expk1)
示例#2
0
def eval_runtime():
    """Run the runtime benchmarks on a random 1024 x 1024 float64 CUDA tensor.

    Builds the input, prints its dimensions, precomputes the two families of
    factors (``get_expk`` and ``get_exact_expk``) for both dimensions, and
    then invokes each ``eval_*`` benchmark with 100 runs.
    """
    runs = 100

    M = 1024
    N = 1024
    dtype = torch.float64
    x = torch.empty(M, N, dtype=dtype).uniform_(0, 10.0).cuda()

    print("M = {}, N = {}".format(M, N))

    # All factor tensors share the input's dtype and device.
    like_x = dict(dtype=x.dtype, device=x.device)

    # 2cos(), 2sin()
    expk0 = discrete_spectral_transform.get_expk(M, **like_x)
    expk1 = discrete_spectral_transform.get_expk(N, **like_x)
    # cos(), -sin()
    expkM = discrete_spectral_transform.get_exact_expk(M, **like_x)
    expkN = discrete_spectral_transform.get_exact_expk(N, **like_x)

    # The FFT baselines only need the input itself.
    eval_torch_rfft1d(x, runs)
    eval_torch_rfft2d(x, runs)

    # The remaining benchmarks all take the same argument list.
    for benchmark in (eval_dct2d, eval_idct2d, eval_idxt2d, eval_others):
        benchmark(x, expk0, expk1, expkM, expkN, runs)
示例#3
0
文件: dct.py 项目: leotam/DREAMPlace
 def forward(self, x):
     """Apply ``IDCTFunction`` to ``x``, refreshing the cached factor if needed.

     ``self.expk`` is rebuilt from ``x.size(-1)`` when it is missing or when
     its second-to-last dimension differs from that size.

     Args:
         x: input tensor; its last dimension selects the factor size.

     Returns:
         The result of ``IDCTFunction.apply(x, self.expk, self.algorithm)``.
     """
     length = x.size(-1)
     cached = self.expk
     needs_refresh = cached is None or cached.size(-2) != length
     if needs_refresh:
         self.expk = discrete_spectral_transform.get_expk(
             length, dtype=x.dtype, device=x.device)
     return IDCTFunction.apply(x, self.expk, self.algorithm)
示例#4
0
def eval_runtime():
    """Benchmark the 2-D inverse DCT wrappers on a CUDA tensor and print timings.

    Builds ten random 512 x 512 float64 matrices on the GPU, then times
    ``dct.IDCT2`` with ``algorithm='2N'`` and with ``algorithm='N'`` for
    ``runs`` iterations each, printing the average wall-clock time per call
    in milliseconds.

    NOTE: the function calls ``exit()`` right after those two measurements,
    so the benchmarks further down (fft2, idcct2, idcst2, idsct2 and their
    ``dct`` module counterparts) are never executed as the code stands.
    """
    #x = torch.tensor([1, 2, 7, 9, 20, 31], dtype=torch.float64)
    #print(dct_N(x))

    N = 512
    runs = 10
    # Batch of 10 random N x N matrices; the loops below cycle through them
    # with x[i%10].
    x = torch.empty(10, N, N, dtype=torch.float64).uniform_(0, 10.0).cuda()
    # perm is only referenced by the commented-out dct2_N benchmark below.
    perm = discrete_spectral_transform.get_perm(N, dtype=torch.int64, device=x.device)
    expk = discrete_spectral_transform.get_expk(N, dtype=x.dtype, device=x.device)

    #x_numpy = x.data.cpu().numpy()
    #tt = time.time()
    #for i in range(runs): 
    #    y = fftpack.dct(fftpack.dct(x_numpy[i%10].T, norm=None).T, norm=None)
    #print("scipy takes %.3f sec" % (time.time()-tt))

    ## 9s for 200 iterations 1024x1024 on GTX 1080
    #torch.cuda.synchronize()
    #tt = time.time()
    ##with torch.autograd.profiler.profile(use_cuda=True) as prof:
    #for i in range(runs): 
    #    y_2N = dct2_2N(x[0], expk0=expk, expk1=expk)
    #torch.cuda.synchronize()
    ##print(prof)
    #print("dct_2N takes %.3f ms" % ((time.time()-tt)/runs*1000))

    ## 11s for 200 iterations 1024x1024 on GTX 1080
    #torch.cuda.synchronize()
    #tt = time.time()
    ##with torch.autograd.profiler.profile(use_cuda=True) as prof:
    #for i in range(runs): 
    #    y_N = discrete_spectral_transform.dct2_N(x[i%10], perm0=perm, expk0=expk, perm1=perm, expk1=expk)
    #torch.cuda.synchronize()
    ##print(prof)
    #print("dct_N takes %.3f ms" % ((time.time()-tt)/runs*1000))

    #dct2func = dct.DCT2(expk, expk, algorithm='2N')
    #torch.cuda.synchronize()
    #tt = time.time()
    ##with torch.autograd.profiler.profile(use_cuda=True) as prof:
    #for i in range(runs): 
    #    y_N = dct2func.forward(x[0])
    #torch.cuda.synchronize()
    ##print(prof)
    #print("DCT2Function 2N takes %.3f ms" % ((time.time()-tt)/runs*1000))

    #dct2func = dct.DCT2(expk, expk, algorithm='N')
    #torch.cuda.synchronize()
    #tt = time.time()
    ##with torch.autograd.profiler.profile(use_cuda=True) as prof:
    #for i in range(runs): 
    #    y_N = dct2func.forward(x[0])
    #torch.cuda.synchronize()
    ##print(prof)
    #print("DCT2Function takes %.3f ms" % ((time.time()-tt)/runs*1000))

    #dct2func = dct_lee.DCT2(expk, expk)
    #torch.cuda.synchronize()
    #tt = time.time()
    ##with torch.autograd.profiler.profile(use_cuda=True) as prof:
    #for i in range(runs): 
    #    y_N = dct2func.forward(x[i%10])
    #torch.cuda.synchronize()
    ##print(prof)
    #print("DCT2Function lee takes %.3f ms" % ((time.time()-tt)/runs*1000))

    #torch.cuda.synchronize()
    #tt = time.time()
    ##with torch.autograd.profiler.profile(use_cuda=True) as prof:
    #for i in range(runs): 
    #    y_N = discrete_spectral_transform.idct2_2N(x[i%10], expk0=expk, expk1=expk)
    #torch.cuda.synchronize()
    ##print(prof)
    #print("idct2_2N takes %.3f ms" % ((time.time()-tt)/runs*1000))

    # --- Active benchmark 1: IDCT2 with the '2N' algorithm ---
    # Synchronize before starting and before stopping the clock so that the
    # measured interval covers the queued GPU work.
    idct2func = dct.IDCT2(expk, expk, algorithm='2N')
    torch.cuda.synchronize()
    tt = time.time()
    #with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(runs): 
        y_N = idct2func.forward(x[i%10])
    torch.cuda.synchronize()
    #print(prof)
    print("IDCT2Function 2N takes %.3f ms" % ((time.time()-tt)/runs*1000))

    # --- Active benchmark 2: IDCT2 with the 'N' algorithm ---
    idct2func = dct.IDCT2(expk, expk, algorithm='N')
    torch.cuda.synchronize()
    tt = time.time()
    #with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(runs): 
        y_N = idct2func.forward(x[i%10])
    torch.cuda.synchronize()
    #print(prof)
    print("IDCT2Function takes %.3f ms" % ((time.time()-tt)/runs*1000))
    # Terminates the interpreter here; everything below is unreachable.
    exit()

    #torch.cuda.synchronize()
    #tt = time.time()
    ##with torch.autograd.profiler.profile(use_cuda=True) as prof:
    #for i in range(runs): 
    #    y_N = discrete_spectral_transform.idxt(x[i%10], 0, expk=expk)
    #torch.cuda.synchronize()
    ##print(prof)
    #print("idxt takes %.3f ms" % ((time.time()-tt)/runs*1000))

    #idxct_func = dct.IDXCT(expk)
    #torch.cuda.synchronize()
    #tt = time.time()
    ##with torch.autograd.profiler.profile(use_cuda=True) as prof:
    #for i in range(runs): 
    #    y_N = idxct_func.forward(x[i%10])
    #torch.cuda.synchronize()
    ##print(prof)
    #print("IDXCTFunction takes %.3f ms" % ((time.time()-tt)/runs*1000))

    # Unreachable: baseline 2-D FFT timing.
    # NOTE(review): torch.rfft belongs to the legacy FFT API that newer
    # PyTorch releases replaced with the torch.fft module; confirm the
    # targeted PyTorch version before re-enabling this section.
    torch.cuda.synchronize()
    tt = time.time()
    #with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(runs): 
        y_N = torch.rfft(x[i%10].view([1, N, N]), signal_ndim=2, onesided=False)
    torch.cuda.synchronize()
    #print(prof)
    print("fft2 takes %.3f ms" % ((time.time()-tt)/runs*1000))

    # Unreachable: functional idcct2 from discrete_spectral_transform.
    torch.cuda.synchronize()
    tt = time.time()
    #with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(runs): 
        y_N = discrete_spectral_transform.idcct2(x[i%10], expk_0=expk, expk_1=expk)
    torch.cuda.synchronize()
    #print(prof)
    print("idcct2 takes %.3f ms" % ((time.time()-tt)/runs*1000))

    # Unreachable: dct.IDCCT2 wrapper.
    func = dct.IDCCT2(expk, expk)
    torch.cuda.synchronize()
    tt = time.time()
    #with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(runs): 
        y_N = func.forward(x[i%10])
    torch.cuda.synchronize()
    #print(prof)
    print("IDCCT2Function takes %.3f ms" % ((time.time()-tt)/runs*1000))

    # Unreachable: functional idcst2 from discrete_spectral_transform.
    torch.cuda.synchronize()
    tt = time.time()
    #with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(runs): 
        y_N = discrete_spectral_transform.idcst2(x[i%10], expk_0=expk, expk_1=expk)
    torch.cuda.synchronize()
    #print(prof)
    print("idcst2 takes %.3f ms" % ((time.time()-tt)/runs*1000))

    # Unreachable: dct.IDCST2 wrapper.
    func = dct.IDCST2(expk, expk)
    torch.cuda.synchronize()
    tt = time.time()
    #with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(runs): 
        y_N = func.forward(x[i%10])
    torch.cuda.synchronize()
    #print(prof)
    print("IDCST2Function takes %.3f ms" % ((time.time()-tt)/runs*1000))

    # Unreachable: functional idsct2 from discrete_spectral_transform.
    torch.cuda.synchronize()
    tt = time.time()
    #with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(runs): 
        y_N = discrete_spectral_transform.idsct2(x[i%10], expk_0=expk, expk_1=expk)
    torch.cuda.synchronize()
    #print(prof)
    print("idsct2 takes %.3f ms" % ((time.time()-tt)/runs*1000))

    # Unreachable: dct.IDSCT2 wrapper.
    func = dct.IDSCT2(expk, expk)
    torch.cuda.synchronize()
    tt = time.time()
    #with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(runs): 
        y_N = func.forward(x[i%10])
    torch.cuda.synchronize()
    #print(prof)
    print("IDSCT2Function takes %.3f ms" % ((time.time()-tt)/runs*1000))
示例#5
0
    def forward(
        ctx,
        pos,
        node_size_x,
        node_size_y,
        bin_center_x,
        bin_center_y,
        initial_density_map,
        target_density,
        xl,
        yl,
        xh,
        yh,
        bin_size_x,
        bin_size_y,
        num_movable_nodes,
        num_filler_nodes,
        padding,
        padding_mask,  # same dimensions as density map, with padding regions to be 1
        num_bins_x,
        num_bins_y,
        num_movable_impacted_bins_x,
        num_movable_impacted_bins_y,
        num_filler_impacted_bins_x,
        num_filler_impacted_bins_y,
        perm_M=None,  # permutation
        perm_N=None,  # permutation
        expk_M=None,  # 2*exp(j*pi*k/M)
        expk_N=None,  # 2*exp(j*pi*k/N)
        inv_wu2_plus_wv2_2X=None,  # 2.0/(wu^2 + wv^2)
        wu_by_wu2_plus_wv2_2X=None,  # 2*wu/(wu^2 + wv^2)
        wv_by_wu2_plus_wv2_2X=None,  # 2*wv/(wu^2 + wv^2)
        fast_mode=True  # fast mode will discard some computation
    ):
        """Compute the density map, the field maps and (optionally) the energy.

        The density map is produced by the CUDA or C++ ``density_map`` kernel
        (selected by ``pos.is_cuda``), viewed as ``[num_bins_x, num_bins_y]``
        and scaled in place by ``1 / (bin_size_x * bin_size_y)``. Its 2-D DCT
        coefficients are then combined with the frequency factors to obtain
        ``ctx.field_map_x`` (via ``dct.idsct2``) and ``ctx.field_map_y`` (via
        ``dct.idcst2``). The geometry arguments and ``pos`` are also stored
        on ``ctx``.

        Args:
            ctx: autograd context that receives the cached attributes.
            pos: position tensor; flattened before being passed to the kernel.
            node_size_x .. num_filler_impacted_bins_y: forwarded unchanged to
                the ``density_map`` kernel (most are also cached on ``ctx``).
            perm_M, perm_N: optional permutations; recomputed together with
                the ``expk`` factors when ``expk_M`` is None.
            expk_M, expk_N: optional precomputed factors for the transforms.
            inv_wu2_plus_wv2_2X, wu_by_wu2_plus_wv2_2X, wv_by_wu2_plus_wv2_2X:
                optional precomputed frequency factors; all three are rebuilt
                when ``inv_wu2_plus_wv2_2X`` is None.
            fast_mode: when true, skip the potential/energy evaluation and
                return a zero placeholder instead.

        Returns:
            A one-element zero tensor in fast mode, otherwise the sum of
            ``potential_map * density_map``.
        """
        flat_pos = pos.view(pos.numel())
        if pos.is_cuda:
            output = electric_potential_cuda.density_map(
                flat_pos, node_size_x, node_size_y, bin_center_x,
                bin_center_y, initial_density_map, target_density, xl, yl, xh,
                yh, bin_size_x, bin_size_y, num_movable_nodes,
                num_filler_nodes, padding, padding_mask, num_bins_x,
                num_bins_y, num_movable_impacted_bins_x,
                num_movable_impacted_bins_y, num_filler_impacted_bins_x,
                num_filler_impacted_bins_y)
        else:
            output = electric_potential_cpp.density_map(
                flat_pos, node_size_x, node_size_y, bin_center_x,
                bin_center_y, initial_density_map, target_density, xl, yl, xh,
                yh, bin_size_x, bin_size_y, num_movable_nodes,
                num_filler_nodes, padding, padding_mask, num_bins_x,
                num_bins_y, num_movable_impacted_bins_x,
                num_movable_impacted_bins_y, num_filler_impacted_bins_x,
                num_filler_impacted_bins_y)

        # Cache everything on the context alongside the field maps.
        # NOTE(review): presumably consumed by the matching backward pass,
        # which is not visible here.
        ctx.node_size_x = node_size_x
        ctx.node_size_y = node_size_y
        ctx.bin_center_x = bin_center_x
        ctx.bin_center_y = bin_center_y
        ctx.target_density = target_density
        ctx.xl = xl
        ctx.yl = yl
        ctx.xh = xh
        ctx.yh = yh
        ctx.bin_size_x = bin_size_x
        ctx.bin_size_y = bin_size_y
        ctx.num_movable_nodes = num_movable_nodes
        ctx.num_filler_nodes = num_filler_nodes
        ctx.padding = padding
        ctx.num_bins_x = num_bins_x
        ctx.num_bins_y = num_bins_y
        ctx.num_movable_impacted_bins_x = num_movable_impacted_bins_x
        ctx.num_movable_impacted_bins_y = num_movable_impacted_bins_y
        ctx.num_filler_impacted_bins_x = num_filler_impacted_bins_x
        ctx.num_filler_impacted_bins_y = num_filler_impacted_bins_y
        ctx.pos = pos
        density_map = output.view([ctx.num_bins_x, ctx.num_bins_y])

        # for DCT
        M = num_bins_x
        N = num_bins_y
        if expk_M is None:
            # perm_M / perm_N are rebuilt here as in the original code even
            # though nothing below reads them.
            perm_M = discrete_spectral_transform.get_perm(
                M, dtype=torch.int64, device=density_map.device)
            perm_N = discrete_spectral_transform.get_perm(
                N, dtype=torch.int64, device=density_map.device)
            expk_M = discrete_spectral_transform.get_expk(
                M, dtype=density_map.dtype, device=density_map.device)
            expk_N = discrete_spectral_transform.get_expk(
                N, dtype=density_map.dtype, device=density_map.device)
        # wu and wv
        if inv_wu2_plus_wv2_2X is None:
            wu = torch.arange(M,
                              dtype=density_map.dtype,
                              device=density_map.device).mul(2 * np.pi /
                                                             M).view([M, 1])
            wv = torch.arange(N,
                              dtype=density_map.dtype,
                              device=density_map.device).mul(2 * np.pi /
                                                             N).view([1, N])
            wu2_plus_wv2 = wu.pow(2) + wv.pow(2)
            # avoid zero-division, it will be zeroed out
            wu2_plus_wv2[0, 0] = 1.0
            inv_wu2_plus_wv2_2X = 2.0 / wu2_plus_wv2
            inv_wu2_plus_wv2_2X[0, 0] = 0.0
            wu_by_wu2_plus_wv2_2X = wu.mul(inv_wu2_plus_wv2_2X)
            wv_by_wu2_plus_wv2_2X = wv.mul(inv_wu2_plus_wv2_2X)

        # compute auv
        density_map.mul_(1.0 / (ctx.bin_size_x * ctx.bin_size_y))
        auv = dct.dct2(density_map, expk0=expk_M, expk1=expk_N)
        # Halve the first row and the first column; the corner element
        # auv[0, 0] is therefore scaled twice (by 0.25 overall).
        auv[0, :].mul_(0.5)
        auv[:, 0].mul_(0.5)

        # compute field xi
        auv_by_wu2_plus_wv2_wu = auv.mul(wu_by_wu2_plus_wv2_2X)
        auv_by_wu2_plus_wv2_wv = auv.mul(wv_by_wu2_plus_wv2_2X)
        ctx.field_map_x = dct.idsct2(auv_by_wu2_plus_wv2_wu, expk_M, expk_N)
        ctx.field_map_y = dct.idcst2(auv_by_wu2_plus_wv2_wv, expk_M, expk_N)

        # energy = \sum q*phi
        # it takes around 80% of the computation time
        # so I will not always evaluate it
        if fast_mode:  # dummy for invoking backward propagation
            energy = torch.zeros(1, dtype=pos.dtype, device=pos.device)
        else:
            # compute potential phi
            # auv / (wu**2 + wv**2)
            auv_by_wu2_plus_wv2 = auv.mul(inv_wu2_plus_wv2_2X).mul_(2)
            potential_map = dct.idcct2(auv_by_wu2_plus_wv2, expk_M, expk_N)
            # compute energy
            energy = potential_map.mul_(density_map).sum()

        # Synchronize only when the work ran on a CUDA device: calling
        # torch.cuda.synchronize() unconditionally breaks the CPU path on
        # machines (or PyTorch builds) without CUDA.
        if pos.is_cuda:
            torch.cuda.synchronize()
        return energy
示例#6
0
    def forward(self, pos):
        """Evaluate the electric potential function for ``pos``.

        On the first call (``self.initial_density_map`` is None) this builds
        and caches the fixed-node density map, scaled by
        ``self.target_density``, together with the permutations, the ``expk``
        factors and the frequency factors. Every call then forwards to
        ``ElectricPotentialFunction.apply`` with the cached state.

        Args:
            pos: position tensor; its device and dtype determine the backend
                kernel and the dtype/device of the cached tensors.

        Returns:
            Whatever ``ElectricPotentialFunction.apply`` returns.
        """
        if self.initial_density_map is None:
            # Number of bins a fixed node may overlap in each direction.
            if self.num_terminals == 0:
                num_fixed_impacted_bins_x = 0
                num_fixed_impacted_bins_y = 0
            else:
                fixed_begin = self.num_movable_nodes
                fixed_end = self.num_movable_nodes + self.num_terminals

                max_fixed_width = self.node_size_x[fixed_begin:fixed_end].max()
                span_x = (max_fixed_width + self.bin_size_x) / self.bin_size_x
                num_fixed_impacted_bins_x = int(
                    span_x.ceil().clamp(max=self.num_bins_x))

                max_fixed_height = self.node_size_y[fixed_begin:fixed_end].max()
                span_y = (max_fixed_height + self.bin_size_y) / self.bin_size_y
                num_fixed_impacted_bins_y = int(
                    span_y.ceil().clamp(max=self.num_bins_y))

            # Both backends expose the same fixed_density_map signature.
            if pos.is_cuda:
                backend = electric_potential_cuda
            else:
                backend = electric_potential_cpp
            self.initial_density_map = backend.fixed_density_map(
                pos.view(pos.numel()), self.node_size_x, self.node_size_y,
                self.bin_center_x, self.bin_center_y, self.xl, self.yl,
                self.xh, self.yh, self.bin_size_x, self.bin_size_y,
                self.num_movable_nodes, self.num_terminals,
                self.num_bins_x, self.num_bins_y,
                num_fixed_impacted_bins_x, num_fixed_impacted_bins_y)
            # scale density of fixed macros
            self.initial_density_map.mul_(self.target_density)

            # Permutations and expk factors for both dimensions.
            M = self.num_bins_x
            N = self.num_bins_y
            self.perm_M = discrete_spectral_transform.get_perm(
                M, dtype=torch.int64, device=pos.device)
            self.perm_N = discrete_spectral_transform.get_perm(
                N, dtype=torch.int64, device=pos.device)
            self.expk_M = discrete_spectral_transform.get_expk(
                M, dtype=pos.dtype, device=pos.device)
            self.expk_N = discrete_spectral_transform.get_expk(
                N, dtype=pos.dtype, device=pos.device)

            # Frequencies wu (column vector) and wv (row vector).
            wu = torch.arange(M, dtype=pos.dtype, device=pos.device)
            wu = wu.mul(2 * np.pi / M).view([M, 1])
            wv = torch.arange(N, dtype=pos.dtype, device=pos.device)
            wv = wv.mul(2 * np.pi / N).view([1, N])
            # scale wv because the aspect ratio of a bin may not be 1
            wv.mul_(self.bin_size_x / self.bin_size_y)

            wu2_plus_wv2 = wu.pow(2) + wv.pow(2)
            # avoid zero-division, it will be zeroed out
            wu2_plus_wv2[0, 0] = 1.0
            self.inv_wu2_plus_wv2_2X = 2.0 / wu2_plus_wv2
            self.inv_wu2_plus_wv2_2X[0, 0] = 0.0
            self.wu_by_wu2_plus_wv2_2X = wu.mul(self.inv_wu2_plus_wv2_2X)
            self.wv_by_wu2_plus_wv2_2X = wv.mul(self.inv_wu2_plus_wv2_2X)

        return ElectricPotentialFunction.apply(
            pos,
            self.node_size_x, self.node_size_y,
            self.bin_center_x, self.bin_center_y,
            self.initial_density_map, self.target_density,
            self.xl, self.yl, self.xh, self.yh,
            self.bin_size_x, self.bin_size_y,
            self.num_movable_nodes, self.num_filler_nodes,
            self.padding, self.padding_mask,
            self.num_bins_x, self.num_bins_y,
            self.num_movable_impacted_bins_x,
            self.num_movable_impacted_bins_y,
            self.num_filler_impacted_bins_x,
            self.num_filler_impacted_bins_y,
            self.perm_M, self.perm_N,
            self.expk_M, self.expk_N,
            self.inv_wu2_plus_wv2_2X,
            self.wu_by_wu2_plus_wv2_2X,
            self.wv_by_wu2_plus_wv2_2X,
            self.fast_mode)
def compare_different_methods(cuda_flag, M=1024, N=1024, dtype=torch.float64):
    """Cross-check three implementations of the spectral field/potential solve.

    A random ``M x N`` density map is pushed through:

    1. ``dct.dct2`` / ``dct.idsct2`` / ``dct.idcst2`` / ``dct.idcct2``
       (used as the ground truth),
    2. the ``dct2_fft2`` operator classes built from the exact expk factors,
    3. the ``dct`` operator classes (``DCT2``, ``IDCT2``, ``IDCT_IDXST``,
       ``IDXST_IDCT``) built from the regular expk factors.

    The coefficients, both field maps, the potential map and the energy of
    approaches 2 and 3 must agree with approach 1 within ``rtol=1e-6`` and
    ``atol=1e-5``; otherwise ``np.testing.assert_allclose`` raises.

    Args:
        cuda_flag: move the density map to the GPU when true.
        M: number of rows of the density map.
        N: number of columns of the density map.
        dtype: torch dtype of the density map and of all factors.
    """

    def _sync_if_cuda():
        # Wait for pending GPU work before moving to the next stage.
        if density_map.is_cuda:
            torch.cuda.synchronize()

    def _assert_matches(actual, golden):
        np.testing.assert_allclose(actual.data.cpu().numpy(),
                                   golden.data.cpu().numpy(),
                                   rtol=1e-6,
                                   atol=1e-5)

    def _solve_and_verify(forward_op, inverse_op, idct_idxst_op, idxst_idct_op):
        # Run one operator family end to end and compare with the golden data.
        coeffs = forward_op.forward(density_map)

        coeffs_wu = coeffs.mul(wu_by_wu2_plus_wv2_half)
        coeffs_wv = coeffs.mul(wv_by_wu2_plus_wv2_half)
        field_map_x = idxst_idct_op.forward(coeffs_wu)
        field_map_y = idct_idxst_op.forward(coeffs_wv)
        coeffs_scaled = coeffs.mul(inv_wu2_plus_wv2)
        potential_map = inverse_op.forward(coeffs_scaled)
        energy = potential_map.mul(density_map).sum()

        _sync_if_cuda()

        # compare results
        _assert_matches(coeffs, auv_golden)
        _assert_matches(field_map_x, field_map_x_golden)
        _assert_matches(field_map_y, field_map_y_golden)
        _assert_matches(potential_map, potential_map_golden)
        _assert_matches(energy, energy_golden)

    density_map = torch.empty(M, N, dtype=dtype).uniform_(0, 10.0)
    if cuda_flag:
        density_map = density_map.cuda()
    device = density_map.device

    expkM = discrete_spectral_transform.get_expk(M, dtype, device)
    expkN = discrete_spectral_transform.get_expk(N, dtype, device)
    exact_expkM = discrete_spectral_transform.get_exact_expk(M, dtype, device)
    exact_expkN = discrete_spectral_transform.get_exact_expk(N, dtype, device)
    print("M = {}, N = {}".format(M, N))

    # Frequency grids: wu is a column vector, wv a row vector.
    wu = torch.arange(M, dtype=density_map.dtype, device=device)
    wu = wu.mul(2 * np.pi / M).view([M, 1])
    wv = torch.arange(N, dtype=density_map.dtype, device=device)
    wv = wv.mul(2 * np.pi / N).view([1, N])
    wu2_plus_wv2 = wu.pow(2) + wv.pow(2)
    # avoid zero-division, it will be zeroed out
    wu2_plus_wv2[0, 0] = 1.0

    inv_wu2_plus_wv2_2X = 2.0 / wu2_plus_wv2
    inv_wu2_plus_wv2_2X[0, 0] = 0.0
    wu_by_wu2_plus_wv2_2X = wu.mul(inv_wu2_plus_wv2_2X)
    wv_by_wu2_plus_wv2_2X = wv.mul(inv_wu2_plus_wv2_2X)

    # Approach 1: functional dct API, used as the ground truth.
    auv_golden = dct.dct2(density_map, expk0=expkM, expk1=expkN)
    auv = auv_golden.clone()
    auv[0, :].mul_(0.5)
    auv[:, 0].mul_(0.5)
    field_map_x_golden = dct.idsct2(auv.mul(wu_by_wu2_plus_wv2_2X), expkM,
                                    expkN)
    field_map_y_golden = dct.idcst2(auv.mul(wv_by_wu2_plus_wv2_2X), expkM,
                                    expkN)
    # potential phi from auv / (wu**2 + wv**2)
    auv_by_wu2_plus_wv2 = auv.mul(inv_wu2_plus_wv2_2X).mul_(2)
    potential_map_golden = dct.idcct2(auv_by_wu2_plus_wv2, expkM, expkN)
    # energy
    energy_golden = potential_map_golden.mul(density_map).sum()

    _sync_if_cuda()

    # Approach 2: dct2_fft2 operators (idxst_idct and idct_idxst).
    fft2_forward = dct2_fft2.DCT2(exact_expkM, exact_expkN)
    fft2_inverse = dct2_fft2.IDCT2(exact_expkM, exact_expkN)
    fft2_idct_idxst = dct2_fft2.IDCT_IDXST(exact_expkM, exact_expkN)
    fft2_idxst_idct = dct2_fft2.IDXST_IDCT(exact_expkM, exact_expkN)

    # Scaling factors shared by approaches 2 and 3.
    inv_wu2_plus_wv2 = 1.0 / wu2_plus_wv2
    inv_wu2_plus_wv2[0, 0] = 0.0
    wu_by_wu2_plus_wv2_half = wu.mul(inv_wu2_plus_wv2).mul_(0.5)
    wv_by_wu2_plus_wv2_half = wv.mul(inv_wu2_plus_wv2).mul_(0.5)

    _solve_and_verify(fft2_forward, fft2_inverse, fft2_idct_idxst,
                      fft2_idxst_idct)

    # Approach 3: dct operators (dct.idxst_idct and dct.idct_idxst).
    dct_forward = dct.DCT2(expkM, expkN)
    dct_inverse = dct.IDCT2(expkM, expkN)
    dct_idct_idxst = dct.IDCT_IDXST(expkM, expkN)
    dct_idxst_idct = dct.IDXST_IDCT(expkM, expkN)

    _solve_and_verify(dct_forward, dct_inverse, dct_idct_idxst,
                      dct_idxst_idct)
    scipy_auv[0, 1:] *= np.sqrt(2.0) / np.sqrt(M*N)
    scipy_auv[1:, 0] *= np.sqrt(2.0) / np.sqrt(M*N)
    scipy_auv[0, 0] *= 1.0 / np.sqrt(M*N)
    ratio = scipy_auv/auv_map
    print("scipy_auv/auv_map")
    print(ratio.min())
    print(ratio.max())
    print(ratio.mean())
    np.testing.assert_allclose(scipy_auv, auv_map, rtol=3e-1)
    """

    density_map = torch.from_numpy(density_map)
    # for DCT 
    M = density_map.shape[0]
    N = density_map.shape[1]
    expk_M = discrete_spectral_transform.get_expk(M, dtype=density_map.dtype, device=density_map.device)
    expk_N = discrete_spectral_transform.get_expk(N, dtype=density_map.dtype, device=density_map.device)
    # wu and wv 
    wu = torch.arange(M, dtype=density_map.dtype, device=density_map.device).mul(2*np.pi/M).view([M, 1])
    wv = torch.arange(N, dtype=density_map.dtype, device=density_map.device).mul(2*np.pi/N).view([1, N])
    wu2_plus_wv2 = wu.pow(2) + wv.pow(2)
    wu2_plus_wv2[0, 0] = 1.0 # avoid zero-division, it will be zeroed out 
    inv_wu2_plus_wv2_2X = 2.0 / wu2_plus_wv2
    inv_wu2_plus_wv2_2X[0, 0] = 0.0 
    wu_by_wu2_plus_wv2 = wu.mul(inv_wu2_plus_wv2_2X)
    wv_by_wu2_plus_wv2 = wv.mul(inv_wu2_plus_wv2_2X)
    
    # compute auv 
    auv = discrete_spectral_transform.dct2(density_map, expk_M, expk_N)
    auv[0, :].mul_(0.5)
    auv[:, 0].mul_(0.5)