Пример #1
0
def eval_runtime():
    """Time the inverse 2D DCT implementations on random CUDA data.

    A batch of ten 512x512 float64 matrices is created on the GPU and each
    benchmark cycles through the batch for ``runs`` iterations, printing the
    average time per call in milliseconds.

    Only the two ``dct.IDCT2`` variants ('2N' and 'N') are actually executed:
    the function calls ``exit()`` right after them, so the remaining
    benchmarks below that call are currently unreachable.

    Benchmarks that used to live here but are disabled: scipy ``fftpack.dct``,
    ``dct2_2N`` / ``dct2_N``, ``dct.DCT2`` ('2N' and 'N'), ``dct_lee.DCT2``,
    ``idct2_2N``, ``idxt`` and ``dct.IDXCT``. Historical notes kept with them:
    roughly 9s (2N) and 11s (N) for 200 iterations of 1024x1024 on a GTX 1080.
    """
    N = 512
    runs = 10
    x = torch.empty(10, N, N, dtype=torch.float64).uniform_(0, 10.0).cuda()
    # perm is not consumed by the active benchmarks; it is still built so the
    # setup work matches what the disabled dct2_N benchmark expects.
    perm = discrete_spectral_transform.get_perm(N, dtype=torch.int64, device=x.device)
    expk = discrete_spectral_transform.get_expk(N, dtype=x.dtype, device=x.device)

    def report(fmt, transform):
        # Synchronize on both sides so queued GPU work is fully accounted for.
        # Wrap the loop in torch.autograd.profiler.profile(use_cuda=True) to
        # get a per-kernel breakdown instead.
        torch.cuda.synchronize()
        started = time.time()
        for run in range(runs):
            transform(x[run % 10])
        torch.cuda.synchronize()
        print(fmt % ((time.time() - started) / runs * 1000))

    idct2func = dct.IDCT2(expk, expk, algorithm='2N')
    report("IDCT2Function 2N takes %.3f ms", idct2func.forward)

    idct2func = dct.IDCT2(expk, expk, algorithm='N')
    report("IDCT2Function takes %.3f ms", idct2func.forward)
    exit()

    # ---- everything below is unreachable while the exit() above is present ----

    report(
        "fft2 takes %.3f ms",
        lambda sample: torch.rfft(sample.view([1, N, N]), signal_ndim=2, onesided=False))

    report(
        "idcct2 takes %.3f ms",
        lambda sample: discrete_spectral_transform.idcct2(sample, expk_0=expk, expk_1=expk))

    func = dct.IDCCT2(expk, expk)
    report("IDCCT2Function takes %.3f ms", func.forward)

    report(
        "idcst2 takes %.3f ms",
        lambda sample: discrete_spectral_transform.idcst2(sample, expk_0=expk, expk_1=expk))

    func = dct.IDCST2(expk, expk)
    report("IDCST2Function takes %.3f ms", func.forward)

    report(
        "idsct2 takes %.3f ms",
        lambda sample: discrete_spectral_transform.idsct2(sample, expk_0=expk, expk_1=expk))

    func = dct.IDSCT2(expk, expk)
    report("IDSCT2Function takes %.3f ms", func.forward)
Пример #2
0
    def forward(self, pos):
        """Evaluate the electric-potential density term for ``pos``.

        On the first call (while ``self.initial_density_map`` is ``None``) the
        density map of the fixed cells is computed and cached on the module,
        together with the permutation, ``expk`` and frequency tables needed by
        the spectral solver. Every call then delegates to
        ``ElectricPotentialFunction.apply``.

        Args:
            pos: position tensor. It is flattened with
                ``pos.view(pos.numel())`` for the backend, and its dtype and
                device are used for all cached tables.

        Returns:
            The result of ``ElectricPotentialFunction.apply``.
        """
        if self.initial_density_map is None:
            # Number of bins a fixed cell may overlap along each axis: driven
            # by the largest terminal and capped at the grid dimension.
            if self.num_terminals == 0:
                num_fixed_impacted_bins_x = 0
                num_fixed_impacted_bins_y = 0
            else:
                first_fixed = self.num_movable_nodes
                last_fixed = self.num_movable_nodes + self.num_terminals
                widest = self.node_size_x[first_fixed:last_fixed].max()
                span_x = ((widest + self.bin_size_x) / self.bin_size_x).ceil()
                num_fixed_impacted_bins_x = int(
                    span_x.clamp(max=self.num_bins_x))
                tallest = self.node_size_y[first_fixed:last_fixed].max()
                span_y = ((tallest + self.bin_size_y) / self.bin_size_y).ceil()
                num_fixed_impacted_bins_y = int(
                    span_y.clamp(max=self.num_bins_y))

            # Same call signature for both backends; only the module differs.
            backend = (electric_potential_cuda
                       if pos.is_cuda else electric_potential_cpp)
            self.initial_density_map = backend.fixed_density_map(
                pos.view(pos.numel()), self.node_size_x, self.node_size_y,
                self.bin_center_x, self.bin_center_y, self.xl, self.yl,
                self.xh, self.yh, self.bin_size_x, self.bin_size_y,
                self.num_movable_nodes, self.num_terminals,
                self.num_bins_x, self.num_bins_y,
                num_fixed_impacted_bins_x, num_fixed_impacted_bins_y)
            # scale density of fixed macros
            self.initial_density_map.mul_(self.target_density)

            # Permutation and expk tables for the spectral transforms.
            rows = self.num_bins_x
            cols = self.num_bins_y
            device = pos.device
            self.perm_M = discrete_spectral_transform.get_perm(
                rows, dtype=torch.int64, device=device)
            self.perm_N = discrete_spectral_transform.get_perm(
                cols, dtype=torch.int64, device=device)
            self.expk_M = discrete_spectral_transform.get_expk(
                rows, dtype=pos.dtype, device=device)
            self.expk_N = discrete_spectral_transform.get_expk(
                cols, dtype=pos.dtype, device=device)

            # Frequencies wu (column vector) and wv (row vector).
            freq_u = torch.arange(rows, dtype=pos.dtype, device=device)
            freq_u = freq_u.mul(2 * np.pi / rows).view([rows, 1])
            freq_v = torch.arange(cols, dtype=pos.dtype, device=device)
            freq_v = freq_v.mul(2 * np.pi / cols).view([1, cols])
            # scale wv because the aspect ratio of a bin may not be 1
            freq_v.mul_(self.bin_size_x / self.bin_size_y)

            squared_norm = freq_u.pow(2) + freq_v.pow(2)
            # The (0, 0) entry would divide by zero; use a placeholder here
            # and zero the resulting coefficient right after.
            squared_norm[0, 0] = 1.0
            self.inv_wu2_plus_wv2_2X = 2.0 / squared_norm
            self.inv_wu2_plus_wv2_2X[0, 0] = 0.0
            self.wu_by_wu2_plus_wv2_2X = freq_u.mul(self.inv_wu2_plus_wv2_2X)
            self.wv_by_wu2_plus_wv2_2X = freq_v.mul(self.inv_wu2_plus_wv2_2X)

        return ElectricPotentialFunction.apply(
            pos, self.node_size_x, self.node_size_y, self.bin_center_x,
            self.bin_center_y, self.initial_density_map, self.target_density,
            self.xl, self.yl, self.xh, self.yh, self.bin_size_x,
            self.bin_size_y, self.num_movable_nodes, self.num_filler_nodes,
            self.padding, self.padding_mask, self.num_bins_x, self.num_bins_y,
            self.num_movable_impacted_bins_x, self.num_movable_impacted_bins_y,
            self.num_filler_impacted_bins_x, self.num_filler_impacted_bins_y,
            self.perm_M, self.perm_N, self.expk_M, self.expk_N,
            self.inv_wu2_plus_wv2_2X, self.wu_by_wu2_plus_wv2_2X,
            self.wv_by_wu2_plus_wv2_2X, self.fast_mode)
Пример #3
0
    def forward(
        ctx,
        pos,
        node_size_x,
        node_size_y,
        bin_center_x,
        bin_center_y,
        initial_density_map,
        target_density,
        xl,
        yl,
        xh,
        yh,
        bin_size_x,
        bin_size_y,
        num_movable_nodes,
        num_filler_nodes,
        padding,
        padding_mask,  # same dimensions as density map, with padding regions to be 1
        num_bins_x,
        num_bins_y,
        num_movable_impacted_bins_x,
        num_movable_impacted_bins_y,
        num_filler_impacted_bins_x,
        num_filler_impacted_bins_y,
        perm_M=None,  # permutation
        perm_N=None,  # permutation
        expk_M=None,  # 2*exp(j*pi*k/M)
        expk_N=None,  # 2*exp(j*pi*k/N)
        inv_wu2_plus_wv2_2X=None,  # 2.0/(wu^2 + wv^2)
        wu_by_wu2_plus_wv2_2X=None,  # 2*wu/(wu^2 + wv^2)
        wv_by_wu2_plus_wv2_2X=None,  # 2*wv/(wu^2 + wv^2)
        fast_mode=True  # fast mode will discard some computation
    ):
        """Compute the density map, the electric field maps and the energy.

        The density map is produced by the CUDA or C++ backend depending on
        ``pos.is_cuda``, normalized by the bin area, transformed with
        ``dct.dct2`` and turned into the two field maps stored on ``ctx`` as
        ``ctx.field_map_x`` / ``ctx.field_map_y``. The geometric arguments and
        ``pos`` are also stored on ``ctx`` (presumably for the backward pass,
        which is not part of this block).

        Args:
            pos: position tensor; flattened with ``pos.view(pos.numel())``.
            perm_M, perm_N, expk_M, expk_N: optional precomputed tables; when
                ``expk_M`` is None all four are rebuilt here. The permutations
                are not consumed by the transforms called in this method.
            inv_wu2_plus_wv2_2X, wu_by_wu2_plus_wv2_2X, wv_by_wu2_plus_wv2_2X:
                optional precomputed frequency tables; when the first one is
                None all three are rebuilt here.
            fast_mode: when true, skip the potential/energy evaluation and
                return a zero tensor of shape ``[1]`` instead.
            (remaining arguments are forwarded unchanged to the backend's
            ``density_map`` function.)

        Returns:
            The energy ``sum(potential * density)``, or a zero placeholder in
            fast mode.
        """
        # Both backends take the same argument list; only the module differs.
        backend = (electric_potential_cuda
                   if pos.is_cuda else electric_potential_cpp)
        output = backend.density_map(
            pos.view(pos.numel()), node_size_x, node_size_y, bin_center_x,
            bin_center_y, initial_density_map, target_density, xl, yl, xh,
            yh, bin_size_x, bin_size_y, num_movable_nodes,
            num_filler_nodes, padding, padding_mask, num_bins_x,
            num_bins_y, num_movable_impacted_bins_x,
            num_movable_impacted_bins_y, num_filler_impacted_bins_x,
            num_filler_impacted_bins_y)

        # Keep what later stages need on the context object.
        ctx.node_size_x = node_size_x
        ctx.node_size_y = node_size_y
        ctx.bin_center_x = bin_center_x
        ctx.bin_center_y = bin_center_y
        ctx.target_density = target_density
        ctx.xl = xl
        ctx.yl = yl
        ctx.xh = xh
        ctx.yh = yh
        ctx.bin_size_x = bin_size_x
        ctx.bin_size_y = bin_size_y
        ctx.num_movable_nodes = num_movable_nodes
        ctx.num_filler_nodes = num_filler_nodes
        ctx.padding = padding
        ctx.num_bins_x = num_bins_x
        ctx.num_bins_y = num_bins_y
        ctx.num_movable_impacted_bins_x = num_movable_impacted_bins_x
        ctx.num_movable_impacted_bins_y = num_movable_impacted_bins_y
        ctx.num_filler_impacted_bins_x = num_filler_impacted_bins_x
        ctx.num_filler_impacted_bins_y = num_filler_impacted_bins_y
        ctx.pos = pos
        # The backend result is interpreted as the num_bins_x x num_bins_y
        # density map.
        density_map = output.view([ctx.num_bins_x, ctx.num_bins_y])

        # for DCT
        M = num_bins_x
        N = num_bins_y
        if expk_M is None:
            perm_M = discrete_spectral_transform.get_perm(
                M, dtype=torch.int64, device=density_map.device)
            perm_N = discrete_spectral_transform.get_perm(
                N, dtype=torch.int64, device=density_map.device)
            expk_M = discrete_spectral_transform.get_expk(
                M, dtype=density_map.dtype, device=density_map.device)
            expk_N = discrete_spectral_transform.get_expk(
                N, dtype=density_map.dtype, device=density_map.device)
        # wu and wv
        if inv_wu2_plus_wv2_2X is None:
            wu = torch.arange(M,
                              dtype=density_map.dtype,
                              device=density_map.device).mul(2 * np.pi /
                                                             M).view([M, 1])
            # Scale wv because the aspect ratio of a bin may not be 1; without
            # this the fallback tables differ from the precomputed ones that
            # callers normally pass in whenever bins are not square.
            wv = torch.arange(N,
                              dtype=density_map.dtype,
                              device=density_map.device).mul(
                                  2 * np.pi / N).view([1, N]).mul_(
                                      bin_size_x / bin_size_y)
            wu2_plus_wv2 = wu.pow(2) + wv.pow(2)
            wu2_plus_wv2[0,
                         0] = 1.0  # avoid zero-division, it will be zeroed out
            inv_wu2_plus_wv2_2X = 2.0 / wu2_plus_wv2
            inv_wu2_plus_wv2_2X[0, 0] = 0.0
            wu_by_wu2_plus_wv2_2X = wu.mul(inv_wu2_plus_wv2_2X)
            wv_by_wu2_plus_wv2_2X = wv.mul(inv_wu2_plus_wv2_2X)

        # compute auv: normalize by bin area (in place), then 2D DCT
        density_map.mul_(1.0 / (ctx.bin_size_x * ctx.bin_size_y))
        auv = dct.dct2(density_map, expk0=expk_M, expk1=expk_N)
        auv[0, :].mul_(0.5)
        auv[:, 0].mul_(0.5)

        # compute field xi
        auv_by_wu2_plus_wv2_wu = auv.mul(wu_by_wu2_plus_wv2_2X)
        auv_by_wu2_plus_wv2_wv = auv.mul(wv_by_wu2_plus_wv2_2X)
        ctx.field_map_x = dct.idsct2(auv_by_wu2_plus_wv2_wu, expk_M, expk_N)
        ctx.field_map_y = dct.idcst2(auv_by_wu2_plus_wv2_wv, expk_M, expk_N)

        # energy = \sum q*phi
        # it takes around 80% of the computation time
        # so I will not always evaluate it
        if fast_mode:  # dummy for invoking backward propagation
            energy = torch.zeros(1, dtype=pos.dtype, device=pos.device)
        else:
            # compute potential phi
            # auv / (wu**2 + wv**2)
            auv_by_wu2_plus_wv2 = auv.mul(inv_wu2_plus_wv2_2X).mul_(2)
            potential_map = dct.idcct2(auv_by_wu2_plus_wv2, expk_M, expk_N)
            # compute energy
            energy = potential_map.mul_(density_map).sum()

        # Only synchronize when the work actually ran on a GPU; calling
        # torch.cuda.synchronize() on a CPU-only setup raises an error.
        if pos.is_cuda:
            torch.cuda.synchronize()
        return energy
Пример #4
0
def eval_dct2d(x, expk0, expk1, expkM, expkN, runs):
    """Benchmark several 2D DCT implementations on ``x`` and print timings.

    A single scipy ``fftpack.dct`` evaluation on the host copy of ``x`` is
    timed first, followed by the PyTorch and custom-operator variants, each
    averaged over ``runs`` calls.

    Args:
        x: input tensor; copied to the host for the scipy reference and
            passed unchanged to every other implementation.
        expk0, expk1: tables passed to the ``discrete_spectral_transform``,
            ``dct`` and ``dct_lee`` implementations.
        expkM, expkN: tables passed to ``dct2_fft2.DCT2``.
        runs: number of timed repetitions per implementation.
    """

    def report(fmt, transform, warm_up=False):
        # Optional untimed first call, then average `runs` calls between two
        # device synchronizations. To profile instead, wrap the loop in
        # torch.autograd.profiler.profile(use_cuda=True).
        if warm_up:
            transform()
        torch.cuda.synchronize()
        started = time.time()
        for _ in range(runs):
            transform()
        torch.cuda.synchronize()
        print(fmt % ((time.time() - started) / runs * 1000))

    # Host reference: one separable DCT pass per axis, timed once.
    host_x = x.data.cpu().numpy()
    torch.cuda.synchronize()
    started = time.time()
    y = fftpack.dct(fftpack.dct(host_x.T, norm=None).T / x.size(1), norm=None) / x.size(0)
    torch.cuda.synchronize()
    print("CPU scipy.fftpack.dct2d takes %.7f ms" % ((time.time() - started) * 1000))

    # 9s for 200 iterations 1024x1024 on GTX 1080
    report(
        "PyTorch: dct2d_2N takes %.7f ms",
        lambda: discrete_spectral_transform.dct2_2N(x, expk0=expk0, expk1=expk1))

    # 11s for 200 iterations 1024x1024 on GTX 1080
    perm0 = discrete_spectral_transform.get_perm(x.size(-2), dtype=torch.int64, device=x.device)
    perm1 = discrete_spectral_transform.get_perm(x.size(-1), dtype=torch.int64, device=x.device)
    report(
        "PyTorch: dct2d_N takes %.7f ms",
        lambda: discrete_spectral_transform.dct2_N(
            x, perm0=perm0, expk0=expk0, perm1=perm1, expk1=expk1))

    dct2func = dct.DCT2(expk0, expk1, algorithm='2N')
    report("DCT2d_2N Function takes %.7f ms", lambda: dct2func.forward(x))

    dct2func = dct.DCT2(expk0, expk1, algorithm='N')
    report("DCT2d_N Function takes %.7f ms", lambda: dct2func.forward(x), warm_up=True)

    # The implementation below only supports float64 by now
    dct2func = dct_lee.DCT2(expk0, expk1)
    report("DCT2d_Lee Function takes %.7f ms", lambda: dct2func.forward(x))

    dct2func = dct2_fft2.DCT2(expkM, expkN)
    report("DCT2_FFT2 Function takes %.7f ms", lambda: dct2func.forward(x), warm_up=True)

    print("")