Example #1
import time

import torch

# f_similar is assumed to be the custom (CUDA) local-attention kernel under
# test; TorchLocalAttention appears to be the pure-PyTorch baseline.

def test_efficiency_forward(h, w, c, kh, kw):
    # random input feature maps on the GPU; kh x kw is the local window size
    x = torch.rand(1, c, h, w).cuda()
    y = torch.rand(1, c, h, w).cuda()

    with torch.no_grad():
        # peak GPU memory of the custom implementation, in MB
        torch.cuda.reset_max_memory_allocated()
        z = f_similar(x, y, kh, kw)
        memory = torch.cuda.max_memory_allocated() / 1000000
        del z

    with torch.no_grad():
        # peak GPU memory of the pure-PyTorch baseline, in MB
        torch.cuda.reset_max_memory_allocated()
        z = TorchLocalAttention.f_similar(x, y, kh, kw)
        memory_torch = torch.cuda.max_memory_allocated() / 1000000
        del z

    with torch.no_grad():
        # wall-clock time of the custom implementation, averaged over 3 runs;
        # synchronize so queued CUDA work does not skew the measurement
        torch.cuda.synchronize()
        t = time.time()
        for i in range(3):
            z = f_similar(x, y, kh, kw)
        torch.cuda.synchronize()
        t = (time.time() - t) / 3
        del z

        # same measurement for the pure-PyTorch baseline
        torch.cuda.synchronize()
        t_torch = time.time()
        for i in range(3):
            z = TorchLocalAttention.f_similar(x, y, kh, kw)
        torch.cuda.synchronize()
        t_torch = (time.time() - t_torch) / 3
        del z
    # report: baseline memory, custom memory || baseline time, custom time
    print("{:.2f},{:.2f}||{:.5f},{:.5f}".format(memory_torch, memory, t_torch, t))
Example #2
def __init__(self):
    super(Net, self).__init__()
    # select the pure-PyTorch or the custom local attention blocks via args.mode
    if args.mode == 'torch':
        self.main = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1, bias=False), nn.ReLU(True),
            TorchLocalAttention(64, 64, 5, 5), nn.ReLU(True),
            TorchLocalAttention(64, 64, 5, 5), nn.ReLU(True),
            TorchLocalAttention(64, 64, 5, 5), nn.ReLU(True),
            nn.Conv2d(64, 3, 3, padding=1, bias=False))
    else:
        self.main = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1, bias=False), nn.ReLU(True),
            LocalAttention(64, 64, 5, 5), nn.ReLU(True),
            LocalAttention(64, 64, 5, 5), nn.ReLU(True),
            LocalAttention(64, 64, 5, 5), nn.ReLU(True),
            nn.Conv2d(64, 3, 3, padding=1, bias=False))
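
Only the constructor is shown; the rest of the Net module is not in the snippet. A minimal sketch of the missing forward pass, assuming the module simply applies the sequential stack, might look like this:

def forward(self, x):
    # assumption: Net is a plain feed-forward stack built in __init__
    return self.main(x)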
Example #3
def test_efficiency_backward(h, w, c, kh, kw):
    # inputs need gradients so the backward pass is exercised as well
    x = torch.rand(1, c, h, w).cuda()
    y = torch.rand(1, c, h, w).cuda()
    x.requires_grad_()
    y.requires_grad_()

    # peak GPU memory of the custom implementation (forward + backward), in MB
    torch.cuda.reset_max_memory_allocated()
    z = f_similar(x, y, kh, kw)
    grad = torch.rand(z.size()).cuda()
    z.backward(grad)
    memory = torch.cuda.max_memory_allocated() / 1000000
    x.grad.data.zero_()
    y.grad.data.zero_()
    del z

    # same measurement for the pure-PyTorch baseline; both implementations
    # produce the same output shape, so grad can be reused below
    torch.cuda.reset_max_memory_allocated()
    z = TorchLocalAttention.f_similar(x, y, kh, kw)
    grad = torch.rand(z.size()).cuda()
    z.backward(grad)
    memory_torch = torch.cuda.max_memory_allocated() / 1000000
    x.grad.data.zero_()
    y.grad.data.zero_()
    del z

    # wall-clock time of forward + backward for the custom implementation,
    # averaged over 3 runs; gradients are zeroed so they do not accumulate
    torch.cuda.synchronize()
    t = time.time()
    for i in range(3):
        z = f_similar(x, y, kh, kw)
        z.backward(grad)
        x.grad.data.zero_()
        y.grad.data.zero_()
    torch.cuda.synchronize()
    t = (time.time() - t) / 3
    del z

    # same measurement for the pure-PyTorch baseline
    torch.cuda.synchronize()
    t_torch = time.time()
    for i in range(3):
        z = TorchLocalAttention.f_similar(x, y, kh, kw)
        z.backward(grad)
        x.grad.data.zero_()
        y.grad.data.zero_()
    torch.cuda.synchronize()
    t_torch = (time.time() - t_torch) / 3
    del z
    # report: baseline memory, custom memory || baseline time, custom time
    print("{:.2f},{:.2f}||{:.5f},{:.5f}".format(memory_torch, memory, t_torch, t))
Example #4
def test_correct(h, w, c, kh, kw):
    # clone the inputs so both implementations see identical values
    x1 = torch.rand(4, c, h, w).cuda()
    y1 = torch.rand(4, c, h, w).cuda()
    x2 = x1.clone()
    y2 = y1.clone()

    x1.requires_grad_()
    y1.requires_grad_()
    x2.requires_grad_()
    y2.requires_grad_()

    # run the pure-PyTorch baseline and the custom implementation side by side
    z1 = TorchLocalAttention.f_similar(x1, y1, kh, kw)
    z2 = f_similar(x2, y2, kh, kw)

    # back-propagate the same gradient through both graphs
    grad = torch.rand(z1.size()).cuda()

    z1.backward(grad)
    z2.backward(grad)

    # compare outputs and input gradients between the two implementations
    err1 = check(z1.data, z2.data)
    err2 = check(x1.grad.data, x2.grad.data)
    err3 = check(y1.grad.data, y2.grad.data)
    print("maximum difference: {:.5f}\t{:.5f}\t{:.5f}".format(err1.item(), err2.item(), err3.item()))