def __init__(self, kernel_width=None):
    self.kernel_type = 'keops'
    super().__init__(kernel_width)

    self.gaussian_convolve = generic_sum(
        "Exp(-G*SqDist(X,Y)) * P",
        "O = Vx(" + str(Settings().dimension) + ")",
        "G = Pm(1)",
        "X = Vx(" + str(Settings().dimension) + ")",
        "Y = Vy(" + str(Settings().dimension) + ")",
        "P = Vy(" + str(Settings().dimension) + ")")

    self.varifold_convolve = generic_sum(
        # "(Nx|Ny)" is the KeOps scalar product between the two normal fields:
        "Exp(-(WeightedSqDist(G, X, Y))) * Square((Nx|Ny)) * P",
        "O = Vx(1)",
        "G = Pm(1)",
        "X = Vx(" + str(Settings().dimension) + ")",
        "Y = Vy(" + str(Settings().dimension) + ")",
        "Nx = Vx(" + str(Settings().dimension) + ")",
        "Ny = Vy(" + str(Settings().dimension) + ")",
        "P = Vy(1)")

    self.gaussian_convolve_gradient_x = generic_sum(
        "(Px|Py) * Exp(-G*SqDist(X,Y)) * (X-Y)",
        "O = Vx(" + str(Settings().dimension) + ")",
        "G = Pm(1)",
        "X = Vx(" + str(Settings().dimension) + ")",
        "Y = Vy(" + str(Settings().dimension) + ")",
        "Px = Vx(" + str(Settings().dimension) + ")",
        "Py = Vy(" + str(Settings().dimension) + ")")
def kernel_keops(kernel, α, x, β, y, potentials=False,
                 ranges_xx=None, ranges_yy=None, ranges_xy=None):
    D = x.shape[1]
    kernel_conv = generic_sum(
        "(" + kernel + " * B)",  # Formula
        "A = Vi(1)",             # Output:    a_i
        "X = Vi({})".format(D),  # 1st input: x_i
        "Y = Vj({})".format(D),  # 2nd input: y_j
        "B = Vj(1)")             # 3rd input: b_j

    a_x = kernel_conv(double_grad(x), x.detach(), α.detach().view(-1, 1), ranges=ranges_xx)
    b_y = kernel_conv(double_grad(y), y.detach(), β.detach().view(-1, 1), ranges=ranges_yy)
    b_x = kernel_conv(x, y, β.view(-1, 1), ranges=ranges_xy)

    if potentials:
        a_y = kernel_conv(y, x, α.view(-1, 1), ranges=swap_axes(ranges_xy))
        return a_x - b_x, b_y - a_y

    else:  # Return the kernel norm. N.B.: we assume that 'kernel' is symmetric:
        return .5 * scal(double_grad(α), a_x) \
             + .5 * scal(double_grad(β), b_y) \
             - scal(α, b_x)
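# A minimal smoke test for kernel_keops, under stated assumptions: `scal`,
# `double_grad` and `swap_axes` below are simplified stand-ins for the helpers
# used above (the real ones handle autograd bookkeeping and block-sparse
# ranges), and the kernel string is a plain Gaussian.
import torch

def scal(α, f):          # <α, f> as a scalar
    return torch.dot(α.view(-1), f.view(-1))

def double_grad(t):      # stand-in: identity
    return t

def swap_axes(ranges):   # stand-in: no block-sparse ranges here
    return None

N, M, D = 100, 120, 3
x = torch.randn(N, D, requires_grad=True)
y = torch.randn(M, D)
α = torch.full((N, 1), 1. / N)
β = torch.full((M, 1), 1. / M)

loss = kernel_keops("Exp(-SqDist(X,Y))", α, x, β, y)  # squared kernel norm
loss.backward()                                       # gradient w.r.t. x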
def kernel_keops(kernel, α, x, β, y,
                 ranges_xx=None, ranges_yy=None, ranges_xy=None):
    D = x.shape[1]
    kernel_conv = generic_sum(
        "(" + kernel + " * B)",  # Formula
        "A = Vx(1)",             # Output:    a_i
        "X = Vx({})".format(D),  # 1st input: x_i
        "Y = Vy({})".format(D),  # 2nd input: y_j
        "B = Vy(1)")             # 3rd input: b_j

    a_i = kernel_conv(double_grad(x), x.detach(), α.detach().view(-1, 1), ranges=ranges_xx)
    b_j = kernel_conv(double_grad(y), y.detach(), β.detach().view(-1, 1), ranges=ranges_yy)
    b_i = kernel_conv(x, y, β.view(-1, 1), ranges=ranges_xy)

    # N.B.: we assume that 'kernel' is symmetric:
    return .5 * scal(double_grad(α), a_i) \
         + .5 * scal(double_grad(β), b_j) \
         - scal(α, b_i)
def Projection_ops(p, ε, x_i, y_j):
    "Normalization weights for the Barycenter ops."
    if backend == "keops":  # Memory-efficient GPU implementation
        # We create a KeOps GPU routine...
        if p == 1:
            formula = "Exp(Fj + Gi - (Sqrt(SqDist(Xi,Yj))/ E))"
        elif p == 2:
            formula = "Exp(Fj + Gi - (SqDist(Xi,Yj) / E))"
        else:
            formula = "Exp( Fj + Gi - (Powf(SqDist(Xi,Yj),R)/ E) )"
            raise NotImplementedError(
                "I should fix the derivative at 0 of Powf, in KeOps's core.")

        D = x_i.shape[1]  # Dimension of the ambient space (typically 2 or 3)
        routine = generic_sum(
            formula, "outi = Vx(1)",  # Formula, output...
            # and input variables: ε, x_i, y_j, f_j, g_i, p/2,
            # given with their respective dimensions:
            "E = Pm(1)", "Xi = Vx({})".format(D), "Yj = Vy({})".format(D),
            "Fj = Vy(1)", "Gi = Vx(1)", "R = Pm(1)")

        # Before wrapping it up in a simple pair of operators - don't forget the minus!
        ε, r = torch.Tensor([ε]).type_as(x_i), torch.Tensor([p / 2]).type_as(x_i)
        P_x = lambda f_i, g_j: routine(ε, y_j, x_i, f_i, g_j, r)
        P_y = lambda f_j, g_i: routine(ε, x_i, y_j, f_j, g_i, r)
        return P_x, P_y

    elif backend == "pytorch":
        raise NotImplementedError()
def conv(k, x_i, y_j, β_j):
    k_name, s = k
    if backend == "keops":  # Memory-efficient GPU implementation: ONline map-reduce
        # We create a KeOps GPU routine...
        s2v = lambda g: torch.Tensor([g]).type_as(x_i)
        if k_name == "energy":
            formula = "- Sqrt(SqDist(Xi,Yj)) * Bj"
            g = s2v(1.)  # dummy value: G is unused by the energy formula
        elif k_name == "gaussian":
            formula = "Exp( -G*SqDist(Xi,Yj) ) * Bj"
            g = s2v(1 / s**2)
        elif k_name == "laplacian":
            formula = "Exp( -G*Sqrt(SqDist(Xi,Yj)) ) * Bj"
            g = s2v(1 / s)
        else:
            raise NotImplementedError()

        D = x_i.shape[1]  # Dimension of the ambient space (typically 2 or 3)
        routine = generic_sum(
            formula, "out_i = Vx(1)",  # Formula, output...
            # and input variables: g, x_i, y_j, β_j, given with their respective dimensions
            "G = Pm(1)", "Xi = Vx({})".format(D), "Yj = Vy({})".format(D), "Bj = Vy(1)")

        # ...Before applying it to our data:
        return routine(g, x_i, y_j, β_j)

    elif backend == "pytorch":  # Naive matrix-vector implementation: OFFline map-reduce
        XmY2 = ((x_i.unsqueeze(1) - y_j.unsqueeze(0)) ** 2).sum(2)
        if k_name == "energy":
            K = -XmY2.sqrt()
        elif k_name == "gaussian":
            K = (-XmY2 / s**2).exp()
        elif k_name == "laplacian":
            K = (-XmY2.sqrt() / s).exp()
        else:
            raise NotImplementedError()
        return K @ β_j
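# A quick consistency check between the two code paths of conv(). This is a
# hypothetical driver, not part of the original code: it assumes `backend` is
# the module-level switch read by conv(), and the tolerance is only
# indicative for float32 inputs.
import torch

x_i, y_j = torch.randn(50, 3), torch.randn(60, 3)
β_j = torch.randn(60, 1)

backend = "pytorch"
ref = conv(("gaussian", .5), x_i, y_j, β_j)
backend = "keops"
out = conv(("gaussian", .5), x_i, y_j, β_j)
print((out - ref).abs().max())  # should be ~1e-6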
def gaussianconv_keops(x, y, b):
    D = x.shape[1]  # ambient dimension
    fun = generic_sum(
        "Exp(-SqDist(X,Y)) * B",  # Formula
        "A = Vi(1)",              # Output
        "X = Vi({})".format(D),   # 1st argument
        "Y = Vj({})".format(D),   # 2nd argument
        "B = Vj(1)")              # 3rd argument
    backend = 'GPU' if use_cuda else 'CPU'
    return fun(x, y, b, backend=backend)
def __init__(self, kernel_width=None, device=default.deformation_kernel_device, **kwargs):
    if device.lower() == 'cuda':
        device = 'GPU'
    super().__init__('keops', kernel_width, device)

    self.gamma = 1. / default.tensor_scalar_type([self.kernel_width ** 2])

    self.gaussian_convolve = []
    self.point_cloud_convolve = []
    self.varifold_convolve = []
    self.gaussian_convolve_gradient_x = []

    for dimension in [2, 3]:
        self.gaussian_convolve.append(generic_sum(
            "Exp(-G*SqDist(X,Y)) * P",
            "O = Vx(" + str(dimension) + ")",
            "G = Pm(1)",
            "X = Vx(" + str(dimension) + ")",
            "Y = Vy(" + str(dimension) + ")",
            "P = Vy(" + str(dimension) + ")"))

        self.point_cloud_convolve.append(generic_sum(
            "Exp(-G*SqDist(X,Y)) * P",
            "O = Vx(1)",
            "G = Pm(1)",
            "X = Vx(" + str(dimension) + ")",
            "Y = Vy(" + str(dimension) + ")",
            "P = Vy(1)"))

        self.varifold_convolve.append(generic_sum(
            "Exp(-(WeightedSqDist(G, X, Y))) * Square((Nx|Ny)) * P",
            "O = Vx(1)",
            "G = Pm(1)",
            "X = Vx(" + str(dimension) + ")",
            "Y = Vy(" + str(dimension) + ")",
            "Nx = Vx(" + str(dimension) + ")",
            "Ny = Vy(" + str(dimension) + ")",
            "P = Vy(1)"))

        self.gaussian_convolve_gradient_x.append(generic_sum(
            "(Px|Py) * Exp(-G*SqDist(X,Y)) * (X-Y)",
            "O = Vx(" + str(dimension) + ")",
            "G = Pm(1)",
            "X = Vx(" + str(dimension) + ")",
            "Y = Vy(" + str(dimension) + ")",
            "Px = Vx(" + str(dimension) + ")",
            "Py = Vy(" + str(dimension) + ")"))
def gaussianconv_keops(x, y, b, backend="GPU", **kwargs):
    """(B,N,D), (B,N,D), (B,N,1) -> (B,N,1)"""
    D = x.shape[-1]  # ambient dimension
    fun = generic_sum(
        "Exp(-SqDist(X,Y)) * B",  # Formula
        "A = Vi(1)",              # Output
        "X = Vi({})".format(D),   # 1st argument
        "Y = Vj({})".format(D),   # 2nd argument
        "B = Vj(1)",              # 3rd argument
    )
    return fun(x, y, b, backend=backend)
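# Hypothetical batched call (not in the original snippet): B independent
# point clouds are processed in one reduction, relying on KeOps' support for
# a leading batch dimension.
import torch

B, N = 4, 1000
x = torch.randn(B, N, 3).cuda()
y = torch.randn(B, N, 3).cuda()
b = torch.randn(B, N, 1).cuda()
a = gaussianconv_keops(x, y, b)  # (B, N, 1)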
def expscalprod_keops_nochunks(x, y, b):
    D = x.shape[1]
    fun = generic_sum(
        "Exp(X|Y) * B",          # Formula
        "A = Vi(1)",             # Output
        "X = Vi({})".format(D),  # 1st argument
        "Y = Vj({})".format(D),  # 2nd argument
        "B = Vj(1)",             # 3rd argument
        enable_chunks=False)
    backend = 'GPU' if use_cuda else 'CPU'
    return fun(x, y, b, backend=backend)
def gaussianconv_keops(x, y, b):
    # Gaussian kernel through the factorization
    #   exp(-|x-y|^2) = exp(-|x|^2) * exp(2<x,y>) * exp(-|y|^2),
    # which turns the reduction into a plain scalar-product formula:
    D = x.shape[1]
    fun = generic_sum(
        "Exp(X|Y) * B",          # Formula
        "A = Vi(1)",             # Output
        "X = Vi({})".format(D),  # 1st argument
        "Y = Vj({})".format(D),  # 2nd argument
        "B = Vj(1)")             # 3rd argument
    backend = 'GPU' if use_cuda else 'CPU'
    ex = (-(x * x).sum(-1)).exp()[:, None]
    ey = (-(y * y).sum(-1)).exp()[:, None]
    return ex * fun(2 * x, y, b * ey, backend=backend)
def gaussianconv_keops(x, y, b, backend="GPU", **kwargs):
    # Batched variant of the factorized Gaussian convolution above.
    D = x.shape[-1]
    fun = generic_sum(
        "Exp(X|Y) * B",          # Formula
        "A = Vi(1)",             # Output
        "X = Vi({})".format(D),  # 1st argument
        "Y = Vj({})".format(D),  # 2nd argument
        "B = Vj(1)",             # 3rd argument
    )
    ex = (-(x * x).sum(-1)).exp()[:, :, None]
    ey = (-(y * y).sum(-1)).exp()[:, :, None]
    return ex * fun(2 * x, y, b * ey, backend=backend)
def gaussianconv_keops_nochunks(x, y, b):
    # Same factorized Gaussian convolution, with KeOps' chunked scheme disabled.
    D = x.shape[1]
    fun = generic_sum(
        "Exp(X|Y) * B",          # Formula
        "A = Vi(1)",             # Output
        "X = Vi({})".format(D),  # 1st argument
        "Y = Vj({})".format(D),  # 2nd argument
        "B = Vj(1)",             # 3rd argument
        enable_chunks=False,
    )
    backend = "GPU" if use_cuda else "CPU"
    ex = (-(x * x).sum(-1)).exp()[:, None]
    ey = (-(y * y).sum(-1)).exp()[:, None]
    return ex * fun(2 * x, y, b * ey, backend=backend)
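# The variants above all rely on the identity
#   exp(-|x_i - y_j|^2) = exp(-|x_i|^2) * exp(2 <x_i, y_j>) * exp(-|y_j|^2).
# A quick numerical check of that factorization in plain PyTorch:
import torch

x = torch.randn(100, 3)
y = torch.randn(120, 3)
b = torch.randn(120, 1)

direct = (-((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)).exp() @ b
ex = (-(x * x).sum(-1)).exp()[:, None]
ey = (-(y * y).sum(-1)).exp()[:, None]
factored = ex * (((2 * x) @ y.t()).exp() @ (b * ey))
print((direct - factored).abs().max())  # should be ~1e-6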
def Barycenters_ops(p, ε, x_i, y_j):
    """
    Given:
    - an exponent p = 1 or 2
    - a regularization strength ε > 0
    - point clouds x_i and y_j, encoded as N-by-D and M-by-D torch arrays,

    Returns a pair of routines R_x, R_y such that
      [R_x(f_i, g_j)]_j = sum_i exp( f_i + g_j - |x_i-y_j|^p / ε ) * (x_i-y_j)
      [R_y(f_j, g_i)]_i = sum_j exp( f_j + g_i - |x_i-y_j|^p / ε ) * (y_j-x_i)

    This may look like a strange level of abstraction, but it is the most
    convenient way of working with KeOps and vanilla PyTorch (with a
    pre-computed cost matrix) at the same time.
    """
    if backend == "keops":  # Memory-efficient GPU implementation
        # We create a KeOps GPU routine...
        if p == 1:
            formula = "Exp(Fj + Gi - (Sqrt(SqDist(Xi,Yj))/ E)) * (Yj-Xi)"
        elif p == 2:
            formula = "Exp(Fj + Gi - (SqDist(Xi,Yj) / E)) * (Yj-Xi)"
        else:
            formula = "Exp( Fj + Gi - (Powf(SqDist(Xi,Yj),R)/ E) ) * (Yj-Xi)"
            raise NotImplementedError(
                "I should fix the derivative at 0 of Powf, in KeOps's core.")

        D = x_i.shape[1]  # Dimension of the ambient space (typically 2 or 3)
        routine = generic_sum(
            formula, "outi = Vx({})".format(D),  # Formula, output...
            # and input variables: ε, x_i, y_j, f_j, g_i, p/2,
            # given with their respective dimensions:
            "E = Pm(1)", "Xi = Vx({})".format(D), "Yj = Vy({})".format(D),
            "Fj = Vy(1)", "Gi = Vx(1)", "R = Pm(1)")

        # Before wrapping it up in a simple pair of operators - don't forget the minus!
        ε, r = torch.Tensor([ε]).type_as(x_i), torch.Tensor([p / 2]).type_as(x_i)
        R_x = lambda f_i, g_j: routine(ε, y_j, x_i, f_i, g_j, r)
        R_y = lambda f_j, g_i: routine(ε, x_i, y_j, f_j, g_i, r)
        return R_x, R_y

    elif backend == "pytorch":
        raise NotImplementedError()
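# Hypothetical usage sketch (assumes the module-level `backend` switch is set
# to "keops"): with zero potentials, R_y returns Gaussian-weighted
# displacements from each x_i towards the y-cloud.
import torch

backend = "keops"
x_i, y_j = torch.randn(100, 2), torch.randn(120, 2)
R_x, R_y = Barycenters_ops(2, .05, x_i, y_j)

f_j, g_i = torch.zeros(120, 1), torch.zeros(100, 1)
disp_i = R_y(f_j, g_i)  # [disp_i]_i = Σ_j exp(-|x_i-y_j|²/ε) (y_j - x_i)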
def my_formula(p, x, y, backend="auto"):
    """
    Applies a custom formula on the torch variables P, X and Y.
    Two backends are provided, so that we can check the correctness
    of both implementations.
    """
    if backend == "pytorch":  # Vanilla PyTorch implementation ===================
        scals = (x @ y.t()) ** 2  # Memory-intensive computation!
        a = p[0] * scals.sum(1).view(-1, 1) * x \
          + p[1] * (scals @ y)
        return a

    else:  # KeOps implementation ================================================
        # We now expose the low-level syntax of KeOps.
        # The library relies on vector "Variables" which can be either:
        # - indexed by "i" ("x" variables, category 0)
        # - indexed by "j" ("y" variables, category 1)
        # - constant across the reduction ("parameters", category 2)
        #
        # First of all, we must define a "who's who" list of the variables used,
        # by specifying their categories, index in the arguments' list, and dimensions:
        types = ["A = Vx(" + str(x.shape[1]) + ") ",  # output, indexed by i, dim D.
                 "P = Pm(2)",                         # 1st argument, a parameter, dim 2.
                 "X = Vx(" + str(x.shape[1]) + ") ",  # 2nd argument, indexed by i, dim D.
                 "Y = Vy(" + str(y.shape[1]) + ") "]  # 3rd argument, indexed by j, dim D.

        # The actual formula:
        # a_i = sum_j (<x_i,y_j>**2) * ( p[0]*x_i + p[1]*y_j )
        formula = "Pow( (X|Y) , 2) * ( (Elem(P,0) * X) + (Elem(P,1) * Y) )"

        my_routine = generic_sum(formula, *types)
        a = my_routine(p, x, y, backend=backend)
        return a
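# Sketch of the cross-check suggested by the docstring: both backends should
# agree up to float32 precision (the tolerance below is only indicative).
import torch

p = torch.randn(2)
x = torch.randn(50, 3)
y = torch.randn(60, 3)

a_torch = my_formula(p, x, y, backend="pytorch")
a_keops = my_formula(p, x, y, backend="auto")
print((a_torch - a_keops).abs().max())  # expected to be ~1e-5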
# We should simply come back to the expression of :math:`\pi_{i,j}`
# and write:
#
# .. math::
#   \text{Lab}_i ~&=~ \sum_{j=1}^M \exp \tfrac{1}{\varepsilon}[f_i+ g_j - \text{C}(x_i,y_j)] \cdot \beta_j \ell_j \\
#   &=~ \frac{1}{M} \sum_{j=1}^M \exp \tfrac{1}{\varepsilon}[f_i+ g_j - \tfrac{1}{2}\|x_i-y_j\|^2] \cdot \ell_j.

from pykeops.torch import generic_sum

# Define our KeOps CUDA kernel:
transfer = generic_sum(
    # See the formula above:
    "Exp( (F_i + G_j - IntInv(2)*SqDist(X_i,Y_j)) / E ) * L_j",
    "Lab = Vi(3)",  # Output:  one vector of size 3 per line
    "E = Pm(1)",    # 1st arg: a scalar parameter, the temperature
    "X_i = Vi(2)",  # 2nd arg: one 2d-point per line
    "Y_j = Vj(2)",  # 3rd arg: one 2d-point per column
    "F_i = Vi(1)",  # 4th arg: one scalar value per line
    "G_j = Vj(1)",  # 5th arg: one scalar value per column
    "L_j = Vj(3)")  # 6th arg: one vector of size 3 per column

# And apply it on the data (KeOps is pretty picky on the input shapes...):
labels_i = transfer(torch.Tensor([blur**2]).type(dtype),
                    X_i, Y_j,
                    F_i.view(-1, 1), G_j.view(-1, 1),
                    l_j) / M

###############################################
# That's it! We may now display our target point cloud :math:`(x_i)`
# with its new set of labels:

# sphinx_gallery_thumbnail_number = 2
def benchmark(bench_name, N, dev, backend, loops=10, enable_GC=True, fidelity=None):
    importlib.reload(torch)
    device = torch.device(dev)

    x_i = torch.randn(N, D, dtype=torch.float32, device=device, requires_grad=True)
    y_j = torch.randn(N, D, dtype=torch.float32, device=device)
    α_i = torch.randn(N, 1, dtype=torch.float32, device=device).abs()
    β_j = torch.randn(N, 1, dtype=torch.float32, device=device).abs()
    α_i = α_i / α_i.sum()
    β_j = β_j / β_j.sum()

    s2v = lambda x: torch.tensor([x], dtype=torch.float32, device=device)

    def scal(α, f):
        return torch.dot(α.view(-1), f.view(-1))

    if bench_name == "energy_distance":
        keops_conv = generic_sum(
            "Sqrt(SqDist(Xi,Yj))* Bj", "out_i = Vx(1)",  # Formula, output...
            # and input variables: x_i, y_j, β_j, given with their respective dimensions
            "Xi = Vx({})".format(D), "Yj = Vy({})".format(D), "Bj = Vy(1)")

        def vanilla_conv(x, y, β):
            XmY2 = ((x.unsqueeze(1) - y.unsqueeze(0)) ** 2).sum(2)
            K = XmY2.sqrt()
            return K @ β

        def bench(α, x, β, y):
            if backend == "GPU_1D":
                conv = keops_conv
            elif backend == "pytorch":
                conv = vanilla_conv
            cost = scal(α, conv(x, y, β) - .5 * conv(x, x, α)) - .5 * scal(β, conv(y, y, β))
            cost.backward()
            return cost

        code = '_ = bench(α_i,x_i,β_j,y_j)'
        task = "Energy Distances"

    elif bench_name == "LogSumExp":
        keops_lse = generic_logsumexp(
            "Sqrt(SqDist(Xi,Yj))", "out_i = Vx(1)",  # Formula, output...
            # and input variables: x_i, y_j, given with their respective dimensions
            "Xi = Vx({})".format(D), "Yj = Vy({})".format(D))

        def lse(v_ij):
            """[lse(v_ij)]_i = log sum_j exp(v_ij), with numerical accuracy."""
            V_i = torch.max(v_ij, 1)[0].view(-1, 1)
            return V_i + (v_ij - V_i).exp().sum(1).log().view(-1, 1)

        def vanilla_lse(x, y):
            XmY2 = ((x.unsqueeze(1) - y.unsqueeze(0)) ** 2).sum(2)
            K = XmY2.sqrt()
            return lse(K)

        def bench(x, y):
            if backend == "GPU_1D":
                return keops_lse(x, y)
            elif backend == "pytorch":
                return vanilla_lse(x, y)
            else:
                raise NotImplementedError()

        code = '_ = bench(x_i,y_j)'
        task = "LSEs"

    elif bench_name == "fidelities":
        from divergences import kernel_divergence, regularized_ot, hausdorff_divergence, sinkhorn_divergence

        if fidelity == "energy_distance":
            params = ("energy", None)
            code = "c = kernel_divergence(α_i,x_i, β_j,y_j, k=params ) ; c.backward()"
        elif fidelity == "hausdorff":
            params = {"p": 1, "eps": .1, "nits": 3, "tol": 0.}
            code = "c = hausdorff_divergence(α_i,x_i, β_j,y_j, **params ) ; c.backward()"
        elif fidelity == "sinkhorn":
            params = {"p": 1, "eps": .1, "nits": (20, 3),
                      "assume_convergence": True,  # True in practice, and lets us win a x2 factor
                      "tol": 0.}
            code = "c = sinkhorn_divergence(α_i,x_i, β_j,y_j, **params ) ; c.backward()"
        elif fidelity == "sinkhorn_nocv":
            params = {"p": 1, "eps": .1, "nits": (20, 3),
                      "assume_convergence": False, "tol": 0.}
            code = "c = sinkhorn_divergence(α_i,x_i, β_j,y_j, **params ) ; c.backward()"

        task = "fidelities"

    exec(code, locals())

    import gc
    GC = 'gc.enable();' if enable_GC else 'pass;'
    print("{:3} NxN {}, with N ={:7}: {:3}x".format(loops, task, N, loops), end="")

    exec(code, locals())  # Warmup run
    elapsed = timeit.Timer(code, GC, globals=locals(), timer=time.time).timeit(loops)

    print("{:3.6f}s".format(elapsed / loops))
    return elapsed / loops
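# Hypothetical driver for the benchmark above (not in the original code;
# assumes a CUDA device and a module-level D, as used by benchmark()):
if __name__ == "__main__":
    for N in [1000, 10000, 100000]:
        benchmark("energy_distance", N, "cuda", "GPU_1D")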
#--------------------------------------------------------------#
#                           Kernel                              #
#--------------------------------------------------------------#
formula = "Square(p-a)*Exp(x+y)"
types = ["output = Vx(3)",  # The result is indexed by "i", of size 3.
         "p = Pm(1)",       # First arg : parameter,  of size 1 (scalar)
         "a = Vy(1)",       # Second arg: j-variable, of size 1 (scalar)
         "x = Vx(3)",       # Third arg : i-variable, of size 3
         "y = Vy(3)"]       # Fourth arg: j-variable, of size 3

start = time.time()
my_routine = generic_sum(formula, *types)
c = my_routine(p, a, x, y, backend="CPU")

# N.B.: if CUDA is available, backend="auto" (or unspecified) and the arrays
#       are large enough, KeOps will load the data on the GPU, compute, and
#       unload the result back to the CPU, as this is assumed to be more
#       efficient. By specifying backend="CPU", we make sure that the result
#       is computed using a simple C++ for loop.
print("Time to compute the convolution operation on the cpu : ",
      round(time.time() - start, 2), "s")

#--------------------------------------------------------------#
#                          Gradient                             #
#--------------------------------------------------------------#
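# The gradient section itself is not included in this excerpt. A plausible
# continuation, under stated assumptions: `x` was created with
# requires_grad=True, and we backpropagate a test vector `e` through the
# KeOps output with torch.autograd.grad.
e = torch.rand_like(c)
start = time.time()
(g,) = torch.autograd.grad(c, [x], grad_outputs=e)
print("Time to compute the gradient on the cpu : ",
      round(time.time() - start, 2), "s")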
KeOps
=====
"""

import torch
import numpy as np
from time import time

nits = 10
Ns, D = [10000, 100000, 1000000], 3

from pykeops.torch import generic_sum

KP = generic_sum(
    "Exp(-SqDist(X,Y)) * B",  # Formula
    "A = Vi(1)",              # Output
    "X = Vi({})".format(D),   # 1st argument
    "Y = Vj({})".format(D),   # 2nd argument
    "B = Vj(1)")              # 3rd argument

for N in Ns:
    # Generate the data:
    x = torch.randn(N, D).cuda()
    y = torch.randn(N, D).cuda()
    p = torch.randn(N, 1).cuda()

    # First run just in case...
    p = KP(x, y, p)

    # Timings for KeOps:
    start = time()
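    # A plausible completion of the truncated timing loop (assumption, not the
    # original code): average `nits` convolutions, synchronizing the GPU
    # before reading the clock.
    for _ in range(nits):
        p = KP(x, y, p)
    torch.cuda.synchronize()
    print("KeOps, N = {:7}: {:.4f}s per convolution".format(N, (time() - start) / nits))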