def solve(self, wt_n, y_nd, bend_coef, rot_coef, f_res):
    """Solve the weighted, regularized TPS fitting problem into ``f_res``.

    The left-hand side is accumulated on the GPU (NHN + QN^T W QN), pulled
    back to the host, and the resulting linear system is solved on the CPU
    with scipy; the solution is mapped through N and written into ``f_res``
    via ``set_ThinPlateSpline``.

    Args:
        wt_n: per-point weights, shape (n,) — assumed broadcastable against
            y_nd via ``wt_n[:, None]``.
        y_nd: target points, shape (n, d).
        bend_coef: bending regularization coefficient; must be one of the
            coefficients this solver was prepared for (``self.bend_coefs``).
        rot_coef: rotation regularization coefficient; must match
            ``self.rot_coef`` (checked with ``np.allclose``).
        f_res: output spline object, filled in by ``set_ThinPlateSpline``.

    Raises:
        RuntimeError: if the inputs are inconsistent with this solver's
            precomputed state, or the solver is not in a valid state.
    """
    # Explicit raises instead of bare asserts: asserts vanish under
    # `python -O`, and the sibling solvers already validate with
    # RuntimeError, so this also makes the error-handling style consistent.
    if y_nd.shape != (self.n, self.d):
        raise RuntimeError(
            "The dimensions of y_nd doesn't match the dimensions of x_nd")
    if bend_coef not in self.bend_coefs:
        raise RuntimeError(
            "bend_coef is not one of the coefficients this solver was prepared for")
    if not np.allclose(rot_coef, self.rot_coef):
        raise RuntimeError(
            "rot_coef doesn't match the rot_coef this solver was built with")
    if not self.valid:
        raise RuntimeError("solver is not in a valid state")
    # Load the bend_coef/weight-dependent precomputed terms into the GPU buffers.
    self.initialize_solver(bend_coef, wt_n)
    # NHN_gpu += QN^T * WQN (beta=1 accumulates into whatever
    # initialize_solver left in NHN_gpu — presumably the regularization
    # terms; TODO confirm against initialize_solver).
    gemm(self.QN_gpu, self.WQN_gpu, self.NHN_gpu, transa='T', alpha=1, beta=1)
    lhs = self.NHN_gpu.get()  # bring the assembled LHS back to the host
    # Right-hand side is built on the CPU: NR + QN^T (W y).
    wy_nd = wt_n[:, None] * y_nd
    rhs = self.NR + self.QN.T.dot(wy_nd)
    z = scipy.linalg.solve(lhs, rhs)
    theta = self.N.dot(z)  # map the reduced solution back to full spline params
    set_ThinPlateSpline(f_res, self.x_nd, theta)
def solve(self, wt_n, y_nd, bend_coef, f_res):
    """Solve the weighted, regularized TPS fitting problem on the GPU.

    Assembles lhs = bend_coef * NKN + NRN + (sqrt(W) QN)^T (sqrt(W) QN) and
    rhs = NR + QN^T (W y) in device buffers, solves for z, and stores the
    result in ``f_res`` via ``f_res.update`` (including N and z).

    Args:
        wt_n: per-point weights, shape (n,) — assumed broadcastable via
            ``wt_n[:, None]``; TODO confirm.
        y_nd: target points, shape (n, d); must be C-contiguous.
        bend_coef: bending regularization coefficient scaling NKN.
        f_res: result object updated in place with the fitted parameters.

    Raises:
        RuntimeError: if y_nd has the wrong shape or is not C-contiguous.
    """
    if y_nd.shape[0] != self.n or y_nd.shape[1] != self.d:
        raise RuntimeError(
            "The dimensions of y_nd doesn't match the dimensions of x_nd")
    if not y_nd.flags.c_contiguous:
        raise RuntimeError("Expected y_nd to be c-contiguous but it isn't")
    # Async host->device upload of sqrt(W) QN; NOTE(review): correctness
    # relies on the transfer completing before the gemm below consumes it —
    # presumably both are queued on the same stream; confirm.
    self.sqrtWQN_gpu.set_async(np.sqrt(wt_n)[:, None] * self.QN)
    # lhs_gpu = bend_coef * NKN + NRN (geam: alpha*A + beta*B — TODO confirm
    # argument convention of this geam wrapper).
    geam(self.NKN_gpu, self.NRN_gpu, self.lhs_gpu, alpha=bend_coef, beta=1)
    # lhs_gpu += (sqrt(W) QN)^T (sqrt(W) QN); beta=1 accumulates in place.
    gemm(self.sqrtWQN_gpu, self.sqrtWQN_gpu, self.lhs_gpu,
         transa='T', alpha=1, beta=1)
    # Seed rhs_gpu with NR by a device-to-device copy, then accumulate.
    drv.memcpy_dtod_async(self.rhs_gpu.gpudata, self.NR_gpu.gpudata,
                          self.rhs_gpu.nbytes)
    self.y_dnW_gpu.set_async(
        y_nd.T * wt_n)  # use transpose so that it is f_contiguous
    # rhs_gpu += QN^T (W y)  (both operands transposed).
    gemm(self.QN_gpu, self.y_dnW_gpu, self.rhs_gpu,
         transa='T', transb='T', alpha=1, beta=1)
    if lfd.registration._has_cula:
        # In-place Cholesky solve: rhs_gpu is overwritten with z.
        culinalg.cho_solve(self.lhs_gpu, self.rhs_gpu)
        z = self.rhs_gpu.get()
        culinalg.dot(self.N_gpu, self.rhs_gpu, out=self.theta_gpu)
        theta = self.theta_gpu.get()
    else:
        # If CULA is not installed, perform the last two computations on the CPU.
        z = np.linalg.solve(self.lhs_gpu.get(), self.rhs_gpu.get())
        theta = self.N.dot(z)
    f_res.update(self.x_nd, y_nd, bend_coef, self.rot_coef, wt_n, theta,
                 N=self.N, z=z)
def solve(self, wt_n, y_nd, bend_coef, f_res):
    """Solve the weighted, regularized TPS fitting problem on the GPU.

    Assembles lhs = bend_coef * NKN + NRN + (sqrt(W) QN)^T (sqrt(W) QN) and
    rhs = NR + QN^T (W y) in device buffers, solves for the reduced
    parameters, and stores the fitted spline in ``f_res`` via
    ``f_res.set_ThinPlateSpline``.

    Args:
        wt_n: per-point weights, shape (n,) — assumed broadcastable via
            ``wt_n[:, None]``; TODO confirm.
        y_nd: target points, shape (n, d); must be C-contiguous.
        bend_coef: bending regularization coefficient scaling NKN.
        f_res: result spline object, updated in place.

    Raises:
        RuntimeError: if y_nd has the wrong shape or is not C-contiguous.
    """
    if y_nd.shape[0] != self.n or y_nd.shape[1] != self.d:
        raise RuntimeError("The dimensions of y_nd doesn't match the dimensions of x_nd")
    if not y_nd.flags.c_contiguous:
        raise RuntimeError("Expected y_nd to be c-contiguous but it isn't")
    # Async host->device upload of sqrt(W) QN; NOTE(review): correctness
    # relies on the transfer completing before the gemm below consumes it —
    # presumably both are queued on the same stream; confirm.
    self.sqrtWQN_gpu.set_async(np.sqrt(wt_n)[:,None] * self.QN)
    # lhs_gpu = bend_coef * NKN + NRN (geam: alpha*A + beta*B — TODO confirm
    # argument convention of this geam wrapper).
    geam(self.NKN_gpu, self.NRN_gpu, self.lhs_gpu, alpha=bend_coef, beta=1)
    # lhs_gpu += (sqrt(W) QN)^T (sqrt(W) QN); beta=1 accumulates in place.
    gemm(self.sqrtWQN_gpu, self.sqrtWQN_gpu, self.lhs_gpu, transa='T', alpha=1, beta=1)
    # Seed rhs_gpu with NR by a device-to-device copy, then accumulate.
    drv.memcpy_dtod_async(self.rhs_gpu.gpudata, self.NR_gpu.gpudata, self.rhs_gpu.nbytes)
    self.y_dnW_gpu.set_async(y_nd.T * wt_n) # use transpose so that it is f_contiguous
    # rhs_gpu += QN^T (W y)  (both operands transposed).
    gemm(self.QN_gpu, self.y_dnW_gpu, self.rhs_gpu, transa='T', transb='T', alpha=1, beta=1)
    if lfd.registration._has_cula:
        # In-place Cholesky solve: rhs_gpu is overwritten with the solution,
        # then mapped through N entirely on the device.
        culinalg.cho_solve(self.lhs_gpu, self.rhs_gpu)
        culinalg.dot(self.N_gpu, self.rhs_gpu, out=self.theta_gpu)
        theta = self.theta_gpu.get()
    else:
        # If CULA is not installed, perform the last two computations on the CPU.
        z = np.linalg.solve(self.lhs_gpu.get(), self.rhs_gpu.get())
        theta = self.N.dot(z)
    f_res.set_ThinPlateSpline(self.x_nd, y_nd, bend_coef, self.rot_coef, wt_n, theta=theta)