def get_MH_Eval(k):
    """
    Return a numba-compiled pairwise kernel func(sx, sy, tx, ty) for parameter k.

    The kernel computes d = sqrt((tx-sx)**2 + (ty-sy)**2) and returns K0(k*d).
    Kernels are cached per value of k in the module-level dict
    MH_eval_functions, so each k is only built once.

    Two implementations are tried, in order:
      1. a FunctionGenerator approximation of scipy.special.k0, built on
         [0, 1000] with tol=1e-14 and extracted with check=False
         (NOTE(review): with check=False, arguments k*d outside [0, 1000]
         are presumably not bounds-checked -- confirm against the
         function_generator package)
      2. if anything in step 1 fails (package missing, construction error),
         a fallback kernel based on the module-level _numba_k0

    k: parameter multiplying the distance inside K0; also the cache key
    returns: the cached numba-jitted kernel for this k
    """
    if k not in MH_eval_functions:
        try:
            print('trying fast')
            from function_generator import FunctionGenerator
            from scipy.special import k0
            fast_k0 = FunctionGenerator(k0, 0.0, 1000.0, tol=1e-14, verbose=True)
            _fast_k0 = fast_k0.get_base_function(check=False)

            @numba.njit(fastmath=True)
            def func(sx, sy, tx, ty):
                dx = tx - sx
                dy = ty - sy
                d = np.sqrt(dx * dx + dy * dy)
                return _fast_k0(k * d)
            print('fast success')
        except Exception:
            # Best-effort fallback. Catch Exception rather than using a bare
            # except so that KeyboardInterrupt / SystemExit raised during the
            # (potentially slow) fast-path construction still propagate.
            @numba.njit(fastmath=True)
            def func(sx, sy, tx, ty):
                dx = tx - sx
                dy = ty - sy
                d = np.sqrt(dx * dx + dy * dy)
                return _numba_k0(k * d)
        MH_eval_functions[k] = func
    return MH_eval_functions[k]
error_model=error_model) build_time = time.time() - st fa = approx_func(xtest, check_bounds=False) out = np.empty(n, dtype=fa.dtype) st = time.time() fa = approx_func(xtest, check_bounds=False, out=out) approx_func_time1 = time.time() - st # test approximation function with checks fa = approx_func(xtest, check_bounds=True) st = time.time() fa1 = approx_func(xtest, check_bounds=True) approx_func_time2 = time.time() - st # extract serial function, and compile it base_func = approx_func.get_base_function(check=False) @numba.njit(parallel=True, fastmath=True) def func_eval(xs, out): for i in numba.prange(xs.size): out[i] = base_func(xs[i]) fa2 = np.empty_like(fa) func_eval(xtest, fa2) st = time.time() func_eval(xtest, fa2) approx_func_time3 = time.time() - st aerr = np.abs(fa - ft) rerr1 = np.abs(fa - ft) / np.abs(ft) scale = np.abs(ft)
If this fails, no comparison for correctness! On my macbook pro N=50,000 takes the direct method ~7s, the FMM <1s (with N_equiv=64, N_cutoff=500) And gives error <5e-14 """ cpu_num = int(os.cpu_count() / 2) helmholtz_k = 5.0 # fast version of Greens Function k0 = FunctionGenerator(scipy.special.k0, a=1e-20, b=1000, tol=1e-8, n=8) k1 = FunctionGenerator(scipy.special.k1, a=1e-20, b=1000, tol=1e-8, n=8) # extract compilable _k0 = k0.get_base_function() _k1 = k1.get_base_function() # Kernel @numba.njit(fastmath=True) def Eval(sx, sy, tx, ty): dx = tx - sx dy = ty - sy d = np.sqrt(dx**2 + dy**2) scale = 1.0 / (2 * np.pi) return _k0(helmholtz_k * d) * scale # Kernel @numba.njit(fastmath=True)
To compare to If this fails, no comparison for correctness! On my macbook pro N=50,000 takes the direct method ~7s, the FMM <1s (with N_equiv=64, N_cutoff=500) And gives error <5e-14 """ cpu_num = int(os.cpu_count()/2) helmholtz_k = 5.0 # fast version of Greens Function k0 = FunctionGenerator(scipy.special.k0, a=1e-20, b=1000, tol=1e-12, n=12) # extract compilable _k0 = k0.get_base_function() # Modified Helmholtz Kernel @numba.njit(fastmath=True) def Modified_Helmholtz_Eval(sx, sy, tx, ty): dx = tx-sx dy = ty-sy d = np.sqrt(dx**2 + dy**2) scale = 1.0/(2*np.pi) return _k0(helmholtz_k*d)*scale N_source = 1000*20 N_target = 1000*2000 test = 'circle' # clustered or circle or uniform reference_precision = 4
class ScalarGridBackend(object):
    """
    Re-usable backend holding the compiled local 'ewald' grid evaluators
    for a scalar greens function (see __init__ for the parameters)
    """

    def __init__(self, gf, fs, ifs, h, spread_width, kernel_kwargs=None, funcgen_tol=1e-10, inline_core=True):
        """
        Backend class for re-usable 'ewald' sum grid evaluation
        for reusability, the grid must have the same h
        and the ewald sum must use the same spread width

        gf: numba callable greens function, gf(r)
        fs: Fourier symbol for operator, fs(kx, ky)
        ifs: Inverse Fourier symbol for operator, ifs(kx, ky)
        h: grid spacing
        spread_width: width to do spreading on
            for Laplace, 15 gives ~7 digits
                         20 gives ~10 digits
            can't seem to do much better than that, right now
        kernel_kwargs: dict of arguments to be passed to gf, fs, ifs, tsgf functions
        funcgen_tol: tolerance for function generator representation of
            functions used in interior spread function. can't seem to beat
            ~10 digits overall now, so no real reason to do more than that
        inline_core: whether to inline the function generator functions into
            the compiled ewald functions
            (inlining may speed things up but slows compilation time,
            sometimes dramatically)

        raises: Exception, if either FunctionGenerator representation
            cannot be constructed (the underlying error is attached as
            the exception's __cause__)
        """
        self.kernel_kwargs = {} if kernel_kwargs is None else kernel_kwargs
        # bind the kernel keyword arguments into the user supplied callables
        self.gf = lambda r: gf(r, **self.kernel_kwargs)
        self.fourier_symbol = lambda kx, ky: fs(kx, ky, **self.kernel_kwargs)
        self.inverse_fourier_symbol = lambda kx, ky: ifs(kx, ky, **self.kernel_kwargs)
        self.h = h
        self.spread_width = spread_width
        self.funcgen_tol = funcgen_tol
        self.inline_core = inline_core
        # construct mollifier
        self.mollifier = SlepianMollifier(2 * self.spread_width)
        # physical width of the spreading region (spread_width grid cells)
        self.ssw = self.spread_width * self.h
        # screened greens function
        _excisor_gf = lambda d: excisor(d, 0.0, self.ssw, self.mollifier) * self.gf(d)
        try:
            self.ex_funcgen = FunctionGenerator(_excisor_gf, 0.0, self.ssw, tol=self.funcgen_tol, inline_core=self.inline_core)
            self.excisor_gf = self.ex_funcgen.get_base_function(check=False)
        except Exception as err:
            # only translate real errors (not KeyboardInterrupt/SystemExit),
            # and keep the original failure attached as the cause
            raise Exception('Failed constructing FunctionGenerator function for mollifier') from err
        # construct differential operator applied to residual of screened greens function
        # small auxiliary grid: _sn points per direction with spacing h,
        # covering [0, 4*ssw) in each direction
        _sn = 4 * self.spread_width
        _sgv = np.linspace(0, 4 * self.ssw, _sn, endpoint=False)
        _sgx, _sgy = np.meshgrid(_sgv, _sgv, indexing='ij')
        _skv = np.fft.fftfreq(_sn, self.h / (2 * np.pi))
        _skx, _sky = np.meshgrid(_skv, _skv, indexing='ij')
        _slap = self.fourier_symbol(_skx, _sky)
        # unit charge at the center of the auxiliary grid, evaluated at all nodes
        pt = np.array([[2 * self.ssw], [2 * self.ssw]])
        # np.vstack is what np.row_stack aliases; row_stack is deprecated
        targ = np.vstack([_sgx.ravel(), _sgy.ravel()])
        u = gf_apply(self.gf, pt[0], pt[1], targ[0], targ[1], np.array([1.0, ])).reshape(_sn, _sn)
        # the source sits on node (_sn//2, _sn//2); overwrite the self-evaluation there
        u[_sn // 2, _sn // 2] = 0.0
        dist = np.hypot(_sgx - 2 * self.ssw, _sgy - 2 * self.ssw)
        dec1 = excisor(dist, 0.0, self.ssw, self.mollifier)
        dec2 = excisor(dist, self.ssw, 2 * self.ssw, self.mollifier)
        uf = u * (1 - dec1) * dec2
        # apply the operator spectrally (multiply by its Fourier symbol)
        self.do_ufd = ifft2(fft2(uf) * _slap).real
        # get an interpolater for this
        _ax = np.linspace(np.pi, 1.5 * np.pi, 1000)
        _ay = np.repeat(np.pi, _ax.size)
        _ar = np.linspace(0, self.ssw, _ax.size)
        _fh = fft2(self.do_ufd) / (_sn * _sn)
        out = finufft.nufft2d2(_ax, _ay, _fh, isign=1, eps=1e-15, modeord=1)
        self._do_ufd_interpolater = sp.interpolate.InterpolatedUnivariateSpline(_ar, out.real, k=5, bbox=[0, self.ssw], ext=1)
        try:
            self.do_ufd_funcgen = FunctionGenerator(self._do_ufd_interpolater, 0.0, self.ssw, tol=self.funcgen_tol, inline_core=self.inline_core)
            self.do_ufd_interpolater = self.do_ufd_funcgen.get_base_function(check=False)
        except Exception as err:
            raise Exception('Failed constructing FunctionGenerator function for laplacian of greens function times mollifier') from err

    def initialize_periodic(self):
        """
        Define periodic local evaluator function

        Stores the compiled function as self.ewald_local_periodic; it has
        signature (source, charge, xv, yv) and returns (op, u), two arrays
        of shape (xv.size, yv.size)
        """
        _ex_gf = self.excisor_gf
        _do_ufd = self.do_ufd_interpolater
        h = self.h
        sw = self.spread_width

        @numba.njit(parallel=True, fastmath=True)
        def ewald_local_periodic(source, charge, xv, yv):
            xmin = xv[0]
            ymin = yv[0]
            # per-source work arrays, one (2*sw+2) x (2*sw+2) stencil each
            shape = (charge.size, 2 * sw + 2, 2 * sw + 2)
            fwork1 = np.empty(shape, dtype=numba.float64)
            fwork2 = np.empty(shape, dtype=numba.float64)
            iwork1 = np.empty(shape, dtype=numba.int64)
            iwork2 = np.empty(shape, dtype=numba.int64)
            # marks which stencil entries were actually filled
            bwork1 = np.zeros(shape, dtype=numba.boolean)
            sh = (xv.size, yv.size)
            op = np.zeros(sh, dtype=numba.float64)
            u = np.zeros_like(op)
            N = source.shape[1]
            nx = xv.size
            ny = yv.size
            md = sw * h
            # parallel pass: each source only writes to its own slice [i, :, :]
            for i in numba.prange(N):
                sx = source[0, i]
                sy = source[1, i]
                ch = charge[i]
                indx = int((sx - xmin) // h)
                indy = int((sy - ymin) // h)
                lxi = indx - sw - 1
                lyi = indy - sw - 1
                hxi = indx + sw + 1
                hyi = indy + sw + 1
                for ixind, ix in enumerate(range(lxi, hxi)):
                    # wrap the grid index periodically
                    ixm = ix % nx
                    xvh = xmin + ix * h
                    for iyind, iy in enumerate(range(lyi, hyi)):
                        iym = iy % ny
                        yvh = ymin + iy * h
                        d = np.hypot(xvh - sx, yvh - sy)
                        if d <= md:
                            fwork1[i, ixind, iyind] = _ex_gf(d) * ch
                            fwork2[i, ixind, iyind] = _do_ufd(d) * ch
                            iwork1[i, ixind, iyind] = ixm
                            iwork2[i, ixind, iyind] = iym
                            bwork1[i, ixind, iyind] = True
            # serial pass: accumulate onto the grid
            # (different sources may write to the same grid node)
            for i in range(N):
                for ixind in range(2 * sw + 2):
                    for iyind in range(2 * sw + 2):
                        if bwork1[i, ixind, iyind]:
                            ixm = iwork1[i, ixind, iyind]
                            iym = iwork2[i, ixind, iyind]
                            u[ixm, iym] += fwork1[i, ixind, iyind]
                            op[ixm, iym] += fwork2[i, ixind, iyind]
            return op, u
        self.ewald_local_periodic = ewald_local_periodic

    def initialize_freespace(self):
        """
        Define freespace local evaluator function

        Stores the compiled function as self.ewald_local_freespace; it has
        signature (source, charge, xv, yv, op, u, op_na), accumulates into
        the provided op and u arrays in place, and returns nothing.
        Grid indices are shifted by op_na before writing
        (NOTE(review): op_na is presumably the padding offset of op/u
        relative to the xv/yv grid -- confirm against the caller)
        """
        _ex_gf = self.excisor_gf
        _do_ufd = self.do_ufd_interpolater
        h = self.h
        sw = self.spread_width

        @numba.njit(parallel=True)
        def ewald_local_freespace(source, charge, xv, yv, op, u, op_na):
            xmin = xv[0]
            ymin = yv[0]
            # per-source work arrays, one (2*sw+2) x (2*sw+2) stencil each
            shape = (charge.size, 2 * sw + 2, 2 * sw + 2)
            fwork1 = np.empty(shape, dtype=numba.float64)
            fwork2 = np.empty(shape, dtype=numba.float64)
            iwork1 = np.empty(shape, dtype=numba.int64)
            iwork2 = np.empty(shape, dtype=numba.int64)
            # marks which stencil entries were actually filled
            bwork1 = np.zeros(shape, dtype=numba.boolean)
            N = source.shape[1]
            nx = xv.size
            ny = yv.size
            md = sw * h
            # parallel pass: each source only writes to its own slice [i, :, :]
            for i in numba.prange(N):
                sx = source[0, i]
                sy = source[1, i]
                ch = charge[i]
                indx = int((sx - xmin) // h)
                indy = int((sy - ymin) // h)
                lxi = indx - sw - 1
                lyi = indy - sw - 1
                hxi = indx + sw + 1
                hyi = indy + sw + 1
                for ixind, ix in enumerate(range(lxi, hxi)):
                    xvh = xmin + ix * h
                    for iyind, iy in enumerate(range(lyi, hyi)):
                        yvh = ymin + iy * h
                        d = np.hypot(xvh - sx, yvh - sy)
                        if d <= md:
                            fwork1[i, ixind, iyind] = _ex_gf(d) * ch
                            fwork2[i, ixind, iyind] = _do_ufd(d) * ch
                            # no periodic wrap here: shift into the op/u index space
                            iwork1[i, ixind, iyind] = ix + op_na
                            iwork2[i, ixind, iyind] = iy + op_na
                            bwork1[i, ixind, iyind] = True
            # serial pass: accumulate onto the grid
            # (different sources may write to the same grid node)
            for i in range(N):
                for ixind in range(2 * sw + 2):
                    for iyind in range(2 * sw + 2):
                        if bwork1[i, ixind, iyind]:
                            ix = iwork1[i, ixind, iyind]
                            iy = iwork2[i, ixind, iyind]
                            u[ix, iy] += fwork1[i, ixind, iyind]
                            op[ix, iy] += fwork2[i, ixind, iyind]
        self.ewald_local_freespace = ewald_local_freespace

    def check_periodic(self, xv, yv):
        """
        Check that the grid vectors xv, yv are usable with the periodic
        evaluator (spacing must match the backend's h); raises Exception if not
        """
        self.check_either(xv, yv)

    def check_freespace(self, xv, yv):
        """
        Check that the grid vectors xv, yv are usable with the freespace
        evaluator: spacing must match the backend's h, and xv and yv must
        have the same size; raises Exception if not
        """
        self.check_either(xv, yv)
        if xv.size != yv.size:
            raise Exception('Square grid required for freespace evaluator')

    def check_either(self, xv, yv):
        """
        Check that the spacing of xv and of yv (taken from their first two
        entries) agrees with the backend's h to within 1e-15;
        raises Exception if not
        """
        xh = xv[1] - xv[0]
        if np.abs(xh - self.h) > 1e-15:
            raise Exception('h of input xv vector not same as backend')
        yh = yv[1] - yv[0]
        if np.abs(yh - self.h) > 1e-15:
            raise Exception('h of input yv vector not same as backend')
fk0 = FunctionGenerator(k0, 0, 200, tol=1e-14, n=8, mw=1e-15, error_model=relative_error_model) fk1 = FunctionGenerator(k1, 0, 200, tol=1e-14, n=8, mw=1e-15, error_model=relative_error_model) _fk0 = fk0.get_base_function() _fk1 = fk1.get_base_function() @numba.njit() def _fg_k0(x): if x > 200: return 0.0 else: return _fk0(x) @numba.njit() def _fg_k1(x): if x > 200: return 0.0
import mkl
mkl.set_num_threads(cpu_num)


# Greens Function
def GF(x):
    """
    Greens function of the scalar argument x, assembled from yn of order 1
    and struve of orders -3 and -2
    """
    bessel_term = yn(1, x)
    struve_m3 = struve(-3, x)
    struve_m2 = struve(-2, x)
    numerator = x * (bessel_term - struve_m3) - 4 * struve_m2
    return numerator / x**2


# fast version of Greens Function (function generator built on [1e-30, 1000])
gf = FunctionGenerator(GF, a=1e-30, b=1000, tol=1e-12)
# extract compilable
_gf = gf.get_base_function()


# Kernel
@numba.njit(fastmath=True)
def Kernel(sx, sy, tx, ty):
    # greens function of the distance between source (sx, sy) and target (tx, ty)
    xdiff = tx - sx
    ydiff = ty - sy
    return _gf(np.sqrt(xdiff**2 + ydiff**2))


# number of sources / targets
N_source = N_target = 1000 * 10

# construct some data to run FMM on