def kern_MKL_sparse(nsteps,
                    dX,
                    rho_inv,
                    int_m,
                    dec_m,
                    phi,
                    grid_idcs,
                    mu_egrid=None,
                    mu_dEdX=None,
                    mu_lidx_nsp=None,
                    prog_bar=None):
    """`Intel MKL sparse BLAS
    <https://software.intel.com/en-us/articles/intel-mkl-sparse-blas-overview?language=en>`_
    implementation of forward-euler integration.

    Function requires that the path to the MKL runtime library
    ``libmkl_rt.[so/dylib]`` is defined in the config file
    (``config['MKL_path']``).

    Each step performs::

        delta_phi = int_m.dot(phi) + rho_inv[step] * dec_m.dot(phi)
        phi      += dX[step] * delta_phi

    If ``config['enable_muon_energy_loss']`` is set, the blocks of the state
    vector selected by ``mu_lidx_nsp`` are additionally re-interpolated on
    ``mu_egrid`` whenever the accumulated step length exceeds
    ``config['muon_energy_loss_min_step']`` (and on the last step).

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes
        :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values
        :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix`; accessed
        through ``.data``, ``.indices`` and ``.indptr`` (CSR layout)
      dec_m (numpy.array): decay matrix :eq:`dec_matrix`; accessed through
        ``.data``, ``.indices`` and ``.indptr`` (CSR layout)
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs (list): step indices at which longitudinal solutions have
        to be saved; may be empty or ``None``
      mu_egrid (numpy.array, optional): energy grid used for the energy
        loss interpolation; required only if energy loss is enabled
      mu_dEdX (numpy.array, optional): values added (scaled by the
        accumulated step length) to ``mu_egrid`` to obtain the shifted grid;
        required only if energy loss is enabled
      mu_lidx_nsp (tuple, optional): pair ``(lidx, nmuspec)``: index of the
        first affected element of the state vector and number of
        consecutive blocks of length ``mu_egrid.size``; required only if
        energy loss is enabled
      prog_bar (object,optional): handle to :class:`ProgressBar` object

    Returns:
      tuple: ``(phi, grid_sol)`` where ``phi`` is the state vector
      :math:`\\Phi(X_{nsteps})` after integration and ``grid_sol`` is the
      list of copies of the state vector saved at ``grid_idcs``

    Raises:
      Exception: if the MKL runtime library can not be loaded or an unknown
        floating point precision is configured
      ValueError: if energy loss is enabled but one of ``mu_egrid``,
        ``mu_dEdX`` or ``mu_lidx_nsp`` is missing
    """
    from ctypes import cdll, c_int, c_char, POINTER, byref
    from time import time

    try:
        mkl = cdll.LoadLibrary(config['MKL_path'])
    except OSError:
        raise Exception("kern_MKL_sparse(): MKL runtime library not " +
                        "found. Please check path.")

    if config['FP_precision'] == 32:
        from ctypes import c_float as fl_pr
        # sparse CSR-matrix x dense vector
        gemv = mkl.mkl_scsrmv
        # dense vector + dense vector
        axpy = mkl.cblas_saxpy
        np_fl = np.float32
    elif config['FP_precision'] == 64:
        from ctypes import c_double as fl_pr
        # sparse CSR-matrix x dense vector
        gemv = mkl.mkl_dcsrmv
        # dense vector + dense vector
        axpy = mkl.cblas_daxpy
        np_fl = np.float64
    else:
        raise Exception("kern_MKL_sparse(): Unknown precision specified.")

    # Set number of threads
    mkl.mkl_set_num_threads(byref(c_int(config['MKL_threads'])))

    # Prepare CTYPES pointers for MKL sparse CSR BLAS.
    # NOTE(review): the buffers are reinterpreted, not converted; this
    # assumes the matrix data already has the configured precision and that
    # the index arrays hold C ints -- confirm against the caller.
    int_m_data = int_m.data.ctypes.data_as(POINTER(fl_pr))
    int_m_ci = int_m.indices.ctypes.data_as(POINTER(c_int))
    int_m_pb = int_m.indptr[:-1].ctypes.data_as(POINTER(c_int))
    int_m_pe = int_m.indptr[1:].ctypes.data_as(POINTER(c_int))

    dec_m_data = dec_m.data.ctypes.data_as(POINTER(fl_pr))
    dec_m_ci = dec_m.indices.ctypes.data_as(POINTER(c_int))
    dec_m_pb = dec_m.indptr[:-1].ctypes.data_as(POINTER(c_int))
    dec_m_pe = dec_m.indptr[1:].ctypes.data_as(POINTER(c_int))

    # npphi owns the memory; ``phi`` is only a raw pointer into it, so all
    # numpy-side modifications of npphi must be done in place.
    npphi = np.copy(phi).astype(np_fl)
    phi = npphi.ctypes.data_as(POINTER(fl_pr))
    npdelta_phi = np.zeros_like(npphi)
    delta_phi = npdelta_phi.ctypes.data_as(POINTER(fl_pr))

    # Byte literals work on Python 2 and 3 (c_char rejects str on Python 3)
    trans = c_char(b'n')
    # Matrix descriptor: only entries 0 and 3 are set here
    npmatd = np.chararray(6)
    npmatd[0] = b'G'
    npmatd[3] = b'C'
    matdsc = npmatd.ctypes.data_as(POINTER(c_char))
    m = c_int(int_m.shape[0])
    cdzero = fl_pr(0.)
    cdone = fl_pr(1.)
    cione = c_int(1)

    enmuloss = config['enable_muon_energy_loss']
    mu_blocks = []
    muloss_min_step = None
    if enmuloss:
        # The energy loss arguments default to None; only touch them when
        # the feature is actually switched on.
        if mu_egrid is None or mu_dEdX is None or mu_lidx_nsp is None:
            raise ValueError(
                "kern_MKL_sparse(): Muon energy loss is enabled but " +
                "mu_egrid, mu_dEdX or mu_lidx_nsp was not supplied.")
        de = mu_egrid.size
        mu_egrid = mu_egrid.astype(np_fl)
        mu_dEdX = mu_dEdX.astype(np_fl)
        muloss_min_step = config['muon_energy_loss_min_step']
        lidx, nmuspec = mu_lidx_nsp
        # Slices of the state vector that receive the energy loss treatment
        mu_blocks = [
            slice(lidx + de * nsp, lidx + de * (nsp + 1))
            for nsp in range(nmuspec)
        ]

    # Accumulate at least a few g/cm2 for energy loss steps
    # to avoid numerical errors
    dXaccum = 0.

    # len() instead of truthiness, so numpy arrays are accepted as well
    n_grid = len(grid_idcs) if grid_idcs is not None else 0
    grid_step = 0
    grid_sol = []

    start = time()

    for step in range(nsteps):
        if prog_bar:
            prog_bar.update(step)

        # delta_phi = int_m.dot(phi)
        gemv(byref(trans), byref(m), byref(m), byref(cdone), matdsc,
             int_m_data, int_m_ci, int_m_pb, int_m_pe, phi, byref(cdzero),
             delta_phi)
        # delta_phi = rho_inv * dec_m.dot(phi) + delta_phi
        gemv(byref(trans), byref(m), byref(m), byref(fl_pr(rho_inv[step])),
             matdsc, dec_m_data, dec_m_ci, dec_m_pb, dec_m_pe, phi,
             byref(cdone), delta_phi)
        # phi = delta_phi * dX + phi
        axpy(m, fl_pr(dX[step]), delta_phi, cione, phi, cione)

        dXaccum += dX[step]

        if enmuloss and (dXaccum > muloss_min_step or step == nsteps - 1):
            shifted_grid = mu_egrid + mu_dEdX * dXaccum
            for block in mu_blocks:
                # Slice assignment keeps the update in place (see npphi)
                npphi[block] = np.interp(mu_egrid, shifted_grid,
                                         npphi[block])
            dXaccum = 0.

        if grid_step < n_grid and grid_idcs[grid_step] == step:
            grid_sol.append(np.copy(npphi))
            grid_step += 1

    if dbg:
        # max() avoids a division by zero when no steps were requested
        print("Performance: {0:6.2f}ms/iteration".format(
            1e3 * (time() - start) / float(max(nsteps, 1))))

    return npphi, grid_sol
def solv_MKL_sparse(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs):
    """`Intel MKL sparse BLAS
    <https://software.intel.com/en-us/articles/intel-mkl-sparse-blas-overview?language=en>`_
    implementation of forward-euler integration.

    The MKL runtime library handle is taken from ``mceq_config.mkl``. The
    computation is always carried out in double precision.

    Each step performs::

        delta_phi = int_m.dot(phi) + rho_inv[step] * dec_m.dot(phi)
        phi      += dX[step] * delta_phi

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes
        :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values
        :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix`; accessed
        through ``.data``, ``.indices`` and ``.indptr`` (CSR layout)
      dec_m (numpy.array): decay matrix :eq:`dec_matrix`; accessed through
        ``.data``, ``.indices`` and ``.indptr`` (CSR layout)
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs (list): step indices at which longitudinal solutions have
        to be saved; may be empty or ``None``

    Returns:
      tuple: ``(phi, grid_sol)`` where ``phi`` is the state vector
      :math:`\\Phi(X_{nsteps})` after integration and ``grid_sol`` is a
      numpy array built from the copies of the state vector saved at
      ``grid_idcs``
    """
    from ctypes import c_int, c_char, POINTER, byref
    from ctypes import c_double as fl_pr
    from time import time

    from mceq_config import mkl

    # sparse CSR-matrix x dense vector
    gemv = mkl.mkl_dcsrmv
    # dense vector + dense vector
    axpy = mkl.cblas_daxpy
    np_fl = np.float64

    # Prepare CTYPES pointers for MKL sparse CSR BLAS.
    # NOTE(review): the buffers are reinterpreted, not converted; this
    # assumes the matrix data is float64 and that the index arrays hold
    # C ints -- confirm against the caller.
    int_m_data = int_m.data.ctypes.data_as(POINTER(fl_pr))
    int_m_ci = int_m.indices.ctypes.data_as(POINTER(c_int))
    int_m_pb = int_m.indptr[:-1].ctypes.data_as(POINTER(c_int))
    int_m_pe = int_m.indptr[1:].ctypes.data_as(POINTER(c_int))

    dec_m_data = dec_m.data.ctypes.data_as(POINTER(fl_pr))
    dec_m_ci = dec_m.indices.ctypes.data_as(POINTER(c_int))
    dec_m_pb = dec_m.indptr[:-1].ctypes.data_as(POINTER(c_int))
    dec_m_pe = dec_m.indptr[1:].ctypes.data_as(POINTER(c_int))

    # npphi owns the memory; ``phi`` is only a raw pointer into it, which
    # is why npphi reflects the result of the MKL calls below.
    npphi = np.copy(phi).astype(np_fl)
    phi = npphi.ctypes.data_as(POINTER(fl_pr))
    npdelta_phi = np.zeros_like(npphi)
    delta_phi = npdelta_phi.ctypes.data_as(POINTER(fl_pr))

    trans = c_char(b'n')
    # Matrix descriptor: only entries 0 and 3 are set here
    npmatd = np.chararray(6)
    npmatd[0] = b'G'
    npmatd[3] = b'C'
    matdsc = npmatd.ctypes.data_as(POINTER(c_char))
    m = c_int(int_m.shape[0])
    cdzero = fl_pr(0.)
    cdone = fl_pr(1.)
    cione = c_int(1)

    # len() instead of truthiness, so numpy arrays are accepted as well
    n_grid = len(grid_idcs) if grid_idcs is not None else 0
    grid_step = 0
    grid_sol = []

    start = time()

    for step in range(nsteps):
        # delta_phi = int_m.dot(phi)
        gemv(byref(trans), byref(m), byref(m), byref(cdone), matdsc,
             int_m_data, int_m_ci, int_m_pb, int_m_pe, phi, byref(cdzero),
             delta_phi)
        # delta_phi = rho_inv * dec_m.dot(phi) + delta_phi
        gemv(byref(trans), byref(m), byref(m), byref(fl_pr(rho_inv[step])),
             matdsc, dec_m_data, dec_m_ci, dec_m_pb, dec_m_pe, phi,
             byref(cdone), delta_phi)
        # phi = delta_phi * dX + phi
        axpy(m, fl_pr(dX[step]), delta_phi, cione, phi, cione)

        if grid_step < n_grid and grid_idcs[grid_step] == step:
            grid_sol.append(np.copy(npphi))
            grid_step += 1

    # The message is formatted eagerly, so guard against nsteps == 0 here
    info(
        2, "Performance: {0:6.2f}ms/iteration".format(
            1e3 * (time() - start) / float(max(nsteps, 1))))

    return npphi, np.asarray(grid_sol)
def kern_CUDA_dense(nsteps,
                    dX,
                    rho_inv,
                    int_m,
                    dec_m,
                    phi,
                    grid_idcs,
                    mu_egrid=None,
                    mu_dEdX=None,
                    mu_lidx_nsp=None,
                    prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_
    implementation of forward-euler integration.

    Function requires a working :mod:`numbapro` installation (imported here
    as ``accelerate.cuda``). It is typically slower compared to
    :func:`kern_MKL_sparse` but it depends on your hardware.

    Each step performs::

        delta_phi = int_m.dot(phi) + rho_inv[step] * dec_m.dot(phi)
        phi      += dX[step] * delta_phi

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes
        :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values
        :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense
        representation
      dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense
        representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs (list): accepted for signature compatibility with the other
        kernels; not used by this solver
      mu_egrid (optional): not used by this solver
      mu_dEdX (optional): not used by this solver
      mu_lidx_nsp (optional): not used by this solver
      prog_bar (object,optional): handle to :class:`ProgressBar` object

    Returns:
      tuple: ``(phi, [])`` where ``phi`` is the state vector
      :math:`\\Phi(X_{nsteps})` after integration, copied back to the host;
      the second element is always an empty list because no intermediate
      solutions are stored

    Raises:
      Exception: if an unknown floating point precision is configured or
        the CUDA libraries can not be imported
      NotImplementedError: if ``config['enable_muon_energy_loss']`` is set
    """
    from time import time

    if config['FP_precision'] == 32:
        fl_pr = np.float32
    elif config['FP_precision'] == 64:
        fl_pr = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    if config['enable_muon_energy_loss']:
        raise NotImplementedError(
            'kern_CUDA_dense(): ' +
            'Energy loss not imlemented for this solver.')

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from accelerate.cuda.blas import Blas
        from accelerate.cuda import cuda
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")

    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(fl_pr), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(fl_pr), stream)
    cu_curr_phi = cuda.to_device(phi.astype(fl_pr), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=fl_pr)

    start = time()

    for step in range(nsteps):
        if prog_bar:
            prog_bar.update(step)

        # delta_phi = int_m.dot(phi)
        cubl.gemv(trans='N',
                  m=m,
                  n=n,
                  alpha=fl_pr(1.0),
                  A=cu_int_m,
                  x=cu_curr_phi,
                  beta=fl_pr(0.0),
                  y=cu_delta_phi)
        # delta_phi = rho_inv * dec_m.dot(phi) + delta_phi
        cubl.gemv(trans='N',
                  m=m,
                  n=n,
                  alpha=fl_pr(rho_inv[step]),
                  A=cu_dec_m,
                  x=cu_curr_phi,
                  beta=fl_pr(1.0),
                  y=cu_delta_phi)
        # phi = delta_phi * dX + phi
        cubl.axpy(alpha=fl_pr(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    # max() avoids a division by zero when no steps were requested
    print("Performance: {0:6.2f}ms/iteration".format(
        1e3 * (time() - start) / float(max(nsteps, 1))))

    return cu_curr_phi.copy_to_host(), []