def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically
    slower compared to :func:`kern_MKL_sparse` but it depends on your hardware.

    For every step the kernel issues two ``gemv`` calls (``trans='T'``) that
    accumulate the interaction term and the ``rho_inv[step]``-scaled decay
    term into a scratch vector, followed by one ``axpy`` that adds
    ``dX[step]`` times that scratch vector to the state vector.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense representation
      dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs: not used by this kernel
      prog_bar (object,optional): handle to :class:`ProgressBar` object

    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration

    Raises:
      Exception: if ``config['CUDA_precision']`` is neither 32 nor 64, or if
        the numbapro CUDA libraries can not be imported.
    """

    calc_precision = None
    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")

    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)
    # The uploads above were queued on `stream`, but the cuBLAS handle was
    # created without a stream; wait for the copies before integrating.
    stream.synchronize()

    # Scalars are cast with the configured precision. Casting them to
    # float32 unconditionally would truncate dX and rho_inv to single
    # precision even when the arrays are float64.
    one = calc_precision(1.0)
    zero = calc_precision(0.0)

    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)
        # delta_phi <- interaction term (beta = 0 overwrites the scratch)
        cubl.gemv(trans='T', m=m, n=n, alpha=one, A=cu_int_m,
                  x=cu_curr_phi, beta=zero, y=cu_delta_phi)
        # delta_phi <- rho_inv[step] * decay term + delta_phi
        cubl.gemv(trans='T', m=m, n=n, alpha=calc_precision(rho_inv[step]),
                  A=cu_dec_m, x=cu_curr_phi, beta=one, y=cu_delta_phi)
        # phi <- phi + dX[step] * delta_phi
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi,
                  y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
def sl_mst_lifetime_gpu(dest, weight, fe, od, disconnect_weight = None,
                        MAX_TPB = 256, stream = None):
    """
    Label the vertices of a graph by cutting its minimum spanning tree at
    the largest lifetime (gap between consecutive sorted MST edge weights).

    The MST is built with boruvka_minho_gpu, its edge weights are gathered
    and sorted, the lifetimes are computed, and - if the largest lifetime is
    at least ``disconnect_weight`` minus the last sorted MST weight - the
    edges from the cut onward are removed. The remaining edges are turned
    back into a graph whose connected components are returned as labels.

    Inputs:
     dest, weight, fe, od : graph arrays handed to boruvka_minho_gpu and
                            getGraphFromEdges_gpu (the original notes say
                            these are device arrays)
     disconnect_weight    : weight between unconnected vertices; defaults
                            to weight.max()
     MAX_TPB              : number of threads per block
     stream               : CUDA stream to use; a new one is created when
                            None

    Returns:
     labels : result of connected_comps_gpu on the (possibly cut) MST graph

    TODO:
     - argmax is from cuBlas and only works with 32/64 floats. Make this work 
        with any type.
    """

    if disconnect_weight is None:
        disconnect_weight = weight.max()

    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream

    mst, n_edges = boruvka_minho_gpu(dest, weight, fe, od,
                                     MAX_TPB=MAX_TPB, stream=myStream,
                                     returnDevAry=True)

    # Allocate array for the mst weights.
    h_n_edges = int(n_edges.getitem(0, stream=myStream))  # edges to keep in MST
    mst_weights = cuda.device_array(h_n_edges, dtype=weight.dtype)

    # Get array with only the considered weights in the MST
    # and remove those edges in the MST edge list
    mstGrid = compute_cuda_grid_dim(h_n_edges, MAX_TPB)
    d_weight = cuda.to_device(weight, stream = myStream)
    getWeightsOfEdges_gpu[mstGrid, MAX_TPB, myStream](mst, n_edges, d_weight,
                                                      mst_weights)

    # Sort the MST weights. There are no repeated edges at this
    # point since the output MST is like a directed graph.
    # NOTE(review): the code below reads mst_weights as if it were sorted,
    # so argsort presumably also sorts the keys in place - confirm.
    sorter = RadixSort(maxcount = mst_weights.size, dtype = mst_weights.dtype,
                       stream = myStream)
    sortedWeightArgs = sorter.argsort(mst_weights)

    # Allocate array for the lifetimes.
    lifetimes = cuda.device_array(mst_weights.size - 1, dtype=mst_weights.dtype)
    compute_lifetimes_CUDA[mstGrid, MAX_TPB, myStream](mst_weights, lifetimes)

    # Bind cuBLAS to the stream everything else is queued on. The raw
    # `stream` argument may be None, in which case it is not that stream.
    maxer = Blas(myStream)
    arg_max_lt = maxer.amax(lifetimes)
    max_lt = lifetimes.getitem(arg_max_lt)

    # this is the lifetime between edges with no connection and the weakest link
    #lt_threshold = disconnect_weight - max_lt
    lt_threshold = disconnect_weight - mst_weights.getitem(mst_weights.size - 1)

    # if the maximum lifetime is higher or equal than the lifetime threshold
    # cut the tree
    if max_lt >= lt_threshold:
        # from arg_max_lt onward all edges are discarded
        # NOTE(review): count kept exactly as in the original; confirm the
        # "+ 1" against the indexing convention of Blas.amax and removeEdges.
        n_discarded = lifetimes.size - arg_max_lt + 1

        # remove edges; the MST edge list and the argsort result are the
        # locals `mst` and `sortedWeightArgs` (the previous names were
        # undefined and raised NameError).
        removeGrid = compute_cuda_grid_dim(n_discarded, MAX_TPB)
        removeEdges[removeGrid, MAX_TPB, myStream](mst, sortedWeightArgs,
                                                   n_discarded)

        # compute new amount of edges and update it
        new_n_edges = h_n_edges - n_discarded
        cuda.to_device(np.array([new_n_edges], dtype = n_edges.dtype),
                       to = n_edges,
                       stream = myStream)

    ngraph = getGraphFromEdges_gpu(dest, weight, fe, od, edges = mst,
                                   n_edges = n_edges, MAX_TPB = MAX_TPB,
                                   stream = myStream)

    ndest, nweight, nfe, nod = ngraph

    labels = connected_comps_gpu(ndest, nweight, nfe, nod,
                                 MAX_TPB = 512, stream = myStream)

    del ndest, nweight, nfe, nod, lifetimes

    return labels
def kern_CUDA_sparse(nsteps, dX, rho_inv, int_m, dec_m,
                     phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuSPARSE <https://developer.nvidia.com/cusparse>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation.

    For every step the kernel issues two ``csrmv`` calls (``trans='T'``) that
    accumulate the interaction term and the ``rho_inv[step]``-scaled decay
    term into a scratch vector, followed by one cuBLAS ``axpy`` that adds
    ``dX[step]`` times that scratch vector to the state vector.

    Note:
      Currently some bug in :mod:`numbapro` introduces unnecessary array copies and
      slows down the execution tremendously.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m: interaction matrix :eq:`int_matrix` in sparse representation;
        must expose ``shape``, ``nnz``, ``data``, ``indptr`` and ``indices``
      dec_m: decay matrix :eq:`dec_matrix` in sparse representation;
        must expose ``nnz``, ``data``, ``indptr`` and ``indices``
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs: not used by this kernel
      prog_bar (object,optional): handle to :class:`ProgressBar` object

    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration

    Raises:
      Exception: if ``config['CUDA_precision']`` is neither 32 nor 64, or if
        the numbapro CUDA libraries can not be imported.
    """

    calc_precision = None
    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_sparse(): Unknown precision specified.")

    print ("kern_CUDA_sparse(): Warning, the performance is slower than " +
           "dense cuBLAS or any type of MKL.")

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib import cusparse  # @UnresolvedImport
        from numbapro.cudalib.cublas import Blas
        from numbapro import cuda  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_sparse(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")

    cusp = cusparse.Sparse()
    cubl = Blas()
    m, n = int_m.shape

    # Upload the value / row-pointer / column-index arrays of both matrices.
    int_m_nnz = int_m.nnz
    int_m_csrValA = cuda.to_device(int_m.data.astype(calc_precision))
    int_m_csrRowPtrA = cuda.to_device(int_m.indptr)
    int_m_csrColIndA = cuda.to_device(int_m.indices)

    dec_m_nnz = dec_m.nnz
    dec_m_csrValA = cuda.to_device(dec_m.data.astype(calc_precision))
    dec_m_csrRowPtrA = cuda.to_device(dec_m.indptr)
    dec_m_csrColIndA = cuda.to_device(dec_m.indices)

    cu_curr_phi = cuda.to_device(phi.astype(calc_precision))
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    descr = cusp.matdescr()
    descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO

    # Scalars are cast with the configured precision. Casting them to
    # float32 unconditionally would truncate dX and rho_inv to single
    # precision even when the arrays are float64.
    one = calc_precision(1.0)
    zero = calc_precision(0.0)

    for step in xrange(nsteps):
        # Only refresh the progress bar on every fifth step.
        if prog_bar and (step % 5 == 0):
            prog_bar.update(step)
        # delta_phi <- interaction term (beta = 0 overwrites the scratch)
        cusp.csrmv(trans='T', m=m, n=n, nnz=int_m_nnz,
                   descr=descr,
                   alpha=one,
                   csrVal=int_m_csrValA,
                   csrRowPtr=int_m_csrRowPtrA,
                   csrColInd=int_m_csrColIndA,
                   x=cu_curr_phi, beta=zero, y=cu_delta_phi)
        # delta_phi <- rho_inv[step] * decay term + delta_phi
        cusp.csrmv(trans='T', m=m, n=n, nnz=dec_m_nnz,
                   descr=descr,
                   alpha=calc_precision(rho_inv[step]),
                   csrVal=dec_m_csrValA,
                   csrRowPtr=dec_m_csrRowPtrA,
                   csrColInd=dec_m_csrColIndA,
                   x=cu_curr_phi, beta=one, y=cu_delta_phi)
        # phi <- phi + dX[step] * delta_phi
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi,
                  y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()