def _univariate_pdf_call(cu_func, data, packed_params, get):
    """Launch a univariate pdf CUDA kernel over a data x parameter grid.

    Parameters
    ----------
    cu_func : compiled CUDA kernel (see cufiles/univcaller.cu)
    data : ndarray or GPUArray
        Observations; host arrays are uploaded, GPU-resident arrays reused.
    packed_params : ndarray
        One row of packed parameters per density to evaluate.
    get : bool
        If True, copy the result back to the host.

    Returns
    -------
    ndarray (get=True) or GPUArray (get=False) holding the evaluated
    densities; with multiple parameter rows the host result is reshaped
    to (ndata, nparams).
    """
    n_data = len(data)
    n_params = len(packed_params)
    kernel_regs = cu_func.num_regs

    packed_params = util.prep_ndarray(packed_params)

    data_per, params_per = util.tune_blocksize(data, packed_params,
                                               kernel_regs)
    shared_mem = util.compute_shmem(data, packed_params,
                                    data_per, params_per)

    block_design = (data_per * params_per, 1, 1)
    grid_design = (util.get_boxes(n_data, data_per),
                   util.get_boxes(n_params, params_per))

    # Device buffers: destination, observations, packed parameters.
    gpu_dest = gpu_empty((n_data, n_params), dtype=np.float32)
    if isinstance(data, GPUArray):
        gpu_data = data
    else:
        gpu_data = to_gpu(data)
    gpu_packed_params = to_gpu(packed_params)

    # Integer "design" spec consumed by the kernel:
    # (data_per, params_per) + (ndata,) + packed_params.shape
    design = np.array((data_per, params_per)
                      + (len(data),)
                      + packed_params.shape,
                      dtype=np.int32)

    cu_func(gpu_dest, gpu_data, gpu_packed_params,
            design[0], design[1], design[2], design[3], design[4],
            block=block_design, grid=grid_design, shared=shared_mem)

    if not get:
        return gpu_dest

    result = gpu_dest.get()
    if n_params > 1:
        # Kernel writes param-major; present it as (ndata, nparams).
        result = result.reshape((n_params, n_data), order='C').T
    return result
def _multivariate_pdf_call(cu_func, data, packed_params, get, order,
                           datadim=None):
    """Launch a multivariate pdf CUDA kernel over a data x parameter grid.

    Parameters
    ----------
    cu_func : compiled CUDA kernel (see cufiles/mvcaller.cu)
    data : ndarray or GPUArray
        Observations; host arrays are padded and uploaded, GPU-resident
        arrays are used as-is (assumed already padded).
    packed_params : ndarray
        One row of packed parameters per density to evaluate.
    get : bool
        If True, copy the result back to the host.
    order : {'F', 'C'}
        Desired memory layout of the returned result.
    datadim : int, optional
        True (un-padded) column count for GPU-resident data; when None
        it is taken from the array's second axis.

    Returns
    -------
    ndarray (get=True) or GPUArray (get=False) of densities: shape
    (ndata,) for a single parameter row, else (ndata, nparams).
    """
    packed_params = util.prep_ndarray(packed_params)
    func_regs = cu_func.num_regs

    # Prep the data; skip padding when it is already on the GPU.
    if isinstance(data, GPUArray):
        padded_data = data
        # BUGFIX: compare to None with 'is', not '==' (equality can
        # misfire for array-like values and is non-idiomatic).
        if datadim is None:
            ndata, dim = data.shape
        else:
            ndata, dim = data.shape[0], datadim
    else:
        ndata, dim = data.shape
        padded_data = util.pad_data(data)

    nparams = len(packed_params)

    data_per, params_per = util.tune_blocksize(padded_data, packed_params,
                                               func_regs)
    shared_mem = util.compute_shmem(padded_data, packed_params,
                                    data_per, params_per)

    block_design = (data_per * params_per, 1, 1)
    grid_design = (util.get_boxes(ndata, data_per),
                   util.get_boxes(nparams, params_per))

    # Integer "design" spec consumed by the kernel.
    design = np.array(((data_per, params_per) +   # block design
                       padded_data.shape +        # data spec (padded)
                       (dim,) +                   # non-padded column count
                       packed_params.shape),      # params spec
                      dtype=np.int32)

    # Destination: flat vector for one parameter row, else a
    # Fortran-ordered (ndata, nparams) matrix.
    if nparams == 1:
        gpu_dest = gpu_empty(ndata, dtype=np.float32)
    else:
        gpu_dest = gpu_empty((ndata, nparams), dtype=np.float32, order='F')

    # Upload data if not already on the device.
    if isinstance(padded_data, GPUArray):
        gpu_padded_data = padded_data
    else:
        gpu_padded_data = to_gpu(padded_data)

    gpu_packed_params = to_gpu(packed_params)

    params = (gpu_dest, gpu_padded_data, gpu_packed_params) + tuple(design)
    kwds = dict(block=block_design, grid=grid_design, shared=shared_mem)
    cu_func(*params, **kwds)

    # Parameters are no longer needed on the device.
    gpu_packed_params.gpudata.free()

    if get:
        if order == 'F':
            return gpu_dest.get()
        else:
            return np.asarray(gpu_dest.get(), dtype=np.float32, order='C')
    else:
        if order == 'F' or nparams == 1:
            return gpu_dest
        else:
            # Device-side transpose to emulate C order; free the
            # original F-ordered buffer afterwards.
            res = gpu_transpose(util.GPUarray_reshape(gpu_dest,
                                                      (nparams, ndata),
                                                      "C"))
            gpu_dest.gpudata.free()
            return res