int j=0; // for (j=0; j<k; j++) // dat.A[idx(i,j,k)] = mt_rand(mtState, idx); j = 4; for (i=0; i<k; i++) dat.B[idx(i,j,n)] = mt_rand(mtState, idx); mmult('t','f',m, n, k, 1.0, dat.A, dat.B, 0.0, C); } """, include_dirs=[mt_rand.get_include_dir(), matrix.get_include_dir()]) # seed the random number generator mt_rand.seed(cuda,mod) cu_mmult = mod.get_function("cu_mat_test") cu_mmult2 = mod.get_function("cu_mat_test2") m = np.uint32(2) k = np.uint32(5) n = np.uint32(10) A = np.zeros((m,k),dtype=np.float32) B = np.zeros((k,n),dtype=np.float32) A[0,:] = np.random.rand(k)
def __init__(self, nitems=2, nbins=1, nsims=1000, data=None,
             log_shift=.05, log_max=100., mbins=2000, mod_name=1, nreps=1):
    """Compile the CUDA module and prepare the GPU-side simulation state.

    Parameters
    ----------
    nitems : int
        Number of accumulators.
    nbins : int
        Number of rows allocated for the ``bins`` array and number of
        entries allocated for ``bin_ind``.
    nsims : int
        Number of simulations.
    data, log_shift, log_max, mbins, nreps
        Stored on the instance unchanged; not otherwise used here.
    mod_name
        Stored as both ``model_id`` and ``model_name`` and substituted
        into the CUDA source template as ``model_id``.
    """
    self.data = data
    self.log_shift = log_shift
    self.log_max = log_max
    self.mbins = mbins
    self.nsims = nsims
    self.nitems = nitems
    self.nbins = nbins
    self.model_id = mod_name
    self.model_name = mod_name
    self.nreps = nreps

    # set default params
    self._dp = {
        'max_time': 2.0,
        'K': 0.1,
        'L': 0.5,
        'U': 0.0,
        'eta': 1.0,
        'thresh': 1.0,
        'dt': .01,
        'tau': .1,
        'truncate': True,
        'r': .1,
        'p': 1.0,
        'sd0': 2.0,
        'sd_min': .01,
        'alpha': 0.0,
    }

    # set the lengths (values substituted into the CUDA source template)
    lengths = {
        'nitems': nitems,
        'nsims': nsims,
        'nbins': nbins,
        'model_id': self.model_id,
        # NOTE(review): presumably a printf format string injected through
        # the template so its own '%' signs survive the substitution below
        # -- confirm against lca_mt_conf.cu.h.
        'hack': '"%d, x[%d]=%f, xout[%d]=%f, t=%d\\n"',
    }

    # read in the cuda code; the context manager closes the file handle
    # instead of leaving that to the garbage collector
    with open(os.path.join(modpath, 'lca_mt_conf.cu.h'), 'r') as src:
        code = src.read()
    self._mod = SourceModule(
        code % lengths,
        no_extern_c=True,
        options=['-ccbin', 'clang-3.8', '-std=c++11'],
        include_dirs=[modpath, mt_rand.get_include_dir()])

    # get the kernel functions to call
    self._setup_sim = self._mod.get_function('setup_sim')
    self._iaccumulate = self._mod.get_function("iaccumulate")

    # set up the i/o params (order and type matter!)
    dt_tau = self._dp['dt'] / self._dp['tau']
    self.io = GPUStruct([
        (np.float32, '*out_time', np.zeros((nsims), dtype=np.float32)),
        (np.float32, '*x_out', np.zeros((nsims, nitems), dtype=np.float32)),
        (np.float32, '*confidence', np.zeros((nsims), dtype=np.float32)),
        (np.int32, '*x_ind', np.zeros((nsims), dtype=np.int32)),
        (np.float32, '*x_init',
         np.ones((nitems), dtype=np.float32) *
         ((self._dp['thresh']) * (1 / 3.))),
        (np.float32, '*bins', np.zeros((nbins, 2), dtype=np.float32)),
        (np.int32, '*bin_ind', np.zeros((nbins), dtype=np.int32)),
        (np.float32, 'sd0', self._dp['sd0']),
        (np.float32, 'sd_min', self._dp['sd_min']),
        (np.float32, 'r', self._dp['r']),
        (np.float32, 'p', self._dp['p']),
        (np.int32, 'max_iter',
         np.round(self._dp['max_time'] / self._dp['dt'])),
        (np.float32, 'max_time', self._dp['max_time']),
        (np.float32, 'K', self._dp['K']),
        (np.float32, 'L', self._dp['L']),
        (np.float32, 'U', self._dp['U']),
        (np.float32, 'eta', self._dp['eta']),
        (np.float32, 'thresh', self._dp['thresh']),
        (np.float32, 'alpha', self._dp['alpha']),
        (np.float32, 'dt', self._dp['dt']),
        (np.float32, 'tau', self._dp['tau']),
        (np.float32, 'dt_tau', dt_tau),
        (np.float32, 'sqrt_dt_tau', np.sqrt(dt_tau)),
        (np.int32, 'truncate', self._dp['truncate']),
    ])

    # do full copy once
    self.io.copy_to_gpu()

    # set up the kernel grid parameters: enough blocks of `bsize` threads
    # to cover all `nsims`.  Integer ceiling division keeps the grid size
    # an int on Python 3, where `/` would yield a float that the old
    # "round up if short" check could never correct.
    bsize = 256
    gsize = (nsims + bsize - 1) // bsize
    self._gsize = gsize
    self._bsize = bsize

    # set up the functions
    self._setup_sim.prepare('')
    self._iaccumulate.prepare('P')

    # setup the simulations
    mt_rand.seed(cuda, self._mod)
    timer = self._setup_sim.prepared_timed_call((self._gsize, 1),
                                                (self._bsize, 1, 1))
    # Calling the returned timer waits for the setup kernel to finish
    # (PyCUDA prepared_timed_call semantics); the elapsed time itself is
    # not needed, so it is not kept.
    timer()
__global__ void cu_rand_test(float *x, int N) { unsigned int idx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; if (idx < N) { // initialize the MT MersenneTwisterState mtState; MersenneTwisterInitialise(mtState, idx); // x[idx] = mt_rand(mtState, idx); } } """, include_dirs=[mt_rand.get_include_dir()]) # seed the random number generator mt_rand.seed(cuda, mod) cu_rand = mod.get_function("cu_rand_test") asize = 10000 bsize = 16 a = np.zeros((asize, ), dtype=np.float32) ac = cuda.mem_alloc(a.nbytes) #cuda.memcpy_htod(ac,a) block = (bsize, 1, 1) cu_rand.set_block_shape(*block) cu_rand.param_set(ac, np.int32(asize)) gsize = (asize / bsize) + 1