예제 #1
0
  int j=0;
//  for (j=0; j<k; j++)
//    dat.A[idx(i,j,k)] = mt_rand(mtState, idx);

  j = 4;
  for (i=0; i<k; i++)
    dat.B[idx(i,j,n)] = mt_rand(mtState, idx);

  mmult('t','f',m, n, k, 
	1.0, dat.A, dat.B,
	0.0, C);
}


    """,
    include_dirs=[mt_rand.get_include_dir(),
                  matrix.get_include_dir()])

# Seed the random number generator used by the kernels in `mod`.
mt_rand.seed(cuda, mod)

# Look up the two matrix test kernels by name.
cu_mmult = mod.get_function("cu_mat_test")
cu_mmult2 = mod.get_function("cu_mat_test2")

# Problem dimensions, held as 32-bit unsigned integers.
m, k, n = np.uint32(2), np.uint32(5), np.uint32(10)

# Zero-filled float32 host operands: A is m-by-k, B is k-by-n.
A = np.zeros(shape=(m, k), dtype=np.float32)
B = np.zeros(shape=(k, n), dtype=np.float32)

# Overwrite the first row of A with k random samples.
A[0] = np.random.rand(k)
예제 #2
0
    def __init__(self,
                 nitems=2,
                 nbins=1,
                 nsims=1000,
                 data=None,
                 log_shift=.05,
                 log_max=100.,
                 mbins=2000,
                 mod_name=1,
                 nreps=1):
        """
        Compile the CUDA module, build the GPU I/O struct and set up the sims.

        Parameters
        ----------
        nitems : int
            Number of accumulators.
        nbins : int
            Number of rows of the ``bins`` array and entries of ``bin_ind``.
        nsims : int
            Number of simulations; sizes the per-simulation output arrays and
            determines the launch grid.
        data, log_shift, log_max, mbins, nreps
            Stored on the instance unchanged; not otherwise used here.
        mod_name
            Stored as both ``model_id`` and ``model_name`` and substituted
            into the CUDA source as ``model_id``.
        """
        self.data = data
        self.log_shift = log_shift
        self.log_max = log_max
        self.mbins = mbins
        self.nsims = nsims
        self.nitems = nitems
        self.nbins = nbins
        self.model_id = mod_name
        self.model_name = mod_name
        self.nreps = nreps

        # set default params
        self._dp = {
            'max_time': 2.0,
            'K': 0.1,
            'L': 0.5,
            'U': 0.0,
            'eta': 1.0,
            'thresh': 1.0,
            'dt': .01,
            'tau': .1,
            'truncate': True,
            'r': .1,
            'p': 1.0,
            'sd0': 2.0,
            'sd_min': .01,
            'alpha': 0.0
        }

        # values substituted into the CUDA source template via %-formatting
        lengths = {
            'nitems': nitems,
            'nsims': nsims,
            'nbins': nbins,
            'model_id': self.model_id,
            # a printf-style format string injected by substitution --
            # presumably so its own percent signs survive the Python
            # %-formatting of the template (TODO confirm against the .cu.h)
            'hack': '"%d, x[%d]=%f, xout[%d]=%f, t=%d\\n"'
        }

        # read in the cuda code; the context manager closes the file handle
        # instead of leaving that to the garbage collector
        with open(os.path.join(modpath, 'lca_mt_conf.cu.h'), 'r') as src:
            code = src.read()
        self._mod = SourceModule(
            code % lengths,
            no_extern_c=True,
            options=['-ccbin', 'clang-3.8', '-std=c++11'],
            include_dirs=[modpath, mt_rand.get_include_dir()])

        # get the kernel functions to call
        self._setup_sim = self._mod.get_function('setup_sim')
        self._iaccumulate = self._mod.get_function("iaccumulate")

        # set up the i/o params (order and type matter!)
        self.io = GPUStruct([
            (np.float32, '*out_time', np.zeros((nsims), dtype=np.float32)),
            (np.float32, '*x_out', np.zeros((nsims, nitems),
                                            dtype=np.float32)),
            (np.float32, '*confidence', np.zeros((nsims), dtype=np.float32)),
            (np.int32, '*x_ind', np.zeros((nsims), dtype=np.int32)),
            # every accumulator starts at one third of the threshold
            (np.float32, '*x_init', np.ones((nitems), dtype=np.float32) *
             ((self._dp['thresh']) * (1 / 3.))),
            (np.float32, '*bins', np.zeros((nbins, 2), dtype=np.float32)),
            (np.int32, '*bin_ind', np.zeros((nbins), dtype=np.int32)),
            (np.float32, 'sd0', self._dp['sd0']),
            (np.float32, 'sd_min', self._dp['sd_min']),
            (np.float32, 'r', self._dp['r']),
            (np.float32, 'p', self._dp['p']),
            # iteration cap derived from the time limit and the step size
            (np.int32, 'max_iter',
             np.round(self._dp['max_time'] / self._dp['dt'])),
            (np.float32, 'max_time', self._dp['max_time']),
            (np.float32, 'K', self._dp['K']),
            (np.float32, 'L', self._dp['L']),
            (np.float32, 'U', self._dp['U']),
            (np.float32, 'eta', self._dp['eta']),
            (np.float32, 'thresh', self._dp['thresh']),
            (np.float32, 'alpha', self._dp['alpha']),
            (np.float32, 'dt', self._dp['dt']),
            (np.float32, 'tau', self._dp['tau']),
            # precomputed dt/tau and its square root
            (np.float32, 'dt_tau', self._dp['dt'] / self._dp['tau']),
            (np.float32, 'sqrt_dt_tau',
             np.sqrt(self._dp['dt'] / self._dp['tau'])),
            (np.int32, 'truncate', self._dp['truncate'])
        ])

        # do full copy once
        self.io.copy_to_gpu()

        # set up the kernel grid parameters: enough blocks of `bsize` threads
        # to cover all simulations.  Integer ceiling division keeps the block
        # count an int under Python 3 too (`/` would give a float there).
        bsize = 256
        gsize = int((nsims + bsize - 1) // bsize)
        self._gsize = gsize
        self._bsize = bsize

        # set up the functions
        self._setup_sim.prepare('')
        self._iaccumulate.prepare('P')

        # setup the simulations
        mt_rand.seed(cuda, self._mod)
        timer = self._setup_sim.prepared_timed_call((self._gsize, 1),
                                                    (self._bsize, 1, 1))
        # per the PyCUDA docs, calling the returned object blocks until the
        # launch has completed and reports its GPU time
        runtime = timer()
예제 #3
0
// Fill x[0..N) with one value from mt_rand per element.
//
// Launch layout: 1-D grid of 1-D blocks; thread idx handles element idx.
// Threads whose index is at or beyond N do nothing, so the grid may
// overshoot N. A non-positive N makes the kernel a no-op.
__global__ void cu_rand_test(float *x, int N)
{
  // Plain 32-bit multiply: __mul24 only uses the low 24 bits of each operand
  // and would silently wrap the index on very large grids.
  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // Reject non-positive N before the unsigned comparison; a negative N would
  // otherwise convert to a huge unsigned bound and let every thread write
  // out of range.
  if (N > 0 && idx < (unsigned int)N)
  {
    // per-thread generator state, initialised with this thread's index
    MersenneTwisterState mtState;
    MersenneTwisterInitialise(mtState, idx);

    // store a single draw for this element
    x[idx] = mt_rand(mtState, idx);
  }
}

    """,
                   include_dirs=[mt_rand.get_include_dir()])

# seed the random number generator
mt_rand.seed(cuda, mod)

# look up the kernel by name
cu_rand = mod.get_function("cu_rand_test")

# problem size and launch configuration
asize = 10000  # number of output elements
bsize = 16     # threads per block

# zero-filled host buffer and a device allocation of the same byte size
a = np.zeros((asize, ), dtype=np.float32)
ac = cuda.mem_alloc(a.nbytes)
# host-to-device upload left disabled; presumably unnecessary because the
# kernel overwrites every element (TODO confirm)
#cuda.memcpy_htod(ac,a)
block = (bsize, 1, 1)
cu_rand.set_block_shape(*block)
cu_rand.param_set(ac, np.int32(asize))

# Number of blocks needed to cover asize elements.  Integer ceiling division
# keeps this an int under Python 3 (`/` would produce a float there) and
# avoids launching a spare block when asize is a multiple of bsize.
gsize = (asize + bsize - 1) // bsize