# NOTE(review): `plan_probes` is defined a second time later in this file;
# that later definition shadows this one at import time. Consider removing
# one of the two.
def plan_probes(queue, periods, X, Y, tag=None):
    """Build a Plan that periodically copies each signal X[i] into buffer Y[i].

    Each execution of the returned plan runs one kernel step per probe ``n``:

    * if the probe's countdown is 0, ``X[n]`` is copied into slot
      ``bufpositions[n]`` of ``Y[n]``, the countdown is reset to
      ``periods[n] - 1`` and ``bufpositions[n]`` is incremented;
    * otherwise the countdown is decremented.

    All countdowns and buffer positions start at 0, so the first execution
    records a sample for every probe.

    Parameters
    ----------
    queue :
        OpenCL command queue. Its ``context`` and ``device`` attributes are
        used to build the program and to size the work-groups.
    periods : sequence of ints
        The period (in time-steps) of each probe; converted to int32.
    X : ragged array
        Signals to sample. Every ``X[i]`` must be a contiguous column vector
        (``shape1s[i] == 1``, ``stride0s[i] == 1``).
    Y : ragged array
        Output buffers, same element type as ``X``. Every ``Y[i]`` must have
        ``shape1s[i] == X.shape0s[i]`` and ``stride1s[i] == 1``.
    tag : optional
        Passed through to ``Plan`` unchanged.

    Returns
    -------
    Plan
        The plan, with three extra attributes: ``full_args`` (keeps the
        device arrays alive), ``cl_bufpositions`` (per-probe count of samples
        written so far) and ``Y``.

    Notes
    -----
    The kernel does not check ``bufpositions[n]`` against the length of
    ``Y[n]``. Presumably the caller drains ``Y`` and resets
    ``cl_bufpositions`` before a buffer fills -- TODO confirm against callers.
    """
    assert len(X) == len(Y), "X and Y must hold the same number of probes"
    assert len(X) == len(periods), "need exactly one period per probe"
    N = len(X)

    cl_countdowns = to_device(queue, np.zeros(N, dtype="int32"))
    cl_bufpositions = to_device(queue, np.zeros(N, dtype="int32"))
    cl_periods = to_device(queue, np.asarray(periods, dtype="int32"))

    assert X.cl_buf.ocldtype == Y.cl_buf.ocldtype, (
        "X and Y must share one element type")

    # N.B.  X[i].shape = (ndims[i], 1)
    #       Y[i].shape = (buf_len[i], ndims[i])
    # The kernel writes sample k of probe i at offset k * ndims[i], i.e. it
    # assumes the rows of Y[i] are packed back to back (not asserted here).
    # `range` rather than `xrange` so this runs on Python 2 and Python 3.
    for i in range(N):
        assert X.shape0s[i] == Y.shape1s[i], (
            "probe %d: X has %s dims but rows of Y hold %s"
            % (i, X.shape0s[i], Y.shape1s[i]))
        assert X.shape1s[i] == 1, "probe %d: X must be a column vector" % i
        assert X.stride0s[i] == 1, "probe %d: X must be contiguous" % i
        assert Y.stride1s[i] == 1, "probe %d: rows of Y must be contiguous" % i

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
            __global int *countdowns,
            __global int *bufpositions,
            __global const int *periods,
            __global const int *Xstarts,
            __global const int *Xshape0s,
            __global const ${Xtype} *Xdata,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata
        )
        {
            const int n = get_global_id(1);
            const int countdown = countdowns[n];

            if (countdown == 0) {
                const int n_dims = Xshape0s[n];
                __global const ${Xtype} *x = Xdata + Xstarts[n];
                const int bufpos = bufpositions[n];

                __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims;

                for (int ii = get_global_id(0);
                         ii < n_dims;
                         ii += get_global_size(0))
                {
                    y[ii] = x[ii];
                }
                // This should *not* cause deadlock because
                // all local threads guaranteed to be
                // in this branch together.
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = periods[n] - 1;
                    bufpositions[n] = bufpos + 1;
                }
            }
            else
            {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = countdown - 1;
                }
            }
        }
        """

    textconf = dict(N=N, Xtype=X.cl_buf.ocldtype, Ytype=Y.cl_buf.ocldtype)
    text = Template(text, output_encoding="ascii").render(**textconf)

    # Order must match the kernel's parameter list.
    full_args = (
        cl_countdowns,
        cl_bufpositions,
        cl_periods,
        X.cl_starts,
        X.cl_shape0s,
        X.cl_buf,
        Y.cl_starts,
        Y.cl_buf,
    )
    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    # One work-group per probe: the local size spans all of dimension 0 and
    # is 1 along dimension 1, so every work-item in a group sees the same `n`
    # and therefore takes the same branch around the kernel's barrier.
    # Clamp to at least 1 so that probes of zero-length signals do not
    # produce an (invalid) zero-sized work-group.
    max_len = max(1, min(queue.device.max_work_group_size, max(X.shape0s)))
    gsize = (max_len, N)
    lsize = (max_len, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    rval.cl_bufpositions = cl_bufpositions
    rval.Y = Y
    return rval
# NOTE(review): this re-defines `plan_probes`, which is already defined
# earlier in this file with an equivalent body. Being the later definition,
# this is the one that is bound at import time. Consider removing one.
def plan_probes(queue, periods, X, Y, tag=None):
    """Create the Plan that samples every probed signal into its buffer.

    On each execution the kernel handles probe ``n`` as follows: when
    ``countdowns[n]`` is 0 it copies ``X[n]`` into row ``bufpositions[n]`` of
    ``Y[n]``, sets ``countdowns[n] = periods[n] - 1`` and increments
    ``bufpositions[n]``; otherwise it only decrements ``countdowns[n]``.
    Countdowns and buffer positions are initialised to 0.

    Parameters
    ----------
    queue :
        OpenCL command queue; ``queue.context`` is used to build the program
        and ``queue.device.max_work_group_size`` to bound the work-group size.
    periods : sequence of ints
        The period (in time-steps) of each probe; stored as int32.
    X : ragged array
        Probed signals. Each ``X[i]`` must have ``shape1s[i] == 1`` and
        ``stride0s[i] == 1``.
    Y : ragged array
        Sample buffers with the same ``ocldtype`` as ``X``. Each ``Y[i]``
        must have ``shape1s[i] == X.shape0s[i]`` and ``stride1s[i] == 1``.
    tag : optional
        Forwarded to ``Plan``.

    Returns
    -------
    Plan
        Carries ``full_args`` (references that keep the device arrays
        alive), ``cl_bufpositions`` (number of samples written per probe)
        and ``Y`` as extra attributes.

    Notes
    -----
    ``bufpositions[n]`` is never compared with the capacity of ``Y[n]``
    inside the kernel. NOTE(review): it looks like callers are expected to
    read out ``Y`` and reset ``cl_bufpositions`` in time -- confirm.
    """
    n_probes = len(X)
    assert n_probes == len(Y), "X and Y must hold the same number of probes"
    assert n_probes == len(periods), "need exactly one period per probe"
    N = n_probes

    def zeroed_counters():
        # Fresh per-probe int32 counter array on the device.
        return to_device(queue, np.zeros(N, dtype='int32'))

    cl_countdowns = zeroed_counters()
    cl_bufpositions = zeroed_counters()
    cl_periods = to_device(queue, np.asarray(periods, dtype='int32'))

    ctype = X.cl_buf.ocldtype
    assert ctype == Y.cl_buf.ocldtype, "X and Y must share one element type"

    # Layout expected by the kernel:
    #   X[i].shape = (ndims[i], 1)
    #   Y[i].shape = (buf_len[i], ndims[i])
    # Sample k of probe i lands at offset k * ndims[i], so rows of Y[i] are
    # assumed to be stored back to back (this is not asserted).
    # `range` is used instead of the Python-2-only `xrange`.
    for i in range(N):
        assert X.shape0s[i] == Y.shape1s[i], (
            "probe %d: signal length %s != buffer row length %s"
            % (i, X.shape0s[i], Y.shape1s[i]))
        assert X.shape1s[i] == 1, "probe %d: X must have one column" % i
        assert X.stride0s[i] == 1, "probe %d: X must have unit stride" % i
        assert Y.stride1s[i] == 1, "probe %d: Y rows must have unit stride" % i

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
            __global int *countdowns,
            __global int *bufpositions,
            __global const int *periods,
            __global const int *Xstarts,
            __global const int *Xshape0s,
            __global const ${Xtype} *Xdata,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata
        )
        {
            const int n = get_global_id(1);
            const int countdown = countdowns[n];

            if (countdown == 0) {
                const int n_dims = Xshape0s[n];
                __global const ${Xtype} *x = Xdata + Xstarts[n];
                const int bufpos = bufpositions[n];

                __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims;

                for (int ii = get_global_id(0);
                         ii < n_dims;
                         ii += get_global_size(0))
                {
                    y[ii] = x[ii];
                }
                // This should *not* cause deadlock because
                // all local threads guaranteed to be
                // in this branch together.
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = periods[n] - 1;
                    bufpositions[n] = bufpos + 1;
                }
            }
            else
            {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = countdown - 1;
                }
            }
        }
        """

    textconf = dict(N=N, Xtype=ctype, Ytype=Y.cl_buf.ocldtype)
    text = Template(text, output_encoding='ascii').render(**textconf)

    # Same order as the kernel signature above.
    full_args = (
        cl_countdowns,
        cl_bufpositions,
        cl_periods,
        X.cl_starts,
        X.cl_shape0s,
        X.cl_buf,
        Y.cl_starts,
        Y.cl_buf,
    )
    program = cl.Program(queue.context, text).build()
    _fn = program.fn
    _fn.set_args(*[arr.data for arr in full_args])

    # Work-group geometry: dimension 0 strides over a signal's elements,
    # dimension 1 indexes the probe. With a local size of 1 along dimension 1
    # all work-items of a group share `n`, which is what makes the barrier
    # inside the kernel's if/else safe.
    widest = max(X.shape0s)
    max_len = min(queue.device.max_work_group_size, widest)
    if max_len < 1:
        # All probed signals are empty; a work size of 0 is not valid, and a
        # single work-item still performs the counter bookkeeping correctly.
        max_len = 1
    gsize = (max_len, N)
    lsize = (max_len, 1)

    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    rval.cl_bufpositions = cl_bufpositions
    rval.Y = Y
    return rval