def cl_geometry_and_textconf(self, items, padding=4): p = self max_n_dots = max(len(p.geometry[ii]['dots']) for ii in items) n_structure_vars = 4 * max_n_dots + 5 structure_vars_stride = int( padding * math.ceil(float(n_structure_vars) / padding)) gstructure = np.zeros((len(items), structure_vars_stride), dtype='int32') A_starts = p.A.starts X_starts = p.X.starts Y_starts = p.Y.starts Y_in_starts = p.Y_in.starts A_stride0s = p.A.stride0s A_shape1s = p.A.shape1s Y_shape0s = p.Y.shape0s for bbi, bb in enumerate(items): x_js_i = p.X_js[bb] A_js_i = p.A_js[bb] assert len(x_js_i) == len(A_js_i) for ii, (xi, ai) in enumerate(zip(x_js_i, A_js_i)): assert xi.size == 1 and ai.size == 1 xi, ai = xi[0], ai[0] # to ignore numpy DeprecationWarning gstructure[bbi, 0 * max_n_dots + ii] = X_starts[xi] gstructure[bbi, 1 * max_n_dots + ii] = A_starts[ai] gstructure[bbi, 2 * max_n_dots + ii] = A_stride0s[ai] gstructure[bbi, 3 * max_n_dots + ii] = A_shape1s[ai] # -- offset of output and input buffers gstructure[bbi, 4 * max_n_dots + 0] = Y_in_starts[bb] gstructure[bbi, 4 * max_n_dots + 1] = Y_starts[bb] # -- number of dots for bb gstructure[bbi, 4 * max_n_dots + 2] = len(A_js_i) # -- length of Y[bb] gstructure[bbi, 4 * max_n_dots + 3] = Y_shape0s[bb] gstructure[bbi, 4 * max_n_dots + 4] = bb cl_gstructure = to_device(p.queue, gstructure) textconf = { 'n_structure_vars': n_structure_vars, 'structure_vars_stride': structure_vars_stride, 'x_starts': 'lstructure[0 * %s + ii]' % max_n_dots, 'a_starts': 'lstructure[1 * %s + ii]' % max_n_dots, 'a_s0' : 'lstructure[2 * %s + ii]' % max_n_dots, 'N_i' : 'lstructure[3 * %s + ii]' % max_n_dots, 'y_in_starts': 'lstructure[4 * %s + 0]' % max_n_dots, 'y_offset': 'lstructure[4 * %s + 1]' % max_n_dots, 'n_dot_products': 'lstructure[4 * %s + 2]' % max_n_dots, 'y_len' : 'lstructure[4 * %s + 3]' % max_n_dots, 'bb' : 'lstructure[4 * %s + 4]' % max_n_dots, } return cl_gstructure, textconf
def cl_geometry_and_textconf(self, items, padding=4): p = self max_n_dots = max(len(p.geometry[ii]['dots']) for ii in items) n_structure_vars = 4 * max_n_dots + 5 structure_vars_stride = int( padding * math.ceil(float(n_structure_vars) / padding)) gstructure = np.zeros((len(items), structure_vars_stride), dtype='int32') A_starts = p.A.starts X_starts = p.X.starts Y_starts = p.Y.starts Y_in_starts = p.Y_in.starts A_stride0s = p.A.stride0s A_shape1s = p.A.shape1s Y_shape0s = p.Y.shape0s for bbi, bb in enumerate(items): x_js_i = p.X_js[bb] A_js_i = p.A_js[bb] assert len(x_js_i) == len(A_js_i) for ii, (xi, ai) in enumerate(zip(x_js_i, A_js_i)): gstructure[bbi, 0 * max_n_dots + ii] = X_starts[xi] gstructure[bbi, 1 * max_n_dots + ii] = A_starts[ai] gstructure[bbi, 2 * max_n_dots + ii] = A_stride0s[ai] gstructure[bbi, 3 * max_n_dots + ii] = A_shape1s[ai] # -- offset of output and input buffers gstructure[bbi, 4 * max_n_dots + 0] = Y_in_starts[bb] gstructure[bbi, 4 * max_n_dots + 1] = Y_starts[bb] # -- number of dots for bb gstructure[bbi, 4 * max_n_dots + 2] = len(A_js_i) # -- length of Y[bb] gstructure[bbi, 4 * max_n_dots + 3] = Y_shape0s[bb] gstructure[bbi, 4 * max_n_dots + 4] = bb cl_gstructure = to_device(p.queue, gstructure) textconf = { 'n_structure_vars': n_structure_vars, 'structure_vars_stride': structure_vars_stride, 'x_starts': 'lstructure[0 * %s + ii]' % max_n_dots, 'a_starts': 'lstructure[1 * %s + ii]' % max_n_dots, 'a_s0' : 'lstructure[2 * %s + ii]' % max_n_dots, 'N_i' : 'lstructure[3 * %s + ii]' % max_n_dots, 'y_in_starts': 'lstructure[4 * %s + 0]' % max_n_dots, 'y_offset': 'lstructure[4 * %s + 1]' % max_n_dots, 'n_dot_products': 'lstructure[4 * %s + 2]' % max_n_dots, 'y_len' : 'lstructure[4 * %s + 3]' % max_n_dots, 'bb' : 'lstructure[4 * %s + 4]' % max_n_dots, } return cl_gstructure, textconf
def float_cl_clra(queue, arg, cl_dtype, N): float_arg = None cl_arg = None clra_arg = None if isinstance(arg, CLRaggedArray): clra_arg = arg assert arg.dtype == cl_dtype elif isinstance(arg, float): float_arg = arg elif len(set(arg)) == 1: float_arg = arg[0] else: host_arg = np.asarray(arg, cl_dtype) assert host_arg.shape == (N,) cl_arg = to_device(queue, host_arg) return float_arg, cl_arg, clra_arg
def plan_probes(queue, periods, X, Y, tag=None): """ Parameters ---------- P : raggedarray of ints The period (in time-steps) of each probe """ assert len(X) == len(Y) assert len(X) == len(periods) N = len(X) cl_countdowns = to_device(queue, np.zeros(N, dtype="int32")) cl_bufpositions = to_device(queue, np.zeros(N, dtype="int32")) cl_periods = to_device(queue, np.asarray(periods, dtype="int32")) assert X.cl_buf.ocldtype == Y.cl_buf.ocldtype ### N.B. X[i].shape = (ndims[i], ) ### Y[i].shape = (buf_ndims[i], buf_len) for i in xrange(N): assert X.shape0s[i] == Y.shape1s[i] assert X.shape1s[i] == 1 assert X.stride0s[i] == 1 assert Y.stride1s[i] == 1 text = """ ////////// MAIN FUNCTION ////////// __kernel void fn( __global int *countdowns, __global int *bufpositions, __global const int *periods, __global const int *Xstarts, __global const int *Xshape0s, __global const ${Xtype} *Xdata, __global const int *Ystarts, __global ${Ytype} *Ydata ) { const int n = get_global_id(1); const int countdown = countdowns[n]; if (countdown == 0) { const int n_dims = Xshape0s[n]; __global const ${Xtype} *x = Xdata + Xstarts[n]; const int bufpos = bufpositions[n]; __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims; for (int ii = get_global_id(0); ii < n_dims; ii += get_global_size(0)) { y[ii] = x[ii]; } // This should *not* cause deadlock because // all local threads guaranteed to be // in this branch together. barrier(CLK_LOCAL_MEM_FENCE); if (get_global_id(0) == 0) { countdowns[n] = periods[n] - 1; bufpositions[n] = bufpos + 1; } } else { barrier(CLK_LOCAL_MEM_FENCE); if (get_global_id(0) == 0) { countdowns[n] = countdown - 1; } } } """ textconf = dict(N=N, Xtype=X.cl_buf.ocldtype, Ytype=Y.cl_buf.ocldtype) text = Template(text, output_encoding="ascii").render(**textconf) full_args = (cl_countdowns, cl_bufpositions, cl_periods, X.cl_starts, X.cl_shape0s, X.cl_buf, Y.cl_starts, Y.cl_buf) _fn = cl.Program(queue.context, text).build().fn _fn.set_args(*[arr.data for arr in full_args]) max_len = min(queue.device.max_work_group_size, max(X.shape0s)) gsize = (max_len, N) lsize = (max_len, 1) rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag) rval.full_args = full_args # prevent garbage-collection rval.cl_bufpositions = cl_bufpositions rval.Y = Y return rval
def plan_probes(queue, periods, X, Y, tag=None): """ Parameters ---------- P : raggedarray of ints The period (in time-steps) of each probe """ assert len(X) == len(Y) assert len(X) == len(periods) N = len(X) cl_countdowns = to_device(queue, np.zeros(N, dtype='int32')) cl_bufpositions = to_device(queue, np.zeros(N, dtype='int32')) cl_periods = to_device(queue, np.asarray(periods, dtype='int32')) assert X.cl_buf.ocldtype == Y.cl_buf.ocldtype ### N.B. X[i].shape = (ndims[i], ) ### Y[i].shape = (buf_ndims[i], buf_len) for i in xrange(N): assert X.shape0s[i] == Y.shape1s[i] assert X.shape1s[i] == 1 assert X.stride0s[i] == 1 assert Y.stride1s[i] == 1 text = """ ////////// MAIN FUNCTION ////////// __kernel void fn( __global int *countdowns, __global int *bufpositions, __global const int *periods, __global const int *Xstarts, __global const int *Xshape0s, __global const ${Xtype} *Xdata, __global const int *Ystarts, __global ${Ytype} *Ydata ) { const int n = get_global_id(1); const int countdown = countdowns[n]; if (countdown == 0) { const int n_dims = Xshape0s[n]; __global const ${Xtype} *x = Xdata + Xstarts[n]; const int bufpos = bufpositions[n]; __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims; for (int ii = get_global_id(0); ii < n_dims; ii += get_global_size(0)) { y[ii] = x[ii]; } // This should *not* cause deadlock because // all local threads guaranteed to be // in this branch together. barrier(CLK_LOCAL_MEM_FENCE); if (get_global_id(0) == 0) { countdowns[n] = periods[n] - 1; bufpositions[n] = bufpos + 1; } } else { barrier(CLK_LOCAL_MEM_FENCE); if (get_global_id(0) == 0) { countdowns[n] = countdown - 1; } } } """ textconf = dict(N=N, Xtype=X.cl_buf.ocldtype, Ytype=Y.cl_buf.ocldtype) text = Template(text, output_encoding='ascii').render(**textconf) full_args = ( cl_countdowns, cl_bufpositions, cl_periods, X.cl_starts, X.cl_shape0s, X.cl_buf, Y.cl_starts, Y.cl_buf, ) _fn = cl.Program(queue.context, text).build().fn _fn.set_args(*[arr.data for arr in full_args]) max_len = min(queue.device.max_work_group_size, max(X.shape0s)) gsize = ( max_len, N, ) lsize = (max_len, 1) rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag) rval.full_args = full_args # prevent garbage-collection rval.cl_bufpositions = cl_bufpositions rval.Y = Y return rval
def ref_impl(p, items): """ Return an OpenCL function to calculate elements `items` of gemv operation `p`. In this reference implementation, we create a work item per output number, or more specifically, a work grid of (max_y_len, len(items)). Each work item loops over the dot products and the elements within each dot product to compute the output value Y[global_id(1)][global_id(0)]. """ if p.clra_alpha is not None: raise NotImplementedError() if p.clra_gamma is not None: raise NotImplementedError() cl_items = to_device(p.queue, np.asarray(items, dtype='int32')) if 0: if len(items) < 10: print 'Falling back on reference implementation' p.print_geometry_summary(items, full=True) else: print 'Falling back on reference implementation' p.print_geometry_summary(items) assert all(s == 1 for s in p.A.stride1s) assert all(s == 1 for s in p.X.stride1s) assert all(s == 1 for s in p.Y.stride0s) assert all(s == 1 for s in p.Y.stride1s) assert all(s == 1 for s in p.Y_in.stride0s) assert all(s == 1 for s in p.Y_in.stride1s) text = """ __kernel void fn( __global int *items, % if cl_alpha is not None: __global ${cl_alpha.ocldtype} * alphas, % endif % if (A_js is not None): __global int *A_starts, __global int *A_shape1s, __global int *A_stride0s, __global ${A.cl_buf.ocldtype} *A_data, __global int *A_js_starts, __global int *A_js_shape0s, __global int *A_js_data, __global int *X_starts, __global int *X_stride0s, __global ${X.cl_buf.ocldtype} *X_data, __global int *X_js_starts, __global int *X_js_data, % endif % if cl_beta is not None: __global ${cl_beta.ocldtype} * betas, % endif % if clra_beta is not None: __global int *beta_starts, __global int *beta_data, % endif % if cl_gamma is not None: __global ${cl_gamma.ocldtype} * gammas, % endif __global int *Y_in_starts, __global ${Y_in.cl_buf.ocldtype} *Y_in_data, __global int *Y_starts, __global int *Y_shape0s, __global ${Y.cl_buf.ocldtype} *Y_data) { const int mm = get_global_id(0); const int bb = items[get_global_id(1)]; const int M = Y_shape0s[bb]; if (mm < M) { const int y_offset = Y_starts[bb]; const int y_in_offset = Y_in_starts[bb]; % if float_beta is not None: const ${Y.cl_buf.ocldtype} beta = ${float_beta}; % elif cl_beta is not None: const ${cl_beta.ocldtype} beta = betas[bb]; % elif clra_beta is not None: const int beta_offset = beta_starts[bb]; const ${clra_beta.cl_buf.ocldtype} beta = beta_data[beta_offset + mm]; % endif % if float_gamma is not None: const ${Y.cl_buf.ocldtype} gamma = ${float_gamma}; % elif cl_gamma is not None: const ${cl_gamma.ocldtype} gamma = gammas[bb]; % endif Y_data[y_offset + mm] = gamma + beta * Y_in_data[y_in_offset + mm]; % if (A_js is not None) : const int n_dot_products = A_js_shape0s[bb]; X_js_data += X_js_starts[bb]; A_js_data += A_js_starts[bb]; ${Y.cl_buf.ocldtype} y_sum = 0; for (int ii = 0; ii < n_dot_products; ++ii) { const int x_ji = X_js_data[ii]; const int a_ji = A_js_data[ii]; const int N_i = A_shape1s[a_ji]; const int x_offset = X_starts[x_ji]; const int a_offset = A_starts[a_ji]; const int AsM = A_stride0s[a_ji]; const int XsM = X_stride0s[x_ji]; for (int nn = 0; nn < N_i; ++nn) { y_sum += X_data[x_offset + nn * XsM] * A_data[a_offset + mm * AsM + nn]; } } % if float_alpha is not None: Y_data[y_offset + mm] += ${float_alpha} * y_sum; % elif cl_alpha is not None: Y_data[y_offset + mm] += alphas[bb] * y_sum; % endif % endif } } """ text = Template(text, output_encoding='ascii').render(**p.__dict__) #print text gsize = ( max(p.geometry[ii]['y_len'] for ii in items), len(items)) lsize = None fn = cl.Program(p.queue.context, text).build().fn full_args = [cl_items] if p.cl_alpha is not None: full_args += [p.cl_alpha] if p.A_js is not None: full_args += [ p.A.cl_starts, p.A.cl_shape1s, p.A.cl_stride0s, p.A.cl_buf, p.A_js.cl_starts, p.A_js.cl_shape0s, p.A_js.cl_buf, p.X.cl_starts, p.X.cl_stride0s, p.X.cl_buf, p.X_js.cl_starts, p.X_js.cl_buf, ] if p.cl_beta is not None: full_args += [p.cl_beta] elif p.clra_beta is not None: full_args += [p.clra_beta.cl_starts, p.clra_beta.cl_buf] if p.cl_gamma is not None: full_args += [p.cl_gamma] elif p.clra_gamma is not None: full_args += [p.clra_gamma.cl_starts, p.clra_gamma.cl_buf] full_args += [ p.Y_in.cl_starts, p.Y_in.cl_buf, p.Y.cl_starts, p.Y.cl_shape0s, p.Y.cl_buf] #print [str(arr.dtype)[0] for arr in full_args] fn.set_args(*[arr.data for arr in full_args]) rval = Plan(p.queue, fn, gsize, lsize, name="clra_gemv.ref_impl", tag=p.tag, bw_per_call=bw_from_geometry(p.geometry, items), flops_per_call=flops_from_geometry(p.geometry, items)) rval.full_args = full_args # prevent GC the args return rval