def plan_timeupdate(queue, step, time, dt): assert len(step) == len(time) == 1 assert step.ctype == time.ctype == 'float' assert step.shape0s[0] == step.shape1s[0] == 1 assert time.shape0s[0] == time.shape1s[0] == 1 text = """ ////////// MAIN FUNCTION ////////// __kernel void timeupdate( __global const int *step_starts, __global float *step_data, __global const int *time_starts, __global float *time_data ) { __global float *step = step_data + step_starts[0]; __global float *time = time_data + time_starts[0]; step[0] += 1; time[0] = ${dt} * step[0]; } """ text = as_ascii(Template(text, output_encoding='ascii').render(dt=dt)) full_args = (step.cl_starts, step.cl_buf, time.cl_starts, time.cl_buf) _fn = cl.Program(queue.context, text).build().timeupdate _fn.set_args(*[arr.data for arr in full_args]) gsize = (1,) lsize = None rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_timeupdate") rval.full_args = full_args # prevent garbage-collection return rval
def plan_bcm2_threshold_diagonal1(queue, delta, weights, max_weight, tag=None): N = len(delta) for arr in (delta,): # matrices assert (arr.stride1s == 1).all() text = """ __kernel void bcm2_threshold_diagonal1( __global const int *shape0s, __global const int *shape1s, __global const int *delta_stride0s, __global const int *delta_starts, __global ${type} *delta_data, __global const int *weights_stride0s, __global const int *weights_starts, __global const ${type} *weights_data, __global const ${type} *max_weights ) { const int ij = get_global_id(0); const int k = get_global_id(1); const int shape0 = shape0s[k]; const int shape1 = shape1s[k]; const int i = ij / shape1; const int j = ij % shape1; __global ${type} *delta = delta_data + delta_starts[k]; __global const ${type} *weights = weights_data + weights_starts[k]; const ${type} max_weight = max_weights[k]; if (i < shape0) { if (fabs(weights[i*weights_stride0s[k] + j] + delta[i*delta_stride0s[k] + j]) > max_weight) { delta[i*delta_stride0s[k] + j] = 0; } } } """ textconf = dict(type=delta.ctype) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = ( delta.cl_shape0s, delta.cl_shape1s, delta.cl_stride0s, delta.cl_starts, delta.cl_buf, weights.cl_stride0s, weights.cl_starts, weights.cl_buf, max_weight, ) _fn = cl.Program(queue.context, text).build().bcm2_threshold_diagonal1 _fn.set_args(*[arr.data for arr in full_args]) lsize = None gsize = (delta.sizes.max(), N) plan = Plan(queue, _fn, gsize, lsize=lsize, name="cl_bcm2_threshold_diagonal1", tag=tag) plan.full_args = full_args # prevent garbage-collection plan.flops_per_call = 4 * delta.sizes.sum() plan.bw_per_call = (delta.nbytes + weights.nbytes + max_weight.nbytes) return plan
def plan_reset(queue, Y, values, tag=None): N = len(Y) assert len(Y) == len(values) assert np.all(Y.stride0s == Y.shape1s) assert np.all(Y.stride1s == 1) assert Y.ctype == values.ctype text = """ ////////// MAIN FUNCTION ////////// __kernel void reset( __global const int *Yshape0s, __global const int *Yshape1s, __global const int *Ystarts, __global ${Ytype} *Ydata, __global const ${Ytype} *values ) { const int n = get_global_id(1); int i = get_global_id(0); const ${Ytype} value = values[n]; const int size = Yshape0s[n] * Yshape1s[n]; __global ${Ytype} *y = Ydata + Ystarts[n]; for (; i < size; i += get_global_size(0)) y[i] = value; } """ textconf = dict(Ytype=Y.ctype) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = ( Y.cl_shape0s, Y.cl_shape1s, Y.cl_starts, Y.cl_buf, values, ) _fn = cl.Program(queue.context, text).build().reset _fn.set_args(*[arr.data for arr in full_args]) max_group = queue.device.max_work_group_size sizes = Y.shape0s * Y.shape1s n = min(sizes.max(), max_group) gsize = (n, N) lsize = (n, 1) rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_reset", tag=tag) rval.full_args = full_args # prevent garbage-collection rval.bw_per_call = Y.nbytes + values.nbytes rval.description = ( "groups: %d; items: %d; items/group: %0.1f [%d, %d]" % (len(Y), Y.sizes.sum(), Y.sizes.mean(), Y.sizes.min(), Y.sizes.max())) return rval
def plan_softmax(queue, X, Y): from mako.template import Template from nengo_ocl.utils import as_ascii from nengo_ocl.plan import Plan m, n = X.shape assert n <= 32 assert Y.shape == X.shape assert X.elemstrides[1] == 1 assert Y.elemstrides[1] == 1 text = """ __kernel void fn( __global const ${Xtype} *X, __global ${Ytype} *Y ) { const int i = get_global_id(0); ${Xtype} ex[${n}]; __global const ${Xtype} *x = X + i*${Xstride0}; __global ${Ytype} *y = Y + i*${Ystride0}; ${Xtype} maxx = -INFINITY; for (int j = 0; j < ${n}; j++) if (x[j] > maxx) maxx = x[j]; ${Xtype} sumex = 0; for (int j = 0; j < ${n}; j++) { ex[j] = exp(x[j] - maxx); sumex += ex[j]; } for (int j = 0; j < ${n}; j++) y[j] = ex[j] / sumex; } """ textconf = dict(Xtype=X.ctype, Ytype=Y.ctype, m=m, n=n, Xstride0=X.elemstrides[0], Ystride0=Y.elemstrides[0]) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) fn = cl.Program(queue.context, text).build().fn fn.set_args(*[arr.data for arr in (X, Y)]) plan = Plan(queue, fn, gsize=(m, )) return plan
def block_impl(p, items): if p.clra_alpha is not None: raise NotImplementedError() if p.clra_gamma is not None: raise NotImplementedError() if p.clra_beta is not None: raise NotImplementedError() if p.cl_alpha is not None: raise NotImplementedError() if p.cl_beta is not None: raise NotImplementedError() if p.cl_gamma is not None: raise NotImplementedError() if not all(s == 1 for s in p.A.stride1s): raise NotImplementedError() if p.A_js is None: # -- easy probably, but not done raise NotImplementedError() # --- blocking # We want to group the dot products into blocks, so that each workgroup # is computing a (block_y, block_x) region of a dot product. To do this, # we create a temporary output buffer, compute each block to a separate # region of this buffer, then reduce across the buffer in a separate kernel # block_y = 8 block_y = 32 # block_x = 32 block_x = 128 shape0s = [] shape1s = [] Astride0s = [] Astride1s = [] Astarts = [] Xstride0s = [] Xstarts = [] Ybufstarts = [] Ybufstart = 0 Yshape0s_reduce = [] Yinstride0s_reduce = [] Yinstarts_reduce = [] Ystride0s_reduce = [] Ystarts_reduce = [] Ybufinds_reduce = [] bw_reduce = 0 for n in items: assert p.Y_in.shape0s[n] == p.Y.shape0s[n] shape0n = p.Y.shape0s[n] for i in range(0, shape0n, block_y): shape0i = min(shape0n - i, block_y) Ybufind_reduce = [] # loop over dot products outputting to same Y assert len(p.A_js[n]) == len(p.X_js[n]) for aj, xj in zip(p.A_js[n], p.X_js[n]): assert aj.size == 1 and xj.size == 1 aj, xj = aj[0], xj[0] # to ignore numpy DeprecationWarning assert p.A.shape0s[aj] == shape0n assert p.A.shape1s[aj] == p.X.shape0s[xj] assert p.X.shape1s[xj] == 1 shape1n = p.A.shape1s[aj] for j in range(0, shape1n, block_x): shape0s.append(shape0i) shape1s.append(min(shape1n - j, block_x)) Astride0s.append(p.A.stride0s[aj]) Astride1s.append(p.A.stride1s[aj]) Astarts.append(p.A.starts[aj] + i*p.A.stride0s[aj] + j*p.A.stride1s[aj]) Xstride0s.append(p.X.stride0s[xj]) Xstarts.append(p.X.starts[xj] + j*p.X.stride0s[xj]) Ybufstarts.append(Ybufstart) Ybufind_reduce.append(Ybufstart) # Ybufstart += shape0s[-1] Ybufstart += block_y # keep good offset # --- Y-blocking for reduce Yshape0s_reduce.append(shape0i) Yinstride0s_reduce.append(p.Y_in.stride0s[n]) Yinstarts_reduce.append(p.Y_in.starts[n] + i*p.Y_in.stride0s[n]) Ystride0s_reduce.append(p.Y.stride0s[n]) Ystarts_reduce.append(p.Y.starts[n] + i*p.Y.stride0s[n]) Ybufinds_reduce.append(Ybufind_reduce) bw_reduce += shape0i*(len(Ybufind_reduce) + 1) * p.Y.dtype.itemsize # --- create structure gstructure = np.column_stack([shape0s, shape1s, Astride0s, Astride1s, Astarts, Xstride0s, Xstarts, Ybufstarts]) cl_gstructure = to_device(p.queue, gstructure.astype(np.int32)) # --- create Y buffer clYbuf = to_device(p.queue, np.zeros(Ybufstart, dtype=p.Y.dtype)) lsize0 = 4 # lsize0 = 8 lsize0_log2 = int(np.log2(lsize0)) assert 2**lsize0_log2 == lsize0 lsize = (lsize0, block_y, 1) gsize = (lsize[0], lsize[1], gstructure.shape[0]) assert np.prod(lsize) >= block_x textconf = dict( A=p.A, X=p.X, Ybuf=clYbuf, n_structure_vars=gstructure.shape[1], shape0='lstructure[0]', shape1='lstructure[1]', Astride0='lstructure[2]', Astride1='lstructure[3]', Astart='lstructure[4]', Xstride0='lstructure[5]', Xstart='lstructure[6]', Ybufstart='lstructure[7]', block_y=block_y, block_x=block_x, lsize0=lsize0, lsize0_log2=lsize0_log2, float_alpha=p.float_alpha, ) full_args = ( cl_gstructure, p.A.cl_buf, p.X.cl_buf, clYbuf, ) text = """ __kernel void fn( __global const int *gstructure, __global const ${A.ctype} *Adata, __global const ${X.ctype} *Xdata, __global ${Ybuf.ctype} *Ybufdata ) { const int j = get_global_id(0); const int i = get_global_id(1); const int n = get_global_id(2); // load structure __local int lstructure[${n_structure_vars}]; const int local_idx = get_local_id(0) + get_local_id(1)*get_local_size(0); if (local_idx < ${n_structure_vars}) lstructure[local_idx] = gstructure[ n * ${n_structure_vars} + local_idx]; barrier(CLK_LOCAL_MEM_FENCE); __global const ${X.ctype} *x = Xdata + ${Xstart}; __global ${Ybuf.ctype} *ybuf = Ybufdata + ${Ybufstart}; // load x into local memory __local ${X.ctype} xlocal[${block_x}]; if (local_idx < ${shape1}) xlocal[local_idx] = x[local_idx*${Xstride0}]; barrier(CLK_LOCAL_MEM_FENCE); __local ${Ybuf.ctype} sums[${block_y}][${lsize0}]; sums[i][j] = 0; if (i < ${shape0}) { __global const ${A.ctype} *Ai = Adata + ${Astart} + i*${Astride0}; for(int jj = j; jj < ${shape1}; jj += get_global_size(0)) { sums[i][j] += Ai[jj*${Astride1}] * xlocal[jj]; } } barrier(CLK_LOCAL_MEM_FENCE); % for k in range(lsize0_log2 - 1, 0, -1): if (j < ${2**k}) sums[i][j] += sums[i][${2**k} + j]; barrier(CLK_LOCAL_MEM_FENCE); % endfor if (i < ${shape0} && j == 0) ybuf[i] = ${float_alpha} * (sums[i][0] + sums[i][1]); } """ text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) kernel = cl.Program(p.queue.context, text).build().fn kernel.set_args(*[arr.data for arr in full_args]) plan = Plan(p.queue, kernel, gsize, lsize, name='clra_gemv.block_impl', tag=p.tag, bw_per_call=bw_from_geometry(p.geometry, items), flops_per_call=flops_from_geometry(p.geometry, items), ) plan.full_args = full_args # prevent GC the args plan.description = p.geometry_summary(items) plan.Ybuf = clYbuf # --- Reduce kernel align = False Nreduce = len(Yshape0s_reduce) clYshape0s_reduce = to_device( p.queue, np.array(Yshape0s_reduce, dtype=np.int32)) clYinstride0s_reduce = to_device( p.queue, np.array(Yinstride0s_reduce, dtype=np.int32)) clYinstarts_reduce = to_device( p.queue, np.array(Yinstarts_reduce, dtype=np.int32)) clYstride0s_reduce = to_device( p.queue, np.array(Ystride0s_reduce, dtype=np.int32)) clYstarts_reduce = to_device( p.queue, np.array(Ystarts_reduce, dtype=np.int32)) clYbufinds_reduce = CLRaggedArray.from_arrays( p.queue, Ybufinds_reduce, dtype=np.int32, align=align) assert len(clYbufinds_reduce) == Nreduce assert (clYbufinds_reduce.shape1s == 1).all() textconf_reduce = dict( Ybuf=clYbuf, Yin=p.Y_in, Y=p.Y, float_beta=p.float_beta, float_gamma=p.float_gamma, ) full_args_reduce = ( clYshape0s_reduce, clYbufinds_reduce.cl_shape0s, clYbufinds_reduce.cl_starts, clYbufinds_reduce.cl_buf, clYbuf, clYinstride0s_reduce, clYinstarts_reduce, p.Y_in.cl_buf, clYstride0s_reduce, clYstarts_reduce, p.Y.cl_buf, ) lsize_reduce = None gsize_reduce = (block_y, Nreduce) text_reduce = """ __kernel void reduce( __global const int *shape0s, __global const int *Ishape0s, __global const int *Istarts, __global const int *Idata, __global ${Ybuf.ctype} *Ybufdata, __global const int *Yinstride0s, __global const int *Yinstarts, __global ${Yin.ctype} *Yindata, __global const int *Ystride0s, __global const int *Ystarts, __global ${Y.ctype} *Ydata ) { const int i = get_global_id(0); const int n = get_global_id(1); if (i >= shape0s[n]) return; const int Ishape0 = Ishape0s[n]; __global const int *Ybufstart = Idata + Istarts[n]; __global ${Yin.ctype} *yin = Yindata + Yinstarts[n]; __global ${Y.ctype} *y = Ydata + Ystarts[n]; ${Y.ctype} sum = ${float_beta} * yin[i*Yinstride0s[n]]; for (int j = 0; j < Ishape0; j++) { sum += Ybufdata[Ybufstart[j] + i]; } y[i*Ystride0s[n]] = sum + ${float_gamma}; } """ text_reduce = as_ascii(Template( text_reduce, output_encoding='ascii').render(**textconf_reduce)) kernel_reduce = cl.Program(p.queue.context, text_reduce).build().reduce kernel_reduce.set_args(*[arr.data for arr in full_args_reduce]) plan_reduce = Plan(p.queue, kernel_reduce, gsize_reduce, lsize_reduce, name='clra_gemv.block_impl_reduce', tag=p.tag) plan_reduce.full_args = full_args_reduce # prevent GC of the args plan_reduce.bw_per_call = bw_reduce # plan_reduce.description = p.geometry_summary(items) return [plan, plan_reduce]
def many_dots_impl(p, items): # target use case: # * several very shallow gemvs (short inner prods) into each target # * not all targets have the same size # p.print_geometry_summary(items, full=True) # This algorithm is blocked out so that a work-group [i, j] computes # some segment of an output vector: # e.g. Y[i][ 32 * j : 32 * (j + 1)] # # This is done for two reasons: # - to increase occupancy when there are not so many vectors Y # - to handle long vectors Y # p.print_geometry_summary(items) if p.clra_alpha is not None: raise NotImplementedError() if p.clra_gamma is not None: raise NotImplementedError() if p.clra_beta is not None: raise NotImplementedError() if p.cl_alpha is not None: raise NotImplementedError() if p.cl_gamma is not None: raise NotImplementedError() if not all(s == 1 for s in p.A.stride1s): raise NotImplementedError() assert p.float_alpha is not None assert p.float_gamma is not None if p.A_js is None: # -- easy probably, but not done raise NotImplementedError() A_js_shape0s = p.A_js.shape0s cl_gstructure, textconf = p.cl_geometry_and_textconf(items) # min_n_dots = min(A_js_shape0s) max_n_dots = max(A_js_shape0s) max_y_len = max(p.geometry[ii]['y_len'] for ii in items) MAX_SEGMENT_SIZE = 16 # tricky to tune? segment_size = min( max_y_len, MAX_SEGMENT_SIZE) dot_block_size = min( max(max_n_dots, 1), int(p.queue.device.max_work_group_size / segment_size), ) n_segments = int(np.ceil(float(max_y_len) / segment_size)) gsize = (n_segments * segment_size, dot_block_size, len(items)) lsize = (segment_size, dot_block_size, 1) textconf.update({ 'gsize': gsize, 'lsize': lsize, 'segment_size': segment_size, 'dot_block_size': dot_block_size, 'max_y_len': max_y_len, 'n_locals': segment_size * dot_block_size, # 'segment_idx': 'get_local_id(0)', # 'dot_block_idx': 'get_local_id(1)', 'segment_idx': 'segment_idx', 'dot_block_idx': 'dot_block_idx', }) if 0: for k, v in textconf.items(): print(k, v) textconf.update(p.__dict__) # print('float_gamma', textconf['float_gamma']) # print('cl_gamma', textconf['cl_gamma']) # print('clra_gamma', textconf['clra_gamma']) text = """ __kernel void gemv_many_dots( const __global int *gstructure, const __global ${A.cl_buf.ctype} *A_data, const __global ${X.cl_buf.ctype} *X_data, % if cl_beta is not None: const __global ${cl_beta.ctype} * betas, % endif const __global ${Y_in.cl_buf.ctype} *Y_in_data, __global ${Y.cl_buf.ctype} *Y_data) { __local int lstructure[${n_structure_vars}]; __local ${Y.cl_buf.ctype} y_sum_pre[${segment_size}]; __local ${Y.cl_buf.ctype} \ y_sum_post[${dot_block_size}][${segment_size}]; const int local_idx = get_local_id(0) \ + get_local_id(1) * get_local_size(0); int segment_idx = get_local_id(0); int dot_block_idx = get_local_id(1); for (int ii = local_idx; ii < ${n_structure_vars}; ii += ${n_locals}) { lstructure[ii] = gstructure[ get_global_id(2) * ${structure_vars_stride} + ii]; } barrier(CLK_LOCAL_MEM_FENCE); if (get_global_id(0) < ${y_len}) { if (dot_block_idx == 0) { % if float_beta is not None and float_beta != 0 : y_sum_pre[segment_idx] = ${float_beta} * Y_in_data[${y_in_starts} + get_global_id(0)]; % elif cl_beta is not None: y_sum_pre[segment_idx] = betas[${bb}] * Y_in_data[${y_in_starts} + get_global_id(0)]; % else : y_sum_pre[segment_idx] = 0; % endif % if float_gamma is not None: % if float_gamma != 0: y_sum_pre[segment_idx] += ${float_gamma}; % endif % endif } //printf("betaY + gamma=%f\\n", y_sum_pre[segment_idx]); // XXX Move X into shared memory first y_sum_post[dot_block_idx][segment_idx] = 0; for (int ii = dot_block_idx; ii < ${n_dot_products}; ii += ${dot_block_size}) { for (int nn = 0; nn < ${N_i}; nn += 1) { y_sum_post[dot_block_idx][segment_idx] += A_data[${a_starts} + get_global_id(0) * ${a_s0} + nn] * X_data[${x_starts} + nn]; } } } barrier(CLK_LOCAL_MEM_FENCE); //printf("AX=%f\\n", y_sum_post[dot_block_idx][segment_idx]); if ((get_global_id(0) < ${y_len}) && (dot_block_idx == 0)) { for (int ii = 1; ii < ${dot_block_size}; ++ii) { y_sum_post[0][segment_idx] += y_sum_post[ii][segment_idx]; } Y_data[${y_offset} + get_global_id(0)] = y_sum_pre[segment_idx] + ${float_alpha} * y_sum_post[0][segment_idx]; //printf("Yout=%f\\n", Y_data[${y_offset} + get_global_id(0)]); } } """ text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) fn = cl.Program(p.queue.context, text).build().gemv_many_dots full_args = [ cl_gstructure, p.A.cl_buf, p.X.cl_buf, ] if p.cl_beta is not None: full_args += [p.cl_beta] full_args += [ p.Y_in.cl_buf, p.Y.cl_buf, ] fn.set_args(*[arr.data for arr in full_args]) rval = Plan(p.queue, fn, gsize, lsize, name='clra_gemv.many_dots_impl', tag=p.tag, bw_per_call=bw_from_geometry(p.geometry, items), flops_per_call=flops_from_geometry(p.geometry, items), ) rval.full_args = full_args # prevent GC the args rval.description = p.geometry_summary(items) return rval
def plan_whitenoise(queue, Y, dist_enums, dist_params, scale, dt, ranluxcltab, tag=None): N = len(Y) assert len(Y) == len(dist_enums) == len(dist_params) == len(scale) assert dist_enums.ctype == 'int' assert scale.ctype == 'int' for i in range(N): for arr in [Y, dist_enums, dist_params, scale]: assert arr.stride1s[i] == 1 assert Y.shape1s[i] == 1 assert Y.stride0s[i] == 1 assert Y.stride1s[i] == 1 assert dist_enums.shape0s[i] == dist_enums.shape1s[i] == 1 assert dist_params.shape1s[i] == 1 assert scale.shape0s[i] == scale.shape1s[i] == 1 assert scale.stride0s[i] == scale.stride1s[i] == 1 text = """ ${dist_header} ////////// MAIN FUNCTION ////////// __kernel void whitenoise( __global const int *shape0s, __global const int *Ystarts, __global ${Ytype} *Ydata, __global const int *Estarts, __global const int *Edata, __global const int *Pstarts, __global const ${Ptype} *Pdata, __global const int *scalestarts, __global const int *scaledata, __global ranluxcl_state_t *ranluxcltab ) { const int i0 = get_global_id(0); const int k = get_global_id(1); const int m = shape0s[k]; if (i0 >= m) return; __global ${Ytype} *y = Ydata + Ystarts[k]; ranluxcl_state_t state; ranluxcl_download_seed(&state, ranluxcltab); const int scale = *(scaledata + scalestarts[k]); const int dist_enum = *(Edata + Estarts[k]); __global const float *dist_params = Pdata + Pstarts[k]; float4 samples; float sample; int samplei = 4; for (int i = i0; i < m; i += get_global_size(0)) { if (samplei >= 4) { samples = sample_dist(dist_enum, dist_params, &state); samplei = 0; } sample = getfloat4(samples, samplei); y[i] = (scale) ? ${sqrt_dt_inv} * sample : sample; samplei++; } ranluxcl_upload_seed(&state, ranluxcltab); } """ textconf = dict(Ytype=Y.ctype, Ptype=dist_params.ctype, sqrt_dt_inv=1. / np.sqrt(dt), dist_header=dist_header) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = ( Y.cl_shape0s, Y.cl_starts, Y.cl_buf, dist_enums.cl_starts, dist_enums.cl_buf, dist_params.cl_starts, dist_params.cl_buf, scale.cl_starts, scale.cl_buf, ranluxcltab, ) _fn = cl.Program(queue.context, text).build().whitenoise _fn.set_args(*[arr.data for arr in full_args]) max_len = min(queue.device.max_work_group_size, max(Y.shape0s)) gsize = (max_len, N) lsize = (max_len, 1) rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_whitenoise", tag=tag) rval.full_args = full_args # prevent garbage-collection return rval
def plan_bcm2(queue, pre, post, theta, delta, alpha, tag=None): #weights, max_weight, assert len(pre) == len(post) == len(theta) == len(delta) == alpha.size N = len(pre) for arr in (pre, post, theta): # vectors assert (arr.shape1s == 1).all() for arr in (delta,): # matrices assert (arr.stride1s == 1).all() assert (post.shape0s == delta.shape0s).all() assert (pre.shape0s == delta.shape1s).all() assert (post.shape0s == theta.shape0s).all() assert (pre.ctype == post.ctype == theta.ctype == delta.ctype == alpha.ctype) text = """ __kernel void bcm2( __global const int *shape0s, __global const int *shape1s, __global const int *pre_stride0s, __global const int *pre_starts, __global const ${type} *pre_data, __global const int *post_stride0s, __global const int *post_starts, __global const ${type} *post_data, __global const int *theta_stride0s, __global const int *theta_starts, __global const ${type} *theta_data, __global const int *delta_stride0s, __global const int *delta_starts, __global ${type} *delta_data, __global const ${type} *alphas //__global const int *weights_stride0s, //__global const int *weights_starts, //__global const ${type} *weights_data, //__global const ${type} *max_weights ) { const int ij = get_global_id(0); const int k = get_global_id(1); const int shape0 = shape0s[k]; const int shape1 = shape1s[k]; const int i = ij / shape1; const int j = ij % shape1; __global ${type} *delta = delta_data + delta_starts[k]; const ${type} pre = pre_data[pre_starts[k] + j*pre_stride0s[k]]; const ${type} post = post_data[post_starts[k] + i*post_stride0s[k]]; const ${type} theta = theta_data[ theta_starts[k] + i*theta_stride0s[k]]; const ${type} alpha = alphas[k]; //__global const ${type} *weights = weights_data + weights_starts[k]; //const ${type} max_weight = max_weights[k]; if (i < shape0) { delta[i*delta_stride0s[k] + j] = alpha * post * (post - theta) * pre; //if (i==j) { // delta[i*delta_stride0s[k] + j] = 0; //} else { // // delta[i*delta_stride0s[k] + j] = alpha * post * (post - theta) * pre; // // if (fabs(weights[i*weights_stride0s[k] + j] + delta[i*delta_stride0s[k] + j]) > max_weight) { // delta[i*delta_stride0s[k] + j] = 0; // } //} } } """ textconf = dict(type=pre.ctype) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = ( delta.cl_shape0s, delta.cl_shape1s, pre.cl_stride0s, pre.cl_starts, pre.cl_buf, post.cl_stride0s, post.cl_starts, post.cl_buf, theta.cl_stride0s, theta.cl_starts, theta.cl_buf, delta.cl_stride0s, delta.cl_starts, delta.cl_buf, alpha, ) #weights.cl_stride0s, weights.cl_starts, weights.cl_buf, #max_weight, _fn = cl.Program(queue.context, text).build().bcm2 _fn.set_args(*[arr.data for arr in full_args]) lsize = None gsize = (delta.sizes.max(), N) plan = Plan(queue, _fn, gsize, lsize=lsize, name="cl_bcm2", tag=tag) plan.full_args = full_args # prevent garbage-collection plan.flops_per_call = 4 * delta.sizes.sum() plan.bw_per_call = (pre.nbytes + post.nbytes + theta.nbytes + delta.nbytes + alpha.nbytes) # + weights.nbytes + max_weight.nbytes) return plan
def _plan_template( # noqa: C901 queue, name, core_text, declares="", tag=None, blockify=True, inputs=None, outputs=None, parameters=None, ): """Template for making a plan for vector nonlinearities. This template assumes that all inputs and outputs are vectors. Parameters ---------- blockify : bool If true, divide the inputs up into blocks with a maximum size. inputs: dictionary of CLRaggedArrays Inputs to the function. RaggedArrays must be a list of vectors. outputs: dictionary of CLRaggedArrays Outputs of the function. RaggedArrays must be a list of vectors. parameters: dictionary of CLRaggedArrays Parameters to the function. Each RaggedArray element must be a vector of the same length of the inputs, or a scalar (to be broadcasted). Providing a float instead of a RaggedArray makes that parameter constant. """ inputs = {} if inputs is None else inputs outputs = {} if outputs is None else outputs parameters = {} if parameters is None else parameters input0 = list(inputs.values())[0] # input to use as reference for lengths # split parameters into static and updated params static_params = {} # static params (hard-coded) params = {} # variable params (updated) for k, v in parameters.items(): if isinstance(v, CLRaggedArray): params[k] = v elif is_number(v): static_params[k] = ("float", float(v)) else: raise ValueError( "Parameter %r must be CLRaggedArray or float (got %s)" % (k, type(v))) avars = {} bw_per_call = 0 for vname, v in list(inputs.items()) + list(outputs.items()) + list( params.items()): assert vname not in avars, "Name clash" assert len(v) == len(input0) assert (v.shape0s == input0.shape0s).all() assert (v.stride0s == v.shape1s).all() # rows contiguous assert (v.stride1s == 1).all() # columns contiguous assert (v.shape1s == 1).all() # vectors only offset = "%(name)s_starts[gind1]" % {"name": vname} avars[vname] = (v.ctype, offset) bw_per_call += v.nbytes ivars = {k: avars[k] for k in inputs} ovars = {k: avars[k] for k in outputs} pvars = {k: avars[k] for k in params} fn_name = str(name) textconf = dict( fn_name=fn_name, declares=declares, core_text=core_text, ivars=ivars, ovars=ovars, pvars=pvars, static_params=static_params, ) text = """ ////////// MAIN FUNCTION ////////// __kernel void ${fn_name}( % for name, [type, offset] in ivars.items(): __global const int *${name}_starts, __global const ${type} *${name}_buf, % endfor % for name, [type, offset] in ovars.items(): __global const int *${name}_starts, __global ${type} *${name}_buf, % endfor % for name, [type, offset] in pvars.items(): __global const int *${name}_starts, __global const int *${name}_shape0s, __global const ${type} *${name}_buf, % endfor __global const int *sizes ) { const int gind0 = get_global_id(0); const int gind1 = get_global_id(1); if (gind1 >= ${N} || gind0 >= sizes[gind1]) return; % for name, [type, offset] in ivars.items(): ${type} ${name} = ${name}_buf[${offset} + gind0]; % endfor % for name, [type, offset] in ovars.items(): ${type} ${name}; % endfor % for name, [type, offset] in pvars.items(): const ${type} ${name} = ${name}_buf[${offset} + gind0]; % endfor % for name, [type, value] in static_params.items(): const ${type} ${name} = ${value}; % endfor ////////////////////////////////////////////////// //vvvvv USER DECLARATIONS BELOW vvvvv ${declares} //^^^^^ USER DECLARATIONS ABOVE ^^^^^ ////////////////////////////////////////////////// /////vvvvv USER COMPUTATIONS BELOW vvvvv ${core_text} /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^ % for name, [type, offset] in ovars.items(): ${name}_buf[${offset} + gind0] = ${name}; % endfor } """ if blockify: # blockify to help with heterogeneous sizes # find best block size block_sizes = [16, 32, 64, 128, 256, 512, 1024] N = np.inf for block_size_i in block_sizes: sizes_i, inds_i, _ = blockify_vector(block_size_i, input0) if len(sizes_i) < N: N = len(sizes_i) block_size = block_size_i sizes = sizes_i inds = inds_i clsizes = to_device(queue, sizes) get_starts = lambda ras: [ to_device(queue, starts) for starts in blockify_vectors(block_size, ras)[2] ] Istarts = get_starts(inputs.values()) Ostarts = get_starts(outputs.values()) Pstarts = get_starts(params.values()) Pshape0s = [to_device(queue, x.shape0s[inds]) for x in params.values()] lsize = None gsize = (block_size, len(sizes)) full_args = [] for vstarts, v in zip(Istarts, inputs.values()): full_args.extend([vstarts, v.cl_buf]) for vstarts, v in zip(Ostarts, outputs.values()): full_args.extend([vstarts, v.cl_buf]) for vstarts, vshape0s, v in zip(Pstarts, Pshape0s, params.values()): full_args.extend([vstarts, vshape0s, v.cl_buf]) full_args.append(clsizes) else: # Allocate more than enough kernels in a matrix lsize = None gsize = (input0.shape0s.max(), len(input0)) full_args = [] for v in inputs.values(): full_args.extend([v.cl_starts, v.cl_buf]) for v in outputs.values(): full_args.extend([v.cl_starts, v.cl_buf]) for vname, v in params.items(): full_args.extend([v.cl_starts, v.cl_shape0s, v.cl_buf]) full_args.append(input0.cl_shape0s) textconf["N"] = gsize[1] text = as_ascii(Template(text, output_encoding="ascii").render(**textconf)) fns = cl.Program(queue.context, text).build() _fn = getattr(fns, fn_name) _fn.set_args(*[arr.data for arr in full_args]) plan = Plan(queue, _fn, gsize, lsize=lsize, name=name, tag=tag) plan.full_args = tuple(full_args) # prevent garbage-collection plan.bw_per_call = bw_per_call plan.description = "groups: %d; items: %d; items/group: %0.1f [%d, %d]" % ( gsize[1], input0.sizes.sum(), input0.sizes.mean(), input0.sizes.min(), input0.sizes.max(), ) return plan
def many_dots_impl(p, items): # target use case: # * several very shallow gemvs (short inner prods) into each target # * not all targets have the same size # p.print_geometry_summary(items, full=True) # This algorithm is blocked out so that a work-group [i, j] computes # some segment of an output vector: # e.g. Y[i][ 32 * j : 32 * (j + 1)] # # This is done for two reasons: # - to increase occupancy when there are not so many vectors Y # - to handle long vectors Y # p.print_geometry_summary(items) if p.clra_alpha is not None: raise NotImplementedError() if p.clra_gamma is not None: raise NotImplementedError() if p.clra_beta is not None: raise NotImplementedError() if p.cl_alpha is not None: raise NotImplementedError() if p.cl_gamma is not None: raise NotImplementedError() if not all(s == 1 for s in p.A.stride1s): raise NotImplementedError() assert p.float_alpha is not None assert p.float_gamma is not None if p.A_js is None: # -- easy probably, but not done raise NotImplementedError() A_js_shape0s = p.A_js.shape0s cl_gstructure, textconf = p.cl_geometry_and_textconf(items) # min_n_dots = min(A_js_shape0s) max_n_dots = max(A_js_shape0s) max_y_len = max(p.geometry[ii]['y_len'] for ii in items) MAX_SEGMENT_SIZE = 16 # tricky to tune? segment_size = min(max_y_len, MAX_SEGMENT_SIZE) dot_block_size = min( max(max_n_dots, 1), int(p.queue.device.max_work_group_size / segment_size), ) n_segments = int(np.ceil(float(max_y_len) / segment_size)) gsize = (n_segments * segment_size, dot_block_size, len(items)) lsize = (segment_size, dot_block_size, 1) textconf.update({ 'gsize': gsize, 'lsize': lsize, 'segment_size': segment_size, 'dot_block_size': dot_block_size, 'max_y_len': max_y_len, 'n_locals': segment_size * dot_block_size, # 'segment_idx': 'get_local_id(0)', # 'dot_block_idx': 'get_local_id(1)', 'segment_idx': 'segment_idx', 'dot_block_idx': 'dot_block_idx', }) if 0: for k, v in textconf.items(): print(k, v) textconf.update(p.__dict__) # print('float_gamma', textconf['float_gamma']) # print('cl_gamma', textconf['cl_gamma']) # print('clra_gamma', textconf['clra_gamma']) text = """ __kernel void gemv_many_dots( const __global int *gstructure, const __global ${A.cl_buf.ctype} *A_data, const __global ${X.cl_buf.ctype} *X_data, % if cl_beta is not None: const __global ${cl_beta.ctype} * betas, % endif const __global ${Y_in.cl_buf.ctype} *Y_in_data, __global ${Y.cl_buf.ctype} *Y_data) { __local int lstructure[${n_structure_vars}]; __local ${Y.cl_buf.ctype} y_sum_pre[${segment_size}]; __local ${Y.cl_buf.ctype} \ y_sum_post[${dot_block_size}][${segment_size}]; const int local_idx = get_local_id(0) \ + get_local_id(1) * get_local_size(0); int segment_idx = get_local_id(0); int dot_block_idx = get_local_id(1); for (int ii = local_idx; ii < ${n_structure_vars}; ii += ${n_locals}) { lstructure[ii] = gstructure[ get_global_id(2) * ${structure_vars_stride} + ii]; } barrier(CLK_LOCAL_MEM_FENCE); if (get_global_id(0) < ${y_len}) { if (dot_block_idx == 0) { % if float_beta is not None and float_beta != 0 : y_sum_pre[segment_idx] = ${float_beta} * Y_in_data[${y_in_starts} + get_global_id(0)]; % elif cl_beta is not None: y_sum_pre[segment_idx] = betas[${bb}] * Y_in_data[${y_in_starts} + get_global_id(0)]; % else : y_sum_pre[segment_idx] = 0; % endif % if float_gamma is not None: % if float_gamma != 0: y_sum_pre[segment_idx] += ${float_gamma}; % endif % endif } //printf("betaY + gamma=%f\\n", y_sum_pre[segment_idx]); // XXX Move X into shared memory first y_sum_post[dot_block_idx][segment_idx] = 0; for (int ii = dot_block_idx; ii < ${n_dot_products}; ii += ${dot_block_size}) { for (int nn = 0; nn < ${N_i}; nn += 1) { y_sum_post[dot_block_idx][segment_idx] += A_data[${a_starts} + get_global_id(0) * ${a_s0} + nn] * X_data[${x_starts} + nn]; } } } barrier(CLK_LOCAL_MEM_FENCE); //printf("AX=%f\\n", y_sum_post[dot_block_idx][segment_idx]); if ((get_global_id(0) < ${y_len}) && (dot_block_idx == 0)) { for (int ii = 1; ii < ${dot_block_size}; ++ii) { y_sum_post[0][segment_idx] += y_sum_post[ii][segment_idx]; } Y_data[${y_offset} + get_global_id(0)] = y_sum_pre[segment_idx] + ${float_alpha} * y_sum_post[0][segment_idx]; //printf("Yout=%f\\n", Y_data[${y_offset} + get_global_id(0)]); } } """ text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) fn = cl.Program(p.queue.context, text).build().gemv_many_dots full_args = [ cl_gstructure, p.A.cl_buf, p.X.cl_buf, ] if p.cl_beta is not None: full_args += [p.cl_beta] full_args += [ p.Y_in.cl_buf, p.Y.cl_buf, ] fn.set_args(*[arr.data for arr in full_args]) rval = Plan( p.queue, fn, gsize, lsize, name='clra_gemv.many_dots_impl', tag=p.tag, bw_per_call=bw_from_geometry(p.geometry, items), flops_per_call=flops_from_geometry(p.geometry, items), ) rval.full_args = full_args # prevent GC the args rval.description = p.geometry_summary(items) return rval
def ref_impl(p, items): """ Return an OpenCL function to calculate elements `items` of gemv operation `p`. In this reference implementation, we create a work item per output number, or more specifically, a work grid of (max_y_len, len(items)). Each work item loops over the dot products and the elements within each dot product to compute the output value Y[global_id(1)][global_id(0)]. """ if p.clra_alpha is not None: raise NotImplementedError() if p.clra_gamma is not None: raise NotImplementedError() cl_items = to_device(p.queue, np.asarray(items, dtype='int32')) if 0: if len(items) < 10: print('Falling back on reference implementation') p.print_geometry_summary(items, full=True) else: print('Falling back on reference implementation') p.print_geometry_summary(items) assert all(s == 1 for s in p.A.stride1s) assert all(s == 1 for s in p.X.stride1s) assert all(s == 1 for s in p.Y.stride0s) assert all(s == 1 for s in p.Y.stride1s) assert all(s == 1 for s in p.Y_in.stride0s) assert all(s == 1 for s in p.Y_in.stride1s) text = """ __kernel void gemv_ref( __global int *items, % if cl_alpha is not None: __global ${cl_alpha.ctype} * alphas, % endif % if (A_js is not None): __global int *A_starts, __global int *A_shape1s, __global int *A_stride0s, __global ${A.cl_buf.ctype} *A_data, __global int *A_js_starts, __global int *A_js_shape0s, __global int *A_js_data, __global int *X_starts, __global int *X_stride0s, __global ${X.cl_buf.ctype} *X_data, __global int *X_js_starts, __global int *X_js_data, % endif % if cl_beta is not None: __global ${cl_beta.ctype} * betas, % endif % if clra_beta is not None: __global int *beta_starts, __global int *beta_data, % endif % if cl_gamma is not None: __global ${cl_gamma.ctype} * gammas, % endif __global int *Y_in_starts, __global ${Y_in.cl_buf.ctype} *Y_in_data, __global int *Y_starts, __global int *Y_shape0s, __global ${Y.cl_buf.ctype} *Y_data) { const int mm = get_global_id(0); const int bb = items[get_global_id(1)]; const int M = Y_shape0s[bb]; if (mm < M) { const int y_offset = Y_starts[bb]; const int y_in_offset = Y_in_starts[bb]; % if float_beta is not None: const ${Y.cl_buf.ctype} beta = ${float_beta}; % elif cl_beta is not None: const ${cl_beta.ctype} beta = betas[bb]; % elif clra_beta is not None: const int beta_offset = beta_starts[bb]; const ${clra_beta.cl_buf.ctype} beta = beta_data[beta_offset + mm]; % endif % if float_gamma is not None: const ${Y.cl_buf.ctype} gamma = ${float_gamma}; % elif cl_gamma is not None: const ${cl_gamma.ctype} gamma = gammas[bb]; % endif Y_data[y_offset + mm] = gamma + beta * Y_in_data[y_in_offset + mm]; % if A_js is not None: const int n_dot_products = A_js_shape0s[bb]; X_js_data += X_js_starts[bb]; A_js_data += A_js_starts[bb]; ${Y.cl_buf.ctype} y_sum = 0; for (int ii = 0; ii < n_dot_products; ++ii) { const int x_ji = X_js_data[ii]; const int a_ji = A_js_data[ii]; const int N_i = A_shape1s[a_ji]; const int x_offset = X_starts[x_ji]; const int a_offset = A_starts[a_ji]; const int AsM = A_stride0s[a_ji]; const int XsM = X_stride0s[x_ji]; for (int nn = 0; nn < N_i; ++nn) { y_sum += X_data[x_offset + nn * XsM] * A_data[a_offset + mm * AsM + nn]; } } % if float_alpha is not None: Y_data[y_offset + mm] += ${float_alpha} * y_sum; % elif cl_alpha is not None: Y_data[y_offset + mm] += alphas[bb] * y_sum; % endif % endif } } """ text = as_ascii( Template(text, output_encoding='ascii').render(**p.__dict__)) gsize = (max(p.geometry[ii]['y_len'] for ii in items), len(items)) lsize = None fn = cl.Program(p.queue.context, text).build().gemv_ref full_args = [cl_items] if p.cl_alpha is not None: full_args += [p.cl_alpha] if p.A_js is not None: full_args += [ p.A.cl_starts, p.A.cl_shape1s, p.A.cl_stride0s, p.A.cl_buf, p.A_js.cl_starts, p.A_js.cl_shape0s, p.A_js.cl_buf, p.X.cl_starts, p.X.cl_stride0s, p.X.cl_buf, p.X_js.cl_starts, p.X_js.cl_buf, ] if p.cl_beta is not None: full_args += [p.cl_beta] elif p.clra_beta is not None: full_args += [p.clra_beta.cl_starts, p.clra_beta.cl_buf] if p.cl_gamma is not None: full_args += [p.cl_gamma] elif p.clra_gamma is not None: full_args += [p.clra_gamma.cl_starts, p.clra_gamma.cl_buf] full_args += [ p.Y_in.cl_starts, p.Y_in.cl_buf, p.Y.cl_starts, p.Y.cl_shape0s, p.Y.cl_buf ] # print([str(arr.dtype)[0] for arr in full_args]) fn.set_args(*[arr.data for arr in full_args]) rval = Plan(p.queue, fn, gsize, lsize, name="clra_gemv.ref_impl", tag=p.tag, bw_per_call=bw_from_geometry(p.geometry, items), flops_per_call=flops_from_geometry(p.geometry, items)) rval.full_args = full_args # prevent GC the args return rval
def plan_probes(queue, periods, X, Y, tag=None): """ Parameters ---------- P : raggedarray of ints The period (in time-steps) of each probe """ assert len(X) == len(Y) assert len(X) == len(periods) assert X.ctype == Y.ctype N = len(X) # N.B. X[i].shape = (M, N) # Y[i].shape = (buf_len, M * N) for arr in [X, Y]: assert (arr.stride1s == 1).all() assert (X.shape0s * X.shape1s == Y.shape1s).all() assert (X.stride0s == X.shape1s).all() assert (X.stride1s == 1).all() assert (Y.stride0s == Y.shape1s).all() assert (Y.stride1s == 1).all() periods = np.asarray(periods, dtype='float32') cl_periods = to_device(queue, periods) cl_countdowns = to_device(queue, periods - 1) cl_bufpositions = to_device(queue, np.zeros(N, dtype='int32')) text = """ ////////// MAIN FUNCTION ////////// __kernel void probes( __global ${Ctype} *countdowns, __global int *bufpositions, __global const ${Ptype} *periods, __global const int *Xstarts, __global const int *Xshape0s, __global const int *Xshape1s, __global const ${Xtype} *Xdata, __global const int *Ystarts, __global ${Ytype} *Ydata ) { const int n = get_global_id(1); const ${Ctype} countdown = countdowns[n]; if (countdown <= 0) { const int n_dims = Xshape0s[n] * Xshape1s[n]; __global const ${Xtype} *x = Xdata + Xstarts[n]; const int bufpos = bufpositions[n]; __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims; for (int ii = get_global_id(0); ii < n_dims; ii += get_global_size(0)) { y[ii] = x[ii]; } // This should *not* cause deadlock because // all local threads guaranteed to be // in this branch together. barrier(CLK_LOCAL_MEM_FENCE); if (get_global_id(0) == 0) { countdowns[n] = countdown + periods[n] - 1; bufpositions[n] = bufpos + 1; } } else { barrier(CLK_LOCAL_MEM_FENCE); if (get_global_id(0) == 0) { countdowns[n] = countdown - 1; } } } """ textconf = dict(N=N, Xtype=X.ctype, Ytype=Y.ctype, Ctype=cl_countdowns.ctype, Ptype=cl_periods.ctype) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = ( cl_countdowns, cl_bufpositions, cl_periods, X.cl_starts, X.cl_shape0s, X.cl_shape1s, X.cl_buf, Y.cl_starts, Y.cl_buf, ) _fn = cl.Program(queue.context, text).build().probes _fn.set_args(*[arr.data for arr in full_args]) max_len = min(queue.device.max_work_group_size, max(X.shape0s)) gsize = (max_len, N,) lsize = (max_len, 1) rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag) rval.full_args = full_args # prevent garbage-collection rval.cl_bufpositions = cl_bufpositions rval.Y = Y rval.bw_per_call = (X.nbytes + Y.nbytes + cl_periods.nbytes + cl_countdowns.nbytes + cl_bufpositions.nbytes) rval.description = ( "groups: %d; items: %d; items/group: %0.1f [%d, %d]" % (len(Y), Y.sizes.sum(), Y.sizes.mean(), Y.sizes.min(), Y.sizes.max())) return rval
def plan_linear_synapse(queue, X, Y, A, B, Xbuf, Ybuf, tag=None): """ Implements a filter of the form y[n+1] + a[0] y[n] + ... + a[i] y[n-i] = b[0] x[n] + ... + b[j] x[n-j] """ N = len(X) assert len(Y) == N and len(A) == N and len(B) == N for arr in [X, Y, A, B, Xbuf, Ybuf]: assert (arr.shape1s == arr.stride0s).all() assert (arr.stride1s == 1).all() for arr in [X, Y, A, B]: # vectors assert (arr.shape1s == 1).all() assert (X.shape0s == Y.shape0s).all() assert (B.shape0s >= 1).all() assert ((B.shape0s == 1) | (Xbuf.shape0s == B.shape0s)).all() assert (Xbuf.shape1s == X.shape0s).all() assert ((A.shape0s == 1) | (Ybuf.shape0s == A.shape0s)).all() assert (Ybuf.shape1s == Y.shape0s).all() assert X.ctype == Xbuf.ctype assert Y.ctype == Ybuf.ctype Xbufpos = to_device(queue, np.zeros(N, dtype='int32')) Ybufpos = to_device(queue, np.zeros(N, dtype='int32')) text = """ ////////// MAIN FUNCTION ////////// __kernel void linear_synapse( __global const int *shape0s, __global const int *Xstarts, __global const ${Xtype} *Xdata, __global const int *Ystarts, __global ${Ytype} *Ydata, __global const int *Ashape0s, __global const int *Astarts, __global const ${Atype} *Adata, __global const int *Bshape0s, __global const int *Bstarts, __global const ${Btype} *Bdata, __global const int *Xbufstarts, __global ${Xtype} *Xbufdata, __global const int *Ybufstarts, __global ${Ytype} *Ybufdata, __global int *Xbufpos, __global int *Ybufpos ) { int i = get_global_id(0); const int k = get_global_id(1); __global const ${Xtype} *x = Xdata + Xstarts[k]; __global ${Ytype} *y = Ydata + Ystarts[k]; __global const ${Atype} *a = Adata + Astarts[k]; __global const ${Btype} *b = Bdata + Bstarts[k]; const int n = shape0s[k]; const int na = Ashape0s[k]; const int nb = Bshape0s[k]; if (na == 0 && nb == 1) { for (; i < n; i += get_global_size(0)) y[i] = b[0] * x[i]; } else if (na == 1 && nb == 1) { for (; i < n; i += get_global_size(0)) { y[i] *= -a[0]; y[i] += b[0] * x[i]; } } else { // general filtering __global ${Xtype} *xbuf = Xbufdata + Xbufstarts[k]; __global ${Ytype} *ybuf = Ybufdata + Ybufstarts[k]; const int ix = Xbufpos[k]; const int iy = Ybufpos[k]; const int ix1 = (ix > 0) ? ix - 1 : nb - 1; const int iy1 = (iy > 0) ? iy - 1 : na - 1; ${Ytype} yi; int j, jj; for (; i < n; i += get_global_size(0)) { yi = b[0] * x[i]; if (nb > 1) { xbuf[ix*n + i] = x[i]; // copy input to buffer for (j = 1; j < nb; j++) { jj = (ix + j) % nb; yi += b[j] * xbuf[jj*n + i]; } } if (na > 0) { yi -= a[0] * y[i]; if (na > 1) { for (j = 1; j < na; j++) { jj = (iy + j) % na; yi -= a[j] * ybuf[jj*n + i]; } ybuf[iy1*n + i] = yi; // copy output to buffer } } y[i] = yi; } Xbufpos[k] = ix1; Ybufpos[k] = iy1; } } """ textconf = dict( Xtype=X.ctype, Ytype=Y.ctype, Atype=A.ctype, Btype=B.ctype ) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = ( X.cl_shape0s, X.cl_starts, X.cl_buf, Y.cl_starts, Y.cl_buf, A.cl_shape0s, A.cl_starts, A.cl_buf, B.cl_shape0s, B.cl_starts, B.cl_buf, Xbuf.cl_starts, Xbuf.cl_buf, Ybuf.cl_starts, Ybuf.cl_buf, Xbufpos, Ybufpos, ) _fn = cl.Program(queue.context, text).build().linear_synapse _fn.set_args(*[arr.data for arr in full_args]) max_len = min(max(X.shape0s), queue.device.max_work_group_size) gsize = (max_len, N) lsize = (max_len, 1) rval = Plan( queue, _fn, gsize, lsize=lsize, name="cl_linear_synapse", tag=tag) rval.full_args = full_args # prevent garbage-collection rval.bw_per_call = ( X.nbytes + Y.nbytes + A.nbytes + B.nbytes + Xbuf.nbytes + Ybuf.nbytes) rval.description = ( "groups: %d; items: %d; items/group: %0.1f [%d, %d]" % (len(Y), Y.sizes.sum(), Y.sizes.mean(), Y.sizes.min(), Y.sizes.max())) return rval
def plan_elementwise_inc(queue, A, X, Y, tag=None): """Implements an element-wise increment Y += A * X""" N = len(X) assert len(Y) == N and len(A) == N for arr in [A, X, Y]: assert (arr.stride1s == 1).all() assert ((X.shape0s == 1) | (X.shape0s == Y.shape0s)).all() assert ((X.shape1s == 1) | (X.shape1s == Y.shape1s)).all() assert ((A.shape0s == 1) | (A.shape0s == Y.shape0s)).all() assert ((A.shape1s == 1) | (A.shape1s == Y.shape1s)).all() assert (X.stride1s == 1).all() assert (Y.stride1s == 1).all() assert (A.stride1s == 1).all() assert X.ctype == Y.ctype assert A.ctype == Y.ctype text = """ inline ${Ytype} get_element( __global const ${Ytype} *data, const int shape0, const int shape1, const int stride0, const int i, const int j ) { if (shape0 == 1 && shape1 == 1) return data[0]; else if (shape0 == 1) return data[j]; else if (shape1 == 1) return data[i * stride0]; else return data[i * stride0 + j]; } ////////// MAIN FUNCTION ////////// __kernel void elementwise_inc( __global const int *Ashape0s, __global const int *Ashape1s, __global const int *Astride0s, __global const int *Astarts, __global const ${Atype} *Adata, __global const int *Xshape0s, __global const int *Xshape1s, __global const int *Xstride0s, __global const int *Xstarts, __global const ${Xtype} *Xdata, __global const int *Yshape0s, __global const int *Yshape1s, __global const int *Ystride0s, __global const int *Ystarts, __global ${Ytype} *Ydata ) { const int n = get_global_id(1); __global const ${Atype} *a = Adata + Astarts[n]; __global const ${Xtype} *x = Xdata + Xstarts[n]; __global ${Ytype} *y = Ydata + Ystarts[n]; const int Ysize = Yshape0s[n] * Yshape1s[n]; for (int ij = get_global_id(0); ij < Ysize; ij += get_global_size(0)) { int i = ij / Yshape1s[n]; int j = ij - i * Yshape1s[n]; ${Atype} aa = get_element( a, Ashape0s[n], Ashape1s[n], Astride0s[n], i, j); ${Xtype} xx = get_element( x, Xshape0s[n], Xshape1s[n], Xstride0s[n], i, j); y[i * Ystride0s[n] + j] += aa * xx; } } """ textconf = dict(Atype=A.ctype, Xtype=X.ctype, Ytype=Y.ctype) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = ( A.cl_shape0s, A.cl_shape1s, A.cl_stride0s, A.cl_starts, A.cl_buf, X.cl_shape0s, X.cl_shape1s, X.cl_stride0s, X.cl_starts, X.cl_buf, Y.cl_shape0s, Y.cl_shape1s, Y.cl_stride0s, Y.cl_starts, Y.cl_buf, ) _fn = cl.Program(queue.context, text).build().elementwise_inc _fn.set_args(*[arr.data for arr in full_args]) max_group = queue.device.max_work_group_size mn = min(max(max(Y.shape0s), max(Y.shape1s)), max_group) gsize = (mn, N) lsize = (mn, 1) rval = Plan( queue, _fn, gsize, lsize=lsize, name="cl_elementwise_inc", tag=tag) rval.full_args = full_args # prevent garbage-collection rval.bw_per_call = A.nbytes + X.nbytes + Y.nbytes rval.description = ( "groups: %d; items: %d; items/group: %0.1f [%d, %d]" % (len(Y), Y.sizes.sum(), Y.sizes.mean(), Y.sizes.min(), Y.sizes.max())) return rval
def plan_whitesignal(queue, Y, t, signals, dt, tag=None): N = len(Y) assert len(Y) == len(t) == len(signals) for i in range(N): for arr in [Y, t, signals]: assert arr.stride1s[i] == 1 assert Y.shape1s[i] == 1 assert Y.stride0s[i] == Y.stride1s[i] == 1 assert t.shape0s[i] == t.shape1s[i] == 1 assert Y.shape0s[i] == signals.shape1s[i] assert signals.stride1s[i] == 1 text = """ ////////// MAIN FUNCTION ////////// __kernel void whitesignal( __global const int *Yshape0s, __global const int *Ystarts, __global ${Ytype} *Ydata, __global const int *Tstarts, __global ${Ttype} *Tdata, __global const int *Sshape0s, __global const int *Sstarts, __global ${Stype} *Sdata ) { int i = get_global_id(0); const int k = get_global_id(1); const int m = Yshape0s[k]; if (i >= m) return; __global ${Ytype} *y = Ydata + Ystarts[k]; __global ${Ytype} *s = Sdata + Sstarts[k]; const float t = *(Tdata + Tstarts[k]); const int nt = Sshape0s[k]; const int ti = (int)round(t / ${dt}) % nt; for (; i < m; i += get_global_size(0)) y[i] = s[m*ti + i]; } """ textconf = dict(Ytype=Y.ctype, Ttype=t.ctype, Stype=signals.ctype, dt=dt) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = ( Y.cl_shape0s, Y.cl_starts, Y.cl_buf, t.cl_starts, t.cl_buf, signals.cl_shape0s, signals.cl_starts, signals.cl_buf, ) _fn = cl.Program(queue.context, text).build().whitesignal _fn.set_args(*[arr.data for arr in full_args]) max_len = min(queue.device.max_work_group_size, max(Y.shape0s)) gsize = (max_len, N) lsize = (max_len, 1) rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_whitesignal", tag=tag) rval.full_args = full_args # prevent garbage-collection return rval
def plan_hingeloss(queue, yinds, Z, c, E): from mako.template import Template from nengo_ocl.utils import as_ascii from nengo_ocl.plan import Plan m, n = Z.shape assert n <= 32 assert Z.shape == E.shape assert Z.elemstrides[1] == 1 assert E.elemstrides[1] == 1 assert yinds.shape == (m, ) assert yinds.elemstrides[0] == 1 assert c.shape == (m, ) assert c.elemstrides[0] == 1 text = """ __kernel void fn( __global const ${yindstype} *yinds, __global const ${Ztype} *Z, __global ${ctype} *c, __global ${Etype} *E ) { const int i = get_global_id(0); const ${yindstype} yi = yinds[i]; __global const ${Ztype} *z = Z + i*${Zstride0}; __global ${Etype} *e = E + i*${Estride0}; ${yindstype} ti; ${Ztype} zj, zy, zt = -INFINITY; zt = -INFINITY; for (int j = 0; j < ${n}; j++) { e[j] = 0; zj = z[j]; if (j == yi) { zy = zj; } else if (zj > zt) { zt = zj; ti = j; } } ${Ztype} margin = zy - zt; if (margin < 1) { e[yi] = -1; e[ti] = 1; } c[i] = max(1 - margin, 0.0f); } """ textconf = dict(yindstype=yinds.ctype, Ztype=Z.ctype, ctype=c.ctype, Etype=E.ctype, m=m, n=n, Zstride0=Z.elemstrides[0], Estride0=E.elemstrides[0]) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) fn = cl.Program(queue.context, text).build().fn fn.set_args(*[arr.data for arr in (yinds, Z, c, E)]) plan = Plan(queue, fn, gsize=(m, )) return plan
def plan_direct(queue, code, init, input_names, inputs, output, tag=None): from . import ast_conversion assert len(input_names) == len(inputs) N = len(inputs[0]) for x in inputs: assert len(x) == len(output) for x in inputs + [output]: assert (x.shape1s == 1).all() and (x.stride1s == 1).all() assert (x.stride0s == 1).all() input_types = [x.ctype for x in inputs] output_type = output.ctype text = """ ////////// MAIN FUNCTION ////////// __kernel void direct( % for iname, itype in zip(input_names, input_types): __global const int *${iname}_starts__, __global const ${itype} *${iname}_data__, % endfor __global const int *${oname}_starts__, __global ${otype} *${oname}_data__ ) { const int n = get_global_id(0); if (n >= ${N}) return; % for iname, itype in zip(input_names, input_types): __global const ${itype} *${iname} = ${iname}_data__ + ${iname}_starts__[n]; % endfor __global ${otype} *${oname} = ${oname}_data__ + ${oname}_starts__[n]; /////vvvvv USER DECLARATIONS BELOW vvvvv ${init} /////vvvvv USER COMPUTATIONS BELOW vvvvv ${code} // END OF FUNC: put nothing after user code, since it can return } """ textconf = dict(init=indent(init, 12), code=indent(code, 12), N=N, input_names=input_names, input_types=input_types, oname=ast_conversion.OUTPUT_NAME, otype=output_type, ) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = [] for x in inputs: full_args.extend([x.cl_starts, x.cl_buf]) full_args.extend([output.cl_starts, output.cl_buf]) _fn = cl.Program(queue.context, text).build().direct _fn.set_args(*[arr.data for arr in full_args]) gsize = (N,) rval = Plan(queue, _fn, gsize, lsize=None, name="cl_direct", tag=tag) rval.full_args = full_args # prevent garbage-collection rval.description = ( "groups: %d; items: %d; items/group: %0.1f [%d, %d]" % (len(output), output.sizes.sum(), output.sizes.mean(), output.sizes.min(), output.sizes.max())) return rval
def reduce_impl( p, items, group_size=None, segment_size=None, ): # # Target use case: long inner products, small numbers of dots. # # Approach: each work-group computes a small number of gemv outputs # if p.clra_alpha is not None: raise NotImplementedError() if p.clra_gamma is not None: raise NotImplementedError() if p.clra_beta is not None: raise NotImplementedError() if p.cl_alpha is not None: raise NotImplementedError() if p.cl_gamma is not None: raise NotImplementedError() if not all(s == 1 for s in p.A.stride1s): raise NotImplementedError() assert p.float_alpha is not None assert p.float_gamma is not None cl_gstructure, textconf = p.cl_geometry_and_textconf(items) max_n_dots = max([len(p.geometry[ii]['dots']) for ii in items]) max_reduce_len = max( max([gg['a_shape1'] for gg in p.geometry[ii]['dots']]) for ii in items) max_y_len = max([p.geometry[ii]['y_len'] for ii in items]) # segment means the piece of Y written by a work-group # group_size is the number of values that we're reducing over if len(items) < 4: if group_size is None: group_size = 32 # XXX if segment_size is None: segment_size = min(max_y_len, 2) # XXX else: if group_size is None: group_size = 32 # XXX if segment_size is None: segment_size = min(max_y_len, 4) # XXX g_segments = int(np.ceil(float(max_y_len) / segment_size)) gsize = (group_size, g_segments * segment_size, len(items)) lsize = (group_size, segment_size, 1) max_reduce_iters = int(np.ceil(float(max_reduce_len) / group_size)) textconf.update({ 'n_items': len(items), 'gsize': gsize, 'segment_size': segment_size, 'max_y_len': max_y_len, 'group_size': group_size, 'local_count': group_size * segment_size, 'max_reduce_len': max_reduce_len, 'N_cutoff': max_reduce_iters * group_size, 'max_n_dots': max_n_dots, }) if 0: for k, v in textconf.items(): print(k, v) textconf.update(p.__dict__) text = """ __kernel void gemv_reduce( const __global int *gstructure, const __global ${A.cl_buf.ctype} *A_data, const __global ${X.cl_buf.ctype} *X_data, % if cl_beta is not None: const __global ${cl_beta.ctype} * betas, % endif const __global ${Y_in.cl_buf.ctype} *Y_in_data, __global ${Y.cl_buf.ctype} *Y_data) { __local int lstructure[${n_structure_vars}]; % if segment_size > 1: // we'll cache X in shared memory so we load it only once // for the whole segment __local ${X.cl_buf.ctype} lX[${group_size}]; % endif //Scratch space for the dot products __local ${Y.cl_buf.ctype} partialDotProduct[${segment_size}][${group_size}]; __local ${Y.cl_buf.ctype} y_sum_pre[${segment_size}]; const int local_idx = get_local_id(0) + get_local_id(1) * get_local_size(0); // load structure % if local_count < n_structure_vars: for (int ii = local_idx; ii < ${n_structure_vars}; ii += ${local_count}) { lstructure[ii] = gstructure[ get_global_id(2) * ${structure_vars_stride} + ii]; } % else : if (local_idx < ${n_structure_vars}) { lstructure[local_idx] = gstructure[ get_global_id(2) * ${structure_vars_stride} + local_idx]; } % endif barrier(CLK_LOCAL_MEM_FENCE); if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len})) { % if float_beta is not None and float_beta != 0 : y_sum_pre[get_local_id(1)] = ${float_beta} * Y_in_data[${y_in_starts} + get_global_id(1)]; % elif cl_beta is not None: y_sum_pre[get_local_id(1)] = betas[${bb}] * Y_in_data[${y_in_starts} + get_global_id(1)]; % else : y_sum_pre[get_local_id(1)] = 0; % endif % if float_gamma is not None and float_gamma != 0: y_sum_pre[get_local_id(1)] += ${float_gamma}; % endif // printf("betaY + gamma=%f\\n", y_sum_pre[get_local_id(1)]); } partialDotProduct[get_local_id(1)][get_local_id(0)] = 0; % if max_n_dots > 1: for (int ii = 0; ii < ${n_dot_products}; ii += 1) { % else: const int ii = 0; % endif for (int nn = get_local_id(0); nn < ${N_cutoff}; nn += get_local_size(0)) { // segment_size = ${segment_size} % if (segment_size == 1): if ((nn < ${N_i}) && (get_global_id(1) < ${y_len})) { partialDotProduct[get_local_id(1)][get_local_id(0)] += A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn] * X_data[${x_starts} + nn]; } % else: barrier(CLK_LOCAL_MEM_FENCE); if ((get_local_id(1) == 0) && (nn < ${N_i})) { lX[get_local_id(0)] = X_data[${x_starts} + nn]; } barrier(CLK_LOCAL_MEM_FENCE); if ((nn < ${N_i}) && (get_global_id(1) < ${y_len})) { partialDotProduct[get_local_id(1)][get_local_id(0)] += A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn] * lX[get_local_id(0)]; } % endif } % if (max_n_dots > 1): } % endif // -- Parallel reduction long work-group dimension 0 for (uint stride = 1; stride < get_local_size(0); stride *= 2) { barrier(CLK_LOCAL_MEM_FENCE); uint index = 2 * stride * get_local_id(0); if (index + stride < get_local_size(0)) { partialDotProduct[get_local_id(1)][index] += partialDotProduct[get_local_id(1)][index + stride]; } } // barrier(CLK_LOCAL_MEM_FENCE); if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len})) { Y_data[${y_offset} + get_global_id(1)] = y_sum_pre[get_local_id(1)] + ${float_alpha} * partialDotProduct[get_local_id(1)][0]; } } """ text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) fn = cl.Program(p.queue.context, text).build().gemv_reduce full_args = [ cl_gstructure, p.A.cl_buf, p.X.cl_buf, ] if p.cl_beta is not None: full_args += [p.cl_beta] full_args += [ p.Y_in.cl_buf, p.Y.cl_buf, ] fn.set_args(*[arr.data for arr in full_args]) rval = Plan( p.queue, fn, gsize, lsize, name='clra_gemv.reduce_impl', tag=p.tag, bw_per_call=bw_from_geometry(p.geometry, items), flops_per_call=flops_from_geometry(p.geometry, items), ) rval.full_args = full_args # prevent GC the args rval.description = p.geometry_summary(items) return rval
def _plan_template(queue, name, core_text, declares="", tag=None, n_elements=0, inputs={}, outputs={}, parameters={}): """Template for making a plan for vector nonlinearities. This template assumes that all inputs and outputs are vectors. Parameters ---------- n_elements: int If n_elements == 0, then the kernels are allocated as a block. This is simple, but can be slow for large computations where input vector sizes are not uniform (e.g. one large population and many small ones). If n_elements >= 1, then all the vectors in the RaggedArray are flattened so that the exact number of required kernels is allocated. Each kernel performs computations for `n_elements` elements. inputs: dictionary of CLRaggedArrays Inputs to the function. RaggedArrays must be a list of vectors. outputs: dictionary of CLRaggedArrays Outputs of the function. RaggedArrays must be a list of vectors. parameters: dictionary of CLRaggedArrays Parameters to the function. Each RaggedArray element must be a vector of the same length of the inputs, or a scalar (to be broadcasted). Providing a float instead of a RaggedArray makes that parameter constant. """ input0 = list(inputs.values())[0] # input to use as reference for lengths N = len(input0) # split parameters into static and updated params static_params = {} # static params (hard-coded) params = {} # variable params (updated) for k, v in parameters.items(): if isinstance(v, CLRaggedArray): params[k] = v else: try: static_params[k] = ('float', float(v)) except TypeError: raise avars = {} bw_per_call = 0 for vname, v in list(inputs.items()) + list(outputs.items()): assert vname not in avars, "Name clash" assert len(v) == N assert (v.shape0s == input0.shape0s).all() assert (v.stride0s == v.shape1s).all() # rows contiguous assert (v.stride1s == 1).all() # columns contiguous assert (v.shape1s == 1).all() # vectors only offset = '%(name)s_starts[gind1]' % {'name': vname} avars[vname] = (v.ctype, offset) bw_per_call += v.nbytes for vname, v in params.items(): assert vname not in avars, "Name clash" assert len(v) == N assert ((v.shape0s == input0.shape0s) | (v.shape0s == 1)).all() assert (v.stride0s == v.shape1s).all() # rows contiguous assert (v.stride1s == 1).all() # columns contiguous assert (v.shape1s == 1).all() # vectors only offset = '%(name)s_starts[gind1]' % {'name': vname} avars[vname] = (v.ctype, offset) bw_per_call += v.nbytes ivars = dict((k, avars[k]) for k in inputs.keys()) ovars = dict((k, avars[k]) for k in outputs.keys()) pvars = dict((k, avars[k]) for k in params.keys()) fn_name = "%s_%d" % (name, n_elements) textconf = dict(fn_name=fn_name, N=N, n_elements=n_elements, declares=declares, core_text=core_text, ivars=ivars, ovars=ovars, pvars=pvars, static_params=static_params) if n_elements > 0: # Allocate the exact number of required kernels in a vector gsize = (int(np.ceil(np.sum(input0.shape0s) / float(n_elements))),) text = """ ////////// MAIN FUNCTION ////////// __kernel void ${fn_name}( % for name, [type, offset] in ivars.items(): __global const int *${name}_starts, __global const ${type} *in_${name}, % endfor % for name, [type, offset] in ovars.items(): __global const int *${name}_starts, __global ${type} *in_${name}, % endfor % for name, [type, offset] in pvars.items(): __global const int *${name}_starts, __global const int *${name}_shape0s, __global const ${type} *in_${name}, % endfor __global const int *lengths ) { int gind0 = get_global_id(0) * ${n_elements}; int gind1 = 0; while (gind0 >= lengths[gind1]) { gind0 -= lengths[gind1]; gind1++; } if (gind1 >= ${N}) return; % for name, [type, offset] in ivars.items(): __global const ${type} *cur_${name} = in_${name} + ${offset} + gind0; % endfor % for name, [type, offset] in ovars.items(): __global ${type} *cur_${name} = in_${name} + ${offset} + gind0; % endfor % for name, [type, offset] in pvars.items(): __global const ${type} *cur_${name} = in_${name} + ${offset}; int ${name}_isvector = ${name}_shape0s[gind1] > 1; if (${name}_isvector) cur_${name} += gind0; % endfor % for name, [type, offset] in \ list(ivars.items()) + list(ovars.items()) + list(pvars.items()): ${type} ${name}; % endfor % for name, [type, value] in static_params.items(): const ${type} ${name} = ${value}; % endfor ////////////////////////////////////////////////// //vvvvv USER DECLARATIONS BELOW vvvvv ${declares} //^^^^^ USER DECLARATIONS ABOVE ^^^^^ ////////////////////////////////////////////////// % for ii in range(n_elements): ////////////////////////////////////////////////// ////////// LOOP ITERATION ${ii} % for name, [type, offset] in ivars.items(): ${name} = *cur_${name}; % endfor % for name, [type, offset] in pvars.items(): if ((${ii} == 0) || ${name}_isvector) ${name} = *cur_${name}; % endfor /////vvvvv USER COMPUTATIONS BELOW vvvvv ${core_text} /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^ % for name, [type, offset] in ovars.items(): *cur_${name} = ${name}; % endfor % if ii + 1 < n_elements: gind0++; if (gind0 >= lengths[gind1]) { gind1++; gind0 = 0; if (gind1 >= ${N}) return; % for name, [_, offset] in \ list(ivars.items()) + list(ovars.items()) + list(pvars.items()): cur_${name} = in_${name} + ${offset}; % endfor % for name, _ in pvars.items(): ${name}_isvector = ${name}_shape0s[gind1] > 1; if (!${name}_isvector) ${name} = *cur_${name}; % endfor } else { % for name, _ in list(ivars.items()) + list(ovars.items()): cur_${name}++; % endfor % for name, _ in pvars.items(): if (${name}_isvector) cur_${name}++; % endfor } % endif % endfor } """ else: # Allocate more than enough kernels in a matrix gsize = (int(np.max(input0.shape0s)), int(N)) text = """ ////////// MAIN FUNCTION ////////// __kernel void ${fn_name}( % for name, [type, offset] in ivars.items(): __global const int *${name}_starts, __global const ${type} *in_${name}, % endfor % for name, [type, offset] in ovars.items(): __global const int *${name}_starts, __global ${type} *in_${name}, % endfor % for name, [type, offset] in pvars.items(): __global const int *${name}_starts, __global const int *${name}_shape0s, __global const ${type} *in_${name}, % endfor __global const int *lengths ) { const int gind0 = get_global_id(0); const int gind1 = get_global_id(1); if (gind0 >= lengths[gind1]) return; % for name, [type, offset] in ivars.items(): ${type} ${name} = in_${name}[${offset} + gind0]; % endfor % for name, [type, offset] in ovars.items(): ${type} ${name}; % endfor % for name, [type, offset] in pvars.items(): const ${type} ${name} = (${name}_shape0s[gind1] > 1) ? in_${name}[${offset} + gind0] : in_${name}[${offset}]; % endfor % for name, [type, value] in static_params.items(): const ${type} ${name} = ${value}; % endfor ////////////////////////////////////////////////// //vvvvv USER DECLARATIONS BELOW vvvvv ${declares} //^^^^^ USER DECLARATIONS ABOVE ^^^^^ ////////////////////////////////////////////////// /////vvvvv USER COMPUTATIONS BELOW vvvvv ${core_text} /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^ % for name, [type, offset] in ovars.items(): in_${name}[${offset} + gind0] = ${name}; % endfor } """ text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) if 0: for i, line in enumerate(text.split('\n')): print("%3d %s" % (i + 1, line)) full_args = [] for vname, v in list(inputs.items()) + list(outputs.items()): full_args.extend([v.cl_starts, v.cl_buf]) for vname, v in params.items(): full_args.extend([v.cl_starts, v.cl_shape0s, v.cl_buf]) full_args.append(input0.cl_shape0s) full_args = tuple(full_args) fns = cl.Program(queue.context, text).build() _fn = getattr(fns, fn_name) _fn.set_args(*[arr.data for arr in full_args]) rval = Plan(queue, _fn, gsize, lsize=None, name=name, tag=tag) rval.full_args = full_args # prevent garbage-collection rval.bw_per_call = bw_per_call rval.description = ("groups: %d; items: %d; items/group: %0.1f [%d, %d]" % (N, input0.sizes.sum(), input0.sizes.mean(), input0.sizes.min(), input0.sizes.max())) return rval
def block_impl(p, items): assert p.float_alpha == 1.0 assert p.float_beta == 1.0 assert p.float_gamma == 0.0 if p.clra_alpha is not None: raise NotImplementedError() if p.clra_gamma is not None: raise NotImplementedError() if p.clra_beta is not None: raise NotImplementedError() if p.cl_alpha is not None: raise NotImplementedError() if p.cl_gamma is not None: raise NotImplementedError() if not all(s == 1 for s in p.A.stride1s): raise NotImplementedError() if p.A_js is None: # -- easy probably, but not done raise NotImplementedError() # --- blocking # We want to group the dot products into blocks, so that each workgroup # is computing a (block_y, block_x) region of a dot product. To do this, # we create a temporary output buffer, compute each block to a separate # region of this buffer, then reduce across the buffer in a separate kernel # block_y = 8 block_y = 32 # block_x = 32 block_x = 128 shape0s = [] shape1s = [] Astride0s = [] Astride1s = [] Astarts = [] Xstride0s = [] Xstarts = [] Ybufstarts = [] Ybufstart = 0 Yshape0s_reduce = [] Yinstride0s_reduce = [] Yinstarts_reduce = [] Ystride0s_reduce = [] Ystarts_reduce = [] Ybufinds_reduce = [] bw_reduce = 0 for n in items: assert p.Y_in.shape0s[n] == p.Y.shape0s[n] shape0n = p.Y.shape0s[n] for i in range(0, shape0n, block_y): shape0i = min(shape0n - i, block_y) Ybufind_reduce = [] # loop over dot products outputting to same Y n_dots = len(p.A_js[n]) assert len(p.A_js[n]) == len(p.X_js[n]) for aj, xj in zip(p.A_js[n], p.X_js[n]): assert aj.size == 1 and xj.size == 1 aj, xj = aj[0], xj[0] # to ignore numpy DeprecationWarning assert p.A.shape0s[aj] == shape0n assert p.A.shape1s[aj] == p.X.shape0s[xj] assert p.X.shape1s[xj] == 1 shape1n = p.A.shape1s[aj] for j in range(0, shape1n, block_x): shape0s.append(shape0i) shape1s.append(min(shape1n - j, block_x)) Astride0s.append(p.A.stride0s[aj]) Astride1s.append(p.A.stride1s[aj]) Astarts.append(p.A.starts[aj] + i * p.A.stride0s[aj] + j * p.A.stride1s[aj]) Xstride0s.append(p.X.stride0s[xj]) Xstarts.append(p.X.starts[xj] + j * p.X.stride0s[xj]) Ybufstarts.append(Ybufstart) Ybufind_reduce.append(Ybufstart) # Ybufstart += shape0s[-1] Ybufstart += block_y # keep good offset # --- Y-blocking for reduce Yshape0s_reduce.append(shape0i) Yinstride0s_reduce.append(p.Y_in.stride0s[n]) Yinstarts_reduce.append(p.Y_in.starts[n] + i * p.Y_in.stride0s[n]) Ystride0s_reduce.append(p.Y.stride0s[n]) Ystarts_reduce.append(p.Y.starts[n] + i * p.Y.stride0s[n]) Ybufinds_reduce.append(Ybufind_reduce) bw_reduce += shape0i * (len(Ybufind_reduce) + 1) * p.Y.dtype.itemsize # --- create structure gstructure = np.column_stack([ shape0s, shape1s, Astride0s, Astride1s, Astarts, Xstride0s, Xstarts, Ybufstarts ]) cl_gstructure = to_device(p.queue, gstructure.astype(np.int32)) # --- create Y buffer clYbuf = to_device(p.queue, np.zeros(Ybufstart, dtype=p.Y.dtype)) lsize0 = 4 # lsize0 = 8 lsize0_log2 = int(np.log2(lsize0)) assert 2**lsize0_log2 == lsize0 lsize = (lsize0, block_y, 1) gsize = (lsize[0], lsize[1], gstructure.shape[0]) assert np.prod(lsize) >= block_x textconf = dict( A=p.A, X=p.X, Ybuf=clYbuf, n_structure_vars=gstructure.shape[1], shape0='lstructure[0]', shape1='lstructure[1]', Astride0='lstructure[2]', Astride1='lstructure[3]', Astart='lstructure[4]', Xstride0='lstructure[5]', Xstart='lstructure[6]', Ybufstart='lstructure[7]', block_y=block_y, block_x=block_x, lsize0=lsize0, lsize0_log2=lsize0_log2, ) full_args = ( cl_gstructure, p.A.cl_buf, p.X.cl_buf, clYbuf, ) source = """ __kernel void fn( __global const int *gstructure, __global const ${A.ctype} *Adata, __global const ${X.ctype} *Xdata, __global ${Ybuf.ctype} *Ybufdata ) { const int j = get_global_id(0); const int i = get_global_id(1); const int n = get_global_id(2); // load structure __local int lstructure[${n_structure_vars}]; const int local_idx = get_local_id(0) + get_local_id(1)*get_local_size(0); if (local_idx < ${n_structure_vars}) lstructure[local_idx] = gstructure[ n * ${n_structure_vars} + local_idx]; barrier(CLK_LOCAL_MEM_FENCE); __global const ${X.ctype} *x = Xdata + ${Xstart}; __global ${Ybuf.ctype} *ybuf = Ybufdata + ${Ybufstart}; // load x into local memory __local ${X.ctype} xlocal[${block_x}]; if (local_idx < ${shape1}) xlocal[local_idx] = x[local_idx*${Xstride0}]; barrier(CLK_LOCAL_MEM_FENCE); __local ${Ybuf.ctype} sums[${block_y}][${lsize0}]; sums[i][j] = 0; if (i < ${shape0}) { __global const ${A.ctype} *Ai = Adata + ${Astart} + i*${Astride0}; for(int jj = j; jj < ${shape1}; jj += get_global_size(0)) { sums[i][j] += Ai[jj*${Astride1}] * xlocal[jj]; } } barrier(CLK_LOCAL_MEM_FENCE); % for k in range(lsize0_log2 - 1, 0, -1): if (j < ${2**k}) sums[i][j] += sums[i][${2**k} + j]; barrier(CLK_LOCAL_MEM_FENCE); % endfor if (i < ${shape0} && j == 0) ybuf[i] = sums[i][0] + sums[i][1]; } """ source = Template(source, output_encoding='ascii').render(**textconf) kernel = cl.Program(p.queue.context, source).build().fn kernel.set_args(*[arr.data for arr in full_args]) plan = Plan( p.queue, kernel, gsize, lsize, name='clra_gemv.block_impl', tag=p.tag, bw_per_call=bw_from_geometry(p.geometry, items), flops_per_call=flops_from_geometry(p.geometry, items), ) plan.full_args = full_args # prevent GC the args plan.description = p.geometry_summary(items) plan.Ybuf = clYbuf # --- Reduce kernel align = False Nreduce = len(Yshape0s_reduce) clYshape0s_reduce = to_device(p.queue, np.array(Yshape0s_reduce, dtype=np.int32)) clYinstride0s_reduce = to_device( p.queue, np.array(Yinstride0s_reduce, dtype=np.int32)) clYinstarts_reduce = to_device(p.queue, np.array(Yinstarts_reduce, dtype=np.int32)) clYstride0s_reduce = to_device(p.queue, np.array(Ystride0s_reduce, dtype=np.int32)) clYstarts_reduce = to_device(p.queue, np.array(Ystarts_reduce, dtype=np.int32)) clYbufinds_reduce = CLRaggedArray.from_arrays(p.queue, Ybufinds_reduce, dtype=np.int32, align=align) assert len(clYbufinds_reduce) == Nreduce assert (clYbufinds_reduce.shape1s == 1).all() textconf_reduce = dict( Ybuf=clYbuf, Yin=p.Y_in, Y=p.Y, ) full_args_reduce = ( clYshape0s_reduce, clYbufinds_reduce.cl_shape0s, clYbufinds_reduce.cl_starts, clYbufinds_reduce.cl_buf, clYbuf, clYinstride0s_reduce, clYinstarts_reduce, p.Y_in.cl_buf, clYstride0s_reduce, clYstarts_reduce, p.Y.cl_buf, ) lsize_reduce = None gsize_reduce = (block_y, Nreduce) source_reduce = """ __kernel void reduce( __global const int *shape0s, __global const int *Ishape0s, __global const int *Istarts, __global const int *Idata, __global ${Ybuf.ctype} *Ybufdata, __global const int *Yinstride0s, __global const int *Yinstarts, __global ${Yin.ctype} *Yindata, __global const int *Ystride0s, __global const int *Ystarts, __global ${Y.ctype} *Ydata ) { const int i = get_global_id(0); const int n = get_global_id(1); if (i >= shape0s[n]) return; const int Ishape0 = Ishape0s[n]; __global const int *Ybufstart = Idata + Istarts[n]; __global ${Yin.ctype} *yin = Yindata + Yinstarts[n]; __global ${Y.ctype} *y = Ydata + Ystarts[n]; ${Y.ctype} sum = yin[i*Yinstride0s[n]]; for (int j = 0; j < Ishape0; j++) { sum += Ybufdata[Ybufstart[j] + i]; } y[i*Ystride0s[n]] = sum; } """ source_reduce = Template(source_reduce, output_encoding='ascii').render(**textconf_reduce) kernel_reduce = cl.Program(p.queue.context, source_reduce).build().reduce kernel_reduce.set_args(*[arr.data for arr in full_args_reduce]) plan_reduce = Plan( p.queue, kernel_reduce, gsize_reduce, lsize_reduce, name='clra_gemv.block_impl_reduce', tag=p.tag, bw_per_call=bw_reduce, ) plan_reduce.full_args = full_args_reduce # prevent GC the args # plan_reduce.description = p.geometry_summary(items) return [plan, plan_reduce]
def plan_sparse_dot_inc(queue, A_indices, A_indptr, A_data, X, Y, inc=False, tag=None): """Implements a sparse matrix-vector multiply: Y += A * X or Y = A * X Parameters ---------- A_indices, A_indptr : PyOpenCL array Column sparse row index specifications A_data : PyOpenCL array Matrix values at those indices X, Y : CLRaggedArrays of length 1 Input/output data. inc : bool Whether to increment ``Y`` (True), or set it (False). Notes ----- This function crashes when there are >10M nonzero weights. A potential solution would be some way to tell each work item to do multiple rows. """ assert len(X) == len(Y) == 1 for arr in [X, Y]: assert (arr.stride1s == 1).all() if not ((arr.shape1s == 1).all() and (arr.stride0s == 1).all()): raise NotImplementedError( "OCL SparseDot only supports matrix-vector currently, not matrix-matrix" ) for arr in [A_indices, A_indptr, A_data]: assert len(arr.shape) == 1 assert arr.strides[0] == arr.dtype.itemsize # contiguous assert A_indices.size == A_data.size assert A_data.ctype == X.ctype == Y.ctype assert A_indices.ctype == A_indptr.ctype == "int" kern = """ __kernel void sparsedot_inc( __global const int *A_indices, __global const int *A_indptr, __global const ${dtype} *A_data, __global const int *Xstarts, __global const ${dtype} *Xdata, __global const int *Ystarts, __global ${dtype} *Ydata ) { // n can later be used to keep track of multiple arrays const int n = 0; const int irow = get_global_id(0); __global const ${dtype} *x = Xdata + Xstarts[n]; __global ${dtype} *y = Ydata + Ystarts[n]; %if not inc: y[irow] = 0; %endif const int end = A_indptr[irow + 1]; for (int k = A_indptr[irow]; k < end; k++) { y[irow] += A_data[k] * x[A_indices[k]]; } } """ textconf = dict(dtype=A_data.ctype, IndType=A_indices.ctype, inc=inc) text = as_ascii(Template(kern, output_encoding="ascii").render(**textconf)) full_args = ( A_indices.base_data, A_indptr.base_data, A_data.base_data, X.cl_starts.data, X.cl_buf.data, Y.cl_starts.data, Y.cl_buf.data, ) _fn = cl.Program(queue.context, text).build().sparsedot_inc _fn.set_args(*full_args) gsize = (Y.sizes[0], 1) # this only works for a single operation lsize = None plan = Plan(queue, _fn, gsize, lsize=lsize, name="cl_sparsedot", tag=tag) plan.full_args = full_args # prevent garbage-collection plan.flops_per_call = 2 * A_data.size plan.bw_per_call = A_data.nbytes * 3 + A_indices.nbytes + A_indptr.nbytes plan.description = "groups: %d; shape: (%d, %d); nonzeros: %d" % ( 1, Y.sizes[0], X.sizes[0], A_data.size, ) return plan
def plan_stp(queue, calcium, resources, weights, delta, alpha, init_weights, tag=None): assert (len(calcium) == len(resources) == len(weights) == len(delta) == alpha.size == len(init_weights)) N = len(calcium) for arr in (calcium, resources): # vectors assert (arr.shape1s == 1).all() for arr in (delta, weights, init_weights): # matrices assert (arr.stride1s == 1).all() #assert (resources.shape0s == weights.shape0s).all() #assert (calcium.shape0s == weights.shape1s).all() assert (weights.shape0s == delta.shape0s).all() assert (weights.shape1s == delta.shape1s).all() assert (weights.shape0s == init_weights.shape0s).all() assert (weights.shape1s == init_weights.shape1s).all() assert (calcium.ctype == resources.ctype == weights.ctype == delta.ctype == alpha.ctype == init_weights.ctype) text = """ __kernel void stp( __global const int *shape0s, __global const int *shape1s, __global const int *calcium_stride0s, __global const int *calcium_starts, __global const ${type} *calcium_data, __global const int *resources_stride0s, __global const int *resources_starts, __global const ${type} *resources_data, __global const int *weights_stride0s, __global const int *weights_starts, __global const ${type} *weights_data, __global const int *delta_stride0s, __global const int *delta_starts, __global ${type} *delta_data, __global const ${type} *alphas, __global const int *init_weights_stride0s, __global const int *init_weights_starts, __global const ${type} *init_weights_data ) { const int ij = get_global_id(0); const int k = get_global_id(1); const int shape0 = shape0s[k]; const int shape1 = shape1s[k]; const int i = ij / shape1; const int j = ij % shape1; __global ${type} *delta = delta_data + delta_starts[k]; const ${type} calcium = calcium_data[calcium_starts[k] + i*calcium_stride0s[k]]; const ${type} resources = resources_data[resources_starts[k] + i*resources_stride0s[k]]; const ${type} weight = weights_data[ weights_starts[k] + i*weights_stride0s[k]+j]; const ${type} alpha = alphas[k]; const ${type} init_weights = init_weights_data[init_weights_starts[k] + i*init_weights_stride0s[k]+j]; if (i < shape0) { delta[i*delta_stride0s[k] + j] = ((calcium*resources/0.2)*init_weights)-weight; } } """ textconf = dict(type=calcium.ctype) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = ( delta.cl_shape0s, delta.cl_shape1s, calcium.cl_stride0s, calcium.cl_starts, calcium.cl_buf, resources.cl_stride0s, resources.cl_starts, resources.cl_buf, weights.cl_stride0s, weights.cl_starts, weights.cl_buf, delta.cl_stride0s, delta.cl_starts, delta.cl_buf, alpha, init_weights.cl_stride0s, init_weights.cl_starts, init_weights.cl_buf, ) _fn = cl.Program(queue.context, text).build().stp _fn.set_args(*[arr.data for arr in full_args]) lsize = None gsize = (delta.sizes.max(), N) plan = Plan(queue, _fn, gsize, lsize=lsize, name="cl_stp", tag=tag) plan.full_args = full_args # prevent garbage-collection plan.flops_per_call = 6 * delta.sizes.sum() plan.bw_per_call = (calcium.nbytes + resources.nbytes + weights.nbytes + delta.nbytes + alpha.nbytes + init_weights.nbytes) return plan
def ref_impl(p, items): """ Return an OpenCL function to calculate elements `items` of gemv operation `p`. In this reference implementation, we create a work item per output number, or more specifically, a work grid of (max_y_len, len(items)). Each work item loops over the dot products and the elements within each dot product to compute the output value Y[global_id(1)][global_id(0)]. """ if p.clra_alpha is not None: raise NotImplementedError() if p.clra_gamma is not None: raise NotImplementedError() cl_items = to_device(p.queue, np.asarray(items, dtype='int32')) if 0: if len(items) < 10: print('Falling back on reference implementation') p.print_geometry_summary(items, full=True) else: print('Falling back on reference implementation') p.print_geometry_summary(items) assert all(s == 1 for s in p.A.stride1s) assert all(s == 1 for s in p.X.stride1s) assert all(s == 1 for s in p.Y.stride0s) assert all(s == 1 for s in p.Y.stride1s) assert all(s == 1 for s in p.Y_in.stride0s) assert all(s == 1 for s in p.Y_in.stride1s) text = """ __kernel void gemv_ref( __global int *items, % if cl_alpha is not None: __global ${cl_alpha.ctype} * alphas, % endif % if (A_js is not None): __global int *A_starts, __global int *A_shape1s, __global int *A_stride0s, __global ${A.cl_buf.ctype} *A_data, __global int *A_js_starts, __global int *A_js_shape0s, __global int *A_js_data, __global int *X_starts, __global int *X_stride0s, __global ${X.cl_buf.ctype} *X_data, __global int *X_js_starts, __global int *X_js_data, % endif % if cl_beta is not None: __global ${cl_beta.ctype} * betas, % endif % if clra_beta is not None: __global int *beta_starts, __global int *beta_data, % endif % if cl_gamma is not None: __global ${cl_gamma.ctype} * gammas, % endif __global int *Y_in_starts, __global ${Y_in.cl_buf.ctype} *Y_in_data, __global int *Y_starts, __global int *Y_shape0s, __global ${Y.cl_buf.ctype} *Y_data) { const int mm = get_global_id(0); const int bb = items[get_global_id(1)]; const int M = Y_shape0s[bb]; if (mm < M) { const int y_offset = Y_starts[bb]; const int y_in_offset = Y_in_starts[bb]; % if float_beta is not None: const ${Y.cl_buf.ctype} beta = ${float_beta}; % elif cl_beta is not None: const ${cl_beta.ctype} beta = betas[bb]; % elif clra_beta is not None: const int beta_offset = beta_starts[bb]; const ${clra_beta.cl_buf.ctype} beta = beta_data[beta_offset + mm]; % endif % if float_gamma is not None: const ${Y.cl_buf.ctype} gamma = ${float_gamma}; % elif cl_gamma is not None: const ${cl_gamma.ctype} gamma = gammas[bb]; % endif Y_data[y_offset + mm] = gamma + beta * Y_in_data[y_in_offset + mm]; % if A_js is not None: const int n_dot_products = A_js_shape0s[bb]; X_js_data += X_js_starts[bb]; A_js_data += A_js_starts[bb]; ${Y.cl_buf.ctype} y_sum = 0; for (int ii = 0; ii < n_dot_products; ++ii) { const int x_ji = X_js_data[ii]; const int a_ji = A_js_data[ii]; const int N_i = A_shape1s[a_ji]; const int x_offset = X_starts[x_ji]; const int a_offset = A_starts[a_ji]; const int AsM = A_stride0s[a_ji]; const int XsM = X_stride0s[x_ji]; for (int nn = 0; nn < N_i; ++nn) { y_sum += X_data[x_offset + nn * XsM] * A_data[a_offset + mm * AsM + nn]; } } % if float_alpha is not None: Y_data[y_offset + mm] += ${float_alpha} * y_sum; % elif cl_alpha is not None: Y_data[y_offset + mm] += alphas[bb] * y_sum; % endif % endif } } """ text = as_ascii( Template(text, output_encoding='ascii').render(**p.__dict__)) gsize = ( max(p.geometry[ii]['y_len'] for ii in items), len(items)) lsize = None fn = cl.Program(p.queue.context, text).build().gemv_ref full_args = [cl_items] if p.cl_alpha is not None: full_args += [p.cl_alpha] if p.A_js is not None: full_args += [ p.A.cl_starts, p.A.cl_shape1s, p.A.cl_stride0s, p.A.cl_buf, p.A_js.cl_starts, p.A_js.cl_shape0s, p.A_js.cl_buf, p.X.cl_starts, p.X.cl_stride0s, p.X.cl_buf, p.X_js.cl_starts, p.X_js.cl_buf, ] if p.cl_beta is not None: full_args += [p.cl_beta] elif p.clra_beta is not None: full_args += [p.clra_beta.cl_starts, p.clra_beta.cl_buf] if p.cl_gamma is not None: full_args += [p.cl_gamma] elif p.clra_gamma is not None: full_args += [p.clra_gamma.cl_starts, p.clra_gamma.cl_buf] full_args += [ p.Y_in.cl_starts, p.Y_in.cl_buf, p.Y.cl_starts, p.Y.cl_shape0s, p.Y.cl_buf] # print([str(arr.dtype)[0] for arr in full_args]) fn.set_args(*[arr.data for arr in full_args]) rval = Plan(p.queue, fn, gsize, lsize, name="clra_gemv.ref_impl", tag=p.tag, bw_per_call=bw_from_geometry(p.geometry, items), flops_per_call=flops_from_geometry(p.geometry, items)) rval.full_args = full_args # prevent GC the args return rval
def reduce_impl(p, items, group_size=None, segment_size=None, ): # # Target use case: long inner products, small numbers of dots. # # Approach: each work-group computes a small number of gemv outputs # if p.clra_alpha is not None: raise NotImplementedError() if p.clra_gamma is not None: raise NotImplementedError() if p.clra_beta is not None: raise NotImplementedError() if p.cl_alpha is not None: raise NotImplementedError() if p.cl_gamma is not None: raise NotImplementedError() if not all(s == 1 for s in p.A.stride1s): raise NotImplementedError() assert p.float_alpha is not None assert p.float_gamma is not None cl_gstructure, textconf = p.cl_geometry_and_textconf(items) max_n_dots = max([len(p.geometry[ii]['dots']) for ii in items]) max_reduce_len = max(max([gg['a_shape1'] for gg in p.geometry[ii]['dots']]) for ii in items) max_y_len = max([p.geometry[ii]['y_len'] for ii in items]) # segment means the piece of Y written by a work-group # group_size is the number of values that we're reducing over if len(items) < 4: if group_size is None: group_size = 32 # XXX if segment_size is None: segment_size = min(max_y_len, 2) # XXX else: if group_size is None: group_size = 32 # XXX if segment_size is None: segment_size = min(max_y_len, 4) # XXX g_segments = int(np.ceil(float(max_y_len) / segment_size)) gsize = (group_size, g_segments * segment_size, len(items)) lsize = (group_size, segment_size, 1) max_reduce_iters = int(np.ceil(float(max_reduce_len) / group_size)) textconf.update({ 'n_items': len(items), 'gsize': gsize, 'segment_size': segment_size, 'max_y_len': max_y_len, 'group_size': group_size, 'local_count': group_size * segment_size, 'max_reduce_len': max_reduce_len, 'N_cutoff': max_reduce_iters * group_size, 'max_n_dots': max_n_dots, }) if 0: for k, v in textconf.items(): print(k, v) textconf.update(p.__dict__) text = """ __kernel void gemv_reduce( const __global int *gstructure, const __global ${A.cl_buf.ctype} *A_data, const __global ${X.cl_buf.ctype} *X_data, % if cl_beta is not None: const __global ${cl_beta.ctype} * betas, % endif const __global ${Y_in.cl_buf.ctype} *Y_in_data, __global ${Y.cl_buf.ctype} *Y_data) { __local int lstructure[${n_structure_vars}]; % if segment_size > 1: // we'll cache X in shared memory so we load it only once // for the whole segment __local ${X.cl_buf.ctype} lX[${group_size}]; % endif //Scratch space for the dot products __local ${Y.cl_buf.ctype} partialDotProduct[${segment_size}][${group_size}]; __local ${Y.cl_buf.ctype} y_sum_pre[${segment_size}]; const int local_idx = get_local_id(0) + get_local_id(1) * get_local_size(0); // load structure % if local_count < n_structure_vars: for (int ii = local_idx; ii < ${n_structure_vars}; ii += ${local_count}) { lstructure[ii] = gstructure[ get_global_id(2) * ${structure_vars_stride} + ii]; } % else : if (local_idx < ${n_structure_vars}) { lstructure[local_idx] = gstructure[ get_global_id(2) * ${structure_vars_stride} + local_idx]; } % endif barrier(CLK_LOCAL_MEM_FENCE); if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len})) { % if float_beta is not None and float_beta != 0 : y_sum_pre[get_local_id(1)] = ${float_beta} * Y_in_data[${y_in_starts} + get_global_id(1)]; % elif cl_beta is not None: y_sum_pre[get_local_id(1)] = betas[${bb}] * Y_in_data[${y_in_starts} + get_global_id(1)]; % else : y_sum_pre[get_local_id(1)] = 0; % endif % if float_gamma is not None and float_gamma != 0: y_sum_pre[get_local_id(1)] += ${float_gamma}; % endif // printf("betaY + gamma=%f\\n", y_sum_pre[get_local_id(1)]); } partialDotProduct[get_local_id(1)][get_local_id(0)] = 0; % if max_n_dots > 1: for (int ii = 0; ii < ${n_dot_products}; ii += 1) { % else: const int ii = 0; % endif for (int nn = get_local_id(0); nn < ${N_cutoff}; nn += get_local_size(0)) { // segment_size = ${segment_size} % if (segment_size == 1): if ((nn < ${N_i}) && (get_global_id(1) < ${y_len})) { partialDotProduct[get_local_id(1)][get_local_id(0)] += A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn] * X_data[${x_starts} + nn]; } % else: barrier(CLK_LOCAL_MEM_FENCE); if ((get_local_id(1) == 0) && (nn < ${N_i})) { lX[get_local_id(0)] = X_data[${x_starts} + nn]; } barrier(CLK_LOCAL_MEM_FENCE); if ((nn < ${N_i}) && (get_global_id(1) < ${y_len})) { partialDotProduct[get_local_id(1)][get_local_id(0)] += A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn] * lX[get_local_id(0)]; } % endif } % if (max_n_dots > 1): } % endif // -- Parallel reduction long work-group dimension 0 for (uint stride = 1; stride < get_local_size(0); stride *= 2) { barrier(CLK_LOCAL_MEM_FENCE); uint index = 2 * stride * get_local_id(0); if (index + stride < get_local_size(0)) { partialDotProduct[get_local_id(1)][index] += partialDotProduct[get_local_id(1)][index + stride]; } } // barrier(CLK_LOCAL_MEM_FENCE); if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len})) { Y_data[${y_offset} + get_global_id(1)] = y_sum_pre[get_local_id(1)] + ${float_alpha} * partialDotProduct[get_local_id(1)][0]; } } """ text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) fn = cl.Program(p.queue.context, text).build().gemv_reduce full_args = [ cl_gstructure, p.A.cl_buf, p.X.cl_buf, ] if p.cl_beta is not None: full_args += [p.cl_beta] full_args += [ p.Y_in.cl_buf, p.Y.cl_buf, ] fn.set_args(*[arr.data for arr in full_args]) rval = Plan(p.queue, fn, gsize, lsize, name='clra_gemv.reduce_impl', tag=p.tag, bw_per_call=bw_from_geometry(p.geometry, items), flops_per_call=flops_from_geometry(p.geometry, items), ) rval.full_args = full_args # prevent GC the args rval.description = p.geometry_summary(items) return rval
def plan_slicedcopy(queue, A, B, Ainds, Binds, incs, tag=None): N = len(A) assert len(A) == len(B) == len(Ainds) == len(Binds) for arr in [A, B, Ainds, Binds]: assert (arr.shape1s == 1).all() assert (arr.stride0s == 1).all() assert (arr.stride1s == 1).all() assert (Ainds.shape0s == Binds.shape0s).all() assert A.ctype == B.ctype assert Ainds.ctype == Binds.ctype == 'int' assert incs.ctype == 'int' text = """ ////////// MAIN FUNCTION ////////// __kernel void slicedcopy( __global const int *Astarts, __global const ${Atype} *Adata, __global const int *Bstarts, __global ${Btype} *Bdata, __global const int *Ishape0s, __global const int *AIstarts, __global const int *AIdata, __global const int *BIstarts, __global const int *BIdata, __global const int *incdata ) { const int n = get_global_id(1); __global const ${Atype} *a = Adata + Astarts[n]; __global ${Btype} *b = Bdata + Bstarts[n]; __global const int *aind = AIdata + AIstarts[n]; __global const int *bind = BIdata + BIstarts[n]; const int inc = incdata[n]; int i = get_global_id(0); if (inc) for (; i < Ishape0s[n]; i += get_global_size(0)) b[bind[i]] += a[aind[i]]; else for (; i < Ishape0s[n]; i += get_global_size(0)) b[bind[i]] = a[aind[i]]; } """ textconf = dict(Atype=A.ctype, Btype=B.ctype) text = as_ascii(Template(text, output_encoding='ascii').render(**textconf)) full_args = ( A.cl_starts, A.cl_buf, B.cl_starts, B.cl_buf, Ainds.cl_shape0s, Ainds.cl_starts, Ainds.cl_buf, Binds.cl_starts, Binds.cl_buf, incs.cl_buf, ) _fn = cl.Program(queue.context, text).build().slicedcopy _fn.set_args(*[arr.data for arr in full_args]) max_group = queue.device.max_work_group_size n = min(max(Ainds.shape0s), max_group) gsize = (n, N) lsize = (n, 1) rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_slicedcopy", tag=tag) rval.full_args = full_args # prevent garbage-collection rval.bw_per_call = 2 * Ainds.shape0s.sum() * A.dtype.itemsize rval.description = ( "groups: %d; items: %d; items/group: %0.1f [%d, %d]" % (len(Ainds), Ainds.sizes.sum(), Ainds.sizes.mean(), Ainds.sizes.min(), Ainds.sizes.max())) return rval