Example #1
def plan_timeupdate(queue, step, time, dt):
    assert len(step) == len(time) == 1
    assert step.ctype == time.ctype == 'float'
    assert step.shape0s[0] == step.shape1s[0] == 1
    assert time.shape0s[0] == time.shape1s[0] == 1

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void timeupdate(
            __global const int *step_starts,
            __global float *step_data,
            __global const int *time_starts,
            __global float *time_data
        )
        {
            __global float *step = step_data + step_starts[0];
            __global float *time = time_data + time_starts[0];
            step[0] += 1;
            time[0] = ${dt} * step[0];
        }
        """

    text = as_ascii(Template(text, output_encoding='ascii').render(dt=dt))
    full_args = (step.cl_starts, step.cl_buf, time.cl_starts, time.cl_buf)
    _fn = cl.Program(queue.context, text).build().timeupdate
    _fn.set_args(*[arr.data for arr in full_args])

    gsize = (1,)
    lsize = None
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_timeupdate")
    rval.full_args = full_args     # prevent garbage-collection
    return rval
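
A minimal usage sketch for the plan above. It assumes the CLRaggedArray.from_arrays constructor that appears later in these examples, an assumed import path nengo_ocl.clraggedarray, and that Plan objects are callable to enqueue their kernel; none of this is taken from the snippet itself.

import numpy as np
import pyopencl as cl
from nengo_ocl.clraggedarray import CLRaggedArray  # assumed import path

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# one 1x1 float element each, as the assertions above require
step = CLRaggedArray.from_arrays(queue, [np.zeros((1, 1))], dtype=np.float32)
time = CLRaggedArray.from_arrays(queue, [np.zeros((1, 1))], dtype=np.float32)

plan = plan_timeupdate(queue, step, time, dt=0.001)
for _ in range(5):
    plan()  # assumed: calling a Plan enqueues its kernel on the queue
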
Example #2
def plan_bcm2_threshold_diagonal1(queue, delta, weights, max_weight, tag=None):
    N = len(delta)

    for arr in (delta,):  # matrices
        assert (arr.stride1s == 1).all()

    text = """
    __kernel void bcm2_threshold_diagonal1(
        __global const int *shape0s,
        __global const int *shape1s,
        __global const int *delta_stride0s,
        __global const int *delta_starts,
        __global ${type} *delta_data,
        __global const int *weights_stride0s,
        __global const int *weights_starts,
        __global const ${type} *weights_data,
        __global const ${type} *max_weights
    )
    {
        const int ij = get_global_id(0);
        const int k = get_global_id(1);
        const int shape0 = shape0s[k];
        const int shape1 = shape1s[k];
        const int i = ij / shape1;
        const int j = ij % shape1;
        __global ${type} *delta = delta_data + delta_starts[k];
        __global const ${type} *weights = weights_data + weights_starts[k];
        const ${type} max_weight = max_weights[k];

        if (i < shape0) {
            if (fabs(weights[i*weights_stride0s[k] + j]
                     + delta[i*delta_stride0s[k] + j]) > max_weight) {
                delta[i*delta_stride0s[k] + j] = 0;
            }
        }
    }
    """

    textconf = dict(type=delta.ctype)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = (
        delta.cl_shape0s, delta.cl_shape1s,
        delta.cl_stride0s, delta.cl_starts, delta.cl_buf,
        weights.cl_stride0s, weights.cl_starts, weights.cl_buf,
        max_weight,
    )
    _fn = cl.Program(queue.context, text).build().bcm2_threshold_diagonal1
    _fn.set_args(*[arr.data for arr in full_args])

    lsize = None
    gsize = (delta.sizes.max(), N)
    plan = Plan(queue, _fn, gsize, lsize=lsize, name="cl_bcm2_threshold_diagonal1", tag=tag)
    plan.full_args = full_args     # prevent garbage-collection
    plan.flops_per_call = 4 * delta.sizes.sum()
    plan.bw_per_call = (delta.nbytes + weights.nbytes + max_weight.nbytes)
    return plan 
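
For reference, the kernel above has the same effect as the following NumPy operation applied to each (delta, weights) pair; this is a clarifying sketch, not code from the source.

import numpy as np

def bcm2_threshold_ref(delta, weights, max_weight):
    # zero the delta entries that would push |weights + delta| past max_weight
    delta = delta.copy()
    delta[np.abs(weights + delta) > max_weight] = 0
    return delta
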
Example #3
def plan_reset(queue, Y, values, tag=None):
    N = len(Y)
    assert len(Y) == len(values)

    assert np.all(Y.stride0s == Y.shape1s)
    assert np.all(Y.stride1s == 1)
    assert Y.ctype == values.ctype

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void reset(
            __global const int *Yshape0s,
            __global const int *Yshape1s,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata,
            __global const ${Ytype} *values
        )
        {
            const int n = get_global_id(1);
            int i = get_global_id(0);

            const ${Ytype} value = values[n];
            const int size = Yshape0s[n] * Yshape1s[n];
            __global ${Ytype} *y = Ydata + Ystarts[n];

            for (; i < size; i += get_global_size(0))
                y[i] = value;
        }
        """

    textconf = dict(Ytype=Y.ctype)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = (
        Y.cl_shape0s,
        Y.cl_shape1s,
        Y.cl_starts,
        Y.cl_buf,
        values,
    )
    _fn = cl.Program(queue.context, text).build().reset
    _fn.set_args(*[arr.data for arr in full_args])

    max_group = queue.device.max_work_group_size
    sizes = Y.shape0s * Y.shape1s
    n = min(sizes.max(), max_group)
    gsize = (n, N)
    lsize = (n, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_reset", tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    rval.bw_per_call = Y.nbytes + values.nbytes
    rval.description = (
        "groups: %d; items: %d; items/group: %0.1f [%d, %d]" %
        (len(Y), Y.sizes.sum(), Y.sizes.mean(), Y.sizes.min(), Y.sizes.max()))
    return rval
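
Per group, the kernel simply fills one array with its scalar value, striding by the work-group size; a NumPy sketch of that effect (illustrative only):

import numpy as np

def reset_ref(Ys, values):
    # Ys: list of arrays; values: one scalar per array
    for y, value in zip(Ys, values):
        y[...] = value
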
Example #4
def plan_softmax(queue, X, Y):
    import pyopencl as cl  # used below; the other snippets rely on a module-level import
    from mako.template import Template
    from nengo_ocl.utils import as_ascii
    from nengo_ocl.plan import Plan

    m, n = X.shape
    assert n <= 32
    assert Y.shape == X.shape
    assert X.elemstrides[1] == 1
    assert Y.elemstrides[1] == 1

    text = """
        __kernel void fn(
            __global const ${Xtype} *X,
            __global ${Ytype} *Y
        )
        {
            const int i = get_global_id(0);

            ${Xtype} ex[${n}];
            __global const ${Xtype} *x = X + i*${Xstride0};
            __global ${Ytype} *y = Y + i*${Ystride0};

            ${Xtype} maxx = -INFINITY;
            for (int j = 0; j < ${n}; j++)
                if (x[j] > maxx)
                    maxx = x[j];

            ${Xtype} sumex = 0;
            for (int j = 0; j < ${n}; j++) {
                ex[j] = exp(x[j] - maxx);
                sumex += ex[j];
            }

            for (int j = 0; j < ${n}; j++)
                y[j] = ex[j] / sumex;
        }
        """
    textconf = dict(Xtype=X.ctype,
                    Ytype=Y.ctype,
                    m=m,
                    n=n,
                    Xstride0=X.elemstrides[0],
                    Ystride0=Y.elemstrides[0])
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    fn = cl.Program(queue.context, text).build().fn
    fn.set_args(*[arr.data for arr in (X, Y)])
    plan = Plan(queue, fn, gsize=(m, ))
    return plan
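
The kernel computes a numerically stable softmax along each row, subtracting the row maximum before exponentiating. The NumPy equivalent, as a sketch:

import numpy as np

def softmax_ref(X):
    ex = np.exp(X - X.max(axis=1, keepdims=True))
    return ex / ex.sum(axis=1, keepdims=True)
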
Example #5
def block_impl(p, items):

    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    if p.clra_beta is not None:
        raise NotImplementedError()
    if p.cl_alpha is not None:
        raise NotImplementedError()
    if p.cl_beta is not None:
        raise NotImplementedError()
    if p.cl_gamma is not None:
        raise NotImplementedError()
    if not all(s == 1 for s in p.A.stride1s):
        raise NotImplementedError()

    if p.A_js is None:
        # -- easy probably, but not done
        raise NotImplementedError()

    # --- blocking
    # We want to group the dot products into blocks, so that each workgroup
    # is computing a (block_y, block_x) region of a dot product. To do this,
    # we create a temporary output buffer, compute each block to a separate
    # region of this buffer, then reduce across the buffer in a separate kernel

    # block_y = 8
    block_y = 32
    # block_x = 32
    block_x = 128

    shape0s = []
    shape1s = []
    Astride0s = []
    Astride1s = []
    Astarts = []
    Xstride0s = []
    Xstarts = []
    Ybufstarts = []
    Ybufstart = 0

    Yshape0s_reduce = []
    Yinstride0s_reduce = []
    Yinstarts_reduce = []
    Ystride0s_reduce = []
    Ystarts_reduce = []
    Ybufinds_reduce = []
    bw_reduce = 0

    for n in items:
        assert p.Y_in.shape0s[n] == p.Y.shape0s[n]
        shape0n = p.Y.shape0s[n]

        for i in range(0, shape0n, block_y):
            shape0i = min(shape0n - i, block_y)

            Ybufind_reduce = []

            # loop over dot products outputting to same Y
            assert len(p.A_js[n]) == len(p.X_js[n])
            for aj, xj in zip(p.A_js[n], p.X_js[n]):
                assert aj.size == 1 and xj.size == 1
                aj, xj = aj[0], xj[0]  # to ignore numpy DeprecationWarning

                assert p.A.shape0s[aj] == shape0n
                assert p.A.shape1s[aj] == p.X.shape0s[xj]
                assert p.X.shape1s[xj] == 1
                shape1n = p.A.shape1s[aj]

                for j in range(0, shape1n, block_x):
                    shape0s.append(shape0i)
                    shape1s.append(min(shape1n - j, block_x))
                    Astride0s.append(p.A.stride0s[aj])
                    Astride1s.append(p.A.stride1s[aj])
                    Astarts.append(p.A.starts[aj] +
                                   i*p.A.stride0s[aj] + j*p.A.stride1s[aj])
                    Xstride0s.append(p.X.stride0s[xj])
                    Xstarts.append(p.X.starts[xj] + j*p.X.stride0s[xj])

                    Ybufstarts.append(Ybufstart)
                    Ybufind_reduce.append(Ybufstart)
                    # Ybufstart += shape0s[-1]
                    Ybufstart += block_y  # keep good offset

            # --- Y-blocking for reduce
            Yshape0s_reduce.append(shape0i)
            Yinstride0s_reduce.append(p.Y_in.stride0s[n])
            Yinstarts_reduce.append(p.Y_in.starts[n] + i*p.Y_in.stride0s[n])
            Ystride0s_reduce.append(p.Y.stride0s[n])
            Ystarts_reduce.append(p.Y.starts[n] + i*p.Y.stride0s[n])
            Ybufinds_reduce.append(Ybufind_reduce)
            bw_reduce += shape0i*(len(Ybufind_reduce) + 1) * p.Y.dtype.itemsize

    # --- create structure
    gstructure = np.column_stack([shape0s, shape1s, Astride0s, Astride1s,
                                  Astarts, Xstride0s, Xstarts, Ybufstarts])
    cl_gstructure = to_device(p.queue, gstructure.astype(np.int32))

    # --- create Y buffer
    clYbuf = to_device(p.queue, np.zeros(Ybufstart, dtype=p.Y.dtype))

    lsize0 = 4
    # lsize0 = 8
    lsize0_log2 = int(np.log2(lsize0))
    assert 2**lsize0_log2 == lsize0

    lsize = (lsize0, block_y, 1)
    gsize = (lsize[0], lsize[1], gstructure.shape[0])
    assert np.prod(lsize) >= block_x

    textconf = dict(
        A=p.A,
        X=p.X,
        Ybuf=clYbuf,
        n_structure_vars=gstructure.shape[1],
        shape0='lstructure[0]',
        shape1='lstructure[1]',
        Astride0='lstructure[2]',
        Astride1='lstructure[3]',
        Astart='lstructure[4]',
        Xstride0='lstructure[5]',
        Xstart='lstructure[6]',
        Ybufstart='lstructure[7]',
        block_y=block_y,
        block_x=block_x,
        lsize0=lsize0,
        lsize0_log2=lsize0_log2,
        float_alpha=p.float_alpha,
    )

    full_args = (
        cl_gstructure,
        p.A.cl_buf,
        p.X.cl_buf,
        clYbuf,
    )

    text = """
    __kernel void fn(
        __global const int *gstructure,
        __global const ${A.ctype} *Adata,
        __global const ${X.ctype} *Xdata,
        __global ${Ybuf.ctype} *Ybufdata
        )
    {
        const int j = get_global_id(0);
        const int i = get_global_id(1);
        const int n = get_global_id(2);

        // load structure
        __local int lstructure[${n_structure_vars}];
        const int local_idx =
            get_local_id(0) + get_local_id(1)*get_local_size(0);
        if (local_idx < ${n_structure_vars})
            lstructure[local_idx] = gstructure[
                n * ${n_structure_vars} + local_idx];
        barrier(CLK_LOCAL_MEM_FENCE);

        __global const ${X.ctype} *x = Xdata + ${Xstart};
        __global ${Ybuf.ctype} *ybuf = Ybufdata + ${Ybufstart};

        // load x into local memory
        __local ${X.ctype} xlocal[${block_x}];
        if (local_idx < ${shape1})
            xlocal[local_idx] = x[local_idx*${Xstride0}];
        barrier(CLK_LOCAL_MEM_FENCE);

        __local ${Ybuf.ctype} sums[${block_y}][${lsize0}];
        sums[i][j] = 0;

        if (i < ${shape0}) {
            __global const ${A.ctype} *Ai = Adata + ${Astart} + i*${Astride0};
            for(int jj = j; jj < ${shape1}; jj += get_global_size(0)) {
                sums[i][j] += Ai[jj*${Astride1}] * xlocal[jj];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);

    % for k in range(lsize0_log2 - 1, 0, -1):
        if (j < ${2**k})
            sums[i][j] += sums[i][${2**k} + j];
        barrier(CLK_LOCAL_MEM_FENCE);
    % endfor

        if (i < ${shape0} && j == 0)
            ybuf[i] = ${float_alpha} * (sums[i][0] + sums[i][1]);
    }
    """

    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))
    kernel = cl.Program(p.queue.context, text).build().fn
    kernel.set_args(*[arr.data for arr in full_args])

    plan = Plan(p.queue, kernel, gsize, lsize,
                name='clra_gemv.block_impl',
                tag=p.tag,
                bw_per_call=bw_from_geometry(p.geometry, items),
                flops_per_call=flops_from_geometry(p.geometry, items),
                )
    plan.full_args = full_args  # prevent GC the args
    plan.description = p.geometry_summary(items)
    plan.Ybuf = clYbuf

    # --- Reduce kernel
    align = False

    Nreduce = len(Yshape0s_reduce)
    clYshape0s_reduce = to_device(
        p.queue, np.array(Yshape0s_reduce, dtype=np.int32))
    clYinstride0s_reduce = to_device(
        p.queue, np.array(Yinstride0s_reduce, dtype=np.int32))
    clYinstarts_reduce = to_device(
        p.queue, np.array(Yinstarts_reduce, dtype=np.int32))
    clYstride0s_reduce = to_device(
        p.queue, np.array(Ystride0s_reduce, dtype=np.int32))
    clYstarts_reduce = to_device(
        p.queue, np.array(Ystarts_reduce, dtype=np.int32))
    clYbufinds_reduce = CLRaggedArray.from_arrays(
        p.queue, Ybufinds_reduce, dtype=np.int32, align=align)
    assert len(clYbufinds_reduce) == Nreduce
    assert (clYbufinds_reduce.shape1s == 1).all()

    textconf_reduce = dict(
        Ybuf=clYbuf,
        Yin=p.Y_in,
        Y=p.Y,
        float_beta=p.float_beta,
        float_gamma=p.float_gamma,
    )

    full_args_reduce = (
        clYshape0s_reduce,
        clYbufinds_reduce.cl_shape0s,
        clYbufinds_reduce.cl_starts,
        clYbufinds_reduce.cl_buf,
        clYbuf,
        clYinstride0s_reduce,
        clYinstarts_reduce,
        p.Y_in.cl_buf,
        clYstride0s_reduce,
        clYstarts_reduce,
        p.Y.cl_buf,
    )

    lsize_reduce = None
    gsize_reduce = (block_y, Nreduce)

    text_reduce = """
    __kernel void reduce(
        __global const int *shape0s,
        __global const int *Ishape0s,
        __global const int *Istarts,
        __global const int *Idata,
        __global ${Ybuf.ctype} *Ybufdata,
        __global const int *Yinstride0s,
        __global const int *Yinstarts,
        __global ${Yin.ctype} *Yindata,
        __global const int *Ystride0s,
        __global const int *Ystarts,
        __global ${Y.ctype} *Ydata
    )
    {
        const int i = get_global_id(0);
        const int n = get_global_id(1);
        if (i >= shape0s[n])
            return;

        const int Ishape0 = Ishape0s[n];

        __global const int *Ybufstart = Idata + Istarts[n];
        __global ${Yin.ctype} *yin = Yindata + Yinstarts[n];
        __global ${Y.ctype} *y = Ydata + Ystarts[n];

        ${Y.ctype} sum = ${float_beta} * yin[i*Yinstride0s[n]];
        for (int j = 0; j < Ishape0; j++) {
            sum += Ybufdata[Ybufstart[j] + i];
        }

        y[i*Ystride0s[n]] = sum + ${float_gamma};
    }
    """

    text_reduce = as_ascii(Template(
        text_reduce, output_encoding='ascii').render(**textconf_reduce))
    kernel_reduce = cl.Program(p.queue.context, text_reduce).build().reduce
    kernel_reduce.set_args(*[arr.data for arr in full_args_reduce])

    plan_reduce = Plan(p.queue, kernel_reduce, gsize_reduce, lsize_reduce,
                       name='clra_gemv.block_impl_reduce', tag=p.tag)
    plan_reduce.full_args = full_args_reduce  # prevent GC of the args
    plan_reduce.bw_per_call = bw_reduce
    # plan_reduce.description = p.geometry_summary(items)

    return [plan, plan_reduce]
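
The blocking strategy described in the comments above can be summarised by a small host-side sketch: each (block_y, block_x) tile produces a partial dot product into a temporary buffer, and a second "reduce" pass sums the partials. The single-matrix simplification and the names below are illustrative only.

import numpy as np

def block_gemv_ref(A, x, block_y=32, block_x=128):
    m, n = A.shape
    partial_rows = []
    for i in range(0, m, block_y):
        tiles = []
        for j in range(0, n, block_x):
            # one work-group's contribution: a partial dot product over a tile
            tiles.append(A[i:i + block_y, j:j + block_x].dot(x[j:j + block_x]))
        partial_rows.append(sum(tiles))  # what the separate reduce kernel does
    return np.concatenate(partial_rows)

# sanity check: np.allclose(block_gemv_ref(A, x), A.dot(x)) for any A and x
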
Example #6
def many_dots_impl(p, items):
    # target use case:
    # * several very shallow gemvs (short inner prods) into each target
    # * not all targets have the same size

    # p.print_geometry_summary(items, full=True)

    # This algorithm is blocked out so that a work-group [i, j] computes
    # some segment of an output vector:
    # e.g. Y[i][ 32 * j : 32 * (j + 1)]
    #
    # This is done for two reasons:
    # - to increase occupancy when there are not so many vectors Y
    # - to handle long vectors Y

    # p.print_geometry_summary(items)

    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    if p.clra_beta is not None:
        raise NotImplementedError()
    if p.cl_alpha is not None:
        raise NotImplementedError()
    if p.cl_gamma is not None:
        raise NotImplementedError()
    if not all(s == 1 for s in p.A.stride1s):
        raise NotImplementedError()

    assert p.float_alpha is not None
    assert p.float_gamma is not None

    if p.A_js is None:
        # -- easy probably, but not done
        raise NotImplementedError()
    A_js_shape0s = p.A_js.shape0s
    cl_gstructure, textconf = p.cl_geometry_and_textconf(items)

    # min_n_dots = min(A_js_shape0s)
    max_n_dots = max(A_js_shape0s)

    max_y_len = max(p.geometry[ii]['y_len'] for ii in items)
    MAX_SEGMENT_SIZE = 16  # tricky to tune?

    segment_size = min(
        max_y_len,
        MAX_SEGMENT_SIZE)
    dot_block_size = min(
        max(max_n_dots, 1),
        int(p.queue.device.max_work_group_size / segment_size),
    )

    n_segments = int(np.ceil(float(max_y_len) / segment_size))
    gsize = (n_segments * segment_size, dot_block_size, len(items))
    lsize = (segment_size, dot_block_size, 1)

    textconf.update({
        'gsize': gsize,
        'lsize': lsize,
        'segment_size': segment_size,
        'dot_block_size': dot_block_size,
        'max_y_len': max_y_len,
        'n_locals': segment_size * dot_block_size,
        # 'segment_idx': 'get_local_id(0)',
        # 'dot_block_idx': 'get_local_id(1)',
        'segment_idx': 'segment_idx',
        'dot_block_idx': 'dot_block_idx',
    })
    if 0:
        for k, v in textconf.items():
            print(k, v)
    textconf.update(p.__dict__)
    #    print('float_gamma', textconf['float_gamma'])
    #    print('cl_gamma', textconf['cl_gamma'])
    #    print('clra_gamma', textconf['clra_gamma'])

    text = """
        __kernel void gemv_many_dots(
            const __global int *gstructure,
            const __global ${A.cl_buf.ctype} *A_data,
            const __global ${X.cl_buf.ctype} *X_data,
            % if cl_beta is not None:
            const __global ${cl_beta.ctype} * betas,
            % endif
            const __global ${Y_in.cl_buf.ctype} *Y_in_data,
            __global ${Y.cl_buf.ctype} *Y_data)
    {
        __local int lstructure[${n_structure_vars}];
        __local ${Y.cl_buf.ctype} y_sum_pre[${segment_size}];
        __local ${Y.cl_buf.ctype} \
            y_sum_post[${dot_block_size}][${segment_size}];
        const int local_idx = get_local_id(0) \
            + get_local_id(1) * get_local_size(0);

        int segment_idx = get_local_id(0);
        int dot_block_idx = get_local_id(1);

        for (int ii = local_idx; ii < ${n_structure_vars}; ii += ${n_locals})
        {
            lstructure[ii] = gstructure[
                get_global_id(2) * ${structure_vars_stride} + ii];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if (get_global_id(0) < ${y_len})
        {

            if (dot_block_idx == 0)
            {
    % if float_beta is not None and float_beta != 0 :
                y_sum_pre[segment_idx]
                = ${float_beta} * Y_in_data[${y_in_starts} + get_global_id(0)];
    % elif cl_beta is not None:
                y_sum_pre[segment_idx]
                = betas[${bb}] * Y_in_data[${y_in_starts} + get_global_id(0)];
    % else :
                y_sum_pre[segment_idx] = 0;
    % endif

    % if float_gamma is not None:
        % if float_gamma != 0:
                y_sum_pre[segment_idx] += ${float_gamma};
        % endif
    % endif
            }
        //printf("betaY + gamma=%f\\n", y_sum_pre[segment_idx]);

            // XXX Move X into shared memory first
            y_sum_post[dot_block_idx][segment_idx] = 0;
            for (int ii = dot_block_idx;
                     ii < ${n_dot_products};
                     ii += ${dot_block_size})
            {
                for (int nn = 0; nn < ${N_i}; nn += 1)
                {
                    y_sum_post[dot_block_idx][segment_idx]
                    += A_data[${a_starts} + get_global_id(0) * ${a_s0} + nn]
                       * X_data[${x_starts} + nn];
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        //printf("AX=%f\\n", y_sum_post[dot_block_idx][segment_idx]);
        if ((get_global_id(0) < ${y_len}) && (dot_block_idx == 0))
        {
            for (int ii = 1; ii < ${dot_block_size}; ++ii)
            {
                y_sum_post[0][segment_idx] += y_sum_post[ii][segment_idx];
            }
            Y_data[${y_offset} + get_global_id(0)]
                = y_sum_pre[segment_idx]
                  + ${float_alpha} * y_sum_post[0][segment_idx];
        //printf("Yout=%f\\n", Y_data[${y_offset} + get_global_id(0)]);
        }
    }
        """

    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))
    fn = cl.Program(p.queue.context, text).build().gemv_many_dots

    full_args = [
        cl_gstructure,
        p.A.cl_buf,
        p.X.cl_buf,
    ]
    if p.cl_beta is not None:
        full_args += [p.cl_beta]
    full_args += [
        p.Y_in.cl_buf,
        p.Y.cl_buf,
    ]

    fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(p.queue, fn, gsize, lsize,
                name='clra_gemv.many_dots_impl',
                tag=p.tag,
                bw_per_call=bw_from_geometry(p.geometry, items),
                flops_per_call=flops_from_geometry(p.geometry, items),
                )
    rval.full_args = full_args  # prevent GC the args
    rval.description = p.geometry_summary(items)
    return rval
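
A worked example of the work-group sizing above, assuming max_y_len = 50, max_n_dots = 3, three items, and a device max_work_group_size of 256 (all numbers illustrative):

import numpy as np

max_y_len, max_n_dots, n_items = 50, 3, 3
max_work_group_size = 256
MAX_SEGMENT_SIZE = 16

segment_size = min(max_y_len, MAX_SEGMENT_SIZE)                  # 16
dot_block_size = min(max(max_n_dots, 1),
                     int(max_work_group_size / segment_size))    # min(3, 16) = 3
n_segments = int(np.ceil(float(max_y_len) / segment_size))       # ceil(50/16) = 4
gsize = (n_segments * segment_size, dot_block_size, n_items)     # (64, 3, 3)
lsize = (segment_size, dot_block_size, 1)                        # (16, 3, 1)
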
Example #7
def plan_whitenoise(queue, Y, dist_enums, dist_params, scale, dt, ranluxcltab,
                    tag=None):
    N = len(Y)
    assert len(Y) == len(dist_enums) == len(dist_params) == len(scale)

    assert dist_enums.ctype == 'int'
    assert scale.ctype == 'int'

    for i in range(N):
        for arr in [Y, dist_enums, dist_params, scale]:
            assert arr.stride1s[i] == 1

        assert Y.shape1s[i] == 1
        assert Y.stride0s[i] == 1
        assert Y.stride1s[i] == 1

        assert dist_enums.shape0s[i] == dist_enums.shape1s[i] == 1
        assert dist_params.shape1s[i] == 1

        assert scale.shape0s[i] == scale.shape1s[i] == 1
        assert scale.stride0s[i] == scale.stride1s[i] == 1

    text = """
        ${dist_header}

        ////////// MAIN FUNCTION //////////
        __kernel void whitenoise(
            __global const int *shape0s,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata,
            __global const int *Estarts,
            __global const int *Edata,
            __global const int *Pstarts,
            __global const ${Ptype} *Pdata,
            __global const int *scalestarts,
            __global const int *scaledata,
            __global ranluxcl_state_t *ranluxcltab
        )
        {
            const int i0 = get_global_id(0);
            const int k = get_global_id(1);
            const int m = shape0s[k];
            if (i0 >= m)
                return;

            __global ${Ytype} *y = Ydata + Ystarts[k];

            ranluxcl_state_t state;
            ranluxcl_download_seed(&state, ranluxcltab);

            const int scale = *(scaledata + scalestarts[k]);
            const int dist_enum = *(Edata + Estarts[k]);
            __global const float *dist_params = Pdata + Pstarts[k];

            float4 samples;
            float sample;
            int samplei = 4;
            for (int i = i0; i < m; i += get_global_size(0))
            {
                if (samplei >= 4) {
                    samples = sample_dist(dist_enum, dist_params, &state);
                    samplei = 0;
                }

                sample = getfloat4(samples, samplei);
                y[i] = (scale) ? ${sqrt_dt_inv} * sample : sample;
                samplei++;
            }

            ranluxcl_upload_seed(&state, ranluxcltab);
        }
        """

    textconf = dict(Ytype=Y.ctype, Ptype=dist_params.ctype,
                    sqrt_dt_inv=1. / np.sqrt(dt), dist_header=dist_header)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = (
        Y.cl_shape0s,
        Y.cl_starts,
        Y.cl_buf,
        dist_enums.cl_starts,
        dist_enums.cl_buf,
        dist_params.cl_starts,
        dist_params.cl_buf,
        scale.cl_starts,
        scale.cl_buf,
        ranluxcltab,
    )
    _fn = cl.Program(queue.context, text).build().whitenoise
    _fn.set_args(*[arr.data for arr in full_args])

    max_len = min(queue.device.max_work_group_size, max(Y.shape0s))
    gsize = (max_len, N)
    lsize = (max_len, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_whitenoise", tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    return rval
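
For the case where dist_enum selects a standard normal distribution, the per-signal effect is roughly the following host-side sketch; sample_dist and the RANLUX state handling are abstracted away, so treat this as illustrative only.

import numpy as np

def whitenoise_ref(m, scale, dt, rng=np.random):
    samples = rng.standard_normal(m)
    return samples / np.sqrt(dt) if scale else samples
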
Example #8
def plan_bcm2(queue, pre, post, theta, delta, alpha, tag=None): #weights, max_weight,
    assert len(pre) == len(post) == len(theta) == len(delta) == alpha.size
    N = len(pre)

    for arr in (pre, post, theta):  # vectors
        assert (arr.shape1s == 1).all()
    for arr in (delta,):  # matrices
        assert (arr.stride1s == 1).all()

    assert (post.shape0s == delta.shape0s).all()
    assert (pre.shape0s == delta.shape1s).all()
    assert (post.shape0s == theta.shape0s).all()

    assert (pre.ctype == post.ctype == theta.ctype == delta.ctype ==
            alpha.ctype)

    text = """
    __kernel void bcm2(
        __global const int *shape0s,
        __global const int *shape1s,
        __global const int *pre_stride0s,
        __global const int *pre_starts,
        __global const ${type} *pre_data,
        __global const int *post_stride0s,
        __global const int *post_starts,
        __global const ${type} *post_data,
        __global const int *theta_stride0s,
        __global const int *theta_starts,
        __global const ${type} *theta_data,
        __global const int *delta_stride0s,
        __global const int *delta_starts,
        __global ${type} *delta_data,
        __global const ${type} *alphas
        //__global const int *weights_stride0s,
        //__global const int *weights_starts,
        //__global const ${type} *weights_data,
        //__global const ${type} *max_weights
    )
    {
        const int ij = get_global_id(0);
        const int k = get_global_id(1);
        const int shape0 = shape0s[k];
        const int shape1 = shape1s[k];
        const int i = ij / shape1;
        const int j = ij % shape1;
        __global ${type} *delta = delta_data + delta_starts[k];
        const ${type} pre = pre_data[pre_starts[k] + j*pre_stride0s[k]];
        const ${type} post = post_data[post_starts[k] + i*post_stride0s[k]];
        const ${type} theta = theta_data[
            theta_starts[k] + i*theta_stride0s[k]];
        const ${type} alpha = alphas[k];
        
        //__global const ${type} *weights = weights_data + weights_starts[k];

        //const ${type} max_weight = max_weights[k];

        if (i < shape0) {
            delta[i*delta_stride0s[k] + j]  =
                alpha * post * (post - theta) * pre;

  
            //if (i==j) {
            //    delta[i*delta_stride0s[k] + j]  = 0;
            //} else {
            //    
            //    delta[i*delta_stride0s[k] + j]  = alpha * post * (post - theta) * pre;
            //
            //   if (fabs(weights[i*weights_stride0s[k] + j] + delta[i*delta_stride0s[k] + j]) > max_weight) { 
            //        delta[i*delta_stride0s[k] + j] = 0;
            //    }
            //}
        }
    }
    """

    textconf = dict(type=pre.ctype)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = (
        delta.cl_shape0s, delta.cl_shape1s,
        pre.cl_stride0s, pre.cl_starts, pre.cl_buf,
        post.cl_stride0s, post.cl_starts, post.cl_buf,
        theta.cl_stride0s, theta.cl_starts, theta.cl_buf,
        delta.cl_stride0s, delta.cl_starts, delta.cl_buf,
        alpha,
        # weights.cl_stride0s, weights.cl_starts, weights.cl_buf, max_weight,
    )

    _fn = cl.Program(queue.context, text).build().bcm2
    _fn.set_args(*[arr.data for arr in full_args])

    lsize = None
    gsize = (delta.sizes.max(), N)
    plan = Plan(queue, _fn, gsize, lsize=lsize, name="cl_bcm2", tag=tag)
    plan.full_args = full_args     # prevent garbage-collection
    plan.flops_per_call = 4 * delta.sizes.sum()
    plan.bw_per_call = (pre.nbytes + post.nbytes + theta.nbytes +
                        delta.nbytes + alpha.nbytes) # + weights.nbytes + max_weight.nbytes)
    return plan
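
The update computed per connection is a BCM-style outer product; a NumPy sketch equivalent to the active kernel body (ignoring the commented-out weight-threshold branch):

import numpy as np

def bcm2_ref(pre, post, theta, alpha):
    # delta[i, j] = alpha * post[i] * (post[i] - theta[i]) * pre[j]
    return alpha * np.outer(post * (post - theta), pre)
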
Example #9
def _plan_template(  # noqa: C901
    queue,
    name,
    core_text,
    declares="",
    tag=None,
    blockify=True,
    inputs=None,
    outputs=None,
    parameters=None,
):
    """Template for making a plan for vector nonlinearities.
    This template assumes that all inputs and outputs are vectors.
    Parameters
    ----------
    blockify : bool
        If true, divide the inputs up into blocks with a maximum size.
    inputs: dictionary of CLRaggedArrays
        Inputs to the function. RaggedArrays must be a list of vectors.
    outputs: dictionary of CLRaggedArrays
        Outputs of the function. RaggedArrays must be a list of vectors.
    parameters: dictionary of CLRaggedArrays
        Parameters to the function. Each RaggedArray element must be a vector
        of the same length of the inputs, or a scalar (to be broadcasted).
        Providing a float instead of a RaggedArray makes that parameter
        constant.
    """
    inputs = {} if inputs is None else inputs
    outputs = {} if outputs is None else outputs
    parameters = {} if parameters is None else parameters

    input0 = list(inputs.values())[0]  # input to use as reference for lengths

    # split parameters into static and updated params
    static_params = {}  # static params (hard-coded)
    params = {}  # variable params (updated)
    for k, v in parameters.items():
        if isinstance(v, CLRaggedArray):
            params[k] = v
        elif is_number(v):
            static_params[k] = ("float", float(v))
        else:
            raise ValueError(
                "Parameter %r must be CLRaggedArray or float (got %s)" %
                (k, type(v)))

    avars = {}
    bw_per_call = 0
    for vname, v in list(inputs.items()) + list(outputs.items()) + list(
            params.items()):
        assert vname not in avars, "Name clash"
        assert len(v) == len(input0)
        assert (v.shape0s == input0.shape0s).all()
        assert (v.stride0s == v.shape1s).all()  # rows contiguous
        assert (v.stride1s == 1).all()  # columns contiguous
        assert (v.shape1s == 1).all()  # vectors only

        offset = "%(name)s_starts[gind1]" % {"name": vname}
        avars[vname] = (v.ctype, offset)
        bw_per_call += v.nbytes

    ivars = {k: avars[k] for k in inputs}
    ovars = {k: avars[k] for k in outputs}
    pvars = {k: avars[k] for k in params}

    fn_name = str(name)
    textconf = dict(
        fn_name=fn_name,
        declares=declares,
        core_text=core_text,
        ivars=ivars,
        ovars=ovars,
        pvars=pvars,
        static_params=static_params,
    )

    text = """
    ////////// MAIN FUNCTION //////////
    __kernel void ${fn_name}(
% for name, [type, offset] in ivars.items():
        __global const int *${name}_starts,
        __global const ${type} *${name}_buf,
% endfor
% for name, [type, offset] in ovars.items():
        __global const int *${name}_starts,
        __global ${type} *${name}_buf,
% endfor
% for name, [type, offset] in pvars.items():
        __global const int *${name}_starts,
        __global const int *${name}_shape0s,
        __global const ${type} *${name}_buf,
% endfor
        __global const int *sizes
    )
    {
        const int gind0 = get_global_id(0);
        const int gind1 = get_global_id(1);
        if (gind1 >= ${N} || gind0 >= sizes[gind1])
            return;
% for name, [type, offset] in ivars.items():
        ${type} ${name} = ${name}_buf[${offset} + gind0];
% endfor
% for name, [type, offset] in ovars.items():
        ${type} ${name};
% endfor
% for name, [type, offset] in pvars.items():
        const ${type} ${name} = ${name}_buf[${offset} + gind0];
% endfor
% for name, [type, value] in static_params.items():
        const ${type} ${name} = ${value};
% endfor
        //////////////////////////////////////////////////
        //vvvvv USER DECLARATIONS BELOW vvvvv
        ${declares}
        //^^^^^ USER DECLARATIONS ABOVE ^^^^^
        //////////////////////////////////////////////////
        /////vvvvv USER COMPUTATIONS BELOW vvvvv
        ${core_text}
        /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^
% for name, [type, offset] in ovars.items():
        ${name}_buf[${offset} + gind0] = ${name};
% endfor
    }
    """

    if blockify:
        # blockify to help with heterogeneous sizes

        # find best block size
        block_sizes = [16, 32, 64, 128, 256, 512, 1024]
        N = np.inf
        for block_size_i in block_sizes:
            sizes_i, inds_i, _ = blockify_vector(block_size_i, input0)
            if len(sizes_i) < N:
                N = len(sizes_i)
                block_size = block_size_i
                sizes = sizes_i
                inds = inds_i

        clsizes = to_device(queue, sizes)
        get_starts = lambda ras: [
            to_device(queue, starts)
            for starts in blockify_vectors(block_size, ras)[2]
        ]
        Istarts = get_starts(inputs.values())
        Ostarts = get_starts(outputs.values())
        Pstarts = get_starts(params.values())
        Pshape0s = [to_device(queue, x.shape0s[inds]) for x in params.values()]

        lsize = None
        gsize = (block_size, len(sizes))

        full_args = []
        for vstarts, v in zip(Istarts, inputs.values()):
            full_args.extend([vstarts, v.cl_buf])
        for vstarts, v in zip(Ostarts, outputs.values()):
            full_args.extend([vstarts, v.cl_buf])
        for vstarts, vshape0s, v in zip(Pstarts, Pshape0s, params.values()):
            full_args.extend([vstarts, vshape0s, v.cl_buf])
        full_args.append(clsizes)
    else:
        # Allocate more than enough kernels in a matrix
        lsize = None
        gsize = (input0.shape0s.max(), len(input0))

        full_args = []
        for v in inputs.values():
            full_args.extend([v.cl_starts, v.cl_buf])
        for v in outputs.values():
            full_args.extend([v.cl_starts, v.cl_buf])
        for vname, v in params.items():
            full_args.extend([v.cl_starts, v.cl_shape0s, v.cl_buf])
        full_args.append(input0.cl_shape0s)

    textconf["N"] = gsize[1]
    text = as_ascii(Template(text, output_encoding="ascii").render(**textconf))
    fns = cl.Program(queue.context, text).build()
    _fn = getattr(fns, fn_name)
    _fn.set_args(*[arr.data for arr in full_args])

    plan = Plan(queue, _fn, gsize, lsize=lsize, name=name, tag=tag)
    plan.full_args = tuple(full_args)  # prevent garbage-collection
    plan.bw_per_call = bw_per_call
    plan.description = "groups: %d; items: %d; items/group: %0.1f [%d, %d]" % (
        gsize[1],
        input0.sizes.sum(),
        input0.sizes.mean(),
        input0.sizes.min(),
        input0.sizes.max(),
    )
    return plan
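
A hypothetical usage sketch of the template above, building an element-wise sigmoid plan. The variable names used in core_text must match the dictionary keys; queue, X, Y, and the gain value are illustrative stand-ins rather than objects from the source, and the final plan() call assumes Plan objects are callable.

core_text = """
    y = 1.0f / (1.0f + exp(-gain * x));
    """
plan = _plan_template(
    queue, "cl_sigmoid", core_text,
    inputs={"x": X},           # CLRaggedArray of input vectors
    outputs={"y": Y},          # CLRaggedArray of matching output vectors
    parameters={"gain": 2.5},  # a float becomes a hard-coded kernel constant
)
plan()  # assumed: calling a Plan enqueues its kernel
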
Example #10
def ref_impl(p, items):
    """
    Return an OpenCL function to calculate elements `items` of
    gemv operation `p`.

    In this reference implementation, we create one work item per
    output element; more specifically, a work grid of shape
    (max_y_len, len(items)). Each work item loops over the dot
    products, and over the elements within each dot product, to
    compute the output value Y[global_id(1)][global_id(0)].

    """

    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    cl_items = to_device(p.queue, np.asarray(items, dtype='int32'))
    if 0:
        if len(items) < 10:
            print('Falling back on reference implementation')
            p.print_geometry_summary(items, full=True)
        else:
            print('Falling back on reference implementation')
            p.print_geometry_summary(items)

    assert all(s == 1 for s in p.A.stride1s)
    assert all(s == 1 for s in p.X.stride1s)
    assert all(s == 1 for s in p.Y.stride0s)
    assert all(s == 1 for s in p.Y.stride1s)
    assert all(s == 1 for s in p.Y_in.stride0s)
    assert all(s == 1 for s in p.Y_in.stride1s)

    text = """
        __kernel void gemv_ref(
            __global int *items,
    % if cl_alpha is not None:
            __global ${cl_alpha.ctype} * alphas,
    % endif
    % if (A_js is not None):
            __global int *A_starts,
            __global int *A_shape1s,
            __global int *A_stride0s,
            __global ${A.cl_buf.ctype} *A_data,
            __global int *A_js_starts,
            __global int *A_js_shape0s,
            __global int *A_js_data,
            __global int *X_starts,
            __global int *X_stride0s,
            __global ${X.cl_buf.ctype} *X_data,
            __global int *X_js_starts,
            __global int *X_js_data,
    % endif
    % if cl_beta is not None:
            __global ${cl_beta.ctype} * betas,
    % endif
    % if clra_beta is not None:
            __global int *beta_starts,
            __global int *beta_data,
    % endif
    % if cl_gamma is not None:
            __global ${cl_gamma.ctype} * gammas,
    % endif
            __global int *Y_in_starts,
            __global ${Y_in.cl_buf.ctype} *Y_in_data,
            __global int *Y_starts,
            __global int *Y_shape0s,
            __global ${Y.cl_buf.ctype} *Y_data)
        {
            const int mm = get_global_id(0);
            const int bb = items[get_global_id(1)];
            const int M = Y_shape0s[bb];
            if (mm < M)
            {
                const int y_offset = Y_starts[bb];
                const int y_in_offset = Y_in_starts[bb];

    % if float_beta is not None:
                const ${Y.cl_buf.ctype} beta = ${float_beta};
    % elif cl_beta is not None:
                const ${cl_beta.ctype} beta = betas[bb];
    % elif clra_beta is not None:
                const int beta_offset = beta_starts[bb];
                const ${clra_beta.cl_buf.ctype} beta
                    = beta_data[beta_offset + mm];
    % endif

    % if float_gamma is not None:
                const ${Y.cl_buf.ctype} gamma = ${float_gamma};
    % elif cl_gamma is not None:
                const ${cl_gamma.ctype} gamma = gammas[bb];
    % endif

                Y_data[y_offset + mm] =
                    gamma + beta * Y_in_data[y_in_offset + mm];

    % if A_js is not None:
                const int n_dot_products = A_js_shape0s[bb];
                X_js_data += X_js_starts[bb];
                A_js_data += A_js_starts[bb];

                ${Y.cl_buf.ctype} y_sum = 0;
                for (int ii = 0; ii < n_dot_products; ++ii)
                {
                    const int x_ji = X_js_data[ii];
                    const int a_ji = A_js_data[ii];
                    const int N_i = A_shape1s[a_ji];
                    const int x_offset = X_starts[x_ji];
                    const int a_offset = A_starts[a_ji];
                    const int AsM = A_stride0s[a_ji];
                    const int XsM = X_stride0s[x_ji];

                    for (int nn = 0; nn < N_i; ++nn)
                    {
                        y_sum += X_data[x_offset + nn * XsM]
                                 * A_data[a_offset + mm * AsM + nn];
                    }
                }
        % if float_alpha is not None:
                Y_data[y_offset + mm] += ${float_alpha} * y_sum;
        % elif cl_alpha is not None:
                Y_data[y_offset + mm] += alphas[bb] * y_sum;
        % endif
    % endif
            }

        }
    """

    text = as_ascii(
        Template(text, output_encoding='ascii').render(**p.__dict__))

    gsize = (max(p.geometry[ii]['y_len'] for ii in items), len(items))
    lsize = None
    fn = cl.Program(p.queue.context, text).build().gemv_ref
    full_args = [cl_items]
    if p.cl_alpha is not None:
        full_args += [p.cl_alpha]
    if p.A_js is not None:
        full_args += [
            p.A.cl_starts,
            p.A.cl_shape1s,
            p.A.cl_stride0s,
            p.A.cl_buf,
            p.A_js.cl_starts,
            p.A_js.cl_shape0s,
            p.A_js.cl_buf,
            p.X.cl_starts,
            p.X.cl_stride0s,
            p.X.cl_buf,
            p.X_js.cl_starts,
            p.X_js.cl_buf,
        ]
    if p.cl_beta is not None:
        full_args += [p.cl_beta]
    elif p.clra_beta is not None:
        full_args += [p.clra_beta.cl_starts, p.clra_beta.cl_buf]

    if p.cl_gamma is not None:
        full_args += [p.cl_gamma]
    elif p.clra_gamma is not None:
        full_args += [p.clra_gamma.cl_starts, p.clra_gamma.cl_buf]

    full_args += [
        p.Y_in.cl_starts, p.Y_in.cl_buf, p.Y.cl_starts, p.Y.cl_shape0s,
        p.Y.cl_buf
    ]

    # print([str(arr.dtype)[0] for arr in full_args])
    fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(p.queue,
                fn,
                gsize,
                lsize,
                name="clra_gemv.ref_impl",
                tag=p.tag,
                bw_per_call=bw_from_geometry(p.geometry, items),
                flops_per_call=flops_from_geometry(p.geometry, items))
    rval.full_args = full_args  # prevent GC the args
    return rval
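
In NumPy terms, the value one work item writes for item bb at row mm is the following; plain arrays stand in for the ragged structures, with A_list and X_list being the matrices and vectors selected by A_js[bb] and X_js[bb] (a sketch for the float alpha/beta/gamma case):

def gemv_ref_element(Y_in, A_list, X_list, alpha, beta, gamma, mm):
    # Y[mm] = gamma + beta * Y_in[mm] + alpha * (sum of dot products at row mm)
    y = gamma + beta * Y_in[mm]
    for A, x in zip(A_list, X_list):
        y += alpha * A[mm, :].dot(x)
    return y
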
Example #11
def plan_probes(queue, periods, X, Y, tag=None):
    """
    Parameters
    ----------
    periods : array-like
        The period (in time-steps) of each probe.
    """
    assert len(X) == len(Y)
    assert len(X) == len(periods)
    assert X.ctype == Y.ctype
    N = len(X)

    # N.B.  X[i].shape = (M, N)
    #       Y[i].shape = (buf_len, M * N)
    for arr in [X, Y]:
        assert (arr.stride1s == 1).all()
    assert (X.shape0s * X.shape1s == Y.shape1s).all()
    assert (X.stride0s == X.shape1s).all()
    assert (X.stride1s == 1).all()
    assert (Y.stride0s == Y.shape1s).all()
    assert (Y.stride1s == 1).all()

    periods = np.asarray(periods, dtype='float32')
    cl_periods = to_device(queue, periods)
    cl_countdowns = to_device(queue, periods - 1)
    cl_bufpositions = to_device(queue, np.zeros(N, dtype='int32'))

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void probes(
            __global ${Ctype} *countdowns,
            __global int *bufpositions,
            __global const ${Ptype} *periods,
            __global const int *Xstarts,
            __global const int *Xshape0s,
            __global const int *Xshape1s,
            __global const ${Xtype} *Xdata,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata
        )
        {
            const int n = get_global_id(1);
            const ${Ctype} countdown = countdowns[n];

            if (countdown <= 0) {
                const int n_dims = Xshape0s[n] * Xshape1s[n];
                __global const ${Xtype} *x = Xdata + Xstarts[n];
                const int bufpos = bufpositions[n];

                __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims;

                for (int ii = get_global_id(0);
                         ii < n_dims;
                         ii += get_global_size(0))
                {
                    y[ii] = x[ii];
                }
                // This should *not* cause deadlock, because all
                // work-items in a group are guaranteed to take
                // this branch together.
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = countdown + periods[n] - 1;
                    bufpositions[n] = bufpos + 1;
                }
            }
            else
            {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = countdown - 1;
                }
            }
        }
        """

    textconf = dict(N=N,
                    Xtype=X.ctype,
                    Ytype=Y.ctype,
                    Ctype=cl_countdowns.ctype,
                    Ptype=cl_periods.ctype)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = (
        cl_countdowns,
        cl_bufpositions,
        cl_periods,
        X.cl_starts,
        X.cl_shape0s,
        X.cl_shape1s,
        X.cl_buf,
        Y.cl_starts,
        Y.cl_buf,
    )
    _fn = cl.Program(queue.context, text).build().probes
    _fn.set_args(*[arr.data for arr in full_args])

    max_len = min(queue.device.max_work_group_size, max(X.shape0s))
    gsize = (max_len, N,)
    lsize = (max_len, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    rval.cl_bufpositions = cl_bufpositions
    rval.Y = Y
    rval.bw_per_call = (X.nbytes + Y.nbytes + cl_periods.nbytes +
                        cl_countdowns.nbytes + cl_bufpositions.nbytes)
    rval.description = (
        "groups: %d; items: %d; items/group: %0.1f [%d, %d]" %
        (len(Y), Y.sizes.sum(), Y.sizes.mean(), Y.sizes.min(), Y.sizes.max()))
    return rval
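
The countdown bookkeeping in the kernel determines on which steps a probe records; here is a self-contained host-side sketch of that schedule (bufpositions simply advances by one per recorded step):

def probe_schedule_ref(period, n_steps):
    # return the step indices at which a probe with this period records
    countdown = period - 1
    recorded = []
    for step in range(n_steps):
        if countdown <= 0:
            recorded.append(step)    # the kernel copies X[n] into Y[n] here
            countdown += period - 1  # fractional periods accumulate correctly
        else:
            countdown -= 1
    return recorded
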
Example #12
def plan_linear_synapse(queue, X, Y, A, B, Xbuf, Ybuf, tag=None):
    """
    Implements a filter of the form

        y[n+1] + a[0] y[n] + ... + a[i] y[n-i] = b[0] x[n] + ... + b[j] x[n-j]
    """
    N = len(X)
    assert len(Y) == N and len(A) == N and len(B) == N

    for arr in [X, Y, A, B, Xbuf, Ybuf]:
        assert (arr.shape1s == arr.stride0s).all()
        assert (arr.stride1s == 1).all()
    for arr in [X, Y, A, B]:  # vectors
        assert (arr.shape1s == 1).all()
    assert (X.shape0s == Y.shape0s).all()

    assert (B.shape0s >= 1).all()
    assert ((B.shape0s == 1) | (Xbuf.shape0s == B.shape0s)).all()
    assert (Xbuf.shape1s == X.shape0s).all()
    assert ((A.shape0s == 1) | (Ybuf.shape0s == A.shape0s)).all()
    assert (Ybuf.shape1s == Y.shape0s).all()

    assert X.ctype == Xbuf.ctype
    assert Y.ctype == Ybuf.ctype

    Xbufpos = to_device(queue, np.zeros(N, dtype='int32'))
    Ybufpos = to_device(queue, np.zeros(N, dtype='int32'))

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void linear_synapse(
            __global const int *shape0s,
            __global const int *Xstarts,
            __global const ${Xtype} *Xdata,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata,
            __global const int *Ashape0s,
            __global const int *Astarts,
            __global const ${Atype} *Adata,
            __global const int *Bshape0s,
            __global const int *Bstarts,
            __global const ${Btype} *Bdata,
            __global const int *Xbufstarts,
            __global ${Xtype} *Xbufdata,
            __global const int *Ybufstarts,
            __global ${Ytype} *Ybufdata,
            __global int *Xbufpos,
            __global int *Ybufpos
        )
        {
            int i = get_global_id(0);
            const int k = get_global_id(1);
            __global const ${Xtype} *x = Xdata + Xstarts[k];
            __global ${Ytype} *y = Ydata + Ystarts[k];
            __global const ${Atype} *a = Adata + Astarts[k];
            __global const ${Btype} *b = Bdata + Bstarts[k];

            const int n = shape0s[k];
            const int na = Ashape0s[k];
            const int nb = Bshape0s[k];
            if (na == 0 && nb == 1) {
                for (; i < n; i += get_global_size(0))
                    y[i] = b[0] * x[i];
            } else if (na == 1 && nb == 1) {
                for (; i < n; i += get_global_size(0)) {
                    y[i] *= -a[0];
                    y[i] += b[0] * x[i];
                }
            } else {  // general filtering
                __global ${Xtype} *xbuf = Xbufdata + Xbufstarts[k];
                __global ${Ytype} *ybuf = Ybufdata + Ybufstarts[k];
                const int ix = Xbufpos[k];
                const int iy = Ybufpos[k];
                const int ix1 = (ix > 0) ? ix - 1 : nb - 1;
                const int iy1 = (iy > 0) ? iy - 1 : na - 1;

                ${Ytype} yi;
                int j, jj;
                for (; i < n; i += get_global_size(0)) {
                    yi = b[0] * x[i];
                    if (nb > 1) {
                        xbuf[ix*n + i] = x[i];  // copy input to buffer
                        for (j = 1; j < nb; j++) {
                            jj = (ix + j) % nb;
                            yi += b[j] * xbuf[jj*n + i];
                        }
                    }

                    if (na > 0) {
                        yi -= a[0] * y[i];
                        if (na > 1) {
                            for (j = 1; j < na; j++) {
                                jj = (iy + j) % na;
                                yi -= a[j] * ybuf[jj*n + i];
                            }
                            ybuf[iy1*n + i] = yi;  // copy output to buffer
                        }
                    }

                    y[i] = yi;
                }

                Xbufpos[k] = ix1;
                Ybufpos[k] = iy1;
            }
        }
        """

    textconf = dict(
        Xtype=X.ctype, Ytype=Y.ctype,
        Atype=A.ctype, Btype=B.ctype
    )
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = (
        X.cl_shape0s,
        X.cl_starts,
        X.cl_buf,
        Y.cl_starts,
        Y.cl_buf,
        A.cl_shape0s,
        A.cl_starts,
        A.cl_buf,
        B.cl_shape0s,
        B.cl_starts,
        B.cl_buf,
        Xbuf.cl_starts,
        Xbuf.cl_buf,
        Ybuf.cl_starts,
        Ybuf.cl_buf,
        Xbufpos,
        Ybufpos,
    )
    _fn = cl.Program(queue.context, text).build().linear_synapse
    _fn.set_args(*[arr.data for arr in full_args])

    max_len = min(max(X.shape0s), queue.device.max_work_group_size)
    gsize = (max_len, N)
    lsize = (max_len, 1)
    rval = Plan(
        queue, _fn, gsize, lsize=lsize, name="cl_linear_synapse", tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    rval.bw_per_call = (
        X.nbytes + Y.nbytes + A.nbytes + B.nbytes + Xbuf.nbytes + Ybuf.nbytes)
    rval.description = (
        "groups: %d; items: %d; items/group: %0.1f [%d, %d]" %
        (len(Y), Y.sizes.sum(), Y.sizes.mean(), Y.sizes.min(), Y.sizes.max()))
    return rval
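
For reference, the general-filtering branch above implements a direct-form difference equation using circular history buffers. Below is a minimal Python sketch of one step of that recurrence; the function name and the exact history alignment are my reading of the buffer indexing, so treat it as illustrative rather than authoritative.

def synapse_step(x_t, y_prev, a, b, x_hist, y_hist):
    # y[t] = sum_j b[j]*x[t-j] - a[0]*y[t-1] - sum_{j>=1} a[j]*y[t-1-j]
    y_t = b[0] * x_t
    for j in range(1, len(b)):
        y_t += b[j] * x_hist[j - 1]      # x_hist = [x[t-1], x[t-2], ...]
    if len(a) > 0:
        y_t -= a[0] * y_prev             # y_prev = y[t-1]
        for j in range(1, len(a)):
            y_t -= a[j] * y_hist[j - 1]  # y_hist = [y[t-2], y[t-3], ...]
    return y_t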
Exemplo n.º 14
0
def plan_elementwise_inc(queue, A, X, Y, tag=None):
    """Implements an element-wise increment Y += A * X"""
    N = len(X)
    assert len(Y) == N and len(A) == N

    for arr in [A, X, Y]:
        assert (arr.stride1s == 1).all()
    assert ((X.shape0s == 1) | (X.shape0s == Y.shape0s)).all()
    assert ((X.shape1s == 1) | (X.shape1s == Y.shape1s)).all()
    assert ((A.shape0s == 1) | (A.shape0s == Y.shape0s)).all()
    assert ((A.shape1s == 1) | (A.shape1s == Y.shape1s)).all()
    assert (X.stride1s == 1).all()
    assert (Y.stride1s == 1).all()
    assert (A.stride1s == 1).all()

    assert X.ctype == Y.ctype
    assert A.ctype == Y.ctype

    text = """
        inline ${Ytype} get_element(
            __global const ${Ytype} *data,
            const int shape0, const int shape1, const int stride0,
            const int i, const int j
        )
        {
            if (shape0 == 1 && shape1 == 1)
                return data[0];
            else if (shape0 == 1)
                return data[j];
            else if (shape1 == 1)
                return data[i * stride0];
            else
                return data[i * stride0 + j];
        }

        ////////// MAIN FUNCTION //////////
        __kernel void elementwise_inc(
            __global const int *Ashape0s,
            __global const int *Ashape1s,
            __global const int *Astride0s,
            __global const int *Astarts,
            __global const ${Atype} *Adata,
            __global const int *Xshape0s,
            __global const int *Xshape1s,
            __global const int *Xstride0s,
            __global const int *Xstarts,
            __global const ${Xtype} *Xdata,
            __global const int *Yshape0s,
            __global const int *Yshape1s,
            __global const int *Ystride0s,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata
        )
        {
            const int n = get_global_id(1);
            __global const ${Atype} *a = Adata + Astarts[n];
            __global const ${Xtype} *x = Xdata + Xstarts[n];
            __global ${Ytype} *y = Ydata + Ystarts[n];

            const int Ysize = Yshape0s[n] * Yshape1s[n];
            for (int ij = get_global_id(0);
                 ij < Ysize;
                 ij += get_global_size(0))
            {
                int i = ij / Yshape1s[n];
                int j = ij - i * Yshape1s[n];

                ${Atype} aa = get_element(
                    a, Ashape0s[n], Ashape1s[n], Astride0s[n], i, j);
                ${Xtype} xx = get_element(
                    x, Xshape0s[n], Xshape1s[n], Xstride0s[n], i, j);

                y[i * Ystride0s[n] + j] += aa * xx;
            }
        }
        """

    textconf = dict(Atype=A.ctype, Xtype=X.ctype, Ytype=Y.ctype)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = (
        A.cl_shape0s,
        A.cl_shape1s,
        A.cl_stride0s,
        A.cl_starts,
        A.cl_buf,
        X.cl_shape0s,
        X.cl_shape1s,
        X.cl_stride0s,
        X.cl_starts,
        X.cl_buf,
        Y.cl_shape0s,
        Y.cl_shape1s,
        Y.cl_stride0s,
        Y.cl_starts,
        Y.cl_buf,
    )
    _fn = cl.Program(queue.context, text).build().elementwise_inc
    _fn.set_args(*[arr.data for arr in full_args])

    max_group = queue.device.max_work_group_size
    mn = min(max(max(Y.shape0s), max(Y.shape1s)), max_group)
    gsize = (mn, N)
    lsize = (mn, 1)
    rval = Plan(
        queue, _fn, gsize, lsize=lsize, name="cl_elementwise_inc", tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    rval.bw_per_call = A.nbytes + X.nbytes + Y.nbytes
    rval.description = (
        "groups: %d; items: %d; items/group: %0.1f [%d, %d]" %
        (len(Y), Y.sizes.sum(), Y.sizes.mean(), Y.sizes.min(), Y.sizes.max()))
    return rval
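
A NumPy equivalent of what one (A, X, Y) triple computes might look like the sketch below (the function name is illustrative); the broadcasting mirrors the get_element() helper in the kernel.

import numpy as np

def elementwise_inc_reference(A, X, Y):
    # A and X may be (1, 1), (1, m), (n, 1) or (n, m); both are broadcast
    # against Y's shape, then Y is incremented in place.
    Y += np.broadcast_to(A, Y.shape) * np.broadcast_to(X, Y.shape)
    return Y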
Exemplo n.º 15
0
def plan_whitesignal(queue, Y, t, signals, dt, tag=None):
    N = len(Y)
    assert len(Y) == len(t) == len(signals)

    for i in range(N):
        for arr in [Y, t, signals]:
            assert arr.stride1s[i] == 1

        assert Y.shape1s[i] == 1
        assert Y.stride0s[i] == Y.stride1s[i] == 1

        assert t.shape0s[i] == t.shape1s[i] == 1

        assert Y.shape0s[i] == signals.shape1s[i]
        assert signals.stride1s[i] == 1

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void whitesignal(
            __global const int *Yshape0s,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata,
            __global const int *Tstarts,
            __global ${Ttype} *Tdata,
            __global const int *Sshape0s,
            __global const int *Sstarts,
            __global ${Stype} *Sdata
        )
        {
            int i = get_global_id(0);
            const int k = get_global_id(1);
            const int m = Yshape0s[k];
            if (i >= m)
                return;

            __global ${Ytype} *y = Ydata + Ystarts[k];
            __global ${Stype} *s = Sdata + Sstarts[k];
            const float t = *(Tdata + Tstarts[k]);
            const int nt = Sshape0s[k];
            const int ti = (int)round(t / ${dt}) % nt;

            for (; i < m; i += get_global_size(0))
                y[i] = s[m*ti + i];
        }
        """

    textconf = dict(Ytype=Y.ctype, Ttype=t.ctype,
                    Stype=signals.ctype, dt=dt)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = (
        Y.cl_shape0s,
        Y.cl_starts,
        Y.cl_buf,
        t.cl_starts,
        t.cl_buf,
        signals.cl_shape0s,
        signals.cl_starts,
        signals.cl_buf,
    )
    _fn = cl.Program(queue.context, text).build().whitesignal
    _fn.set_args(*[arr.data for arr in full_args])

    max_len = min(queue.device.max_work_group_size, max(Y.shape0s))
    gsize = (max_len, N)
    lsize = (max_len, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_whitesignal", tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    return rval
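
Host-side, the kernel for one item reduces to a wrapped table lookup. A sketch, assuming `signals` is an (nt, m) array of presampled values (the function name is illustrative):

def whitesignal_step(t, dt, signals):
    # Return the presampled row for the current time step, wrapping
    # around once t runs past the presampled duration.
    nt = len(signals)
    ti = int(round(t / dt)) % nt
    return signals[ti]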
Exemplo n.º 16
0
def plan_hingeloss(queue, yinds, Z, c, E):
    from mako.template import Template
    from nengo_ocl.utils import as_ascii
    from nengo_ocl.plan import Plan

    m, n = Z.shape
    assert n <= 32
    assert Z.shape == E.shape
    assert Z.elemstrides[1] == 1
    assert E.elemstrides[1] == 1
    assert yinds.shape == (m, )
    assert yinds.elemstrides[0] == 1
    assert c.shape == (m, )
    assert c.elemstrides[0] == 1

    text = """
        __kernel void fn(
            __global const ${yindstype} *yinds,
            __global const ${Ztype} *Z,
            __global ${ctype} *c,
            __global ${Etype} *E
        )
        {
            const int i = get_global_id(0);

            const ${yindstype} yi = yinds[i];
            __global const ${Ztype} *z = Z + i*${Zstride0};
            __global ${Etype} *e = E + i*${Estride0};

            ${yindstype} ti;
            ${Ztype} zj, zy, zt = -INFINITY;
            for (int j = 0; j < ${n}; j++) {
                e[j] = 0;
                zj = z[j];
                if (j == yi) {
                    zy = zj;
                } else if (zj > zt) {
                    zt = zj;
                    ti = j;
                }
            }

            ${Ztype} margin = zy - zt;
            if (margin < 1) {
                e[yi] = -1;
                e[ti] = 1;
            }
            c[i] = max(1 - margin, 0.0f);
        }
        """
    textconf = dict(yindstype=yinds.ctype,
                    Ztype=Z.ctype,
                    ctype=c.ctype,
                    Etype=E.ctype,
                    m=m,
                    n=n,
                    Zstride0=Z.elemstrides[0],
                    Estride0=E.elemstrides[0])
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    fn = cl.Program(queue.context, text).build().fn
    fn.set_args(*[arr.data for arr in (yinds, Z, c, E)])
    plan = Plan(queue, fn, gsize=(m, ))
    return plan
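
A plain NumPy reference of the same computation (multiclass hinge loss `c` and its subgradient sign pattern `E`) can be useful for checking the kernel; `yinds` holds the correct class index per row. The function name is illustrative.

import numpy as np

def hingeloss_reference(yinds, Z):
    m, n = Z.shape
    E = np.zeros_like(Z)
    c = np.zeros(m, dtype=Z.dtype)
    for i in range(m):
        yi = int(yinds[i])
        zy = Z[i, yi]                   # score of the correct class
        others = np.delete(Z[i], yi)    # scores of the other classes
        ti = int(np.argmax(others))
        ti += ti >= yi                  # map back to the original column
        margin = zy - Z[i, ti]
        if margin < 1:
            E[i, yi] = -1
            E[i, ti] = 1
        c[i] = max(1 - margin, 0.0)
    return c, E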
Exemplo n.º 17
0
def plan_direct(queue, code, init, input_names, inputs, output, tag=None):
    from . import ast_conversion

    assert len(input_names) == len(inputs)

    N = len(inputs[0])
    for x in inputs:
        assert len(x) == len(output)
    for x in inputs + [output]:
        assert (x.shape1s == 1).all() and (x.stride1s == 1).all()
        assert (x.stride0s == 1).all()

    input_types = [x.ctype for x in inputs]
    output_type = output.ctype

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void direct(
% for iname, itype in zip(input_names, input_types):
            __global const int *${iname}_starts__,
            __global const ${itype} *${iname}_data__,
% endfor
            __global const int *${oname}_starts__,
            __global ${otype} *${oname}_data__
        )
        {
            const int n = get_global_id(0);
            if (n >= ${N}) return;

% for iname, itype in zip(input_names, input_types):
            __global const ${itype} *${iname} =
                ${iname}_data__ + ${iname}_starts__[n];
% endfor
            __global ${otype} *${oname} =
                ${oname}_data__ + ${oname}_starts__[n];

            /////vvvvv USER DECLARATIONS BELOW vvvvv
${init}

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
${code}
            // END OF FUNC: put nothing after user code, since it can return
        }
        """

    textconf = dict(init=indent(init, 12),
                    code=indent(code, 12),
                    N=N, input_names=input_names, input_types=input_types,
                    oname=ast_conversion.OUTPUT_NAME, otype=output_type,
                    )
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = []
    for x in inputs:
        full_args.extend([x.cl_starts, x.cl_buf])
    full_args.extend([output.cl_starts, output.cl_buf])
    _fn = cl.Program(queue.context, text).build().direct
    _fn.set_args(*[arr.data for arr in full_args])

    gsize = (N,)
    rval = Plan(queue, _fn, gsize, lsize=None, name="cl_direct", tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    rval.description = (
        "groups: %d; items: %d; items/group: %0.1f [%d, %d]" %
        (len(output), output.sizes.sum(),
         output.sizes.mean(), output.sizes.min(), output.sizes.max()))
    return rval
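
Conceptually, the generated kernel applies the user's function once per operator. A host-side sketch of that picture, where the hypothetical `fn` stands in for the code produced by ast_conversion:

def direct_reference(fn, inputs_list, output):
    # One "work item" per operator: apply the user function to that
    # operator's slice of each input and write its output slice.
    for n in range(len(output)):
        args = [x[n] for x in inputs_list]
        output[n] = fn(*args)
    return output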
Exemplo n.º 18
0
def reduce_impl(
    p,
    items,
    group_size=None,
    segment_size=None,
):

    #
    # Target use case: long inner products, small numbers of dots.
    #
    # Approach: each work-group computes a small number of gemv outputs
    #

    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    if p.clra_beta is not None:
        raise NotImplementedError()
    if p.cl_alpha is not None:
        raise NotImplementedError()
    if p.cl_gamma is not None:
        raise NotImplementedError()
    if not all(s == 1 for s in p.A.stride1s):
        raise NotImplementedError()

    assert p.float_alpha is not None
    assert p.float_gamma is not None

    cl_gstructure, textconf = p.cl_geometry_and_textconf(items)
    max_n_dots = max([len(p.geometry[ii]['dots']) for ii in items])
    max_reduce_len = max(
        max([gg['a_shape1'] for gg in p.geometry[ii]['dots']]) for ii in items)
    max_y_len = max([p.geometry[ii]['y_len'] for ii in items])

    # segment means the piece of Y written by a work-group
    # group_size is the number of values that we're reducing over

    if len(items) < 4:
        if group_size is None:
            group_size = 32  # XXX
        if segment_size is None:
            segment_size = min(max_y_len, 2)  # XXX
    else:
        if group_size is None:
            group_size = 32  # XXX
        if segment_size is None:
            segment_size = min(max_y_len, 4)  # XXX
    g_segments = int(np.ceil(float(max_y_len) / segment_size))
    gsize = (group_size, g_segments * segment_size, len(items))
    lsize = (group_size, segment_size, 1)

    max_reduce_iters = int(np.ceil(float(max_reduce_len) / group_size))
    textconf.update({
        'n_items': len(items),
        'gsize': gsize,
        'segment_size': segment_size,
        'max_y_len': max_y_len,
        'group_size': group_size,
        'local_count': group_size * segment_size,
        'max_reduce_len': max_reduce_len,
        'N_cutoff': max_reduce_iters * group_size,
        'max_n_dots': max_n_dots,
    })
    if 0:
        for k, v in textconf.items():
            print(k, v)

    textconf.update(p.__dict__)

    text = """
        __kernel void gemv_reduce(
            const __global int *gstructure,
            const __global ${A.cl_buf.ctype} *A_data,
            const __global ${X.cl_buf.ctype} *X_data,
            % if cl_beta is not None:
            const __global ${cl_beta.ctype} * betas,
            % endif
            const __global ${Y_in.cl_buf.ctype} *Y_in_data,
            __global ${Y.cl_buf.ctype} *Y_data)
    {
        __local int lstructure[${n_structure_vars}];
    % if segment_size > 1:
        // we'll cache X in shared memory so we load it only once
        // for the whole segment
        __local ${X.cl_buf.ctype} lX[${group_size}];
    % endif
        //Scratch space for the dot products
        __local ${Y.cl_buf.ctype}
            partialDotProduct[${segment_size}][${group_size}];
        __local ${Y.cl_buf.ctype}
            y_sum_pre[${segment_size}];
        const int local_idx = get_local_id(0)
            + get_local_id(1) * get_local_size(0);

        // load structure
    % if local_count < n_structure_vars:
        for (int ii = local_idx;
                 ii < ${n_structure_vars};
                 ii += ${local_count})
        {
            lstructure[ii] = gstructure[
                get_global_id(2) * ${structure_vars_stride} + ii];
        }
    % else :
        if (local_idx < ${n_structure_vars})
        {
            lstructure[local_idx] = gstructure[
                get_global_id(2) * ${structure_vars_stride} + local_idx];
        }
    % endif
        barrier(CLK_LOCAL_MEM_FENCE);

        if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len}))
        {
    % if float_beta is not None and float_beta != 0 :
            y_sum_pre[get_local_id(1)] = ${float_beta}
                * Y_in_data[${y_in_starts} + get_global_id(1)];
    % elif cl_beta is not None:
            y_sum_pre[get_local_id(1)] = betas[${bb}]
                * Y_in_data[${y_in_starts} + get_global_id(1)];
    % else :
            y_sum_pre[get_local_id(1)] = 0;
    % endif

    % if float_gamma is not None and float_gamma != 0:
            y_sum_pre[get_local_id(1)] += ${float_gamma};
    % endif
    // printf("betaY + gamma=%f\\n", y_sum_pre[get_local_id(1)]);
        }

        partialDotProduct[get_local_id(1)][get_local_id(0)] = 0;
    % if max_n_dots > 1:
        for (int ii = 0;
                 ii < ${n_dot_products};
                 ii += 1)
        {
    % else:
        const int ii = 0;
    % endif


        for (int nn = get_local_id(0);
                 nn < ${N_cutoff};
                 nn += get_local_size(0))
        {
    // segment_size = ${segment_size}
    % if (segment_size == 1):
            if ((nn < ${N_i}) && (get_global_id(1) < ${y_len}))
            {
            partialDotProduct[get_local_id(1)][get_local_id(0)] +=
                A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn]
                * X_data[${x_starts} + nn];
            }
    % else:
            barrier(CLK_LOCAL_MEM_FENCE);
            if ((get_local_id(1) == 0) && (nn < ${N_i}))
            {
                lX[get_local_id(0)] = X_data[${x_starts} + nn];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if ((nn < ${N_i}) && (get_global_id(1) < ${y_len}))
            {
            partialDotProduct[get_local_id(1)][get_local_id(0)] +=
                A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn]
                * lX[get_local_id(0)];
            }
    % endif
        }

    % if (max_n_dots > 1):
        }
    % endif

        // -- Parallel reduction long work-group dimension 0
        for (uint stride = 1;
                  stride < get_local_size(0);
                  stride *= 2)
        {
            barrier(CLK_LOCAL_MEM_FENCE);

            uint index = 2 * stride * get_local_id(0);
            if (index + stride < get_local_size(0))
            {
                partialDotProduct[get_local_id(1)][index] +=
                    partialDotProduct[get_local_id(1)][index + stride];
            }
        }
        // barrier(CLK_LOCAL_MEM_FENCE);
        if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len})) {
            Y_data[${y_offset} + get_global_id(1)] = y_sum_pre[get_local_id(1)]
                + ${float_alpha} * partialDotProduct[get_local_id(1)][0];
        }
    }
        """

    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    fn = cl.Program(p.queue.context, text).build().gemv_reduce

    full_args = [
        cl_gstructure,
        p.A.cl_buf,
        p.X.cl_buf,
    ]
    if p.cl_beta is not None:
        full_args += [p.cl_beta]
    full_args += [
        p.Y_in.cl_buf,
        p.Y.cl_buf,
    ]

    fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(
        p.queue,
        fn,
        gsize,
        lsize,
        name='clra_gemv.reduce_impl',
        tag=p.tag,
        bw_per_call=bw_from_geometry(p.geometry, items),
        flops_per_call=flops_from_geometry(p.geometry, items),
    )
    rval.full_args = full_args  # prevent GC the args
    rval.description = p.geometry_summary(items)
    return rval
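
The in-group reduction over work-group dimension 0 is a standard stride-doubling tree. A small Python sketch of the same index pattern, handy for sanity-checking the kernel (the function name is illustrative):

def reduction_tree(partial):
    # partial: per-thread partial dot products (length = group_size).
    # Pairwise adds with doubling stride leave the group total in partial[0].
    partial = list(partial)
    stride = 1
    while stride < len(partial):
        for index in range(0, len(partial), 2 * stride):
            if index + stride < len(partial):
                partial[index] += partial[index + stride]
        stride *= 2
    return partial[0]

# e.g. reduction_tree(range(32)) == sum(range(32)) == 496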
Exemplo n.º 19
0
def _plan_template(queue, name, core_text, declares="", tag=None, n_elements=0,
                   inputs={}, outputs={}, parameters={}):
    """Template for making a plan for vector nonlinearities.

    This template assumes that all inputs and outputs are vectors.

    Parameters
    ----------
    n_elements: int
        If n_elements == 0, the work items are allocated as a block (a 2-D
        grid of maximum vector length by number of vectors). This is simple,
        but can be slow for large computations where input vector sizes are
        not uniform (e.g. one large population and many small ones).
        If n_elements >= 1, all the vectors in the RaggedArray are flattened
        so that the exact number of required work items is allocated. Each
        work item performs computations for `n_elements` elements.

    inputs: dictionary of CLRaggedArrays
        Inputs to the function. RaggedArrays must be a list of vectors.

    outputs: dictionary of CLRaggedArrays
        Outputs of the function. RaggedArrays must be a list of vectors.

    parameters: dictionary of CLRaggedArrays
        Parameters to the function. Each RaggedArray element must be a vector
        of the same length as the inputs, or a scalar (which is broadcast).
        Providing a float instead of a RaggedArray makes that parameter
        constant.

    """
    input0 = list(inputs.values())[0]   # input to use as reference for lengths
    N = len(input0)

    # split parameters into static and updated params
    static_params = {}  # static params (hard-coded)
    params = {}  # variable params (updated)
    for k, v in parameters.items():
        if isinstance(v, CLRaggedArray):
            params[k] = v
        else:
            try:
                static_params[k] = ('float', float(v))
            except TypeError:
                raise TypeError(
                    "parameter %r must be a CLRaggedArray or a float" % k)

    avars = {}
    bw_per_call = 0
    for vname, v in list(inputs.items()) + list(outputs.items()):
        assert vname not in avars, "Name clash"
        assert len(v) == N
        assert (v.shape0s == input0.shape0s).all()
        assert (v.stride0s == v.shape1s).all()  # rows contiguous
        assert (v.stride1s == 1).all()  # columns contiguous
        assert (v.shape1s == 1).all()  # vectors only

        offset = '%(name)s_starts[gind1]' % {'name': vname}
        avars[vname] = (v.ctype, offset)
        bw_per_call += v.nbytes

    for vname, v in params.items():
        assert vname not in avars, "Name clash"
        assert len(v) == N
        assert ((v.shape0s == input0.shape0s) | (v.shape0s == 1)).all()
        assert (v.stride0s == v.shape1s).all()  # rows contiguous
        assert (v.stride1s == 1).all()  # columns contiguous
        assert (v.shape1s == 1).all()  # vectors only

        offset = '%(name)s_starts[gind1]' % {'name': vname}
        avars[vname] = (v.ctype, offset)
        bw_per_call += v.nbytes

    ivars = dict((k, avars[k]) for k in inputs.keys())
    ovars = dict((k, avars[k]) for k in outputs.keys())
    pvars = dict((k, avars[k]) for k in params.keys())

    fn_name = "%s_%d" % (name, n_elements)
    textconf = dict(fn_name=fn_name, N=N, n_elements=n_elements,
                    declares=declares, core_text=core_text,
                    ivars=ivars, ovars=ovars, pvars=pvars,
                    static_params=static_params)

    if n_elements > 0:
        # Allocate the exact number of required kernels in a vector
        gsize = (int(np.ceil(np.sum(input0.shape0s) / float(n_elements))),)
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void ${fn_name}(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            int gind0 = get_global_id(0) * ${n_elements};
            int gind1 = 0;
            while (gind0 >= lengths[gind1]) {
                gind0 -= lengths[gind1];
                gind1++;
            }
            if (gind1 >= ${N}) return;

% for name, [type, offset] in ivars.items():
            __global const ${type} *cur_${name} =
                in_${name} + ${offset} + gind0;
% endfor
% for name, [type, offset] in ovars.items():
            __global ${type} *cur_${name} = in_${name} + ${offset} + gind0;
% endfor
% for name, [type, offset] in pvars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset};
            int ${name}_isvector = ${name}_shape0s[gind1] > 1;
            if (${name}_isvector) cur_${name} += gind0;
% endfor
% for name, [type, offset] in \
        list(ivars.items()) + list(ovars.items()) + list(pvars.items()):
            ${type} ${name};
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

% for ii in range(n_elements):
            //////////////////////////////////////////////////
            ////////// LOOP ITERATION ${ii}
  % for name, [type, offset] in ivars.items():
            ${name} = *cur_${name};
  % endfor
  % for name, [type, offset] in pvars.items():
            if ((${ii} == 0) || ${name}_isvector) ${name} = *cur_${name};
  % endfor

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

  % for name, [type, offset] in ovars.items():
            *cur_${name} = ${name};
  % endfor

  % if ii + 1 < n_elements:
            gind0++;
            if (gind0 >= lengths[gind1]) {
                gind1++;
                gind0 = 0;
                if (gind1 >= ${N}) return;

    % for name, [_, offset] in \
        list(ivars.items()) + list(ovars.items()) + list(pvars.items()):
                cur_${name} = in_${name} + ${offset};
    % endfor
    % for name, _ in pvars.items():
                ${name}_isvector = ${name}_shape0s[gind1] > 1;
                if (!${name}_isvector) ${name} = *cur_${name};
    % endfor
            } else {
    % for name, _ in list(ivars.items()) + list(ovars.items()):
                cur_${name}++;
    % endfor
    % for name, _ in pvars.items():
                if (${name}_isvector) cur_${name}++;
    % endfor
            }
  % endif
% endfor
        }
        """
    else:
        # Allocate more than enough kernels in a matrix
        gsize = (int(np.max(input0.shape0s)), int(N))
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void ${fn_name}(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int gind0 = get_global_id(0);
            const int gind1 = get_global_id(1);
            if (gind0 >= lengths[gind1]) return;

% for name, [type, offset] in ivars.items():
            ${type} ${name} = in_${name}[${offset} + gind0];
% endfor
% for name, [type, offset] in ovars.items():
            ${type} ${name};
% endfor
% for name, [type, offset] in pvars.items():
            const ${type} ${name} = (${name}_shape0s[gind1] > 1) ?
                in_${name}[${offset} + gind0] : in_${name}[${offset}];
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

% for name, [type, offset] in ovars.items():
            in_${name}[${offset} + gind0] = ${name};
% endfor
        }
        """

    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))
    if 0:
        for i, line in enumerate(text.split('\n')):
            print("%3d %s" % (i + 1, line))

    full_args = []
    for vname, v in list(inputs.items()) + list(outputs.items()):
        full_args.extend([v.cl_starts, v.cl_buf])
    for vname, v in params.items():
        full_args.extend([v.cl_starts, v.cl_shape0s, v.cl_buf])
    full_args.append(input0.cl_shape0s)
    full_args = tuple(full_args)

    fns = cl.Program(queue.context, text).build()
    _fn = getattr(fns, fn_name)
    _fn.set_args(*[arr.data for arr in full_args])

    rval = Plan(queue, _fn, gsize, lsize=None, name=name, tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    rval.bw_per_call = bw_per_call
    rval.description = ("groups: %d; items: %d; items/group: %0.1f [%d, %d]" %
                        (N, input0.sizes.sum(), input0.sizes.mean(),
                         input0.sizes.min(), input0.sizes.max()))
    return rval
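
As a hypothetical usage sketch (not from the source), a rectified-linear element-wise plan could be built on this template roughly as follows; `x` and `y` are the names the template binds to the current vector element, and the CLRaggedArray arguments are assumed to satisfy the template's shape requirements.

def plan_relu(queue, X, Y, tag=None):
    # X, Y: CLRaggedArrays of vectors with matching lengths (assumption).
    return _plan_template(
        queue, "cl_relu",
        core_text="y = x > 0 ? x : 0;",
        inputs={'x': X}, outputs={'y': Y}, tag=tag)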
Exemplo n.º 20
0
def block_impl(p, items):

    assert p.float_alpha == 1.0
    assert p.float_beta == 1.0
    assert p.float_gamma == 0.0

    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    if p.clra_beta is not None:
        raise NotImplementedError()
    if p.cl_alpha is not None:
        raise NotImplementedError()
    if p.cl_gamma is not None:
        raise NotImplementedError()
    if not all(s == 1 for s in p.A.stride1s):
        raise NotImplementedError()

    if p.A_js is None:
        # -- easy probably, but not done
        raise NotImplementedError()

    # --- blocking
    # We want to group the dot products into blocks, so that each workgroup
    # is computing a (block_y, block_x) region of a dot product. To do this,
    # we create a temporary output buffer, compute each block to a separate
    # region of this buffer, then reduce across the buffer in a separate kernel

    # block_y = 8
    block_y = 32
    # block_x = 32
    block_x = 128

    shape0s = []
    shape1s = []
    Astride0s = []
    Astride1s = []
    Astarts = []
    Xstride0s = []
    Xstarts = []
    Ybufstarts = []
    Ybufstart = 0

    Yshape0s_reduce = []
    Yinstride0s_reduce = []
    Yinstarts_reduce = []
    Ystride0s_reduce = []
    Ystarts_reduce = []
    Ybufinds_reduce = []
    bw_reduce = 0

    for n in items:
        assert p.Y_in.shape0s[n] == p.Y.shape0s[n]
        shape0n = p.Y.shape0s[n]

        for i in range(0, shape0n, block_y):
            shape0i = min(shape0n - i, block_y)

            Ybufind_reduce = []

            # loop over dot products outputting to same Y
            n_dots = len(p.A_js[n])
            assert len(p.A_js[n]) == len(p.X_js[n])
            for aj, xj in zip(p.A_js[n], p.X_js[n]):
                assert aj.size == 1 and xj.size == 1
                aj, xj = aj[0], xj[0]  # to ignore numpy DeprecationWarning

                assert p.A.shape0s[aj] == shape0n
                assert p.A.shape1s[aj] == p.X.shape0s[xj]
                assert p.X.shape1s[xj] == 1
                shape1n = p.A.shape1s[aj]

                for j in range(0, shape1n, block_x):
                    shape0s.append(shape0i)
                    shape1s.append(min(shape1n - j, block_x))
                    Astride0s.append(p.A.stride0s[aj])
                    Astride1s.append(p.A.stride1s[aj])
                    Astarts.append(p.A.starts[aj] + i * p.A.stride0s[aj] +
                                   j * p.A.stride1s[aj])
                    Xstride0s.append(p.X.stride0s[xj])
                    Xstarts.append(p.X.starts[xj] + j * p.X.stride0s[xj])

                    Ybufstarts.append(Ybufstart)
                    Ybufind_reduce.append(Ybufstart)
                    # Ybufstart += shape0s[-1]
                    Ybufstart += block_y  # advance a full block so buffer offsets stay aligned

            # --- Y-blocking for reduce
            Yshape0s_reduce.append(shape0i)
            Yinstride0s_reduce.append(p.Y_in.stride0s[n])
            Yinstarts_reduce.append(p.Y_in.starts[n] + i * p.Y_in.stride0s[n])
            Ystride0s_reduce.append(p.Y.stride0s[n])
            Ystarts_reduce.append(p.Y.starts[n] + i * p.Y.stride0s[n])
            Ybufinds_reduce.append(Ybufind_reduce)
            bw_reduce += shape0i * (len(Ybufind_reduce) +
                                    1) * p.Y.dtype.itemsize

    # --- create structure
    gstructure = np.column_stack([
        shape0s, shape1s, Astride0s, Astride1s, Astarts, Xstride0s, Xstarts,
        Ybufstarts
    ])
    cl_gstructure = to_device(p.queue, gstructure.astype(np.int32))

    # --- create Y buffer
    clYbuf = to_device(p.queue, np.zeros(Ybufstart, dtype=p.Y.dtype))

    lsize0 = 4
    # lsize0 = 8
    lsize0_log2 = int(np.log2(lsize0))
    assert 2**lsize0_log2 == lsize0

    lsize = (lsize0, block_y, 1)
    gsize = (lsize[0], lsize[1], gstructure.shape[0])
    assert np.prod(lsize) >= block_x

    textconf = dict(
        A=p.A,
        X=p.X,
        Ybuf=clYbuf,
        n_structure_vars=gstructure.shape[1],
        shape0='lstructure[0]',
        shape1='lstructure[1]',
        Astride0='lstructure[2]',
        Astride1='lstructure[3]',
        Astart='lstructure[4]',
        Xstride0='lstructure[5]',
        Xstart='lstructure[6]',
        Ybufstart='lstructure[7]',
        block_y=block_y,
        block_x=block_x,
        lsize0=lsize0,
        lsize0_log2=lsize0_log2,
    )

    full_args = (
        cl_gstructure,
        p.A.cl_buf,
        p.X.cl_buf,
        clYbuf,
    )

    source = """
    __kernel void fn(
        __global const int *gstructure,
        __global const ${A.ctype} *Adata,
        __global const ${X.ctype} *Xdata,
        __global ${Ybuf.ctype} *Ybufdata
        )
    {
        const int j = get_global_id(0);
        const int i = get_global_id(1);
        const int n = get_global_id(2);

        // load structure
        __local int lstructure[${n_structure_vars}];
        const int local_idx = get_local_id(0) + get_local_id(1)*get_local_size(0);
        if (local_idx < ${n_structure_vars})
            lstructure[local_idx] = gstructure[
                n * ${n_structure_vars} + local_idx];
        barrier(CLK_LOCAL_MEM_FENCE);

        __global const ${X.ctype} *x = Xdata + ${Xstart};
        __global ${Ybuf.ctype} *ybuf = Ybufdata + ${Ybufstart};

        // load x into local memory
        __local ${X.ctype} xlocal[${block_x}];
        if (local_idx < ${shape1})
            xlocal[local_idx] = x[local_idx*${Xstride0}];
        barrier(CLK_LOCAL_MEM_FENCE);

        __local ${Ybuf.ctype} sums[${block_y}][${lsize0}];
        sums[i][j] = 0;

        if (i < ${shape0}) {
            __global const ${A.ctype} *Ai = Adata + ${Astart} + i*${Astride0};
            for(int jj = j; jj < ${shape1}; jj += get_global_size(0)) {
                sums[i][j] += Ai[jj*${Astride1}] * xlocal[jj];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);

    % for k in range(lsize0_log2 - 1, 0, -1):
        if (j < ${2**k})
            sums[i][j] += sums[i][${2**k} + j];
        barrier(CLK_LOCAL_MEM_FENCE);
    % endfor
        if (i < ${shape0} && j == 0)
            ybuf[i] = sums[i][0] + sums[i][1];
    }
    """

    source = Template(source, output_encoding='ascii').render(**textconf)
    kernel = cl.Program(p.queue.context, source).build().fn
    kernel.set_args(*[arr.data for arr in full_args])

    plan = Plan(
        p.queue,
        kernel,
        gsize,
        lsize,
        name='clra_gemv.block_impl',
        tag=p.tag,
        bw_per_call=bw_from_geometry(p.geometry, items),
        flops_per_call=flops_from_geometry(p.geometry, items),
    )
    plan.full_args = full_args  # prevent GC the args
    plan.description = p.geometry_summary(items)
    plan.Ybuf = clYbuf

    # --- Reduce kernel
    align = False

    Nreduce = len(Yshape0s_reduce)
    clYshape0s_reduce = to_device(p.queue,
                                  np.array(Yshape0s_reduce, dtype=np.int32))
    clYinstride0s_reduce = to_device(
        p.queue, np.array(Yinstride0s_reduce, dtype=np.int32))
    clYinstarts_reduce = to_device(p.queue,
                                   np.array(Yinstarts_reduce, dtype=np.int32))
    clYstride0s_reduce = to_device(p.queue,
                                   np.array(Ystride0s_reduce, dtype=np.int32))
    clYstarts_reduce = to_device(p.queue,
                                 np.array(Ystarts_reduce, dtype=np.int32))
    clYbufinds_reduce = CLRaggedArray.from_arrays(p.queue,
                                                  Ybufinds_reduce,
                                                  dtype=np.int32,
                                                  align=align)
    assert len(clYbufinds_reduce) == Nreduce
    assert (clYbufinds_reduce.shape1s == 1).all()

    textconf_reduce = dict(
        Ybuf=clYbuf,
        Yin=p.Y_in,
        Y=p.Y,
    )

    full_args_reduce = (
        clYshape0s_reduce,
        clYbufinds_reduce.cl_shape0s,
        clYbufinds_reduce.cl_starts,
        clYbufinds_reduce.cl_buf,
        clYbuf,
        clYinstride0s_reduce,
        clYinstarts_reduce,
        p.Y_in.cl_buf,
        clYstride0s_reduce,
        clYstarts_reduce,
        p.Y.cl_buf,
    )

    lsize_reduce = None
    gsize_reduce = (block_y, Nreduce)

    source_reduce = """
    __kernel void reduce(
        __global const int *shape0s,
        __global const int *Ishape0s,
        __global const int *Istarts,
        __global const int *Idata,
        __global ${Ybuf.ctype} *Ybufdata,
        __global const int *Yinstride0s,
        __global const int *Yinstarts,
        __global ${Yin.ctype} *Yindata,
        __global const int *Ystride0s,
        __global const int *Ystarts,
        __global ${Y.ctype} *Ydata
    )
    {
        const int i = get_global_id(0);
        const int n = get_global_id(1);
        if (i >= shape0s[n])
            return;

        const int Ishape0 = Ishape0s[n];

        __global const int *Ybufstart = Idata + Istarts[n];
        __global ${Yin.ctype} *yin = Yindata + Yinstarts[n];
        __global ${Y.ctype} *y = Ydata + Ystarts[n];

        ${Y.ctype} sum = yin[i*Yinstride0s[n]];
        for (int j = 0; j < Ishape0; j++) {
            sum += Ybufdata[Ybufstart[j] + i];
        }

        y[i*Ystride0s[n]] = sum;
    }
    """

    source_reduce = Template(source_reduce,
                             output_encoding='ascii').render(**textconf_reduce)
    kernel_reduce = cl.Program(p.queue.context, source_reduce).build().reduce
    kernel_reduce.set_args(*[arr.data for arr in full_args_reduce])

    plan_reduce = Plan(
        p.queue,
        kernel_reduce,
        gsize_reduce,
        lsize_reduce,
        name='clra_gemv.block_impl_reduce',
        tag=p.tag,
        bw_per_call=bw_reduce,
    )
    plan_reduce.full_args = full_args_reduce  # prevent GC the args
    # plan_reduce.description = p.geometry_summary(items)

    return [plan, plan_reduce]
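
In NumPy terms, for a single dot product the two kernels together compute y = y_in + A @ x by blocking. A sketch of that decomposition (names are illustrative):

import numpy as np

def blocked_gemv_reference(A, x, y_in, block_y=32, block_x=128):
    m, n = A.shape
    partials = []  # one (row offset, partial result) entry per block
    for i in range(0, m, block_y):
        for j in range(0, n, block_x):
            partials.append((i, A[i:i+block_y, j:j+block_x] @ x[j:j+block_x]))
    y = np.array(y_in, dtype=float)
    for i, partial in partials:        # the reduce kernel's job
        y[i:i+len(partial)] += partial
    return y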
Exemplo n.º 21
0
def plan_sparse_dot_inc(queue,
                        A_indices,
                        A_indptr,
                        A_data,
                        X,
                        Y,
                        inc=False,
                        tag=None):
    """Implements a sparse matrix-vector multiply: Y += A * X or Y = A * X

    Parameters
    ----------
    A_indices, A_indptr : PyOpenCL array
        Compressed sparse row (CSR) index arrays for the matrix
    A_data : PyOpenCL array
        Matrix values at those indices
    X, Y : CLRaggedArrays of length 1
        Input/output data.
    inc : bool
        Whether to increment ``Y`` (True), or set it (False).

    Notes
    -----
    This function crashes when there are >10M nonzero weights. A potential solution
    would be some way to tell each work item to do multiple rows.
    """
    assert len(X) == len(Y) == 1

    for arr in [X, Y]:
        assert (arr.stride1s == 1).all()
        if not ((arr.shape1s == 1).all() and (arr.stride0s == 1).all()):
            raise NotImplementedError(
                "OCL SparseDot only supports matrix-vector currently, not matrix-matrix"
            )

    for arr in [A_indices, A_indptr, A_data]:
        assert len(arr.shape) == 1
        assert arr.strides[0] == arr.dtype.itemsize  # contiguous

    assert A_indices.size == A_data.size

    assert A_data.ctype == X.ctype == Y.ctype
    assert A_indices.ctype == A_indptr.ctype == "int"

    kern = """
    __kernel void sparsedot_inc(
        __global const int *A_indices,
        __global const int *A_indptr,
        __global const ${dtype} *A_data,
        __global const int *Xstarts,
        __global const ${dtype} *Xdata,
        __global const int *Ystarts,
        __global ${dtype} *Ydata
    )
    {
        // n can later be used to keep track of multiple arrays
        const int n = 0;
        const int irow = get_global_id(0);

        __global const ${dtype} *x = Xdata + Xstarts[n];
        __global ${dtype} *y = Ydata + Ystarts[n];

    %if not inc:
        y[irow] = 0;
    %endif
        const int end = A_indptr[irow + 1];
        for (int k = A_indptr[irow]; k < end; k++) {
            y[irow] += A_data[k] * x[A_indices[k]];
        }
    }
    """
    textconf = dict(dtype=A_data.ctype, IndType=A_indices.ctype, inc=inc)
    text = as_ascii(Template(kern, output_encoding="ascii").render(**textconf))
    full_args = (
        A_indices.base_data,
        A_indptr.base_data,
        A_data.base_data,
        X.cl_starts.data,
        X.cl_buf.data,
        Y.cl_starts.data,
        Y.cl_buf.data,
    )
    _fn = cl.Program(queue.context, text).build().sparsedot_inc
    _fn.set_args(*full_args)

    gsize = (Y.sizes[0], 1)  # this only works for a single operation
    lsize = None
    plan = Plan(queue, _fn, gsize, lsize=lsize, name="cl_sparsedot", tag=tag)
    plan.full_args = full_args  # prevent garbage-collection
    plan.flops_per_call = 2 * A_data.size
    plan.bw_per_call = A_data.nbytes * 3 + A_indices.nbytes + A_indptr.nbytes
    plan.description = "groups: %d; shape: (%d, %d); nonzeros: %d" % (
        1,
        Y.sizes[0],
        X.sizes[0],
        A_data.size,
    )
    return plan
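
For reference, each work item performs one row of a standard CSR matrix-vector product. A NumPy sketch of the whole operation (the function name is illustrative):

import numpy as np

def csr_matvec_reference(A_indptr, A_indices, A_data, x, y, inc=True):
    # y += A @ x (or y = A @ x when inc is False), with A given in CSR form.
    for irow in range(len(A_indptr) - 1):
        if not inc:
            y[irow] = 0
        start, end = A_indptr[irow], A_indptr[irow + 1]
        y[irow] += np.dot(A_data[start:end], x[A_indices[start:end]])
    return y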
Exemplo n.º 22
0
def plan_stp(queue,
             calcium,
             resources,
             weights,
             delta,
             alpha,
             init_weights,
             tag=None):
    assert (len(calcium) == len(resources) == len(weights) == len(delta) ==
            alpha.size == len(init_weights))
    N = len(calcium)

    for arr in (calcium, resources):  # vectors
        assert (arr.shape1s == 1).all()
    for arr in (delta, weights, init_weights):  # matrices
        assert (arr.stride1s == 1).all()

    #assert (resources.shape0s == weights.shape0s).all()
    #assert (calcium.shape0s == weights.shape1s).all()
    assert (weights.shape0s == delta.shape0s).all()
    assert (weights.shape1s == delta.shape1s).all()
    assert (weights.shape0s == init_weights.shape0s).all()
    assert (weights.shape1s == init_weights.shape1s).all()

    assert (calcium.ctype == resources.ctype == weights.ctype == delta.ctype ==
            alpha.ctype == init_weights.ctype)

    text = """
    __kernel void stp(
        __global const int *shape0s,
        __global const int *shape1s,
        __global const int *calcium_stride0s,
        __global const int *calcium_starts,
        __global const ${type} *calcium_data,
        __global const int *resources_stride0s,
        __global const int *resources_starts,
        __global const ${type} *resources_data,
        __global const int *weights_stride0s,
        __global const int *weights_starts,
        __global const ${type} *weights_data,
        __global const int *delta_stride0s,
        __global const int *delta_starts,
        __global ${type} *delta_data,
        __global const ${type} *alphas,
        __global const int *init_weights_stride0s,
        __global const int *init_weights_starts,
        __global const ${type} *init_weights_data
    )
    {
        const int ij = get_global_id(0);
        const int k = get_global_id(1);
        const int shape0 = shape0s[k];
        const int shape1 = shape1s[k];
        const int i = ij / shape1;
        const int j = ij % shape1;
        __global ${type} *delta = delta_data + delta_starts[k];
        const ${type} calcium = calcium_data[calcium_starts[k] + i*calcium_stride0s[k]];
        const ${type} resources = resources_data[resources_starts[k] + i*resources_stride0s[k]];
        const ${type} weight = weights_data[
            weights_starts[k] + i*weights_stride0s[k]+j];
        const ${type} alpha = alphas[k];
        const ${type} init_weights = init_weights_data[init_weights_starts[k] + i*init_weights_stride0s[k]+j];
        if (i < shape0) {
            delta[i*delta_stride0s[k] + j] =
                (calcium * resources / 0.2) * init_weights - weight;
        }
    }
    """

    textconf = dict(type=calcium.ctype)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = (
        delta.cl_shape0s,
        delta.cl_shape1s,
        calcium.cl_stride0s,
        calcium.cl_starts,
        calcium.cl_buf,
        resources.cl_stride0s,
        resources.cl_starts,
        resources.cl_buf,
        weights.cl_stride0s,
        weights.cl_starts,
        weights.cl_buf,
        delta.cl_stride0s,
        delta.cl_starts,
        delta.cl_buf,
        alpha,
        init_weights.cl_stride0s,
        init_weights.cl_starts,
        init_weights.cl_buf,
    )
    _fn = cl.Program(queue.context, text).build().stp
    _fn.set_args(*[arr.data for arr in full_args])

    lsize = None
    gsize = (delta.sizes.max(), N)
    plan = Plan(queue, _fn, gsize, lsize=lsize, name="cl_stp", tag=tag)
    plan.full_args = full_args  # prevent garbage-collection
    plan.flops_per_call = 6 * delta.sizes.sum()
    plan.bw_per_call = (calcium.nbytes + resources.nbytes + weights.nbytes +
                        delta.nbytes + alpha.nbytes + init_weights.nbytes)
    return plan
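
Following the kernel's indexing (both calcium and resources are read per row i), the per-connection update amounts to the NumPy sketch below; the 0.2 factor is the hard-coded scaling from the kernel, and the function name is illustrative.

import numpy as np

def stp_delta_reference(calcium, resources, weights, init_weights):
    # delta[i, j] = (calcium[i] * resources[i] / 0.2) * init_weights[i, j]
    #               - weights[i, j]
    scale = (calcium * resources / 0.2)[:, None]
    return scale * init_weights - weights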
Exemplo n.º 23
0
def ref_impl(p, items):
    """
    Return an OpenCL function to calculate elements `items` of
    gemv operation `p`.

    In this reference implementation, we create a work item
    per output number, or more specifically, a work grid
    of (max_y_len, len(items)).  Each work item loops over the
    dot products and the elements within each dot product to
    compute the output value Y[global_id(1)][global_id(0)].

    """

    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    cl_items = to_device(p.queue,
                         np.asarray(items, dtype='int32'))
    if 0:
        if len(items) < 10:
            print('Falling back on reference implementation')
            p.print_geometry_summary(items, full=True)
        else:
            print('Falling back on reference implementation')
            p.print_geometry_summary(items)

    assert all(s == 1 for s in p.A.stride1s)
    assert all(s == 1 for s in p.X.stride1s)
    assert all(s == 1 for s in p.Y.stride0s)
    assert all(s == 1 for s in p.Y.stride1s)
    assert all(s == 1 for s in p.Y_in.stride0s)
    assert all(s == 1 for s in p.Y_in.stride1s)

    text = """
        __kernel void gemv_ref(
            __global int *items,
    % if cl_alpha is not None:
            __global ${cl_alpha.ctype} * alphas,
    % endif
    % if (A_js is not None):
            __global int *A_starts,
            __global int *A_shape1s,
            __global int *A_stride0s,
            __global ${A.cl_buf.ctype} *A_data,
            __global int *A_js_starts,
            __global int *A_js_shape0s,
            __global int *A_js_data,
            __global int *X_starts,
            __global int *X_stride0s,
            __global ${X.cl_buf.ctype} *X_data,
            __global int *X_js_starts,
            __global int *X_js_data,
    % endif
    % if cl_beta is not None:
            __global ${cl_beta.ctype} * betas,
    % endif
    % if clra_beta is not None:
            __global int *beta_starts,
            __global int *beta_data,
    % endif
    % if cl_gamma is not None:
            __global ${cl_gamma.ctype} * gammas,
    % endif
            __global int *Y_in_starts,
            __global ${Y_in.cl_buf.ctype} *Y_in_data,
            __global int *Y_starts,
            __global int *Y_shape0s,
            __global ${Y.cl_buf.ctype} *Y_data)
        {
            const int mm = get_global_id(0);
            const int bb = items[get_global_id(1)];
            const int M = Y_shape0s[bb];
            if (mm < M)
            {
                const int y_offset = Y_starts[bb];
                const int y_in_offset = Y_in_starts[bb];

    % if float_beta is not None:
                const ${Y.cl_buf.ctype} beta = ${float_beta};
    % elif cl_beta is not None:
                const ${cl_beta.ctype} beta = betas[bb];
    % elif clra_beta is not None:
                const int beta_offset = beta_starts[bb];
                const ${clra_beta.cl_buf.ctype} beta
                    = beta_data[beta_offset + mm];
    % endif

    % if float_gamma is not None:
                const ${Y.cl_buf.ctype} gamma = ${float_gamma};
    % elif cl_gamma is not None:
                const ${cl_gamma.ctype} gamma = gammas[bb];
    % endif

                Y_data[y_offset + mm] =
                    gamma + beta * Y_in_data[y_in_offset + mm];

    % if A_js is not None:
                const int n_dot_products = A_js_shape0s[bb];
                X_js_data += X_js_starts[bb];
                A_js_data += A_js_starts[bb];

                ${Y.cl_buf.ctype} y_sum = 0;
                for (int ii = 0; ii < n_dot_products; ++ii)
                {
                    const int x_ji = X_js_data[ii];
                    const int a_ji = A_js_data[ii];
                    const int N_i = A_shape1s[a_ji];
                    const int x_offset = X_starts[x_ji];
                    const int a_offset = A_starts[a_ji];
                    const int AsM = A_stride0s[a_ji];
                    const int XsM = X_stride0s[x_ji];

                    for (int nn = 0; nn < N_i; ++nn)
                    {
                        y_sum += X_data[x_offset + nn * XsM]
                                 * A_data[a_offset + mm * AsM + nn];
                    }
                }
        % if float_alpha is not None:
                Y_data[y_offset + mm] += ${float_alpha} * y_sum;
        % elif cl_alpha is not None:
                Y_data[y_offset + mm] += alphas[bb] * y_sum;
        % endif
    % endif
            }

        }
    """

    text = as_ascii(
        Template(text, output_encoding='ascii').render(**p.__dict__))

    gsize = (
        max(p.geometry[ii]['y_len'] for ii in items),
        len(items))
    lsize = None
    fn = cl.Program(p.queue.context, text).build().gemv_ref
    full_args = [cl_items]
    if p.cl_alpha is not None:
        full_args += [p.cl_alpha]
    if p.A_js is not None:
        full_args += [
            p.A.cl_starts,
            p.A.cl_shape1s,
            p.A.cl_stride0s,
            p.A.cl_buf,
            p.A_js.cl_starts,
            p.A_js.cl_shape0s,
            p.A_js.cl_buf,
            p.X.cl_starts,
            p.X.cl_stride0s,
            p.X.cl_buf,
            p.X_js.cl_starts,
            p.X_js.cl_buf,
        ]
    if p.cl_beta is not None:
        full_args += [p.cl_beta]
    elif p.clra_beta is not None:
        full_args += [p.clra_beta.cl_starts, p.clra_beta.cl_buf]

    if p.cl_gamma is not None:
        full_args += [p.cl_gamma]
    elif p.clra_gamma is not None:
        full_args += [p.clra_gamma.cl_starts, p.clra_gamma.cl_buf]

    full_args += [
        p.Y_in.cl_starts,
        p.Y_in.cl_buf,
        p.Y.cl_starts,
        p.Y.cl_shape0s,
        p.Y.cl_buf]

    # print([str(arr.dtype)[0] for arr in full_args])
    fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(p.queue, fn, gsize, lsize, name="clra_gemv.ref_impl",
                tag=p.tag,
                bw_per_call=bw_from_geometry(p.geometry, items),
                flops_per_call=flops_from_geometry(p.geometry, items))
    rval.full_args = full_args  # prevent GC the args
    return rval
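
The value each work item writes is the plain gemv expression Y = gamma + beta * Y_in + alpha * sum_i A_i @ X_i. A NumPy sketch for one output block (names are illustrative):

import numpy as np

def gemv_ref_reference(A_list, X_list, Y_in, alpha=1.0, beta=1.0, gamma=0.0):
    y = gamma + beta * np.asarray(Y_in, dtype=float)
    for A_i, X_i in zip(A_list, X_list):   # loop over the block's dot products
        y = y + alpha * (np.asarray(A_i) @ np.asarray(X_i))
    return y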
Exemplo n.º 24
0
def reduce_impl(p, items,
                group_size=None,
                segment_size=None,
                ):

    #
    # Target use case: long inner products, small numbers of dots.
    #
    # Approach: each work-group computes a small number of gemv outputs
    #

    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    if p.clra_beta is not None:
        raise NotImplementedError()
    if p.cl_alpha is not None:
        raise NotImplementedError()
    if p.cl_gamma is not None:
        raise NotImplementedError()
    if not all(s == 1 for s in p.A.stride1s):
        raise NotImplementedError()

    assert p.float_alpha is not None
    assert p.float_gamma is not None

    cl_gstructure, textconf = p.cl_geometry_and_textconf(items)
    max_n_dots = max([len(p.geometry[ii]['dots']) for ii in items])
    max_reduce_len = max(max([gg['a_shape1']
                              for gg in p.geometry[ii]['dots']])
                         for ii in items)
    max_y_len = max([p.geometry[ii]['y_len'] for ii in items])

    # segment means the piece of Y written by a work-group
    # group_size is the number of values that we're reducing over

    if len(items) < 4:
        if group_size is None:
            group_size = 32  # XXX
        if segment_size is None:
            segment_size = min(max_y_len, 2)  # XXX
    else:
        if group_size is None:
            group_size = 32  # XXX
        if segment_size is None:
            segment_size = min(max_y_len, 4)  # XXX
    g_segments = int(np.ceil(float(max_y_len) / segment_size))
    gsize = (group_size, g_segments * segment_size, len(items))
    lsize = (group_size, segment_size, 1)

    max_reduce_iters = int(np.ceil(float(max_reduce_len) / group_size))
    textconf.update({
        'n_items': len(items),
        'gsize': gsize,
        'segment_size': segment_size,
        'max_y_len': max_y_len,
        'group_size': group_size,
        'local_count': group_size * segment_size,
        'max_reduce_len': max_reduce_len,
        'N_cutoff': max_reduce_iters * group_size,
        'max_n_dots': max_n_dots,
    })
    if 0:
        for k, v in textconf.items():
            print(k, v)

    textconf.update(p.__dict__)

    text = """
        __kernel void gemv_reduce(
            const __global int *gstructure,
            const __global ${A.cl_buf.ctype} *A_data,
            const __global ${X.cl_buf.ctype} *X_data,
            % if cl_beta is not None:
            const __global ${cl_beta.ctype} * betas,
            % endif
            const __global ${Y_in.cl_buf.ctype} *Y_in_data,
            __global ${Y.cl_buf.ctype} *Y_data)
    {
        __local int lstructure[${n_structure_vars}];
    % if segment_size > 1:
        // we'll cache X in shared memory so we load it only once
        // for the whole segment
        __local ${X.cl_buf.ctype} lX[${group_size}];
    % endif
        //Scratch space for the dot products
        __local ${Y.cl_buf.ctype}
            partialDotProduct[${segment_size}][${group_size}];
        __local ${Y.cl_buf.ctype}
            y_sum_pre[${segment_size}];
        const int local_idx = get_local_id(0)
            + get_local_id(1) * get_local_size(0);

        // load structure
    % if local_count < n_structure_vars:
        for (int ii = local_idx;
                 ii < ${n_structure_vars};
                 ii += ${local_count})
        {
            lstructure[ii] = gstructure[
                get_global_id(2) * ${structure_vars_stride} + ii];
        }
    % else :
        if (local_idx < ${n_structure_vars})
        {
            lstructure[local_idx] = gstructure[
                get_global_id(2) * ${structure_vars_stride} + local_idx];
        }
    % endif
        barrier(CLK_LOCAL_MEM_FENCE);

        if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len}))
        {
    % if float_beta is not None and float_beta != 0 :
            y_sum_pre[get_local_id(1)] = ${float_beta}
                * Y_in_data[${y_in_starts} + get_global_id(1)];
    % elif cl_beta is not None:
            y_sum_pre[get_local_id(1)] = betas[${bb}]
                * Y_in_data[${y_in_starts} + get_global_id(1)];
    % else :
            y_sum_pre[get_local_id(1)] = 0;
    % endif

    % if float_gamma is not None and float_gamma != 0:
            y_sum_pre[get_local_id(1)] += ${float_gamma};
    % endif
    // printf("betaY + gamma=%f\\n", y_sum_pre[get_local_id(1)]);
        }

        partialDotProduct[get_local_id(1)][get_local_id(0)] = 0;
    % if max_n_dots > 1:
        for (int ii = 0;
                 ii < ${n_dot_products};
                 ii += 1)
        {
    % else:
        const int ii = 0;
    % endif


        for (int nn = get_local_id(0);
                 nn < ${N_cutoff};
                 nn += get_local_size(0))
        {
    // segment_size = ${segment_size}
    % if (segment_size == 1):
            if ((nn < ${N_i}) && (get_global_id(1) < ${y_len}))
            {
            partialDotProduct[get_local_id(1)][get_local_id(0)] +=
                A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn]
                * X_data[${x_starts} + nn];
            }
    % else:
            barrier(CLK_LOCAL_MEM_FENCE);
            if ((get_local_id(1) == 0) && (nn < ${N_i}))
            {
                lX[get_local_id(0)] = X_data[${x_starts} + nn];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if ((nn < ${N_i}) && (get_global_id(1) < ${y_len}))
            {
            partialDotProduct[get_local_id(1)][get_local_id(0)] +=
                A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn]
                * lX[get_local_id(0)];
            }
    % endif
        }

    % if (max_n_dots > 1):
        }
    % endif

        // -- Parallel reduction along work-group dimension 0
        for (uint stride = 1;
                  stride < get_local_size(0);
                  stride *= 2)
        {
            barrier(CLK_LOCAL_MEM_FENCE);

            uint index = 2 * stride * get_local_id(0);
            if (index + stride < get_local_size(0))
            {
                partialDotProduct[get_local_id(1)][index] +=
                    partialDotProduct[get_local_id(1)][index + stride];
            }
        }
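        // partialDotProduct[row][0] now holds the complete dot product for
        // this row; the thread that wrote it is the one that reads it below.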
        // barrier(CLK_LOCAL_MEM_FENCE);
        if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len})) {
            Y_data[${y_offset} + get_global_id(1)] = y_sum_pre[get_local_id(1)]
                + ${float_alpha} * partialDotProduct[get_local_id(1)][0];
        }
    }
        """

    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    fn = cl.Program(p.queue.context, text).build().gemv_reduce

    full_args = [
        cl_gstructure,
        p.A.cl_buf,
        p.X.cl_buf,
    ]
    if p.cl_beta is not None:
        full_args += [p.cl_beta]
    full_args += [
        p.Y_in.cl_buf,
        p.Y.cl_buf,
    ]

    fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(p.queue, fn, gsize, lsize,
                name='clra_gemv.reduce_impl',
                tag=p.tag,
                bw_per_call=bw_from_geometry(p.geometry, items),
                flops_per_call=flops_from_geometry(p.geometry, items),
                )
    rval.full_args = full_args  # prevent GC the args
    rval.description = p.geometry_summary(items)
    return rval
Exemplo n.º 25
0
def plan_slicedcopy(queue, A, B, Ainds, Binds, incs, tag=None):
    N = len(A)
    assert len(A) == len(B) == len(Ainds) == len(Binds)

    for arr in [A, B, Ainds, Binds]:
        assert (arr.shape1s == 1).all()
        assert (arr.stride0s == 1).all()
        assert (arr.stride1s == 1).all()
    assert (Ainds.shape0s == Binds.shape0s).all()

    assert A.ctype == B.ctype
    assert Ainds.ctype == Binds.ctype == 'int'
    assert incs.ctype == 'int'

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void slicedcopy(
            __global const int *Astarts,
            __global const ${Atype} *Adata,
            __global const int *Bstarts,
            __global ${Btype} *Bdata,
            __global const int *Ishape0s,
            __global const int *AIstarts,
            __global const int *AIdata,
            __global const int *BIstarts,
            __global const int *BIdata,
            __global const int *incdata
        )
        {
            const int n = get_global_id(1);
            __global const ${Atype} *a = Adata + Astarts[n];
            __global ${Btype} *b = Bdata + Bstarts[n];
            __global const int *aind = AIdata + AIstarts[n];
            __global const int *bind = BIdata + BIstarts[n];
            const int inc = incdata[n];
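
            // Each work-item strides over the shared index list; inc selects
            // accumulate (b[...] += a[...]) versus overwrite (b[...] = a[...]).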

            int i = get_global_id(0);
            if (inc)
                for (; i < Ishape0s[n]; i += get_global_size(0))
                    b[bind[i]] += a[aind[i]];
            else
                for (; i < Ishape0s[n]; i += get_global_size(0))
                    b[bind[i]] = a[aind[i]];
        }
        """

    textconf = dict(Atype=A.ctype, Btype=B.ctype)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    full_args = (
        A.cl_starts,
        A.cl_buf,
        B.cl_starts,
        B.cl_buf,
        Ainds.cl_shape0s,
        Ainds.cl_starts,
        Ainds.cl_buf,
        Binds.cl_starts,
        Binds.cl_buf,
        incs.cl_buf,
    )
    _fn = cl.Program(queue.context, text).build().slicedcopy
    _fn.set_args(*[arr.data for arr in full_args])

    max_group = queue.device.max_work_group_size
    n = min(max(Ainds.shape0s), max_group)
    gsize = (n, N)
    lsize = (n, 1)
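    # One work-group per (A, B) pair; up to ``max_group`` threads per group
    # stride across that pair's index list.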
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_slicedcopy", tag=tag)
    rval.full_args = full_args     # prevent garbage-collection
    rval.bw_per_call = 2 * Ainds.shape0s.sum() * A.dtype.itemsize
    rval.description = (
        "groups: %d; items: %d; items/group: %0.1f [%d, %d]" %
        (len(Ainds), Ainds.sizes.sum(),
         Ainds.sizes.mean(), Ainds.sizes.min(), Ainds.sizes.max()))
    return rval
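
# A minimal usage sketch for plan_slicedcopy, not part of the original source:
# it copies A[2:5] into B[0:3] for a single (A, B) pair. The RaggedArray /
# CLRaggedArray constructors, their module paths, and the callable Plan are
# assumptions about the surrounding nengo_ocl API and may differ in your
# version.
import numpy as np
import pyopencl as cl
from nengo_ocl.raggedarray import RaggedArray        # assumed module path
from nengo_ocl.clraggedarray import CLRaggedArray    # assumed module path

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

A = CLRaggedArray(queue, RaggedArray([np.arange(10, dtype=np.float32)]))
B = CLRaggedArray(queue, RaggedArray([np.zeros(10, dtype=np.float32)]))
Ainds = CLRaggedArray(queue, RaggedArray([np.arange(2, 5, dtype=np.int32)]))
Binds = CLRaggedArray(queue, RaggedArray([np.arange(0, 3, dtype=np.int32)]))
incs = CLRaggedArray(queue, RaggedArray([np.zeros(1, dtype=np.int32)]))  # 0 => overwrite

plan = plan_slicedcopy(queue, A, B, Ainds, Binds, incs)
plan()          # assumption: Plan objects are callable and enqueue the kernel
queue.finish()  # after this, B[0][:3] should equal A[0][2:5]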