Exemplo n.º 1
0
def plan_probes(queue, periods, X, Y, tag=None):
    """
    Parameters
    ----------
    P : raggedarray of ints
        The period (in time-steps) of each probe
    """

    assert len(X) == len(Y)
    assert len(X) == len(periods)
    N = len(X)

    cl_countdowns = to_device(queue, np.zeros(N, dtype="int32"))
    cl_bufpositions = to_device(queue, np.zeros(N, dtype="int32"))
    cl_periods = to_device(queue, np.asarray(periods, dtype="int32"))

    assert X.cl_buf.ocldtype == Y.cl_buf.ocldtype

    ### N.B.  X[i].shape = (ndims[i], )
    ###       Y[i].shape = (buf_ndims[i], buf_len)

    for i in xrange(N):
        assert X.shape0s[i] == Y.shape1s[i]
        assert X.shape1s[i] == 1
        assert X.stride0s[i] == 1
        assert Y.stride1s[i] == 1

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
            __global int *countdowns,
            __global int *bufpositions,
            __global const int *periods,
            __global const int *Xstarts,
            __global const int *Xshape0s,
            __global const ${Xtype} *Xdata,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata
        )
        {
            const int n = get_global_id(1);
            const int countdown = countdowns[n];

            if (countdown == 0) {
                const int n_dims = Xshape0s[n];
                __global const ${Xtype} *x = Xdata + Xstarts[n];
                const int bufpos = bufpositions[n];

                __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims;

                for (int ii = get_global_id(0);
                         ii < n_dims;
                         ii += get_global_size(0))
                {
                    y[ii] = x[ii];
                }
                // This should *not* cause deadlock because
                // all local threads guaranteed to be
                // in this branch together.
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = periods[n] - 1;
                    bufpositions[n] = bufpos + 1;
                }
            }
            else
            {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = countdown - 1;
                }
            }
        }
        """

    textconf = dict(N=N, Xtype=X.cl_buf.ocldtype, Ytype=Y.cl_buf.ocldtype)
    text = Template(text, output_encoding="ascii").render(**textconf)

    full_args = (cl_countdowns, cl_bufpositions, cl_periods, X.cl_starts, X.cl_shape0s, X.cl_buf, Y.cl_starts, Y.cl_buf)
    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    max_len = min(queue.device.max_work_group_size, max(X.shape0s))
    gsize = (max_len, N)
    lsize = (max_len, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    rval.cl_bufpositions = cl_bufpositions
    rval.Y = Y
    return rval
Exemplo n.º 2
0
def plan_probes(queue, periods, X, Y, tag=None):
    """
    Parameters
    ----------
    P : raggedarray of ints
        The period (in time-steps) of each probe
    """

    assert len(X) == len(Y)
    assert len(X) == len(periods)
    N = len(X)

    cl_countdowns = to_device(queue, np.zeros(N, dtype='int32'))
    cl_bufpositions = to_device(queue, np.zeros(N, dtype='int32'))
    cl_periods = to_device(queue, np.asarray(periods, dtype='int32'))

    assert X.cl_buf.ocldtype == Y.cl_buf.ocldtype

    ### N.B.  X[i].shape = (ndims[i], )
    ###       Y[i].shape = (buf_ndims[i], buf_len)

    for i in xrange(N):
        assert X.shape0s[i] == Y.shape1s[i]
        assert X.shape1s[i] == 1
        assert X.stride0s[i] == 1
        assert Y.stride1s[i] == 1

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
            __global int *countdowns,
            __global int *bufpositions,
            __global const int *periods,
            __global const int *Xstarts,
            __global const int *Xshape0s,
            __global const ${Xtype} *Xdata,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata
        )
        {
            const int n = get_global_id(1);
            const int countdown = countdowns[n];

            if (countdown == 0) {
                const int n_dims = Xshape0s[n];
                __global const ${Xtype} *x = Xdata + Xstarts[n];
                const int bufpos = bufpositions[n];

                __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims;

                for (int ii = get_global_id(0);
                         ii < n_dims;
                         ii += get_global_size(0))
                {
                    y[ii] = x[ii];
                }
                // This should *not* cause deadlock because
                // all local threads guaranteed to be
                // in this branch together.
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = periods[n] - 1;
                    bufpositions[n] = bufpos + 1;
                }
            }
            else
            {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = countdown - 1;
                }
            }
        }
        """

    textconf = dict(N=N, Xtype=X.cl_buf.ocldtype, Ytype=Y.cl_buf.ocldtype)
    text = Template(text, output_encoding='ascii').render(**textconf)

    full_args = (
        cl_countdowns,
        cl_bufpositions,
        cl_periods,
        X.cl_starts,
        X.cl_shape0s,
        X.cl_buf,
        Y.cl_starts,
        Y.cl_buf,
    )
    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    max_len = min(queue.device.max_work_group_size, max(X.shape0s))
    gsize = (
        max_len,
        N,
    )
    lsize = (max_len, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    rval.cl_bufpositions = cl_bufpositions
    rval.Y = Y
    return rval