Exemplo n.º 1
0
def mkiterlib(gnm):
    packer = interp.GenomePacker('iter_params', 'params',
                                 cuburn.genome.specs.anim)
    cp = packer.view(gnm)

    iterbody = iter_body(cp)
    bodies = [iter_xf_body(cp, i, x) for i, x in sorted(cp.xforms.items())]
    if 'final_xform' in cp:
        bodies.append(iter_xf_body(cp, 'final', cp.final_xform))
    bodies.append(iterbody)
    packer_lib = packer.finalize()

    lib = devlib(deps=[packer_lib, mwclib, ringbuflib],
                 # We grab the surf decl from palintlib as well
                 decls=iter_decls + interp.palintlib.decls,
                 defs='\n'.join(bodies))
    return packer, lib
Exemplo n.º 2
0
    def finalize(self):
        """
        Create the code to render this genome.
        """
        # At the risk of packing a few things more than once, we don't
        # uniquify the overall precalc order, sparing us the need to implement
        # recursive code generation
        direct = list(self.packed_direct) + list(self.packed_direct_mag)
        self.packed = direct + list(self.packed_precalc)
        self.genome = direct + list(self.genome_precalc)

        self._len = len(self.packed)

        decls = self._decls.substitute(**self.__dict__)
        defs = self._defs.substitute(**self.__dict__)

        return devlib(deps=[catmullromlib], decls=decls, defs=defs)
Exemplo n.º 3
0
    def finalize(self):
        """
        Create the code to render this genome.
        """
        # At the risk of packing a few things more than once, we don't
        # uniquify the overall precalc order, sparing us the need to implement
        # recursive code generation
        direct = list(self.packed_direct) + list(self.packed_direct_mag)
        self.packed = direct + list(self.packed_precalc)
        self.genome = direct + list(self.genome_precalc)

        self._len = len(self.packed)

        decls = self._decls.substitute(**self.__dict__)
        defs = self._defs.substitute(**self.__dict__)

        return devlib(deps=[catmullromlib], decls=decls, defs=defs)
Exemplo n.º 4
0
def mkiterlib(gnm):
    packer = interp.GenomePacker('iter_params', 'params',
                                 cuburn.genome.specs.anim)
    cp = packer.view(gnm)

    iterbody = iter_body(cp)
    bodies = [iter_xf_body(cp, i, x) for i, x in sorted(cp.xforms.items())]
    if 'final_xform' in cp:
        bodies.append(iter_xf_body(cp, 'final', cp.final_xform))
    bodies.append(iterbody)
    packer_lib = packer.finalize()

    lib = devlib(
        deps=[packer_lib, mwclib, ringbuflib],
        # We grab the surf decl from palintlib as well
        decls=iter_decls + interp.palintlib.decls,
        defs='\n'.join(bodies))
    return packer, lib
Exemplo n.º 5
0
rgba8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Perform a conversion from float32 values to uint8 ones, applying
// pixel- and channel-independent dithering to reduce suprathreshold banding
// artifacts. Clamps values larger than 1.0f.
// TODO: move to a separate module?
// TODO: less ineffecient mwc_st handling?
__global__ void f32_to_rgba_u8(
    uchar4 *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    uchar4 out = make_uchar4(
        fminf(1.0f, in.x) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.y) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.z) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.w) * 255.0f + 0.49f * mwc_next_11(rctx)
    );

    int idst = dstride * y + x;
    dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}
''')
Exemplo n.º 6
0
catmullromlib = devlib(deps=[binsearchlib], decls=r'''
__device__ __noinline__
float catmull_rom(const float *times, const float *knots, float t);

__device__ __noinline__
float catmull_rom_mag(const float *times, const float *knots, float t);
''', defs=r'''

// ELBOW is the linearization threhsold; above this magnitude, a value scales
// logarithmically, and below it, linearly. ELOG1 is a constant used to make
// this happen. See helpers/spline_mag_domain_interp.wxm for nice graphs.
#define ELBOW 0.0625f   // 2^(-4)
#define ELOG1 5.0f      // 1 - log2(elbow)

// Transform from linear to magnitude domain
__device__ float linlog(float x) {
    if (x > ELBOW)  return   log2f(x)  + ELOG1;
    if (x < -ELBOW) return -(log2f(-x) + ELOG1);
    return x / ELBOW;
}

// Reverse of above
__device__ float linexp(float v) {
    if (v >= 1.0)   return  exp2f( v - ELOG1);
    if (v <= -1.0)  return -exp2f(-v - ELOG1);
    return v * ELBOW;
}

__device__ float linslope(float x, float m) {
    if (x >=  ELBOW) return m /  x;
    if (x <= -ELBOW) return m / -x;
    return m / ELBOW;
}

__device__ float
catmull_rom_base(const float *times, const float *knots, float t, bool mag) {
    int idx = bitwise_binsearch(times, t);

    // The left bias of the search means that we never have to worry about
    // overshooting unless the genome is corrupted
    idx = max(idx, 1);

    float t1 = times[idx], t2 = times[idx+1] - t1;
    float rt2 = 1.0f / t2;
    float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2;
    t = (t - t1) * rt2;

    // Now t1 is effectively 0 and t2 is 1

    float k0 = knots[idx-1], k1 = knots[idx],
          k2 = knots[idx+1], k3 = knots[idx+2];

    float m1 = (k2 - k0) / (1.0f - t0),
          m2 = (k3 - k1) / (t3);

    if (mag) {
        m1 = linslope(k1, m1);
        m2 = linslope(k2, m2);
        k1 = linlog(k1);
        k2 = linlog(k2);
    }

    float tt = t * t, ttt = tt * t;

    float r = m1 * (      ttt - 2.0f*tt + t)
            + k1 * ( 2.0f*ttt - 3.0f*tt + 1)
            + m2 * (      ttt -      tt)
            + k2 * (-2.0f*ttt + 3.0f*tt);

    if (mag) r = linexp(r);
    return r;
}

// Variants with scaling domain logic inlined
__device__ __noinline__
float catmull_rom(const float *times, const float *knots, float t) {
    return catmull_rom_base(times, knots, t, false);
}

__device__ __noinline__
float catmull_rom_mag(const float *times, const float *knots, float t) {
    return catmull_rom_base(times, knots, t, true);
}
''')
Exemplo n.º 7
0
texshearlib = devlib(defs=r'''
// Filter directions specified in degrees, using image/texture addressing
// [(0,0) is upper left corner, 90 degrees is down].

__constant__ float2 addressing_patterns[16] = {
    { 1.0f,  0.0f},        { 0.0f,       1.0f}, //  0,  1:   0,    90
    { 1.0f,  1.0f},        {-1.0f,       1.0f}, //  2,  3:  45,   135
    { 1.0f,  0.5f},        {-0.5f,       1.0f}, //  4,  5:  22.5, 112.5
    { 1.0f, -0.5f},        { 0.5f,       1.0f}, //  6,  7: -22.5,  67.5
    { 1.0f,  0.666667f},   {-0.666667f,  1.0f}, //  8,  9:  30,   120
    { 1.0f, -0.666667f},   { 0.666667f,  1.0f}, // 10, 11: -30,    60
    { 1.0f,  0.333333f},   {-0.333333f,  1.0f}, // 12, 13:  15,   105
    { 1.0f, -0.333333f},   { 0.333333f,  1.0f}, // 14, 15: -15,    75
};

// Mon dieu! A C++ feature? Gotta close the "extern C" added by the compiler.
}

template <typename T> __device__ T
tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
          float x, float y, float radius) {
    float2 scale = addressing_patterns[pattern];
    float i = scale.x * radius, j = scale.y * radius;
    // Round i and j to the nearest integer, choosing the nearest even when
    // equidistant. It's critical that this be done before adding 'x' and 'y',
    // so that addressing patterns remain consistent across the grid.
    asm("{\n\t"
        "cvt.rni.ftz.f32.f32    %0, %0;\n\t"
        "cvt.rni.ftz.f32.f32    %1, %1;\n\t"
        "}\n" : "+f"(i), "+f"(j));
    return tex2D(ref, x + i, y + j);
}

extern "C" {
''')
Exemplo n.º 8
0
pixfmtlib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Clamp an input between 0 and a given peak (inclusive), dithering its output,
// with full clamping for pixels that are true-black for compressibility.
__device__ float dclampf(mwc_st &rctx, float peak, float in) {
  float ret = 0.0f;
  if (in > 0.0f) {
    ret = fminf(peak, in * peak + 0.99f * mwc_next_01(rctx));
  }
  return ret;
}

// Perform a conversion from float32 values to uint8 ones, applying
// pixel- and channel-independent dithering to reduce suprathreshold banding
// artifacts. Clamps values larger than 1.0f.
// TODO: move to a separate module?
// TODO: less ineffecient mwc_st handling?
__global__ void f32_to_rgba_u8(
    uchar4 *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    uchar4 out = make_uchar4(
        dclampf(rctx, 255.0f, in.x),
        dclampf(rctx, 255.0f, in.y),
        dclampf(rctx, 255.0f, in.z),
        dclampf(rctx, 255.0f, in.w)
    );

    int idst = dstride * y + x;
    dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}

// Perform a conversion from float32 values to uint16 ones, as above.
__global__ void f32_to_rgba_u16(
    ushort4 *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    ushort4 out = make_ushort4(
        dclampf(rctx, 65535.0f, in.x),
        dclampf(rctx, 65535.0f, in.y),
        dclampf(rctx, 65535.0f, in.z),
        dclampf(rctx, 65535.0f, in.w)
    );

    int idst = dstride * y + x;
    dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}

// Convert from rgb444 to planar YUV with no chroma subsampling.
// Uses JPEG full-range color primaries.
__global__ void f32_to_yuv444p(
    char *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    uchar3 out = make_uchar3(
        dclampf(rctx, 255.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z),
        dclampf(rctx, 255.0f, -0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f),
        dclampf(rctx, 255.0f, 0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z + 0.5f)
    );

    int idst = dstride * y + x;
    dst[idst] = out.x;
    idst += dstride * height;
    dst[idst] = out.y;
    idst += dstride * height;
    dst[idst] = out.z;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}

// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
// TODO(strobe): Decide how YouTube will handle Rec. 2020, and then do that here.
__global__ void f32_to_yuv444p10(
    uint16_t *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    ushort3 out = make_ushort3(
        dclampf(rctx, 1023.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z),
        dclampf(rctx, 1023.0f, -0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f),
        dclampf(rctx, 1023.0f, 0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z + 0.5f)
    );

    int idst = dstride * y + x;
    dst[idst] = out.x;
    idst += dstride * height;
    dst[idst] = 1023.0f * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f);
    idst += dstride * height;
    dst[idst] = out.z;

    rctxs[rb_incr(rb->tail, tid)] = rctx;
}

// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
// Perform subsampling of chroma using weighted averages.
__global__ void f32_to_yuv420p10(
    uint16_t *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    // Perform luma using real addressing
    int isrc = sstride * (y + gutter) + x + gutter;
    int idst = dstride * y + x;
    float4 in = src[isrc];
    dst[idst] = dclampf(rctx, 1023.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z);

    // Drop into subsampling mode for chroma components
    if (x * 2 > dstride || y * 2 > height) return;

    // Recompute addressing and collect weighted averages
    // TODO(strobe): characterize overflow here
    isrc = sstride * (y * 2 + gutter) + x * 2 + gutter;
    in = src[isrc];
    float sum = in.w + 1e-12;
    float cb = in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
    float cr = in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);

    in = src[isrc + 1];
    sum += in.w;
    cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
    cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);

    isrc += sstride;
    in = src[isrc];
    sum += in.w;
    cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
    cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);

    in = src[isrc + 1];
    sum += in.w;
    cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
    cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);

    // For this to work, dstride must equal the output frame width
    // and be a multiple of four.
    idst = dstride * height + dstride / 2 * y + x;
    dst[idst] = dclampf(rctx, 1023.0f, cb / sum + 0.5f);
    idst += dstride * height / 4;
    dst[idst] = dclampf(rctx, 1023.0f, cr / sum + 0.5f);

    rctxs[rb_incr(rb->tail, tid)] = rctx;
}

// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
// TODO(strobe): Share more code.
__global__ void f32_to_yuv444p12(
    uint16_t *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    in.x = fminf(1.0f, fmaxf(0.0f, in.x));
    in.y = fminf(1.0f, fmaxf(0.0f, in.y));
    in.z = fminf(1.0f, fmaxf(0.0f, in.z));
    ushort3 out = make_ushort3(
        dclampf(rctx, 4095.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z),
        dclampf(rctx, 4095.0f, -0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f),
        dclampf(rctx, 4095.0f, 0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z + 0.5f)
    );

    int idst = dstride * y + x;
    dst[idst] = out.x;
    idst += dstride * height;
    dst[idst] = out.y;
    idst += dstride * height;
    dst[idst] = out.z;

    rctxs[rb_incr(rb->tail, tid)] = rctx;
}
''')
Exemplo n.º 9
0
mwclib = devlib(decls=r'''
typedef struct {
    uint32_t    mul;
    uint32_t    state;
    uint32_t    carry;
} mwc_st;
''',
                defs=r'''
__device__ uint32_t mwc_next(mwc_st &st) {
    asm("{\n\t"
        ".reg .u32 tmp;\n\t"
        "mad.lo.cc.u32   tmp,    %2,     %1,     %0;\n\t"
        "madc.hi.u32     %0,     %2,     %1,     0;\n\t"
        "mov.u32         %1,     tmp;\n\t"
    "}" : "+r"(st.carry), "+r"(st.state) : "r"(st.mul));
    return st.state;
}

__device__ float mwc_next_01(mwc_st &st) {
    return mwc_next(st) * (1.0f / 4294967296.0f);
}

__device__ float mwc_next_11(mwc_st &st) {
    uint32_t val = mwc_next(st);
    float ret;
    asm("cvt.rn.f32.s32 %0,     %1;\n\t"
        "mul.f32        %0,     %0,     (1.0 / 2147483648.0);"
        : "=f"(ret) : "r"(val));
    return ret;
}
''')
Exemplo n.º 10
0
pixfmtlib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Clamp an input between 0 and a given peak (inclusive), dithering its output,
// with full clamping for pixels that are true-black for compressibility.
__device__ float dclampf(mwc_st &rctx, float peak, float in) {
  float ret = 0.0f;
  if (in > 0.0f) {
    ret = fminf(peak, in * peak + 0.99f * mwc_next_01(rctx));
  }
  return ret;
}

// Perform a conversion from float32 values to uint8 ones, applying
// pixel- and channel-independent dithering to reduce suprathreshold banding
// artifacts. Clamps values larger than 1.0f.
// TODO: move to a separate module?
// TODO: less ineffecient mwc_st handling?
__global__ void f32_to_rgba_u8(
    uchar4 *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    uchar4 out = make_uchar4(
        dclampf(rctx, 255.0f, in.x),
        dclampf(rctx, 255.0f, in.y),
        dclampf(rctx, 255.0f, in.z),
        dclampf(rctx, 255.0f, in.w)
    );

    int idst = dstride * y + x;
    dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}

// Perform a conversion from float32 values to uint16 ones, as above.
__global__ void f32_to_rgba_u16(
    ushort4 *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    ushort4 out = make_ushort4(
        dclampf(rctx, 65535.0f, in.x),
        dclampf(rctx, 65535.0f, in.y),
        dclampf(rctx, 65535.0f, in.z),
        dclampf(rctx, 65535.0f, in.w)
    );

    int idst = dstride * y + x;
    dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}

// Convert from rgb444 to planar YUV with no chroma subsampling.
// Uses JPEG full-range color primaries.
__global__ void f32_to_yuv444p(
    char *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    uchar3 out = make_uchar3(
        dclampf(rctx, 255.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z),
        dclampf(rctx, 255.0f, -0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f),
        dclampf(rctx, 255.0f, 0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z + 0.5f)
    );

    int idst = dstride * y + x;
    dst[idst] = out.x;
    idst += dstride * height;
    dst[idst] = out.y;
    idst += dstride * height;
    dst[idst] = out.z;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}

// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
// TODO(strobe): Decide how YouTube will handle Rec. 2020, and then do that here.
__global__ void f32_to_yuv444p10(
    uint16_t *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    ushort3 out = make_ushort3(
        dclampf(rctx, 1023.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z),
        dclampf(rctx, 1023.0f, -0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f),
        dclampf(rctx, 1023.0f, 0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z + 0.5f)
    );

    int idst = dstride * y + x;
    dst[idst] = out.x;
    idst += dstride * height;
    dst[idst] = 1023.0f * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f);
    idst += dstride * height;
    dst[idst] = out.z;

    rctxs[rb_incr(rb->tail, tid)] = rctx;
}

// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
// Perform subsampling of chroma using weighted averages.
__global__ void f32_to_yuv420p10(
    uint16_t *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    // Perform luma using real addressing
    int isrc = sstride * (y + gutter) + x + gutter;
    int idst = dstride * y + x;
    float4 in = src[isrc];
    dst[idst] = dclampf(rctx, 1023.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z);

    // Drop into subsampling mode for chroma components
    if (x * 2 > dstride || y * 2 > height) return;

    // Recompute addressing and collect weighted averages
    // TODO(strobe): characterize overflow here
    isrc = sstride * (y * 2 + gutter) + x * 2 + gutter;
    in = src[isrc];
    float sum = in.w + 1e-12;
    float cb = in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
    float cr = in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);

    in = src[isrc + 1];
    sum += in.w;
    cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
    cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);

    isrc += sstride;
    in = src[isrc];
    sum += in.w;
    cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
    cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);

    in = src[isrc + 1];
    sum += in.w;
    cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
    cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);

    // For this to work, dstride must equal the output frame width
    // and be a multiple of four.
    idst = dstride * height + dstride / 2 * y + x;
    dst[idst] = dclampf(rctx, 1023.0f, cb / sum + 0.5f);
    idst += dstride * height / 4;
    dst[idst] = dclampf(rctx, 1023.0f, cr / sum + 0.5f);

    rctxs[rb_incr(rb->tail, tid)] = rctx;
}

// Convert from rgb444 to planar YUV 12-bit studio swing,
// using the Rec. 709 matrix.
__global__ void f32_to_yuv444p12(
    uint16_t *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    in.x = fminf(1.0f, fmaxf(0.0f, in.x));
    in.y = fminf(1.0f, fmaxf(0.0f, in.y));
    in.z = fminf(1.0f, fmaxf(0.0f, in.z));
    ushort3 out = make_ushort3(
        dclampf(rctx, 3504.0f, 0.2126f   * in.x + 0.7152f  * in.y + 0.0722f   * in.z) + 256.0f,
        dclampf(rctx, 3584.0f, -0.11457f * in.x - 0.38543f * in.y + 0.5f      * in.z + 0.5f) + 256.0f,
        dclampf(rctx, 3584.0f, 0.5f      * in.x - 0.45416f * in.y - 0.04585f  * in.z + 0.5f) + 256.0f
    );

    int idst = dstride * y + x;
    dst[idst] = out.x;
    idst += dstride * height;
    dst[idst] = out.y;
    idst += dstride * height;
    dst[idst] = out.z;

    rctxs[rb_incr(rb->tail, tid)] = rctx;
}
''')
Exemplo n.º 11
0
texshearlib = devlib(defs=r'''
// Filter directions specified in degrees, using image/texture addressing
// [(0,0) is upper left corner, 90 degrees is down].

__constant__ float2 addressing_patterns[16] = {
    { 1.0f,  0.0f},        { 0.0f,       1.0f}, //  0,  1:   0,    90
    { 1.0f,  1.0f},        {-1.0f,       1.0f}, //  2,  3:  45,   135
    { 1.0f,  0.5f},        {-0.5f,       1.0f}, //  4,  5:  22.5, 112.5
    { 1.0f, -0.5f},        { 0.5f,       1.0f}, //  6,  7: -22.5,  67.5
    { 1.0f,  0.666667f},   {-0.666667f,  1.0f}, //  8,  9:  30,   120
    { 1.0f, -0.666667f},   { 0.666667f,  1.0f}, // 10, 11: -30,    60
    { 1.0f,  0.333333f},   {-0.333333f,  1.0f}, // 12, 13:  15,   105
    { 1.0f, -0.333333f},   { 0.333333f,  1.0f}, // 14, 15: -15,    75
};

// Mon dieu! A C++ feature? Gotta close the "extern C" added by the compiler.
}

template <typename T> __device__ T
tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
          float x, float y, float radius) {
    float2 scale = addressing_patterns[pattern];
    float i = scale.x * radius, j = scale.y * radius;
    // Round i and j to the nearest integer, choosing the nearest even when
    // equidistant. It's critical that this be done before adding 'x' and 'y',
    // so that addressing patterns remain consistent across the grid.
    asm("{\n\t"
        "cvt.rni.ftz.f32.f32    %0, %0;\n\t"
        "cvt.rni.ftz.f32.f32    %1, %1;\n\t"
        "}\n" : "+f"(i), "+f"(j));
    return tex2D(ref, x + i, y + j);
}

extern "C" {
''')
Exemplo n.º 12
0
yuvlib = devlib(defs='''
__device__ float3 rgb2yuv(float3 rgb);
__device__ float3 yuv2rgb(float3 yuv);
''',
                decls=r'''
/* This conversion uses the JPEG full-range standard. Note that UV have range
 * [-0.5, 0.5], so consider biasing the results. */
__device__ float3 rgb2yuv(float3 rgb) {
    return make_float3(
        0.299f      * rgb.x + 0.587f    * rgb.y + 0.114f    * rgb.z,
        -0.168736f  * rgb.x - 0.331264f * rgb.y + 0.5f      * rgb.z,
        0.5f        * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
}

__device__ float3 yuv2rgb(float3 yuv) {
    return make_float3(
        yuv.x                    + 1.402f   * yuv.z,
        yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
        yuv.x + 1.772f   * yuv.y);
}

// As used in the various cliplibs.
__device__ void yuvo2rgb(float4& pix) {
    pix.y -= 0.5f * pix.w;
    pix.z -= 0.5f * pix.w;
    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
    pix.x = fmaxf(0.0f, tmp.x);
    pix.y = fmaxf(0.0f, tmp.y);
    pix.z = fmaxf(0.0f, tmp.z);
}

''')
Exemplo n.º 13
0
flushatomlib = devlib(defs=Template(r'''
__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= nbins) return;
    asm volatile ({{crep("""
{
    .reg .u32   off, hi, lo, d, y, u, v;
    .reg .u64   val, ptr;
    .reg .f32   yf, uf, vf, df, yg, ug, vg, dg;

    // TODO: use explicit movs to handle this
    shl.b32             off,    %0,     3;
    cvt.u64.u32         ptr,    off;
    add.u64             ptr,    ptr,    %1;
    ld.global.v2.u32    {lo, hi},   [ptr];
    shl.b32             off,    %0,     4;
    cvt.u64.u32         ptr,    off;
    add.u64             ptr,    ptr,    %2;
    ld.global.v4.f32    {yg,ug,vg,dg},  [ptr];
    shr.u32             d,      hi,     22;
    bfe.u32             y,      hi,     4,      18;
    bfe.u32             u,      lo,     18,     14;
    bfi.b32             u,      hi,     u,      14,     4;
    and.b32             v,      lo,     ((1<<18)-1);
    cvt.rn.f32.u32      yf,     y;
    cvt.rn.f32.u32      uf,     u;
    cvt.rn.f32.u32      vf,     v;
    cvt.rn.f32.u32      df,     d;
    fma.rn.ftz.f32      yg,     yf,     (1.0/255.0),    yg;
    fma.rn.ftz.f32      ug,     uf,     (1.0/255.0),    ug;
    fma.rn.ftz.f32      vg,     vf,     (1.0/255.0),    vg;

    add.rn.ftz.f32      dg,     df,     dg;
    st.global.v4.f32    [ptr],  {yg,ug,vg,dg};
}
    """)}}  ::  "r"(i), "l"(atom_ptr), "l"(out_ptr));
}
''', 'flush_atom').substitute())
Exemplo n.º 14
0
mwclib = devlib(decls=r'''
typedef struct {
    uint32_t    mul;
    uint32_t    state;
    uint32_t    carry;
} mwc_st;
''', defs=r'''
__device__ uint32_t mwc_next(mwc_st &st) {
    asm("{\n\t"
        ".reg .u32 tmp;\n\t"
        "mad.lo.cc.u32   tmp,    %2,     %1,     %0;\n\t"
        "madc.hi.u32     %0,     %2,     %1,     0;\n\t"
        "mov.u32         %1,     tmp;\n\t"
    "}" : "+r"(st.carry), "+r"(st.state) : "r"(st.mul));
    return st.state;
}

__device__ float mwc_next_01(mwc_st &st) {
    return mwc_next(st) * (1.0f / 4294967296.0f);
}

__device__ float mwc_next_11(mwc_st &st) {
    uint32_t val = mwc_next(st);
    float ret;
    asm("cvt.rn.f32.s32 %0,     %1;\n\t"
        "mul.f32        %0,     %0,     (1.0 / 2147483648.0);"
        : "=f"(ret) : "r"(val));
    return ret;
}
''')
Exemplo n.º 15
0
flushatomlib = devlib(defs=Template(
    r'''
__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= nbins) return;
    asm volatile ({{crep("""
{
    .reg .u32   off, hi, lo, d, y, u, v;
    .reg .u64   val, ptr;
    .reg .f32   yf, uf, vf, df, yg, ug, vg, dg;

    // TODO: use explicit movs to handle this
    shl.b32             off,    %0,     3;
    cvt.u64.u32         ptr,    off;
    add.u64             ptr,    ptr,    %1;
    ld.global.v2.u32    {lo, hi},   [ptr];
    shl.b32             off,    %0,     4;
    cvt.u64.u32         ptr,    off;
    add.u64             ptr,    ptr,    %2;
    ld.global.v4.f32    {yg,ug,vg,dg},  [ptr];
    shr.u32             d,      hi,     22;
    bfe.u32             y,      hi,     4,      18;
    bfe.u32             u,      lo,     18,     14;
    bfi.b32             u,      hi,     u,      14,     4;
    and.b32             v,      lo,     ((1<<18)-1);
    cvt.rn.f32.u32      yf,     y;
    cvt.rn.f32.u32      uf,     u;
    cvt.rn.f32.u32      vf,     v;
    cvt.rn.f32.u32      df,     d;
    fma.rn.ftz.f32      yg,     yf,     (1.0/255.0),    yg;
    fma.rn.ftz.f32      ug,     uf,     (1.0/255.0),    ug;
    fma.rn.ftz.f32      vg,     vf,     (1.0/255.0),    vg;

    add.rn.ftz.f32      dg,     df,     dg;
    st.global.v4.f32    [ptr],  {yg,ug,vg,dg};
}
    """)}}  ::  "r"(i), "l"(atom_ptr), "l"(out_ptr));
}
''', 'flush_atom').substitute())
Exemplo n.º 16
0
yuvlib = devlib(defs='''
__device__ float3 rgb2yuv(float3 rgb);
__device__ float3 yuv2rgb(float3 yuv);
''', decls=r'''
/* This conversion uses the JPEG full-range standard. Note that UV have range
 * [-0.5, 0.5], so consider biasing the results. */
__device__ float3 rgb2yuv(float3 rgb) {
    return make_float3(
        0.299f      * rgb.x + 0.587f    * rgb.y + 0.114f    * rgb.z,
        -0.168736f  * rgb.x - 0.331264f * rgb.y + 0.5f      * rgb.z,
        0.5f        * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
}

__device__ float3 yuv2rgb(float3 yuv) {
    return make_float3(
        yuv.x                    + 1.402f   * yuv.z,
        yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
        yuv.x + 1.772f   * yuv.y);
}

// As used in the various cliplibs.
__device__ void yuvo2rgb(float4& pix) {
    pix.y -= 0.5f * pix.w;
    pix.z -= 0.5f * pix.w;
    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
    pix.x = fmaxf(0.0f, tmp.x);
    pix.y = fmaxf(0.0f, tmp.y);
    pix.z = fmaxf(0.0f, tmp.z);
}

''')
Exemplo n.º 17
0
catmullromlib = devlib(
    deps=[binsearchlib],
    decls=r"""
__device__ __noinline__
float catmull_rom(const float *times, const float *knots, float t);

__device__ __noinline__
float catmull_rom_mag(const float *times, const float *knots, float t);
""",
    defs=r"""

// ELBOW is the linearization threhsold; above this magnitude, a value scales
// logarithmically, and below it, linearly. ELOG1 is a constant used to make
// this happen. See helpers/spline_mag_domain_interp.wxm for nice graphs.
#define ELBOW 0.0625f   // 2^(-4)
#define ELOG1 5.0f      // 1 - log2(elbow)

// Transform from linear to magnitude domain
__device__ float linlog(float x) {
    if (x > ELBOW)  return   log2f(x)  + ELOG1;
    if (x < -ELBOW) return -(log2f(-x) + ELOG1);
    return x / ELBOW;
}

// Reverse of above
__device__ float linexp(float v) {
    if (v >= 1.0)   return  exp2f( v - ELOG1);
    if (v <= -1.0)  return -exp2f(-v - ELOG1);
    return v * ELBOW;
}

__device__ float linslope(float x, float m) {
    if (x >=  ELBOW) return m /  x;
    if (x <= -ELBOW) return m / -x;
    return m / ELBOW;
}

__device__ float
catmull_rom_base(const float *times, const float *knots, float t, bool mag) {
    int idx = bitwise_binsearch(times, t);

    // The left bias of the search means that we never have to worry about
    // overshooting unless the genome is corrupted
    idx = max(idx, 1);

    float t1 = times[idx], t2 = times[idx+1] - t1;
    float rt2 = 1.0f / t2;
    float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2;
    t = (t - t1) * rt2;

    // Now t1 is effectively 0 and t2 is 1

    float k0 = knots[idx-1], k1 = knots[idx],
          k2 = knots[idx+1], k3 = knots[idx+2];

    float m1 = (k2 - k0) / (1.0f - t0),
          m2 = (k3 - k1) / (t3);

    if (mag) {
        m1 = linslope(k1, m1);
        m2 = linslope(k2, m2);
        k1 = linlog(k1);
        k2 = linlog(k2);
    }

    float tt = t * t, ttt = tt * t;

    float r = m1 * (      ttt - 2.0f*tt + t)
            + k1 * ( 2.0f*ttt - 3.0f*tt + 1)
            + m2 * (      ttt -      tt)
            + k2 * (-2.0f*ttt + 3.0f*tt);

    if (mag) r = linexp(r);
    return r;
}

// Variants with scaling domain logic inlined
__device__ __noinline__
float catmull_rom(const float *times, const float *knots, float t) {
    return catmull_rom_base(times, knots, t, false);
}

__device__ __noinline__
float catmull_rom_mag(const float *times, const float *knots, float t) {
    return catmull_rom_base(times, knots, t, true);
}
""",
)