Пример #1
0
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        """Issue the launch sequence ending in 'haloclip' on the buffers of ``fb``.

        Order of operations:

        1. ``set_blur_width`` is called with ``self.mod`` and ``fb.pool``.
        2. 'apply_gamma' is launched with ``(fb.d_left, fb.d_front, f32(0.1))``.
        3. Two 'den_blur_1c' launches follow; before each one a buffer is
           bound to the 'chan1_src' texture reference (``fb.d_left`` first,
           then ``fb.d_back``) while the other buffer is passed to the launch.
        4. 'haloclip' is launched with ``fb.d_front``, ``fb.d_left`` and a
           value derived from ``gprof.filters.colorclip.gamma(tc)``.

        ``params`` is not read by this method.
        """
        gamma_term = f32(1 / gprof.filters.colorclip.gamma(tc) - 1)

        desc = mkdsc(dim, 1)
        src_tex = mktref(self.mod, 'chan1_src')

        set_blur_width(self.mod, fb.pool, stream=stream)
        launch2('apply_gamma', self.mod, stream, dim,
                fb.d_left, fb.d_front, f32(0.1))

        # (buffer bound to the texture reference, buffer passed to the launch,
        #  first integer argument of the launch)
        blur_passes = (
            (fb.d_left, fb.d_back, 2),
            (fb.d_back, fb.d_left, 3),
        )
        for bound, target, selector in blur_passes:
            src_tex.set_address_2d(bound, desc, 4 * dim.astride)
            launch2('den_blur_1c', self.mod, stream, dim,
                    target, i32(selector), i32(0), texrefs=[src_tex])

        launch2('haloclip', self.mod, stream, dim,
                fb.d_front, fb.d_left, gamma_term)
Пример #2
0
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        """Run the blur + 'bilateral' launch sequence once per direction pattern.

        For each ``pattern`` in ``range(self.directions)``:

        1. ``fb.d_front`` is bound to the 'chan4_src' texture reference.
        2. 'den_blur' is launched with ``fb.d_back``; ``fb.d_back`` is then
           bound to the 'chan1_src' reference and 'den_blur_1c' is launched
           with ``fb.d_left``.
        3. ``fb.d_left`` is bound to 'chan1_src' and 'bilateral' is launched
           with ``fb.d_back``, ``self.radius`` and the filter parameters
           evaluated at ``tc``.
        4. ``fb.flip()`` is called.

        ``gprof`` is not read by this method.
        """
        # Pitch passed when binding the 4-channel texture reference
        sb = 16 * dim.astride
        # Pitch for the 1-channel reference. Floor division keeps this an
        # integer regardless of whether true division is in effect.
        grad_pitch = sb // 4

        dsc = mkdsc(dim, 4)
        tref = mktref(self.mod, 'chan4_src')
        grad_dsc = mkdsc(dim, 1)
        grad_tref = mktref(self.mod, 'chan1_src')
        set_blur_width(self.mod, fb.pool, stream=stream)

        for pattern in range(self.directions):
            # Scale spatial parameter so that a "pixel" is equivalent to an
            # actual pixel at 1080p
            sstd = params.spatial_std(tc) * dim.w / 1920.

            tref.set_address_2d(fb.d_front, dsc, sb)

            # Blur density two octaves along sampling vector, ultimately
            # storing in the side buffer
            launch2('den_blur', self.mod, stream, dim,
                    fb.d_back, i32(pattern), i32(0), texrefs=[tref])
            grad_tref.set_address_2d(fb.d_back, grad_dsc, grad_pitch)
            launch2('den_blur_1c', self.mod, stream, dim,
                    fb.d_left, i32(pattern), i32(1), texrefs=[grad_tref])
            grad_tref.set_address_2d(fb.d_left, grad_dsc, grad_pitch)

            launch2('bilateral', self.mod, stream, dim,
                    fb.d_back, i32(pattern), i32(self.radius),
                    f32(sstd), f32(params.color_std(tc)),
                    f32(params.density_std(tc)), f32(params.density_pow(tc)),
                    f32(params.gradient(tc)),
                    texrefs=[tref, grad_tref])
            fb.flip()
Пример #3
0
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        """Launch 'colorclip' through ``launch2`` with ``fb.d_front``.

        The scalar arguments are, in order: ``params.vibrance(tc)`` and
        ``params.highlight_power(tc)`` (each wrapped in ``f32``), followed by
        the three values returned by ``calc_lingam(params, tc)``.

        ``gprof`` is not read by this method.
        """
        vibrance = f32(params.vibrance(tc))
        highlight_power = f32(params.highlight_power(tc))
        gamma, linear, linear_gamma = calc_lingam(params, tc)

        scalar_args = (vibrance, highlight_power, gamma, linear, linear_gamma)
        launch2('colorclip', self.mod, stream, dim, fb.d_front, *scalar_args)
Пример #4
0
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        """Collect the 'colorclip' arguments at ``tc`` and launch it.

        The launch receives ``fb.d_front`` followed by five scalars: the
        f32-wrapped vibrance and highlight power from ``params``, then the
        triple produced by ``calc_lingam(params, tc)``.

        ``gprof`` is unused here.
        """
        clip_args = [f32(params.vibrance(tc)),
                     f32(params.highlight_power(tc))]
        gam_v, lin_v, lingam_v = calc_lingam(params, tc)
        clip_args += [gam_v, lin_v, lingam_v]

        launch2('colorclip', self.mod, stream, dim, fb.d_front, *clip_args)
Пример #5
0
 def apply(self, fb, gprof, params, dim, tc, stream=None):
     """Launch 'logscale' with ``fb.d_front`` passed as both buffer arguments.

     Two f32 scalars are passed along:

     * ``k1``: ``params.brightness(tc) * 268 / 256``
     * ``k2``: ``1.0 / (area * gprof.spp(tc))``, where ``area`` is
       ``dim.h / (params.scale(tc) ** 2 * dim.w)``.
     """
     brightness_gain = f32(params.brightness(tc) * 268 / 256)

     # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
     # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
     scale_sq = params.scale(tc) ** 2
     area = dim.h / (scale_sq * dim.w)
     inv_density = f32(1.0 / (area * gprof.spp(tc)))

     launch2("logscale", self.mod, stream, dim,
             fb.d_front, fb.d_front, brightness_gain, inv_density)
Пример #6
0
 def apply(self, fb, gprof, params, dim, tc, stream=None):
     """Compute the two 'logscale' scalars at ``tc`` and launch in place.

     ``fb.d_front`` is given to the launch twice, followed by the f32 values
     ``k1`` (brightness times 268/256) and ``k2`` (reciprocal of
     ``area * gprof.spp(tc)``).
     """
     k1 = f32(params.brightness(tc) * 268 / 256)

     # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
     # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
     denominator = params.scale(tc)**2 * dim.w
     area = dim.h / denominator
     k2 = f32(1.0 / (area * gprof.spp(tc)))

     buffers = (fb.d_front, fb.d_front)
     launch2('logscale', self.mod, stream, dim, *(buffers + (k1, k2)))
Пример #7
0
    def z_circular(self, t, t0, p, a, i, update_t=True):
        """Run ``self.k_z_circular`` over ``t`` and return the result array.

        ``t`` is converted with ``astype(f32)``; a read-only buffer holding it
        and a write-only output buffer of the same byte size are created, and
        ``self.z`` is reset to zeros shaped like the converted ``t``. After
        the call, the output buffer is copied into ``self.z``, which is
        returned.

        ``update_t`` is accepted but currently ignored: the buffers are
        rebuilt on every call.
        """
        times = t.astype(f32)
        self.t_buf = cl.Buffer(self.ctx, READ_ONLY | COPY_HOST_PTR,
                               hostbuf=times)
        self.z = np.zeros_like(times)
        self.z_buf = cl.Buffer(self.ctx, WRITE_ONLY, times.nbytes)

        kernel_args = (self.t_buf, f32(t0), f32(p), f32(a), f32(i),
                       self.z_buf)
        self.k_z_circular(self.queue, times.shape, None, *kernel_args)
        cl.enqueue_copy(self.queue, self.z, self.z_buf)

        return self.z
Пример #8
0
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        """Launch 'den_blur', 'den_blur_1c' and 'bilateral' for every pattern.

        The loop runs ``self.directions`` times. Each iteration binds
        ``fb.d_front`` to the 'chan4_src' texture reference, launches the two
        blur steps (into ``fb.d_back``, then ``fb.d_left``) while rebinding
        the 'chan1_src' reference in between, launches 'bilateral' with
        ``fb.d_back`` plus the filter parameters evaluated at ``tc``, and
        finally calls ``fb.flip()``.

        ``gprof`` is not read by this method.
        """
        # Pitch used when binding the 4-channel texture reference
        sb = 16 * dim.astride
        # Pitch for the 1-channel reference; ``//`` guarantees an integer
        # even when ``/`` means true division.
        sb_1c = sb // 4

        dsc = mkdsc(dim, 4)
        tref = mktref(self.mod, 'chan4_src')
        grad_dsc = mkdsc(dim, 1)
        grad_tref = mktref(self.mod, 'chan1_src')
        set_blur_width(self.mod, fb.pool, stream=stream)

        for pattern in range(self.directions):
            # Scale spatial parameter so that a "pixel" is equivalent to an
            # actual pixel at 1080p
            sstd = params.spatial_std(tc) * dim.w / 1920.

            tref.set_address_2d(fb.d_front, dsc, sb)

            # Blur density two octaves along sampling vector, ultimately
            # storing in the side buffer
            launch2('den_blur',
                    self.mod,
                    stream,
                    dim,
                    fb.d_back,
                    i32(pattern),
                    i32(0),
                    texrefs=[tref])
            grad_tref.set_address_2d(fb.d_back, grad_dsc, sb_1c)
            launch2('den_blur_1c',
                    self.mod,
                    stream,
                    dim,
                    fb.d_left,
                    i32(pattern),
                    i32(1),
                    texrefs=[grad_tref])
            grad_tref.set_address_2d(fb.d_left, grad_dsc, sb_1c)

            launch2('bilateral',
                    self.mod,
                    stream,
                    dim,
                    fb.d_back,
                    i32(pattern),
                    i32(self.radius),
                    f32(sstd),
                    f32(params.color_std(tc)),
                    f32(params.density_std(tc)),
                    f32(params.density_pow(tc)),
                    f32(params.gradient(tc)),
                    texrefs=[tref, grad_tref])
            fb.flip()
Пример #9
0
    def z_circular(self, t, t0, p, a, i, update_t=True):
        """Evaluate ``self.k_z_circular`` for the samples in ``t``.

        Every call recreates ``self.t_buf`` (read-only, initialised from the
        f32 copy of ``t``), ``self.z`` (zeros shaped like that copy) and
        ``self.z_buf`` (write-only, same byte size). The call receives the
        queue, the sample shape, ``None``, the input buffer, the four scalars
        wrapped in ``f32`` and the output buffer; the output is then copied
        into ``self.z`` and returned.

        NOTE: ``update_t`` has no effect at present.
        """
        t32 = t.astype(f32)

        # (Re)create the input buffer, the result array and the output buffer
        self.t_buf = cl.Buffer(self.ctx, READ_ONLY | COPY_HOST_PTR, hostbuf=t32)
        self.z = np.zeros_like(t32)
        self.z_buf = cl.Buffer(self.ctx, WRITE_ONLY, t32.nbytes)

        t0_f, p_f, a_f, i_f = f32(t0), f32(p), f32(a), f32(i)
        self.k_z_circular(self.queue, t32.shape, None,
                          self.t_buf, t0_f, p_f, a_f, i_f, self.z_buf)
        cl.enqueue_copy(self.queue, self.z, self.z_buf)

        return self.z
Пример #10
0
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        """Issue 'apply_gamma', two 'den_blur_1c' launches, then 'haloclip'.

        ``fb.d_side`` and ``fb.d_back`` alternate roles across the two blur
        launches: one is bound to the 'chan1_src' texture reference while the
        other is passed to the launch. The final 'haloclip' launch receives
        ``fb.d_front``, ``fb.d_side`` and a value computed from
        ``gprof.filters.colorclip.gamma(tc)``.

        ``params`` is not read by this method.
        """
        mod = self.mod
        gamma_term = f32(1 / gprof.filters.colorclip.gamma(tc) - 1)

        desc = mkdsc(dim, 1)
        tex = mktref(mod, "chan1_src")

        set_blur_width(mod, fb.pool, stream=stream)
        launch2("apply_gamma", mod, stream, dim,
                fb.d_side, fb.d_front, f32(0.1))

        # First blur: d_side bound to the texture reference, d_back passed in
        tex.set_address_2d(fb.d_side, desc, 4 * dim.astride)
        launch2("den_blur_1c", mod, stream, dim,
                fb.d_back, i32(2), i32(0), texrefs=[tex])

        # Second blur: roles of the two buffers are exchanged
        tex.set_address_2d(fb.d_back, desc, 4 * dim.astride)
        launch2("den_blur_1c", mod, stream, dim,
                fb.d_side, i32(3), i32(0), texrefs=[tex])

        launch2("haloclip", mod, stream, dim,
                fb.d_front, fb.d_side, gamma_term)
Пример #11
0
    def _iter(self, rdr, gnm, gprof, dim, tc):
        """Fill the working buffers, launch 'iter' in rounds, then 'flush_atom'.

        The 'flatpal' surface reference of ``rdr.mod`` is pointed at
        ``self.info_a.d_pal_array``. ``fb.d_front``, ``fb.d_side`` and
        ``fb.d_points`` are filled through ``util.fill_dptr`` on
        ``self.stream_a``. The number of rounds is derived from
        ``gprof.spp(tc) * dim.w * dim.h`` and the temporal sample count, and
        the 'iter' launches are issued in groups of at most 16 rounds.

        ``gnm`` is not read by this method.
        """
        surf = rdr.mod.get_surfref('flatpal')
        surf.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride

        def fill(dptr, size, value=i32(0)):
            # Thin wrapper pinning the module and stream for fill_dptr
            return util.fill_dptr(
                self.mod, dptr, size, stream=self.stream_a, value=value)

        fill(self.fb.d_front, 4 * nbins)
        fill(self.fb.d_side, 2 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))

        nts = self.info_a.ntemporal_samples
        nsamps = gprof.spp(tc) * dim.w * dim.h
        nrounds = int(nsamps / (nts * 256. * 256)) + 1

        def launch_iter(count):
            # A zero-sized group is skipped entirely
            if count != 0:
                launch('iter', rdr.mod, self.stream_a, (32, 8, 1),
                       (nts, count),
                       self.fb.d_front, self.fb.d_side,
                       self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
                       self.info_a.d_params)

        # Split the launch into multiple rounds, possibly (slightly) reducing
        # work overlap but avoiding stalls when working on a device with an
        # active X session. TODO: characterize performance impact, autodetect
        BLOCK_SIZE = 16
        full_groups, leftover = divmod(nrounds, BLOCK_SIZE)
        for _ in range(full_groups):
            launch_iter(BLOCK_SIZE)
        launch_iter(leftover)

        nblocks = int(np.ceil(np.sqrt(dim.ah * dim.astride / 256.)))
        launch('flush_atom', self.mod, self.stream_a,
               256, (nblocks, nblocks),
               u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
Пример #12
0
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        """Issue 'apply_gamma_full_hi', four 'full_blur' launches, then 'smearclip'.

        Gamma values come from ``calc_lingam(gprof.filters.colorclip, tc)``
        and the blur width from ``params.width(tc)``. Across the four blur
        launches ``fb.d_side`` and ``fb.d_back`` alternate: one is bound to
        the 'chan4_src' texture reference while the other is passed to the
        launch. The first integer argument of the launches is 2, 3, 0, 1 in
        that order. 'smearclip' finally receives ``fb.d_front`` and
        ``fb.d_side``.
        """
        gam, lin, lingam = calc_lingam(gprof.filters.colorclip, tc)
        desc = mkdsc(dim, 4)
        tex = mktref(self.mod, "chan4_src")

        set_blur_width(self.mod, fb.pool, params.width(tc), stream)
        launch2("apply_gamma_full_hi", self.mod, stream, dim,
                fb.d_side, fb.d_front, f32(gam - 1))

        # (buffer bound to the texture reference, buffer passed to the launch,
        #  first integer argument of the launch)
        blur_passes = (
            (fb.d_side, fb.d_back, 2),
            (fb.d_back, fb.d_side, 3),
            (fb.d_side, fb.d_back, 0),
            (fb.d_back, fb.d_side, 1),
        )
        for bound, target, selector in blur_passes:
            tex.set_address_2d(bound, desc, 16 * dim.astride)
            launch2("full_blur", self.mod, stream, dim,
                    target, i32(selector), i32(0), texrefs=[tex])

        launch2("smearclip", self.mod, stream, dim,
                fb.d_front, fb.d_side, f32(gam - 1), lin, lingam)
Пример #13
0
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        """Run the gamma, four-step 'full_blur' and 'smearclip' launch sequence.

        ``calc_lingam(gprof.filters.colorclip, tc)`` supplies the gamma
        values; ``params.width(tc)`` is handed to ``set_blur_width``. The
        blur launches ping-pong between ``fb.d_left`` and ``fb.d_back``, with
        the first integer argument taking the values 2, 3, 0 and 1 in turn.
        'smearclip' is launched last with ``fb.d_front`` and ``fb.d_left``.
        """
        gam, lin, lingam = calc_lingam(gprof.filters.colorclip, tc)
        dsc = mkdsc(dim, 4)
        tref = mktref(self.mod, 'chan4_src')

        def blur_pass(bound, target, selector):
            # Bind `bound` to the texture reference, then launch 'full_blur'
            # with `target` as its buffer argument.
            tref.set_address_2d(bound, dsc, 16 * dim.astride)
            launch2('full_blur', self.mod, stream, dim, target,
                    i32(selector), i32(0), texrefs=[tref])

        set_blur_width(self.mod, fb.pool, params.width(tc), stream)
        launch2('apply_gamma_full_hi', self.mod, stream, dim,
                fb.d_left, fb.d_front, f32(gam - 1))

        blur_pass(fb.d_left, fb.d_back, 2)
        blur_pass(fb.d_back, fb.d_left, 3)
        blur_pass(fb.d_left, fb.d_back, 0)
        blur_pass(fb.d_back, fb.d_left, 1)

        launch2('smearclip', self.mod, stream, dim,
                fb.d_front, fb.d_left, f32(gam - 1), lin, lingam)
Пример #14
0
    def _interp(self, rdr, gnm, dim, ts, td):
        """Upload ``dim`` and launch the two interpolation steps on ``stream_a``.

        1. ``dim`` is copied into a u32 array allocated from ``self.fb.pool``
           and sent asynchronously to the 'acc_size' global of ``rdr.mod``.
        2. The 'flatpal' surface reference of ``self.mod`` is pointed at
           ``self.info_a.d_pal_array`` and 'interp_palette_flat' is launched.
        3. 'interp_iter_params' is launched from ``rdr.mod``.

        ``ts`` is passed to both launches as f32; ``td`` is divided by the
        palette height for the first launch and by the temporal sample count
        for the second. ``gnm`` is not read by this method.
        """
        info, src, stream = self.info_a, self.src_a, self.stream_a

        acc_size_ptr = rdr.mod.get_global('acc_size')[0]
        dim_buf = self.fb.pool.allocate((len(dim), ), u32)
        dim_buf[:] = dim
        cuda.memcpy_htod_async(acc_size_ptr, dim_buf, stream)

        pal_surf = self.mod.get_surfref('flatpal')
        pal_surf.set_array(info.d_pal_array, 0)
        pal_height = info.palette_height
        launch('interp_palette_flat', self.mod, stream, 256, pal_height,
               self.fb.d_rb, self.fb.d_seeds, src.d_ptimes, src.d_pals,
               f32(ts), f32(td / pal_height))

        nts = info.ntemporal_samples
        launch('interp_iter_params', rdr.mod, stream, 256,
               np.ceil(nts / 256.), info.d_params, src.d_times, src.d_knots,
               f32(ts), f32(td / nts), i32(nts))
Пример #15
0
    def _iter(self, rdr, gnm, gprof, dim, tc):
        """Fill the working buffers, then alternate 'iter' / 'flush_atom'
        launches across two streams until all rounds are issued.

        The round count is derived from ``gprof.spp(tc) * dim.w * dim.h`` and
        ``self.info_a.ntemporal_samples``. Each loop iteration launches up to
        ``block_size`` rounds of 'iter' on one stream, waits on an event
        recorded on the other stream, launches 'flush_atom', calls
        ``self.fb.flip_side()`` and swaps the two streams. ``block_size``
        starts at 4 and grows by half of itself each iteration.

        ``gnm`` is not read by this method.
        """
        # Point the 'flatpal' surface reference at the palette array
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride
        # Shorthand for util.fill_dptr on stream_a; the default value is i32(0)
        fill = lambda b, s, v=i32(0): util.fill_dptr(
                self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front,  4 * nbins)
        fill(self.fb.d_left,   4 * nbins)
        fill(self.fb.d_right,  4 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
        fill(self.fb.d_uleft,  nbins / 2)
        fill(self.fb.d_uright, nbins / 2)

        nts = self.info_a.ntemporal_samples
        nsamps = (gprof.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1

        # Split the launch into multiple rounds, to prevent a system on older
        # GPUs from locking up and to give us a chance to flush some stuff.
        hidden_stream = cuda.Stream()
        iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
        block_size = 4

        while nrounds:
          # Number of rounds issued in this iteration
          n = min(nrounds, block_size)
          # Time the launch call itself to detect when it starts blocking
          now = time.time()
          launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
                 self.fb.d_front, self.fb.d_left,
                 self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
                 self.fb.d_uleft, self.info_a.d_params)
          delta = time.time() - now
          if delta > 0.1:
            # More than 100ms passed attempting to launch. The GPU is likely
            # out of queued execution resources on a long render, and scheduling
            # additional work will just keep spinning the CPU at 100%.
            # Do a blocking sync to free up resources. This may slightly reduce
            # parallelism but makes it a whole heck of a lot easier to keep
            # using the computer while things render.
            print >> sys.stderr, 'Launches became blocking, synchronizing'
            iter_stream_right.synchronize()

          # Make sure the other stream is done flushing before we start
          iter_stream_left.wait_for_event(cuda.Event().record(iter_stream_right))

          launch('flush_atom', rdr.mod, iter_stream_left,
                  (16, 16, 1), (dim.astride / 16, dim.ah / 16),
                  u64(self.fb.d_front), u64(self.fb.d_left),
                  u64(self.fb.d_uleft), i32(nbins))

          # Flip the side buffers and swap the streams for the next iteration
          self.fb.flip_side()
          iter_stream_left, iter_stream_right = iter_stream_right, iter_stream_left
          nrounds -= n
          # Grow the group size by half of its current value
          block_size += block_size / 2

        # Always wait on all events in the hidden stream before continuing on A
        self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
Пример #16
0
    def _iter(self, rdr, gnm, gprof, dim, tc):
        """Fill the working buffers, then alternate 'iter' / 'flush_atom'
        launches across two streams until all rounds are issued.

        The round count is derived from ``gprof.spp(tc) * dim.w * dim.h`` and
        ``self.info_a.ntemporal_samples``. Each loop iteration launches up to
        ``block_size`` rounds of 'iter' on one stream, waits on an event
        recorded on the other stream, launches 'flush_atom', calls
        ``self.fb.flip_side()`` and swaps the two streams. ``block_size``
        starts at 4 and grows by half of itself (floored) each iteration.

        Buffer sizes, grid dimensions and ``block_size`` use floor division so
        they remain integers whether or not true division is in effect.

        ``gnm`` is not read by this method.
        """
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride

        def fill(b, s, v=i32(0)):
            # Shorthand for util.fill_dptr on stream_a
            return util.fill_dptr(
                self.mod, b, s, stream=self.stream_a, value=v)

        fill(self.fb.d_front, 4 * nbins)
        fill(self.fb.d_left, 4 * nbins)
        fill(self.fb.d_right, 4 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points // 4, f32(np.nan))
        fill(self.fb.d_uleft, nbins // 2)
        fill(self.fb.d_uright, nbins // 2)

        nts = self.info_a.ntemporal_samples
        nsamps = (gprof.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1

        # Split the launch into multiple rounds, to prevent a system on older
        # GPUs from locking up and to give us a chance to flush some stuff.
        hidden_stream = cuda.Stream()
        iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
        block_size = 4

        while nrounds:
            n = min(nrounds, block_size)
            # Time the launch call itself to detect when it starts blocking
            now = time.time()
            launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
                   self.fb.d_front, self.fb.d_left, self.fb.d_rb,
                   self.fb.d_seeds, self.fb.d_points, self.fb.d_uleft,
                   self.info_a.d_params)
            delta = time.time() - now
            if delta > 0.1:
                # More than 100ms passed attempting to launch. The GPU is likely
                # out of queued execution resources on a long render, and scheduling
                # additional work will just keep spinning the CPU at 100%.
                # Do a blocking sync to free up resources. This may slightly reduce
                # parallelism but makes it a whole heck of a lot easier to keep
                # using the computer while things render.
                # Written via sys.stderr.write so the same text is emitted
                # whether `print` is a statement or a function.
                sys.stderr.write('Launches became blocking, synchronizing\n')
                iter_stream_right.synchronize()

            # Make sure the other stream is done flushing before we start
            iter_stream_left.wait_for_event(
                cuda.Event().record(iter_stream_right))

            launch('flush_atom', rdr.mod, iter_stream_left, (16, 16, 1),
                   (dim.astride // 16, dim.ah // 16), u64(self.fb.d_front),
                   u64(self.fb.d_left), u64(self.fb.d_uleft), i32(nbins))

            self.fb.flip_side()
            iter_stream_left, iter_stream_right = iter_stream_right, iter_stream_left
            nrounds -= n
            block_size += block_size // 2

        # Always wait on all events in the hidden stream before continuing on A
        self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
Пример #17
0
    def _interp(self, rdr, gnm, dim, ts, td):
        """Send ``dim`` to the 'acc_size' global and launch both interpolations.

        ``dim`` is staged in a u32 array from ``self.fb.pool`` and copied
        asynchronously on ``self.stream_a``. 'interp_palette_flat' is then
        launched from ``self.mod`` and 'interp_iter_params' from ``rdr.mod``,
        both on ``self.stream_a``.

        ``gnm`` is not read by this method.
        """
        stream = self.stream_a

        # Stage dim and copy it to the module global
        staged = self.fb.pool.allocate((len(dim),), u32)
        staged[:] = dim
        cuda.memcpy_htod_async(
            rdr.mod.get_global('acc_size')[0], staged, stream)

        self.mod.get_surfref('flatpal').set_array(self.info_a.d_pal_array, 0)

        palette_step = f32(td / self.info_a.palette_height)
        launch('interp_palette_flat', self.mod, stream,
               256, self.info_a.palette_height,
               self.fb.d_rb, self.fb.d_seeds,
               self.src_a.d_ptimes, self.src_a.d_pals,
               f32(ts), palette_step)

        nts = self.info_a.ntemporal_samples
        grid = np.ceil(nts / 256.)
        launch('interp_iter_params', rdr.mod, stream,
               256, grid,
               self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
               f32(ts), f32(td / nts), i32(nts))
Пример #18
0
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        """Launch 'logencode' with ``fb.d_back`` and ``fb.d_front``, then flip.

        The only scalar argument is ``params.degamma(tc)`` wrapped in ``f32``.
        ``fb.flip()`` is called after the launch. ``gprof`` is unused.
        """
        degamma_value = f32(params.degamma(tc))
        buffers = (fb.d_back, fb.d_front)

        launch2('logencode', self.mod, stream, dim,
                *(buffers + (degamma_value,)))
        fb.flip()
Пример #19
0
 def apply(self, fb, gprof, params, dim, tc, stream=None):
     """Launch 'plainclip' with ``fb.d_front`` and values taken from ``gprof``.

     Gamma values come from ``calc_lingam(gprof.filters.colorclip, tc)``; the
     last scalar is ``gprof.filters.plainclip.brightness(tc)`` as f32.
     ``params`` is not read by this method.
     """
     filters = gprof.filters
     gam, lin, lingam = calc_lingam(filters.colorclip, tc)

     gamma_minus_one = f32(gam - 1)
     brightness = f32(filters.plainclip.brightness(tc))
     launch2('plainclip', self.mod, stream, dim, fb.d_front,
             gamma_minus_one, lin, lingam, brightness)
Пример #20
0
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        """Run the 'logencode' launch and flip the buffers of ``fb``.

        The launch receives ``fb.d_back``, ``fb.d_front`` and the f32 value of
        ``params.degamma(tc)``. ``gprof`` is not read here.
        """
        launch2('logencode', self.mod, stream, dim,
                fb.d_back, fb.d_front, f32(params.degamma(tc)))
        fb.flip()
Пример #21
0
def calc_lingam(params, tc):
    """Return the f32 triple ``(gam, lin, lingam)`` for ``params`` at ``tc``.

    * ``gam`` is ``1 / params.gamma(tc)``.
    * ``lin`` is ``params.gamma_threshold(tc)``.
    * ``lingam`` is ``lin ** (gam - 1.0)`` when ``lin`` is positive,
      otherwise 0.
    """
    gam = f32(1 / params.gamma(tc))
    lin = f32(params.gamma_threshold(tc))

    if lin > 0:
        lingam = f32(lin**(gam - 1.0))
    else:
        # Non-positive threshold: skip the power and use zero
        lingam = f32(0)

    return gam, lin, lingam
Пример #22
0
def calc_lingam(params, tc):
    """Compute the gamma triple used by the clip launches.

    Returns ``(gam, lin, lingam)``, each wrapped in ``f32``: the reciprocal of
    ``params.gamma(tc)``, the value of ``params.gamma_threshold(tc)``, and
    ``lin ** (gam - 1.0)`` (or 0 when ``lin`` is not positive).
    """
    inv_gamma = f32(1 / params.gamma(tc))
    threshold = f32(params.gamma_threshold(tc))

    # Zero unless the threshold is strictly positive
    raw = 0
    if threshold > 0:
        raw = threshold ** (inv_gamma - 1.0)

    return inv_gamma, threshold, f32(raw)
Пример #23
0
 def apply(self, fb, gprof, params, dim, tc, stream=None):
     """Collect the 'plainclip' arguments from ``gprof`` and launch it.

     The launch receives ``fb.d_front``, then ``gam - 1`` as f32, ``lin`` and
     ``lingam`` (from ``calc_lingam(gprof.filters.colorclip, tc)``), and the
     f32 value of ``gprof.filters.plainclip.brightness(tc)``.
     ``params`` is unused.
     """
     gam, lin, lingam = calc_lingam(gprof.filters.colorclip, tc)

     clip_args = (
         f32(gam-1),
         lin,
         lingam,
         f32(gprof.filters.plainclip.brightness(tc)),
     )
     launch2('plainclip', self.mod, stream, dim, fb.d_front, *clip_args)