def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Queue the halo-clip kernel chain on ``stream``.

    Launch order: ``apply_gamma`` (given ``fb.d_left`` and ``fb.d_front``),
    two ``den_blur_1c`` passes that ping-pong between ``fb.d_left`` and
    ``fb.d_back`` through the ``chan1_src`` texture reference, and finally
    ``haloclip`` (given ``fb.d_front`` and ``fb.d_left``).

    ``params`` is not used by this method; the gamma value is read from
    ``gprof.filters.colorclip`` at time ``tc``.
    """
    gamma_exp = f32(1 / gprof.filters.colorclip.gamma(tc) - 1)
    desc = mkdsc(dim, 1)
    src_tex = mktref(self.mod, 'chan1_src')
    set_blur_width(self.mod, fb.pool, stream=stream)

    launch2('apply_gamma', self.mod, stream, dim,
            fb.d_left, fb.d_front, f32(0.1))

    # Third argument of set_address_2d; presumably 4 bytes per
    # single-channel element times the row stride -- TODO confirm.
    pitch = 4 * dim.astride

    # (buffer bound to the texture, buffer passed to the kernel, pattern)
    passes = (
        (fb.d_left, fb.d_back, 2),
        (fb.d_back, fb.d_left, 3),
    )
    for tex_buf, kernel_buf, pattern in passes:
        src_tex.set_address_2d(tex_buf, desc, pitch)
        launch2('den_blur_1c', self.mod, stream, dim,
                kernel_buf, i32(pattern), i32(0), texrefs=[src_tex])

    launch2('haloclip', self.mod, stream, dim,
            fb.d_front, fb.d_left, gamma_exp)
def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Queue one bilateral-filter pass per sampling direction.

    For every ``pattern`` in ``range(self.directions)`` this binds
    ``fb.d_front`` to the 4-channel texture, launches ``den_blur`` and
    ``den_blur_1c`` to build the single-channel texture input, launches
    ``bilateral`` (given ``fb.d_back``) and then calls ``fb.flip()`` so the
    next direction reads the freshly filtered buffer.

    All scalar kernel arguments are sampled from ``params`` at time ``tc``
    once, before the loop, because they do not depend on the direction.
    ``gprof`` is not used by this method.
    """
    # Pitch handed to set_address_2d for the 4-channel texture, and the
    # corresponding pitch for the single-channel one. Floor division keeps
    # the pitch an integer regardless of the division mode in effect.
    pitch = 16 * dim.astride
    grad_pitch = pitch // 4

    dsc = mkdsc(dim, 4)
    tref = mktref(self.mod, 'chan4_src')
    grad_dsc = mkdsc(dim, 1)
    grad_tref = mktref(self.mod, 'chan1_src')
    set_blur_width(self.mod, fb.pool, stream=stream)

    # Scale spatial parameter so that a "pixel" is equivalent to an
    # actual pixel at 1080p
    spatial_std = f32(params.spatial_std(tc) * dim.w / 1920.)
    color_std = f32(params.color_std(tc))
    density_std = f32(params.density_std(tc))
    density_pow = f32(params.density_pow(tc))
    gradient = f32(params.gradient(tc))
    radius = i32(self.radius)

    for pattern in range(self.directions):
        # fb.flip() at the end of each pass changes fb.d_front, so the
        # texture has to be re-bound on every iteration.
        tref.set_address_2d(fb.d_front, dsc, pitch)

        # Blur density two octaves along sampling vector, ultimately
        # storing in fb.d_left
        launch2('den_blur', self.mod, stream, dim,
                fb.d_back, i32(pattern), i32(0), texrefs=[tref])
        grad_tref.set_address_2d(fb.d_back, grad_dsc, grad_pitch)
        launch2('den_blur_1c', self.mod, stream, dim,
                fb.d_left, i32(pattern), i32(1), texrefs=[grad_tref])
        grad_tref.set_address_2d(fb.d_left, grad_dsc, grad_pitch)

        launch2('bilateral', self.mod, stream, dim,
                fb.d_back, i32(pattern), radius,
                spatial_std, color_std, density_std, density_pow, gradient,
                texrefs=[tref, grad_tref])
        fb.flip()
def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Launch the ``colorclip`` kernel on ``fb.d_front``.

    Kernel scalars are sampled from ``params`` at time ``tc``: vibrance,
    highlight power, and the ``(gam, lin, lingam)`` triple returned by
    ``calc_lingam``. ``gprof`` is not used by this method.
    """
    vibrance = f32(params.vibrance(tc))
    highlight_power = f32(params.highlight_power(tc))
    gamma_args = calc_lingam(params, tc)
    launch2('colorclip', self.mod, stream, dim, fb.d_front,
            vibrance, highlight_power, *gamma_args)
def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Log-scale in place.

    Launches the ``logscale`` kernel with ``fb.d_front`` passed as both
    buffer arguments, plus two f32 coefficients derived from
    ``params.brightness``, ``params.scale`` and ``gprof.spp`` at time ``tc``.
    """
    brightness_coeff = f32(params.brightness(tc) * 268 / 256)

    # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
    # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
    scale_squared = params.scale(tc) ** 2
    area = dim.h / (scale_squared * dim.w)
    density_coeff = f32(1.0 / (area * gprof.spp(tc)))

    launch2("logscale", self.mod, stream, dim,
            fb.d_front, fb.d_front, brightness_coeff, density_coeff)
def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Log-scale in place.

    ``fb.d_front`` is handed to the ``logscale`` kernel as both buffer
    arguments. ``k1`` comes from ``params.brightness(tc)``; ``k2`` is the
    reciprocal of ``area * gprof.spp(tc)``.
    """
    # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
    # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
    k1 = f32(params.brightness(tc) * 268 / 256)
    area = dim.h / (params.scale(tc) ** 2 * dim.w)
    samples_per_area = area * gprof.spp(tc)
    k2 = f32(1.0 / samples_per_area)

    kernel_args = (fb.d_front, fb.d_front, k1, k2)
    launch2('logscale', self.mod, stream, dim, *kernel_args)
def z_circular(self, t, t0, p, a, i, update_t=True):
    """Run the ``k_z_circular`` kernel over ``t`` and return the result.

    ``t`` is cast to f32 and uploaded into a fresh read-only buffer, an
    output buffer of the same byte size is created, the kernel is enqueued
    with global size ``t.shape``, and the output is copied back into
    ``self.z``, which is returned.

    ``t0``, ``p``, ``a`` and ``i`` are forwarded to the kernel as f32
    scalars.

    NOTE: ``update_t`` is accepted but currently ignored -- the buffers are
    re-created on every call (the original guard was ``if True: #update_t``).
    """
    times = t.astype(f32)

    # Always rebuilt, regardless of update_t (see docstring).
    self.t_buf = cl.Buffer(self.ctx, READ_ONLY | COPY_HOST_PTR,
                           hostbuf=times)
    self.z = np.zeros_like(times)
    self.z_buf = cl.Buffer(self.ctx, WRITE_ONLY, times.nbytes)

    kernel_args = (self.t_buf, f32(t0), f32(p), f32(a), f32(i), self.z_buf)
    self.k_z_circular(self.queue, times.shape, None, *kernel_args)

    cl.enqueue_copy(self.queue, self.z, self.z_buf)
    return self.z
def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Queue the halo-clip kernel chain on ``stream``.

    Sequence: ``apply_gamma`` (given ``fb.d_side`` and ``fb.d_front``), a
    ``den_blur_1c`` pass with ``fb.d_side`` bound to the ``chan1_src``
    texture and ``fb.d_back`` as kernel argument, a second pass with the two
    buffers swapped, then ``haloclip`` (given ``fb.d_front`` and
    ``fb.d_side``).

    The gamma value comes from ``gprof.filters.colorclip`` at time ``tc``;
    ``params`` is not used by this method.
    """
    mod = self.mod
    gam = f32(1 / gprof.filters.colorclip.gamma(tc) - 1)
    dsc = mkdsc(dim, 1)
    tref = mktref(mod, "chan1_src")
    set_blur_width(mod, fb.pool, stream=stream)

    launch2("apply_gamma", mod, stream, dim,
            fb.d_side, fb.d_front, f32(0.1))

    # First blur pass: texture reads fb.d_side, kernel is given fb.d_back.
    tref.set_address_2d(fb.d_side, dsc, 4 * dim.astride)
    launch2("den_blur_1c", mod, stream, dim,
            fb.d_back, i32(2), i32(0), texrefs=[tref])

    # Second blur pass: same thing with the two buffers exchanged.
    tref.set_address_2d(fb.d_back, dsc, 4 * dim.astride)
    launch2("den_blur_1c", mod, stream, dim,
            fb.d_side, i32(3), i32(0), texrefs=[tref])

    launch2("haloclip", mod, stream, dim, fb.d_front, fb.d_side, gam)
def _iter(self, rdr, gnm, gprof, dim, tc):
    """Reset the accumulation buffers, run ``iter`` in chunks, then flush.

    Everything is queued on ``self.stream_a``. The number of ``iter`` rounds
    is derived from ``gprof.spp(tc)``, the image size and the number of
    temporal samples; the rounds are issued in launches of at most
    ``BLOCK_SIZE`` rounds each, followed by one ``flush_atom`` launch.
    ``gnm`` is not used by this method.
    """
    pal_surf = rdr.mod.get_surfref('flatpal')
    pal_surf.set_array(self.info_a.d_pal_array, 0)

    nbins = dim.ah * dim.astride

    def fill(dptr, size, value=i32(0)):
        return util.fill_dptr(self.mod, dptr, size,
                              stream=self.stream_a, value=value)

    fill(self.fb.d_front, 4 * nbins)
    fill(self.fb.d_side, 2 * nbins)
    fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))

    nts = self.info_a.ntemporal_samples
    nsamps = gprof.spp(tc) * dim.w * dim.h
    nrounds = int(nsamps / (nts * 256. * 256)) + 1

    def launch_iter(n):
        # A zero-round launch is skipped entirely.
        if n != 0:
            launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
                   self.fb.d_front, self.fb.d_side,
                   self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
                   self.info_a.d_params)

    # Split the launch into multiple rounds, possibly (slightly) reducing
    # work overlap but avoiding stalls when working on a device with an
    # active X session. TODO: characterize performance impact, autodetect
    BLOCK_SIZE = 16
    full_blocks, leftover = divmod(nrounds, BLOCK_SIZE)
    for _ in range(full_blocks):
        launch_iter(BLOCK_SIZE)
    launch_iter(leftover)

    # Square grid of 256-thread blocks with at least nbins threads in total.
    nblocks = int(np.ceil(np.sqrt(nbins / 256.)))
    launch('flush_atom', self.mod, self.stream_a,
           256, (nblocks, nblocks),
           u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Queue the smear-clip kernel chain on ``stream``.

    Sequence: ``apply_gamma_full_hi`` (given ``fb.d_side`` and
    ``fb.d_front``), four ``full_blur`` passes with patterns 2, 3, 0, 1 that
    ping-pong between ``fb.d_side`` and ``fb.d_back`` via the ``chan4_src``
    texture reference, then ``smearclip`` (given ``fb.d_front`` and
    ``fb.d_side``).

    The blur width is ``params.width(tc)``; the gamma triple comes from
    ``gprof.filters.colorclip``.
    """
    gam, lin, lingam = calc_lingam(gprof.filters.colorclip, tc)
    gam_minus_one = f32(gam - 1)
    dsc = mkdsc(dim, 4)
    tref = mktref(self.mod, "chan4_src")
    set_blur_width(self.mod, fb.pool, params.width(tc), stream)

    launch2("apply_gamma_full_hi", self.mod, stream, dim,
            fb.d_side, fb.d_front, gam_minus_one)

    # Each pass binds `tex_buf` to the texture and gives `out_buf` to the
    # kernel; the roles swap after every pass, so after four passes the
    # last kernel buffer argument was fb.d_side.
    tex_buf, out_buf = fb.d_side, fb.d_back
    for pattern in (2, 3, 0, 1):
        tref.set_address_2d(tex_buf, dsc, 16 * dim.astride)
        launch2("full_blur", self.mod, stream, dim,
                out_buf, i32(pattern), i32(0), texrefs=[tref])
        tex_buf, out_buf = out_buf, tex_buf

    launch2("smearclip", self.mod, stream, dim,
            fb.d_front, fb.d_side, gam_minus_one, lin, lingam)
def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Queue the smear-clip kernel chain on ``stream``.

    ``apply_gamma_full_hi`` is launched with ``fb.d_left`` and
    ``fb.d_front``; four ``full_blur`` passes (patterns 2, 3, 0, 1) then
    alternate between ``fb.d_left`` and ``fb.d_back``; ``smearclip`` is
    launched last with ``fb.d_front`` and ``fb.d_left``.

    The blur width is ``params.width(tc)``; the gamma triple comes from
    ``gprof.filters.colorclip``.
    """
    gam, lin, lingam = calc_lingam(gprof.filters.colorclip, tc)
    dsc = mkdsc(dim, 4)
    tref = mktref(self.mod, 'chan4_src')
    set_blur_width(self.mod, fb.pool, params.width(tc), stream)

    def blur_pass(tex_buf, kernel_buf, pattern):
        # Bind tex_buf to the 4-channel texture, then run one blur launch
        # with kernel_buf as the kernel's buffer argument.
        tref.set_address_2d(tex_buf, dsc, 16 * dim.astride)
        launch2('full_blur', self.mod, stream, dim,
                kernel_buf, i32(pattern), i32(0), texrefs=[tref])

    launch2('apply_gamma_full_hi', self.mod, stream, dim,
            fb.d_left, fb.d_front, f32(gam - 1))

    blur_pass(fb.d_left, fb.d_back, 2)
    blur_pass(fb.d_back, fb.d_left, 3)
    blur_pass(fb.d_left, fb.d_back, 0)
    blur_pass(fb.d_back, fb.d_left, 1)

    launch2('smearclip', self.mod, stream, dim,
            fb.d_front, fb.d_left, f32(gam - 1), lin, lingam)
def _interp(self, rdr, gnm, dim, ts, td):
    """Upload ``dim`` and queue the two interpolation kernels.

    On ``self.stream_a`` this copies ``dim`` (as u32 values) into the
    ``acc_size`` global of ``rdr.mod``, binds the palette array to the
    ``flatpal`` surface reference of ``self.mod``, then launches
    ``interp_palette_flat`` and ``interp_iter_params`` for the interval
    starting at ``ts`` with duration ``td``. ``gnm`` is not used here.
    """
    stream = self.stream_a
    info = self.info_a
    src = self.src_a

    # get_global returns a tuple; element 0 is used as the copy target.
    acc_size_ptr = rdr.mod.get_global('acc_size')[0]
    host_dim = self.fb.pool.allocate((len(dim), ), u32)
    host_dim[:] = dim
    cuda.memcpy_htod_async(acc_size_ptr, host_dim, stream)

    pal_surf = self.mod.get_surfref('flatpal')
    pal_surf.set_array(info.d_pal_array, 0)

    # One block per palette row; the time step is td split across the rows.
    launch('interp_palette_flat', self.mod, stream,
           256, info.palette_height,
           self.fb.d_rb, self.fb.d_seeds,
           src.d_ptimes, src.d_pals,
           f32(ts), f32(td / info.palette_height))

    # Enough 256-thread blocks to cover every temporal sample.
    nts = info.ntemporal_samples
    launch('interp_iter_params', rdr.mod, stream,
           256, np.ceil(nts / 256.),
           info.d_params, src.d_times, src.d_knots,
           f32(ts), f32(td / nts), i32(nts))
def _iter(self, rdr, gnm, gprof, dim, tc):
    """Reset the accumulators and run ``iter``/``flush_atom`` on two streams.

    The total number of rounds is derived from ``gprof.spp(tc)``, the image
    size and the number of temporal samples. Rounds are issued in chunks
    whose size starts at 4 and grows by half after each chunk. Every chunk
    is an ``iter`` launch followed by a ``flush_atom`` launch on the same
    stream; the stream used alternates between ``self.stream_a`` and a
    private stream, and ``self.fb.flip_side()`` is called after each chunk.
    ``gnm`` is not used by this method.
    """
    pal_surf = rdr.mod.get_surfref('flatpal')
    pal_surf.set_array(self.info_a.d_pal_array, 0)

    nbins = dim.ah * dim.astride

    def fill(dptr, size, value=i32(0)):
        return util.fill_dptr(self.mod, dptr, size,
                              stream=self.stream_a, value=value)

    fill(self.fb.d_front, 4 * nbins)
    fill(self.fb.d_left, 4 * nbins)
    fill(self.fb.d_right, 4 * nbins)
    fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
    fill(self.fb.d_uleft, nbins / 2)
    fill(self.fb.d_uright, nbins / 2)

    nts = self.info_a.ntemporal_samples
    nsamps = gprof.spp(tc) * dim.w * dim.h
    rounds_left = int(nsamps / (nts * 256. * 256)) + 1

    # Split the launch into multiple rounds, to prevent a system on older
    # GPUs from locking up and to give us a chance to flush some stuff.
    hidden_stream = cuda.Stream()
    active, idle = self.stream_a, hidden_stream
    block_size = 4

    while rounds_left:
        n = min(rounds_left, block_size)

        started = time.time()
        launch('iter', rdr.mod, active, (32, 8, 1), (nts, n),
               self.fb.d_front, self.fb.d_left,
               self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
               self.fb.d_uleft, self.info_a.d_params)
        elapsed = time.time() - started

        if elapsed > 0.1:
            # More than 100ms passed attempting to launch. The GPU is likely
            # out of queued execution resources on a long render, and
            # scheduling additional work will just keep spinning the CPU at
            # 100%. Do a blocking sync to free up resources. This may
            # slightly reduce parallelism but makes it a whole heck of a lot
            # easier to keep using the computer while things render.
            print >> sys.stderr, 'Launches became blocking, synchronizing'
            idle.synchronize()

        # Make sure the other stream is done flushing before we start
        other_done = cuda.Event().record(idle)
        active.wait_for_event(other_done)

        launch('flush_atom', rdr.mod, active,
               (16, 16, 1), (dim.astride / 16, dim.ah / 16),
               u64(self.fb.d_front), u64(self.fb.d_left),
               u64(self.fb.d_uleft), i32(nbins))

        self.fb.flip_side()
        active, idle = idle, active
        rounds_left -= n
        block_size += block_size / 2

    # Always wait on all events in the hidden stream before continuing on A
    self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
def _iter(self, rdr, gnm, gprof, dim, tc):
    """Clear the buffers, then alternate ``iter``/``flush_atom`` chunks.

    Work is split into chunks of rounds (initial size 4, increased by half
    of its current value after every chunk). Each chunk launches ``iter``
    and then ``flush_atom`` on one stream, calls ``self.fb.flip_side()``,
    and hands over to the other of two streams (``self.stream_a`` and a
    stream created here). Before returning, ``self.stream_a`` is made to
    wait on the created stream. ``gnm`` is not used by this method.
    """
    fb = self.fb
    surf = rdr.mod.get_surfref('flatpal')
    surf.set_array(self.info_a.d_pal_array, 0)

    nbins = dim.ah * dim.astride
    zero = i32(0)

    # (device pointer, size argument, fill value), in the order queued.
    clears = (
        (fb.d_front, 4 * nbins, zero),
        (fb.d_left, 4 * nbins, zero),
        (fb.d_right, 4 * nbins, zero),
        (fb.d_points, fb._len_d_points / 4, f32(np.nan)),
        (fb.d_uleft, nbins / 2, zero),
        (fb.d_uright, nbins / 2, zero),
    )
    for dptr, size, value in clears:
        util.fill_dptr(self.mod, dptr, size,
                       stream=self.stream_a, value=value)

    nts = self.info_a.ntemporal_samples
    nsamps = (gprof.spp(tc) * dim.w * dim.h)
    nrounds = int(nsamps / (nts * 256. * 256)) + 1

    # Split the launch into multiple rounds, to prevent a system on older
    # GPUs from locking up and to give us a chance to flush some stuff.
    hidden_stream = cuda.Stream()
    this_stream, that_stream = self.stream_a, hidden_stream
    block_size = 4

    while nrounds:
        n = min(nrounds, block_size)

        t_before = time.time()
        launch('iter', rdr.mod, this_stream, (32, 8, 1), (nts, n),
               fb.d_front, fb.d_left,
               fb.d_rb, fb.d_seeds, fb.d_points,
               fb.d_uleft, self.info_a.d_params)
        launch_blocked = (time.time() - t_before) > 0.1

        if launch_blocked:
            # More than 100ms passed attempting to launch. The GPU is likely
            # out of queued execution resources on a long render, and
            # scheduling additional work will just keep spinning the CPU at
            # 100%. Do a blocking sync to free up resources. This may
            # slightly reduce parallelism but makes it a whole heck of a lot
            # easier to keep using the computer while things render.
            print >> sys.stderr, 'Launches became blocking, synchronizing'
            that_stream.synchronize()

        # Make sure the other stream is done flushing before we start
        this_stream.wait_for_event(cuda.Event().record(that_stream))

        flush_grid = (dim.astride / 16, dim.ah / 16)
        launch('flush_atom', rdr.mod, this_stream, (16, 16, 1), flush_grid,
               u64(fb.d_front), u64(fb.d_left), u64(fb.d_uleft), i32(nbins))

        fb.flip_side()
        this_stream, that_stream = that_stream, this_stream
        nrounds -= n
        block_size += block_size / 2

    # Always wait on all events in the hidden stream before continuing on A
    self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
def _interp(self, rdr, gnm, dim, ts, td):
    """Queue the palette and parameter interpolation on ``self.stream_a``.

    Steps, in order: copy ``dim`` as u32 values into the ``acc_size``
    global of ``rdr.mod``; bind ``self.info_a.d_pal_array`` to the
    ``flatpal`` surface reference; launch ``interp_palette_flat``; launch
    ``interp_iter_params``. ``ts`` is passed to both kernels unchanged and
    ``td`` is divided by the palette height and by the temporal sample count
    respectively. ``gnm`` is not used by this method.
    """
    # Stage `dim` in a buffer taken from the framebuffer pool and copy it
    # to the device asynchronously.
    d_acc_size = rdr.mod.get_global('acc_size')[0]
    staged = self.fb.pool.allocate((len(dim),), u32)
    staged[:] = dim
    cuda.memcpy_htod_async(d_acc_size, staged, self.stream_a)

    flatpal = self.mod.get_surfref('flatpal')
    flatpal.set_array(self.info_a.d_pal_array, 0)

    palette_rows = self.info_a.palette_height
    palette_step = f32(td / palette_rows)
    launch('interp_palette_flat', self.mod, self.stream_a,
           256, palette_rows,
           self.fb.d_rb, self.fb.d_seeds,
           self.src_a.d_ptimes, self.src_a.d_pals,
           f32(ts), palette_step)

    nts = self.info_a.ntemporal_samples
    grid = np.ceil(nts / 256.)
    launch('interp_iter_params', rdr.mod, self.stream_a,
           256, grid,
           self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
           f32(ts), f32(td / nts), i32(nts))
def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Launch ``logencode`` and flip the framebuffer.

    The kernel is given ``fb.d_back``, ``fb.d_front`` and
    ``params.degamma(tc)`` as an f32; ``fb.flip()`` is called afterwards.
    ``gprof`` is not used by this method.
    """
    launch2('logencode', self.mod, stream, dim,
            fb.d_back, fb.d_front, f32(params.degamma(tc)))
    fb.flip()
def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Launch the ``plainclip`` kernel on ``fb.d_front``.

    The gamma triple is taken from ``gprof.filters.colorclip`` and the
    brightness from ``gprof.filters.plainclip``, both at time ``tc``.
    ``params`` is not used by this method.
    """
    filters = gprof.filters
    gam, lin, lingam = calc_lingam(filters.colorclip, tc)
    gam_minus_one = f32(gam - 1)
    brightness = f32(filters.plainclip.brightness(tc))
    launch2('plainclip', self.mod, stream, dim, fb.d_front,
            gam_minus_one, lin, lingam, brightness)
def calc_lingam(params, tc):
    """Return the ``(gam, lin, lingam)`` gamma coefficients as f32 values.

    ``gam`` is the reciprocal of ``params.gamma(tc)`` and ``lin`` is
    ``params.gamma_threshold(tc)``. ``lingam`` is ``lin ** (gam - 1)`` when
    ``lin`` is positive (so that ``lingam * lin == lin ** gam``), and 0
    otherwise.
    """
    gam = f32(1 / params.gamma(tc))
    lin = f32(params.gamma_threshold(tc))
    if lin > 0:
        lingam = f32(lin ** (gam - 1.0))
    else:
        # A non-positive threshold yields a zero coefficient.
        lingam = f32(0)
    return gam, lin, lingam
def calc_lingam(params, tc):
    """Compute the gamma triple ``(gam, lin, lingam)`` at time ``tc``.

    Returns three f32 values: ``1 / params.gamma(tc)``, the value of
    ``params.gamma_threshold(tc)``, and the threshold raised to
    ``gam - 1`` (or 0 when the threshold is not positive).
    """
    inv_gamma = f32(1 / params.gamma(tc))
    threshold = f32(params.gamma_threshold(tc))
    # threshold ** (inv_gamma - 1) times threshold equals
    # threshold ** inv_gamma.
    coeff = threshold ** (inv_gamma - 1.0) if threshold > 0 else 0
    return inv_gamma, threshold, f32(coeff)
def apply(self, fb, gprof, params, dim, tc, stream=None):
    """Launch ``plainclip`` on ``fb.d_front``.

    Kernel scalars, in order: ``gam - 1``, ``lin`` and ``lingam`` from
    ``calc_lingam(gprof.filters.colorclip, tc)``, followed by
    ``gprof.filters.plainclip.brightness(tc)``. ``params`` is not used by
    this method.
    """
    gam, lin, lingam = calc_lingam(gprof.filters.colorclip, tc)
    scalars = (
        f32(gam - 1),
        lin,
        lingam,
        f32(gprof.filters.plainclip.brightness(tc)),
    )
    launch2('plainclip', self.mod, stream, dim, fb.d_front, *scalars)