Example #1
    def _iter(self, rdr, gnm, gprof, dim, tc):
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride
        fill = lambda b, s, v=i32(0): util.fill_dptr(
                self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front,  4 * nbins)
        fill(self.fb.d_side,   2 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))

        nts = self.info_a.ntemporal_samples
        nsamps = (gprof.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1

        def launch_iter(n):
            if n == 0: return
            launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
                    self.fb.d_front, self.fb.d_side,
                    self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
                    self.info_a.d_params)
        # Split the launch into multiple rounds, possibly (slightly) reducing
        # work overlap but avoiding stalls when working on a device with an
        # active X session. TODO: characterize performance impact, autodetect
        BLOCK_SIZE = 16
        for i in range(BLOCK_SIZE-1, nrounds, BLOCK_SIZE):
            launch_iter(BLOCK_SIZE)
        launch_iter(nrounds%BLOCK_SIZE)

        nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
        launch('flush_atom', self.mod, self.stream_a,
                256, (nblocks, nblocks),
                u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
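
All of these examples funnel kernel invocations through a small `launch` helper rather than calling PyCUDA kernels directly. Its definition is not shown on this page; the sketch below is a reconstruction inferred from the call sites, assuming it looks the kernel up by name, normalizes scalar block/grid specifications, and queues the call asynchronously on the given stream.

    import numpy as np

    def launch(name, mod, stream, block, grid, *args):
        # Inferred reconstruction, not the project's actual helper. Fetch the
        # kernel compiled into `mod` and queue it asynchronously on `stream`.
        fun = mod.get_function(name)
        # Scalar specs (e.g. a bare 256) are widened to the tuples PyCUDA
        # expects; np.ceil() grid sizes arrive as floats and are cast to int.
        if not isinstance(block, tuple):
            block = (block, 1, 1)
        if not isinstance(grid, tuple):
            grid = (grid, 1)
        block = tuple(int(b) for b in block)
        grid = tuple(int(g) for g in grid)
        fun(*args, block=block, grid=grid, stream=stream)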
Example #2
    def _iter(self, rdr, gnm, gprof, dim, tc):
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride
        fill = lambda b, s, v=i32(0): util.fill_dptr(
                self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front,  4 * nbins)
        fill(self.fb.d_left,   4 * nbins)
        fill(self.fb.d_right,  4 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
        fill(self.fb.d_uleft,  nbins / 2)
        fill(self.fb.d_uright, nbins / 2)

        nts = self.info_a.ntemporal_samples
        nsamps = (gprof.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1

        # Split the launch into multiple rounds, to prevent a system on older
        # GPUs from locking up and to give us a chance to flush some stuff.
        hidden_stream = cuda.Stream()
        iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
        block_size = 4

        while nrounds:
          n = min(nrounds, block_size)
          now = time.time()
          launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
                 self.fb.d_front, self.fb.d_left,
                 self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
                 self.fb.d_uleft, self.info_a.d_params)
          delta = time.time() - now
          if delta > 0.1:
            # More than 100ms passed attempting to launch. The GPU is likely
            # out of queued execution resources on a long render, and scheduling
            # additional work will just keep spinning the CPU at 100%.
            # Do a blocking sync to free up resources. This may slightly reduce
            # parallelism but makes it a whole heck of a lot easier to keep
            # using the computer while things render.
            print >> sys.stderr, 'Launches became blocking, synchronizing'
            iter_stream_right.synchronize()

          # Make sure the other stream is done flushing before we start
          iter_stream_left.wait_for_event(cuda.Event().record(iter_stream_right))

          launch('flush_atom', rdr.mod, iter_stream_left,
                  (16, 16, 1), (dim.astride / 16, dim.ah / 16),
                  u64(self.fb.d_front), u64(self.fb.d_left),
                  u64(self.fb.d_uleft), i32(nbins))

          self.fb.flip_side()
          iter_stream_left, iter_stream_right = iter_stream_right, iter_stream_left
          nrounds -= n
          block_size += block_size / 2

        # Always wait on all events in the hidden stream before continuing on A
        self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
Example #3
    def _iter(self, rdr, gnm, gprof, dim, tc):
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride
        fill = lambda b, s, v=i32(0): util.fill_dptr(
            self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front, 4 * nbins)
        fill(self.fb.d_left, 4 * nbins)
        fill(self.fb.d_right, 4 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
        fill(self.fb.d_uleft, nbins / 2)
        fill(self.fb.d_uright, nbins / 2)

        nts = self.info_a.ntemporal_samples
        nsamps = (gprof.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1

        # Split the launch into multiple rounds, to prevent a system on older
        # GPUs from locking up and to give us a chance to flush some stuff.
        hidden_stream = cuda.Stream()
        iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
        block_size = 4

        while nrounds:
            n = min(nrounds, block_size)
            now = time.time()
            launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
                   self.fb.d_front, self.fb.d_left, self.fb.d_rb,
                   self.fb.d_seeds, self.fb.d_points, self.fb.d_uleft,
                   self.info_a.d_params)
            delta = time.time() - now
            if delta > 0.1:
                # More than 100ms passed attempting to launch. The GPU is likely
                # out of queued execution resources on a long render, and scheduling
                # additional work will just keep spinning the CPU at 100%.
                # Do a blocking sync to free up resources. This may slightly reduce
                # parallelism but makes it a whole heck of a lot easier to keep
                # using the computer while things render.
                print >> sys.stderr, 'Launches became blocking, synchronizing'
                iter_stream_right.synchronize()

            # Make sure the other stream is done flushing before we start
            iter_stream_left.wait_for_event(
                cuda.Event().record(iter_stream_right))

            launch('flush_atom', rdr.mod, iter_stream_left, (16, 16, 1),
                   (dim.astride / 16, dim.ah / 16), u64(self.fb.d_front),
                   u64(self.fb.d_left), u64(self.fb.d_uleft), i32(nbins))

            self.fb.flip_side()
            iter_stream_left, iter_stream_right = iter_stream_right, iter_stream_left
            nrounds -= n
            block_size += block_size / 2

        # Always wait on all events in the hidden stream before continuing on A
        self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
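
Examples #2 and #3 ping-pong between two CUDA streams: one stream runs the next batch of `iter` launches while the other drains its `flush_atom` pass, and events act as the cross-stream ordering fence. A minimal, self-contained sketch of that pattern (the actual kernel launches are elided):

    import pycuda.autoinit
    import pycuda.driver as cuda

    stream_a, stream_b = cuda.Stream(), cuda.Stream()
    left, right = stream_a, stream_b

    for _ in range(8):
        # ... queue this round's iteration kernels on `left` ...
        # Do not start this round's flush until everything previously queued
        # on `right` has finished.
        left.wait_for_event(cuda.Event().record(right))
        # ... queue the flush kernel on `left` ...
        left, right = right, left

    # Rejoin: anything queued on stream_a later must wait for the hidden stream.
    stream_a.wait_for_event(cuda.Event().record(stream_b))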
Example #4
def launchC(name, mod, stream, dim, fb, *args):
    launch(
        name,
        mod,
        stream,
        (32, 8, 1),
        (int(np.ceil(dim.w / 32.0)), int(np.ceil(dim.h / 8.0))),
        fb.d_back,
        fb.d_front,
        i32(fb.gutter),
        i32(dim.w),
        i32(dim.astride),
        i32(dim.h),
        *args
    )
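
`launchC` specializes `launch` for full-frame pixel kernels: a fixed 32×8 thread block and a grid of ceil(w/32) × ceil(h/8) blocks, so every output pixel is covered by one thread, with the back/front framebuffers, gutter width and image dimensions passed ahead of any kernel-specific arguments. A hypothetical call site (the kernel name and trailing argument are illustrative, not taken from these examples):

    launchC('example_filter', rdr.mod, self.stream_a, dim, self.fb, f32(0.5))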
Example #5
 def launch_iter(n):
     if n == 0:
         return
     launch(
         "iter",
         rdr.mod,
         self.stream_a,
         (32, 8, 1),
         (nts, n),
         self.fb.d_front,
         self.fb.d_side,
         self.fb.d_rb,
         self.fb.d_seeds,
         self.fb.d_points,
         self.info_a.d_params,
     )
Example #6
    def _interp(self, rdr, gnm, dim, ts, td):
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.fb.pool.allocate((len(dim), ), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

        tref = self.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)
        launch('interp_palette_flat', self.mod, self.stream_a, 256,
               self.info_a.palette_height, self.fb.d_rb, self.fb.d_seeds,
               self.src_a.d_ptimes, self.src_a.d_pals, f32(ts),
               f32(td / self.info_a.palette_height))

        nts = self.info_a.ntemporal_samples
        launch('interp_iter_params', rdr.mod, self.stream_a, 256,
               np.ceil(nts / 256.), self.info_a.d_params, self.src_a.d_times,
               self.src_a.d_knots, f32(ts), f32(td / nts), i32(nts))
Example #7
    def _interp(self, rdr, gnm, dim, ts, td):
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.fb.pool.allocate((len(dim),), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

        tref = self.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)
        launch('interp_palette_flat', self.mod, self.stream_a,
                256, self.info_a.palette_height,
                self.fb.d_rb, self.fb.d_seeds,
                self.src_a.d_ptimes, self.src_a.d_pals,
                f32(ts), f32(td / self.info_a.palette_height))

        nts = self.info_a.ntemporal_samples
        launch('interp_iter_params', rdr.mod, self.stream_a,
                256, np.ceil(nts / 256.),
                self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
                f32(ts), f32(td / nts), i32(nts))
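
Examples #6 and #7 start by copying the accumulator dimensions into a module-level device global named `acc_size`. A minimal, self-contained sketch of that `get_global` / `memcpy_htod` pattern, using a toy module (the kernel and values below are illustrative, not taken from the renderer):

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __device__ unsigned int acc_size[4];
    __global__ void read_acc_size(unsigned int *out) {
        if (threadIdx.x < 4)
            out[threadIdx.x] = acc_size[threadIdx.x];
    }
    """)

    # get_global() returns (device pointer, size in bytes); copy host data in.
    d_acc_size = mod.get_global('acc_size')[0]
    dims = np.array([1920, 1088, 1920, 1080], dtype=np.uint32)  # illustrative values
    cuda.memcpy_htod(d_acc_size, dims)

    out = np.empty(4, dtype=np.uint32)
    mod.get_function('read_acc_size')(cuda.Out(out), block=(4, 1, 1), grid=(1, 1))
    print(out)  # -> [1920 1088 1920 1080]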
Example #8
def launchC(name, mod, stream, dim, fb, *args):
    launch(name, mod, stream, (32, 8, 1),
           (int(np.ceil(dim.w / 32.)), int(np.ceil(dim.h / 8.))),
           fb.d_back, fb.d_front, i32(fb.gutter), i32(dim.w), i32(dim.astride),
           i32(dim.h), *args)
Example #9
 def launch_iter(n):
     if n == 0: return
     launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
             self.fb.d_front, self.fb.d_side,
             self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
             self.info_a.d_params)