Example #1
    def set(self, ary, device=None):
        """
        copy host array to device.
        Arguments:
            ary: host array, needs to be contiguous
            device: device id, if not the one attached to current context
        Returns:
            self
        """
        stream = self.backend.stream
        assert ary.size == self.size
        assert self.is_contiguous, "Array in set() must be contiguous"
        if ary.dtype is not self.dtype:
            ary = ary.astype(self.dtype)
        assert ary.strides == self.strides

        if device is None:
            drv.memcpy_htod_async(self.gpudata, ary, stream)
        else:
            # with multithreaded datasets, make a context before copying
            # and destroy it again once done.
            lctx = drv.Device(device).make_context()
            drv.memcpy_htod_async(self.gpudata, ary, stream)
            lctx.pop()
            del lctx

        return self
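A minimal usage sketch (the `backend` object and its `empty()` helper are assumed for illustration and are not part of this excerpt). Note that `memcpy_htod_async` only overlaps with other work when the host array lives in page-locked memory; with an ordinary numpy array the copy is effectively synchronous.

    import numpy as np

    host = np.ascontiguousarray(np.random.rand(128, 64).astype(np.float32))
    dev_tensor = backend.empty((128, 64), dtype=np.float32)  # hypothetical allocator
    dev_tensor.set(host)              # enqueues the H2D copy on backend.stream
    backend.stream.synchronize()      # wait before reusing or freeing `host`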
Example #2
    def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{
        """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*,
        and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude
        vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

        Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have
        no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles.
        """

        # Set up lingo and count matrices on device #{{{
        if self.usePycudaArray:
            # Set up using PyCUDA CUDAArray support
            self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat,order='C')
            self.gpu.rcounts = cuda.matrix_to_array(refcountsmat,order='C')
            self.gpu.tex2lr.set_array(self.gpu.rsmiles)
            self.gpu.tex2cr.set_array(self.gpu.rcounts)
        else:
            # Manually handle setup
            temprlmat = self._padded_array(refsmilesmat)
            if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
                raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%temprlmat.shape)
            self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rsmiles,temprlmat,stream=self.gpu.stream)

            temprcmat = self._padded_array(refcountsmat)
            self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rcounts,temprcmat,stream=self.gpu.stream)

            descriptor = cuda.ArrayDescriptor()
            descriptor.width  = temprcmat.shape[1]
            descriptor.height = temprcmat.shape[0]
            descriptor.format = cuda.array_format.UNSIGNED_INT32
            descriptor.num_channels = 1
            self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles,descriptor,temprlmat.strides[0])
            self.gpu.tex2cr.set_address_2d(self.gpu.rcounts,descriptor,temprcmat.strides[0])
            self.gpu.stream.synchronize()
            del temprlmat
            del temprcmat
        #}}}

        self.rlengths = reflengths
        self.rshape = refsmilesmat.shape
        self.nref = refsmilesmat.shape[0]

        # Copy reference lengths to GPU
        self.gpu.rl_gpu = cuda.to_device(reflengths)

        # Allocate buffers for query set magnitudes
        self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
        if refmags is not None:
            cuda.memcpy_htod(self.gpu.rmag_gpu,refmags)
        else:
            # Calculate query set magnitudes on GPU
            magthreads = 256
            self.gpu.refMagKernel(self.gpu.rmag_gpu,self.gpu.rl_gpu,numpy.int32(self.nref),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cr])
        return
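A hedged sketch of the tiling the docstring calls for when the reference set exceeds the 32,768-row limit (`sim` stands for an instance of this class; the per-tile compute/collect step is only indicated and is not taken from this excerpt):

    MAX_REF_ROWS = 32768   # hardware limit on reference molecules per tile

    for start in range(0, refsmilesmat.shape[0], MAX_REF_ROWS):
        stop = start + MAX_REF_ROWS
        sim.set_refsmiles(refsmilesmat[start:stop],
                          refcountsmat[start:stop],
                          reflengths[start:stop])
        # ... launch the similarity kernel for this tile and collect its block of results ...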
Example #3
    def set_async(self, ary, stream=None):
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        assert self.flags.forc

        if not ary.flags.forc:
            raise RuntimeError("cannot asynchronously set from " "non-contiguous array")

        if self.size:
            drv.memcpy_htod_async(self.gpudata, ary, stream)
Example #4
 def set(self, tensor, data):
     assert isinstance(tensor, MGPUTensor)
     if tensor.ptype == 'replica':
         for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
             ctx.push()
             drv.memcpy_htod_async(dest.ptr, data, strm)
             ctx.pop()
         # tensor.copy_from(data)
     else:
         self.scatter(data, tensor)
Example #5
    def pre_execution(self, solver, stream=None):
        super(RimeEBeam, self).pre_execution(solver,stream)

        if stream is not None:
            cuda.memcpy_htod_async(
                self.rime_const_data[0],
                solver.const_data().ndary(),
                stream=stream)
        else:
            cuda.memcpy_htod(
                self.rime_const_data[0],
                solver.const_data().ndary())
Example #6
def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
	ctx1 = cuda.Device(dev1).make_context()
	a = cuda.from_device(int(a_gpu)+(nx-2)*ny*nof, (ny,), np.float32)
	ctx1.pop()

	ctx2 = cuda.Device(dev2).make_context()
	cuda.memcpy_htod(int(b_gpu), a)
	b = cuda.from_device(int(b_gpu)+ny*nof, (ny,), np.float32)
	ctx2.pop()

	ctx1 = cuda.Device(dev1).make_context()
	cuda.memcpy_htod_async(int(a_gpu)+(nx-1)*ny*nof, b)
	ctx1.pop()
Example #7
def kernel_write(function_name, dest_devptr, dest_info, source_devptr, source_info, work_range, stream=None):
	global KD

	# initialize variables
	global tb_cnt
	tb_cnt = 0

	# dest
	cuda_args = [dest_devptr]
	cuda_args += [dest_info]

	# source
	cuda_args += [source_devptr]
	cuda_args += [source_info]

	# work_range
	cuda_args += make_cuda_list(work_range)

	# initialize model view
	eye = numpy.eye(4,dtype=numpy.float32)
	cuda.memcpy_htod_async(mmtx, eye, stream=stream)
	cuda.memcpy_htod_async(inv_mmtx, eye, stream=stream)
	
	try:
		if Debug:
			print "Function name: ", function_name
		func = mod.get_function(function_name) #cutting function
	except:
		print "Function not found ERROR"
		print "Function name: ", function_name
		assert(False)
		
	# set work range
	block, grid = range_to_block_grid(work_range)

	if log_type in ['time', 'all']:
		st = time.time()

	func(*cuda_args, block=block, grid=grid, stream=stream)

	#ctx.synchronize()
	
	KD.append((dest_info, source_info))
	

	if log_type in ['time', 'all']:
		bytes = make_bytes(work_range,3)
		t = MPI.Wtime()-st
		ms = 1000*t
		bw = bytes/GIGA/t
		log("rank%d, GPU%d, , kernel write time, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec "%(rank, device_number, bytes/MEGA, ms, bw),'time', log_type)
Example #8
 def set_async(self, ary, stream=None):
     assert ary.ndim <= 3
     assert ary.dtype == self.dtype
     
     assert ary.size == self.size
     
     if ary.base.__class__ != cuda.HostAllocation:
             raise TypeError("asynchronous memory trasfer requires pagelocked numpy array")
             
     if self.size:
         if self.M == 1:
             cuda.memcpy_htod_async(self.gpudata, ary, stream)
         else:
             PitchTrans(self.shape, self.gpudata, self.ld, ary, _pd(self.shape), self.dtype, async = True, stream = stream)
Example #9
    def synchronize_isdone(self):
        """ Complete synchronization process. """

        # Use shorter, easier names for class variables.
        bufs = self._sync_buffers
        ptrs = self._sync_ptrs
        streams = self._sync_streams
        adj = self._sync_adj
        part2_start = self._sync_part2_start 
        is_done = [False, False, False, False]

        # Forward send.
        if streams[0].is_done(): # Device-to-host copy completed.
            if not part2_start[0]: # Initialize MPI send.
                comm.Isend(bufs[0], dest=adj['forw'], tag=self._sync_tags[0])
                part2_start[0] = True
                is_done[0] = True
            else: # No more work to do.
                is_done[0] = True

        # Backward send.
        if streams[1].is_done(): # Device-to-host copy completed.
            if not part2_start[1]: # Initialize MPI send.
                comm.Isend(bufs[1], dest=adj['back'], tag=self._sync_tags[1])
                part2_start[1] = True
                is_done[1] = True
            else: # No more work to do.
                is_done[1] = True

        # Forward receive.
        if self._sync_req_forw.Test(): # MPI receive completed.
            if not part2_start[2]: # Initialize host-to-device copy.
                drv.memcpy_htod_async(ptrs['back_dest'], bufs[2], \
                                        stream=streams[2]) # Host-to-device.
                part2_start[2] = True
            elif streams[2].is_done(): # Host-to-device copy completed.
                is_done[2] = True

        # Backward receive.
        if self._sync_req_back.Test(): # MPI receive completed.
            if not part2_start[3]: # Initialize host-to-device copy.
                drv.memcpy_htod_async(ptrs['forw_dest'], bufs[3], \
                                        stream=streams[3]) # Host-to-device.
                part2_start[3] = True
            elif streams[3].is_done(): # Host-to-device copy completed.
                is_done[3] = True
        # print '~', is_done[0:4],
        # Return true only when all four transfers are complete.
        return all(is_done) 
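The method is meant to be polled until it reports completion; a minimal calling sketch (the `grid` object and the call that starts the exchange are assumptions, not shown in this excerpt):

    grid.synchronize_start()              # hypothetical: enqueue D2H copies, post MPI receives
    while not grid.synchronize_isdone():
        pass                              # or overlap useful interior work here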
Example #10
    def load_data_on_gpu(tl_args, module):
        d_V = module.get_global('d_V')[0]
        cuda.memcpy_htod_async(d_V, tl_args.V)

        d_c = module.get_global('d_c')[0]
        cuda.memcpy_htod_async(d_c, tl_args.c)

        d_I = module.get_global('d_I')[0]
        cuda.memcpy_htod_async(d_I, tl_args.I)

        d_E = module.get_global('d_E')[0]
        cuda.memcpy_htod_async(d_E, tl_args.E)

        d_x_0 = module.get_global('d_x_0')[0]
        cuda.memcpy_htod_async(d_x_0, tl_args.x_0)
Example #11
    def execute(self, solver, stream=None):
        slvr = solver

        if stream is not None:
            cuda.memcpy_htod_async(
                self.rime_const_data[0],
                slvr.const_data().ndary(),
                stream=stream)
        else:
            cuda.memcpy_htod(
                self.rime_const_data[0],
                slvr.const_data().ndary())

        self.kernel(slvr.uvw, slvr.lm, slvr.frequency,
            slvr.B_sqrt, slvr.jones,
            stream=stream, **self.launch_params)
Example #12
    def test_register_host_memory(self):
        if drv.get_version() < (4,):
            from py.test import skip
            skip("register_host_memory only exists on CUDA 4.0 and later")

        import sys
        if sys.platform == "darwin":
            from py.test import skip
            skip("register_host_memory is not supported on OS X")

        a = drv.aligned_empty((2**20,), np.float64)
        a_pin = drv.register_host_memory(a)

        gpu_ary = drv.mem_alloc_like(a)
        stream = drv.Stream()
        drv.memcpy_htod_async(gpu_ary, a_pin, stream)
        drv.Context.synchronize()
Example #13
    def set_async(self, ary, stream=None):
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        if ary.strides != self.strides:
            from warnings import warn
            warn("Setting array from one with different strides/storage order. "
                    "This will cease to work in 2013.x.",
                    stacklevel=2)

        assert self.flags.forc

        if not ary.flags.forc:
            raise RuntimeError("cannot asynchronously set from "
                    "non-contiguous array")

        if self.size:
            drv.memcpy_htod_async(self.gpudata, ary, stream)
Example #14
            def run(self, scomp, scopy):
                # If we are unpacking then copy the host buffer to the GPU
                if op == 'unpack':
                    cuda.memcpy_htod_async(m.data, m.hdata, scopy)
                    event.record(scopy)
                    scomp.wait_for_event(event)

                # Call the CUDA kernel (pack or unpack)
                fn.prepared_async_call(grid, block, scomp, v.nrow, v.ncol,
                                       v.mapping, v.strides, m,
                                       v.mapping.leaddim, v.strides.leaddim,
                                       m.leaddim)

                # If we have been packing then copy the GPU buffer to the host
                if op == 'pack':
                    event.record(scomp)
                    scopy.wait_for_event(event)
                    cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
Example #15
    def set(self, ary):
        """
        copy host array to device.
        Arguments:
            ary: host array, needs to be contiguous
        Returns:
            self
        """
        stream = self.backend.stream
        assert ary.size == self.size
        assert self.is_contiguous, "Array in set() must be contiguous"
        if ary.dtype is not self.dtype:
            ary = ary.astype(self.dtype)
        assert ary.strides == tuple(self.dtype.itemsize*s for s in self.strides)

        drv.memcpy_htod_async(self.gpudata, ary, stream)

        return self
Example #16
    def _interp(self, rdr, gnm, dim, ts, td):
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.fb.pool.allocate((len(dim),), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

        tref = self.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)
        launch('interp_palette_flat', self.mod, self.stream_a,
                256, self.info_a.palette_height,
                self.fb.d_rb, self.fb.d_seeds,
                self.src_a.d_ptimes, self.src_a.d_pals,
                f32(ts), f32(td / self.info_a.palette_height))

        nts = self.info_a.ntemporal_samples
        launch('interp_iter_params', rdr.mod, self.stream_a,
                256, np.ceil(nts / 256.),
                self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
                f32(ts), f32(td / nts), i32(nts))
Example #17
    def execute(self, solver, stream=None):
        slvr = solver

        if stream is not None:
            cuda.memcpy_htod_async(
                self.rime_const_data[0],
                slvr.const_data().ndary(),
                stream=stream)
        else:
            cuda.memcpy_htod(
                self.rime_const_data[0],
                slvr.const_data().ndary())

        self.kernel(slvr.lm, slvr.parallactic_angles,
            slvr.point_errors, slvr.antenna_scaling, slvr.frequency,
            slvr.E_beam, slvr.jones,
            slvr.beam_ll, slvr.beam_lm, slvr.beam_lfreq,
            slvr.beam_ul, slvr.beam_um, slvr.beam_ufreq,
            stream=stream, **self.launch_params)
Example #18
 def inference(self, img):
     # copy img to input memory
     # self.inputs[0]['host'] = np.ascontiguousarray(img)
     self.inputs[0]['host'] = np.ravel(img)
     # transfer data to the gpu
     for inp in self.inputs:
         cuda.memcpy_htod_async(inp['device'], inp['host'], self.stream)
     # run inference
     start = time.time()
     self.context.execute_async_v2(
             bindings=self.bindings,
             stream_handle=self.stream.handle)
     end = time.time()
     #print('execution time:', end-start)
     # fetch outputs from gpu
     for out in self.outputs:
         cuda.memcpy_dtoh_async(out['host'], out['device'], self.stream)
     # synchronize stream
     self.stream.synchronize()
     return [out['host'] for out in self.outputs]
Example #19
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
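A sketch of how this helper is usually driven in the TensorRT sample layout, where `inputs` and `outputs` are lists of HostDeviceMem pairs built by an `allocate_buffers()` helper (both assumed to exist alongside this function, as in the samples):

    import numpy as np

    inputs, outputs, bindings, stream = allocate_buffers(engine)  # assumed helper
    with engine.create_execution_context() as context:
        np.copyto(inputs[0].host, preprocessed.ravel())           # stage the input
        results = do_inference(context, bindings, inputs, outputs, stream)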
Example #20
def main():

    # initialize TensorRT engine and parse ONNX model
    engine, context = build_engine(ONNX_FILE_PATH)

    # get sizes of input and output and allocate memory required for input data and for output data
    for binding in engine:
        if engine.binding_is_input(binding):  # we expect only one input
            input_shape = engine.get_binding_shape(binding)
            input_size = trt.volume(
                input_shape) * engine.max_batch_size * np.dtype(
                    np.float32).itemsize  # in bytes
            device_input = cuda.mem_alloc(input_size)
        else:  # and one output
            output_shape = engine.get_binding_shape(binding)
            # create page-locked memory buffers (i.e. won't be swapped to disk)
            host_output = cuda.pagelocked_empty(trt.volume(output_shape) *
                                                engine.max_batch_size,
                                                dtype=np.float32)
            device_output = cuda.mem_alloc(host_output.nbytes)

    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()

    # preprocess input data
    host_input = np.array(preprocess_image("input.jpeg").numpy(),
                          dtype=np.float32,
                          order='C')
    cuda.memcpy_htod_async(device_input, host_input, stream)

    # run inference
    context.execute_async(bindings=[int(device_input),
                                    int(device_output)],
                          stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    stream.synchronize()

    # postprocess results
    output_data = torch.Tensor(host_output).reshape(engine.max_batch_size,
                                                    output_shape[1])
    postprocess(output_data)
Example #21
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width):
   """
   This is the function to run the inference
   Args:
      engine : The TensorRT engine
      pics_1 : Input images to the model.
      h_input_1: Input in the host
      d_input_1: Input in the device
      h_output: Output in the host
      d_output: Output in the device
      stream: CUDA stream
      batch_size : Batch size for execution time
      height: Height of the output image
      width: Width of the output image
   
   Output:
      The list of output images

   """
   print('load images to buffer')
   load_images_to_buffer(pics_1, h_input_1)

   with engine.create_execution_context() as context:
       context.debug_sync = False
       # Transfer input data to the GPU.
       cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

       # Run inference.
       print('load profiler')
       context.profiler = trt.Profiler()
       print('execute')
       context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)])
       print('Transfer predictions back from the GPU.')
       # Transfer predictions back from the GPU.
       cuda.memcpy_dtoh_async(h_output, d_output, stream)
       # Synchronize the stream
       stream.synchronize()
       # Return the host output.
       print(h_output.shape)
       out = h_output.reshape((1,-1))
       return out 
Example #22
    def loadONNX2TensorRT(self, filepath):
        '''
        Build a TensorRT runtime engine from an ONNX file
        :param filepath: path to the ONNX file
        '''

        engine = self.ONNX_build_engine(filepath)

        # Load the test set
        datas = DataLoaders()
        test_loader = datas.testDataLoader()
        img, target = next(iter(test_loader))
        img = img.numpy()
        target = target.numpy()

        img = img.ravel()

        context = engine.create_execution_context()
        output = np.empty((100, 10), dtype=np.float32)

        # Allocate memory
        d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
        d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)
        bindings = [int(d_input), int(d_output)]

        # PyCUDA buffer handling
        stream = cuda.Stream()
        # Copy input data to the device
        cuda.memcpy_htod_async(d_input, img, stream)
        # Execute the model
        context.execute_async(100, bindings, stream.handle, None)
        # Copy predictions back from the buffer
        cuda.memcpy_dtoh_async(output, d_output, stream)
        # Synchronize the stream
        stream.synchronize()

        print("Test Case: " + str(target))
        print("Prediction 100: " + str(np.argmax(output, axis=1)))

        del context
        del engine
Example #23
    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        import numpy
        shape = (32,8)
        a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        a[:] = numpy.random.randn(*shape)
        b[:] = numpy.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(
                drv.Out(dest), a_gpu, b_gpu,
                block=shape+(1,), stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest-a*b) == 0
Example #24
def trt_inference(stream, trt_ctx, d_input, d_output, input_signal,
                  input_signal_length):
    print("infer with shape: {}".format(input_signal.shape))

    trt_ctx.set_binding_shape(0, input_signal.shape)
    assert trt_ctx.all_binding_shapes_specified

    h_output = cuda.pagelocked_empty(tuple(trt_ctx.get_binding_shape(1)),
                                     dtype=np.float32)

    h_input_signal = cuda.register_host_memory(
        np.ascontiguousarray(input_signal.cpu().numpy().ravel()))
    cuda.memcpy_htod_async(d_input, h_input_signal, stream)
    trt_ctx.execute_async_v2(bindings=[int(d_input),
                                       int(d_output)],
                             stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()

    greedy_predictions = torch.tensor(h_output).argmax(dim=-1, keepdim=False)
    return greedy_predictions
Example #25
 def run_inference(self):
     """
     This function is generalized for multiple inputs/outputs.
     inputs and outputs are expected to be lists of HostDeviceMem objects.
     """
     # Transfer input data to the GPU.
     for inp in self.inputs:
         cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
     # Run inference.
     self.context.execute_async(
         batch_size=self.batch_size,
         bindings=self.bindings,
         stream_handle=self.stream.handle,
     )
     # Transfer predictions back from the GPU.
     for out in self.outputs:
         cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
     # Synchronize the stream
     self.stream.synchronize()
     # Return only the host outputs.
     return [out.host for out in self.outputs]
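The HostDeviceMem objects referred to in the docstring are not part of this excerpt; a minimal sketch of what such a pairing typically looks like (a page-locked host buffer plus a device allocation of the same size):

    import numpy as np
    import pycuda.driver as cuda

    class HostDeviceMem(object):
        def __init__(self, size, dtype=np.float32):
            self.host = cuda.pagelocked_empty(size, dtype)   # page-locked host buffer
            self.device = cuda.mem_alloc(self.host.nbytes)   # matching device buffer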
Example #26
def asy_cpy(a, a_gpu, auto_init_context=True):
    """Data transfer from host to device.
    
    Asynchronous transfer is used when auto_init_context is True; otherwise a
    normal (synchronous) transfer is performed.
    """

    import pycuda.driver as drv

    if auto_init_context:
        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)

        # Test correctness
        # ctx.synchronize()
        # b= numpy.zeros_like(a, a.dtype)
        # drv.memcpy_dtoh(b, a_gpu)
        # print numpy.allclose(a, b)
        return strm
    else:
        drv.memcpy_htod(a_gpu, a)
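A usage sketch (variable names are illustrative): the host array should be page-locked for the asynchronous path to actually overlap, and the returned stream must be synchronized before the data is consumed.

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as drv

    a = drv.pagelocked_empty((1024,), np.float32)   # page-locked so the copy can overlap
    a[:] = np.random.rand(1024).astype(np.float32)
    a_gpu = drv.mem_alloc(a.nbytes)

    strm = asy_cpy(a, a_gpu)     # returns the stream carrying the async copy
    strm.synchronize()           # wait before launching kernels that read a_gpu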
Example #27
    def scatter(self, hbuf, dbuf):
        '''
        Scatters the array data in hbuf to the multi-GPU tensor.
        Assumes that dbuf is M x N per device and hbuf is M x (N*k), where k
        is the number of replicas, and that hbuf and dbuf share the same dtype.
        '''
        assert hbuf.size == dbuf.size * dbuf.num_dev
        assert isinstance(dbuf, MGPUTensor)
        assert hbuf.dtype == dbuf.dtype
        ndata = dbuf.size
        starts = [i * ndata for i in range(self.num_dev)]

        for dest, strm, ctx, doff in zip(dbuf.tlist, self.strms, self.ctxs,
                                         starts):
            src = hbuf.reshape((hbuf.size))[doff:(doff + ndata)]
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, src, strm)
            ctx.pop()

        self.synchronize()
Example #29
    def test_register_host_memory(self):
        if drv.get_version() < (4, ):
            from py.test import skip
            skip("register_host_memory only exists on CUDA 4.0 and later")

        import sys
        if sys.platform == "darwin":
            from py.test import skip
            skip("register_host_memory is not supported on OS X")

        import resource

        a = drv.aligned_empty((2**20, ),
                              np.float64,
                              alignment=resource.getpagesize())
        a_pin = drv.register_host_memory(a)

        gpu_ary = drv.mem_alloc_like(a)
        stream = drv.Stream()
        drv.memcpy_htod_async(gpu_ary, a_pin, stream)
        drv.Context.synchronize()
Example #30
    def execute(self, *inputs):
        image = inputs[0]
        np.copyto(self.host_inputs[0], image.ravel())

        if self.cuda_ctx:
            self.cuda_ctx.push()
        cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0],
                               self.stream)
        self.context.execute_async(batch_size=1,
                                   bindings=self.bindings,
                                   stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(self.host_outputs[1], self.cuda_outputs[1],
                               self.stream)
        cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0],
                               self.stream)
        self.stream.synchronize()
        if self.cuda_ctx:
            self.cuda_ctx.pop()

        output = self.host_outputs[0]
        return output
Example #32
    def _enqueue_const_data_htod(self, subslvr, device_ptr):
        """
        Enqueue an async copy of the constant data array
        from the sub-solver into the constant memory buffer
        referenced by the device pointer.
        """
        # Get sub solver constant data array
        host_ary = subslvr.const_data().ndary()

        # Allocate pinned memory with same size
        pinned_ary = subslvr.pinned_mem_pool.allocate(
            shape=host_ary.shape, dtype=host_ary.dtype)

        # Copy into pinned memory
        pinned_ary[:] = host_ary

        # Enqueue the asynchronous transfer
        cuda.memcpy_htod_async(device_ptr, pinned_ary,
            stream=subslvr.stream)

        return pinned_ary
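The same pinned-staging pattern with plain PyCUDA objects (a standalone sketch, not the solver's actual API): stage the host data into a page-locked buffer, enqueue the async copy, and keep the pinned buffer alive until the stream has consumed it.

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda
    import pycuda.tools

    pool = pycuda.tools.PageLockedMemoryPool()
    stream = cuda.Stream()

    host_ary = np.arange(16, dtype=np.float32)            # ordinary pageable array
    pinned_ary = pool.allocate(host_ary.shape, host_ary.dtype)
    pinned_ary[:] = host_ary                              # copy into pinned memory

    device_ptr = cuda.mem_alloc(host_ary.nbytes)
    cuda.memcpy_htod_async(device_ptr, pinned_ary, stream=stream)
    stream.synchronize()                                  # pinned_ary must stay alive until here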
Example #33
    def inference(self, input_image):
        # threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.
        self.cfx.push()

        # Do image preprocess
        # batch_image_raw = []
        # batch_origin_h = []
        # batch_origin_w = []
        # batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        # for i, image_raw in enumerate(raw_image_generator):
        #     input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
        #     batch_image_raw.append(image_raw)
        #     batch_origin_h.append(origin_h)
        #     batch_origin_w.append(origin_w)
        #     np.copyto(batch_input_image[i], input_image)
        # input_image = np.ascontiguousarray(input_image)
        # Copy input image to host buffer
        self.inputs[0]['host'] = np.ravel(input_image)
        # Transfer input data  to the GPU.
        for inp in self.inputs:
            cuda.memcpy_htod_async(inp['device'], inp['host'], self.stream)
        # Run inference.
        self.context.execute_async_v2(bindings=self.bindings,
                                      stream_handle=self.stream.handle)
        # Transfer predictions back from the GPU.
        # fetch outputs from gpu
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out['host'], out['device'], self.stream)
        # Synchronize the stream
        self.stream.synchronize()
        # Remove any context from the top of the context stack, deactivating it.
        self.cfx.pop()
        # Here we use the first row of output since batch_size = 1
        outputs = [out['host'] for out in self.outputs]
        reshaped = []
        for output, shape in zip(outputs, self.output_shapes):
            reshaped.append(output.reshape(shape))
        # Do postprocess
        return reshaped
Example #34
        def inference(features):
            global h_output
            print("\nRunning Inference...")
            eval_start_time = time.time()

            # Copy inputs
            cuda.memcpy_htod_async(d_inputs[0], features["input_ids"], stream)
            cuda.memcpy_htod_async(d_inputs[1], features["segment_ids"], stream)
            cuda.memcpy_htod_async(d_inputs[2], features["input_mask"], stream)
            # Run inference
            context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
            # Transfer predictions back from GPU
            cuda.memcpy_dtoh_async(h_output, d_output, stream)
            # Synchronize the stream
            stream.synchronize()

            h_output = h_output.transpose((1,0,2,3,4))
            eval_time_elapsed = time.time() - eval_start_time

            print("------------------------")
            print("Running inference in {:.3f} Sentences/Sec".format(1.0/eval_time_elapsed))
            print("------------------------")

            for index, batch in enumerate(h_output):
                # Data Post-processing
                start_logits = batch[:, 0]
                end_logits = batch[:, 1]

                prediction, nbest_json, scores_diff_json = dp.get_predictions(doc_tokens, features,
                        start_logits, end_logits, args.n_best_size, args.max_answer_length)

                print("Processing output {:} in batch".format(index))
                print("Answer: '{}'".format(prediction))
                print("With probability: {:.3f}%".format(nbest_json[0]['probability'] * 100.0))
Example #35
    def infer(self, raw_image):
        self.ctx.push()
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        ori_shape = raw_image.shape
        batch_input_image = np.empty(
            shape=[self.batch_size, 3, self.input_h, self.input_w])
        input_image = self.preprocess_one(raw_image)
        np.copyto(batch_input_image, input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data  to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size,
                              bindings=bindings,
                              stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)
        cuda.memcpy_dtoh_async(host_outputs[2], cuda_outputs[2], stream)
        # Synchronize the stream
        stream.synchronize()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        # Here we use the first row of output since batch_size = 1
        output = host_outputs
        # Do postprocess
        bboxes = self.post_process(output, ori_shape)
        return bboxes
Example #36
    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule(
            """
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """
        )

        multiply_them = mod.get_function("multiply_them")

        shape = (32, 8)
        a = drv.pagelocked_zeros(shape, dtype=np.float32)
        b = drv.pagelocked_zeros(shape, dtype=np.float32)
        a[:] = np.random.randn(*shape)
        b[:] = np.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(drv.Out(dest), a_gpu, b_gpu, block=shape + (1,), stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest - a * b) == 0
Example #37
def predict_labels_for_batch(batch_data):
    global d_inputs, h_d_outputs, h_output, model_bindings, cuda_stream
    global num_labels, model_classes
    global trt_context
    global max_batch_size

    batch_size = len(batch_data)
    flat_float_batch = np.ravel(batch_data)

    begin_time = time.time()

    cuda.memcpy_htod_async(
        d_inputs[0], flat_float_batch,
        cuda_stream)  # assuming one input layer for image classification
    trt_context.execute_async(bindings=model_bindings,
                              batch_size=batch_size,
                              stream_handle=cuda_stream.handle)
    for output in h_d_outputs:
        cuda.memcpy_dtoh_async(output['host_mem'], output['dev_mem'],
                               cuda_stream)
    cuda_stream.synchronize()

    classification_time = time.time() - begin_time

    print("[batch of {}] inference={:.2f} ms".format(
        batch_size, classification_time * 1000))

    batch_results = np.split(
        h_output,
        max_batch_size)  # where each row is a softmax_vector for one sample

    if model_classes == 1:
        batch_predicted_labels = batch_results[:batch_size]
    else:
        batch_predicted_labels = [
            np.argmax(batch_results[k][-num_labels:])
            for k in range(batch_size)
        ]

    return batch_predicted_labels
Example #38
def run(nRow, nCol):
    print("test: nRow=%d,nCol=%d" % (nRow, nCol))
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger, nRow, nCol)
    if engine == None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    stream = cuda.Stream()

    data = np.full((nRow, nCol), 1, dtype=np.float32)  # uniform distribution
    #data = np.tile(np.arange(0,nCol,1,dtype=np.float32),[nRow,1])                                   # non-uniform distribution
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    outputH1 = np.empty(context.get_binding_shape(2),
                        dtype=trt.nptype(engine.get_binding_dtype(2)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)
    outputD1 = cuda.mem_alloc(outputH1.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    context.execute_async(
        1, [int(inputD0), int(outputD0),
            int(outputD1)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    cuda.memcpy_dtoh_async(outputH1, outputD1, stream)
    stream.synchronize()

    print("outputH0")
    print(
        np.shape(outputH0), "mean=%.2f,var=%.2f,max=%d,min=%d" %
        (np.mean(outputH0), np.var(outputH0), np.max(outputH0),
         np.min(outputH0)))
    print("outputH1")
    print(np.shape(outputH1), "mean=%.2f" % (np.mean(outputH1)))
Example #39
def model_infer(inputs, context, d_inputs, h_output0, h_output1, d_output0,
                d_output1, stream):
    input_ids = np.asarray(inputs["input_ids"], dtype=np.int32)
    attention_mask = np.asarray(inputs["attention_mask"], dtype=np.int32)
    token_type_ids = np.asarray(inputs["token_type_ids"], dtype=np.int32)

    # Copy inputs
    cuda.memcpy_htod_async(d_inputs[0], input_ids.ravel(), stream)
    cuda.memcpy_htod_async(d_inputs[1], attention_mask.ravel(), stream)
    cuda.memcpy_htod_async(d_inputs[2], token_type_ids.ravel(), stream)
    # start time
    start_time = time.time()
    # Run inference
    context.execute_async(bindings=[int(d_inp) for d_inp in d_inputs] +
                          [int(d_output0), int(d_output1)],
                          stream_handle=stream.handle)
    # Transfer predictions back from GPU
    cuda.memcpy_dtoh_async(h_output0, d_output0, stream)
    cuda.memcpy_dtoh_async(h_output1, d_output1, stream)
    # Synchronize the stream and take time
    stream.synchronize()
    # end time
    end_time = time.time()
    infer_time = end_time - start_time
    outputs = (h_output0, h_output1)
    # print(outputs)
    return outputs, infer_time
Example #40
 def predict(self, input_data):
     """
     predict with async api
     data -> cpu -> GPU -> cpu
     :param input_data:
     :param kwargs:
     :return:
     """
     if self.pre_processing_fn is not None:
         input_data = self.pre_processing_fn(input_data)
     if str(input_data.dtype) != self.input_dtype.__name__:
         logging.warning(
             'dtype of input data:{} is not compatible with engine input:{}, enforcing dtype conversion'
             .format(str(input_data.dtype), self.input_dtype.__name__))
         input_data = self.input_dtype(input_data)
     # input data -> cpu
     np.copyto(self.host_input, input_data.ravel())
     # cpu -> gpu
     cuda.memcpy_htod_async(self.cuda_input, self.host_input, self.stream)
     # Run inference. difference execution api by the way the engine built(implicit/explicit batch size)
     if self.trt_engine.has_implicit_batch_dimension:
         self.context.execute_async(
             bindings=[int(self.cuda_input),
                       int(self.cuda_output)],
             stream_handle=self.stream.handle)
     else:
         self.context.execute_async_v2(
             bindings=[int(self.cuda_input),
                       int(self.cuda_output)],
             stream_handle=self.stream.handle)
     # gpu -> cpu.
     cuda.memcpy_dtoh_async(self.host_output, self.cuda_output, self.stream)
     # Synchronize the stream
     self.stream.synchronize()
     output = self.host_output
     if self.post_processing_fn is not None:
         output = self.post_processing_fn(output)
     # Return the host output.
     return output
Example #41
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):

    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]

    context.execute_async(batch_size=batch_size,
                          bindings=bindings,
                          stream_handle=stream.handle)

    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]

    stream.synchronize()

    return [out.host for out in outputs]
Example #42
    def detect(self, img, conf_th=0.3):
        """Detect objects in the input image."""
        img_resized = _preprocess_trt(img, self.input_shape)
        np.copyto(self.host_inputs[0], img_resized.ravel())

        if self.cuda_ctx:
            self.cuda_ctx.push()
        cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0],
                               self.stream)
        self.context.execute_async(batch_size=1,
                                   bindings=self.bindings,
                                   stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(self.host_outputs[1], self.cuda_outputs[1],
                               self.stream)
        cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0],
                               self.stream)
        self.stream.synchronize()
        if self.cuda_ctx:
            self.cuda_ctx.pop()

        output = self.host_outputs[0]
        return _postprocess_trt(img, output, conf_th)
Example #43
def do_inference(context, h_input, d_input, h_output, d_output, stream):
    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # Run inference.
    context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Synchronize the stream
    stream.synchronize()

    # Additional test(chenrong06)
    n = 10
    d_in = []
    for i in range(n):
        p = cuda.mem_alloc(h_input.nbytes)
        cuda.memcpy_htod_async(p, h_input, stream)

        start = time.clock()
        context.execute_async(bindings=[int(p), int(d_output)], stream_handle=stream.handle)
        stream.synchronize()
        end = time.clock()
        print("Time used:", end - start)
Example #44
def run():
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)

    engine = buildEngine(logger)
    if engine == None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    stream = cuda.Stream()
    inputH0 = np.ascontiguousarray(np.random.rand(height, width).reshape(-1))
    inputH1 = np.ascontiguousarray(
        np.random.rand(8, height, width).reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    outputH0 = np.empty(context.get_binding_shape(2),
                        dtype=trt.nptype(engine.get_binding_dtype(2)))
    outputH1 = np.empty(context.get_binding_shape(3),
                        dtype=trt.nptype(engine.get_binding_dtype(3)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)
    outputD1 = cuda.mem_alloc(outputH1.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    stream.synchronize()
    context.execute_async(
        1, [int(inputD0),
            int(inputD1),
            int(outputD0),
            int(outputD1)], stream.handle)
    stream.synchronize()
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    cuda.memcpy_dtoh_async(outputH1, outputD1, stream)
    stream.synchronize()

    print(np.shape(outputH0), np.shape(outputH1))
Example #45
 def infer(self, image_raw):
     threading.Thread.__init__(self)
     # Make self the active context, pushing it on top of the context stack.
     self.cfx.push()
     # Restore
     stream = self.stream
     context = self.context
     engine = self.engine
     host_inputs = self.host_inputs
     cuda_inputs = self.cuda_inputs
     host_outputs = self.host_outputs
     cuda_outputs = self.cuda_outputs
     bindings = self.bindings
     print('ori_shape: ', image_raw.shape)
     # if image_raw is constant, image_raw.shape[1] != self.input_w
     w_ori, h_ori = image_raw.shape[1], image_raw.shape[0]
     # Do image preprocess
     input_image = self.preprocess_image(image_raw)
     # Copy input image to host buffer
     np.copyto(host_inputs[0], input_image.ravel())
     start = time.time()
     # Transfer input data to the GPU.
     cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
     # Run inference.
     context.execute_async(bindings=bindings, stream_handle=stream.handle)
     # Transfer predictions back from the GPU.
     cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
     # Synchronize the stream
     stream.synchronize()
     end = time.time()
     # Remove any context from the top of the context stack, deactivating it.
     self.cfx.pop()
      # Here we use the first row of output since batch_size = 1
     output = host_outputs[0]
     # Do postprocess
     output = output.reshape(self.input_h, self.input_w).astype('uint8')
     print('output_shape: ', output.shape)
     output = cv2.resize(output, (w_ori, h_ori))
     return output, end - start
Example #46
def run(inDim, outDatatype):
    print("test", inDim, outDatatype)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger, outDatatype)
    if engine == None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    context.set_binding_shape(0, inDim)
    context.set_binding_shape(1, inDim[:1])
    context.set_binding_shape(2, inDim[:1])
    #print("Bind0->",engine.get_binding_shape(0),context.get_binding_shape(0));
    #print("Bind1->",engine.get_binding_shape(1),context.get_binding_shape(1));
    #print("Bind2->",engine.get_binding_shape(2),context.get_binding_shape(2));
    print("All bind:", context.all_binding_shapes_specified)
    stream = cuda.Stream()

    data0 = np.full(inDim, 1, dtype=np.float32)
    data1 = np.random.randint(1, inDim[2], inDim[:1], dtype=np.int32)
    data2 = np.random.randint(1, inDim[3], inDim[:1], dtype=np.int32)
    inputH0 = np.ascontiguousarray(data0)
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputH1 = np.ascontiguousarray(data1)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    inputH2 = np.ascontiguousarray(data2)
    inputD2 = cuda.mem_alloc(inputH2.nbytes)
    outputH0 = np.empty(context.get_binding_shape(3),
                        dtype=trt.nptype(engine.get_binding_dtype(3)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    cuda.memcpy_htod_async(inputD2, inputH2, stream)
    context.execute_async_v2(
        [int(inputD0), int(inputD1),
         int(inputD2), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)

    stream.synchronize()

    outputH0CPU = mask2DCPU(inputH0, inputH1, inputH2, globalMask2DTrueValue,
                            globalMask2DFalseValue)

    #print("InputH0->",inputH0.shape, engine.get_binding_dtype(0))
    #print(inputH0)
    #print("InputH1->",inputH1.shape, engine.get_binding_dtype(1))
    #print(inputH1)
    #print("InputH2->",inputH2.shape, engine.get_binding_dtype(2))
    #print(inputH2)
    #print("OutputH0->",outputH0.shape, engine.get_binding_dtype(3))
    #print(outputH0)
    #print("OutputH0CPU->",outputH0CPU.shape)
    #print(outputH0CPU)
    print("Check result:",
          ["True" if np.all(outputH0 == outputH0CPU) else "False"][0])
Example #47
    def inference(self, img):
        """
        Detect objects in the input image.

        Args:
            img: uint8 numpy array with shape (img_height, img_width, channels)

        Returns:
            result: a list of dictionaries of the form [{"id": 0, "bbox": [x1, y1, x2, y2], "score": s%}, {...}, ...]
        """
        img_resized = self._preprocess_trt(img)
        # transfer the data to the GPU, run inference and the copy the results back
        np.copyto(self.host_inputs[0], img_resized.ravel())

        # Start inference time
        t_begin = time.perf_counter()
        cuda.memcpy_htod_async(
            self.cuda_inputs[0], self.host_inputs[0], self.stream)
        self.engine_context.execute_async(
            batch_size=1,
            bindings=self.bindings,
            stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(
            self.host_outputs[1], self.cuda_outputs[1], self.stream)
        cuda.memcpy_dtoh_async(
            self.host_outputs[0], self.cuda_outputs[0], self.stream)
        self.stream.synchronize()
        inference_time = time.perf_counter() - t_begin  # Seconds

        # Calculate Frames rate (fps)
        self.fps = convert_infr_time_to_fps(inference_time)
        output = self.host_outputs[0]
        boxes, scores, classes = self._postprocess_trt(img, output)
        result = []
        for i in range(len(boxes)):  # number of boxes
            if classes[i] == self.class_id + 1:
                result.append({"id": str(classes[i] - 1) + '-' + str(i), "bbox": boxes[i], "score": scores[i]})

        return result
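A brief sketch of consuming the returned structure (construction of the detector object and the input frame are assumed, not shown in this excerpt):

    detections = detector.inference(frame)     # `detector` and `frame` assumed to exist
    for det in detections:
        x1, y1, x2, y2 = det["bbox"]
        print(det["id"], det["score"], (x1, y1, x2, y2))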
Example #48
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # 1. Transfer input data to the GPU if need.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # 2. Run inference.
    context.execute_async(batch_size=batch_size,
                          bindings=bindings,
                          stream_handle=stream.handle)
    # 3. Transfer predictions back from the GPU if need.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # 4. Synchronize the stream
    stream.synchronize()
    # 5. Return only the host outputs or only the device outputs
    return [out.host for out in outputs]
Example #49
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    #cuda.memcpy_htod_async(d_input, h_input, stream)
    print("inputs", inputs[0])
    print("device", inputs[0].device)
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    #cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
Example #50
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    start = time.time()
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    print("engine inference time: %.4f" %(time.time()-start))
    return [out.host for out in outputs]
Example #51
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output,
                 stream, batch_size, height, width):
    """
   This is the function to run the inference
   Args:
      engine : The TensorRT engine.
      pics_1 : Input images to the model.
      h_input_1: Input in the host.
      d_input_1: Input in the device.
      h_output: Output in the host.
      d_output: Output in the device.
      stream: CUDA stream.
      batch_size : Batch size for execution time.
      height: Height of the output image.
      width: Width of the output image.
   
   Output:
      The list of output images.

   """

    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.

        context.profiler = trt.Profiler()
        context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)])

        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()
        # Return the host output.
        out = h_output.reshape((batch_size, 68, 64, 64))
        return out
Example #52
    def execute(self, solver, stream=None):
        slvr = solver

        if stream is not None:
            cuda.memcpy_htod_async(
                self.rime_const_data[0],
                slvr.const_data().ndary(),
                stream=stream)
        else:
            cuda.memcpy_htod(
                self.rime_const_data[0],
                slvr.const_data().ndary())

        # The gaussian shape array can be empty if
        # no gaussian sources were specified.
        gauss = np.intp(0) if np.product(slvr.gauss_shape.shape) == 0 \
            else slvr.gauss_shape

        sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
            else slvr.sersic_shape

        self.kernel(slvr.uvw, gauss, sersic,
            slvr.frequency, slvr.antenna1, slvr.antenna2,
            slvr.jones, slvr.flag, slvr.weight_vector,
            slvr.observed_vis, slvr.G_term,
            slvr.model_vis, slvr.chi_sqrd_result,
            stream=stream, **self.launch_params)

        # Call the pycuda reduction kernel.
        # Divide by the single sigma squared value if a weight vector
        # is not required. Otherwise the kernel will incorporate the
        # individual sigma squared values into the sum
        gpu_sum = gpuarray.sum(slvr.chi_sqrd_result).get()

        if not slvr.use_weight_vector():
            slvr.set_X2(gpu_sum/slvr.sigma_sqrd)
        else:
            slvr.set_X2(gpu_sum)
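The final reduction step in isolation (a standalone sketch with plain pycuda.gpuarray objects rather than the solver's arrays; the sigma value is illustrative):

    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    chi_sqrd_terms = gpuarray.to_gpu(np.random.rand(1000).astype(np.float32))
    gpu_sum = gpuarray.sum(chi_sqrd_terms).get()   # single-value reduction on the GPU
    sigma_sqrd = 2.5                               # illustrative value
    x2 = gpu_sum / sigma_sqrd                      # divide by sigma^2 when no weight vector is used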
Example #53
    def _copy(self, rdr, gnm):
        """
        Queue a copy of a host genome into a set of device interpolation source
        buffers.

        Note that for now, this is broken! It ignores ``gnm``, and only packs
        the genome that was used when creating the renderer.
        """
        times, knots = rdr.packer.pack(gnm, self.fb.pool)
        cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)

        palsrc = dict([(v[0], palette_decode(v[1:])) for v in gnm["palette"]])
        ptimes, pvals = zip(*sorted(palsrc.items()))
        palettes = self.fb.pool.allocate((len(palsrc), 256, 4), f32)
        palettes[:] = pvals
        palette_times = self.fb.pool.allocate((self.src_a.max_knots,), f32)
        palette_times.fill(1e9)
        palette_times[: len(ptimes)] = ptimes
        cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times, self.stream_a)
Пример #54
0
 def async_copy(self, dest, src, stream=None):
     drv.memcpy_htod_async(dest.gpudata, src, stream)
Пример #55
0
		if shape is not None:
			res = res.reshape(shape)
		return res

	def toDevice(self, buf, shape=None, async_=False, dest=None):
		# 'async' is a reserved word in Python 3, hence the trailing underscore.
		if shape is not None:
			buf = buf.reshape(shape)

		if dest is None:
			if async_:
				# FIXME: there must be a warning in docs that buf has to be page-locked
				return gpuarray.to_gpu_async(buf, stream=self.stream)
			else:
				return gpuarray.to_gpu(buf)
		else:
			cuda.memcpy_htod_async(dest.gpudata, buf,
				stream=self.stream if async_ else None)

	def copyBuffer(self, buf, dest=None, src_offset=0, dest_offset=0, length=None):

		elem_size = buf.dtype.itemsize
		size = buf.nbytes if length is None else elem_size * length
		src_offset *= elem_size
		dest_offset *= elem_size

		if dest is None:
			ddest = self.allocate(buf.shape, buf.dtype)
		else:
			ddest = dest

		cuda.memcpy_dtod_async(int(ddest.gpudata) + dest_offset,
			int(buf.gpudata) + src_offset,
			size, stream=self.stream)

		if dest is None:
			return ddest
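
As the FIXME above points out, to_gpu_async and memcpy_htod_async can only overlap with other work when the host buffer is page-locked; with an ordinary pageable numpy array the copy generally cannot overlap with host execution. A minimal sketch of allocating such a pinned buffer directly (names and sizes are only illustrative):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

stream = cuda.Stream()

# Page-locked (pinned) host memory for genuinely asynchronous uploads.
pinned = cuda.pagelocked_empty((1024, 1024), dtype=np.float32)
pinned[:] = np.random.rand(1024, 1024)

dev = gpuarray.empty(pinned.shape, pinned.dtype)
cuda.memcpy_htod_async(dev.gpudata, pinned, stream=stream)
# ... kernels launched on the same stream could be queued here ...
stream.synchronize()
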
Пример #56
0
                else:
                    drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
            else:
                # The arrays might be contiguous in the sense of
                # having no gaps, but the axes could be transposed
                # so that the order is neither Fortran nor C.
                # So, we attempt to get a contiguous view of dst.
                dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,))
                if async:
                    drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
                else:
                    drv.memcpy_dtoh(dst, src.gpudata)
        else:
            src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
            if async:
                drv.memcpy_htod_async(dst.gpudata, src, stream=stream)
            else:
                drv.memcpy_htod(dst.gpudata, src)
        return

    if len(shape) == 2:
        copy = drv.Memcpy2D()
    elif len(shape) == 3:
        copy = drv.Memcpy3D()
    else:
        raise ValueError("more than 2 discontiguous axes not supported %s" % (tuple(sorted(axes)),))

    if isinstance(src, GPUArray):
        copy.set_src_device(src.gpudata)
    else:
        copy.set_src_host(src)
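
For the discontiguous case, the code above builds a pycuda Memcpy2D (or Memcpy3D) descriptor. A minimal standalone sketch of a 2D strided host-to-device copy with that API; the shapes and pitches here are only illustrative:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

host = np.arange(16 * 8, dtype=np.float32).reshape(16, 8)
width_elems = 4                                 # copy only the first 4 columns of each row
dev = drv.mem_alloc(host.shape[0] * width_elems * host.itemsize)

copy = drv.Memcpy2D()
copy.set_src_host(host)
copy.set_dst_device(dev)
copy.src_pitch = host.strides[0]                # bytes between consecutive source rows
copy.dst_pitch = width_elems * host.itemsize    # destination rows are tightly packed
copy.width_in_bytes = width_elems * host.itemsize
copy.height = host.shape[0]
copy(aligned=False)                             # synchronous; copy(stream) would enqueue it asynchronously
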
Пример #57
0
    def set_qsmiles(self,qsmilesmat,qcountsmat,querylengths,querymags=None): #{{{
        """Sets the reference SMILES set to use Lingo matrix *qsmilesmat*, count matrix *qcountsmat*,
        and length vector *querylengths*. If *querymags* is provided, it will be used as the magnitude
        vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

        Because of hardware limitations, the query matrices (*qsmilesmat* and *qcountsmat*) must have
        no more than 65,536 rows (molecules) and 32,768 columns (Lingos). Larger computations must be performed in tiles.
        """
        # Set up lingo and count matrices on device #{{{

        if self.usePycudaArray:
            # Create CUDAarrays for lingo and count matrices
            print "Strides qsmilesmat:",numpy.ascontiguousarray(qsmilesmat.T).strides
            self.gpu.qsmiles = cuda.matrix_to_array(numpy.ascontiguousarray(qsmilesmat.T),order='C')
            self.gpu.qcounts= cuda.matrix_to_array(numpy.ascontiguousarray(qcountsmat.T),order='C')
            print "qsmiles descriptor",dtos(self.gpu.qsmiles.get_descriptor())
            print "qcounts descriptor",dtos(self.gpu.qcounts.get_descriptor())
            self.gpu.tex2lq.set_array(self.gpu.qsmiles)
            self.gpu.tex2cq.set_array(self.gpu.qcounts)
        else:
            # Manually handle texture setup
            # padded_array will handle making matrix contiguous
            tempqlmat = self._padded_array(qsmilesmat.T)
            if tempqlmat.shape[1] > 65536 or tempqlmat.shape[0] > 32768:
                raise ValueError("Error: query matrix is not allowed to have more than 65536 rows (molecules) or 32768 columns (LINGOs) (both padded to multiple of 16). Dimensions = (%d,%d)"%tempqlmat.shape)
            if self.gpu.qsmiles is None or self.gpu.qsmiles.nbytes < tempqlmat.nbytes:
                self.gpu.qsmiles = cuda.mem_alloc(tempqlmat.nbytes)
                self.gpu.qsmiles.nbytes = tempqlmat.nbytes
            cuda.memcpy_htod_async(self.gpu.qsmiles,tempqlmat,stream=self.gpu.stream)

            tempqcmat = self._padded_array(qcountsmat.T)
            if self.gpu.qcounts is None or self.gpu.qcounts.nbytes < tempqcmat.nbytes:
                self.gpu.qcounts = cuda.mem_alloc(tempqcmat.nbytes)
                self.gpu.qcounts.nbytes = tempqcmat.nbytes
            cuda.memcpy_htod_async(self.gpu.qcounts,tempqcmat,stream=self.gpu.stream)
            
            descriptor = cuda.ArrayDescriptor()
            descriptor.width  = tempqcmat.shape[1]
            descriptor.height = tempqcmat.shape[0]
            descriptor.format = cuda.array_format.UNSIGNED_INT32
            descriptor.num_channels = 1
            self.gpu.tex2lq.set_address_2d(self.gpu.qsmiles,descriptor,tempqlmat.strides[0])
            self.gpu.tex2cq.set_address_2d(self.gpu.qcounts,descriptor,tempqcmat.strides[0])
            #print "Set up query textures with stride=",tempqmat.strides[0]
            self.gpu.stream.synchronize()
            del tempqlmat
            del tempqcmat
        #}}}

        self.qshape = qsmilesmat.shape
        self.nquery = qsmilesmat.shape[0]
        #print "Query shape=",self.qshape,", nquery=",self.nquery

        # Transfer query lengths array to GPU
        self.gpu.ql_gpu = cuda.to_device(querylengths)

        # Allocate buffers for query set magnitudes
        self.gpu.qmag_gpu = cuda.mem_alloc(querylengths.nbytes)
        if querymags is not None:
            cuda.memcpy_htod(self.gpu.qmag_gpu,querymags)
        else:
            # Calculate query set magnitudes on GPU
            magthreads = 256
            self.gpu.qMagKernel(self.gpu.qmag_gpu,self.gpu.ql_gpu,numpy.int32(self.nquery),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cq])
            #self.qmag_gpu = cuda.to_device(qcountsmat.sum(1).astype(numpy.int32))


        return
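
The docstring above notes that query sets larger than the 65,536-molecule limit have to be processed in tiles. A minimal sketch of such a driver loop, assuming a hypothetical compute_similarities() method that runs the comparison over the currently loaded reference and query sets (that method name is not part of the original class):

import numpy as np

MAX_QUERY_ROWS = 65536  # per-tile limit on query molecules (see the docstring above)

def run_tiled(sim, qsmilesmat, qcountsmat, querylengths):
    results = []
    for start in range(0, qsmilesmat.shape[0], MAX_QUERY_ROWS):
        stop = start + MAX_QUERY_ROWS
        # Load one tile of the query set, then run the comparison on it.
        sim.set_qsmiles(qsmilesmat[start:stop],
                        qcountsmat[start:stop],
                        querylengths[start:stop])
        results.append(sim.compute_similarities())  # hypothetical method
    return np.concatenate(results)
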
Пример #58
0
 def run(self, queue):
     cuda.memcpy_htod_async(mv.data, mv.hdata,
                            queue.cuda_stream_comp)
Пример #59
0
    def step(self, t_end=0.0, apply_stochastic_term=True, write_now=True):
        """
        Function which steps n timesteps.
        apply_stochastic_term: Boolean value for whether the stochastic
            perturbation (if any) should be applied.
        """
        n = int(t_end / self.dt + 1)

        if self.t == 0:
            self.bc_kernel.boundaryCondition(self.gpu_stream, \
                                             self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0)
        
        for i in range(0, n):
            # Get a new random wind direction (emulating large-scale model error)
            if (self.max_wind_direction_perturbation > 0.0 and self.wind_stress.type() == 1):
                # Max perturbation of +/- max_wind_direction_perturbation deg around the original wind direction (at t=0)
                perturbation = 2.0*(np.random.rand()-0.5) * self.max_wind_direction_perturbation
                new_wind_stress = WindStress.GenericUniformWindStress( \
                    rho_air=self.wind_stress.rho_air, \
                    wind_speed=self.wind_stress.wind_speed, \
                    wind_direction=self.wind_stress.wind_direction + perturbation)
                # Upload new wind stress params to device
                cuda.memcpy_htod_async(int(self.wind_stress_dev), new_wind_stress.tostruct(), stream=self.gpu_stream)
                
            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
            
            if (local_dt <= 0.0):
                break
            
            wind_stress_t = np.float32(self.update_wind_stress(self.kernel, self.cdklm_swe_2D))

            #self.bc_kernel.boundaryCondition(self.cl_queue, \
            #            self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1)
            
            # 2nd order Runge Kutta
            if (self.rk_order == 2):

                self.callKernel(self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0, \
                                self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1, \
                                local_dt, wind_stress_t, 0)

                self.bc_kernel.boundaryCondition(self.gpu_stream, \
                        self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1)

                self.callKernel(self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1, \
                                self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0, \
                                local_dt, wind_stress_t, 1)

                self.bc_kernel.boundaryCondition(self.gpu_stream, \
                        self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0)
                
            elif (self.rk_order == 1):
                self.callKernel(self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0, \
                                self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1, \
                                local_dt, wind_stress_t, 0)
                                
                self.gpu_data.swap()

                self.bc_kernel.boundaryCondition(self.gpu_stream, \
                        self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0)

            # 3rd order RK method:
            elif (self.rk_order == 3):

                self.callKernel(self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0, \
                                self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1, \
                                local_dt, wind_stress_t, 0)
                
                self.bc_kernel.boundaryCondition(self.gpu_stream, \
                        self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1)

                self.callKernel(self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1, \
                                self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0, \
                                local_dt, wind_stress_t, 1)

                self.bc_kernel.boundaryCondition(self.gpu_stream, \
                        self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1)

                self.callKernel(self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1, \
                                self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0, \
                                local_dt, wind_stress_t, 2)
                
                self.bc_kernel.boundaryCondition(self.gpu_stream, \
                        self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0)
            
            # Perturb ocean state with model error
            if self.small_scale_perturbation and apply_stochastic_term:
                self.small_scale_model_error.perturbSim(self)
            
            # Evolve drifters
            if self.hasDrifters:
                self.drifters.drift(self.gpu_data.h0, self.gpu_data.hu0, \
                                    self.gpu_data.hv0, \
                                    np.float32(self.constant_equilibrium_depth), \
                                    self.nx, self.ny, self.dx, self.dy, \
                                    local_dt, \
                                    np.int32(2), np.int32(2))
            self.t += np.float64(local_dt)
            self.num_iterations += 1
            
        if self.write_netcdf and write_now:
            self.sim_writer.writeTimestep(self)
            
        return self.t
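
The wind-stress update above packs the new parameters into a struct and pushes them to a raw device pointer with memcpy_htod_async. A minimal standalone sketch of that pattern, assuming a made-up struct layout of two float32 fields (the real WindStress layout is not shown here):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

stream = cuda.Stream()

# Assumed struct layout: two packed float32 fields.
wind_struct = np.zeros(1, dtype=np.dtype([('wind_speed', np.float32),
                                          ('wind_direction', np.float32)]))
wind_struct['wind_speed'] = 12.5
wind_struct['wind_direction'] = 42.0

# Upload the packed bytes to a device allocation, addressed via its integer
# device pointer. (For a truly asynchronous copy the host array would need
# to be page-locked.)
wind_stress_dev = cuda.mem_alloc(wind_struct.nbytes)
cuda.memcpy_htod_async(int(wind_stress_dev), wind_struct, stream=stream)
stream.synchronize()
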
Пример #60
0
 def to_buf_async(self, cl_buf, stream=None):
     cuda.memcpy_htod_async(cl_buf, self.buffers[cl_buf], stream)