Example #1
File: GPU_unit.py Project: Anukura/Vivaldi
def data_finder(u, ss, sp, gpu_direct=True):
	data_package = data_list[u][ss][sp]
	dp = data_package.copy()

	memory_type = dp.memory_type
	if memory_type == 'devptr':
		if gpu_direct:
			devptr = data_list[u][ss][sp].devptr
			return devptr, dp
		else:
			devptr = data_list[u][ss][sp].devptr
			shape = dp.data_memory_shape
			bcmd = dp.data_contents_memory_dtype
			if log_type in ['time','all']: st = time.time()

			buf = numpy.empty((shape), dtype=bcmd)
			cuda.memcpy_dtoh_async(buf, devptr, stream=stream[1])
#			buf = cuda.from_device(devptr, shape, bcmd)
			if log_type in ['time','all']:
				u = dp.unique_id
				bytes = dp.data_bytes
				t = time.time()-st
				ms = 1000*t
				bw = bytes/GIGA/t
				log("rank%d, \"%s\", u=%d, GPU%d data transfer from GPU memory to CPU memory, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, device_number, bytes/MEGA, ms, bw),'time', log_type)
			
			dp.memory_type = 'memory'
			dp.data_dtype = type(buf)
			return buf, dp
	else:
		data = data_list[u][ss][sp].data
		return data, dp
	return None, None
Example #2
    def synchronize_start(self):
        """ Start the synchronization process. """

        # Use shorter, easier names for class variables.
        bufs = self._sync_buffers
        ptrs = self._sync_ptrs
        streams = self._sync_streams
        adj = self._sync_adj

        # Start the transfer operations needed.
        self._sync_tags = [mpi_tag() for k in range(2)] # MPI message tags.

        # Forward send.
        drv.memcpy_dtoh_async(bufs[0], ptrs['forw_src'], stream=streams[0]) 

        # Backward send.
        drv.memcpy_dtoh_async(bufs[1], ptrs['back_src'], stream=streams[1]) 

        # Forward receive.
        self._sync_req_forw = comm.Irecv(bufs[2], source=adj['back'], \
                                            tag=self._sync_tags[0])

        # Backward receive.
        self._sync_req_back = comm.Irecv(bufs[3], source=adj['forw'], \
                                            tag=self._sync_tags[1])

        # Signalling variables needed to complete transfers.
        self._sync_part2_start = [False, False, False, False]
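The method above only starts the device-to-host copies and posts the receives; completing the exchange still requires waiting on the streams and issuing the matching sends. A minimal sketch of that follow-up step, assuming the same buffers, streams, adjacency map and mpi4py comm as above; the method name and the exact send/tag pairing are assumptions, not code from the original class:

    def synchronize_finish_sketch(self):
        """ Hypothetical follow-up: issue the MPI sends once the async copies finish. """

        bufs = self._sync_buffers
        streams = self._sync_streams
        adj = self._sync_adj

        # Wait for the device-to-host copies started in synchronize_start().
        streams[0].synchronize()
        streams[1].synchronize()

        # Forward and backward sends, matching the Irecv tags posted above.
        comm.Send(bufs[0], dest=adj['forw'], tag=self._sync_tags[0])
        comm.Send(bufs[1], dest=adj['back'], tag=self._sync_tags[1])

        # Wait for the corresponding receives before using bufs[2] and bufs[3].
        self._sync_req_forw.Wait()
        self._sync_req_back.Wait()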
Example #3
File: GPU_unit.py Project: Anukura/Vivaldi
def send(data, data_package, dest=None, gpu_direct=True):
	global s_requests
	tag = 52
	dp = data_package
	# send data_package
	send_data_package(dp, dest=dest, tag=tag)

	bytes = dp.data_bytes
	memory_type = dp.memory_type
	
	if log_type in ['time','all']: st = time.time()

	flag = False
	request = None
	if memory_type == 'devptr': # data in the GPU
		if gpu_direct: # want to use GPU direct
			devptr = data
			buf = MPI.make_buffer(devptr.__int__(), bytes)
			ctx.synchronize()
			request = comm.Isend([buf, MPI.BYTE], dest=dest, tag=57)
			if VIVALDI_BLOCKING: MPI.Request.Wait(request)
			s_requests.append((request, buf, devptr))
			flag = True
		else:# not want to use GPU direct
		
			# copy to CPU
			shape = dp.data_memory_shape
			dtype = dp.data_contents_memory_dtype
			buf = numpy.empty(shape, dtype=dtype)
			cuda.memcpy_dtoh_async(buf, data, stream=stream_list[1])

			request = comm.Isend(buf, dest=dest, tag=57)
			if VIVALDI_BLOCKING: MPI.Request.Wait(request)
			s_requests.append((request, buf, None))
			
	else: # data in the CPU
		# want to use GPU direct, not exist case
		# not want to use GPU direct
		if dp.data_dtype == numpy.ndarray: 
			request = comm.Isend(data, dest=dest, tag=57)
			if VIVALDI_BLOCKING: MPI.Request.Wait(request)
			s_requests.append((request, data, None))
			
	if log_type in ['time','all']:
		u = dp.unique_id
		bytes = dp.data_bytes
		t = time.time()-st
		ms = 1000*t
		bw = bytes/GIGA/t
	
		if flag:
			log("rank%d, \"%s\", u=%d, from rank%d to rank%d GPU direct send, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, rank, dest, bytes/MEGA, ms, bw),'time', log_type)
		else:
			log("rank%d, \"%s\", u=%d, from rank%d to rank%d MPI data transfer, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, rank, dest, bytes/MEGA, ms, bw),'time', log_type)
	
	return request
Example #4
 def copy(self, fb, dim, pool, stream=None):
     fmt = 'u1'
     if self.pix_fmt in ('yuv444p10', 'yuv420p10', 'yuv444p12'):
         fmt = 'u2'
     dims = (3, dim.h, dim.w)
     if self.pix_fmt == 'yuv420p10':
         dims = (dim.h * dim.w * 6 // 4,)
     h_out = pool.allocate(dims, fmt)
     cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
     return h_out
Example #5
File: packing.py Project: Aerojspark/PyFR
            def run(self, scomp, scopy):
                # Pack
                kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
                                         v.nvcol, v.basedata, v.mapping,
                                         v.cstrides or 0, v.rstrides or 0, m)

                # Copy the packed buffer to the host
                event.record(scomp)
                scopy.wait_for_event(event)
                cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
Example #6
 def get(self, stream=None):
     """
     copy device array to host.
     Returns:
         the host numpy array
     """
     assert self.is_contiguous, "Array in get() must be contiguous"
     ary = np.empty(self.shape, self.dtype)
     drv.memcpy_dtoh_async(ary, self.gpudata, stream)
     return ary
Example #7
File: gpuarray.py Project: minrk/PyCUDA
    def get_async(self, stream=None, ary=None):
        if ary is None:
            ary = drv.pagelocked_empty(self.shape, self.dtype)
        else:
            assert ary.size == self.size
            assert ary.dtype == self.dtype

        if self.size:
            drv.memcpy_dtoh_async(ary, self.gpudata, stream)
        return ary
Example #8
def cpy_back(a, a_gpu, auto_init_context=True):
    """Data transfer from device to host.
    
    An asynchronous transfer is used when auto_init_context is True;
    otherwise a normal (blocking) transfer is used.
    """

    import pycuda.driver as drv

    if auto_init_context:
        strm = drv.Stream()
        drv.memcpy_dtoh_async(a, a_gpu, strm)
        return strm
    else:
        drv.memcpy_dtoh(a, a_gpu)
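A minimal usage sketch for cpy_back, assuming a CUDA context already exists (e.g. via pycuda.autoinit); the buffer names and sizes here are illustrative, not taken from the original project:

import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a context for this sketch)
import pycuda.driver as drv

a_gpu = drv.mem_alloc(1024 * 4)                    # device buffer for 1024 float32 values
a = drv.pagelocked_empty(1024, dtype=np.float32)   # page-locked host buffer for the async copy

strm = cpy_back(a, a_gpu, auto_init_context=True)  # returns the stream used for the copy
if strm is not None:
    strm.synchronize()                             # wait before reading `a` on the host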
Example #9
    def get_async(self, stream=None, ary=None):
        if ary is None:
            ary = drv.pagelocked_empty(self.shape, self.dtype)

            ary = _as_strided(ary, strides=self.strides)
        else:
            assert ary.size == self.size
            assert ary.dtype == self.dtype
            assert ary.flags.forc

        assert self.flags.forc, "Array in get() must be contiguous"

        if self.size:
            drv.memcpy_dtoh_async(ary, self.gpudata, stream)
        return ary
Example #10
 def get_host_result(self):
     if not self.gpu_finished:
         if self.gpu_finished_evt.query():
             self.gpu_finished = True
             self.copy_stream = get_stream()
             self.host_dest = self.pagelocked_allocator(
                     self.gpu_result.shape, self.gpu_result.dtype,
                     self.copy_stream)
             drv.memcpy_dtoh_async(self.host_dest,
                     self.gpu_result.gpudata,
                     self.copy_stream)
             self.copy_finished_evt = drv.Event()
             self.copy_finished_evt.record()
     else:
         if self.copy_finished_evt.query():
             STREAM_POOL.append(self.copy_stream)
             return self.host_dest
Example #11
File: parray.py Project: bionet/vtem
 def get_async(self, stream = None, ary = None):
     if ary is None:
         ary = cuda.pagelocked_empty(self.shape, self.dtype)
         
     else:
         assert ary.size == self.size
         assert ary.dtype == self.dtype
         if ary.base.__class__ != cuda.HostAllocation:
             raise TypeError("asynchronous memory transfer requires a pagelocked numpy array")
     
     if self.size:
         if self.M == 1:
             cuda.memcpy_dtoh_async(ary, self.gpudata, stream)
         else:
             PitchTrans(self.shape, ary, _pd(self.shape), self.gpudata, self.ld, self.dtype, async = True, stream = stream)
             
     return ary
Example #12
File: packing.py Project: bartwozniak/PyFR
            def run(self, scomp, scopy):
                # If we are unpacking then copy the host buffer to the GPU
                if op == 'unpack':
                    cuda.memcpy_htod_async(m.data, m.hdata, scopy)
                    event.record(scopy)
                    scomp.wait_for_event(event)

                # Call the CUDA kernel (pack or unpack)
                fn.prepared_async_call(grid, block, scomp, v.nrow, v.ncol,
                                       v.mapping, v.strides, m,
                                       v.mapping.leaddim, v.strides.leaddim,
                                       m.leaddim)

                # If we have been packing then copy the GPU buffer to the host
                if op == 'pack':
                    event.record(scomp)
                    scopy.wait_for_event(event)
                    cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
Example #13
 def execute():
     # Allocate device memory for inputs and outputs.
     d_input = cuda.mem_alloc(h_input.nbytes)
     d_output = cuda.mem_alloc(h_output.nbytes)
     # Create a stream in which to copy inputs/outputs and run inference.
     stream = cuda.Stream()
     with engine.create_execution_context() as context:
         # Transfer input data to the GPU.
         cuda.memcpy_htod_async(d_input, h_input, stream)
         # Run inference.
         context.execute_async(bindings=[int(d_input),
                                         int(d_output)],
                               stream_handle=stream.handle)
         # Transfer predictions back from the GPU.
         cuda.memcpy_dtoh_async(h_output, d_output, stream)
         # Synchronize the stream
         stream.synchronize()
         # Return the host output.
     return h_output
Example #14
    def _compute(self, data: np.ndarray,
                 context: trt.tensorrt.IExecutionContext) -> np.ndarray:
        assert (data.dtype == self.array_in_dtype)
        assert (data.shape == self.array_in_shape)

        self.h_input = data

        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(self.d_input, self.h_input, self.stream)
        # Run inference.
        context.execute_async(bindings=[int(self.d_input),
                                        int(self.d_output)],
                              stream_handle=self.stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(self.h_output, self.d_output, self.stream)
        # Synchronize the stream
        self.stream.synchronize()
        # Return the host output.
        return self.h_output.reshape(self.array_out_shape)
Example #15
def inference(engine_file_path):
    engine, context = load(engine_file_path)

    # get sizes of input and output and allocate memory required for input data and for output data
    device_input = None
    device_output, host_output = None, None
    for binding in engine:
        if engine.binding_is_input(binding):  # we expect only one input
            input_shape = engine.get_binding_shape(binding)
            input_size = trt.volume(
                input_shape) * engine.max_batch_size * np.dtype(
                    np.float32).itemsize  # in bytes
            device_input = cuda.mem_alloc(input_size)
        else:  # and one output
            output_shape = engine.get_binding_shape(binding)
            # create page-locked memory buffers (i.e. won't be swapped to disk)
            host_output = cuda.pagelocked_empty(trt.volume(output_shape) *
                                                engine.max_batch_size,
                                                dtype=np.float32)
            device_output = cuda.mem_alloc(host_output.nbytes)
    assert device_input
    assert device_output

    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()

    # preprocess input data
    host_input = np.array(np.random.rand(1, 3, 50, 200),
                          dtype=np.float32,
                          order='C')
    cuda.memcpy_htod_async(device_input, host_input, stream)

    # run inference
    context.execute_async(bindings=[int(device_input),
                                    int(device_output)],
                          stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    stream.synchronize()

    # postprocess results
    output_data = torch.Tensor(host_output).reshape(engine.max_batch_size,
                                                    output_shape[0])
    print(output_data)
Example #16
def predict():

    time1 = time.time()
    cuda.init()
    device = cuda.Device(0)
    ctx = device.make_context()
    time2 = time.time()
    print("time to get context : ", time2 - time1)

    with builder.build_cuda_engine(network) as engine:
        output = np.empty(10 * BATCH_SIZE, dtype=np.float32)

        d_input = cuda.mem_alloc(1 * img.nbytes)
        d_output = cuda.mem_alloc(1 * output.nbytes)
        bindings = [int(d_input), int(d_output)]

        stream = cuda.Stream()

        with engine.create_execution_context() as context:
            cuda.memcpy_htod_async(d_input, img, stream)

            context.execute_async(bindings=bindings,
                                  stream_handle=stream.handle,
                                  batch_size=BATCH_SIZE)

            cuda.memcpy_dtoh_async(output, d_output, stream)

            stream.synchronize()

            # print("true label : ", label)

            result = []
            accuracy = np.zeros((1, BATCH_SIZE), np.uint8)
            for ii in range(BATCH_SIZE):
                result.append(np.argmax(output[ii * 10:(ii + 1) * 10]))
                if result[ii] == label[ii]:
                    accuracy[0, ii] = 1
                # print(output[ii*10:(ii+1)*10])
            # print(result)
            print("accuracy : ", np.sum(accuracy) / BATCH_SIZE)

    ctx.pop()
    return "Done\n"  #str(output)
Example #17
 def inference(self, img):
     # copy img to input memory
     # self.inputs[0]['host'] = np.ascontiguousarray(img)
     self.inputs[0]['host'] = np.ravel(img)
     # transfer data to the gpu
     for inp in self.inputs:
         cuda.memcpy_htod_async(inp['device'], inp['host'], self.stream)
     # run inference
     start = time.time()
     self.context.execute_async_v2(bindings=self.bindings,
                                   stream_handle=self.stream.handle)
     end = time.time()
     print('execution time:', end - start)
     # fetch outputs from gpu
     for out in self.outputs:
         cuda.memcpy_dtoh_async(out['host'], out['device'], self.stream)
     # synchronize stream
     self.stream.synchronize()
     return [out['host'] for out in self.outputs]
Example #18
    def inference(self, img):

        self.watch.start()
        ih, iw = img.shape[:-1]
        if (iw, ih) != self.input_shape:
            img = cv2.resize(img, self.input_shape)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.transpose((2, 0, 1)).astype(np.float32)
        img *= (2.0 / 255.0)
        img -= 1.0
        self.watch.stop(Stopwatch.MODE_PREPROCESS)

        self.watch.start()
        np.copyto(self.host_inputs[0], img.ravel())
        cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0],
                               self.stream)
        self.context.execute_async(batch_size=1,
                                   bindings=self.bindings,
                                   stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(self.host_outputs[1], self.cuda_outputs[1],
                               self.stream)
        cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0],
                               self.stream)
        self.stream.synchronize()
        self.watch.stop(Stopwatch.MODE_INFER)

        self.watch.start()
        output = self.host_outputs[0]
        results = []
        for prefix in range(0, len(output), 7):

            conf = float(output[prefix + 2])
            if conf < 0.5:
                continue
            x1 = output[prefix + 3] * iw
            y1 = output[prefix + 4] * ih
            x2 = (output[prefix + 5] - output[prefix + 3]) * iw
            y2 = (output[prefix + 6] - output[prefix + 4]) * ih
            cls = int(output[prefix + 1])
            results.append(((x1, y1, x2, y2), cls, conf))
        self.watch.stop(Stopwatch.MODE_POSTPROCESS)

        return results
Example #19
 def inference(self, inputs):
     np.copyto(self.host_inputs[0], inputs[0].ravel())
     np.copyto(self.host_inputs[1], inputs[1].ravel())
     if self.cuda_ctx:
         self.cuda_ctx.push()
     cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0],
                            self.stream)
     cuda.memcpy_htod_async(self.cuda_inputs[1], self.host_inputs[1],
                            self.stream)
     self.context.execute_async(batch_size=1,
                                bindings=self.bindings,
                                stream_handle=self.stream.handle)
     cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0],
                            self.stream)
     self.stream.synchronize()
     if self.cuda_ctx:
         self.cuda_ctx.pop()
     output = self.host_outputs[0]
     return output
Example #20
 def may_download(self):
     ctx = self.cuda_context
     if self.pixels is not None or not ctx or self.freed:
         return
     assert self.cuda_device_buffer, "bug: no device buffer"
     start = monotonic_time()
     ctx.push()
     host_buffer = driver.pagelocked_empty(self.buffer_size,
                                           dtype=numpy.byte)
     driver.memcpy_dtoh_async(host_buffer, self.cuda_device_buffer,
                              self.stream)
     self.wait_for_stream()
     self.pixels = host_buffer.tobytes()
     elapsed = monotonic_time() - start
     log("may_download() from %#x to %s, size=%s, elapsed=%ims - %iMB/s",
         int(self.cuda_device_buffer), host_buffer, self.buffer_size,
         int(1000 * elapsed), self.buffer_size / elapsed / 1024 / 1024)
     self.free_cuda()
     ctx.pop()
Example #21
    def infer(self, input_img, output_size, num_binding):
        #self.runtime=self.create_runtime()
        #self.context=self.create_context()

        assert (self.__engine.get_nb_bindings() == num_binding)
        output = np.empty(output_size, dtype=np.float32)

        d_input = cuda.mem_alloc(self.batchsize * input_img.size *
                                 input_img.dtype.itemsize)
        d_output = cuda.mem_alloc(self.batchsize * output.size *
                                  output.dtype.itemsize)

        # pointers to gpu memory
        bindings = [int(d_input), int(d_output)]

        stream = cuda.Stream()

        #transfer input data to device
        cuda.memcpy_htod_async(d_input, input_img, stream)

        #execute model
        self.context.enqueue(self.batchsize, bindings, stream.handle, None)

        #transfer predictions back
        cuda.memcpy_dtoh_async(output, d_output, stream)

        #synchronize threads
        stream.synchronize()

        print 'all activities in the stream are done: {}'.format(
            stream.is_done())
        #destroy cuda context
        d_input.free()
        d_output.free()

        print 1999 - cuda.mem_get_info()[0] / 1048576, cuda.mem_get_info(
        )[1] / 1048576

        #self.context.destroy

        #self.runtime.destroy()

        return output
Example #22
    def detect(self, img, conf_th=0.3, conf_class=[]):
        """Detect objects in the input image."""
        img_resized = _preprocess_trt(img, self.input_shape)
        np.copyto(self.host_inputs[0], img_resized.ravel())

        cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0],
                               self.stream)
        self.context.execute_async(batch_size=1,
                                   bindings=self.bindings,
                                   stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(self.host_outputs[1], self.cuda_outputs[1],
                               self.stream)
        cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0],
                               self.stream)
        self.stream.synchronize()

        output = self.host_outputs[0]
        return _postprocess_trt(img, output, conf_th, self.output_layout,
                                conf_class)
Example #23
    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        import numpy
        shape = (32, 8)
        a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        a[:] = numpy.random.randn(*shape)
        b[:] = numpy.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(drv.Out(dest),
                      a_gpu,
                      b_gpu,
                      block=shape + (1, ),
                      stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest - a * b) == 0
Example #24
def do_inference(engine, tensor, h_input_1, d_input_1, h_output, d_output,
                 stream, batch_size1):
    """
   This is the function to run the inference
   Args:
      engine : Path to the TensorRT engine. 
      tensor : Input VTK file to the model.  
      h_input_1: Input in the host. 
      d_input_1: Input in the device. 
      h_output: Output in the host.
      d_output: Output in the device.
      stream: CUDA stream.
      batch_size1 : Batch size used to reshape the output.

   Output:
      The list of output images.
   """
    print("[INFO] load file to buffer...")
    load_file_to_buffer(tensor, h_input_1)

    with engine.create_execution_context() as context:
        # Transfer input data to the GPU.
        start = time.time()
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)
        # Run inference.
        #context.profiler = trt.Profiler()
        context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)])
        #context.execute_async(batch_size=1, bindings=[int(d_input_1), int(d_output)], stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()
        # Return the host output.
        end = time.time()
        out = h_output.reshape((batch_size1, 18, 18, 8, 3))
        print("inference time by TensorRT for one batch with size of {}: {}".
              format(batch_size1, end - start))

        return out
Example #25
    def _test_model(self,
                    model_name,
                    input_shape=(3, 224, 224),
                    normalization_hint=0):

        model = getattr(models, model_name)(pretrained=True)

        shape = (1, ) + input_shape
        dummy_input = (torch.randn(shape), )
        onnx_name = model_name + ".onnx"

        torch.onnx.export(model,
                          dummy_input,
                          onnx_name,
                          input_names=[],
                          output_names=[],
                          verbose=False,
                          export_params=True,
                          opset_version=9)

        with self.build_engine_onnx(onnx_name) as engine:
            h_input, d_input, h_output, d_output, stream = allocate_buffers(
                engine)
            with engine.create_execution_context() as context:
                err_count = 0
                for index, f in enumerate(self.image_files):
                    test_case = load_normalized_test_case(input_shape, f,\
                        h_input, normalization_hint)
                    cuda.memcpy_htod_async(d_input, h_input, stream)

                    context.execute_async_v2(bindings=[d_input, d_output],
                                             stream_handle=stream.handle)
                    cuda.memcpy_dtoh_async(h_output, d_output, stream)
                    stream.synchronize()

                    amax = np.argmax(h_output)
                    pred = self.labels[amax]
                    if "_".join(pred.split()) not in\
                            os.path.splitext(os.path.basename(test_case))[0]:
                        err_count = err_count + 1
                self.assertLessEqual(err_count, 1,
                                     "Too many recognition errors")
Example #26
    def inference(self, img):
        """
        Detect objects in the input image.

        Args:
            img: uint8 numpy array with shape (img_height, img_width, channels)

        Returns:
            result: a list of dictionaries of the form [{"id": 0, "bbox": [x1, y1, x2, y2], "score": s%}, ...]
        """
        img_resized = self._preprocess_trt(img)
        # transfer the data to the GPU, run inference and the copy the results back
        np.copyto(self.host_inputs[0], img_resized.ravel())

        # Start inference time
        t_begin = time.perf_counter()
        cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0],
                               self.stream)
        self.context.execute_async(batch_size=1,
                                   bindings=self.bindings,
                                   stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(self.host_outputs[1], self.cuda_outputs[1],
                               self.stream)
        cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0],
                               self.stream)
        self.stream.synchronize()
        inference_time = time.perf_counter() - t_begin  # Seconds

        # Calculate Frames rate (fps)
        self.fps = convert_infr_time_to_fps(inference_time)
        output = self.host_outputs[0]
        boxes, scores, classes = self._postprocess_trt(img, output)
        result = []
        for i in range(len(boxes)):  # number of boxes
            if classes[i] == self.class_id + 1:
                result.append({
                    "id": str(classes[i] - 1) + '-' + str(i),
                    "bbox": boxes[i],
                    "score": scores[i]
                })

        return result
Example #27
def run(batchSize, nRow, nCol):
    print("test", batchSize, nRow, nCol)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)

    engine = buildEngine(logger, nRow, nCol)
    if engine == None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    stream = cuda.Stream()

    condition = np.array(np.random.randint(0, 2, [batchSize, nRow, nCol]),
                         dtype=np.int32)
    inputX = np.full([batchSize, nRow, nCol], 1, dtype=np.float32)
    inputY = np.full([batchSize, nRow, nCol], -1, dtype=np.float32)
    inputH0 = np.ascontiguousarray(condition.reshape(-1))
    inputH1 = np.ascontiguousarray(inputX.reshape(-1))
    inputH2 = np.ascontiguousarray(inputY.reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    inputD2 = cuda.mem_alloc(inputH2.nbytes)
    outputH0 = np.empty((batchSize, ) + tuple(engine.get_binding_shape(3)),
                        dtype=trt.nptype(engine.get_binding_dtype(3)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    cuda.memcpy_htod_async(inputD2, inputH2, stream)
    context.execute_async(
        batchSize,
        [int(inputD0), int(inputD1),
         int(inputD2), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()

    outputH0CPU = whereCPU(condition, inputX, inputY)
    print("Check result:",
          ["True" if np.all(outputH0 == outputH0CPU) else "False"][0])
Example #28
    def detect(self, img, conf_thr=0.4, nms_thr=0.4):
        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        input_image = _preprocess_yolo(img, self.input_shape)
        # Copy input image to host buffer
        np.copyto(host_inputs[0], input_image.ravel())
        # Transfer input data  to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()

        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        raw_h, raw_w = img.shape[:2]
        print("raw shape:", img.shape)
        w_scale = raw_w / self.input_shape[1]
        h_scale = raw_h / self.input_shape[0]
        boxes, scores, classes = _postprocess_yolo(output, w_scale, h_scale,
                                                   conf_thr, nms_thr)

        boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, img.shape[1] - 1)
        boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, img.shape[0] - 1)

        for i in range(len(boxes)):
            box = boxes[i]
            plot_one_box(box,
                         img,
                         label="{}:{:.2f}".format(classes[i], scores[i]))
        cv2.imwrite('test.jpg', img)
        return boxes, scores, classes
Example #29
def predict_image_dn_trt(noised, clean, out_shape, context):
    x = torch.from_numpy(np.float32(noised / 255)).permute(2, 0, 1).unsqueeze(0)
    image = np.ascontiguousarray(x)
    d_input = cuda.mem_alloc(1 * image.size * image.dtype.itemsize)
    y = np.empty(out_shape, dtype=np.float32)
    d_output = cuda.mem_alloc(1 * y.size * y.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    cuda.memcpy_htod_async(d_input, image, stream)
    context.execute_async(int(1), bindings, stream.handle, None)
    cuda.memcpy_dtoh_async(y, d_output, stream)
    stream.synchronize()
    y = torch.from_numpy(y)
    if clean is not None:
        xx = torch.from_numpy(np.float32(clean / 255)).permute(2, 0, 1).unsqueeze(0)
        psnr_ = np.round(batch_PSNR(y, xx, data_range=1.0), 3)
    else:
        psnr_ = 0
    img = np.array(y.cpu().squeeze(0).permute(1, 2, 0) * 255).astype('uint8')
    return psnr_, img
Example #30
def infer(context, h_input, data_type=np.float32):
    # dynamic shape, batch size
    ## context.active_optimization_profile = 0
    ## context.set_binding_shape(0, (h_input.shape))
    # print(context.get_binding_shape(0), context.get_binding_shape(1))
    size = trt.volume(h_input.shape) * np.dtype(data_type).itemsize
    d_input = cuda.mem_alloc(size)
    h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)),
                                     dtype=data_type)
    d_output = cuda.mem_alloc(h_output.nbytes)

    stream = cuda.Stream()
    cuda.memcpy_htod_async(d_input, h_input, stream)
    context.execute_async_v2(bindings=[int(d_input),
                                       int(d_output)],
                             stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()

    return h_output
Example #31
 def _do_inference(self):
     # Transfer input data to the GPU (optionally serialized via the stream).
     [cuda.memcpy_htod_async(inp.device, inp.host, self.stream) for inp in self.inputs]
     # Run inference.
     self.context.execute_async(bindings=self.bindings, stream_handle=self.stream.handle)
     # Transfer predictions back from the GPU (optionally serialized via the stream).
     [cuda.memcpy_dtoh_async(out.host, out.device, self.stream) for out in self.outputs]
     # Synchronize the stream
     self.stream.synchronize()
     # Return only the host outputs.
     return [out.host for out in self.outputs]
Example #32
	def infer(cls, context, bindings, inputs, outputs, stream, batch_size=1):
		# Transfer input data to the GPU.
		[cuda.memcpy_htod_async(inp.device_memory, inp.host_memory, stream) for inp in inputs]
		# Run inference.
		context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
		# Transfer predictions back from the GPU.
		[cuda.memcpy_dtoh_async(out.host_memory, out.device_memory, stream) for out in outputs]
		# Synchronize the stream
		stream.synchronize()
		# Return only the host outputs.
		return [out.host_memory for out in outputs]
Example #33
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    
    # Run inference.
    success_flag = context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) # Bug

    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream so the host buffers are valid before returning.
    stream.synchronize()

    return [out.host for out in outputs]
Example #34
def trt_inference(stream, trt_ctx, d_input, d_output, input_signal,
                  input_signal_length):
    print("infer with shape: {}".format(input_signal.shape))
    trt_ctx.set_binding_shape(0, input_signal.shape)
    assert trt_ctx.all_binding_shapes_specified

    h_output = cuda.pagelocked_empty(tuple(trt_ctx.get_binding_shape(1)),
                                     dtype=np.float32)

    h_input_signal = cuda.register_host_memory(
        np.ascontiguousarray(input_signal.cpu().numpy().ravel()))
    cuda.memcpy_htod_async(d_input, h_input_signal, stream)
    trt_ctx.execute_async_v2(bindings=[int(d_input),
                                       int(d_output)],
                             stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()

    greedy_predictions = torch.tensor(h_output).argmax(dim=-1, keepdim=False)
    return greedy_predictions
Example #35
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
Example #36
def predict(context, batch, d_input, stream, bindings, p_output,
            d_output):  # result gets copied into output
    # transfer input data to device
    cuda.memcpy_htod_async(d_input, batch, stream)
    # execute model
    context.execute_async_v2(bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(p_output, d_output, stream)
    # synchronize threads
    stream.synchronize()

    # =============================================================================
    #     print(f"lr_shape: {batch.shape}"
    #           f"lr input : {batch}")
    #
    #     print(f"hr_shape: {p_output.shape}"
    #           f"hr output: {p_output}")
    # =============================================================================

    return p_output
Example #37
File: test_driver.py Project: minrk/PyCUDA
    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        import numpy
        shape = (32,8)
        a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        a[:] = numpy.random.randn(*shape)
        b[:] = numpy.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(
                drv.Out(dest), a_gpu, b_gpu,
                block=shape+(1,), stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest-a*b) == 0
Example #38
def main():

    # initialize TensorRT engine and parse ONNX model
    engine, context = build_engine(ONNX_FILE_PATH)

    # get sizes of input and output and allocate memory required for input data and for output data
    for binding in engine:
        if engine.binding_is_input(binding):  # we expect only one input
            input_shape = engine.get_binding_shape(binding)
            input_size = trt.volume(
                input_shape) * engine.max_batch_size * np.dtype(
                    np.float32).itemsize  # in bytes
            device_input = cuda.mem_alloc(input_size)
        else:  # and one output
            output_shape = engine.get_binding_shape(binding)
            # create page-locked memory buffers (i.e. won't be swapped to disk)
            host_output = cuda.pagelocked_empty(trt.volume(output_shape) *
                                                engine.max_batch_size,
                                                dtype=np.float32)
            device_output = cuda.mem_alloc(host_output.nbytes)

    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()

    # preprocess input data
    host_input = np.array(preprocess_image("input.jpeg").numpy(),
                          dtype=np.float32,
                          order='C')
    cuda.memcpy_htod_async(device_input, host_input, stream)

    # run inference
    context.execute_async(bindings=[int(device_input),
                                    int(device_output)],
                          stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    stream.synchronize()

    # postprocess results
    output_data = torch.Tensor(host_output).reshape(engine.max_batch_size,
                                                    output_shape[1])
    postprocess(output_data)
Example #39
    def loadONNX2TensorRT(self, filepath):
        '''
        Build a TensorRT execution engine from an ONNX file.
        :param filepath: path to the ONNX file
        '''

        engine = self.ONNX_build_engine(filepath)

        # 读取测试集
        datas = DataLoaders()
        test_loader = datas.testDataLoader()
        img, target = next(iter(test_loader))
        img = img.numpy()
        target = target.numpy()

        img = img.ravel()

        context = engine.create_execution_context()
        output = np.empty((100, 10), dtype=np.float32)

        # 分配内存
        d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
        d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)
        bindings = [int(d_input), int(d_output)]

        # PyCUDA buffer handling
        stream = cuda.Stream()
        # Copy the input data to the device
        cuda.memcpy_htod_async(d_input, img, stream)
        # Run the model
        context.execute_async(100, bindings, stream.handle, None)
        # Copy the predictions back from the device buffer
        cuda.memcpy_dtoh_async(output, d_output, stream)
        # Synchronize the stream
        stream.synchronize()

        print("Test Case: " + str(target))
        print("Prediction 100: " + str(np.argmax(output, axis=1)))

        del context
        del engine
Example #40
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width):
   """
   This is the function to run the inference
   Args:
      engine : Path to the TensorRT engine 
      pics_1 : Input images to the model.  
      h_input_1: Input in the host         
      d_input_1: Input in the device 
      h_output: Output in the host
      d_output: Output in the device
      stream: CUDA stream
      batch_size : Batch size for execution time
      height: Height of the output image
      width: Width of the output image
   
   Output:
      The list of output images

   """
   print('load images to buffer')
   load_images_to_buffer(pics_1, h_input_1)

   with engine.create_execution_context() as context:
       context.debug_sync = False
       # Transfer input data to the GPU.
       cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

       # Run inference.
       print('load profiler')
       context.profiler = trt.Profiler()
       print('execute')
       context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)])
       print('Transfer predictions back from the GPU.')
       # Transfer predictions back from the GPU.
       cuda.memcpy_dtoh_async(h_output, d_output, stream)
       # Synchronize the stream
       stream.synchronize()
       # Return the host output.
       print(h_output.shape)
       out = h_output.reshape((1,-1))
       return out 
Example #41
 def run_inference(self):
     """
     This function is generalized for multiple inputs/outputs.
     inputs and outputs are expected to be lists of HostDeviceMem objects.
     """
     # Transfer input data to the GPU.
     for inp in self.inputs:
         cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
     # Run inference.
     self.context.execute_async(
         batch_size=self.batch_size,
         bindings=self.bindings,
         stream_handle=self.stream.handle,
     )
     # Transfer predictions back from the GPU.
     for out in self.outputs:
         cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
     # Synchronize the stream
     self.stream.synchronize()
     # Return only the host outputs.
     return [out.host for out in self.outputs]
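The docstring above assumes HostDeviceMem objects pairing a page-locked host buffer with a device allocation. A minimal sketch of such a container, in the style of the TensorRT Python samples; the class and the commented allocation recipe are assumptions, not code from this project:

import pycuda.driver as cuda

class HostDeviceMem(object):
    """Pairs a page-locked host buffer with its matching device allocation."""
    def __init__(self, size, dtype):
        self.host = cuda.pagelocked_empty(size, dtype)   # page-locked, required for async copies
        self.device = cuda.mem_alloc(self.host.nbytes)   # device buffer of the same size

# Hypothetical per-binding allocation (TensorRT 7-style API, as used elsewhere on this page):
#   size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
#   dtype = trt.nptype(engine.get_binding_dtype(binding))
#   mem = HostDeviceMem(size, dtype); bindings.append(int(mem.device))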
Example #42
def test_kernel(n, a, x_gpu, y_gpu):
    code = """
		#include <stdio.h>
		__global__ void saxpy(int n, float a, float *x, float *y)
		{
			int index = blockIdx.x * blockDim.x + threadIdx.x;
			int stride = blockDim.x * gridDim.x;

			for (int i = index; i < n; i += stride)
			{
				y[i] = a*x[i] + y[i];
			}
		}
		"""
    mod = SourceModule(code)
    saxpy = mod.get_function("saxpy")
    saxpy(n, a, x_gpu, y_gpu, block=(1024, 1, 1), grid=(1, 1))
    out = cuda.register_host_memory(np.empty(n, dtype=np.float32))

    cuda.memcpy_dtoh_async(out, y_gpu)
    return out
Example #43
def run(nRow, nCol):
    print("test: nRow=%d,nCol=%d" % (nRow, nCol))
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger, nRow, nCol)
    if engine == None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    stream = cuda.Stream()

    data = np.full((nRow, nCol), 1, dtype=np.float32)  # uniform distribution
    #data = np.tile(np.arange(0,nCol,1,dtype=np.float32),[nRow,1])                                   # non-uniform distribution
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    outputH1 = np.empty(context.get_binding_shape(2),
                        dtype=trt.nptype(engine.get_binding_dtype(2)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)
    outputD1 = cuda.mem_alloc(outputH1.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    context.execute_async(
        1, [int(inputD0), int(outputD0),
            int(outputD1)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    cuda.memcpy_dtoh_async(outputH1, outputD1, stream)
    stream.synchronize()

    print("outputH0")
    print(
        np.shape(outputH0), "mean=%.2f,var=%.2f,max=%d,min=%d" %
        (np.mean(outputH0), np.var(outputH0), np.max(outputH0),
         np.min(outputH0)))
    print("outputH1")
    print(np.shape(outputH1), "mean=%.2f" % (np.mean(outputH1)))
Example #44
def threshold_integrated(series, value):
    global _dn, _n, _bn, _loc_tmp, _loc_out, _val_out, _loc, _val
        
    t = numpy.float32(value**2)
    nb = int(numpy.ceil(float(len(series))/nt/gs))
    
    if _bn is None or len(_bn) < nb:
        _bn = gpuarray.zeros(nb, dtype=numpy.uint32)
        
    if _n is None:
        _n = driver.pagelocked_empty((1), numpy.uint32, mem_flags=driver.host_alloc_flags.DEVICEMAP)
        ptr = numpy.intp(_n.base.get_device_pointer())
        class T():
            pass
        _dn = T()
        _dn.gpudata = ptr
        _dn.flags = _n.flags
        
    if _loc_tmp is None or len(series) > len(_loc_tmp):
        _loc_tmp = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _loc_out = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _val_out = gpuarray.zeros(len(series), dtype=series.dtype)
        _val = driver.pagelocked_empty((4096*256), numpy.complex64)
        _loc = driver.pagelocked_empty((4096*256), numpy.uint32)
    
    #Do the thresholding by block
    stuff(series.data, _loc_tmp, _bn, t, numpy.uint32(len(series)), block=(nt, 1, 1), grid=(nb, 1))
    
    # Recombine the blocks into a final output
    stuff2(series.data, _loc_tmp, _loc_out, _val_out, _bn, _dn, block=(nb, 1, 1), grid=(nb, 1))
    
    # We need to get the data back now
    pycbc.scheme.mgr.state.context.synchronize()
    if _n != 0: 
        driver.memcpy_dtoh_async(_val[0:_n], _val_out.gpudata)
        driver.memcpy_dtoh_async(_loc[0:_n], _loc_out.gpudata)
        pycbc.scheme.mgr.state.context.synchronize()
    return _loc[0:_n], _val[0:_n]
Example #45
File: packing.py Project: bartwozniak/PyFR
 def run(self, scomp, scopy):
     cuda.memcpy_dtoh_async(mpimat.hdata, mpimat.data, scomp)
Example #46
	flop = 3*(nx*ny*nz*30)*tgap
	flops = np.zeros(tmax/tgap+1)
	start, stop = cuda.Event(), cuda.Event()
	start.record()

# main loop
ey_tmp = cuda.pagelocked_zeros((ny,nz),'f')
ez_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hy_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hz_tmp = cuda.pagelocked_zeros_like(ey_tmp)
stream1 = cuda.Stream()
for tn in xrange(1, tmax+1):
	update_h.prepared_async_call(bpg0, stream1, np.int32(By), *eh_args)
	for i, bpg in enumerate(bpg_list): update_h.prepared_call(bpg, np.int32(i*MBy), *eh_args)
	if rank == 0:
		cuda.memcpy_dtoh_async(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1) 
		cuda.memcpy_dtoh_async(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1) 
		stream1.synchronize()
		comm.Send(hy_tmp, 1, 20)
		comm.Send(hz_tmp, 1, 21)
	elif rank == 1:
		comm.Recv(hy_tmp, 0, 20)
		comm.Recv(hz_tmp, 0, 21)
		cuda.memcpy_htod_async(int(hy_gpu), hy_tmp, stream1) 
		cuda.memcpy_htod_async(int(hz_gpu), hz_tmp, stream1) 
		cuda.memcpy_dtoh_async(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1) 
		cuda.memcpy_dtoh_async(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1) 
		stream1.synchronize()
		comm.Send(hy_tmp, 2, 20)
		comm.Send(hz_tmp, 2, 21)
	elif rank == 2:
Example #47
File: gpuarray.py Project: Benli11/pycuda
    if len(shape) <= 1:
        if isinstance(src, GPUArray):
            if isinstance(dst, GPUArray):
                if async:
                    drv.memcpy_dtod_async(dst.gpudata, src.gpudata, src.nbytes, stream=stream)
                else:
                    drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
            else:
                # The arrays might be contiguous in the sense of
                # having no gaps, but the axes could be transposed
                # so that the order is neither Fortran or C.
                # So, we attempt to get a contiguous view of dst.
                dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,))
                if async:
                    drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
                else:
                    drv.memcpy_dtoh(dst, src.gpudata)
        else:
            src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
            if async:
                drv.memcpy_htod_async(dst.gpudata, src, stream=stream)
            else:
                drv.memcpy_htod(dst.gpudata, src)
        return

    if len(shape) == 2:
        copy = drv.Memcpy2D()
    elif len(shape) == 3:
        copy = drv.Memcpy3D()
    else:
Example #48
 def from_buf_async(self, cl_buf, stream=None):
     cuda.memcpy_dtoh_async(self.buffers[cl_buf], cl_buf, stream)
Example #49
    def convert_image_rgb(self, image):
        global program
        start = time.time()
        iplanes = image.get_planes()
        w = image.get_width()
        h = image.get_height()
        stride = image.get_rowstride()
        pixels = image.get_pixels()
        debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
        assert iplanes==ImageWrapper.PACKED, "must use packed format as input"
        assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
        divs = get_subsampling_divs(self.dst_format)

        #copy packed rgb pixels to GPU:
        upload_start = time.time()
        stream = driver.Stream()
        mem = numpy.frombuffer(pixels, dtype=numpy.byte)
        in_buf = driver.mem_alloc(len(pixels))
        hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)
        pycuda.driver.memcpy_htod_async(in_buf, mem, stream)

        out_bufs = []
        out_strides = []
        out_sizes = []
        for i in range(3):
            x_div, y_div = divs[i]
            out_stride = roundup(self.dst_width/x_div, 4)
            out_height = roundup(self.dst_height/y_div, 2)
            out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
            out_bufs.append(out_buf)
            out_strides.append(out_stride)
            out_sizes.append((out_stride, out_height))
        #ensure uploading has finished:
        stream.synchronize()
        #we can now unpin the host memory:
        hmem.base.unregister()
        debug("allocation and upload took %.1fms", 1000.0*(time.time() - upload_start))

        kstart = time.time()
        kargs = [in_buf, numpy.int32(stride)]
        for i in range(3):
            kargs.append(out_bufs[i])
            kargs.append(numpy.int32(out_strides[i]))
        blockw, blockh = 16, 16
        #figure out how many pixels we process at a time in each dimension:
        xdiv = max([x[0] for x in divs])
        ydiv = max([x[1] for x in divs])
        gridw = max(1, w/blockw/xdiv)
        if gridw*2*blockw<w:
            gridw += 1
        gridh = max(1, h/blockh/ydiv)
        if gridh*2*blockh<h:
            gridh += 1
        debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name, tuple(kargs), (gridw, gridh), (blockw, blockh, 1))
        self.kernel_function(*kargs, block=(blockw,blockh,1), grid=(gridw, gridh))

        #we can now free the GPU source buffer:
        in_buf.free()
        kend = time.time()
        debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
        self.frames += 1

        #copy output YUV channel data to host memory:
        read_start = time.time()
        pixels = []
        strides = []
        for i in range(3):
            x_div, y_div = divs[i]
            out_size = out_sizes[i]
            #direct full plane async copy keeping current GPU padding:
            plane = driver.aligned_empty(out_size, dtype=numpy.byte)
            driver.memcpy_dtoh_async(plane, out_bufs[i], stream)
            pixels.append(plane.data)
            stride = out_strides[min(len(out_strides)-1, i)]
            strides.append(stride)
        stream.synchronize()
        #the copying has finished, we can now free the YUV GPU memory:
        #(the host memory will be freed by GC when 'pixels' goes out of scope)
        for out_buf in out_bufs:
            out_buf.free()
        self.cuda_context.synchronize()
        read_end = time.time()
        debug("strides=%s", strides)
        debug("read back took %.1fms, total time: %.1f", (read_end-read_start)*1000.0, 1000.0*(time.time()-start))
        return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)
Example #50
 def copy(self, fb, dim, pool, stream=None):
     h_out = pool.allocate((dim.h, dim.w, 4), "u1")
     cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
     return h_out
Example #51
Stream2 = drv.Stream()

MT_state_buf = drv.mem_alloc(SIZE * MT_N * 4)
MT_state_res_buf = drv.mem_alloc(MT_state_result.nbytes)

prg = SourceModule(
    transform_to_cuda(
        gen_kernel(MT_N, STATE_SIZE, M, SIZE, SIGNIFICANT_LENGTH)
    )
)
prog = prg.get_function('mt_brute')

zzz = time.time()

ev = prog(np.uint32(0), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE/STATE_SIZE, 1), stream=Stream)
drv.memcpy_dtoh_async(MT_state_result, MT_state_res_buf, stream=Stream2)

for i in xrange(TEST_ITERATIONS):
    prog(np.uint32(i*SIZE), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE/STATE_SIZE, 1), stream=Stream)
    drv.memcpy_dtoh(MT_state_result, MT_state_res_buf)#, stream=Stream2)

zzz = time.time() - zzz
print '>>>', zzz

for row in MT_state_result[0]:
    print row



MT_state_buf.free()
MT_state_res_buf.free()