Example #1
    def set(self, queue_adapter, buf, no_async=False):
        device_idx = queue_adapter._device_idx
        assert device_idx == self._device_idx
        self._context_adapter.activate_device(device_idx)

        # PyCUDA needs pointers to be passed as `numpy.number` to kernels,
        # but `memcpy` functions require Python `int`s.
        ptr = int(self._ptr) if isinstance(self._ptr,
                                           numpy.number) else self._ptr

        if isinstance(buf, numpy.ndarray):
            if no_async:
                pycuda_driver.memcpy_htod(ptr, buf)
            else:
                pycuda_driver.memcpy_htod_async(
                    ptr, buf, stream=queue_adapter._pycuda_stream)
        else:
            buf_ptr = int(buf._ptr) if isinstance(buf._ptr,
                                                  numpy.number) else buf._ptr
            if no_async:
                pycuda_driver.memcpy_dtod(ptr, buf_ptr, buf.size)
            else:
                pycuda_driver.memcpy_dtod_async(
                    ptr,
                    buf_ptr,
                    buf.size,
                    stream=queue_adapter._pycuda_stream)
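A minimal sketch of the pointer-type distinction noted in the comment above (not taken from the project; it assumes a pycuda.autoinit context and an illustrative allocation):

import numpy
import pycuda.autoinit  # noqa: F401 -- creates a default context
import pycuda.driver as pycuda_driver

host = numpy.zeros(4, dtype=numpy.float32)
dev = pycuda_driver.mem_alloc(host.nbytes)   # DeviceAllocation object

# The memcpy_* functions accept the allocation itself or a plain Python int...
pycuda_driver.memcpy_htod(int(dev), host)
# ...while a raw address passed as a kernel argument is usually wrapped in a
# fixed-width numpy integer (e.g. numpy.uintp) so its size is unambiguous.
kernel_arg = numpy.uintp(int(dev))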
Example #2
def gpuarray_memcpy(dest, src):
    '''Device memory copy with pycuda from
    src GPUArray to dest GPUArray.
    '''
    #     dest[:] = src
    #     memcpy_atoa(dest, 0, src, 0, len(src))
    memcpy_dtod_async(dest.gpudata, src.gpudata, src.nbytes)
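A hedged usage sketch for the helper above (assumes a pycuda.autoinit context; the data is illustrative):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from pycuda.driver import memcpy_dtod_async  # noqa: F401 -- used by gpuarray_memcpy above

src = gpuarray.to_gpu(np.arange(1024, dtype=np.float32))
dest = gpuarray.empty_like(src)
gpuarray_memcpy(dest, src)   # asynchronous device-to-device copy on the default stream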
Example #3
 def copy(self):
     ret = GPULongint()
     ret.hide_digit = self.hide_digit
     ret.intsize_level = self.intsize_level
     ret.digitN = 1 << self.intsize_level
     dmy = drv.mem_alloc(4 * self.digitN)
     drv.memcpy_dtod_async(dmy, self.number, 4 * self.digitN)
     ret.number = dmy
     return ret
Example #4
File: cuda.py Project: jakirkham/reikna
 def _copy_array_buffer(self,
                        dest,
                        src,
                        nbytes,
                        src_offset=0,
                        dest_offset=0):
     cuda.memcpy_dtod_async(int(dest.gpudata) + dest_offset,
                            int(src.gpudata) + src_offset,
                            nbytes,
                            stream=self._queue)
Example #5
	def copyBuffer(self, buf, dest=None):
		if dest is None:
			buf_copy = self.allocate(buf.shape, buf.dtype)
		else:
			buf_copy = dest

		cuda.memcpy_dtod_async(buf_copy.gpudata, buf.gpudata, buf.nbytes, stream=self.stream)

		if dest is None:
			return buf_copy
Example #6
def copy_async(array, out=None, out_device=None, stream=None):
    """Copies a GPUArray object using the given stream.

    This function can copy the device array to the destination array on another
    device.

    Args:
        array (~pycuda.gpuarray.GPUArray): Array to be copied.
        out (~pycuda.gpuarray.GPUArray): Destination array.
            If it is not ``None``, then the ``out_device`` argument is ignored.
        out_device: Destination device specifier. Actual device object is
            obtained by passing this value to :func:`get_device`.
        stream (~pycuda.driver.Stream): CUDA stream.

    Returns:
        ~pycuda.gpuarray.GPUArray: Copied array.

        If ``out`` is not specified, then the array is allocated on the device
        specified by the ``out_device`` argument.

    .. warning::

       Currently, copy_async over different devices raises an exception, since
       PyCUDA does not provide a definition of :func:`pycuda.driver.memcpy_peer_async`.

    """
    in_device = get_device(array)
    if out is None:
        if out_device is None:
            out_device = in_device
        else:
            out_device = get_device(out_device)

        with using_device(out_device):
            out = empty_like(array)
    else:
        out_device = get_device(out)

    with using_device(in_device):
        if in_device == out_device:
            drv.memcpy_dtod_async(out.ptr,
                                  array.ptr,
                                  out.nbytes,
                                  stream=stream)
        else:
            drv.memcpy_peer_async(out.ptr,
                                  array.ptr,
                                  out.nbytes,
                                  out_device,
                                  in_device,
                                  stream=stream)

    return out
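A possible same-device call pattern for copy_async above (a sketch only; get_device and using_device come from the surrounding chainer module, and an active PyCUDA context is assumed):

import numpy as np
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

stream = drv.Stream()
x = gpuarray.to_gpu(np.random.rand(256).astype(np.float32))
y = copy_async(x, stream=stream)   # same device, so memcpy_dtod_async is used
stream.synchronize()               # wait for the asynchronous copy to finish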
Example #7
    def add_new_frame(self, x):
        x = x.flatten()
        assert len(x) == self._featuredim
        
        x_gpu = gpuarray.to_gpu_async(x)
        BLOCK_SIZE = (256,1,1)
        nblocks = int(np.ceil(float(self._featuredim) / BLOCK_SIZE[0]))
        GRID_SIZE = (nblocks, self._framecount, 1)

        cudabuffer.cyclebuffer(self._Y_gpu_scratch, x_gpu, self._Y_gpu,
                               np.int32(self._featuredim), np.int32(self._framecount),
                               block=BLOCK_SIZE, grid=GRID_SIZE)

        # Copy self._Y_gpu into self._Y_gpu_scratch
        cuda.memcpy_dtod_async(self._Y_gpu_scratch.gpudata, self._Y_gpu.gpudata, self._Y_gpu.nbytes)
Example #8
File: cuda.py Project: kuwa32/chainer
def copy_async(array, out=None, out_device=None, stream=None):
    """Copies a GPUArray object using the given stream.

    This function can copy the device array to the destination array on another
    device.

    Args:
        array (~pycuda.gpuarray.GPUArray): Array to be copied.
        out (~pycuda.gpuarray.GPUArray): Destination array.
            If it is not ``None``, then the ``out_device`` argument is ignored.
        out_device: Destination device specifier. Actual device object is
            obtained by passing this value to :func:`get_device`.
        stream (~pycuda.driver.Stream): CUDA stream.

    Returns:
        ~pycuda.gpuarray.GPUArray: Copied array.

        If ``out`` is not specified, then the array is allocated on the device
        specified by the ``out_device`` argument.

    .. warning::

       Currently, copy_async over different devices raises an exception, since
       PyCUDA does not provide a definition of :func:`pycuda.driver.memcpy_peer_async`.

    """
    in_device = get_device(array)
    if out is None:
        if out_device is None:
            out_device = in_device
        else:
            out_device = get_device(out_device)

        with using_device(out_device):
            out = empty_like(array)
    else:
        out_device = get_device(out)

    with using_device(in_device):
        if in_device == out_device:
            drv.memcpy_dtod_async(
                out.ptr, array.ptr, out.nbytes, stream=stream)
        else:
            drv.memcpy_peer_async(out.ptr, array.ptr, out.nbytes, out_device,
                                  in_device, stream=stream)

    return out
Example #9
    def _assign(self, value):

        stream = self.backend.stream
        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

                if self.dtype.itemsize == 1:
                    drv.memset_d8_async(self.gpudata,
                                        unpack_from('B', value)[0], self.size,
                                        stream)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16_async(self.gpudata,
                                         unpack_from('H', value)[0], self.size,
                                         stream)
                else:
                    drv.memset_d32_async(self.gpudata,
                                         unpack_from('I', value)[0], self.size,
                                         stream)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod_async(self.gpudata, value.gpudata, self.nbytes,
                                      stream)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value, device=None)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
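A standalone sketch of the driver-level fill used in the contiguous branch above (assumptions: a pycuda.autoinit context and a float32 array; the scalar's raw bytes are reinterpreted as a 32-bit pattern for memset_d32_async):

from struct import unpack_from

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

a = gpuarray.empty(1024, dtype=np.float32)
stream = drv.Stream()

value = np.float32(3.5)
pattern = unpack_from('I', value.tobytes())[0]   # raw bits as an unsigned 32-bit word
drv.memset_d32_async(a.gpudata, pattern, a.size, stream)
stream.synchronize()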
Example #10
    def _assign(self, value):

        stream = self.backend.stream
        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

                if self.dtype.itemsize == 1:
                    drv.memset_d8_async( self.gpudata,
                                   unpack_from('B', value)[0],
                                   self.size, stream)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16_async(self.gpudata,
                                   unpack_from('H', value)[0],
                                   self.size, stream)
                else:
                    drv.memset_d32_async(self.gpudata,
                                   unpack_from('I', value)[0],
                                   self.size, stream)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod_async(self.gpudata, value.gpudata, self.nbytes, stream)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value, device=None)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
Example #11
	def copyBuffer(self, buf, dest=None, src_offset=0, dest_offset=0, length=None):

		elem_size = buf.dtype.itemsize
		size = buf.nbytes if length is None else elem_size * length
		src_offset *= elem_size
		dest_offset *= elem_size

		if dest is None:
			ddest = self.allocate(buf.shape, buf.dtype)
		else:
			ddest = dest

		cuda.memcpy_dtod_async(int(ddest.gpudata) + dest_offset,
			int(buf.gpudata) + src_offset,
			size, stream=self.stream)

		if dest is None:
			return ddest
Example #12
 def copyBuffer(self, gpu_stream, buffer):
     """
     Copying the given device buffer into the already allocated memory
     """
     if not self.holds_data:
         raise RuntimeError('The buffer has been freed before copying buffer')
     
     if not buffer.holds_data:
         raise RuntimeError('The provided buffer is either not allocated, or has been freed before copying buffer')
     
     # Make sure that the input is of correct size:
     assert(buffer.nx_halo == self.nx_halo), str(buffer.nx_halo) + " vs " + str(self.nx_halo)
     assert(buffer.ny_halo == self.ny_halo), str(buffer.ny_halo) + " vs " + str(self.ny_halo)
     
     assert(buffer.bytes_per_float == self.bytes_per_float), "Provided buffer itemsize is " + str(buffer.bytes_per_float) + ", but should have been " + str(self.bytes_per_float)
     
     # Okay, everything is fine - issue device-to-device-copy:
     total_num_bytes = self.bytes_per_float*self.nx_halo*self.ny_halo
     cuda.memcpy_dtod_async(self.data.ptr, buffer.data.ptr, total_num_bytes, stream=gpu_stream)
Example #13
File: solver.py Project: zzz622848/lfd
    def solve(self, wt_n, y_nd, bend_coef, f_res):
        if y_nd.shape[0] != self.n or y_nd.shape[1] != self.d:
            raise RuntimeError(
                "The dimensions of y_nd doesn't match the dimensions of x_nd")
        if not y_nd.flags.c_contiguous:
            raise RuntimeError("Expected y_nd to be c-contiguous but it isn't")
        self.sqrtWQN_gpu.set_async(np.sqrt(wt_n)[:, None] * self.QN)
        geam(self.NKN_gpu, self.NRN_gpu, self.lhs_gpu, alpha=bend_coef, beta=1)
        gemm(self.sqrtWQN_gpu,
             self.sqrtWQN_gpu,
             self.lhs_gpu,
             transa='T',
             alpha=1,
             beta=1)

        drv.memcpy_dtod_async(self.rhs_gpu.gpudata, self.NR_gpu.gpudata,
                              self.rhs_gpu.nbytes)
        self.y_dnW_gpu.set_async(
            y_nd.T * wt_n)  # use transpose so that it is f_contiguous
        gemm(self.QN_gpu,
             self.y_dnW_gpu,
             self.rhs_gpu,
             transa='T',
             transb='T',
             alpha=1,
             beta=1)

        if lfd.registration._has_cula:
            culinalg.cho_solve(self.lhs_gpu, self.rhs_gpu)
            z = self.rhs_gpu.get()
            culinalg.dot(self.N_gpu, self.rhs_gpu, out=self.theta_gpu)
            theta = self.theta_gpu.get()
        else:  # if cula is not installed, perform the last two computations on the CPU
            z = np.linalg.solve(self.lhs_gpu.get(), self.rhs_gpu.get())
            theta = self.N.dot(z)
        f_res.update(self.x_nd,
                     y_nd,
                     bend_coef,
                     self.rot_coef,
                     wt_n,
                     theta,
                     N=self.N,
                     z=z)
Example #14
File: Common.py Project: kaihc/gpu-ocean
 def copyBuffer(self, gpu_stream, buffer):
     """
     Copying the given device buffer into the already allocated memory
     """
     if not self.holds_data:
         raise RuntimeError('The buffer has been freed before copying buffer')
     
     if not buffer.holds_data:
         raise RuntimeError('The provided buffer is either not allocated, or has been freed before copying buffer')
     
     # Make sure that the input is of correct size:
     assert(buffer.nx_halo == self.nx_halo), str(buffer.nx_halo) + " vs " + str(self.nx_halo)
     assert(buffer.ny_halo == self.ny_halo), str(buffer.ny_halo) + " vs " + str(self.ny_halo)
     
     assert(buffer.bytes_per_float == self.bytes_per_float), "Provided buffer itemsize is " + str(buffer.bytes_per_float) + ", but should have been " + str(self.bytes_per_float)
     
     # Okay, everything is fine - issue device-to-device-copy:
     total_num_bytes = self.bytes_per_float*self.nx_halo*self.ny_halo
     cuda.memcpy_dtod_async(self.data.ptr, buffer.data.ptr, total_num_bytes, stream=gpu_stream)
Example #15
    def replica_to_fragment(self, reptsr, fragtsr):
        '''
        Scatters the replica into the fragments (this just discards, so no p2p
        communication is necessary).
        '''
        numrep = self.num_dev
        fragsz = fragtsr.size
        dsz = fragtsr.dtype.itemsize
        assert reptsr.size == fragsz * numrep
        strms = self.strms
        starts = [i * fragsz for i in range(numrep)]

        for dbuf, sbuf, ctx, offset, strm in zip(fragtsr.tlist, reptsr.tlist,
                                                 self.ctxs, starts, strms):
            ctx.push()
            drv.memcpy_dtod_async(dbuf.ptr, sbuf.ptr + offset * dsz,
                                  fragsz * dsz, strm)
            ctx.pop()

        self.synchronize()
Example #16
File: mgpu.py Project: neuroidss/neon
    def replica_to_fragment(self, reptsr, fragtsr):
        '''
        Scatters the replica into the fragments (this just discards, so no p2p
        communication is necessary).
        '''
        numrep = self.num_dev
        fragsz = fragtsr.size
        dsz = fragtsr.dtype.itemsize
        assert reptsr.size == fragsz * numrep
        strms = self.strms
        starts = [i * fragsz for i in range(numrep)]

        for dbuf, sbuf, ctx, offset, strm in zip(fragtsr.tlist, reptsr.tlist,
                                                 self.ctxs, starts, strms):
            ctx.push()
            drv.memcpy_dtod_async(dbuf.ptr, sbuf.ptr + offset * dsz,
                                  fragsz * dsz, strm)
            ctx.pop()

        self.synchronize()
Example #17
File: solver.py Project: antingshen/lfd
    def solve(self, wt_n, y_nd, bend_coef, f_res):
        if y_nd.shape[0] != self.n or y_nd.shape[1] != self.d:
            raise RuntimeError("The dimensions of y_nd doesn't match the dimensions of x_nd")
        if not y_nd.flags.c_contiguous:
            raise RuntimeError("Expected y_nd to be c-contiguous but it isn't")
        self.sqrtWQN_gpu.set_async(np.sqrt(wt_n)[:,None] * self.QN)
        geam(self.NKN_gpu, self.NRN_gpu, self.lhs_gpu, alpha=bend_coef, beta=1)
        gemm(self.sqrtWQN_gpu, self.sqrtWQN_gpu, self.lhs_gpu, transa='T', alpha=1, beta=1)

        drv.memcpy_dtod_async(self.rhs_gpu.gpudata, self.NR_gpu.gpudata, self.rhs_gpu.nbytes)
        self.y_dnW_gpu.set_async(y_nd.T * wt_n) # use transpose so that it is f_contiguous
        gemm(self.QN_gpu, self.y_dnW_gpu, self.rhs_gpu, transa='T', transb='T', alpha=1, beta=1)
        
        if lfd.registration._has_cula:
            culinalg.cho_solve(self.lhs_gpu, self.rhs_gpu)
            culinalg.dot(self.N_gpu, self.rhs_gpu, out=self.theta_gpu)
            theta = self.theta_gpu.get()
        else: # if cula is not installed, perform the last two computations on the CPU
            z = np.linalg.solve(self.lhs_gpu.get(), self.rhs_gpu.get())
            theta = self.N.dot(z)
        f_res.set_ThinPlateSpline(self.x_nd, y_nd, bend_coef, self.rot_coef, wt_n, theta=theta)
Example #18
    def set_constant_buffer(self, queue_adapter: CuQueueAdapter, name: str,
                            arr: Union[CuBufferAdapter, numpy.ndarray]):
        """
        Uploads a constant array ``arr`` corresponding to the symbol ``name`` to the context.
        """
        self._context_adapter.activate_device(self._device_idx)
        symbol, size = self._pycuda_program.get_global(name)

        pycuda_stream = queue_adapter._pycuda_stream

        if isinstance(arr, CuBufferAdapter):
            transfer_size = arr.size
        elif isinstance(arr, numpy.ndarray):
            transfer_size = prod(arr.shape) * arr.dtype.itemsize
        else:  # pragma: no cover
            # Shouldn't reach this path because the type is already checked by the caller.
            # Nevertheless leaving it here as a sanity check.
            raise TypeError(f"Unsupported array type: {type(arr)}")

        if transfer_size != size:
            raise ValueError(f"Incorrect size of the constant buffer; "
                             f"expected {size} bytes, got {transfer_size}")

        if isinstance(arr, CuBufferAdapter):
            pycuda_driver.memcpy_dtod_async(symbol,
                                            arr.kernel_arg,
                                            arr.size,
                                            stream=pycuda_stream)
        else:
            # This serves two purposes:
            # 1. Gives us a pagelocked array, as PyCUDA requires
            # 2. Makes the array contiguous
            # Constant arrays are usually quite small, so this won't affect performance.
            buf = pycuda_driver.pagelocked_empty(arr.shape, arr.dtype)
            numpy.copyto(buf, arr)
            pycuda_driver.memcpy_htod_async(symbol, buf, stream=pycuda_stream)
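A minimal standalone sketch of the same constant-memory upload outside the adapter classes (assumes pycuda.autoinit; the module source and the symbol name coeffs are illustrative):

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as pycuda_driver
from pycuda.compiler import SourceModule

module = SourceModule("__constant__ float coeffs[4];")
symbol, size = module.get_global("coeffs")

stream = pycuda_driver.Stream()
# Page-locked, contiguous staging buffer, as in the numpy branch above
buf = pycuda_driver.pagelocked_empty(4, numpy.float32)
buf[:] = [0.1, 0.2, 0.3, 0.4]
assert buf.nbytes == size
pycuda_driver.memcpy_htod_async(symbol, buf, stream=stream)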
Example #19
File: blasext.py Project: zwghit/PyFR
 def run(self, queue):
     cuda.memcpy_dtod_async(dst.data, src.data, dst.nbytes,
                            stream=queue.cuda_stream_comp)
Example #20
    def run_step(self,
                 iter_parameters,
                 iter_limit=1000,
                 debug=False,
                 time=False):
        self.step_init(iter_parameters, debug)

        goal_reached = False
        iteration = 0
        while True:
            start_iter = timer()
            ########## create Wave front ###############
            start_wave_f = timer()  ############################# timer
            wavefront(self.dev_Gindicator,
                      self.dev_open,
                      self.dev_cost,
                      self.dev_threshold,
                      self.dev_n,
                      block=(self.threadsPerBlock, 1, 1),
                      grid=(self.nBlocksPerGrid, 1))

            self.dev_threshold += 2 * self.dev_radius
            goal_reached = self.dev_Gindicator[self.goal].get() == 1
            end_wave_f = timer()  ############################# timer

            start_wave_c = timer()  ############################# timer
            dev_Gscan = cuda.to_gpu(self.dev_Gindicator)
            exclusiveScan(dev_Gscan)

            dev_gSize = dev_Gscan[-1] + self.dev_Gindicator[-1]
            gSize = int(dev_gSize.get())

            if iteration >= iter_limit:
                print('### iteration limit ###', iteration)
                return self.route
            elif goal_reached:
                print('### goal reached ### ', iteration)
                self.parent = self.dev_parent.get()
                self.route = []
                self.get_path()
                return self.route
            elif gSize == 0:
                print('### threshold skip ', iteration)
                continue

            dev_G = cuda.GPUArray([
                gSize,
            ], np.int32)
            #dev_G = cuda.zeros(gSize, dtype=np.int32)

            compact(dev_G,
                    dev_Gscan,
                    self.dev_Gindicator,
                    self.dev_waypoints,
                    self.dev_n,
                    block=(self.threadsPerBlock, 1, 1),
                    grid=(self.nBlocksPerGrid, 1))
            end_wave_c = timer()  ############################# timer

            ######### scan and compact open set to connect neighbors ###############
            start_open = timer()  ############################# timer
            dev_yscan = cuda.to_gpu(self.dev_open)
            exclusiveScan(dev_yscan)
            dev_ySize = dev_yscan[-1] + self.dev_open[-1]
            ySize = int(dev_ySize.get())

            #dev_y = cuda.zeros(ySize, dtype=np.int32)
            dev_y = cuda.GPUArray([
                ySize,
            ], np.int32)
            compact(dev_y,
                    dev_yscan,
                    self.dev_open,
                    self.dev_waypoints,
                    self.dev_n,
                    block=(self.threadsPerBlock, 1, 1),
                    grid=(self.nBlocksPerGrid, 1))
            end_open = timer()  ############################# timer

            ########## creating neighbors of wave front to connect open ###############

            #dev_xindicator = cuda.zeros_like(self.dev_open, dtype= np.int32,stream= self.stream1)

            #self.dev_xindicator.fill(self.zero_val, stream = self.stream1)
            #print(self.dev_xindicator_zeros.nbytes)
            start_neighbor = timer()  ############################# timer
            drv.memcpy_dtod_async(self.dev_xindicator.gpudata,
                                  self.dev_xindicator_zeros.gpudata,
                                  self.dev_xindicator_zeros.nbytes)
            gBlocksPerGrid = int(
                ((gSize + self.threadsPerBlock - 1) / self.threadsPerBlock))
            neighborIndicator(self.dev_xindicator,
                              dev_G,
                              self.dev_unexplored,
                              self.dev_neighbors,
                              self.dev_num_neighbors,
                              self.neighbors_index,
                              dev_gSize,
                              block=(self.threadsPerBlock, 1, 1),
                              grid=(gBlocksPerGrid, 1))
            end_neighbor = timer()  ############################# timer

            start_neighbor_c = timer()  ############################# timer
            dev_xscan = cuda.to_gpu(self.dev_xindicator)
            exclusiveScan(dev_xscan)

            #start_create_n= timer()
            #dev_xscan = cuda.to_gpu_async(self.dev_xindicator, stream=self.stream1)
            #start_create_n= timer()
            #dev_xSize = cuda.sum(self.dev_xindicator, stream = self.stream1)
            #exclusiveScan(dev_xscan, stream=self.stream1)
            #start_create_n= timer()
            dev_xSize = dev_xscan[-1] + self.dev_xindicator[-1]
            #end_create_n= timer()

            xSize = int(dev_xSize.get())

            if xSize == 0:
                print('### x skip')
                continue

            dev_x = cuda.GPUArray([
                xSize,
            ], np.int32)
            #dev_x = cuda.zeros(xSize, dtype=np.int32)
            compact(dev_x,
                    dev_xscan,
                    self.dev_xindicator,
                    self.dev_waypoints,
                    self.dev_n,
                    block=(self.threadsPerBlock, 1, 1),
                    grid=(self.nBlocksPerGrid, 1))
            end_neighbor_c = timer()  ############################# timer

            ######### connect neighbors ####################
            # # launch planning
            start_connect = timer()  ############################# timer
            xBlocksPerGrid = int(
                ((xSize + self.threadsPerBlock - 1) / self.threadsPerBlock))
            dubinConnection(self.dev_cost,
                            self.dev_parent,
                            dev_x,
                            dev_y,
                            self.dev_states,
                            self.dev_open,
                            self.dev_unexplored,
                            dev_xSize,
                            dev_ySize,
                            self.dev_obstacles,
                            self.dev_num_obs,
                            self.dev_radius,
                            block=(self.threadsPerBlock, 1, 1),
                            grid=(xBlocksPerGrid, 1))
            end_connect = timer()  ############################# timer

            end_iter = timer()
            if debug:
                print('dev parents:', self.dev_parent)
                print('dev cost: ', self.dev_cost)
                print('dev unexplored: ', self.dev_unexplored)
                print('dev open: ', self.dev_open)
                print('dev threshold: ', self.dev_threshold)

                print('goal reached: ', goal_reached)
                print('y size: ', ySize, 'y: ', dev_y)
                print('G size: ', gSize, 'G: ', dev_G)

                print('x size: ', dev_xSize, 'x: ', dev_x)
                print('wave front timer: ', end_wave_f - start_wave_f)
                print('wave compact timer: ', end_wave_c - start_wave_c)
                print('open set timer: ', end_open - start_open)
                print('neighbor timer: ', end_neighbor - start_neighbor)
                print('neighbor compact timer: ',
                      end_neighbor_c - start_neighbor_c)
                print('connection timer: ', end_connect - start_connect)
                iteration_time = end_iter - start_iter
                print(
                    f'######### iteration: {iteration} iteration time: {iteration_time}'
                )

            if time and iteration > 0:
                self.time_data["wavefront"].append(end_wave_f - start_wave_f)
                self.time_data["wavefront_compact"].append(end_wave_c -
                                                           start_wave_c)
                self.time_data["open_compact"].append(end_open - start_open)
                self.time_data["neighbors"].append(end_neighbor -
                                                   start_neighbor)
                self.time_data["neighbors_compact"].append(end_neighbor_c -
                                                           start_neighbor_c)
                self.time_data["connection"].append(end_connect -
                                                    start_connect)
                self.time_data["elapsed"].append(end_iter - start_iter)
                self.time_data["iteration"].append(iteration)

            iteration += 1
Example #21
    def infer_greedy(self):
        (infer_ndtype, dtype_esize) = get_dtype_info(self.args.input_dtype)
        batch_idx = 0

        # Static knobs
        ALWAYS_ADVANCE_TIME = True  # hack to always consume time pointer regardless of symbol outcome

        # Iterate over batches
        for image_idx in range(0, self.num_samples, self.batch_size):
            # Actual batch size might be smaller than max batch size
            actual_batch_size = self.batch_size if image_idx + self.batch_size <= self.num_samples else self.num_samples - image_idx

            start_time = time.time()

            # output and runtime data structures
            #
            enc_ptr = [0 for tdix in range(actual_batch_size)
                       ]  # holds encoder pointer per batch element (Xt)
            out_sym = [
                list() for tdix in range(actual_batch_size)
            ]  # holds output symbol translation per batch element (Yu)

            # data initialization for the batch ----------
            #

            # dec_inputs : host data for the decoder transfers
            dec_host_inputs = [
                np.ascontiguousarray(
                    np.zeros((actual_batch_size, 1), dtype=np.int32,
                             order='C')),  # input label
                np.ascontiguousarray(
                    np.zeros((actual_batch_size,
                              2 * self.hyperP.decoder_hidden_size),
                             dtype=infer_ndtype,
                             order='C')),  # hidden: layers * hidden
                np.ascontiguousarray(
                    np.zeros((actual_batch_size,
                              2 * self.hyperP.decoder_hidden_size),
                             dtype=infer_ndtype,
                             order='C')),  # cell:  layers * hidden
            ]

            # host_outputs : host data for outputs from decoder and joint/beam_search
            host_outputs = [
                np.ascontiguousarray(
                    np.zeros((actual_batch_size, 1 * self.hyperP.labels_size),
                             dtype=infer_ndtype,
                             order='C')),  # input: 1 * input
                np.ascontiguousarray(
                    np.zeros((actual_batch_size,
                              2 * self.hyperP.decoder_hidden_size),
                             dtype=infer_ndtype,
                             order='C')),  # hidden: layers * hidden
                np.ascontiguousarray(
                    np.zeros((actual_batch_size,
                              2 * self.hyperP.decoder_hidden_size),
                             dtype=infer_ndtype,
                             order='C')),  # cell:  layers * hidden
            ]

            # run the encoder ----------
            #

            #  outputs[0] - ( BS, max_seq_length // 2, enc_hidden_size=1024 )
            enc_outputs = self.encoder([self.batch_inputs[:actual_batch_size]],
                                       actual_batch_size)
            self.encoder.stream.synchronize()
            enc_outputs[0] = enc_outputs[0].reshape(
                (actual_batch_size, self.hyperP.max_seq_length // 2,
                 self.hyperP.encoder_hidden_size))
            # logging.info(" greedy::enc_output shape {:} type {:}".format(enc_outputs[0].shape, enc_outputs[0].dtype ))
            # logging.info(" greedy::enc_output data\n{:}".format(enc_outputs[0]))

            # run the decoder-joint greedy loop ----------
            #

            for seq_id in range(self.hyperP.max_seq_length // 2):
                # enc_input_seq = np.ascontiguousarray(enc_outputs[0][:,seq_id,:])
                # enc_input_seq = enc_input_seq.reshape (actual_batch_size, 1,  self.hyperP.encoder_hidden_size)
                enc_input_seq = np.ascontiguousarray(
                    np.zeros((actual_batch_size, 1,
                              self.hyperP.encoder_hidden_size),
                             dtype=infer_ndtype,
                             order='C'))
                for bs_index in range(actual_batch_size):
                    enc_input_seq[bs_index, :, :] = enc_outputs[0][
                        bs_index, enc_ptr[seq_id], :]
                # logging.info(" greedy::enc_output seq[{}] shape {:} data\n{:}".format(seq_id, enc_input_seq.shape, enc_input_seq))

                # run decoder/predictor (transfer data first)
                [
                    cuda.memcpy_htod_async(d_input, inp, self.decoder.stream)
                    for (d_input,
                         inp) in zip(self.decoder.d_inputs, dec_host_inputs)
                ]
                # self.debug_input_info(self.d_inputs, inputs, "_run_decoder")
                dec_dev_outputs = self.decoder.decoder_step(actual_batch_size)

                # transfer decoding state to host
                cuda.memcpy_dtoh_async(host_outputs[1], dec_dev_outputs[1],
                                       self.decoder.stream)
                cuda.memcpy_dtoh_async(host_outputs[2], dec_dev_outputs[2],
                                       self.decoder.stream)

                # transfer data for joint
                # logging.info("tensor: shape = {:} -- {} :\n{}".format(enc_input_seq.shape, enc_input_seq.dtype, enc_input_seq))
                cuda.memcpy_htod_async(self.joint.d_inputs[0], enc_input_seq,
                                       self.joint.stream)  # encoder port
                self.decoder.stream.synchronize()
                cuda.memcpy_dtod_async(
                    self.joint.d_inputs[1], dec_dev_outputs[0],
                    actual_batch_size * self.hyperP.decoder_hidden_size *
                    dtype_esize, self.joint.stream)  # predictor

                # run joint
                self.joint.joint_step(actual_batch_size)

                # Transfer result to CPU for greedy decoder
                cuda.memcpy_dtoh_async(host_outputs[0],
                                       self.joint.outputs[0].device,
                                       self.joint.stream)
                self.joint.stream.synchronize()

                # greedy decoder
                winner_symbol = np.argmax(host_outputs[0], axis=1)
                # logging.info("Joint outputs:\n{} ".format(host_outputs[0]))
                # logging.info("Winner_symbol:\n{} ".format(winner_symbol))
                for bs_index in range(actual_batch_size):
                    new_symbol = winner_symbol[bs_index]
                    if new_symbol != self.hyperP.labels_size - 1:
                        # symbol is not blank
                        dec_host_inputs[0][bs_index, 0] = winner_symbol[
                            bs_index]  # update predicted symbol
                        dec_host_inputs[1][bs_index, :] = host_outputs[1][
                            bs_index, :]  # update hidden state
                        dec_host_inputs[2][bs_index, :] = host_outputs[2][
                            bs_index, :]  # update cell state
                        out_sym[bs_index].append(winner_symbol[bs_index])
                        if ALWAYS_ADVANCE_TIME:
                            enc_ptr[bs_index] += 1
                        # logging.info("Adding symbol {} in bs_id {} ".format(winner_symbol[bs_index], bs_index))
                    else:
                        # advance the audio time pointer if the symbol is blank
                        enc_ptr[bs_index] += 1

            # Loop epilogue
            logging.info(
                "Batch {:d} (Size {:}) >> Inference time: {:f}".format(
                    batch_idx, actual_batch_size,
                    time.time() - start_time))
            batch_idx += 1
            # logging.info("   output sequences:\n{:}".format(out_sym))

        # Function epilogue
        pass
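A reduced sketch of the cross-stream hand-off used above: the producer stream is synchronized before a dependent device-to-device copy is enqueued on the consumer stream (assumes a pycuda.autoinit context; buffer sizes are illustrative):

import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

nbytes = 4096
decoder_stream = cuda.Stream()
joint_stream = cuda.Stream()
d_decoder_out = cuda.mem_alloc(nbytes)
d_joint_in = cuda.mem_alloc(nbytes)

# ... decoder kernels writing d_decoder_out are enqueued on decoder_stream ...
decoder_stream.synchronize()   # make the decoder output visible
cuda.memcpy_dtod_async(d_joint_in, d_decoder_out, nbytes, joint_stream)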
Example #22
File: batchtps.py Project: rll/lfd
 def reset_tps_params(self):
     """
     sets the tps params to be identity
     """
     for p in self.tps_params:
         drv.memcpy_dtod_async(p.gpudata, self.default_tps_params.gpudata, p.nbytes)
Example #23
File: cuda.py Project: fjarri/reikna
 def _memcpy_dtod(self, dest, src, nbytes, src_offset=0, dest_offset=0):
     cuda.memcpy_dtod_async(
         int(dest) + dest_offset,
         int(src) + src_offset,
         nbytes, stream=self._queue)
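A hedged illustration of the byte-offset arithmetic used by _memcpy_dtod above (assumes pycuda.autoinit and the default stream; sizes are illustrative):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

src = gpuarray.to_gpu(np.arange(16, dtype=np.float32))
dest = gpuarray.zeros(8, dtype=np.float32)

itemsize = src.dtype.itemsize
# Copy elements 4..11 of src into dest; offsets are expressed in bytes.
cuda.memcpy_dtod_async(int(dest.gpudata),
                       int(src.gpudata) + 4 * itemsize,
                       8 * itemsize)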
Example #24
    def execute(self):
        ndevs = len(self.op.device_ids)
        size = self.input_tensor.tensor.size
        dtype = self.input_tensor.dtype
        segment_size = int(size / ndevs)
        if ((segment_size * ndevs) < size):
            segment_size += 1

        # Align segment size to 16 bytes
        if (segment_size & 0x03):
            segment_size = (segment_size & (~0x03)) + 4

        # Determine GPU active mask based on segment size
        num_active = int(size / segment_size)
        if ((segment_size * num_active) < size):
            num_active += 1

        # Copy tensor to output buffer
        drv.memcpy_dtod(self.output_buff.gpudata,
                        self.input_tensor.tensor.gpudata,
                        size * dtype.itemsize)

        # Send each GPU its assigned segment
        device_idx = self.op.device_ids.index(self.device_id)
        for peer_idx, peer_id in enumerate(self.op.device_ids):
            if (peer_id == self.device_id):
                continue

            # Only send if peer is active
            if (peer_idx >= num_active):
                continue

            # Compute size and offset of this peer's segment
            peer_segment_size = segment_size
            peer_segment_offset = peer_idx * segment_size

            if (device_idx > peer_idx):
                peer_scratch_offset = segment_size * (device_idx - 1)
            else:
                peer_scratch_offset = segment_size * device_idx

            if ((peer_idx + 1) == num_active):
                peer_segment_size = size - peer_segment_offset

            # Enqueue peer to peer memcpy
            src = int(self.output_buff_dict.get(self.device_id)) + \
                peer_segment_offset * dtype.itemsize
            scratch = int(self.scratch_buff_dict.get(peer_id)) + \
                peer_scratch_offset * dtype.itemsize

            drv.memcpy_dtod_async(scratch, src,
                                  peer_segment_size * dtype.itemsize,
                                  self.stream)

        # Record event in stream
        self.event.record(self.stream)

        # Sync with other devices
        self.process_sync()

        # Wait for other GPUs events
        for peer_id in self.op.device_ids:
            if (peer_id == self.device_id):
                continue
            self.stream.wait_for_event(self.event_buff_dict[peer_id])

        segment_offset = device_idx * segment_size
        this_segment_size = segment_size
        if ((device_idx + 1) == num_active):
            this_segment_size = size - segment_offset

        src = int(self.output_buff_dict.get(self.device_id)) + \
            segment_offset * dtype.itemsize

        # Sum received peer segments
        block_size = 1024
        grid_size = int(this_segment_size / (block_size * ITEMS_PER_THREAD))
        if ((grid_size * block_size * ITEMS_PER_THREAD) < this_segment_size):
            grid_size += 1

        # Perform reduction operation
        if (device_idx < num_active):
            num_arrays = ndevs - 1
            params = [
                src, self.scratch_buff_dict[self.device_id],
                this_segment_size, num_arrays, segment_size
            ]
            grid_dim = (grid_size, 1, 1)
            block_dim = (block_size, 1, 1)
            kernel = _reduction_kernel(self.op.reduce_func)
            kernel.prepared_async_call(grid_dim, block_dim, self.stream,
                                       *params)

            # Send other GPUs this GPU's assigned segment
            for peer_id in self.op.device_ids:
                if (peer_id == self.device_id):
                    continue

                # Enqueue peer to peer memcpy
                dst = int(self.output_buff_dict.get(peer_id)) + \
                    segment_offset * dtype.itemsize
                drv.memcpy_dtod_async(dst, src,
                                      this_segment_size * dtype.itemsize,
                                      self.stream)

        self.event.record(self.stream)

        self.process_sync()

        # Wait for other GPUs events
        for peer_id in self.op.device_ids:
            if (peer_id == self.device_id):
                continue
            self.event_buff_dict[peer_id].synchronize()
        self.event.synchronize()

        drv.memcpy_dtod_async(self.tensor.tensor.gpudata,
                              self.output_buff.gpudata,
                              size * dtype.itemsize, self.stream)

        # This sync is only needed if we call this kernel 'synchronously'
        # if the assumption is that another kernel is called right after,
        # and uses the same streams as us, then we can remove this and
        # rely on the next kernel being put into our stream.

        # Record event in stream
        self.event.record(self.stream)

        # Sync with other devices
        self.process_sync()

        # Wait for other GPUs events
        for peer_id in self.op.device_ids:
            if (peer_id == self.device_id):
                continue
            self.event_buff_dict[peer_id].synchronize()
        self.event.synchronize()
Example #25
            if dst_strides[i] < dst_strides[i-1]:
                raise ValueError("src and dst must have same order")
            if (src_strides[i-1] * shape[i-1] == src_strides[i] and
                dst_strides[i-1] * shape[i-1] == dst_strides[i]):
                shape[i-1:i+1] = [shape[i-1] * shape[i]]
                del src_strides[i]
                del dst_strides[i]
                del axes[i]
            else:
                i += 1

    if len(shape) <= 1:
        if isinstance(src, GPUArray):
            if isinstance(dst, GPUArray):
                if async:
                    drv.memcpy_dtod_async(dst.gpudata, src.gpudata, src.nbytes, stream=stream)
                else:
                    drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
            else:
                # The arrays might be contiguous in the sense of
                # having no gaps, but the axes could be transposed
                # so that the order is neither Fortran nor C.
                # So, we attempt to get a contiguous view of dst.
                dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,))
                if async:
                    drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
                else:
                    drv.memcpy_dtoh(dst, src.gpudata)
        else:
            src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
            if async:
Example #26
            if dst_strides[i] < dst_strides[i-1]:
                raise ValueError("src and dst must have same order")
            if (src_strides[i-1] * shape[i-1] == src_strides[i] and
                dst_strides[i-1] * shape[i-1] == dst_strides[i]):
                shape[i-1:i+1] = [shape[i-1] * shape[i]]
                del src_strides[i]
                del dst_strides[i]
                del axes[i]
            else:
                i += 1

    if len(shape) <= 1:
        if isinstance(src, GPUArray):
            if isinstance(dst, GPUArray):
                if async:
                    drv.memcpy_dtod_async(dst.gpudata, src.gpudata, src.nbytes, stream=stream)
                else:
                    drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
            else:
                # The arrays might be contiguous in the sense of
                # having no gaps, but the axes could be transposed
                # so that the order is neither Fortran nor C.
                # So, we attempt to get a contiguous view of dst.
                dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,))
                if async:
                    drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
                else:
                    drv.memcpy_dtoh(dst, src.gpudata)
        else:
            src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
            if async:
Example #27
 def _copy_array_buffer(self, dest, src, nbytes, src_offset=0, dest_offset=0):
     cuda.memcpy_dtod_async(
         int(dest.gpudata) + dest_offset,
         int(src.gpudata) + src_offset,
         nbytes, stream=self._queue)
Example #28
File: batchtps.py Project: rll/lfd
 def set_tps_params(self, vals):
     for d, s in zip(self.tps_params, vals):
         drv.memcpy_dtod_async(d.gpudata, s.gpudata, d.nbytes)
Example #29
 def _memcpy_dtod(self, dest, src, nbytes, src_offset=0, dest_offset=0):
     cuda.memcpy_dtod_async(int(dest) + dest_offset,
                            int(src) + src_offset,
                            nbytes,
                            stream=self._queue)
Example #30
 def initialize_solver(self, b, wt_n):
     drv.memcpy_dtod_async(self.NHN_gpu.gpudata, self.NON_gpu[b].gpudata,
                           self.NHN_gpu.nbytes)
     self.WQN_gpu.set_async(wt_n[:, None] * self.QN)
Example #31
    def _run_decoder(self, inputs, seq_id, batch_size=1):
        (infer_ndtype, dtype_esize) = get_dtype_info(self.args.input_dtype)

        # self.debug_input_info(self.d_inputs, inputs, "_run_decoder")

        # Transfer input data to the GPU
        if seq_id == 0:
            # iter 0 needs the initial state
            hidden_tensor = np.ascontiguousarray(
                np.zeros((batch_size, 2 * self.hyperP.decoder_hidden_size),
                         dtype=infer_ndtype,
                         order='C'))  # layers * hidden
            cell_tensor = np.ascontiguousarray(
                np.zeros((batch_size, 2 * self.hyperP.decoder_hidden_size),
                         dtype=infer_ndtype,
                         order='C'))  # layers * hidden
            [
                cuda.memcpy_htod_async(d_input, inp, self.stream)
                for (d_input, inp) in zip(
                    self.d_inputs, [inputs[0], hidden_tensor, cell_tensor])
            ]
        else:
            # remaining iterations auto-recur the state
            cuda.memcpy_htod_async(self.d_inputs[0], inputs[0], self.stream)

        # Run inference.
        if self.engine.has_implicit_batch_dimension:
            self.context.execute_async(batch_size=batch_size,
                                       bindings=self.bindings,
                                       stream_handle=self.stream.handle)
        else:
            for inp_idx in range(3):
                input_shape = self.context.get_binding_shape(inp_idx)
                input_shape[0] = batch_size
                self.context.set_binding_shape(inp_idx, input_shape)
            self.context.execute_async_v2(bindings=self.bindings,
                                          stream_handle=self.stream.handle)

        if self.args.debug_mode:
            # Transfer all outputs back from the GPU.
            [
                cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
                for out in self.outputs
            ]
            # Synchronize the stream
            self.stream.synchronize()
            logging.info("_run_decoder: out[0] = {}".format(
                self.outputs[0].host))
            self.debug_input_info(self.d_inputs,
                                  [out.host for out in self.outputs],
                                  "_run_decoder")

        # [cuda.memcpy_dtod_async(d_input, out.device, self.stream) for (d_input, out) in zip(self.d_inputs, self.outputs)]
        hidden_size = self.hyperP.decoder_hidden_size
        input_size = self.hyperP.decoder_input_size

        # Update state for next iteration
        cuda.memcpy_dtod_async(self.d_inputs[1], self.outputs[1].device,
                               batch_size * 2 * hidden_size * dtype_esize,
                               self.stream)
        cuda.memcpy_dtod_async(self.d_inputs[2], self.outputs[2].device,
                               batch_size * 2 * hidden_size * dtype_esize,
                               self.stream)

        # Transfer output to host
        cuda.memcpy_dtoh_async(self.outputs[0].host, self.outputs[0].device,
                               self.stream)
        # logging.info("_run_decoder: out[0] = {}".format(self.outputs[0].host))

        # Synchronize the stream
        self.stream.synchronize()

        # return the 'symbol' host output
        return self.outputs[0].host
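A compact sketch of the state-recurrence idea behind the two copies above: the hidden and cell state stay on the device between decoder steps instead of round-tripping through the host (assumes pycuda.autoinit; dimensions and dtype size are illustrative):

import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

batch_size, hidden_size, dtype_esize = 8, 320, 2   # e.g. fp16 state
state_bytes = batch_size * 2 * hidden_size * dtype_esize

stream = cuda.Stream()
d_state_in = cuda.mem_alloc(state_bytes)
d_state_out = cuda.mem_alloc(state_bytes)

for step in range(4):
    # ... run one decoder step that writes its new state into d_state_out ...
    cuda.memcpy_dtod_async(d_state_in, d_state_out, state_bytes, stream)
stream.synchronize()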