def main(args):
    """Device-memory read/write self-test.

    Writes a test string into a device BO, syncs it to the device, reads it
    back, and verifies the round trip.  Returns 1 on any failure path;
    prints "PASSED TEST" on success.  Exits the process (status 1) if an
    unexpected exception occurs.
    """
    opt = Options()
    Options.getOptions(opt, args)
    try:
        if initXRT(opt):
            return 1
        if opt.first_mem < 0:
            return 1
        boHandle1 = xclAllocBO(opt.handle, opt.DATA_SIZE,
                               xclBOKind.XCL_BO_DEVICE_RAM, opt.first_mem)
        ffi = FFI()
        bo1 = xclMapBO(opt.handle, boHandle1, True)
        testVector = "hello\nthis is Xilinx OpenCL memory read write test\n:-)\n"
        bo1_p = ffi.cast("FILE *", bo1)
        ffi.memmove(bo1_p, testVector, len(testVector))
        bo1_buf = ffi.buffer(bo1_p, len(testVector))
        print("buffer from device: ", bo1_buf[:])
        if xclSyncBO(opt.handle, boHandle1,
                     xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE,
                     opt.DATA_SIZE, 0):
            return 1
        p = xclBOProperties()
        # xclGetBOProperties returns 0 on success; -1 marks failure here.
        bo1devAddr = p.paddr if not (xclGetBOProperties(
            opt.handle, boHandle1, p)) else -1
        # BUGFIX: was `bo1devAddr is -1` — identity comparison with an int
        # literal is implementation-defined; use equality.
        if bo1devAddr == -1:
            return 1
        # Read the buffer back from the device (flag normalized from False to 0
        # for consistency with the TO_DEVICE call above).
        if xclSyncBO(opt.handle, boHandle1,
                     xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE,
                     opt.DATA_SIZE, 0):
            return 1
        bo2 = xclMapBO(opt.handle, boHandle1, False)
        bo2_p = ffi.cast("FILE *", bo2)
        bo2_buf = ffi.buffer(bo2_p, len(testVector))
        if bo1_buf[:] != bo2_buf[:]:
            print("FAILED TEST")
            print("Value read back does not match value written")
            return 1
    except Exception as exp:
        print("Exception: ")
        print(exp)  # prints the err
        # BUGFIX: was sys.exit() — exited with status 0 on an error path.
        sys.exit(1)
    print("PASSED TEST")
def test_memmove_readonly_readwrite(self):
    """memmove accepts read-only and writable Python buffers as source,
    but refuses a read-only object as destination."""
    ffi = FFI()
    dest = ffi.new("signed char[]", 5)
    # Source may be an immutable bytes object.
    ffi.memmove(dest, b"abcde", 3)
    assert list(dest) == [ord("a"), ord("b"), ord("c"), 0, 0]
    # Source may equally be a mutable bytearray.
    ffi.memmove(dest, bytearray(b"ABCDE"), 2)
    assert list(dest) == [ord("A"), ord("B"), ord("c"), 0, 0]
    # Writing into a read-only bytes object must raise.
    py.test.raises((TypeError, BufferError), ffi.memmove, b"abcde", dest, 3)
    # Destination may be a bytearray, called with keyword arguments.
    scratch = bytearray(b"xxxxx")
    ffi.memmove(dest=scratch, src=dest, n=3)
    assert scratch == bytearray(b"ABcxx")
def test_memmove(self):
    """Overlapping moves between cdata arrays behave like C memmove."""
    ffi = FFI()
    buf = ffi.new("short[]", [-1234, -2345, -3456, -4567, -5678])
    # Overlapping move, destination below source.
    ffi.memmove(buf, buf + 1, 4)
    assert list(buf) == [-2345, -3456, -3456, -4567, -5678]
    buf[2] = 999
    # Overlapping move, destination above source.
    ffi.memmove(buf + 2, buf, 6)
    assert list(buf) == [-2345, -3456, -2345, -3456, 999]
    # Raw two-byte copy lands according to the host byte order.
    ffi.memmove(buf + 4, ffi.new("char[]", b"\x71\x72"), 2)
    tail = 0x7271 if sys.byteorder == 'little' else 0x7172
    assert list(buf) == [-2345, -3456, -2345, -3456, tail]
def test_memmove_buffer(self):
    """memmove interoperates with array.array buffers in both directions."""
    import array
    ffi = FFI()
    src_arr = array.array('H', [10000, 20000, 30000])
    shorts = ffi.new("short[]", 5)
    # array -> cdata, at offset 0 and offset 1.
    ffi.memmove(shorts, src_arr, 6)
    assert list(shorts) == [10000, 20000, 30000, 0, 0]
    ffi.memmove(shorts + 1, src_arr, 6)
    assert list(shorts) == [10000, 10000, 20000, 30000, 0]
    # array -> array: only the first 4 bytes are overwritten.
    dst_arr = array.array('h', [-1000, -2000, -3000])
    ffi.memmove(dst_arr, src_arr, 4)
    assert dst_arr.tolist() == [10000, 20000, -3000]
    assert src_arr.tolist() == [10000, 20000, 30000]
    # cdata -> array, from the start and from an offset.
    shorts[0] = 999
    shorts[1] = 998
    shorts[2] = 997
    shorts[3] = 996
    shorts[4] = 995
    ffi.memmove(dst_arr, shorts, 2)
    assert dst_arr.tolist() == [999, 20000, -3000]
    ffi.memmove(dst_arr, shorts + 2, 4)
    assert dst_arr.tolist() == [997, 996, -3000]
    # Negative values survive the byte-level copy.
    shorts[2] = -shorts[2]
    shorts[3] = -shorts[3]
    ffi.memmove(dst_arr, shorts + 2, 6)
    assert dst_arr.tolist() == [-997, -996, 995]
def runKernel(opt):
    """Configure the FPGA via an ERT configure command, launch the "hello"
    CU once, and print the string the kernel wrote into the output BO.

    Returns 0 on success, 1 on any failure.
    """
    ffi = FFI()  # create the FFI obj
    boHandle = xclAllocBO(opt.handle, opt.DATA_SIZE,
                          xclBOKind.XCL_BO_DEVICE_RAM, opt.first_mem)
    bo1 = xclMapBO(opt.handle, boHandle, True)
    read_fp = ffi.cast("FILE *", bo1)
    if xclSyncBO(opt.handle, boHandle,
                 xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE,
                 opt.DATA_SIZE, 0):
        return 1

    p = xclBOProperties()
    # xclGetBOProperties returns 0 on success; -1 marks failure here.
    bodevAddr = p.paddr if not (xclGetBOProperties(opt.handle, boHandle,
                                                   p)) else -1
    # BUGFIX: was `bodevAddr is -1` (identity test with an int literal).
    if bodevAddr == -1:
        return 1

    # Allocate the exec_bo
    execHandle = xclAllocBO(opt.handle, opt.DATA_SIZE,
                            xclBOKind.XCL_BO_SHARED_VIRTUAL, (1 << 31))
    execData = xclMapBO(opt.handle, execHandle, True)  # returns mmap()
    c_f = ffi.cast("FILE *", execData)
    # BUGFIX: was `execData is ffi.NULL` — cffi cdata must be compared with
    # ==, `is` is always False for a freshly returned pointer.
    if execData == ffi.NULL:
        print("execData is NULL")
    print("Construct the exe buf cmd to configure FPGA")

    ecmd = ert_configure_cmd()
    ecmd.m_uert.m_cmd_struct.state = 1  # ERT_CMD_STATE_NEW
    ecmd.m_uert.m_cmd_struct.opcode = 2  # ERT_CONFIGURE
    ecmd.slot_size = opt.DATA_SIZE
    ecmd.num_cus = 1
    ecmd.cu_shift = 16
    ecmd.cu_base_addr = opt.cu_base_addr
    ecmd.m_features.ert = opt.ert
    if opt.ert:
        ecmd.m_features.cu_dma = 1
        ecmd.m_features.cu_isr = 1
    # CU -> base address mapping
    ecmd.data[0] = opt.cu_base_addr
    ecmd.m_uert.m_cmd_struct.count = 5 + ecmd.num_cus

    sz = sizeof(ert_configure_cmd)
    ffi.memmove(c_f, ecmd, sz)
    print("Send the exec command and configure FPGA (ERT)")

    # Send the command.
    ret = xclExecBuf(opt.handle, execHandle)
    if ret:
        print("Unable to issue xclExecBuf")
        return 1

    print("Wait until the command finish")
    while xclExecWait(opt.handle, 1000) != 0:
        print(".")

    print("Construct the exec command to run the kernel on FPGA")
    # construct the exec buffer cmd to start the kernel
    start_cmd = ert_start_kernel_cmd()
    # BUGFIX: all three divisions below were `/`, which yields a float on
    # Python 3 and raises TypeError when used as a ctypes array size/index;
    # use integer division.
    rsz = (XHELLO_HELLO_CONTROL_ADDR_ACCESS1_DATA // 4 + 1) + 1  # regmap array size
    new_data = ((start_cmd.data._type_) * rsz)()
    start_cmd.m_uert.m_start_cmd_struct.state = 1  # ERT_CMD_STATE_NEW
    start_cmd.m_uert.m_start_cmd_struct.opcode = 0  # ERT_START_CU
    start_cmd.m_uert.m_start_cmd_struct.count = 1 + rsz
    start_cmd.cu_mask = 0x1

    new_data[XHELLO_HELLO_CONTROL_ADDR_AP_CTRL] = 0x0
    # 64-bit BO device address split across two 32-bit registers.
    new_data[XHELLO_HELLO_CONTROL_ADDR_ACCESS1_DATA // 4] = bodevAddr
    new_data[XHELLO_HELLO_CONTROL_ADDR_ACCESS1_DATA // 4 + 1] = \
        (bodevAddr >> 32) & 0xFFFFFFFF

    # Copy the command header, then append the register map after it.
    ffi.memmove(c_f, start_cmd, 2 * sizeof(c_uint32))
    tmp_buf = ffi.buffer(
        c_f, 2 * sizeof(c_uint32) + (len(new_data) * sizeof(c_uint32)))
    data_ptr = ffi.from_buffer(tmp_buf)
    ffi.memmove(data_ptr + 2 * sizeof(c_uint32), new_data,
                len(new_data) * sizeof(c_uint32))

    if xclExecBuf(opt.handle, execHandle):
        print("Unable to issue xclExecBuf")
        return 1
    else:
        print("Kernel start command issued through xclExecBuf : start_kernel")
        print("Now wait until the kernel finish")

    print("Wait until the command finish")
    while xclExecWait(opt.handle, 1) != 0:
        print(".")

    # get the output xclSyncBO
    print("Get the output data from the device")
    if xclSyncBO(opt.handle, boHandle,
                 xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE,
                 opt.DATA_SIZE, 0):
        return 1

    rd_buf = ffi.buffer(read_fp, len("Hello World"))
    print("RESULT: ")
    # BUGFIX: was `print(rd_buf[:] + "\n")` — rd_buf[:] is bytes on
    # Python 3 and cannot be concatenated with str.
    print(rd_buf[:])
    return 0
class cv2pynq():
    """Hardware-accelerated OpenCV-style image filters on PYNQ.

    Loads the cv2pynq overlay, owns the AXI DMA channels and a set of
    contiguous-memory (CMA) staging buffers, and exposes Sobel/Scharr/
    Laplacian/blur/GaussianBlur/erode/dilate/Canny entry points that stream
    frames through the FPGA filter IPs.
    """
    # Maximum frame geometry supported by the overlay.
    MAX_WIDTH = 1920
    MAX_HEIGHT = 1080

    def __init__(self, load_overlay=True):
        # Download the bitstream and reset the overlay before use.
        self.bitstream_name = None
        self.bitstream_name = "cv2pynq03.bit"
        self.bitstream_path = os.path.join(CV2PYNQ_BIT_DIR, self.bitstream_name)
        self.ol = Overlay(self.bitstream_path)
        self.ol.download()
        self.ol.reset()
        self.xlnk = Xlnk()
        self.partitions = 10  #split the cma into partitions for pipelined transfer
        self.cmaPartitionLen = self.MAX_HEIGHT * self.MAX_WIDTH / self.partitions
        self.listOfcma = [
            self.xlnk.cma_array(shape=(int(self.MAX_HEIGHT / self.partitions),
                                       self.MAX_WIDTH),
                                dtype=np.uint8) for i in range(self.partitions)
        ]
        self.img_filters = self.ol.image_filters
        self.dmaOut = self.img_filters.axi_dma_0.sendchannel
        self.dmaIn = self.img_filters.axi_dma_0.recvchannel
        # Restart both DMA channels to reach a known-idle state.
        self.dmaOut.stop()
        self.dmaIn.stop()
        self.dmaIn.start()
        self.dmaOut.start()
        # Cached "currently loaded coefficients" markers; -1 means none.
        self.filter2DType = -1  # filter types: SobelX=0, SobelY=1, ScharrX=2, ScharrY=3, Laplacian1=4, Laplacian3=5
        self.filter2D_5Type = -1  # filter types: SobelX=0, SobelY=1, Laplacian5=4
        self.filter2DfType = -1  # filter types: blur=0, GaussianBlur=1
        self.ffi = FFI()
        self.f2D = self.img_filters.filter2D_hls_0
        self.f2D.reset()
        self.f2D_5 = self.img_filters.filter2D_hls_5_0
        self.f2D_5.reset()
        self.f2D_f = self.img_filters.filter2D_f_0
        self.f2D_f.reset()
        self.erodeIP = self.img_filters.erode_hls_0
        self.erodeIP.reset()
        self.dilateIP = self.img_filters.dilate_hls_0
        self.dilateIP.reset()
        # CMA staging buffers, each wrapped in a view whose nbytes can be
        # overridden so partial transfers are possible.
        self.cmaBuffer_0 = self.xlnk.cma_array(shape=(self.MAX_HEIGHT,
                                                      self.MAX_WIDTH),
                                               dtype=np.uint8)
        self.cmaBuffer0 = self.cmaBuffer_0.view(self.ContiguousArrayCv2pynq)
        self.cmaBuffer0.init(self.cmaBuffer_0)
        self.cmaBuffer_1 = self.xlnk.cma_array(shape=(self.MAX_HEIGHT,
                                                      self.MAX_WIDTH),
                                               dtype=np.uint8)
        self.cmaBuffer1 = self.cmaBuffer_1.view(self.ContiguousArrayCv2pynq)
        self.cmaBuffer1.init(self.cmaBuffer_1)
        self.cmaBuffer_2 = self.xlnk.cma_array(
            shape=(self.MAX_HEIGHT * 4, self.MAX_WIDTH),
            dtype=np.uint8)  # *4 for CornerHarris return
        self.cmaBuffer2 = self.cmaBuffer_2.view(self.ContiguousArrayCv2pynq)
        self.cmaBuffer2.init(self.cmaBuffer_2)
        self.CannyIP = self.img_filters.canny_edge_0
        self.CannyIP.reset()
        #self.cornerHarrisIP = self.img_filters.CornerHarris_hls_0
        #self.cornerHarrisIP.reset()

    def close(self):
        """Release all CMA buffers.  DMA channels are intentionally left
        running (stop calls are commented out)."""
        #self.dmaOut.stop()
        #self.dmaIn.stop()
        self.cmaBuffer_0.close()
        self.cmaBuffer_1.close()
        self.cmaBuffer_2.close()
        for cma in self.listOfcma:
            cma.close()

    def Sobel(self, src, ddepth, dx, dy, dst, ksize):
        """Sobel edge filter, ksize 3 (filter2D_hls IP) or 5 (filter2D_hls_5
        IP).  Coefficients are only rewritten when the cached type changes.
        Raises RuntimeError for unsupported (dx, dy) combinations."""
        if (ksize == 3):
            self.f2D.rows = src.shape[0]
            self.f2D.columns = src.shape[1]
            self.f2D.channels = 1
            if (dx == 1) and (dy == 0):
                if self.filter2DType != 0:
                    self.filter2DType = 0
                    # Each rN register packs one 3-tap row as signed bytes.
                    self.f2D.r1 = 0x000100ff  #[-1  0  1]
                    self.f2D.r2 = 0x000200fe  #[-2  0  2]
                    self.f2D.r3 = 0x000100ff  #[-1  0  1]
            elif (dx == 0) and (dy == 1):
                if self.filter2DType != 1:
                    self.filter2DType = 1
                    self.f2D.r1 = 0x00fffeff  #[-1 -2 -1]
                    self.f2D.r2 = 0x00000000  #[ 0  0  0]
                    self.f2D.r3 = 0x00010201  #[ 1  2  1]
            else:
                raise RuntimeError("Incorrect dx dy configuration")
            self.img_filters.select_filter(1)
            self.f2D.start()
            return self.filter2D(src, dst)
        else:  #ksize == 5
            self.f2D_5.rows = src.shape[0]
            self.f2D_5.columns = src.shape[1]
            if (dx == 1) and (dy == 0):
                if self.filter2D_5Type != 0:
                    self.filter2D_5Type = 0
                    self.f2D_5.par_V = bytes([ \
                        #-1, -2, 0, 2, 1,
                        0xff, 0xfe, 0x00, 0x02, 0x01, \
                        #-4, -8, 0, 8, 4,
                        0xfc, 0xf8, 0x00, 0x08, 0x04, \
                        #-6, -12, 0, 12, 6,
                        0xfa, 0xf4, 0x00, 0x0c, 0x06, \
                        #-4, -8, 0, 8, 4,
                        0xfc, 0xf8, 0x00, 0x08, 0x04, \
                        #-1, -2, 0, 2, 1,
                        0xff, 0xfe, 0x00, 0x02, 0x01, \
                        0,0,0])  #fill up to allign with 4
            elif (dx == 0) and (dy == 1):
                if self.filter2D_5Type != 1:
                    self.filter2D_5Type = 1
                    self.f2D_5.par_V = bytes([ \
                        #-1, -4, -6, -4, -1,
                        0xff, 0xfc, 0xfa, 0xfc, 0xff, \
                        #-2, -8, -12, -8, -2,
                        0xfe, 0xf8, 0xf4, 0xf8, 0xfe, \
                        # 0, 0, 0, 0, 0,
                        0x00, 0x00, 0x00, 0x00, 0x00, \
                        # 2, 8, 12, 8, 2,
                        0x02, 0x08, 0x0c, 0x08, 0x02, \
                        # 1, 4, 6, 4, 1,
                        0x01, 0x04, 0x06, 0x04, 0x01, \
                        0,0,0])  #fill up to allign with 4
            else:
                raise RuntimeError("Incorrect dx dy configuration")
            self.img_filters.select_filter(5)
            self.f2D_5.start()
            return self.filter2D(src, dst)

    def Scharr(self, src, ddepth, dx, dy, dst):
        """Scharr 3x3 edge filter via the filter2D_hls IP.
        Raises RuntimeError for unsupported (dx, dy) combinations."""
        self.f2D.rows = src.shape[0]
        self.f2D.columns = src.shape[1]
        self.f2D.channels = 1
        if (dx == 1) and (dy == 0):
            if self.filter2DType != 2:
                self.filter2DType = 2
                self.f2D.r1 = 0x000300fd  #[-3   0  3]
                self.f2D.r2 = 0x000a00f6  #[-10  0 10]
                self.f2D.r3 = 0x000300fd  #[-3   0  3]
        elif (dx == 0) and (dy == 1):
            if self.filter2DType != 3:
                self.filter2DType = 3
                self.f2D.r1 = 0x00fdf6fd  #[-3 -10 -3]
                self.f2D.r2 = 0x00000000  #[ 0   0  0]
                self.f2D.r3 = 0x00030a03  #[ 3  10  3]
        else:
            raise RuntimeError("Incorrect dx dy configuration")
        self.img_filters.select_filter(1)
        self.f2D.start()
        return self.filter2D(src, dst)

    def Laplacian(self, src, ddepth, dst, ksize):
        """Laplacian filter; ksize 5 uses the 5x5 IP, ksize 1 or 3 the 3x3 IP."""
        if ksize == 5:
            self.f2D_5.rows = src.shape[0]
            self.f2D_5.columns = src.shape[1]
            if self.filter2D_5Type != 4:
                self.filter2D_5Type = 4  # "Laplacian_5"
                self.f2D_5.par_V = bytes([ \
                    #2, 4, 4, 4, 2,
                    0x02, 0x04, 0x04, 0x04, 0x02, \
                    #4, 0, -8, 0, 4,
                    0x04, 0x00, 0xf8, 0x00, 0x04, \
                    #4, -8, -24, -8, 4,
                    0x04, 0xf8, 0xe8, 0xf8, 0x04, \
                    #4, 0, -8, 0, 4,
                    0x04, 0x00, 0xf8, 0x00, 0x04, \
                    #2, 4, 4, 4, 2,
                    0x02, 0x04, 0x04, 0x04, 0x02, \
                    0,0,0])  #fill up to allign with 4
            self.img_filters.select_filter(5)
            self.f2D_5.start()
            return self.filter2D(src, dst)
        else:  #ksize 1 or 3
            self.f2D.rows = src.shape[0]
            self.f2D.columns = src.shape[1]
            self.f2D.channels = 1
            if ksize == 1:
                if (self.filter2DType != 4):
                    self.filter2DType = 4  # "Laplacian_1"
                    self.f2D.r1 = 0x00000100  #[ 0  1  0]
                    self.f2D.r2 = 0x0001fc01  #[ 1 -4  1]
                    self.f2D.r3 = 0x00000100  #[ 0  1  0]
            elif ksize == 3:
                if (self.filter2DType != 5):
                    self.filter2DType = 5  # "Laplacian_3"
                    self.f2D.r1 = 0x00020002  #[ 2  0  2]
                    self.f2D.r2 = 0x0000f800  #[ 0 -8  0]
                    self.f2D.r3 = 0x00020002  #[ 2  0  2]
            self.img_filters.select_filter(1)
            self.f2D.start()
            return self.filter2D(src, dst)

    def blur(self, src, ksize, dst):
        """3x3 mean blur using the floating/fixed-point filter2D_f IP."""
        self.f2D_f.rows = src.shape[0]
        self.f2D_f.columns = src.shape[1]
        if (self.filter2DfType != 0):
            self.filter2DfType = 0  #blur
            # All nine taps are 1/9, converted to the IP's fixed-point format.
            mean = self.floatToFixed(1 / 9, cv2pynqDriverFilter2D_f.K_FP_W,
                                     cv2pynqDriverFilter2D_f.K_FP_F)
            self.f2D_f.r11 = mean
            self.f2D_f.r12 = mean
            self.f2D_f.r13 = mean
            self.f2D_f.r21 = mean
            self.f2D_f.r22 = mean
            self.f2D_f.r23 = mean
            self.f2D_f.r31 = mean
            self.f2D_f.r32 = mean
            self.f2D_f.r33 = mean
        self.img_filters.select_filter(2)
        self.f2D_f.start()
        return self.filter2D(src, dst)

    def GaussianBlur(self, src, ksize, sigmaX, sigmaY, dst):
        """3x3 Gaussian blur; the separable kernel is computed with OpenCV
        and written as a full 3x3 fixed-point outer product.

        NOTE(review): sigma defaults and the kernel are only recomputed when
        the cached filter type changes — a later call with different sigmas
        reuses the old coefficients; confirm this is intended.
        """
        self.f2D_f.rows = src.shape[0]
        self.f2D_f.columns = src.shape[1]
        if (self.filter2DfType != 1):
            self.filter2DfType = 1  #GaussianBlur
            if (sigmaX <= 0):
                # OpenCV's default sigma formula for a given kernel size.
                sigmaX = 0.3 * ((ksize[0] - 1) * 0.5 - 1) + 0.8
            if (sigmaY <= 0):
                sigmaY = sigmaX
            kX = cv2.getGaussianKernel(3, sigmaX, ktype=cv2.CV_32F)  #kernel X
            kY = cv2.getGaussianKernel(3, sigmaY, ktype=cv2.CV_32F)  #kernel Y
            self.f2D_f.r11 = self.floatToFixed(kY[0] * kX[0],
                                               cv2pynqDriverFilter2D_f.K_FP_W,
                                               cv2pynqDriverFilter2D_f.K_FP_F)
            self.f2D_f.r12 = self.floatToFixed(kY[0] * kX[1],
                                               cv2pynqDriverFilter2D_f.K_FP_W,
                                               cv2pynqDriverFilter2D_f.K_FP_F)
            self.f2D_f.r13 = self.floatToFixed(kY[0] * kX[2],
                                               cv2pynqDriverFilter2D_f.K_FP_W,
                                               cv2pynqDriverFilter2D_f.K_FP_F)
            self.f2D_f.r21 = self.floatToFixed(kY[1] * kX[0],
                                               cv2pynqDriverFilter2D_f.K_FP_W,
                                               cv2pynqDriverFilter2D_f.K_FP_F)
            self.f2D_f.r22 = self.floatToFixed(kY[1] * kX[1],
                                               cv2pynqDriverFilter2D_f.K_FP_W,
                                               cv2pynqDriverFilter2D_f.K_FP_F)
            self.f2D_f.r23 = self.floatToFixed(kY[1] * kX[2],
                                               cv2pynqDriverFilter2D_f.K_FP_W,
                                               cv2pynqDriverFilter2D_f.K_FP_F)
            self.f2D_f.r31 = self.floatToFixed(kY[2] * kX[0],
                                               cv2pynqDriverFilter2D_f.K_FP_W,
                                               cv2pynqDriverFilter2D_f.K_FP_F)
            self.f2D_f.r32 = self.floatToFixed(kY[2] * kX[1],
                                               cv2pynqDriverFilter2D_f.K_FP_W,
                                               cv2pynqDriverFilter2D_f.K_FP_F)
            self.f2D_f.r33 = self.floatToFixed(kY[2] * kX[2],
                                               cv2pynqDriverFilter2D_f.K_FP_W,
                                               cv2pynqDriverFilter2D_f.K_FP_F)
        self.img_filters.select_filter(2)
        self.f2D_f.start()
        return self.filter2D(src, dst)

    def erode(self, src, kernel, dst, iterations, mode):
        """Erode via the erode IP; delegates to erodeDilateKernel."""
        self.img_filters.select_filter(3)
        return self.erodeDilateKernel(src, kernel, dst, iterations, mode,
                                      self.erodeIP)

    def dilate(self, src, kernel, dst, iterations, mode):
        """Dilate via the dilate IP; delegates to erodeDilateKernel."""
        self.img_filters.select_filter(4)
        return self.erodeDilateKernel(src, kernel, dst, iterations, mode,
                                      self.dilateIP)

    def Canny(self, src, threshold1, threshold2, dst):
        """Canny edge detection.  Fast path when both src and dst are
        physically contiguous; otherwise stages through the CMA buffers."""
        self.img_filters.select_filter(0)
        self.CannyIP.rows = src.shape[0]
        self.CannyIP.columns = src.shape[1]
        self.CannyIP.threshold1 = threshold1
        self.CannyIP.threshold2 = threshold2
        self.CannyIP.start()
        if hasattr(src, 'physical_address') and hasattr(
                dst, 'physical_address'):
            self.dmaIn.transfer(dst)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
            return dst
        self.cmaBuffer1.nbytes = src.nbytes
        self.dmaIn.transfer(self.cmaBuffer1)
        if hasattr(src, 'physical_address'):
            self.dmaOut.transfer(src)
        else:
            # src lives in ordinary memory: stage it into a CMA buffer first.
            self.cmaBuffer0.nbytes = src.nbytes
            self.copyNto(self.cmaBuffer0, src, src.nbytes)
            self.dmaOut.transfer(self.cmaBuffer0)
        self.dmaIn.wait()
        ret = np.ndarray(src.shape, src.dtype)
        self.copyNto(ret, self.cmaBuffer1, ret.nbytes)
        return ret

    def filter2D(self, src, dst):
        """Stream src through the currently selected/configured filter IP.
        Non-contiguous sources are chunked and double-buffered (cmaBuffer0/2)
        so the memcpy overlaps the hardware transfer."""
        if dst is None:
            self.cmaBuffer1.nbytes = src.nbytes
        elif hasattr(src, 'physical_address') and hasattr(
                dst, 'physical_address'):
            self.dmaIn.transfer(dst)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
            return dst
        if hasattr(src, 'physical_address'):
            self.dmaIn.transfer(self.cmaBuffer1)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
        else:  #pipeline the copy to contiguous memory and filter calculation in hardware
            # Partition count tuned empirically by frame size.
            if src.nbytes < 184800:  #440x420
                self.partitions = 1
            elif src.nbytes < 180000:  #600x300
                self.partitions = 2
            elif src.nbytes < 231200:  #680x340
                self.partitions = 4
            else:
                self.partitions = 8
            self.cmaBuffer1.nbytes = src.nbytes
            self.dmaIn.transfer(self.cmaBuffer1)
            chunks_len = int(src.nbytes / (self.partitions))
            self.cmaBuffer0.nbytes = chunks_len
            self.cmaBuffer2.nbytes = chunks_len
            #self.copyNto(src,self.cmaBuffer0,chunks_len)
            self.copyNto(self.cmaBuffer0, src, chunks_len)
            for i in range(1, self.partitions):
                if i % 2 == 1:
                    # Send buffer0 while staging the next chunk into buffer2.
                    while not self.dmaOut.idle and not self.dmaOut._first_transfer:
                        pass
                    self.dmaOut.transfer(self.cmaBuffer0)
                    #self.copyNtoOff(src ,self.cmaBuffer2,chunks_len, i*chunks_len, 0)
                    self.copyNtoOff(self.cmaBuffer2, src, chunks_len, 0,
                                    i * chunks_len)
                else:
                    # Send buffer2 while staging the next chunk into buffer0.
                    while not self.dmaOut.idle and not self.dmaOut._first_transfer:
                        pass
                    self.dmaOut.transfer(self.cmaBuffer2)
                    #self.copyNtoOff(src ,self.cmaBuffer0,chunks_len, i*chunks_len, 0)
                    self.copyNtoOff(self.cmaBuffer0, src, chunks_len, 0,
                                    i * chunks_len)
            while not self.dmaOut.idle and not self.dmaOut._first_transfer:
                pass
            self.dmaOut.transfer(self.cmaBuffer2)
            rest = src.nbytes % self.partitions
            if rest > 0:  #cleanup any remaining data and send it to HW
                #self.copyNtoOff(src ,self.cmaBuffer0,chunks_len, self.partitions*chunks_len, 0)
                self.copyNtoOff(self.cmaBuffer0, src, chunks_len, 0,
                                self.partitions * chunks_len)
                while not self.dmaOut.idle and not self.dmaOut._first_transfer:
                    pass
                self.dmaOut.transfer(self.cmaBuffer0)
                rest -= chunks_len
            self.dmaIn.wait()
        ret = np.ndarray(src.shape, src.dtype)
        self.copyNto(ret, self.cmaBuffer1, ret.nbytes)
        return ret

    def floatToFixed(self, f, total_bits, fract_bits):
        """convert float f to a signed fixed point with #total_bits and #frac_bits after the point"""
        fix = int((abs(f) * (1 << fract_bits)))
        if (f < 0):
            # Set the sign bit (two's-complement style encoding).
            fix += 1 << total_bits - 1
        return fix

    def erodeDilateKernel(self, src, kernel, dst, iterations, mode, filter):
        """Run the erode/dilate IP `iterations` times, ping-ponging between
        cmaBuffer1 and cmaBuffer2 so each pass feeds the next."""
        filter.mode = mode
        filter.rows = src.shape[0]
        filter.columns = src.shape[1]
        if hasattr(src, 'physical_address') and hasattr(
                dst, 'physical_address'):
            filter.start()
            if iterations > 1:
                self.dmaIn.transfer(self.cmaBuffer1)
            else:
                self.dmaIn.transfer(dst)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
            self.cmaBuffer2.nbytes = src.nbytes
            #buffer = self.xlnk.cma_array(src.shape, dtype=np.uint8)
            for i in range(2, iterations + 1):
                filter.start()
                if i % 2 == 0:
                    self.dmaIn.transfer(self.cmaBuffer2)
                    if i != iterations:  #avoid copy after last iteration
                        self.dmaOut.transfer(self.cmaBuffer1)
                    else:
                        self.dmaOut.transfer(dst)
                else:
                    self.dmaIn.transfer(self.cmaBuffer1)
                    if i != iterations:
                        self.dmaOut.transfer(self.cmaBuffer2)
                    else:
                        self.dmaOut.transfer(dst)
                self.dmaIn.wait()
            return dst
        self.cmaBuffer0.nbytes = src.nbytes
        self.cmaBuffer1.nbytes = src.nbytes
        filter.start()
        self.dmaIn.transfer(self.cmaBuffer1)
        if hasattr(src, 'physical_address'):
            self.dmaOut.transfer(src)
        else:
            self.copyNto(self.cmaBuffer0, src, src.nbytes)  #np.copyto(srcBuffer,src)
            self.dmaOut.transfer(self.cmaBuffer0)
        self.dmaIn.wait()
        self.cmaBuffer2.nbytes = src.nbytes
        #buffer = self.xlnk.cma_array(src.shape, dtype=np.uint8)
        for i in range(2, iterations + 1):
            filter.start()
            if i % 2 == 0:
                self.dmaIn.transfer(self.cmaBuffer2)
                self.dmaOut.transfer(self.cmaBuffer1)
            else:
                self.dmaIn.transfer(self.cmaBuffer1)
                self.dmaOut.transfer(self.cmaBuffer2)
            self.dmaIn.wait()
        ret = np.ndarray(src.shape, src.dtype)
        # Result ends up in buffer1 after an odd number of passes, buffer2
        # after an even number.
        if iterations % 2 == 1:
            self.copyNto(ret, self.cmaBuffer1, ret.nbytes)
        else:
            self.copyNto(ret, self.cmaBuffer2, ret.nbytes)
        return ret

    '''def cornerHarris(self, src, k, dst):
        self.img_filters.select_filter(5)
        self.cornerHarrisIP.rows = src.shape[0]
        self.cornerHarrisIP.columns = src.shape[1]
        self.cornerHarrisIP.start()
        if hasattr(src, 'physical_address') and hasattr(dst, 'physical_address') and (dst.nbytes == src.nbytes*4):
            self.dmaIn.transfer(dst)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
            return dst
        self.cmaBuffer2.nbytes = src.nbytes*4
        self.dmaIn.transfer(self.cmaBuffer2)
        if hasattr(src, 'physical_address') :
            self.dmaOut.transfer(src)
        else:
            self.cmaBuffer0.nbytes = src.nbytes
            self.copyNto(self.cmaBuffer0,src,src.nbytes)
            self.dmaOut.transfer(self.cmaBuffer0)
        self.dmaIn.wait()
        ret = np.ndarray(src.shape,np.float32)
        self.copyNto(ret,self.cmaBuffer2,ret.nbytes)
        return ret'''

    def copyNto(self, dst, src, N):
        """Raw memmove of N bytes from src's buffer into dst's buffer."""
        dstPtr = self.ffi.cast("uint8_t *", self.ffi.from_buffer(dst))
        srcPtr = self.ffi.cast("uint8_t *", self.ffi.from_buffer(src))
        self.ffi.memmove(dstPtr, srcPtr, N)

    def copyNtoOff(self, dst, src, N, dstOffset, srcOffset):
        """Raw memmove of N bytes with independent byte offsets into
        dst and src."""
        dstPtr = self.ffi.cast("uint8_t *", self.ffi.from_buffer(dst))
        srcPtr = self.ffi.cast("uint8_t *", self.ffi.from_buffer(src))
        dstPtr += dstOffset
        srcPtr += srcOffset
        self.ffi.memmove(dstPtr, srcPtr, N)

    class ContiguousArrayCv2pynq(ContiguousArray):
        """CMA array view whose reported nbytes can be overridden, so the
        DMA engine can be asked to transfer only a prefix of the buffer."""
        def init(self, cmaArray):
            # Cache the metadata of the backing CMA array.
            self._nbytes = cmaArray.nbytes
            self.physical_address = cmaArray.physical_address
            self.cacheable = cmaArray.cacheable

        # overwrite access to nbytes with own function
        @property
        def nbytes(self):
            return self._nbytes

        @nbytes.setter
        def nbytes(self, value):
            self._nbytes = value
# NOTE(review): this is a fragment of a larger script — `eyes`, `frame1`,
# `right_pup_x/y`, `pointIn`, `dmaOut`, `dmaIn`, `ffi`, `W` and `CH` are
# defined outside the visible region.  W appears to be the square ROI side
# and CH the channel count — TODO confirm against the enclosing loop.
l = max_area_eye(eyes)
if l[2] > 0:
    # A left-eye candidate was found: refresh its pupil coordinates.
    leftEyeLost = False
    left_pup_x = l[0]
    left_pup_y = l[1]
###____________________STARTING MAIN COMPUTATIONAL CHAIN FOR FIRST FRAME____________________
#roi copy of the right eye
roiRightEye = frame1[int(right_pup_y):int(right_pup_y + W),
                     int(right_pup_x):int(right_pup_x + W)].copy()
#pointer to image (roi)
pointerToRoiRightEye = ffi.cast("uint8_t *", ffi.from_buffer(roiRightEye))
#transfer image data to buffer (offset +1: first byte presumably reserved — TODO confirm)
ffi.memmove(pointIn + 1, pointerToRoiRightEye, W * W * CH)
#DMA transfer
dmaOut.transfer(W * W, 1)
dmaIn.transfer(W * W * CH, 0)
#roi copy of left eye
roiLeftEye = frame1[int(left_pup_y):int(left_pup_y + W),
                    int(left_pup_x):int(left_pup_x + W)].copy()
#pointer to image (roi)
pointerToRoiLeftEye = ffi.cast("uint8_t *", ffi.from_buffer(roiLeftEye))
#transfer image data to buffer
ffi.memmove(pointIn + 1, pointerToRoiLeftEye, W * W * CH)
#get image analysed from buffer
class CTC(object):
    """cffi wrapper around the warp-ctc shared library.

    Loads the library once in __init__ and exposes CPU and GPU entry
    points for computing the CTC loss and its gradients.
    """

    def __init__(self, on_device='cpu', blank_label=0):
        """Open the warp-ctc library and build the ctcOptions struct.

        on_device: 'cpu' or 'gpu' (anything else aborts the process).
        blank_label: index of the CTC blank symbol.
        """
        libpath = get_ctc_lib()
        self.ffi = FFI()
        self.ffi.cdef(ctc_header())
        self.ctclib = self.ffi.dlopen(libpath)
        supported_devices = ['cpu', 'gpu']
        if on_device not in supported_devices:
            print("the requested device {} is not supported".format(
                on_device), file=sys.stderr)
            sys.exit(1)
        # BUGFIX: was `on_device is 'cpu'` — identity comparison with a str
        # literal relies on interning; use equality.
        assign_device = 0 if on_device == 'cpu' else 1
        self.options = self.ffi.new('ctcOptions*',
                                    {"loc": assign_device,
                                     "blank_label": blank_label})[0]
        self.size_in_bytes = self.ffi.new("size_t*")
        self.nout = None
        self.bsz = None

    def get_buf_size(self, ptr_to_buf):
        """Return the size in bytes of one element of the pointed-to type."""
        return self.ffi.sizeof(
            self.ffi.getctype(self.ffi.typeof(ptr_to_buf).item))

    def buf_ref_from_array(self, arr):
        """cffi buffer reference over a device/array object exposing .ptr/.nbytes."""
        return self.ffi.from_buffer(
            self.ffi.buffer(self.ffi.cast('void*', arr.ptr), arr.nbytes))

    def buf_ref_from_ptr(self, ptr, size):
        """cffi buffer reference over `size` bytes at raw pointer `ptr`."""
        return self.ffi.from_buffer(self.ffi.buffer(ptr, size))

    def get_gpu_workspace_size(self, lbl_lens, utt_lens, nout, bsz):
        """Query warp-ctc for the GPU scratch size (bytes) for this batch.

        Also records nout/bsz for the subsequent bind_to_gpu call.
        """
        self.nout = nout
        self.bsz = bsz
        _lbl_lens = self.ffi.cast("int*", lbl_lens.ravel().ctypes.data)
        _utt_lens = self.ffi.cast("int*", utt_lens.ravel().ctypes.data)
        status = self.ctclib.get_workspace_size(_lbl_lens, _utt_lens,
                                                self.nout, self.bsz,
                                                self.options,
                                                self.size_in_bytes)
        # BUGFIX: was `status is 0` — identity comparison with an int literal.
        assert status == 0, "get_workspace_size() in warp-ctc failed"
        return self.size_in_bytes[0]

    def bind_to_gpu(self, acts, grads, lbls, lbl_lens, utt_lens, costs,
                    workspace, scratch_size, stream):
        """Run compute_ctc_loss on the GPU; results land directly in the
        caller-provided device buffers (grads, costs)."""
        if stream is None:
            # No CUDA stream: pass a NULL pointer wrapped as a buffer.
            stream_ptr = self.ffi.cast('void*', 0)
            stream_buf_size = self.ffi.sizeof(self.ffi.new_handle(stream))
            stream_buf = self.buf_ref_from_ptr(stream_ptr, stream_buf_size)
        else:
            stream_buf = self.ffi.cast("void*", stream.handle)
        self.options.stream = stream_buf

        flat_dims = np.prod(acts.shape)
        assert np.prod(grads.shape) == flat_dims

        acts_buf = self.ffi.cast("float*", self.buf_ref_from_array(acts))
        grads_buf = self.ffi.cast("float*", self.buf_ref_from_array(grads))
        costs_buf = self.ffi.cast("float*", self.buf_ref_from_array(costs))
        # (removed two locals that were computed but never used here:
        #  warp_grads_buf_size / warp_costs_buf_size)

        warp_labels = self.ffi.cast("int*", lbls.ravel().ctypes.data)
        warp_label_lens = self.ffi.cast("int*", lbl_lens.ravel().ctypes.data)
        warp_input_lens = self.ffi.cast("int*", utt_lens.ravel().ctypes.data)
        workspace_buf = self.buf_ref_from_ptr(
            self.ffi.cast('void*', workspace), int(scratch_size))

        ctc_status = self.ctclib.compute_ctc_loss(acts_buf, grads_buf,
                                                  warp_labels,
                                                  warp_label_lens,
                                                  warp_input_lens,
                                                  self.nout, self.bsz,
                                                  costs_buf, workspace_buf,
                                                  self.options)
        # BUGFIX: was `ctc_status is 0`.
        assert ctc_status == 0, "warp-ctc run failed"

    def bind_to_cpu(self, acts, lbls, utt_lens, lbl_lens, grads, costs,
                    n_threads=1):
        """Run compute_ctc_loss on the CPU and copy gradients/costs back
        into the caller's numpy arrays."""
        self.options.num_threads = n_threads
        _, self.bsz, self.nout = acts.shape
        flat_dims = np.prod(acts.shape)
        assert np.prod(grads.shape) == flat_dims

        acts_buf = self.ffi.cast("float*", acts.ctypes.data)
        grads_buf = self.ffi.cast("float*", grads.ctypes.data)
        costs_buf = self.ffi.cast("float*", costs.ctypes.data)

        warp_grads_buf_size = flat_dims * self.get_buf_size(grads_buf)
        warp_costs_buf_size = self.bsz * self.get_buf_size(costs_buf)

        warp_labels = self.ffi.cast("int*", lbls.ravel().ctypes.data)
        warp_label_lens = self.ffi.cast("int*", lbl_lens.ravel().ctypes.data)
        warp_input_lens = self.ffi.cast("int*", utt_lens.ravel().ctypes.data)

        status = self.ctclib.get_workspace_size(warp_label_lens,
                                                warp_input_lens,
                                                self.nout, self.bsz,
                                                self.options,
                                                self.size_in_bytes)
        # BUGFIX: was `status is 0`.
        assert status == 0, "get_workspace_size() in warp-ctc failed"

        # TODO: workspace is a variable size buffer whose size is
        # determined during each call, so we can't initialize ahead
        # of time. Can we avoid this?
        workspace = self.ffi.new("char[]", self.size_in_bytes[0])

        ctc_status = self.ctclib.compute_ctc_loss(acts_buf, grads_buf,
                                                  warp_labels,
                                                  warp_label_lens,
                                                  warp_input_lens,
                                                  self.nout, self.bsz,
                                                  costs_buf, workspace,
                                                  self.options)
        # BUGFIX: check the status BEFORE copying results back (was asserted
        # after the memmoves, with `is 0`), so garbage is never copied out.
        assert ctc_status == 0, "warp-ctc run failed"

        # transfer grads and costs back without copying
        self.ffi.memmove(grads, grads_buf, warp_grads_buf_size)
        grads = grads.reshape((acts.shape))  # local rebind only; caller's
        # array was already filled in place by the memmove above
        self.ffi.memmove(costs, costs_buf, warp_costs_buf_size)
def runKernel(opt):
    """Configure the FPGA, launch the "simple" CU `count` times (one per
    OpenCL group id), and compare the device output with a host-computed
    reference.  Exits with status 1 if the comparison fails.
    """
    count = 1024
    DATA_SIZE = sizeof(c_int64) * count
    ffi = FFI()  # create the FFI obj

    boHandle1 = xclAllocBO(opt.handle, DATA_SIZE,
                           xclBOKind.XCL_BO_DEVICE_RAM, opt.first_mem)
    boHandle2 = xclAllocBO(opt.handle, DATA_SIZE,
                           xclBOKind.XCL_BO_DEVICE_RAM, opt.first_mem)
    bo1 = xclMapBO(opt.handle, boHandle1, True)
    bo2 = xclMapBO(opt.handle, boHandle2, True)
    bo1_fp = ffi.cast("FILE *", bo1)
    bo2_fp = ffi.cast("FILE *", bo2)

    # bo1
    bo1_arr = np.array([0x586C0C6C for _ in range(count)])
    ffi.memmove(bo1_fp, ffi.from_buffer(bo1_arr), count * 5)

    # bo2
    int_arr = np.array([i * i for i in range(count)])
    # BUGFIX: np.array(map(...)) builds a useless 0-d object array on
    # Python 3 (map is lazy); materialize with list() first.
    bo2_arr = np.array(list(map(str, int_arr.astype(int))))
    ffi.memmove(bo2_fp, ffi.from_buffer(bo2_arr), count * 7)

    # bufReference (host-side expected output)
    int_arr_2 = np.array([i * i + i * 16 for i in range(count)])
    str_arr = np.array(list(map(str, int_arr_2)))
    buf = ffi.from_buffer(str_arr)
    # NOTE(review): joining a cffi buffer element-wise is Python-2 style;
    # verify the resulting reference string on Python 3.
    bufReference = ''.join(buf)

    if xclSyncBO(opt.handle, boHandle1,
                 xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE, DATA_SIZE, 0):
        return 1
    if xclSyncBO(opt.handle, boHandle2,
                 xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE, DATA_SIZE, 0):
        return 1

    p = xclBOProperties()
    bo1devAddr = p.paddr if not (xclGetBOProperties(opt.handle, boHandle1,
                                                    p)) else -1
    bo2devAddr = p.paddr if not (xclGetBOProperties(opt.handle, boHandle2,
                                                    p)) else -1
    # BUGFIX: was `is -1` identity comparisons.
    if bo1devAddr == -1 or bo2devAddr == -1:
        return 1

    # Allocate the exec_bo
    execHandle = xclAllocBO(opt.handle, DATA_SIZE,
                            xclBOKind.XCL_BO_SHARED_VIRTUAL, (1 << 31))
    execData = xclMapBO(opt.handle, execHandle, True)  # returns mmap()
    c_f = ffi.cast("FILE *", execData)
    # BUGFIX: was `execData is ffi.NULL` — cdata must be compared with ==.
    if execData == ffi.NULL:
        print("execData is NULL")
    print("Construct the exe buf cmd to configure FPGA")

    ecmd = ert_configure_cmd()
    ecmd.m_uert.m_cmd_struct.state = 1  # ERT_CMD_STATE_NEW
    ecmd.m_uert.m_cmd_struct.opcode = 2  # ERT_CONFIGURE
    ecmd.slot_size = 1024
    ecmd.num_cus = 1
    ecmd.cu_shift = 16
    ecmd.cu_base_addr = opt.cu_base_addr
    ecmd.m_features.ert = opt.ert
    if opt.ert:
        ecmd.m_features.cu_dma = 1
        ecmd.m_features.cu_isr = 1
    # CU -> base address mapping
    ecmd.data[0] = opt.cu_base_addr
    ecmd.m_uert.m_cmd_struct.count = 5 + ecmd.num_cus

    sz = sizeof(ert_configure_cmd)
    ffi.memmove(c_f, ecmd, sz)
    print("Send the exec command and configure FPGA (ERT)")

    # Send the command.
    if xclExecBuf(opt.handle, execHandle):
        print("Unable to issue xclExecBuf")
        return 1

    print("Wait until the command finish")
    while xclExecWait(opt.handle, 1000) != 0:
        print(".")

    print("Construct the exec command to run the kernel on FPGA")
    # BUGFIX: was `print("...") % count` — applies % to print's None return
    # value and raises TypeError; format inside the call.
    print("Due to the 1D OpenCL group size, the kernel must be launched %d times"
          % count)

    # construct the exec buffer cmd to start the kernel
    for id in range(count):
        start_cmd = ert_start_kernel_cmd()
        # BUGFIX: all `/ 4` register-offset divisions below were float
        # division; ctypes array sizes/indices must be ints — use //.
        rsz = XSIMPLE_CONTROL_ADDR_FOO_DATA // 4 + 2  # regmap array size
        new_data = ((start_cmd.data._type_) * rsz)()
        start_cmd.m_uert.m_start_cmd_struct.state = 1  # ERT_CMD_STATE_NEW
        start_cmd.m_uert.m_start_cmd_struct.opcode = 0  # ERT_START_CU
        start_cmd.m_uert.m_start_cmd_struct.count = 1 + rsz
        start_cmd.cu_mask = 0x1

        new_data[XSIMPLE_CONTROL_ADDR_AP_CTRL] = 0x0
        new_data[XSIMPLE_CONTROL_ADDR_GROUP_ID_X_DATA // 4] = id
        new_data[XSIMPLE_CONTROL_ADDR_S1_DATA // 4] = bo1devAddr & 0xFFFFFFFF  # output
        new_data[XSIMPLE_CONTROL_ADDR_S2_DATA // 4] = bo2devAddr & 0xFFFFFFFF  # input
        new_data[XSIMPLE_CONTROL_ADDR_FOO_DATA // 4] = 0x10  # foo

        # Copy the command header, then append the register map after it.
        ffi.memmove(c_f, start_cmd, 2 * sizeof(c_uint32))
        tmp_buf = ffi.buffer(
            c_f, 2 * sizeof(c_uint32) + (len(new_data) * sizeof(c_uint32)))
        data_ptr = ffi.from_buffer(tmp_buf)
        ffi.memmove(data_ptr + 2 * sizeof(c_uint32), new_data,
                    len(new_data) * sizeof(c_uint32))

        if xclExecBuf(opt.handle, execHandle):
            print("Unable to issue xclExecBuf")
            return 1

        print("Wait until the command finish")
        while xclExecWait(opt.handle, 100) != 0:
            print("reentering wait... \n")

    # get the output xclSyncBO
    print("Get the output data from the device")
    if xclSyncBO(opt.handle, boHandle1,
                 xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE, DATA_SIZE, 0):
        return 1

    rd_buf = ffi.buffer(bo1_fp, count * 7)
    print("RESULT: ")
    # print(rd_buf[:] + "\n")
    if bufReference != rd_buf[:]:
        print("FAILED TEST")
        print("Value read back does not match value written")
        # BUGFIX: was sys.exit() — exited with status 0 on failure.
        sys.exit(1)
class cv2pynq():
    """Reduced imgfilter variant of the cv2pynq driver: one 3x3 filter2D IP
    plus the DMA/CMA plumbing needed to stream frames through it."""
    # Maximum frame geometry supported by this overlay.
    MAX_WIDTH = 800
    MAX_HEIGHT = 600

    def __init__(self, load_overlay=True):
        self.bitstream_name = None
        self.bitstream_name = "imgfilter.bit"
        self.bitstream_path = os.path.join(self.bitstream_name)
        self.ol = Overlay(self.bitstream_path)
        self.ol.download()
        self.ol.reset()
        self.xlnk = Xlnk()
        self.partitions = 10  #split the cma into partitions for pipelined transfer
        self.cmaPartitionLen = self.MAX_HEIGHT * self.MAX_WIDTH / self.partitions
        self.listOfcma = [
            self.xlnk.cma_array(shape=(int(self.MAX_HEIGHT / self.partitions),
                                       self.MAX_WIDTH),
                                dtype=np.uint8) for i in range(self.partitions)
        ]
        self.filter2d = self.ol
        self.dmaOut = self.filter2d.axi_dma_0.sendchannel
        self.dmaIn = self.filter2d.axi_dma_0.recvchannel
        # Restart both DMA channels to reach a known-idle state.
        self.dmaOut.stop()
        self.dmaIn.stop()
        self.dmaIn.start()
        self.dmaOut.start()
        self.filter2DType = -1  # filter types: SobelX=0
        self.ffi = FFI()
        self.f2D = self.filter2d.filter2D_hls_0
        self.f2D.reset()
        # CMA staging buffers wrapped in nbytes-overridable views.
        self.cmaBuffer_0 = self.xlnk.cma_array(shape=(self.MAX_HEIGHT,
                                                      self.MAX_WIDTH),
                                               dtype=np.uint8)
        self.cmaBuffer0 = self.cmaBuffer_0.view(self.ContiguousArrayCv2pynq)
        self.cmaBuffer0.init(self.cmaBuffer_0)
        self.cmaBuffer_1 = self.xlnk.cma_array(shape=(self.MAX_HEIGHT,
                                                      self.MAX_WIDTH),
                                               dtype=np.uint8)
        self.cmaBuffer1 = self.cmaBuffer_1.view(self.ContiguousArrayCv2pynq)
        self.cmaBuffer1.init(self.cmaBuffer_1)
        self.cmaBuffer_2 = self.xlnk.cma_array(
            shape=(self.MAX_HEIGHT * 4, self.MAX_WIDTH),
            dtype=np.uint8)  # *4 for CornerHarris return
        self.cmaBuffer2 = self.cmaBuffer_2.view(self.ContiguousArrayCv2pynq)
        self.cmaBuffer2.init(self.cmaBuffer_2)

    def close(self):
        """Release all CMA buffers."""
        self.cmaBuffer_0.close()
        self.cmaBuffer_1.close()
        self.cmaBuffer_2.close()
        for cma in self.listOfcma:
            cma.close()

    def copyNto(self, dst, src, N):
        """Raw memmove of N bytes from src's buffer into dst's buffer."""
        dstPtr = self.ffi.cast("uint8_t *", self.ffi.from_buffer(dst))
        srcPtr = self.ffi.cast("uint8_t *", self.ffi.from_buffer(src))
        self.ffi.memmove(dstPtr, srcPtr, N)

    def copyNtoOff(self, dst, src, N, dstOffset, srcOffset):
        """Raw memmove of N bytes with independent byte offsets into
        dst and src."""
        dstPtr = self.ffi.cast("uint8_t *", self.ffi.from_buffer(dst))
        srcPtr = self.ffi.cast("uint8_t *", self.ffi.from_buffer(src))
        dstPtr += dstOffset
        srcPtr += srcOffset
        self.ffi.memmove(dstPtr, srcPtr, N)

    def filter2D(self, src, dst):
        """Stream src through the configured filter IP, chunking and
        double-buffering non-contiguous sources.

        NOTE(review): the staging calls here pass (src, self.cmaBufferN, …)
        i.e. dst=src given copyNto(dst, src, N) — the sibling cv2pynq class
        passes (self.cmaBufferN, src, …).  This copies the CMA buffer into
        src rather than the reverse; looks like swapped arguments — confirm
        against the working driver before relying on this path.
        """
        if dst is None:
            self.cmaBuffer1.nbytes = src.nbytes
        elif hasattr(src, 'physical_address') and hasattr(
                dst, 'physical_address'):
            self.dmaIn.transfer(dst)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
            return dst
        if hasattr(src, 'physical_address'):
            self.dmaIn.transfer(self.cmaBuffer1)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
        else:  #pipeline the copy to contiguous memory and filter calculation in hardware
            # Partition count tuned empirically by frame size.
            if src.nbytes < 184800:  #440x420
                self.partitions = 1
            elif src.nbytes < 180000:  #600x300
                self.partitions = 2
            elif src.nbytes < 231200:  #680x340
                self.partitions = 4
            else:
                self.partitions = 8
            self.cmaBuffer1.nbytes = src.nbytes
            self.dmaIn.transfer(self.cmaBuffer1)
            chunks_len = int(src.nbytes / (self.partitions))
            self.cmaBuffer0.nbytes = chunks_len
            self.cmaBuffer2.nbytes = chunks_len
            self.copyNto(src, self.cmaBuffer0, chunks_len)
            for i in range(1, self.partitions):
                if i % 2 == 1:
                    # Send buffer0 while staging the next chunk via buffer2.
                    while not self.dmaOut.idle and not self.dmaOut._first_transfer:
                        pass
                    self.dmaOut.transfer(self.cmaBuffer0)
                    self.copyNtoOff(src, self.cmaBuffer2, chunks_len,
                                    i * chunks_len, 0)
                else:
                    # Send buffer2 while staging the next chunk via buffer0.
                    while not self.dmaOut.idle and not self.dmaOut._first_transfer:
                        pass
                    self.dmaOut.transfer(self.cmaBuffer2)
                    self.copyNtoOff(src, self.cmaBuffer0, chunks_len,
                                    i * chunks_len, 0)
            while not self.dmaOut.idle and not self.dmaOut._first_transfer:
                pass
            self.dmaOut.transfer(self.cmaBuffer2)
            rest = src.nbytes % self.partitions
            if rest != 0:  #cleanup any remaining data and send it to HW
                self.copyNtoOff(src, self.cmaBuffer0, chunks_len,
                                self.partitions * chunks_len, 0)
                while not self.dmaOut.idle and not self.dmaOut._first_transfer:
                    pass
                self.dmaOut.transfer(self.cmaBuffer0)
            self.dmaIn.wait()
        ret = np.ndarray(src.shape, src.dtype)
        self.copyNto(ret, self.cmaBuffer1, ret.nbytes)
        return ret

    class ContiguousArrayCv2pynq(ContiguousArray):
        """CMA array view whose reported nbytes can be overridden, so the
        DMA engine can be asked to transfer only a prefix of the buffer."""
        def init(self, cmaArray):
            # Cache the metadata of the backing CMA array.
            self._nbytes = cmaArray.nbytes
            self.physical_address = cmaArray.physical_address
            self.cacheable = cmaArray.cacheable

        # overwrite access to nbytes with own function
        @property
        def nbytes(self):
            return self._nbytes

        @nbytes.setter
        def nbytes(self, value):
            self._nbytes = value

    def Sobel(self, src, ddepth, dx, dy, dst, ksize):
        """SobelX 3x3: unconditionally loads the horizontal-gradient
        coefficients and runs the filter."""
        if (ksize == 3):
            self.filter2DType = 0
            self.f2D.rows = src.shape[0]
            self.f2D.columns = src.shape[1]
            self.f2D.channels = 1
            self.f2D.r1 = 0x000100ff  #[-1  0  1]
            self.f2D.r2 = 0x000200fe  #[-2  0  2]
            self.f2D.r3 = 0x000100ff  #[-1  0  1]
            self.f2D.start()
            return self.filter2D(src, dst)

    def Sobel1(self, src, ddepth, dx, dy, dst, ksize):
        """SobelY 3x3: loads the vertical-gradient coefficients only when
        not already cached, then runs the filter."""
        if (ksize == 3):
            if self.filter2DType != 1:
                self.filter2DType = 1
                self.f2D.rows = src.shape[0]
                self.f2D.columns = src.shape[1]
                self.f2D.channels = 1
                self.f2D.r1 = 0x00fffeff  #[-1 -2 -1]
                self.f2D.r2 = 0x00000000  #[ 0  0  0]
                self.f2D.r3 = 0x00010201  #[ 1  2  1]
            self.f2D.start()
            return self.filter2D(src, dst)