示例#1
0
文件: main.py 项目: mikexilinx/XRT
def main(args):
    """Write a test string into a device buffer object and read it back.

    Parses ``args`` into an ``Options`` object, initialises XRT, allocates one
    buffer object of ``opt.DATA_SIZE`` bytes in bank ``opt.first_mem``, copies
    a test string into its host mapping, syncs the buffer to the device and
    back again, and compares the two host views.

    Returns:
        1 when any step reports failure; ``None`` after printing
        "PASSED TEST".

    If an exception is raised it is printed and the process exits with a
    non-zero status, so a crash is not reported as success.
    """
    opt = Options()
    Options.getOptions(opt, args)
    try:
        if initXRT(opt):
            return 1
        if opt.first_mem < 0:
            return 1

        boHandle1 = xclAllocBO(opt.handle, opt.DATA_SIZE,
                               xclBOKind.XCL_BO_DEVICE_RAM, opt.first_mem)

        ffi = FFI()
        bo1 = xclMapBO(opt.handle, boHandle1, True)
        # A bytes literal: ffi.memmove() needs a buffer-protocol source, which
        # a text string is not on Python 3 (on Python 2 this is the same str).
        testVector = b"hello\nthis is Xilinx OpenCL memory read write test\n:-)\n"
        bo1_p = ffi.cast("FILE *", bo1)

        ffi.memmove(bo1_p, testVector, len(testVector))

        bo1_buf = ffi.buffer(bo1_p, len(testVector))
        print("buffer from device: ", bo1_buf[:])

        if xclSyncBO(opt.handle, boHandle1,
                     xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE,
                     opt.DATA_SIZE, 0):
            return 1

        # xclGetBOProperties() returning non-zero is treated as failure (-1).
        p = xclBOProperties()
        bo1devAddr = p.paddr if not (xclGetBOProperties(
            opt.handle, boHandle1, p)) else -1

        # Compare by value: identity of small ints is an implementation detail.
        if bo1devAddr == -1:
            return 1

        # Get the output
        if xclSyncBO(opt.handle, boHandle1,
                     xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE,
                     opt.DATA_SIZE, False):
            return 1

        bo2 = xclMapBO(opt.handle, boHandle1, False)
        bo2_p = ffi.cast("FILE *", bo2)
        bo2_buf = ffi.buffer(bo2_p, len(testVector))

        if bo1_buf[:] != bo2_buf[:]:
            print("FAILED TEST")
            print("Value read back does not match value written")
            return 1

    except Exception as exp:
        print("Exception: ")
        print(exp)  # prints the err
        # Exit non-zero: a bare sys.exit() would report success to the shell.
        sys.exit(1)

    print("PASSED TEST")
示例#2
0
 def test_memmove_readonly_readwrite(self):
     """Check ffi.memmove with read-only and writable Python buffers."""
     ffi = FFI()
     chars = ffi.new("signed char[]", 5)
     # Read-only bytes object as the source; only the first 3 bytes are copied.
     ffi.memmove(chars, b"abcde", 3)
     assert list(chars) == [ord(c) for c in "abc"] + [0, 0]
     # Writable bytearray as the source; overwrites the first 2 bytes.
     ffi.memmove(chars, bytearray(b"ABCDE"), 2)
     assert list(chars) == [ord(c) for c in "ABc"] + [0, 0]
     # A bytes object used as the destination is expected to raise.
     py.test.raises((TypeError, BufferError), ffi.memmove, b"abcde", chars, 3)
     # Keyword-argument form, copying out of the cdata into a bytearray.
     target = bytearray(b"xxxxx")
     ffi.memmove(dest=target, src=chars, n=3)
     assert target == bytearray(b"ABcxx")
示例#3
0
 def test_memmove_readonly_readwrite(self):
     """Check ffi.memmove with read-only and writable Python buffers."""
     ffi = FFI()
     chars = ffi.new("signed char[]", 5)
     # Read-only bytes object as the source; only the first 3 bytes are copied.
     ffi.memmove(chars, b"abcde", 3)
     assert list(chars) == [ord(c) for c in "abc"] + [0, 0]
     # Writable bytearray as the source; overwrites the first 2 bytes.
     ffi.memmove(chars, bytearray(b"ABCDE"), 2)
     assert list(chars) == [ord(c) for c in "ABc"] + [0, 0]
     # A bytes object used as the destination is expected to raise.
     py.test.raises((TypeError, BufferError), ffi.memmove, b"abcde", chars, 3)
     # Keyword-argument form, copying out of the cdata into a bytearray.
     target = bytearray(b"xxxxx")
     ffi.memmove(dest=target, src=chars, n=3)
     assert target == bytearray(b"ABcxx")
示例#4
0
 def test_memmove(self):
     """Check ffi.memmove between overlapping regions of one short[] array."""
     ffi = FFI()
     shorts = ffi.new("short[]", [-1234, -2345, -3456, -4567, -5678])
     # Backward overlapping copy: 4 bytes from index 1 down to index 0.
     ffi.memmove(shorts, shorts + 1, 4)
     assert list(shorts) == [-2345, -3456, -3456, -4567, -5678]
     shorts[2] = 999
     # Forward overlapping copy: 6 bytes from index 0 up to index 2.
     ffi.memmove(shorts + 2, shorts, 6)
     assert list(shorts) == [-2345, -3456, -2345, -3456, 999]
     # Copy two raw bytes into the last element; its value depends on the
     # host byte order.
     raw_pair = ffi.new("char[]", b"\x71\x72")
     ffi.memmove(shorts + 4, raw_pair, 2)
     last = 0x7271 if sys.byteorder == 'little' else 0x7172
     assert list(shorts) == [-2345, -3456, -2345, -3456, last]
示例#5
0
 def test_memmove(self):
     """Check ffi.memmove between overlapping regions of one short[] array."""
     ffi = FFI()
     shorts = ffi.new("short[]", [-1234, -2345, -3456, -4567, -5678])
     # Backward overlapping copy: 4 bytes from index 1 down to index 0.
     ffi.memmove(shorts, shorts + 1, 4)
     assert list(shorts) == [-2345, -3456, -3456, -4567, -5678]
     shorts[2] = 999
     # Forward overlapping copy: 6 bytes from index 0 up to index 2.
     ffi.memmove(shorts + 2, shorts, 6)
     assert list(shorts) == [-2345, -3456, -2345, -3456, 999]
     # Copy two raw bytes into the last element; its value depends on the
     # host byte order.
     raw_pair = ffi.new("char[]", b"\x71\x72")
     ffi.memmove(shorts + 4, raw_pair, 2)
     last = 0x7271 if sys.byteorder == 'little' else 0x7172
     assert list(shorts) == [-2345, -3456, -2345, -3456, last]
示例#6
0
 def test_memmove_buffer(self):
     """Check ffi.memmove between cdata arrays and array.array buffers."""
     import array
     ffi = FFI()
     unsigned_src = array.array('H', [10000, 20000, 30000])
     shorts = ffi.new("short[]", 5)
     # array.array -> cdata, at offset 0 and then at offset 1.
     ffi.memmove(shorts, unsigned_src, 6)
     assert list(shorts) == [10000, 20000, 30000, 0, 0]
     ffi.memmove(shorts + 1, unsigned_src, 6)
     assert list(shorts) == [10000, 10000, 20000, 30000, 0]
     # array.array -> array.array; the source must be left unchanged.
     signed_dst = array.array('h', [-1000, -2000, -3000])
     ffi.memmove(signed_dst, unsigned_src, 4)
     assert signed_dst.tolist() == [10000, 20000, -3000]
     assert unsigned_src.tolist() == [10000, 20000, 30000]
     for index, value in enumerate((999, 998, 997, 996, 995)):
         shorts[index] = value
     # cdata -> array.array with growing byte counts.
     ffi.memmove(signed_dst, shorts, 2)
     assert signed_dst.tolist() == [999, 20000, -3000]
     ffi.memmove(signed_dst, shorts + 2, 4)
     assert signed_dst.tolist() == [997, 996, -3000]
     for index in (2, 3):
         shorts[index] = -shorts[index]
     ffi.memmove(signed_dst, shorts + 2, 6)
     assert signed_dst.tolist() == [-997, -996, 995]
示例#7
0
def runKernel(opt):
    """Configure the scheduler, start the kernel once and print its output.

    Allocates a data buffer object and an exec buffer object, writes an
    ``ert_configure_cmd`` followed by an ``ert_start_kernel_cmd`` into the exec
    buffer, issues each with ``xclExecBuf`` and polls ``xclExecWait`` until it
    returns 0, then syncs the data buffer back from the device and prints its
    first ``len("Hello World")`` bytes.

    Args:
        opt: options object; this function reads ``handle``, ``DATA_SIZE``,
            ``first_mem``, ``cu_base_addr`` and ``ert`` from it.

    Returns:
        0 on success, 1 when a sync, property query or exec-buffer submission
        reports failure.
    """
    ffi = FFI()  # create the FFI obj
    boHandle = xclAllocBO(opt.handle, opt.DATA_SIZE, xclBOKind.XCL_BO_DEVICE_RAM, opt.first_mem)
    bo1 = xclMapBO(opt.handle, boHandle, True)
    read_fp = ffi.cast("FILE *", bo1)

    if xclSyncBO(opt.handle, boHandle, xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE, opt.DATA_SIZE, 0):
        return 1

    # xclGetBOProperties() returning non-zero is treated as failure (-1).
    p = xclBOProperties()
    bodevAddr = p.paddr if not (xclGetBOProperties(opt.handle, boHandle, p)) else -1

    # Compare by value: identity of small ints is an implementation detail.
    if bodevAddr == -1:
        return 1

    # Allocate the exec_bo
    execHandle = xclAllocBO(opt.handle, opt.DATA_SIZE, xclBOKind.XCL_BO_SHARED_VIRTUAL, (1 << 31))
    execData = xclMapBO(opt.handle, execHandle, True)  # returns mmap()
    c_f = ffi.cast("FILE *", execData)

    # NOTE(review): a NULL mapping is only reported here; execution continues
    # and the memmove below would write through it. Confirm whether this
    # should return 1 instead.
    if execData is ffi.NULL:
        print("execData is NULL")
    print("Construct the exe buf cmd to configure FPGA")

    ecmd = ert_configure_cmd()
    ecmd.m_uert.m_cmd_struct.state = 1  # ERT_CMD_STATE_NEW
    ecmd.m_uert.m_cmd_struct.opcode = 2  # ERT_CONFIGURE

    ecmd.slot_size = opt.DATA_SIZE
    ecmd.num_cus = 1
    ecmd.cu_shift = 16
    ecmd.cu_base_addr = opt.cu_base_addr

    ecmd.m_features.ert = opt.ert
    if opt.ert:
        ecmd.m_features.cu_dma = 1
        ecmd.m_features.cu_isr = 1

    # CU -> base address mapping
    ecmd.data[0] = opt.cu_base_addr
    ecmd.m_uert.m_cmd_struct.count = 5 + ecmd.num_cus

    sz = sizeof(ert_configure_cmd)
    ffi.memmove(c_f, ecmd, sz)
    print("Send the exec command and configure FPGA (ERT)")

    # Send the command.
    ret = xclExecBuf(opt.handle, execHandle)

    if ret:
        print("Unable to issue xclExecBuf")
        return 1

    print("Wait until the command finish")

    while xclExecWait(opt.handle, 1000) != 0:
        print(".")

    print("Construct the exec command to run the kernel on FPGA")

    # construct the exec buffer cmd to start the kernel
    start_cmd = ert_start_kernel_cmd()
    # Floor division keeps the word indices integral; true division would
    # yield floats on Python 3, which are invalid as array lengths and indices.
    access1_word = XHELLO_HELLO_CONTROL_ADDR_ACCESS1_DATA // 4
    rsz = (access1_word + 1) + 1  # regmap array size
    new_data = ((start_cmd.data._type_) * rsz)()
    start_cmd.m_uert.m_start_cmd_struct.state = 1  # ERT_CMD_STATE_NEW
    start_cmd.m_uert.m_start_cmd_struct.opcode = 0  # ERT_START_CU
    start_cmd.m_uert.m_start_cmd_struct.count = 1 + rsz
    start_cmd.cu_mask = 0x1

    # The buffer's device address is written as two consecutive regmap words.
    new_data[XHELLO_HELLO_CONTROL_ADDR_AP_CTRL] = 0x0
    new_data[access1_word] = bodevAddr
    new_data[access1_word + 1] = (bodevAddr >> 32) & 0xFFFFFFFF

    # First the two leading 32-bit words of the command, then the regmap
    # array directly behind them.
    ffi.memmove(c_f, start_cmd, 2 * sizeof(c_uint32))

    tmp_buf = ffi.buffer(c_f, 2 * sizeof(c_uint32) + (len(new_data) * sizeof(c_uint32)))
    data_ptr = ffi.from_buffer(tmp_buf)
    ffi.memmove(data_ptr + 2 * sizeof(c_uint32), new_data, len(new_data) * sizeof(c_uint32))

    if xclExecBuf(opt.handle, execHandle):
        print("Unable to issue xclExecBuf")
        return 1
    else:
        print("Kernel start command issued through xclExecBuf : start_kernel")
        print("Now wait until the kernel finish")

    print("Wait until the command finish")

    while xclExecWait(opt.handle, 1) != 0:
        print(".")

    # get the output xclSyncBO
    print("Get the output data from the device")
    if xclSyncBO(opt.handle, boHandle, xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE, opt.DATA_SIZE, 0):
        return 1

    rd_buf = ffi.buffer(read_fp, len("Hello World"))
    result = rd_buf[:]
    if not isinstance(result, str):
        # On Python 3 the slice is bytes and cannot be concatenated to str.
        result = result.decode("utf-8", "replace")
    print("RESULT: ")
    print(result + "\n")

    return 0
示例#8
0
 def test_memmove_buffer(self):
     """Check ffi.memmove between cdata arrays and array.array buffers."""
     import array
     ffi = FFI()
     unsigned_src = array.array('H', [10000, 20000, 30000])
     shorts = ffi.new("short[]", 5)
     # array.array -> cdata, at offset 0 and then at offset 1.
     ffi.memmove(shorts, unsigned_src, 6)
     assert list(shorts) == [10000, 20000, 30000, 0, 0]
     ffi.memmove(shorts + 1, unsigned_src, 6)
     assert list(shorts) == [10000, 10000, 20000, 30000, 0]
     # array.array -> array.array; the source must be left unchanged.
     signed_dst = array.array('h', [-1000, -2000, -3000])
     ffi.memmove(signed_dst, unsigned_src, 4)
     assert signed_dst.tolist() == [10000, 20000, -3000]
     assert unsigned_src.tolist() == [10000, 20000, 30000]
     for index, value in enumerate((999, 998, 997, 996, 995)):
         shorts[index] = value
     # cdata -> array.array with growing byte counts.
     ffi.memmove(signed_dst, shorts, 2)
     assert signed_dst.tolist() == [999, 20000, -3000]
     ffi.memmove(signed_dst, shorts + 2, 4)
     assert signed_dst.tolist() == [997, 996, -3000]
     for index in (2, 3):
         shorts[index] = -shorts[index]
     ffi.memmove(signed_dst, shorts + 2, 6)
     assert signed_dst.tolist() == [-997, -996, 995]
示例#9
0
class cv2pynq():
    MAX_WIDTH = 1920
    MAX_HEIGHT = 1080

    def __init__(self, load_overlay=True):
        """Download the bitstream and set up DMA channels, filter IPs and buffers.

        ``load_overlay`` is accepted but never consulted: the overlay is
        always downloaded and reset.
        """
        self.bitstream_name = "cv2pynq03.bit"
        self.bitstream_path = os.path.join(CV2PYNQ_BIT_DIR,
                                           self.bitstream_name)
        self.ol = Overlay(self.bitstream_path)
        self.ol.download()
        self.ol.reset()
        self.xlnk = Xlnk()

        # split the cma into partitions for pipelined transfer
        self.partitions = 10
        self.cmaPartitionLen = self.MAX_HEIGHT * self.MAX_WIDTH / self.partitions
        partition_shape = (int(self.MAX_HEIGHT / self.partitions),
                           self.MAX_WIDTH)
        partition_buffers = [
            self.xlnk.cma_array(shape=partition_shape, dtype=np.uint8)
            for _ in range(self.partitions)
        ]
        self.listOfcma = partition_buffers

        # Restart both DMA channels: stop send and receive, then start
        # receive before send.
        self.img_filters = self.ol.image_filters
        self.dmaOut = self.img_filters.axi_dma_0.sendchannel
        self.dmaIn = self.img_filters.axi_dma_0.recvchannel
        for channel in (self.dmaOut, self.dmaIn):
            channel.stop()
        for channel in (self.dmaIn, self.dmaOut):
            channel.start()

        # Last coefficient set loaded into each filter core; -1 means none yet.
        # filter types: SobelX=0, SobelY=1, ScharrX=2, ScharrY=3, Laplacian1=4, Laplacian3=5
        self.filter2DType = -1
        # filter types: SobelX=0, SobelY=1, Laplacian5=4
        self.filter2D_5Type = -1
        # filter types: blur=0, GaussianBlur=1
        self.filter2DfType = -1
        self.ffi = FFI()

        # Bind each filter IP to its attribute and reset it, one at a time.
        ip_bindings = (
            ("f2D", "filter2D_hls_0"),
            ("f2D_5", "filter2D_hls_5_0"),
            ("f2D_f", "filter2D_f_0"),
            ("erodeIP", "erode_hls_0"),
            ("dilateIP", "dilate_hls_0"),
        )
        for attribute, ip_name in ip_bindings:
            setattr(self, attribute, getattr(self.img_filters, ip_name))
            getattr(self, attribute).reset()

        def resizable_view(raw):
            # View the raw CMA array through the subclass whose nbytes can be
            # overridden, then copy the raw array's metadata onto the view.
            view = raw.view(self.ContiguousArrayCv2pynq)
            view.init(raw)
            return view

        frame_shape = (self.MAX_HEIGHT, self.MAX_WIDTH)
        self.cmaBuffer_0 = self.xlnk.cma_array(shape=frame_shape,
                                               dtype=np.uint8)
        self.cmaBuffer0 = resizable_view(self.cmaBuffer_0)
        self.cmaBuffer_1 = self.xlnk.cma_array(shape=frame_shape,
                                               dtype=np.uint8)
        self.cmaBuffer1 = resizable_view(self.cmaBuffer_1)
        # *4 for CornerHarris return
        self.cmaBuffer_2 = self.xlnk.cma_array(
            shape=(self.MAX_HEIGHT * 4, self.MAX_WIDTH), dtype=np.uint8)
        self.cmaBuffer2 = resizable_view(self.cmaBuffer_2)

        self.CannyIP = self.img_filters.canny_edge_0
        self.CannyIP.reset()
        #self.cornerHarrisIP = self.img_filters.CornerHarris_hls_0
        #self.cornerHarrisIP.reset()

    def close(self):
        """Close the three frame buffers and every partition buffer.

        The DMA channels are left running.
        """
        frame_buffers = (self.cmaBuffer_0, self.cmaBuffer_1, self.cmaBuffer_2)
        for frame_buffer in frame_buffers:
            frame_buffer.close()
        for partition_buffer in self.listOfcma:
            partition_buffer.close()

    def Sobel(self, src, ddepth, dx, dy, dst, ksize):
        """Load Sobel coefficients into a filter core and run it on ``src``.

        ``ksize == 3`` uses the 3x3 core (``self.f2D``); every other value is
        handled by the 5x5 core (``self.f2D_5``). Coefficients are rewritten
        only when the core's recorded filter type differs from the requested
        one. ``ddepth`` is not used.

        Raises:
            RuntimeError: unless (dx, dy) is (1, 0) or (0, 1).
        """
        x_derivative = (dx == 1) and (dy == 0)
        y_derivative = (dx == 0) and (dy == 1)

        if ksize == 3:
            core = self.f2D
            core.rows = src.shape[0]
            core.columns = src.shape[1]
            core.channels = 1
            if x_derivative:
                wanted = 0
                kernel_rows = (
                    0x000100ff,  # [-1  0  1]
                    0x000200fe,  # [-2  0  2]
                    0x000100ff,  # [-1  0  1]
                )
            elif y_derivative:
                wanted = 1
                kernel_rows = (
                    0x00fffeff,  # [-1 -2 -1]
                    0x00000000,  # [ 0  0  0]
                    0x00010201,  # [ 1  2  1]
                )
            else:
                raise RuntimeError("Incorrect dx dy configuration")
            if self.filter2DType != wanted:
                self.filter2DType = wanted
                core.r1, core.r2, core.r3 = kernel_rows
            self.img_filters.select_filter(1)
            core.start()
            return self.filter2D(src, dst)

        # ksize == 5
        core = self.f2D_5
        core.rows = src.shape[0]
        core.columns = src.shape[1]
        if x_derivative:
            wanted = 0
            taps = [
                0xff, 0xfe, 0x00, 0x02, 0x01,  # -1,  -2,  0,  2, 1
                0xfc, 0xf8, 0x00, 0x08, 0x04,  # -4,  -8,  0,  8, 4
                0xfa, 0xf4, 0x00, 0x0c, 0x06,  # -6, -12,  0, 12, 6
                0xfc, 0xf8, 0x00, 0x08, 0x04,  # -4,  -8,  0,  8, 4
                0xff, 0xfe, 0x00, 0x02, 0x01,  # -1,  -2,  0,  2, 1
            ]
        elif y_derivative:
            wanted = 1
            taps = [
                0xff, 0xfc, 0xfa, 0xfc, 0xff,  # -1, -4,  -6, -4, -1
                0xfe, 0xf8, 0xf4, 0xf8, 0xfe,  # -2, -8, -12, -8, -2
                0x00, 0x00, 0x00, 0x00, 0x00,  #  0,  0,   0,  0,  0
                0x02, 0x08, 0x0c, 0x08, 0x02,  #  2,  8,  12,  8,  2
                0x01, 0x04, 0x06, 0x04, 0x01,  #  1,  4,   6,  4,  1
            ]
        else:
            raise RuntimeError("Incorrect dx dy configuration")
        if self.filter2D_5Type != wanted:
            self.filter2D_5Type = wanted
            # three zero bytes fill up to align with 4
            core.par_V = bytes(taps + [0, 0, 0])
        self.img_filters.select_filter(5)
        core.start()
        return self.filter2D(src, dst)

    def Scharr(self, src, ddepth, dx, dy, dst):
        """Load Scharr coefficients into the 3x3 core and run it on ``src``.

        Coefficients are rewritten only when the recorded filter type differs
        from the requested one. ``ddepth`` is not used.

        Raises:
            RuntimeError: unless (dx, dy) is (1, 0) or (0, 1).
        """
        core = self.f2D
        core.rows = src.shape[0]
        core.columns = src.shape[1]
        core.channels = 1
        if (dx == 1) and (dy == 0):
            wanted = 2
            kernel_rows = (
                0x000300fd,  # [-3  0  3]
                0x000a00f6,  # [-10 0 10]
                0x000300fd,  # [-3  0  3]
            )
        elif (dx == 0) and (dy == 1):
            wanted = 3
            kernel_rows = (
                0x00fdf6fd,  # [-3 -10 -3]
                0x00000000,  # [ 0   0  0]
                0x00030a03,  # [ 3  10  3]
            )
        else:
            raise RuntimeError("Incorrect dx dy configuration")
        if self.filter2DType != wanted:
            self.filter2DType = wanted
            core.r1, core.r2, core.r3 = kernel_rows
        self.img_filters.select_filter(1)
        core.start()
        return self.filter2D(src, dst)

    def Laplacian(self, src, ddepth, dst, ksize):
        """Load Laplacian coefficients into a filter core and run it on ``src``.

        ``ksize == 5`` uses the 5x5 core; every other value uses the 3x3 core,
        where coefficients are defined for ``ksize`` 1 and 3 only (any other
        value runs the core with whatever coefficients it already holds).
        ``ddepth`` is not used.
        """
        if ksize == 5:
            core = self.f2D_5
            core.rows = src.shape[0]
            core.columns = src.shape[1]
            if self.filter2D_5Type != 4:
                self.filter2D_5Type = 4  # "Laplacian_5"
                taps = [
                    0x02, 0x04, 0x04, 0x04, 0x02,  # 2,  4,   4,  4, 2
                    0x04, 0x00, 0xf8, 0x00, 0x04,  # 4,  0,  -8,  0, 4
                    0x04, 0xf8, 0xe8, 0xf8, 0x04,  # 4, -8, -24, -8, 4
                    0x04, 0x00, 0xf8, 0x00, 0x04,  # 4,  0,  -8,  0, 4
                    0x02, 0x04, 0x04, 0x04, 0x02,  # 2,  4,   4,  4, 2
                ]
                # three zero bytes fill up to align with 4
                core.par_V = bytes(taps + [0, 0, 0])
            self.img_filters.select_filter(5)
            core.start()
            return self.filter2D(src, dst)

        # ksize 1 or 3
        core = self.f2D
        core.rows = src.shape[0]
        core.columns = src.shape[1]
        core.channels = 1
        preset = None
        if ksize == 1:
            # "Laplacian_1"
            preset = (4, (
                0x00000100,  # [ 0  1  0]
                0x0001fc01,  # [ 1 -4  1]
                0x00000100,  # [ 0  1  0]
            ))
        elif ksize == 3:
            # "Laplacian_3"
            preset = (5, (
                0x00020002,  # [ 2  0  2]
                0x0000f800,  # [ 0 -8  0]
                0x00020002,  # [ 2  0  2]
            ))
        if preset is not None:
            wanted, kernel_rows = preset
            if self.filter2DType != wanted:
                self.filter2DType = wanted
                core.r1, core.r2, core.r3 = kernel_rows
        self.img_filters.select_filter(1)
        core.start()
        return self.filter2D(src, dst)

    def blur(self, src, ksize, dst):
        """Run the fixed-point 3x3 core with all nine coefficients set to 1/9.

        The coefficients are rewritten only when the core is not already
        recorded as holding the blur kernel. ``ksize`` is not used.
        """
        core = self.f2D_f
        core.rows = src.shape[0]
        core.columns = src.shape[1]
        if self.filter2DfType != 0:
            self.filter2DfType = 0  # blur
            weight = self.floatToFixed(1 / 9, cv2pynqDriverFilter2D_f.K_FP_W,
                                       cv2pynqDriverFilter2D_f.K_FP_F)
            registers = ("r11", "r12", "r13",
                         "r21", "r22", "r23",
                         "r31", "r32", "r33")
            for register in registers:
                setattr(core, register, weight)
        self.img_filters.select_filter(2)
        core.start()
        return self.filter2D(src, dst)

    def GaussianBlur(self, src, ksize, sigmaX, sigmaY, dst):
        """Run the fixed-point 3x3 core with a Gaussian kernel.

        The nine coefficients are the products of two 3-tap kernels obtained
        from ``cv2.getGaussianKernel``. They are computed and written only
        when the core is not already recorded as holding a Gaussian kernel, so
        changed sigma values are ignored while that record stays set.
        A non-positive ``sigmaX`` is derived from ``ksize[0]``; a non-positive
        ``sigmaY`` takes the value of ``sigmaX``.
        """
        core = self.f2D_f
        core.rows = src.shape[0]
        core.columns = src.shape[1]
        if self.filter2DfType != 1:
            self.filter2DfType = 1  # GaussianBlur
            if sigmaX <= 0:
                sigmaX = 0.3 * ((ksize[0] - 1) * 0.5 - 1) + 0.8
            if sigmaY <= 0:
                sigmaY = sigmaX
            kernel_x = cv2.getGaussianKernel(3, sigmaX, ktype=cv2.CV_32F)
            kernel_y = cv2.getGaussianKernel(3, sigmaY, ktype=cv2.CV_32F)
            # Register rXY takes kernel_y[X-1] * kernel_x[Y-1], written in
            # row-major order.
            for row in range(3):
                for col in range(3):
                    coefficient = self.floatToFixed(
                        kernel_y[row] * kernel_x[col],
                        cv2pynqDriverFilter2D_f.K_FP_W,
                        cv2pynqDriverFilter2D_f.K_FP_F)
                    setattr(core, "r%d%d" % (row + 1, col + 1), coefficient)
        self.img_filters.select_filter(2)
        core.start()
        return self.filter2D(src, dst)

    def erode(self, src, kernel, dst, iterations, mode):
        """Select filter 3 and run the erode IP via ``erodeDilateKernel``."""
        self.img_filters.select_filter(3)
        result = self.erodeDilateKernel(src, kernel, dst, iterations, mode,
                                        filter=self.erodeIP)
        return result

    def dilate(self, src, kernel, dst, iterations, mode):
        """Select filter 4 and run the dilate IP via ``erodeDilateKernel``."""
        self.img_filters.select_filter(4)
        result = self.erodeDilateKernel(src, kernel, dst, iterations, mode,
                                        filter=self.dilateIP)
        return result

    def Canny(self, src, threshold1, threshold2, dst):
        """Run the Canny IP on ``src`` and return the result.

        When both ``src`` and ``dst`` expose ``physical_address`` the data is
        moved by DMA directly and ``dst`` is returned. Otherwise the result is
        received into ``cmaBuffer1`` and copied into a new ndarray with the
        shape and dtype of ``src``.
        """
        self.img_filters.select_filter(0)
        core = self.CannyIP
        core.rows = src.shape[0]
        core.columns = src.shape[1]
        core.threshold1 = threshold1
        core.threshold2 = threshold2
        core.start()

        src_is_contiguous = hasattr(src, 'physical_address')
        if src_is_contiguous and hasattr(dst, 'physical_address'):
            self.dmaIn.transfer(dst)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
            return dst

        # Queue the receive first, then send from src or from a staged copy.
        self.cmaBuffer1.nbytes = src.nbytes
        self.dmaIn.transfer(self.cmaBuffer1)
        if src_is_contiguous:
            outgoing = src
        else:
            self.cmaBuffer0.nbytes = src.nbytes
            self.copyNto(self.cmaBuffer0, src, src.nbytes)
            outgoing = self.cmaBuffer0
        self.dmaOut.transfer(outgoing)
        self.dmaIn.wait()

        ret = np.ndarray(src.shape, src.dtype)
        self.copyNto(ret, self.cmaBuffer1, ret.nbytes)
        return ret

    def filter2D(self, src, dst):
        """Stream ``src`` through the already started filter core.

        Three paths, chosen by which arguments expose ``physical_address``:

        * ``src`` and ``dst`` both do: DMA directly and return ``dst``.
        * only ``src`` does: send ``src``, receive into ``cmaBuffer1``.
        * ``src`` does not: copy ``src`` into contiguous memory chunk by chunk,
          alternating between ``cmaBuffer0`` and ``cmaBuffer2`` so the copy of
          one chunk overlaps the DMA send of the previous one.

        In the last two paths the result is copied from ``cmaBuffer1`` into a
        new ndarray with the shape and dtype of ``src``; ``dst`` is ignored.

        Side effect: ``self.partitions`` is overwritten in the chunked path.
        """
        src_is_contiguous = hasattr(src, 'physical_address')
        if src_is_contiguous and hasattr(dst, 'physical_address'):
            self.dmaIn.transfer(dst)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
            return dst

        # The receive length is taken from the buffer's nbytes, so set it on
        # every path (previously it could be stale when dst was a plain
        # array and src was contiguous).
        self.cmaBuffer1.nbytes = src.nbytes
        self.dmaIn.transfer(self.cmaBuffer1)

        if src_is_contiguous:
            self.dmaOut.transfer(src)
        else:  #pipeline the copy to contiguous memory and filter calculation in hardware
            # NOTE(review): the 180000 threshold can never be reached because
            # the 184800 test above it already covers it, so two partitions
            # are never selected. Left as written; confirm intended limits.
            if src.nbytes < 184800:  #440x420
                self.partitions = 1
            elif src.nbytes < 180000:  #600x300
                self.partitions = 2
            elif src.nbytes < 231200:  #680x340
                self.partitions = 4
            else:
                self.partitions = 8
            partitions = self.partitions
            chunk_len = src.nbytes // partitions

            def wait_for_send_channel():
                # Busy-wait until the previous send finished (skipped while
                # the channel still reports its first transfer as pending).
                while not self.dmaOut.idle and not self.dmaOut._first_transfer:
                    pass

            # Chunk k is staged in staging[k % 2].
            staging = (self.cmaBuffer0, self.cmaBuffer2)
            self.cmaBuffer0.nbytes = chunk_len
            self.cmaBuffer2.nbytes = chunk_len
            self.copyNto(staging[0], src, chunk_len)
            for i in range(1, partitions):
                wait_for_send_channel()
                self.dmaOut.transfer(staging[(i - 1) % 2])
                self.copyNtoOff(staging[i % 2], src, chunk_len, 0,
                                i * chunk_len)
            # Send the last staged chunk. With a single partition that is
            # cmaBuffer0, not cmaBuffer2 (which was never filled).
            wait_for_send_channel()
            self.dmaOut.transfer(staging[(partitions - 1) % 2])

            # Bytes left over when nbytes is not a multiple of the partition
            # count: send exactly that many, from the buffer not in flight.
            rest = src.nbytes - partitions * chunk_len
            if rest > 0:
                tail = staging[partitions % 2]
                tail.nbytes = rest
                self.copyNtoOff(tail, src, rest, 0, partitions * chunk_len)
                wait_for_send_channel()
                self.dmaOut.transfer(tail)

        self.dmaIn.wait()
        ret = np.ndarray(src.shape, src.dtype)
        self.copyNto(ret, self.cmaBuffer1, ret.nbytes)
        return ret

    def floatToFixed(self, f, total_bits, fract_bits):
        """Encode ``f`` as sign-and-magnitude fixed point.

        The magnitude is ``abs(f)`` scaled by ``2**fract_bits`` and truncated
        to an integer; for negative ``f`` the top bit of a ``total_bits``-wide
        word (``1 << (total_bits - 1)``) is added on top of it.
        """
        magnitude = int(abs(f) * (1 << fract_bits))
        if not (f < 0):
            return magnitude
        sign_bit = 1 << (total_bits - 1)
        return magnitude + sign_bit

    def erodeDilateKernel(self, src, kernel, dst, iterations, mode, filter):
        """Run an erode/dilate IP over ``src`` for ``iterations`` passes.

        The first pass reads ``src``; each further pass reads the previous
        pass's output, alternating between ``cmaBuffer1`` and ``cmaBuffer2``.
        At least one pass always runs. ``kernel`` is not used.

        When both ``src`` and ``dst`` expose ``physical_address`` the final
        pass is received directly into ``dst``, which is returned. Otherwise
        the final result is copied into a new ndarray with the shape and dtype
        of ``src``.

        Args:
            filter: the IP driver to configure and start (its ``mode``,
                ``rows`` and ``columns`` are set here).
        """
        filter.mode = mode
        filter.rows = src.shape[0]
        filter.columns = src.shape[1]

        src_is_contiguous = hasattr(src, 'physical_address')
        direct = src_is_contiguous and hasattr(dst, 'physical_address')

        # Pass i leaves its output in `odd_pass` for odd i, `even_pass` for
        # even i. Both lengths must be set before the buffers are handed to
        # the DMA (the contiguous path used to leave cmaBuffer1's unset).
        odd_pass, even_pass = self.cmaBuffer1, self.cmaBuffer2
        odd_pass.nbytes = src.nbytes
        even_pass.nbytes = src.nbytes

        # Pass 1: src -> dst (direct, single pass) or src -> cmaBuffer1.
        filter.start()
        if direct and iterations <= 1:
            self.dmaIn.transfer(dst)
        else:
            self.dmaIn.transfer(odd_pass)
        if src_is_contiguous:
            self.dmaOut.transfer(src)
        else:
            self.cmaBuffer0.nbytes = src.nbytes
            self.copyNto(self.cmaBuffer0, src, src.nbytes)
            self.dmaOut.transfer(self.cmaBuffer0)
        self.dmaIn.wait()

        for i in range(2, iterations + 1):
            filter.start()
            if i % 2 == 0:
                previous, target = odd_pass, even_pass
            else:
                previous, target = even_pass, odd_pass
            if direct and i == iterations:
                # Receive the last pass straight into dst to avoid a copy.
                # (The previous code sent dst as the filter input instead,
                # so dst was never written.)
                target = dst
            self.dmaIn.transfer(target)
            self.dmaOut.transfer(previous)
            self.dmaIn.wait()

        if direct:
            return dst

        # With fewer than two passes the result is always in cmaBuffer1.
        if iterations > 1 and iterations % 2 == 0:
            final = even_pass
        else:
            final = odd_pass
        ret = np.ndarray(src.shape, src.dtype)
        self.copyNto(ret, final, ret.nbytes)
        return ret

    '''def cornerHarris(self, src, k, dst):
        self.img_filters.select_filter(5)
        self.cornerHarrisIP.rows = src.shape[0]
        self.cornerHarrisIP.columns = src.shape[1]
        self.cornerHarrisIP.start()
        if hasattr(src, 'physical_address') and hasattr(dst, 'physical_address') and (dst.nbytes == src.nbytes*4):    
            self.dmaIn.transfer(dst)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
            return dst
        
        self.cmaBuffer2.nbytes = src.nbytes*4
        self.dmaIn.transfer(self.cmaBuffer2)
        if hasattr(src, 'physical_address') :
            self.dmaOut.transfer(src)        
        else:
            self.cmaBuffer0.nbytes = src.nbytes
            self.copyNto(self.cmaBuffer0,src,src.nbytes)
            self.dmaOut.transfer(self.cmaBuffer0)        
        self.dmaIn.wait()
        ret = np.ndarray(src.shape,np.float32)
        self.copyNto(ret,self.cmaBuffer2,ret.nbytes)
        return ret'''

    def copyNto(self, dst, src, N):
        """Copy the first ``N`` bytes of buffer ``src`` into buffer ``dst``."""
        ffi = self.ffi
        destination = ffi.cast("uint8_t *", ffi.from_buffer(dst))
        source = ffi.cast("uint8_t *", ffi.from_buffer(src))
        ffi.memmove(destination, source, N)

    def copyNtoOff(self, dst, src, N, dstOffset, srcOffset):
        """Copy ``N`` bytes from ``src`` at ``srcOffset`` to ``dst`` at ``dstOffset``.

        Both offsets are byte offsets from the start of the respective buffer.
        """
        ffi = self.ffi
        destination_base = ffi.cast("uint8_t *", ffi.from_buffer(dst))
        source_base = ffi.cast("uint8_t *", ffi.from_buffer(src))
        ffi.memmove(destination_base + dstOffset, source_base + srcOffset, N)

    class ContiguousArrayCv2pynq(ContiguousArray):
        """ContiguousArray whose reported ``nbytes`` can be overridden."""

        def init(self, cmaArray):
            """Take size, physical address and cacheable flag from ``cmaArray``."""
            self._nbytes = cmaArray.nbytes
            self.physical_address = cmaArray.physical_address
            self.cacheable = cmaArray.cacheable

        def _read_nbytes(self):
            return self._nbytes

        def _write_nbytes(self, value):
            self._nbytes = value

        # overwrite access to nbytes with own function
        nbytes = property(_read_nbytes, _write_nbytes)
示例#10
0
                l = max_area_eye(eyes)
                if l[2] > 0:
                    leftEyeLost = False
                    left_pup_x = l[0]
                    left_pup_y = l[1]

###____________________STARTING MAIN COMPUTATIONAL CHAIN FOR FIRST FRAME____________________

#roi copy of the right eye
        roiRightEye = frame1[int(right_pup_y):int(right_pup_y + W),
                             int(right_pup_x):int(right_pup_x + W)].copy()
        #pointer to image (roi)
        pointerToRoiRightEye = ffi.cast("uint8_t *",
                                        ffi.from_buffer(roiRightEye))
        #transfer image data to buffer
        ffi.memmove(pointIn + 1, pointerToRoiRightEye, W * W * CH)

        #DMA transfer
        dmaOut.transfer(W * W, 1)
        dmaIn.transfer(W * W * CH, 0)

        #roi copy of left eye
        roiLeftEye = frame1[int(left_pup_y):int(left_pup_y + W),
                            int(left_pup_x):int(left_pup_x + W)].copy()
        #pointer to image (roi)
        pointerToRoiLeftEye = ffi.cast("uint8_t *",
                                       ffi.from_buffer(roiLeftEye))
        #transfer image data to buffer
        ffi.memmove(pointIn + 1, pointerToRoiLeftEye, W * W * CH)

        #get image analysed from buffer
示例#11
0
class CTC(object):
    """cffi binding around the warp-ctc shared library.

    The constructor loads the library located by ``get_ctc_lib()``, declares
    its C interface from ``ctc_header()`` and prepares a ``ctcOptions``
    struct. ``bind_to_cpu`` and ``bind_to_gpu`` then call the library's
    ``compute_ctc_loss`` entry point.
    """
    def __init__(self, on_device='cpu', blank_label=0):
        """Load the library and build the options struct.

        Args:
            on_device: ``'cpu'`` or ``'gpu'``. Any other value prints a
                message to stderr and terminates the process with status 1.
            blank_label: value stored in the ``blank_label`` option field.
        """
        libpath = get_ctc_lib()
        self.ffi = FFI()
        self.ffi.cdef(ctc_header())
        self.ctclib = self.ffi.dlopen(libpath)

        supported_devices = ['cpu', 'gpu']
        if on_device not in supported_devices:
            print("the requested device {} is not supported".format(on_device),
                  file=sys.stderr)
            sys.exit(1)
        # Compare by value: `is` on a str only succeeds when both operands
        # happen to be the same interned object, so a dynamically built
        # 'cpu' string would silently have selected the GPU location.
        assign_device = 0 if on_device == 'cpu' else 1

        self.options = self.ffi.new('ctcOptions*', {
            "loc": assign_device,
            "blank_label": blank_label
        })[0]
        self.size_in_bytes = self.ffi.new("size_t*")
        # Filled in by get_gpu_workspace_size() or bind_to_cpu().
        self.nout = None
        self.bsz = None

    def get_buf_size(self, ptr_to_buf):
        """Return the size in bytes of the item type ``ptr_to_buf`` points to."""
        return self.ffi.sizeof(
            self.ffi.getctype(self.ffi.typeof(ptr_to_buf).item))

    def buf_ref_from_array(self, arr):
        """Wrap ``arr.nbytes`` bytes starting at address ``arr.ptr``.

        NOTE(review): ``arr`` presumably is a device array exposing ``ptr``
        and ``nbytes`` -- confirm against the caller.
        """
        return self.ffi.from_buffer(
            self.ffi.buffer(self.ffi.cast('void*', arr.ptr), arr.nbytes))

    def buf_ref_from_ptr(self, ptr, size):
        """Wrap ``size`` bytes starting at the cdata pointer ``ptr``."""
        return self.ffi.from_buffer(self.ffi.buffer(ptr, size))

    def get_gpu_workspace_size(self, lbl_lens, utt_lens, nout, bsz):
        """Query the library for the scratch size and remember nout/bsz.

        Stores ``nout`` and ``bsz`` on the instance (``bind_to_gpu`` reads
        them) and returns the byte count written by ``get_workspace_size``.

        Raises:
            AssertionError: if the library returns a non-zero status.
        """
        self.nout = nout
        self.bsz = bsz
        _lbl_lens = self.ffi.cast("int*", lbl_lens.ravel().ctypes.data)
        _utt_lens = self.ffi.cast("int*", utt_lens.ravel().ctypes.data)

        status = self.ctclib.get_workspace_size(_lbl_lens, _utt_lens,
                                                self.nout, self.bsz,
                                                self.options,
                                                self.size_in_bytes)
        # Status codes are integers: compare by value, not identity.
        assert status == 0, "get_workspace_size() in warp-ctc failed"

        return self.size_in_bytes[0]

    def bind_to_gpu(self, acts, grads, lbls, lbl_lens, utt_lens, costs,
                    workspace, scratch_size, stream):
        """Run ``compute_ctc_loss`` on buffers referenced through ``.ptr``.

        Uses ``self.nout`` and ``self.bsz``, which are only set by
        ``get_gpu_workspace_size`` (or ``bind_to_cpu``), so one of those must
        have been called first.

        Raises:
            AssertionError: if ``grads`` and ``acts`` differ in element count
                or the library returns a non-zero status.
        """

        if stream is None:
            # NOTE(review): this builds a buffer reference around a NULL
            # pointer; kept as in the original -- confirm intent.
            stream_ptr = self.ffi.cast('void*', 0)
            stream_buf_size = self.ffi.sizeof(self.ffi.new_handle(stream))
            stream_buf = self.buf_ref_from_ptr(stream_ptr, stream_buf_size)
        else:
            stream_buf = self.ffi.cast("void*", stream.handle)

        self.options.stream = stream_buf

        flat_dims = np.prod(acts.shape)
        assert np.prod(grads.shape) == flat_dims

        acts_buf = self.ffi.cast("float*", self.buf_ref_from_array(acts))
        grads_buf = self.ffi.cast("float*", self.buf_ref_from_array(grads))
        costs_buf = self.ffi.cast("float*", self.buf_ref_from_array(costs))

        warp_labels = self.ffi.cast("int*", lbls.ravel().ctypes.data)
        warp_label_lens = self.ffi.cast("int*", lbl_lens.ravel().ctypes.data)
        warp_input_lens = self.ffi.cast("int*", utt_lens.ravel().ctypes.data)

        workspace_buf = self.buf_ref_from_ptr(
            self.ffi.cast('void*', workspace), int(scratch_size))

        ctc_status = self.ctclib.compute_ctc_loss(acts_buf, grads_buf,
                                                  warp_labels, warp_label_lens,
                                                  warp_input_lens, self.nout,
                                                  self.bsz, costs_buf,
                                                  workspace_buf, self.options)

        assert ctc_status == 0, "warp-ctc run failed"

    def bind_to_cpu(self,
                    acts,
                    lbls,
                    utt_lens,
                    lbl_lens,
                    grads,
                    costs,
                    n_threads=1):
        """Run ``compute_ctc_loss`` on host arrays.

        ``self.bsz`` and ``self.nout`` are taken from the last two entries of
        the three-element ``acts.shape``. The workspace is allocated on every
        call with the size reported by ``get_workspace_size``. ``grads`` and
        ``costs`` are written in place through their ``ctypes.data`` address.

        Raises:
            AssertionError: if ``grads`` and ``acts`` differ in element count
                or either library call returns a non-zero status.
        """

        self.options.num_threads = n_threads
        _, self.bsz, self.nout = acts.shape
        flat_dims = np.prod(acts.shape)
        assert np.prod(grads.shape) == flat_dims

        acts_buf = self.ffi.cast("float*", acts.ctypes.data)
        grads_buf = self.ffi.cast("float*", grads.ctypes.data)
        costs_buf = self.ffi.cast("float*", costs.ctypes.data)

        warp_grads_buf_size = flat_dims * self.get_buf_size(grads_buf)
        warp_costs_buf_size = self.bsz * self.get_buf_size(costs_buf)

        warp_labels = self.ffi.cast("int*", lbls.ravel().ctypes.data)
        warp_label_lens = self.ffi.cast("int*", lbl_lens.ravel().ctypes.data)
        warp_input_lens = self.ffi.cast("int*", utt_lens.ravel().ctypes.data)
        status = self.ctclib.get_workspace_size(warp_label_lens,
                                                warp_input_lens, self.nout,
                                                self.bsz, self.options,
                                                self.size_in_bytes)

        assert status == 0, "get_workspace_size() in warp-ctc failed"

        # TODO: workspace is a variable size buffer whose size is
        # determined during each call, so we can't initialize ahead
        # of time. Can we avoid this?
        workspace = self.ffi.new("char[]", self.size_in_bytes[0])

        ctc_status = self.ctclib.compute_ctc_loss(acts_buf, grads_buf,
                                                  warp_labels, warp_label_lens,
                                                  warp_input_lens, self.nout,
                                                  self.bsz, costs_buf,
                                                  workspace, self.options)

        # grads_buf / costs_buf already alias the arrays' own memory, so
        # these moves leave the data where the library wrote it.
        self.ffi.memmove(grads, grads_buf, warp_grads_buf_size)
        self.ffi.memmove(costs, costs_buf, warp_costs_buf_size)

        assert ctc_status == 0, "warp-ctc run failed"
示例#12
0
文件: main.py 项目: mikexilinx/XRT
def runKernel(opt):
    """Configure the device, launch the kernel ``count`` times and verify.

    Allocates two device buffers and one exec buffer on ``opt.handle``,
    fills and syncs the data buffers, sends an ERT configure command, then
    issues one start-kernel command per group id and finally reads buffer 1
    back for comparison with a locally built reference.

    Returns:
        1 when a sync, property query or exec submission fails; otherwise
        the function falls off the end (``None``). A data mismatch calls
        ``sys.exit()``.
    """
    count = 1024
    DATA_SIZE = sizeof(c_int64) * count

    ffi = FFI()  # create the FFI obj
    boHandle1 = xclAllocBO(opt.handle, DATA_SIZE, xclBOKind.XCL_BO_DEVICE_RAM,
                           opt.first_mem)
    boHandle2 = xclAllocBO(opt.handle, DATA_SIZE, xclBOKind.XCL_BO_DEVICE_RAM,
                           opt.first_mem)

    bo1 = xclMapBO(opt.handle, boHandle1, True)
    bo2 = xclMapBO(opt.handle, boHandle2, True)

    bo1_fp = ffi.cast("FILE *", bo1)
    bo2_fp = ffi.cast("FILE *", bo2)

    # bo1
    bo1_arr = np.array([0x586C0C6C for _ in range(count)])
    ffi.memmove(bo1_fp, ffi.from_buffer(bo1_arr), count * 5)

    # bo2
    # NOTE(review): np.array(map(...)) and ''.join(buf) below rely on
    # Python 2 semantics (map returning a list, byte strings) -- confirm the
    # target interpreter before porting.
    int_arr = np.array([i * i for i in range(count)])
    bo2_arr = np.array(map(str, int_arr.astype(int)))

    ffi.memmove(bo2_fp, ffi.from_buffer(bo2_arr), count * 7)

    # bufReference
    int_arr_2 = np.array([i * i + i * 16 for i in range(count)])
    str_arr = np.array(map(str, int_arr_2))
    buf = ffi.from_buffer(str_arr)
    bufReference = ''.join(buf)

    if xclSyncBO(opt.handle, boHandle1,
                 xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE, DATA_SIZE, 0):
        return 1

    if xclSyncBO(opt.handle, boHandle2,
                 xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE, DATA_SIZE, 0):
        return 1

    p = xclBOProperties()
    bo1devAddr = p.paddr if not (xclGetBOProperties(opt.handle, boHandle1,
                                                    p)) else -1
    bo2devAddr = p.paddr if not (xclGetBOProperties(opt.handle, boHandle2,
                                                    p)) else -1

    # Value comparison: `is -1` only works through CPython's small-int cache.
    if bo1devAddr == -1 or bo2devAddr == -1:
        return 1

    # Allocate the exec_bo
    execHandle = xclAllocBO(opt.handle, DATA_SIZE,
                            xclBOKind.XCL_BO_SHARED_VIRTUAL, (1 << 31))
    execData = xclMapBO(opt.handle, execHandle, True)  # returns mmap()
    c_f = ffi.cast("FILE *", execData)

    # NOTE(review): a NULL mapping is only reported, execution continues.
    if execData is ffi.NULL:
        print("execData is NULL")

    print("Construct the exe buf cmd to configure FPGA")

    ecmd = ert_configure_cmd()
    ecmd.m_uert.m_cmd_struct.state = 1  # ERT_CMD_STATE_NEW
    ecmd.m_uert.m_cmd_struct.opcode = 2  # ERT_CONFIGURE

    ecmd.slot_size = 1024
    ecmd.num_cus = 1
    ecmd.cu_shift = 16
    ecmd.cu_base_addr = opt.cu_base_addr

    ecmd.m_features.ert = opt.ert
    if opt.ert:
        ecmd.m_features.cu_dma = 1
        ecmd.m_features.cu_isr = 1

    # CU -> base address mapping
    ecmd.data[0] = opt.cu_base_addr
    ecmd.m_uert.m_cmd_struct.count = 5 + ecmd.num_cus

    sz = sizeof(ert_configure_cmd)
    ffi.memmove(c_f, ecmd, sz)
    print("Send the exec command and configure FPGA (ERT)")

    # Send the command.
    if xclExecBuf(opt.handle, execHandle):
        print("Unable to issue xclExecBuf")
        return 1

    print("Wait until the command finish")

    while xclExecWait(opt.handle, 1000) != 0:
        print(".")

    print("Construct the exec command to run the kernel on FPGA")
    # Format before printing: `print(...) % count` applies % to print's
    # return value, which fails when print is a function.
    print(
        "Due to the 1D OpenCL group size, the kernel must be launched %d times"
        % count)

    # construct the exec buffer cmd to start the kernel
    for group_id in range(count):
        start_cmd = ert_start_kernel_cmd()
        # Floor division keeps sizes and indices integral on Python 3 too.
        rsz = XSIMPLE_CONTROL_ADDR_FOO_DATA // 4 + 2  # regmap array size
        new_data = ((start_cmd.data._type_) * rsz)()
        start_cmd.m_uert.m_start_cmd_struct.state = 1  # ERT_CMD_STATE_NEW
        start_cmd.m_uert.m_start_cmd_struct.opcode = 0  # ERT_START_CU
        start_cmd.m_uert.m_start_cmd_struct.count = 1 + rsz
        start_cmd.cu_mask = 0x1

        new_data[XSIMPLE_CONTROL_ADDR_AP_CTRL] = 0x0
        new_data[XSIMPLE_CONTROL_ADDR_GROUP_ID_X_DATA // 4] = group_id
        new_data[XSIMPLE_CONTROL_ADDR_S1_DATA //
                 4] = bo1devAddr & 0xFFFFFFFF  # output
        new_data[XSIMPLE_CONTROL_ADDR_S2_DATA //
                 4] = bo2devAddr & 0xFFFFFFFF  # input
        new_data[XSIMPLE_CONTROL_ADDR_FOO_DATA // 4] = 0x10  # foo
        # First two 32-bit words come from start_cmd, the register map
        # follows directly behind them in the exec buffer.
        ffi.memmove(c_f, start_cmd, 2 * sizeof(c_uint32))

        tmp_buf = ffi.buffer(
            c_f, 2 * sizeof(c_uint32) + (len(new_data) * sizeof(c_uint32)))
        data_ptr = ffi.from_buffer(tmp_buf)
        ffi.memmove(data_ptr + 2 * sizeof(c_uint32), new_data,
                    len(new_data) * sizeof(c_uint32))

        if xclExecBuf(opt.handle, execHandle):
            print("Unable to issue xclExecBuf")
            return 1

        print("Wait until the command finish")

        while xclExecWait(opt.handle, 100) != 0:
            print("reentering wait... \n")

    # get the output xclSyncBO
    print("Get the output data from the device")
    if xclSyncBO(opt.handle, boHandle1,
                 xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE, DATA_SIZE, 0):
        return 1
    rd_buf = ffi.buffer(bo1_fp, count * 7)
    print("RESULT: ")
    # print(rd_buf[:] + "\n")

    if bufReference != rd_buf[:]:
        print("FAILED TEST")
        print("Value read back does not match value written")
        sys.exit()
class cv2pynq():
    MAX_WIDTH = 800
    MAX_HEIGHT = 600

    def __init__(self, load_overlay=True):
        """Load the image-filter overlay and allocate the DMA/CMA resources.

        Downloads the "imgfilter.bit" overlay, restarts both channels of
        ``axi_dma_0``, resets the ``filter2D_hls_0`` core and allocates the
        contiguous buffers used by ``filter2D``.

        Args:
            load_overlay: accepted but not read anywhere in this method.
        """
        # The None assignment is immediately overwritten by the next line.
        self.bitstream_name = None
        self.bitstream_name = "imgfilter.bit"
        # Single-argument join: the path is just the bitstream file name.
        self.bitstream_path = os.path.join(self.bitstream_name)
        self.ol = Overlay(self.bitstream_path)
        self.ol.download()
        self.ol.reset()
        self.xlnk = Xlnk()
        self.partitions = 10  #split the cma into partitions for pipelined transfer
        # True division: this value is a float.
        self.cmaPartitionLen = self.MAX_HEIGHT * self.MAX_WIDTH / self.partitions
        # One uint8 buffer of (MAX_HEIGHT / partitions) x MAX_WIDTH per partition.
        self.listOfcma = [
            self.xlnk.cma_array(shape=(int(self.MAX_HEIGHT / self.partitions),
                                       self.MAX_WIDTH),
                                dtype=np.uint8) for i in range(self.partitions)
        ]
        self.filter2d = self.ol
        self.dmaOut = self.filter2d.axi_dma_0.sendchannel
        self.dmaIn = self.filter2d.axi_dma_0.recvchannel
        # Stop both channels, then start them again (receive side first).
        self.dmaOut.stop()
        self.dmaIn.stop()
        self.dmaIn.start()
        self.dmaOut.start()
        self.filter2DType = -1  # filter types: SobelX=0
        self.ffi = FFI()
        self.f2D = self.filter2d.filter2D_hls_0
        self.f2D.reset()
        # Each cmaBuffer_N is the raw allocation; cmaBufferN is a view of it
        # whose nbytes can be overridden (see ContiguousArrayCv2pynq).
        self.cmaBuffer_0 = self.xlnk.cma_array(shape=(self.MAX_HEIGHT,
                                                      self.MAX_WIDTH),
                                               dtype=np.uint8)
        self.cmaBuffer0 = self.cmaBuffer_0.view(self.ContiguousArrayCv2pynq)
        self.cmaBuffer0.init(self.cmaBuffer_0)
        self.cmaBuffer_1 = self.xlnk.cma_array(shape=(self.MAX_HEIGHT,
                                                      self.MAX_WIDTH),
                                               dtype=np.uint8)
        self.cmaBuffer1 = self.cmaBuffer_1.view(self.ContiguousArrayCv2pynq)
        self.cmaBuffer1.init(self.cmaBuffer_1)
        self.cmaBuffer_2 = self.xlnk.cma_array(
            shape=(self.MAX_HEIGHT * 4, self.MAX_WIDTH),
            dtype=np.uint8)  # *4 for CornerHarris return
        self.cmaBuffer2 = self.cmaBuffer_2.view(self.ContiguousArrayCv2pynq)
        self.cmaBuffer2.init(self.cmaBuffer_2)

    def close(self):
        """Close the three fixed CMA buffers, then every partition buffer."""
        for fixed in (self.cmaBuffer_0, self.cmaBuffer_1, self.cmaBuffer_2):
            fixed.close()
        for partition in self.listOfcma:
            partition.close()

    def copyNto(self, dst, src, N):
        """Copy the first N bytes of ``src``'s buffer into ``dst``'s buffer.

        Both objects are exposed through ``ffi.from_buffer`` and viewed as
        ``uint8_t *``. No bounds checking is performed here.
        """
        ffi = self.ffi
        target = ffi.cast("uint8_t *", ffi.from_buffer(dst))
        origin = ffi.cast("uint8_t *", ffi.from_buffer(src))
        ffi.memmove(target, origin, N)

    def copyNtoOff(self, dst, src, N, dstOffset, srcOffset):
        """Copy N bytes between two buffer objects, starting at byte offsets.

        Both ``dst`` and ``src`` are exposed through ``ffi.from_buffer`` and
        viewed as ``uint8_t *``; the copy starts ``dstOffset`` bytes into
        ``dst`` and ``srcOffset`` bytes into ``src``. No bounds checking is
        performed here.
        """
        ffi = self.ffi
        target = ffi.cast("uint8_t *", ffi.from_buffer(dst)) + dstOffset
        origin = ffi.cast("uint8_t *", ffi.from_buffer(src)) + srcOffset
        ffi.memmove(target, origin, N)

    def _waitDmaOutReady(self):
        """Busy-wait until the send channel is idle or has not been used yet."""
        while not self.dmaOut.idle and not self.dmaOut._first_transfer:
            pass

    def filter2D(self, src, dst):
        """Stream ``src`` through the filter core and return the result.

        Three paths:

        * ``src`` and ``dst`` both expose ``physical_address``: the DMA reads
          ``src`` and writes ``dst`` directly; ``dst`` is returned.
        * only ``src`` exposes ``physical_address``: the result is received
          in ``cmaBuffer1`` and copied into a new ndarray.
        * otherwise ``src`` is copied chunk by chunk into two alternating
          staging buffers so that one chunk is being sent while the next one
          is being copied; the result is received in ``cmaBuffer1`` and
          copied into a new ndarray.

        Args:
            src: input image (anything ``ffi.from_buffer`` accepts, with
                ``nbytes``, ``shape`` and ``dtype``).
            dst: optional output buffer; only used when both buffers expose
                ``physical_address``.

        Returns:
            ``dst`` in the zero-copy path, otherwise a new ``np.ndarray`` with
            the shape and dtype of ``src``.
        """
        if dst is None:
            self.cmaBuffer1.nbytes = src.nbytes
        elif hasattr(src, 'physical_address') and hasattr(
                dst, 'physical_address'):
            self.dmaIn.transfer(dst)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
            return dst
        if hasattr(src, 'physical_address'):
            self.dmaIn.transfer(self.cmaBuffer1)
            self.dmaOut.transfer(src)
            self.dmaIn.wait()
        else:  #pipeline the copy to contiguous memory and filter calculation in hardware
            if src.nbytes < 184800:  #440x420
                self.partitions = 1
            # NOTE(review): unreachable -- anything below 180000 is already
            # caught by the 184800 test above. Thresholds left untouched
            # because they are hardware tuning values; confirm intent.
            elif src.nbytes < 180000:  #600x300
                self.partitions = 2
            elif src.nbytes < 231200:  #680x340
                self.partitions = 4
            else:
                self.partitions = 8
            self.cmaBuffer1.nbytes = src.nbytes
            self.dmaIn.transfer(self.cmaBuffer1)
            chunks_len = src.nbytes // self.partitions
            self.cmaBuffer0.nbytes = chunks_len
            self.cmaBuffer2.nbytes = chunks_len
            # `staged` holds the chunk to send next, `spare` is free to be
            # refilled. Data flows src -> staging buffer (copyNto takes the
            # destination first); the previous calls had the two swapped and
            # overwrote the caller's image instead.
            staged, spare = self.cmaBuffer0, self.cmaBuffer2
            self.copyNto(staged, src, chunks_len)
            for i in range(1, self.partitions):
                self._waitDmaOutReady()
                self.dmaOut.transfer(staged)
                self.copyNtoOff(spare, src, chunks_len, 0, i * chunks_len)
                staged, spare = spare, staged
            self._waitDmaOutReady()
            # Send whichever buffer was filled last (cmaBuffer0 when there is
            # a single partition, cmaBuffer2 for 2/4/8 partitions).
            self.dmaOut.transfer(staged)
            rest = src.nbytes % self.partitions
            if rest != 0:  #cleanup any remaining data and send it to HW
                # Only `rest` bytes are left in src; copying a full chunk
                # would read past its end and send too many bytes.
                spare.nbytes = rest
                self.copyNtoOff(spare, src, rest, 0,
                                self.partitions * chunks_len)
                self._waitDmaOutReady()
                self.dmaOut.transfer(spare)
            self.dmaIn.wait()
        ret = np.ndarray(src.shape, src.dtype)
        self.copyNto(ret, self.cmaBuffer1, ret.nbytes)
        return ret

    class ContiguousArrayCv2pynq(ContiguousArray):
        """ContiguousArray view whose reported ``nbytes`` can be overridden."""

        def init(self, cmaArray):
            """Take over size, physical address and cacheable flag of cmaArray."""
            self._nbytes = cmaArray.nbytes
            for attr in ("physical_address", "cacheable"):
                setattr(self, attr, getattr(cmaArray, attr))

        # nbytes is served from the private _nbytes field so that it can be
        # reassigned after construction.
        @property
        def nbytes(self):
            return self._nbytes

        @nbytes.setter
        def nbytes(self, size):
            self._nbytes = size

    def Sobel(self, src, ddepth, dx, dy, dst, ksize):
        """Program the filter core with the kernel below and run filter2D.

        Only ``ksize == 3`` is handled; any other size returns ``None``
        without touching the hardware. ``ddepth``, ``dx`` and ``dy`` are not
        read by this method.
        """
        if ksize != 3:
            return None
        self.filter2DType = 0
        core = self.f2D
        core.rows = src.shape[0]
        core.columns = src.shape[1]
        core.channels = 1
        # Kernel rows as written to the core's r1..r3 registers.
        core.r1 = 0x000100ff  #[-1  0  1]
        core.r2 = 0x000200fe  #[-2  0  2]
        core.r3 = 0x000100ff  #[-1  0  1]
        core.start()
        return self.filter2D(src, dst)

    def Sobel1(self, src, ddepth, dx, dy, dst, ksize):
        """Program the filter core with the kernel below and run filter2D.

        Only acts when ``ksize == 3`` and the core is not already set to
        filter type 1. ``ddepth``, ``dx`` and ``dy`` are not read in the
        visible part of this method.

        NOTE(review): as far as this excerpt shows, the ``return`` sits
        inside the ``filter2DType != 1`` guard, so a second call with the
        type already set to 1 would neither filter nor return a result.
        Unlike ``Sobel``, rows/columns are also only written inside that
        guard. Confirm whether the return was meant to be one level out.
        """
        if (ksize == 3):
            # Reprogram the core only when the filter type changes.
            if self.filter2DType != 1:
                self.filter2DType = 1
                self.f2D.rows = src.shape[0]
                self.f2D.columns = src.shape[1]
                self.f2D.channels = 1
                self.f2D.r1 = 0x00fffeff  #[-1 -2 -1]
                self.f2D.r2 = 0x00000000  #[ 0  0  0]
                self.f2D.r3 = 0x00010201  #[ 1  2  1]
                self.f2D.start()
                return self.filter2D(src, dst)