Пример #1
0
def RunDPU(kernel, img, count):
    """
    Run repeated DPU inference rounds on a single image.

    kernel: DPU kernel the task is created from
    img: image to be run
    count: starting value of the round counter; rounds are executed while it
        is below 1000 and it advances by the module-level ``threadnum`` after
        every round
    """
    # Create the DPU task from the DPU kernel.
    task = n2cube.dpuCreateTask(kernel, 0)
    try:
        while count < 1000:
            # Load image to DPU
            dputils.dpuSetInputImage2(task, KERNEL_CONV_INPUT, img)
            # Model run on DPU
            n2cube.dpuRunTask(task)
            # Output tensor size and channel count of the FC output
            size = n2cube.dpuGetOutputTensorSize(task, KERNEL_FC_OUTPUT)
            channel = n2cube.dpuGetOutputTensorChannel(task, KERNEL_FC_OUTPUT)
            # FC result address and its output scale
            conf = n2cube.dpuGetOutputTensorAddress(task, KERNEL_FC_OUTPUT)
            outputScale = n2cube.dpuGetOutputTensorScale(task,
                                                         KERNEL_FC_OUTPUT)
            # Run softmax; the result is not used by this function, so no
            # buffer is pre-allocated for it.
            n2cube.dpuRunSoftmax(conf, channel, size // channel, outputScale)

            # The context manager releases the lock even if the update raises.
            with l:
                count = count + threadnum
    finally:
        # Destroy the DPU task and free its resources, also on error.
        n2cube.dpuDestroyTask(task)
    def prediction(self, image):
        """Run one image through the DPU and return the three output maps.

        The image is pre-processed with ``Detector.pre_process`` to 416x416,
        converted to float32 and fed to ``CONV_INPUT_NODE``. Each of the three
        output nodes is read back as float32 and reshaped to
        ``(1, grid, grid, 75)`` for grid sizes 13, 26 and 52.

        Returns:
            list: ``[conv_out1, conv_out2, conv_out3]`` in that order.
        """
        task = n2cube.dpuCreateTask(self.kernel, 0)
        try:
            image_data = Detector.pre_process(image, (416, 416))
            image_data = np.array(image_data, dtype=np.float32)
            input_len = n2cube.dpuGetInputTensorSize(task, CONV_INPUT_NODE)

            # Set the input tensor from the pre-processed image.
            n2cube.dpuSetInputTensorInHWCFP32(task, CONV_INPUT_NODE,
                                              image_data, input_len)

            # Model run on DPU
            n2cube.dpuRunTask(task)

            # Fetch and reshape each output tensor.
            outputs = []
            for node, shape in (
                (CONV_OUTPUT_NODE1, (1, 13, 13, 75)),
                (CONV_OUTPUT_NODE2, (1, 26, 26, 75)),
                (CONV_OUTPUT_NODE3, (1, 52, 52, 75)),
            ):
                out_size = n2cube.dpuGetOutputTensorSize(task, node)
                raw = n2cube.dpuGetOutputTensorInHWCFP32(task, node, out_size)
                outputs.append(np.reshape(raw, shape))
            return outputs
        finally:
            # A task is created on every call, so it must be released on every
            # call as well; otherwise each prediction leaks a task.
            n2cube.dpuDestroyTask(task)
Пример #3
0
 def dpuFlowerPredictSoftmax(self, img_input):
     """Classify one image on the DPU and return the arg-max of the softmax.

     The image is resized to ``self.dpuImgSize`` squared with cubic
     interpolation, converted to float32 and divided by 255 when its maximum
     exceeds 1, then run through ``self.dpuTaks``.
     """
     resized = cv2.resize(img_input, (self.dpuImgSize, self.dpuImgSize),
                          interpolation=cv2.INTER_CUBIC)
     pixels = np.array(resized, dtype='float32')
     if np.max(pixels) > 1:
         pixels = pixels / 255.

     # Length of the input data: product of the first three dimensions.
     dims = pixels.shape
     n_values = dims[0] * dims[1] * dims[2]

     # Set the DPU task input tensor from the CPU-side array, then launch it.
     n2cube.dpuSetInputTensorInHWCFP32(self.dpuTaks, self.dpuInputNode,
                                       pixels, n_values)
     n2cube.dpuRunTask(self.dpuTaks)

     # Original author's note: the code below has some issues; softmax needs
     # four parameters; it could later be moved to dpuFlowerSetSoftmax.
     out_address = n2cube.dpuGetOutputTensorAddress(self.dpuTaks,
                                                    self.dpuOutputNode)
     n_channels = n2cube.dpuGetOutputTensorChannel(self.dpuTaks,
                                                   self.dpuOutputNode)
     out_scale = n2cube.dpuGetOutputTensorScale(self.dpuTaks,
                                                self.dpuOutputNode)
     out_size = n2cube.dpuGetOutputTensorSize(self.dpuTaks,
                                              self.dpuOutputNode)

     probabilities = n2cube.dpuRunSoftmax(out_address, n_channels,
                                          out_size // n_channels, out_scale)
     return np.argmax(probabilities, axis=0)
Пример #4
0
    def run_dpu_task(self, kernel, i, iter_cnt, listimage, result):
        """Classify ``iter_cnt`` images on the DPU and record the top-5 classes.

        Image names are consumed from the front of ``listimage[i]`` (the list
        is mutated). Each image is read from ``image_folder``, centre-cropped,
        resized to ``IMAGE_SIZE`` and min-max normalised to [-1, 1] before
        inference. For every image, five strings of the form
        ``"<img_name> <class_index>"`` are appended to ``result[i]``, ordered
        by descending softmax score.
        """
        task = n2cube.dpuCreateTask(kernel, 0)
        try:
            # The input tensor size is a property of the task, so query it
            # once rather than on every iteration.
            input_len = n2cube.dpuGetInputTensorSize(task, KERNEL_CONV_INPUT)

            count = 0
            while count < iter_cnt:
                img_name = listimage[i].pop(0)
                path = os.path.join(image_folder, img_name)
                im = cv2.imread(path)
                im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
                h, w = im.shape[:2]

                # Centre crop, leaving a CROP_PADDING margin relative to
                # IMAGE_SIZE.
                crop_size = int((IMAGE_SIZE / (IMAGE_SIZE + CROP_PADDING)) * min(h, w))
                offset_height = ((h - crop_size) + 1) // 2
                offset_width = ((w - crop_size) + 1) // 2
                image_crop = im[offset_height: crop_size + offset_height,
                                offset_width: crop_size + offset_width, :]
                image = cv2.resize(image_crop, dsize=(IMAGE_SIZE, IMAGE_SIZE),
                                   interpolation=cv2.INTER_CUBIC)
                im = cv2.normalize(image, None, alpha=-1, beta=1,
                                   norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
                count = count + 1

                n2cube.dpuSetInputTensorInHWCFP32(task, KERNEL_CONV_INPUT, im, input_len)
                n2cube.dpuRunTask(task)

                size = n2cube.dpuGetOutputTensorSize(task, KERNEL_FC_OUTPUT)
                channel = n2cube.dpuGetOutputTensorChannel(task, KERNEL_FC_OUTPUT)
                conf = n2cube.dpuGetOutputTensorAddress(task, KERNEL_FC_OUTPUT)
                outputScale = n2cube.dpuGetOutputTensorScale(task, KERNEL_FC_OUTPUT)
                softmax = n2cube.dpuRunSoftmax(conf, channel, size // channel, outputScale)

                # Select the five largest scores, then order them descending.
                idx = np.argpartition(softmax, -5)[-5:]
                top_5 = idx[np.argsort(-softmax[idx])]
                for val in top_5:
                    result[i].append(f"{img_name} {val}")
        finally:
            # Release the task even if reading or inference fails part-way.
            n2cube.dpuDestroyTask(task)
Пример #5
0
def main():
    """Classify every ``.png`` file in ``calib_image_dir`` on the DPU.

    For each image the input is loaded through ``graph_input_fn.calib_input``,
    flattened, run through the ``KERNEL_CONV`` kernel, passed through softmax
    and handed to ``TopK`` together with ``calib_image_list``.

    The DPU task, kernel and driver attachment are released in reverse order
    of acquisition, including when an image fails to process.
    """
    from contextlib import ExitStack

    with ExitStack() as cleanup:
        # Attach to DPU driver and prepare for running
        n2cube.dpuOpen()
        cleanup.callback(n2cube.dpuClose)

        # Create DPU kernel for the CONV node
        kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
        cleanup.callback(n2cube.dpuDestroyKernel, kernel)

        # Create DPU task for the CONV node
        task = n2cube.dpuCreateTask(kernel, 0)
        cleanup.callback(n2cube.dpuDestroyTask, task)

        for image_name in os.listdir(calib_image_dir):
            path = os.path.join(calib_image_dir, image_name)
            if os.path.splitext(path)[1] != ".png":
                continue
            print("Loading %s" % image_name)

            # Load image and set it as the CONV task input (flattened)
            imageRun = graph_input_fn.calib_input(path)
            imageRun = imageRun.reshape((imageRun.shape[0]*imageRun.shape[1]*imageRun.shape[2]))
            input_len = len(imageRun)
            n2cube.dpuSetInputTensorInHWCFP32(task, CONV_INPUT_NODE, imageRun, input_len)

            # Launch the task
            n2cube.dpuRunTask(task)

            # Output tensor address, channel count, size and scale of CONV
            conf = n2cube.dpuGetOutputTensorAddress(task, CONV_OUTPUT_NODE)
            channel = n2cube.dpuGetOutputTensorChannel(task, CONV_OUTPUT_NODE)
            size = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE)
            scale = n2cube.dpuGetOutputTensorScale(task, CONV_OUTPUT_NODE)

            # Caller-provided output buffer for the softmax result.
            # NOTE(review): this is the five-argument dpuRunSoftmax form that
            # writes into `softmax`; confirm it matches the installed n2cube.
            softmax = [0] * size
            batchSize = size // channel

            # Calculate softmax and show TOP5 classification result
            n2cube.dpuRunSoftmax(conf, softmax, channel, batchSize, scale)
            TopK(softmax, calib_image_list)
Пример #6
0
def run_dpu_task(outsize, task, outputchannel, conf, outputscale, listimage,
                 imageRun, KERNEL_CONV_INPUT, KERNEL_FC_OUTPUT):
    """Run one inference on an existing DPU task and return its softmax.

    ``imageRun`` is written to the ``KERNEL_CONV_INPUT`` node, the task is
    launched, and softmax is computed from the pre-fetched output address
    ``conf`` using ``outputchannel``, ``outsize`` and ``outputscale``.
    ``KERNEL_FC_OUTPUT`` is accepted but not used here.

    Returns:
        tuple: ``(softmax, listimage)``; ``listimage`` is passed through
        unchanged.
    """
    n_values = len(imageRun)
    n2cube.dpuSetInputTensorInHWCFP32(
        task, KERNEL_CONV_INPUT, imageRun, n_values)
    n2cube.dpuRunTask(task)

    batch = outsize // outputchannel
    probabilities = n2cube.dpuRunSoftmax(conf, outputchannel, batch,
                                         outputscale)
    return probabilities, listimage
Пример #7
0
def predict_label(imfile):
    """Download an image from S3, classify it on the DPU and return its label.

    The object ``imfile`` is fetched from ``BUCKET`` into ``image_folder``,
    resized to ``IMG_DIMS`` squared, scaled to [0, 1] as float32 and run
    through the module-level ``kernel``.

    Returns:
        str: the entry of ``slabels`` at the arg-max of the softmax, with
        newline characters stripped from both ends.
    """
    # Build the local path once so the download target and the file read back
    # are always the same, even when `image_folder` has no trailing separator.
    img_path = os.path.join(image_folder, imfile)
    s3_client.download_file(BUCKET, imfile, img_path)

    img = cv2.imread(img_path)
    img = cv2.resize(img, (IMG_DIMS, IMG_DIMS))
    img = img.astype(np.float32)
    img = img / 255.0

    # Create the task only once the input is ready, and always release it.
    task = n2cube.dpuCreateTask(kernel, 0)
    try:
        # Set input tensor
        input_len = n2cube.dpuGetInputTensorSize(task, KERNEL_CONV_INPUT)
        n2cube.dpuSetInputTensorInHWCFP32(task, KERNEL_CONV_INPUT, img, input_len)

        # Model run on DPU
        n2cube.dpuRunTask(task)

        # Output tensor size, channel count, address and scale of FC output
        size = n2cube.dpuGetOutputTensorSize(task, KERNEL_FC_OUTPUT)
        channel = n2cube.dpuGetOutputTensorChannel(task, KERNEL_FC_OUTPUT)
        conf = n2cube.dpuGetOutputTensorAddress(task, KERNEL_FC_OUTPUT)
        outputScale = n2cube.dpuGetOutputTensorScale(task, KERNEL_FC_OUTPUT)

        # Run softmax
        softmax = n2cube.dpuRunSoftmax(conf, channel, size // channel, outputScale)
    finally:
        n2cube.dpuDestroyTask(task)

    return slabels[np.argmax(softmax)].strip('\n')
Пример #8
0
 def dpuPredictSoftmax(self, img_input):
     """Classify one image on the DPU and return the arg-max of the softmax.

     The image is resized to ``self.dpuImgSize`` squared with cubic
     interpolation, converted to float32 and divided by 255 when its maximum
     exceeds 1, then run through ``self.dpuTask``.
     """
     target_size = (self.dpuImgSize, self.dpuImgSize)
     resized = cv2.resize(img_input, target_size,
                          interpolation=cv2.INTER_CUBIC)
     pixels = np.array(resized, dtype='float32')
     if np.max(pixels) > 1:
         pixels = pixels / 255.

     # Input length is the product of the first three dimensions.
     dims = pixels.shape
     n_values = dims[0] * dims[1] * dims[2]

     n2cube.dpuSetInputTensorInHWCFP32(self.dpuTask, self.dpuInputNode,
                                       pixels, n_values)
     n2cube.dpuRunTask(self.dpuTask)

     # Gather what softmax needs from the output node.
     out_address = n2cube.dpuGetOutputTensorAddress(self.dpuTask,
                                                    self.dpuOutputNode)
     n_channels = n2cube.dpuGetOutputTensorChannel(self.dpuTask,
                                                   self.dpuOutputNode)
     out_scale = n2cube.dpuGetOutputTensorScale(self.dpuTask,
                                                self.dpuOutputNode)
     out_size = n2cube.dpuGetOutputTensorSize(self.dpuTask,
                                              self.dpuOutputNode)

     probabilities = n2cube.dpuRunSoftmax(out_address, n_channels,
                                          out_size // n_channels, out_scale)
     return np.argmax(probabilities, axis=0)
Пример #9
0
def accel_fused(kernel_name, input_name, output_name, layout, out, *ins):
    """Run a DPU kernel on ``ins[0]`` and copy the scaled result into ``out``.

    The first input is flattened and written to ``input_name``. The raw
    output of ``output_name`` is read back, multiplied by the output tensor
    scale as float32, reshaped to an NHWC shape derived from ``out.shape``
    and, for ``layout == 'NCHW'``, transposed before being copied to ``out``.
    """
    # Attach to the DPU driver, load the kernel and create its task.
    n2cube.dpuOpen()
    dpu_kernel = n2cube.dpuLoadKernel(kernel_name)
    task = n2cube.dpuCreateTask(dpu_kernel, 0)

    # Feed the flattened first input and run the model.
    flat_input = ins[0].asnumpy().reshape((-1))
    n2cube.dpuSetInputTensorInHWCFP32(task, input_name, flat_input,
                                      len(flat_input))
    n2cube.dpuRunTask(task)

    # Read the raw output into a pre-sized buffer.
    n_out = n2cube.dpuGetOutputTensorSize(task, output_name)
    out_address = n2cube.dpuGetOutputTensorAddress(task, output_name)
    raw = [0] * n_out
    n2cube.dpuGetTensorData(out_address, raw, n_out)

    # Apply the output scale.
    out_scale = n2cube.dpuGetOutputTensorScale(task, output_name, idx=0)
    result = np.array(raw).astype(np.float32) * float(out_scale)

    # The DPU output is in NHWC; pick the NHWC view of the target shape.
    if layout == 'NHWC':
        nhwc_shape = tuple(out.shape)
    else:
        nhwc_shape = (out.shape[0], out.shape[2], out.shape[3], out.shape[1])
    result = np.reshape(result, nhwc_shape)

    if layout == 'NCHW':
        result = np.transpose(result, (0, 3, 1, 2))

    tvm.nd.array(result).copyto(out)
0
def main():
    """Run segmentation over the test set and report per-class IoU.

    Test images and labels are loaded with ``dpu_get_data``. When ``USE_DPU``
    is set, each image is flattened, run through the ``KERNEL_CONV`` kernel
    and passed through softmax; the per-image predictions are then compared
    with the labels via ``dpu_IoU``.

    The DPU task, kernel and driver attachment are released when the function
    exits, including on error.
    """
    print("STARTING UNETv2 on DPU...")

    kernel = None
    task = None
    if USE_DPU:
        # Attach to DPU driver
        n2cube.dpuOpen()

    try:
        if USE_DPU:
            # Load DPU Kernel and create a task
            kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
            task = n2cube.dpuCreateTask(kernel, 0)

        # load and preprocess images and load segmentation labels
        assert os.path.isdir(IMG_TEST_DIR)
        x_test, y_test, img_file, seg_file = dpu_get_data(IMG_TEST_DIR,
                                                          SEG_TEST_DIR,
                                                          cfg.NUM_CLASSES,
                                                          cfg.WIDTH, cfg.HEIGHT)

        y_pred = []
        # process all images
        for i, imageRun in enumerate(x_test):
            # flatten the image to a 1-D buffer
            imageRun = imageRun.reshape(
                (imageRun.shape[0] * imageRun.shape[1] * imageRun.shape[2]))
            input_len = len(imageRun)

            if USE_DPU:
                # load pre-processed image as DPU input
                n2cube.dpuSetInputTensorInHWCFP32(task, CONV_INPUT_NODE,
                                                  imageRun, input_len)
                if i == 0:
                    # Input tensor metadata is only reported once, so it is
                    # only queried once.
                    dpu_in = n2cube.dpuGetInputTensor(task, CONV_INPUT_NODE)
                    ti_scale = n2cube.dpuGetTensorScale(dpu_in)
                    ti_h = n2cube.dpuGetTensorHeight(dpu_in)
                    ti_w = n2cube.dpuGetTensorWidth(dpu_in)
                    ti_sz = n2cube.dpuGetTensorSize(dpu_in)
                    ti_ch = n2cube.dpuGetTensorChannel(dpu_in)
                    print(
                        "Input  tensor=%3d ch=%3d H=%3d W=%3d Size=%6d scale=%4d" %
                        (i, ti_ch, ti_h, ti_w, ti_sz, ti_scale))

                # run DPU task
                n2cube.dpuRunTask(task)

                # output tensor address, channels, size, width, height, scale
                dpu_out = n2cube.dpuGetOutputTensorAddress(task, CONV_OUTPUT_NODE)
                to_ch = n2cube.dpuGetOutputTensorChannel(task, CONV_OUTPUT_NODE)
                to_sz = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE)
                to_w = n2cube.dpuGetOutputTensorWidth(task, CONV_OUTPUT_NODE)
                to_h = n2cube.dpuGetOutputTensorHeight(task, CONV_OUTPUT_NODE)
                to_scale = n2cube.dpuGetOutputTensorScale(task, CONV_OUTPUT_NODE)

                if i == 0:
                    print("Output tensor=%3d ch=%3d H=%3d W=%3d Size=%6d" %
                          (i, to_ch, to_h, to_w, to_sz))
                    print("Output tensor scaling factor", to_scale)

                softmax = n2cube.dpuRunSoftmax(dpu_out, to_ch, to_sz // to_ch,
                                               to_scale)

                prediction = softmax.reshape((to_h, to_w, to_ch))

                y_pred.append(prediction)
                if i == 0:
                    print("prediction shape: ", prediction.shape)

        # Calculate intersection over union for each segmentation class
        y_pred = np.asarray(y_pred)
        y_test = np.asarray(y_test)
        print("y_pred shape: ", y_pred.shape)
        print("y_test shape: ", y_test.shape)

        y_predi = np.argmax(y_pred, axis=3)
        y_testi = np.argmax(y_test, axis=3)
        print("shape of y_testi and y_predi ", y_testi.shape, y_predi.shape)

        dpu_IoU(y_testi, y_predi)

        # print results
        print("Processed", len(x_test), "images")
        print("FINISHED")
    finally:
        if USE_DPU:
            # Destroy the task before its kernel, then detach from the driver.
            if task is not None:
                n2cube.dpuDestroyTask(task)
            if kernel is not None:
                n2cube.dpuDestroyKernel(kernel)
            n2cube.dpuClose()
Пример #11
0
def runDPU(preedQueue, dpuresQueue, threadnum):
    """Consume pre-processed frames, run them on the DPU and queue the outputs.

    For each of ``runtimes`` iterations one item is taken from ``preedQueue``;
    its element 1 is written to the DPU as an int8 input and its element 0 is
    forwarded unchanged. The eight output tensors are read back as int8,
    reshaped and put on ``dpuresQueue`` as
    ``[element_0, out_0, ..., out_7]``. ``threadnum`` is accepted but not
    used here.
    """
    kernel_name = "testnet"
    input_node = "ssd_mobilenet_v2_conv2d_conv2d_conv2d_Conv2D"

    # Four classification heads followed by four localisation heads.
    output_nodes = [
        "ssd_mobilenet_v2_block8_box_conv_cls_2_conv_cls_2_Conv2D",
        "ssd_mobilenet_v2_block7_box_conv_cls_2_conv_cls_2_Conv2D",
        "ssd_mobilenet_v2_block6_box_conv_cls_2_conv_cls_2_Conv2D",
        "ssd_mobilenet_v2_block5_box_conv_cls_2_conv_cls_2_Conv2D",
        "ssd_mobilenet_v2_block8_box_conv_loc_2_conv_loc_2_Conv2D",
        "ssd_mobilenet_v2_block7_box_conv_loc_2_conv_loc_2_Conv2D",
        "ssd_mobilenet_v2_block6_box_conv_loc_2_conv_loc_2_Conv2D",
        "ssd_mobilenet_v2_block5_box_conv_loc_2_conv_loc_2_Conv2D",
    ]

    # Target shape for each output node, in the same order as above.
    output_shapes = [
        (1, 56, 96, 6, 10),
        (1, 28, 48, 6, 10),
        (1, 14, 24, 6, 10),
        (1, 7, 12, 6, 10),
        (1, 56, 96, 6, 4),
        (1, 28, 48, 6, 4),
        (1, 14, 24, 6, 4),
        (1, 7, 12, 6, 4),
    ]

    # Attach to the DPU driver, load the kernel and create its task.
    n2cube.dpuOpen()
    dpu_kernel = n2cube.dpuLoadKernel(kernel_name)
    task = n2cube.dpuCreateTask(dpu_kernel, 0)

    # Query every output tensor size once, up front.
    output_sizes = []
    for idx, node in enumerate(output_nodes):
        output_sizes.append(n2cube.dpuGetOutputTensorSize(task, node))
        print("outputdata %d is %d" % (idx, output_sizes[idx]))

    print("finish set dpu")

    dpu_seconds = 0
    loop_started = time.time()
    for _ in range(runtimes):
        item = preedQueue.get()
        orgimg = item[0]
        imgo = item[1]

        round_started = time.time()

        n2cube.dpuSetInputTensorInHWCInt8(task, input_node, imgo,
                                          1032192)  #448*768*3
        n2cube.dpuRunTask(task)

        # The original image travels with the outputs as the first element.
        outputData = [orgimg]
        for node, size, shape in zip(output_nodes, output_sizes,
                                     output_shapes):
            raw = n2cube.dpuGetOutputTensorInHWCInt8(task, node, size)
            outputData.append(np.reshape(raw, shape))

        round_finished = time.time()
        dpu_seconds += round_finished - round_started
        dpuresQueue.put(outputData)
    loop_finished = time.time()
    print("all dpu time out is cost", (loop_finished - loop_started))
    print("all dpu cost time is", dpu_seconds)

    n2cube.dpuDestroyTask(task)
Пример #12
0
#        path = image_folder + listimage[i]
#        img = cv2.imread(path)
#        image = cv2.imread(path)	

        # Fragment of a larger routine: `image` and `task` are defined outside
        # the visible code. It runs one image through the DPU and collects
        # the three output tensors in `yolo_outputs`.

        # Original image height/width, kept alongside the (h, w) pair.
        image_ho, image_wo, _ = image.shape
        image_size = image.shape[:2]
        # Pre-process the image to 416x416 via `pre_process`.
        image_data = pre_process(image, (416, 416))
        
        image_data = np.array(image_data,dtype=np.float32)
        input_len = n2cube.dpuGetInputTensorSize(task, CONV_INPUT_NODE)
     
        """Get input Tesor"""
        # Write the float32 image into the input node.
        n2cube.dpuSetInputTensorInHWCFP32(task,CONV_INPUT_NODE,image_data,input_len)
    
        """Model run on DPU"""
        n2cube.dpuRunTask(task)
       
        """Get the output tensor"""   
        # First output node, reshaped to a 13x13 grid with 75 channels.
        conv_sbbox_size = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE1)
        conv_out1 = n2cube.dpuGetOutputTensorInHWCFP32(task, CONV_OUTPUT_NODE1, conv_sbbox_size)
        conv_out1 = np.reshape(conv_out1, (1, 13, 13, 75))
        
        # Second output node, reshaped to a 26x26 grid with 75 channels.
        conv_mbbox_size = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE2)
        conv_out2 = n2cube.dpuGetOutputTensorInHWCFP32(task, CONV_OUTPUT_NODE2, conv_mbbox_size)
        conv_out2 = np.reshape(conv_out2, (1, 26, 26, 75))
            
        # Third output node, reshaped to a 52x52 grid with 75 channels.
        conv_lbbox_size = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE3)
        conv_out3 = n2cube.dpuGetOutputTensorInHWCFP32(task, CONV_OUTPUT_NODE3, conv_lbbox_size)
        conv_out3 = np.reshape(conv_out3, (1, 52, 52, 75))
        
        yolo_outputs = [conv_out1, conv_out2, conv_out3]    
Пример #13
0
def main():
  """Boot the DPU and camera, then classify thrown objects in an endless loop.

  Start-up: loads the DPU overlay and model, creates the N2Cube kernel and
  task, declares the ctypes signatures of the camera library, pushes the
  acquisition settings from ``fh`` and retries camera initialization until it
  succeeds.

  Main loop: starts an acquisition in a thread, waits for the throw to end,
  converts and classifies up to ``fh.frames_to_consider`` frames, combines
  the per-frame predictions with a weighting window and sends the top
  results plus the most representative frame to the UI.
  """

  # UI: DPU
  ui = UI()
  ui.update_boot_window('Initializing DPU...')

  from dnndk import n2cube
  from pynq_dpu import DpuOverlay

  # Set up the DPU IP
  overlay = DpuOverlay(str(fh.dir_dpu / fh.dpu_bit_file))
  overlay.load_model(str(fh.dir_dpu / fh.dpu_assembly_file))

  # Set up the Neural Network Runtime (N2Cube)
  kernel_name = fh.kernel_name

  kernel_conv_input = fh.kernel_conv_input
  kernel_fc_output = fh.kernel_fc_output

  n2cube.dpuOpen()
  kernel = n2cube.dpuLoadKernel(kernel_name)
  task = n2cube.dpuCreateTask(kernel, 0)

  input_tensor_size = n2cube.dpuGetInputTensorSize(task, kernel_conv_input)

  output_tensor_size = n2cube.dpuGetOutputTensorSize(task, kernel_fc_output)
  output_tensor_channel = n2cube.dpuGetOutputTensorChannel(task, kernel_fc_output)
  output_tensor_address = n2cube.dpuGetOutputTensorAddress(task, kernel_fc_output)
  output_tensor_scale = n2cube.dpuGetOutputTensorScale(task, kernel_fc_output)

  # UI: Camera
  ui.update_boot_window('Initializing Camera...')

  # libcamera
  libcamera = ctypes.CDLL(fh.dir_cam / fh.libcamera_file)

  # Getter
  libcamera.get_frame_ptr.restype = ctypes.POINTER(ctypes.c_ubyte)
  libcamera.get_frame_ptr.argtypes = [ctypes.c_uint]
  libcamera.get_throw_bgn_idx.restype = ctypes.c_uint
  libcamera.get_throw_bgn_idx.argtypes = None
  libcamera.get_throw_end_idx.restype = ctypes.c_uint
  libcamera.get_throw_end_idx.argtypes = None
  libcamera.get_throw_bgn.restype = ctypes.c_bool
  libcamera.get_throw_bgn.argtypes = None
  libcamera.get_throw_end.restype = ctypes.c_bool
  libcamera.get_throw_end.argtypes = None

  # Setter
  libcamera.set_frame_rate.restype = None
  libcamera.set_frame_rate.argtypes = [ctypes.c_double]
  libcamera.set_buff_size.restype = None
  libcamera.set_buff_size.argtypes = [ctypes.c_uint]
  libcamera.set_exposure_time.restype = None
  libcamera.set_exposure_time.argtypes = [ctypes.c_double]
  libcamera.set_camera_gain.restype = None
  libcamera.set_camera_gain.argtypes = [ctypes.c_double]
  libcamera.set_avg_diffs.restype = None
  libcamera.set_avg_diffs.argtypes = [ctypes.c_uint]
  libcamera.set_threshold_mult.restype = None
  libcamera.set_threshold_mult.argtypes = [ctypes.c_double]
  libcamera.set_frames_to_acquire.restype = None
  libcamera.set_frames_to_acquire.argtypes = [ctypes.c_uint]

  # Camera
  libcamera.initialize.restype = ctypes.c_int
  libcamera.initialize.argtypes = None
  libcamera.reset_global_variables.restype = None
  libcamera.reset_global_variables.argtypes = None
  libcamera.start_acquisition.restype = ctypes.c_int
  libcamera.start_acquisition.argtypes = None
  libcamera.terminate.restype = ctypes.c_int
  libcamera.terminate.argtypes = None

  # Set the global variables according to the module `fhnwtoys.settings`
  libcamera.set_frame_rate(fh.frame_rate)
  libcamera.set_buff_size(fh.buff_size)
  libcamera.set_exposure_time(fh.exposure_time)
  libcamera.set_camera_gain(fh.camera_gain)
  libcamera.set_avg_diffs(fh.avg_diffs)
  libcamera.set_threshold_mult(fh.threshold_mult)
  libcamera.set_frames_to_acquire(fh.frames_to_acquire)

  # Initialize Camera
  initialize = fh.ReturnCodes.NOT_INITIALIZED
  initialization_tries = 0

  # Retry until the camera reports success; from the second attempt on, show
  # the previous return code (by name when it is a known code) in the UI.
  while initialize != fh.ReturnCodes.SUCCESS:
    if initialization_tries > 0:
      try:
        return_code = fh.ReturnCodes(initialize).name
      except ValueError:
        return_code = initialize
      ui.update_boot_window(f'Camera Error ({return_code}), try to replug the camera.')
    initialize = libcamera.initialize()
    initialization_tries += 1

  # UI: Ready
  ui.update_boot_window('READY')

  # Set up the `frames` array
  frames = np.empty((fh.frames_to_consider,) + fh.bgr_shape, dtype=np.uint8)

  while True:
    # Reset the predictions
    predictions = np.zeros((fh.frames_to_consider, fh.num_objects), dtype=np.float32)

    # Start acquisition (threaded)
    # todo: error handling ('Unexpected Error, system reboot required.')
    # start_acquisition = libcamera.start_acquisition() # non threaded approach
    t = Thread(target=libcamera.start_acquisition) # threaded approach (process due to ctypes)
    t.start()

    # Wait until the throw has ended (the Ultra96-V2 is not powerful enough to process the data during the acquisition)
    while not libcamera.get_throw_end():
      pass

    throw_bgn_idx = libcamera.get_throw_bgn_idx()
    throw_end_idx = libcamera.get_throw_end_idx()

    num_frames = throw_end_idx - throw_bgn_idx - 1 # Ignore the last two captured frames

    # Image processing (including inference)
    for idx, frame_id in enumerate(range(throw_bgn_idx, throw_end_idx - 1)):
      frame_ptr = libcamera.get_frame_ptr(frame_id)
      raw_frame = np.ctypeslib.as_array(frame_ptr, shape=fh.raw_shape) # Raw Baumer BayerRG8 frame
      # Transform Baumer BayerRG8 to BGR8 (Baumer BayerRG = OpenCV BayerBG)
      frames[idx] = cv2.cvtColor(raw_frame, cv2.COLOR_BayerBG2BGR) # Color space conversion
      # Image scaling using nearest-neighbor interpolation
      frame_resized = cv2.resize(frames[idx], fh.inf_dsize, interpolation=fh.Interpolation.NEAREST)
      frame_inference = frame_resized.astype(np.float32) / 255.0 # Normalization (float32 precision)

      # Inference
      n2cube.dpuSetInputTensorInHWCFP32(task, kernel_conv_input, frame_inference, input_tensor_size)
      n2cube.dpuRunTask(task)

      # Softmax function (normalized exponential function)
      # Confident predictions lead to all zeros and a NaN, when run through `n2cube.dpuRunSoftmax(.)`
      # This section replaces the first occurrence of NaN in the `prediction` array with 1.0 and sets everything else to 0.0
      prediction = n2cube.dpuRunSoftmax(output_tensor_address, output_tensor_channel, output_tensor_size//output_tensor_channel, output_tensor_scale)
      nan = np.isnan(prediction)
      if nan.any():
        nan_idx = nan.argmax() # returns the index of the first occurrence of NaN
        prediction = np.zeros((fh.num_objects,), dtype=np.float32)
        prediction[nan_idx] = 1.0
      predictions[idx] = prediction

      # Only consider `fh.frames_to_consider` frames
      if idx == fh.frames_to_consider - 1: # (-1: idx starts with 0)
        break

    num_frames_considered = min(fh.frames_to_consider, num_frames)

    window = sine_squared_window(num_frames, num_frames_considered) # weighting function
    weighted_prediction = np.matmul(window, predictions) / np.sum(window) # computation of the weighted prediction

    # UI: Prepare data for the UI
    weighted_prediction_percent = weighted_prediction * 100
    weighted_prediction_sorted = np.sort(weighted_prediction_percent)[::-1]
    weighted_prediction_argsorted = np.argsort(weighted_prediction_percent)[::-1]

    # this is the index of the best guess (computed by weighting the `fh.frames_to_consider` frames)
    guess_idx = weighted_prediction_argsorted[0]

    relevant_pct_ui = np.asarray(weighted_prediction_percent >= 1.0).nonzero()[0] # value of prediction must be at least 1.0%
    relevant_pct_ui_len = len(relevant_pct_ui)
    predictions_ui_len = min(4, relevant_pct_ui_len) # show at most Top 4

    predictions_ui = [] # the object names
    percentages_ui = np.empty((predictions_ui_len + 1,), dtype=np.float32) # the percentages (+1: 'Others')
    for i, w in enumerate(weighted_prediction_argsorted[0:predictions_ui_len]):
      predictions_ui.append(fh.objects_ui[w])
      percentages_ui[i] = weighted_prediction_percent[w]

    # the object names
    predictions_ui.append('Others')

    # the percentages
    percentages_ui[-1] = np.sum(weighted_prediction_sorted[predictions_ui_len:])
    percentages_ui = lrm_round(percentages_ui)

    # the frame: pick the frame whose weighted score for the best guess is highest
    weighted_guesses = np.multiply(window, predictions[:, guess_idx])
    frame_ui_idx = weighted_guesses.argmax()

    frame_ui_resized = cv2.resize(frames[frame_ui_idx], fh.ui_dsize, interpolation=fh.Interpolation.NEAREST)
    _, frame_ui_png = cv2.imencode('.png', frame_ui_resized)
    frame_ui = frame_ui_png.tobytes() # the frame

    # UI: Show results
    if percentages_ui[-1] == 0.0:
      predictions_ui = predictions_ui[:-1]
      percentages_ui = percentages_ui[:-1]

    # UI: Inference
    ui.update_inference_window(predictions_ui, percentages_ui, frame_ui)

    # Wait until the camera thread (process due to ctypes) is terminated
    t.join()

    # Reset the global variables (has to be done manually to avoid race conditions)
    libcamera.reset_global_variables()

  # Under regular circumstances, this section should never be reached

  # Terminate Camera
  terminate = libcamera.terminate()

  # Clean up the DPU IP: release the task before the kernel it was created
  # from, then detach from the driver opened above.
  n2cube.dpuDestroyTask(task)
  n2cube.dpuDestroyKernel(kernel)
  n2cube.dpuClose()
def main():
    """Measure the average per-stage processing time of one throw and log it.

    Sets up the DPU (overlay, kernel, task) and the camera library, starts the
    acquisition in a background thread and busy-waits until the library
    reports that the throw has ended. Every considered frame is then pushed
    through the pipeline (raw fetch, color transform, resize, normalize,
    DPU inference, softmax) with each stage timed individually, followed by a
    single timed weighting step. The average stage times, their sum and the
    resulting frame rate are printed and written to
    ``fh.dir_verification / 'throughput.log'``.

    Exits the process with status 1 if the camera cannot be initialized.
    """

    # Set up the DPU IP
    overlay = DpuOverlay(str(fh.dir_dpu / fh.dpu_bit_file))
    overlay.load_model(str(fh.dir_dpu / fh.dpu_assembly_file))

    # Set up the Neural Network Runtime (N2Cube)
    kernel_conv_input = fh.kernel_conv_input
    kernel_fc_output = fh.kernel_fc_output

    # NOTE(review): dpuOpen() is never paired with a dpuClose() in this
    # function - confirm whether the runtime requires one.
    n2cube.dpuOpen()
    kernel = n2cube.dpuLoadKernel(fh.kernel_name)
    task = n2cube.dpuCreateTask(kernel, 0)

    input_tensor_size = n2cube.dpuGetInputTensorSize(task, kernel_conv_input)

    output_tensor_size = n2cube.dpuGetOutputTensorSize(task, kernel_fc_output)
    output_tensor_channel = n2cube.dpuGetOutputTensorChannel(
        task, kernel_fc_output)
    output_tensor_address = n2cube.dpuGetOutputTensorAddress(
        task, kernel_fc_output)
    output_tensor_scale = n2cube.dpuGetOutputTensorScale(
        task, kernel_fc_output)

    # libcamera (str(): same path handling as for the DPU files above; CDLL
    # only documents path-like names from Python 3.12 on)
    libcamera = ctypes.CDLL(str(fh.dir_cam / fh.libcamera_file))

    # Declare the return types of the C functions; ctypes would otherwise
    # assume a C int for every one of them.
    libcamera.get_frame_ptr.restype = ctypes.POINTER(ctypes.c_ubyte)
    libcamera.get_throw_bgn_idx.restype = ctypes.c_uint
    libcamera.get_throw_end_idx.restype = ctypes.c_uint
    libcamera.get_throw_bgn.restype = ctypes.c_bool
    libcamera.get_throw_end.restype = ctypes.c_bool

    libcamera.set_frame_rate.restype = None
    libcamera.set_buff_size.restype = None
    libcamera.set_exposure_time.restype = None
    libcamera.set_camera_gain.restype = None
    libcamera.set_avg_diffs.restype = None
    libcamera.set_threshold_mult.restype = None
    libcamera.set_frames_to_acquire.restype = None

    libcamera.initialize.restype = ctypes.c_int
    libcamera.start_acquisition.restype = ctypes.c_int
    libcamera.terminate.restype = ctypes.c_int

    # Set up of variables
    frames = np.empty((fh.frames_to_consider, ) + fh.bgr_shape, dtype=np.uint8)

    # Initialize Camera
    status = libcamera.initialize()

    if status != fh.ReturnCodes.SUCCESS:
        try:
            return_code = fh.ReturnCodes(status).name
        except ValueError:
            # Unknown code: report the raw integer instead
            return_code = status
        print(f'Initialization failed: {return_code}')
        # Release the DPU resources acquired above (task before its kernel)
        # and signal the failure through a non-zero exit status.
        n2cube.dpuDestroyTask(task)
        n2cube.dpuDestroyKernel(kernel)
        sys.exit(1)

    print(
        '================================= READY ================================='
    )

    # Reset predictions
    predictions = np.zeros((fh.frames_to_consider, fh.num_objects),
                           dtype=np.float32)

    # Start acquisition (Threaded)
    t = Thread(target=libcamera.start_acquisition)
    t.start()

    # Wait until the throw has ended (busy-wait: polls without sleeping)
    while not libcamera.get_throw_end():
        pass

    stages = [
        'Get raw bayer', 'Transform color', 'Resize', 'Normalize',
        'Run inference', 'Softmax', 'Weighting'
    ]
    # One record of 'start'/'end' timestamp lists per stage
    meas_time = {s: get_dict() for s in stages}

    throw_bgn_idx = libcamera.get_throw_bgn_idx()
    throw_end_idx = libcamera.get_throw_end_idx()

    num_frames = throw_end_idx - throw_bgn_idx - 1  # Ignore the last two captured frames
    # Never process more frames than the `frames`/`predictions` buffers hold
    num_frames_considered = min(fh.frames_to_consider, num_frames)

    frame_ids = range(throw_bgn_idx, throw_bgn_idx + num_frames_considered)
    for idx, frame_id in enumerate(frame_ids):

        meas_time['Get raw bayer']['start'].append(datetime.now())
        frame_ptr = libcamera.get_frame_ptr(frame_id)
        raw_frame = np.ctypeslib.as_array(frame_ptr, shape=fh.raw_shape)
        meas_time['Get raw bayer']['end'].append(datetime.now())

        # Transform Baumer BayerRG8 to BGR8 (Baumer BayerRG ≙ OpenCV BayerBG)
        meas_time['Transform color']['start'].append(datetime.now())
        frames[idx] = cv2.cvtColor(raw_frame, cv2.COLOR_BayerBG2BGR)
        meas_time['Transform color']['end'].append(datetime.now())

        meas_time['Resize']['start'].append(datetime.now())
        frame_resized = cv2.resize(frames[idx],
                                   fh.inf_dsize,
                                   interpolation=fh.Interpolation.NEAREST)
        meas_time['Resize']['end'].append(datetime.now())

        meas_time['Normalize']['start'].append(datetime.now())
        frame_inference = frame_resized.astype(np.float32) / 255.0
        meas_time['Normalize']['end'].append(datetime.now())

        meas_time['Run inference']['start'].append(datetime.now())
        n2cube.dpuSetInputTensorInHWCFP32(task, kernel_conv_input,
                                          frame_inference, input_tensor_size)
        n2cube.dpuRunTask(task)
        meas_time['Run inference']['end'].append(datetime.now())

        # n2cube.dpuRunSoftmax(.) sometimes returns all zeros except one NaN
        # This section replaces the first occurrence of NaN in the prediction
        # array with 1.0 and sets everything else to 0.0
        meas_time['Softmax']['start'].append(datetime.now())
        prediction = n2cube.dpuRunSoftmax(
            output_tensor_address, output_tensor_channel,
            output_tensor_size // output_tensor_channel, output_tensor_scale)
        nan = np.isnan(prediction)
        if nan.any():
            nan_idx = nan.argmax()  # index of the first occurrence of NaN
            prediction = np.zeros((fh.num_objects, ), dtype=np.float32)
            prediction[nan_idx] = 1.0
        predictions[idx] = prediction
        meas_time['Softmax']['end'].append(datetime.now())

    # The weighted result itself is not used here; only its cost is measured
    meas_time['Weighting']['start'].append(datetime.now())
    window = sine_window(num_frames, num_frames_considered)  # weighting
    weighted_prediction = np.matmul(window, predictions) / np.sum(window)
    meas_time['Weighting']['end'].append(datetime.now())

    # Average duration of every stage in milliseconds; a stage without any
    # sample (no frame processed) is reported as 0.0 instead of dividing by 0
    avg_ms = {}
    for stage, stamps in meas_time.items():
        durations = [(end - start).total_seconds() * 1000
                     for start, end in zip(stamps['start'], stamps['end'])]
        avg_ms[stage] = sum(durations) / len(durations) if durations else 0.0

    total_ms = sum(avg_ms.values())
    # No measurable time means no meaningful rate; report 0 rather than crash
    frame_rate = 1000 / total_ms if total_ms > 0 else 0.0

    # create output file
    width = max(len(s) for s in stages)  # pad the labels to a common width
    lines = [f'Number of captured frames: {num_frames_considered}\n\n']
    lines.extend(f'{s}:{" " * (width - len(s))} {avg_ms[s]:.3f} ms\n'
                 for s in stages)
    lines.append(f'\nSum:{" " * (width - len("Sum"))} {total_ms:.3f} ms\n')
    lines.append(
        f'Frame rate:{" " * (width - len("Frame rate"))} {frame_rate:.3f} fps\n'
    )
    output = ''.join(lines)

    print(output)

    with open(fh.dir_verification / 'throughput.log', 'w') as f:
        f.write(output)

    # Wait until the camera thread (process due to ctypes) is terminated
    t.join()

    # Terminate Camera
    libcamera.terminate()

    # Clean up the DPU IP: destroy the task before the kernel it was
    # created from
    n2cube.dpuDestroyTask(task)
    n2cube.dpuDestroyKernel(kernel)