def dpuFlowerPredictSoftmax(self, img_input):
    img_scale = cv2.resize(img_input, (self.dpuImgSize, self.dpuImgSize),
                           interpolation=cv2.INTER_CUBIC)
    img1_scale = np.array(img_scale, dtype='float32')
    if np.max(img1_scale) > 1:
        img1_scale = img1_scale / 255.
    # Length of the input data
    input_len = img1_scale.shape[0] * img1_scale.shape[1] * img1_scale.shape[2]
    # input_len = n2cube.dpuGetInputTensorSize(task, KERNEL_CONV_INPUT)
    # Set the DPU task input tensor with data from a CPU memory block
    n2cube.dpuSetInputTensorInHWCFP32(self.dpuTask, self.dpuInputNode,
                                      img1_scale, input_len)
    # Launch the running of the DPU task
    n2cube.dpuRunTask(self.dpuTask)
    # The code below still has some issues: dpuRunSoftmax needs four
    # parameters, and this part could later be moved into dpuFlowerSetSoftmax
    conf = n2cube.dpuGetOutputTensorAddress(self.dpuTask, self.dpuOutputNode)
    channel = n2cube.dpuGetOutputTensorChannel(self.dpuTask, self.dpuOutputNode)
    outScale = n2cube.dpuGetOutputTensorScale(self.dpuTask, self.dpuOutputNode)
    size = n2cube.dpuGetOutputTensorSize(self.dpuTask, self.dpuOutputNode)
    softmax = n2cube.dpuRunSoftmax(conf, channel, size // channel, outScale)
    pdt = np.argmax(softmax, axis=0)
    return pdt
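# A minimal setup sketch for the attributes the method above relies on
# (self.dpuTask, self.dpuInputNode, self.dpuOutputNode, self.dpuImgSize).
# The kernel name, node names, and image size below are illustrative
# assumptions, not taken from the original source.
def dpuFlowerInit(self):
    n2cube.dpuOpen()                                 # attach to the DPU driver
    self.dpuKernel = n2cube.dpuLoadKernel('flower')  # hypothetical kernel name
    self.dpuTask = n2cube.dpuCreateTask(self.dpuKernel, 0)
    self.dpuInputNode = 'conv2d_1_convolution'       # hypothetical node names
    self.dpuOutputNode = 'dense_1_MatMul'
    self.dpuImgSize = 224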
def RunDPU(kernel, img, count):
    """DPU run function.

    kernel: DPU kernel
    img:    image to be run
    count:  test rounds count
    """
    # Create a DPU task from the DPU kernel
    task = n2cube.dpuCreateTask(kernel, 0)
    while count < 1000:
        # Load the image into the DPU input tensor
        dputils.dpuSetInputImage2(task, KERNEL_CONV_INPUT, img)
        # Get the input tensor
        tensor = n2cube.dpuGetInputTensor(task, KERNEL_CONV_INPUT)
        # Run the model on the DPU
        n2cube.dpuRunTask(task)
        # Get the output tensor size from the FC output
        size = n2cube.dpuGetOutputTensorSize(task, KERNEL_FC_OUTPUT)
        # Get the output tensor channel count from the FC output
        channel = n2cube.dpuGetOutputTensorChannel(task, KERNEL_FC_OUTPUT)
        # Get the FC result
        conf = n2cube.dpuGetOutputTensorAddress(task, KERNEL_FC_OUTPUT)
        # Get the output scale of the FC layer
        outputScale = n2cube.dpuGetOutputTensorScale(task, KERNEL_FC_OUTPUT)
        # Run softmax on the CPU
        softmax = n2cube.dpuRunSoftmax(conf, channel, size // channel, outputScale)
        # `l` (a lock) and `threadnum` are module-level globals shared across threads
        l.acquire()
        count = count + threadnum
        l.release()
    # Destroy the DPU task and free its resources
    n2cube.dpuDestroyTask(task)
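# A minimal sketch of the threaded harness RunDPU expects: the globals `l`
# (a lock) and `threadnum`, plus one thread per starting offset so the shared
# round counter advances by `threadnum` per iteration. The kernel name and
# image path are illustrative assumptions.
import threading

l = threading.Lock()
threadnum = 4

n2cube.dpuOpen()
kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
img = cv2.imread('test.jpg')
threads = [threading.Thread(target=RunDPU, args=(kernel, img, n))
           for n in range(threadnum)]
for t in threads:
    t.start()
for t in threads:
    t.join()
n2cube.dpuDestroyKernel(kernel)
n2cube.dpuClose()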
def run_dpu_task(self, kernel, i, iter_cnt, listimage, result):
    task = n2cube.dpuCreateTask(kernel, 0)
    count = 0
    while count < iter_cnt:
        img_name = listimage[i].pop(0)
        path = os.path.join(image_folder, img_name)
        # Load the image and convert BGR (OpenCV default) to RGB
        im = cv2.imread(path)
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        # Padded center crop (EfficientNet-style evaluation preprocessing)
        h, w = im.shape[:2]
        padded_center_crop_size = int((IMAGE_SIZE / (IMAGE_SIZE + CROP_PADDING)) * min(h, w))
        offset_height = ((h - padded_center_crop_size) + 1) // 2
        offset_width = ((w - padded_center_crop_size) + 1) // 2
        image_crop = im[offset_height:padded_center_crop_size + offset_height,
                        offset_width:padded_center_crop_size + offset_width, :]
        image = cv2.resize(image_crop, dsize=(IMAGE_SIZE, IMAGE_SIZE),
                           interpolation=cv2.INTER_CUBIC)
        # Normalize pixel values to [-1, 1]
        im = cv2.normalize(image, None, alpha=-1, beta=1,
                           norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        count = count + 1
        # Feed the DPU input tensor and run inference
        input_len = n2cube.dpuGetInputTensorSize(task, KERNEL_CONV_INPUT)
        n2cube.dpuSetInputTensorInHWCFP32(task, KERNEL_CONV_INPUT, im, input_len)
        n2cube.dpuRunTask(task)
        # Compute softmax on the FC output and record the top-5 classes
        size = n2cube.dpuGetOutputTensorSize(task, KERNEL_FC_OUTPUT)
        channel = n2cube.dpuGetOutputTensorChannel(task, KERNEL_FC_OUTPUT)
        conf = n2cube.dpuGetOutputTensorAddress(task, KERNEL_FC_OUTPUT)
        outputScale = n2cube.dpuGetOutputTensorScale(task, KERNEL_FC_OUTPUT)
        softmax = n2cube.dpuRunSoftmax(conf, channel, size // channel, outputScale)
        idx = np.argpartition(softmax, -5)[-5:]
        top_5 = idx[np.argsort(-softmax[idx])]
        for val in top_5:
            result[i].append("{} {}".format(img_name, val))
    n2cube.dpuDestroyTask(task)
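# A minimal sketch of a multi-threaded driver for run_dpu_task (assumptions:
# `runner` is the instance that owns run_dpu_task, `work_list` holds the image
# file names, and two worker threads split the work; all names illustrative).
import threading

thread_num = 2
listimage = [work_list[n::thread_num] for n in range(thread_num)]  # per-thread queues
result = [[] for _ in range(thread_num)]
threads = [threading.Thread(target=runner.run_dpu_task,
                            args=(kernel, n, len(listimage[n]), listimage, result))
           for n in range(thread_num)]
for t in threads:
    t.start()
for t in threads:
    t.join()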
def main():
    # Attach to the DPU driver and prepare for running
    n2cube.dpuOpen()
    # Create the DPU kernel for the CONV node in miniResNet
    kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
    # Create the DPU task for the CONV node in miniResNet
    task = n2cube.dpuCreateTask(kernel, 0)
    listimage = os.listdir(calib_image_dir)
    for i in range(len(listimage)):
        path = os.path.join(calib_image_dir, listimage[i])
        if os.path.splitext(path)[1] != ".png":
            continue
        print("Loading %s" % listimage[i])
        # Load the image and set it as input of the CONV task
        imageRun = graph_input_fn.calib_input(path)
        imageRun = imageRun.reshape(
            (imageRun.shape[0] * imageRun.shape[1] * imageRun.shape[2]))
        input_len = len(imageRun)
        n2cube.dpuSetInputTensorInHWCFP32(task, CONV_INPUT_NODE, imageRun, input_len)
        # Launch the miniResNet task
        n2cube.dpuRunTask(task)
        # Get output tensor address, channel count, size, and scale of CONV
        conf = n2cube.dpuGetOutputTensorAddress(task, CONV_OUTPUT_NODE)
        channel = n2cube.dpuGetOutputTensorChannel(task, CONV_OUTPUT_NODE)
        size = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE)
        scale = n2cube.dpuGetOutputTensorScale(task, CONV_OUTPUT_NODE)
        batchSize = size // channel
        # Calculate softmax and show the top-5 classification result
        softmax = n2cube.dpuRunSoftmax(conf, channel, batchSize, scale)
        TopK(softmax, calib_image_list)
    # Destroy the DPU task and kernel, then detach from the DPU driver
    n2cube.dpuDestroyTask(task)
    n2cube.dpuDestroyKernel(kernel)
    n2cube.dpuClose()
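# A minimal sketch of the TopK helper called above (assumption: it prints the
# k highest-probability entries of the softmax output against a label list;
# the original implementation is not shown in this section).
def TopK(softmax, label_list, k=5):
    top_idx = np.argsort(-np.asarray(softmax))[:k]
    for rank, j in enumerate(top_idx, start=1):
        print("Top%d: prob = %.6f, class = %s" % (rank, softmax[j], label_list[j]))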
def predict_label(imfile):
    task = n2cube.dpuCreateTask(kernel, 0)
    # Fetch the file from S3
    s3_client.download_file(BUCKET, imfile, image_folder + imfile)
    img_obj = os.path.join(image_folder, imfile)
    # To read it from a local path instead:
    # img_obj = os.path.join(image_folder, imfile)
    img = cv2.imread(img_obj)
    img = cv2.resize(img, (IMG_DIMS, IMG_DIMS))
    img = img.astype(np.float32)
    img = img / 255.0
    # Get the input tensor and its size
    tensor = n2cube.dpuGetInputTensor(task, KERNEL_CONV_INPUT)
    input_len = n2cube.dpuGetInputTensorSize(task, KERNEL_CONV_INPUT)
    # Set the input tensor
    n2cube.dpuSetInputTensorInHWCFP32(task, KERNEL_CONV_INPUT, img, input_len)
    # Run the model on the DPU
    n2cube.dpuRunTask(task)
    # Get the output tensor size and channel count from the FC output
    size = n2cube.dpuGetOutputTensorSize(task, KERNEL_FC_OUTPUT)
    channel = n2cube.dpuGetOutputTensorChannel(task, KERNEL_FC_OUTPUT)
    # Get the FC result address and output scale
    conf = n2cube.dpuGetOutputTensorAddress(task, KERNEL_FC_OUTPUT)
    outputScale = n2cube.dpuGetOutputTensorScale(task, KERNEL_FC_OUTPUT)
    # Run softmax on the CPU
    softmax = n2cube.dpuRunSoftmax(conf, channel, size // channel, outputScale)
    n2cube.dpuDestroyTask(task)
    return slabels[np.argmax(softmax)].strip('\n')
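# A minimal sketch of the module-level setup predict_label depends on
# (`kernel`, `s3_client`, and the `slabels` label list); the bucket and file
# names are illustrative assumptions.
import boto3

BUCKET = 'my-model-bucket'          # hypothetical S3 bucket
s3_client = boto3.client('s3')
with open('labels.txt', 'r') as f:  # one class label per line
    slabels = f.readlines()
n2cube.dpuOpen()
kernel = n2cube.dpuLoadKernel(KERNEL_CONV)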
def dpuPredictSoftmax(self, img_input):
    img_scale = cv2.resize(img_input, (self.dpuImgSize, self.dpuImgSize),
                           interpolation=cv2.INTER_CUBIC)
    img1_scale = np.array(img_scale, dtype='float32')
    if np.max(img1_scale) > 1:
        img1_scale = img1_scale / 255.
    input_len = img1_scale.shape[0] * img1_scale.shape[1] * img1_scale.shape[2]
    n2cube.dpuSetInputTensorInHWCFP32(self.dpuTask, self.dpuInputNode,
                                      img1_scale, input_len)
    n2cube.dpuRunTask(self.dpuTask)
    conf = n2cube.dpuGetOutputTensorAddress(self.dpuTask, self.dpuOutputNode)
    channel = n2cube.dpuGetOutputTensorChannel(self.dpuTask, self.dpuOutputNode)
    outScale = n2cube.dpuGetOutputTensorScale(self.dpuTask, self.dpuOutputNode)
    size = n2cube.dpuGetOutputTensorSize(self.dpuTask, self.dpuOutputNode)
    softmax = n2cube.dpuRunSoftmax(conf, channel, size // channel, outScale)
    pdt = np.argmax(softmax, axis=0)
    return pdt
def accel_fused(kernel_name, input_name, output_name, layout, out, *ins):
    # Attach to the DPU driver and prepare for running
    n2cube.dpuOpen()
    # Create the DPU kernel
    kernel = n2cube.dpuLoadKernel(kernel_name)
    # Create a DPU task for the kernel
    task = n2cube.dpuCreateTask(kernel, 0)
    # Load the input into the DPU
    X = ins[0].asnumpy().reshape((-1))
    n2cube.dpuSetInputTensorInHWCFP32(task, input_name, X, len(X))
    # Run the model on the DPU
    n2cube.dpuRunTask(task)
    # Get the output tensor size and address
    size = n2cube.dpuGetOutputTensorSize(task, output_name)
    address = n2cube.dpuGetOutputTensorAddress(task, output_name)
    value = [0 for i in range(size)]
    # Get the output tensor data and dequantize it with the output scale
    n2cube.dpuGetTensorData(address, value, size)
    scale = n2cube.dpuGetOutputTensorScale(task, output_name, idx=0)
    value = np.array(value).astype(np.float32) * float(scale)
    value_shape = tuple(out.shape) if layout == 'NHWC' else \
        (out.shape[0], out.shape[2], out.shape[3], out.shape[1])
    value = np.reshape(value, value_shape)
    # DPU output is in NHWC; transpose if the consumer expects NCHW
    if layout == 'NCHW':
        value = np.transpose(value, (0, 3, 1, 2))
    tvm.nd.array(value).copyto(out)
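# An illustrative direct invocation of accel_fused (assumptions: the kernel
# and node names are placeholders, the output buffer shape matches whatever
# model was compiled, and layout 'NHWC' is used so no transpose is needed).
import numpy as np
import tvm

x = tvm.nd.array(np.random.rand(1, 224, 224, 3).astype(np.float32))
y = tvm.nd.array(np.zeros((1, 1000), dtype=np.float32))  # preallocated output
accel_fused("resnet50_kernel", "conv1_node", "fc1000_node", "NHWC", y, x)
print(y.asnumpy().argmax())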
def main():
    print("STARTING UNETv2 on DPU...")

    if USE_DPU:
        # Attach to the DPU driver
        n2cube.dpuOpen()
        # Load the DPU kernel and create a task
        kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
        task = n2cube.dpuCreateTask(kernel, 0)

    # Load and preprocess images and load segmentation labels
    assert os.path.isdir(IMG_TEST_DIR)
    x_test, y_test, img_file, seg_file = dpu_get_data(IMG_TEST_DIR, SEG_TEST_DIR,
                                                      cfg.NUM_CLASSES, cfg.WIDTH,
                                                      cfg.HEIGHT)
    y_pred = []
    # Process all images
    for i in range(len(x_test)):
        # The image was opened as BGR; to convert it to RGB:
        # B, G, R = cv2.split(x_test[i])
        # imageRun = cv2.merge((R, G, B))
        imageRun = x_test[i]
        imageRun = imageRun.reshape(
            (imageRun.shape[0] * imageRun.shape[1] * imageRun.shape[2]))
        input_len = len(imageRun)
        if USE_DPU:
            # Load the pre-processed image as DPU input
            n2cube.dpuSetInputTensorInHWCFP32(task, CONV_INPUT_NODE, imageRun, input_len)
            dpu_in = n2cube.dpuGetInputTensor(task, CONV_INPUT_NODE)
            ti_scale = n2cube.dpuGetTensorScale(dpu_in)
            ti_h = n2cube.dpuGetTensorHeight(dpu_in)
            ti_w = n2cube.dpuGetTensorWidth(dpu_in)
            ti_sz = n2cube.dpuGetTensorSize(dpu_in)
            ti_ch = n2cube.dpuGetTensorChannel(dpu_in)
            if i == 0:
                print("Input tensor=%3d ch=%3d H=%3d W=%3d Size=%6d scale=%4d"
                      % (i, ti_ch, ti_h, ti_w, ti_sz, ti_scale))

            # Run the DPU task
            n2cube.dpuRunTask(task)

            # Get the output tensor address, channel count, size (in bytes),
            # width, height, and scale
            dpu_out = n2cube.dpuGetOutputTensorAddress(task, CONV_OUTPUT_NODE)
            to_ch = n2cube.dpuGetOutputTensorChannel(task, CONV_OUTPUT_NODE)
            to_sz = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE)
            to_w = n2cube.dpuGetOutputTensorWidth(task, CONV_OUTPUT_NODE)
            to_h = n2cube.dpuGetOutputTensorHeight(task, CONV_OUTPUT_NODE)
            to_scale = n2cube.dpuGetOutputTensorScale(task, CONV_OUTPUT_NODE)
            if i == 0:
                print("Output tensor=%3d ch=%3d H=%3d W=%3d Size=%6d"
                      % (i, to_ch, to_h, to_w, to_sz))
                print("Output tensor scaling factor", to_scale)

            # Compute softmax on the CPU and reshape to (H, W, C)
            softmax = n2cube.dpuRunSoftmax(dpu_out, to_ch, to_sz // to_ch, to_scale)
            prediction = softmax.reshape((to_h, to_w, to_ch))
            y_pred.append(prediction)
            if i == 0:
                print("prediction shape: ", prediction.shape)

    # Calculate intersection over union for each segmentation class
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)
    print("y_pred shape: ", y_pred.shape)
    print("y_test shape: ", y_test.shape)

    y_predi = np.argmax(y_pred, axis=3)
    y_testi = np.argmax(y_test, axis=3)
    print("shape of y_testi and y_predi ", y_testi.shape, y_predi.shape)
    dpu_IoU(y_testi, y_predi)

    # Print results
    print("Processed", len(x_test), "images")
    print("FINISHED")

    if USE_DPU:
        # Destroy the DPU kernel and detach from the driver
        n2cube.dpuDestroyKernel(kernel)
        n2cube.dpuClose()
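# A minimal sketch of the dpu_IoU helper used above (assumption: per-class
# intersection-over-union between integer label maps, printed per class; the
# original implementation is not shown in this section).
def dpu_IoU(y_true, y_pred):
    for c in range(cfg.NUM_CLASSES):
        tp = np.sum((y_true == c) & (y_pred == c))  # true positives
        fp = np.sum((y_true != c) & (y_pred == c))  # false positives
        fn = np.sum((y_true == c) & (y_pred != c))  # false negatives
        iou = tp / float(tp + fp + fn + 1e-12)
        print("class %2d: IoU = %.4f (TP=%d FP=%d FN=%d)" % (c, iou, tp, fp, fn))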
def main():
    # UI: DPU
    ui = UI()
    ui.update_boot_window('Initializing DPU...')

    from dnndk import n2cube
    from pynq_dpu import DpuOverlay

    # Set up the DPU IP
    overlay = DpuOverlay(str(fh.dir_dpu / fh.dpu_bit_file))
    overlay.load_model(str(fh.dir_dpu / fh.dpu_assembly_file))

    # Set up the Neural Network Runtime (N2Cube)
    kernel_name = fh.kernel_name
    kernel_conv_input = fh.kernel_conv_input
    kernel_fc_output = fh.kernel_fc_output

    n2cube.dpuOpen()
    kernel = n2cube.dpuLoadKernel(kernel_name)
    task = n2cube.dpuCreateTask(kernel, 0)

    input_tensor_size = n2cube.dpuGetInputTensorSize(task, kernel_conv_input)
    output_tensor_size = n2cube.dpuGetOutputTensorSize(task, kernel_fc_output)
    output_tensor_channel = n2cube.dpuGetOutputTensorChannel(task, kernel_fc_output)
    output_tensor_address = n2cube.dpuGetOutputTensorAddress(task, kernel_fc_output)
    output_tensor_scale = n2cube.dpuGetOutputTensorScale(task, kernel_fc_output)

    # UI: Camera
    ui.update_boot_window('Initializing Camera...')

    # libcamera
    libcamera = ctypes.CDLL(fh.dir_cam / fh.libcamera_file)

    # Getters
    libcamera.get_frame_ptr.restype = ctypes.POINTER(ctypes.c_ubyte)
    libcamera.get_frame_ptr.argtypes = [ctypes.c_uint]
    libcamera.get_throw_bgn_idx.restype = ctypes.c_uint
    libcamera.get_throw_bgn_idx.argtypes = None
    libcamera.get_throw_end_idx.restype = ctypes.c_uint
    libcamera.get_throw_end_idx.argtypes = None
    libcamera.get_throw_bgn.restype = ctypes.c_bool
    libcamera.get_throw_bgn.argtypes = None
    libcamera.get_throw_end.restype = ctypes.c_bool
    libcamera.get_throw_end.argtypes = None

    # Setters
    libcamera.set_frame_rate.restype = None
    libcamera.set_frame_rate.argtypes = [ctypes.c_double]
    libcamera.set_buff_size.restype = None
    libcamera.set_buff_size.argtypes = [ctypes.c_uint]
    libcamera.set_exposure_time.restype = None
    libcamera.set_exposure_time.argtypes = [ctypes.c_double]
    libcamera.set_camera_gain.restype = None
    libcamera.set_camera_gain.argtypes = [ctypes.c_double]
    libcamera.set_avg_diffs.restype = None
    libcamera.set_avg_diffs.argtypes = [ctypes.c_uint]
    libcamera.set_threshold_mult.restype = None
    libcamera.set_threshold_mult.argtypes = [ctypes.c_double]
    libcamera.set_frames_to_acquire.restype = None
    libcamera.set_frames_to_acquire.argtypes = [ctypes.c_uint]

    # Camera control
    libcamera.initialize.restype = ctypes.c_int
    libcamera.initialize.argtypes = None
    libcamera.reset_global_variables.restype = None
    libcamera.reset_global_variables.argtypes = None
    libcamera.start_acquisition.restype = ctypes.c_int
    libcamera.start_acquisition.argtypes = None
    libcamera.terminate.restype = ctypes.c_int
    libcamera.terminate.argtypes = None

    # Set the global variables according to the module `fhnwtoys.settings`
    libcamera.set_frame_rate(fh.frame_rate)
    libcamera.set_buff_size(fh.buff_size)
    libcamera.set_exposure_time(fh.exposure_time)
    libcamera.set_camera_gain(fh.camera_gain)
    libcamera.set_avg_diffs(fh.avg_diffs)
    libcamera.set_threshold_mult(fh.threshold_mult)
    libcamera.set_frames_to_acquire(fh.frames_to_acquire)

    # Initialize the camera, retrying until it responds
    initialize = fh.ReturnCodes.NOT_INITIALIZED
    initialization_tries = 0
    while initialize != fh.ReturnCodes.SUCCESS:
        if initialization_tries > 0:
            try:
                return_code = fh.ReturnCodes(initialize).name
            except ValueError:
                return_code = initialize
            ui.update_boot_window(f'Camera Error ({return_code}), try to replug the camera.')
        initialize = libcamera.initialize()
        initialization_tries += 1

    # UI: Ready
    ui.update_boot_window('READY')

    # Set up the `frames` array
    frames = np.empty((fh.frames_to_consider,) + fh.bgr_shape, dtype=np.uint8)

    while True:
        # Reset the predictions
        predictions = np.zeros((fh.frames_to_consider, fh.num_objects), dtype=np.float32)

        # Start the acquisition (threaded; a thread rather than a process because of ctypes)
        # todo: error handling ('Unexpected Error, system reboot required.')
        # start_acquisition = libcamera.start_acquisition()  # non-threaded approach
        t = Thread(target=libcamera.start_acquisition)
        t.start()

        # Wait until the throw has ended (the Ultra96-V2 is not powerful enough
        # to process the data during the acquisition)
        while not libcamera.get_throw_end():
            pass

        throw_bgn_idx = libcamera.get_throw_bgn_idx()
        throw_end_idx = libcamera.get_throw_end_idx()
        num_frames = throw_end_idx - throw_bgn_idx - 1  # ignore the last two captured frames

        # Image processing (including inference)
        for idx, frame_id in enumerate(range(throw_bgn_idx, throw_end_idx - 1)):
            frame_ptr = libcamera.get_frame_ptr(frame_id)
            raw_frame = np.ctypeslib.as_array(frame_ptr, shape=fh.raw_shape)  # raw Baumer BayerRG8 frame

            # Transform Baumer BayerRG8 to BGR8 (Baumer BayerRG = OpenCV BayerBG)
            frames[idx] = cv2.cvtColor(raw_frame, cv2.COLOR_BayerBG2BGR)  # color space conversion

            # Image scaling using nearest-neighbor interpolation
            frame_resized = cv2.resize(frames[idx], fh.inf_dsize,
                                       interpolation=fh.Interpolation.NEAREST)
            frame_inference = frame_resized.astype(np.float32) / 255.0  # normalization (float32 precision)

            # Inference
            n2cube.dpuSetInputTensorInHWCFP32(task, kernel_conv_input,
                                              frame_inference, input_tensor_size)
            n2cube.dpuRunTask(task)

            # Softmax function (normalized exponential function)
            # Confident predictions lead to all zeros and a NaN when run through
            # `n2cube.dpuRunSoftmax(.)`; this section replaces the first occurrence
            # of NaN in the `prediction` array with 1.0 and sets everything else to 0.0
            prediction = n2cube.dpuRunSoftmax(output_tensor_address, output_tensor_channel,
                                              output_tensor_size // output_tensor_channel,
                                              output_tensor_scale)
            nan = np.isnan(prediction)
            if nan.any():
                nan_idx = nan.argmax()  # index of the first occurrence of NaN
                prediction = np.zeros((fh.num_objects,), dtype=np.float32)
                prediction[nan_idx] = 1.0
            predictions[idx] = prediction

            # Only consider `fh.frames_to_consider` frames
            if idx == fh.frames_to_consider - 1:  # (-1: idx starts at 0)
                break

        num_frames_considered = min(fh.frames_to_consider, num_frames)
        window = sine_squared_window(num_frames, num_frames_considered)  # weighting function
        weighted_prediction = np.matmul(window, predictions) / np.sum(window)  # weighted prediction

        # UI: Prepare data for the UI
        weighted_prediction_percent = weighted_prediction * 100
        weighted_prediction_sorted = np.sort(weighted_prediction_percent)[::-1]
        weighted_prediction_argsorted = np.argsort(weighted_prediction_percent)[::-1]

        # Index of the best guess (computed by weighting the `fh.frames_to_consider` frames)
        guess_idx = weighted_prediction_argsorted[0]

        relevant_pct_ui = np.asarray(weighted_prediction_percent >= 1.0).nonzero()[0]  # prediction must be at least 1.0%
        relevant_pct_ui_len = len(relevant_pct_ui)
        predictions_ui_len = min(4, relevant_pct_ui_len)  # show at most the top 4

        predictions_ui = []  # the object names
        percentages_ui = np.empty((predictions_ui_len + 1,), dtype=np.float32)  # the percentages (+1: 'Others')

        for i, w in enumerate(weighted_prediction_argsorted[0:predictions_ui_len]):
            predictions_ui.append(fh.objects_ui[w])
            percentages_ui[i] = weighted_prediction_percent[w]

        # The object names
        predictions_ui.append('Others')
        # The percentages
        percentages_ui[-1] = np.sum(weighted_prediction_sorted[predictions_ui_len:])
        percentages_ui = lrm_round(percentages_ui)

        # The frame: pick the frame where the best guess scored highest
        weighted_guesses = np.multiply(window, predictions[:, guess_idx])
        frame_ui_idx = weighted_guesses.argmax()
        frame_ui_resized = cv2.resize(frames[frame_ui_idx], fh.ui_dsize,
                                      interpolation=fh.Interpolation.NEAREST)
        _, frame_ui_png = cv2.imencode('.png', frame_ui_resized)
        frame_ui = frame_ui_png.tobytes()

        # UI: Show results
        if percentages_ui[-1] == 0.0:
            predictions_ui = predictions_ui[:-1]
            percentages_ui = percentages_ui[:-1]

        # UI: Inference
        ui.update_inference_window(predictions_ui, percentages_ui, frame_ui)

        # Wait until the camera thread (process due to ctypes) is terminated
        t.join()

        # Reset the global variables (has to be done manually to avoid race conditions)
        libcamera.reset_global_variables()

    # Under regular circumstances, this section should never be reached

    # Terminate the camera
    terminate = libcamera.terminate()

    # Clean up the DPU IP
    n2cube.dpuDestroyTask(task)
    n2cube.dpuDestroyKernel(kernel)
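# Minimal sketches of two helpers used above (assumptions: a sine-squared
# weighting window over the considered frames, zero-padded so it can be
# matmul'ed against the full predictions matrix, and a largest-remainder
# rounding that keeps the percentages summing to 100; the original
# implementations are not shown in this section).
def sine_squared_window(num_frames, num_frames_considered):
    # sin^2 weights emphasize mid-throw frames over the first and last ones
    n = np.arange(1, num_frames_considered + 1, dtype=np.float32)
    window = np.sin(np.pi * n / (num_frames_considered + 1)) ** 2
    return np.pad(window, (0, fh.frames_to_consider - num_frames_considered))

def lrm_round(percentages):
    # Largest remainder method: round to integers while preserving the sum
    floored = np.floor(percentages)
    remainder = int(round(np.sum(percentages) - np.sum(floored)))
    order = np.argsort(-(percentages - floored))  # largest fractional parts first
    floored[order[:remainder]] += 1
    return floored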
def run(image_folder, shortsize, KERNEL_CONV, KERNEL_CONV_INPUT, KERNEL_FC_OUTPUT, inputscale):
    start = time.time()
    # listimage = [i for i in os.listdir(image_folder) if i.endswith("JPEG")]
    listimage = [i for i in os.listdir(image_folder) if i.endswith("jpg")]
    listimage.sort()
    fo = open(resultname, "w")

    n2cube.dpuOpen()
    kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
    task = n2cube.dpuCreateTask(kernel, 0)

    # Query the input geometry and preprocessing mean values
    height, width, inputchannel, mean = parameter(task, KERNEL_CONV_INPUT)

    # Query the FC output tensor: size, channel count, address, and scales
    outsize = n2cube.dpuGetOutputTensorSize(task, KERNEL_FC_OUTPUT)
    outputchannel = n2cube.dpuGetOutputTensorChannel(task, KERNEL_FC_OUTPUT)
    conf = n2cube.dpuGetOutputTensorAddress(task, KERNEL_FC_OUTPUT)
    # The passed-in `inputscale` is replaced by the value reported by the DPU
    inputscale = n2cube.dpuGetInputTensorScale(task, KERNEL_CONV_INPUT)
    outputscale = n2cube.dpuGetOutputTensorScale(task, KERNEL_FC_OUTPUT)

    imagenumber = len(listimage)
    print("\nimagenumber = %d\n" % imagenumber)

    correct = 0
    wrong = 0
    for i in range(imagenumber):
        print(f"i = {i + 1}")
        print(listimage[i])
        path = image_folder + listimage[i]
        img = cv2.imread(path)
        imageRun = predict_label(img, task, inputscale, mean, height, width,
                                 inputchannel, shortsize, KERNEL_CONV_INPUT)
        input_len = len(imageRun)
        # Alternative: submit the work to a thread pool and collect the
        # results with as_completed() instead of running synchronously
        softmax, listimage[i] = run_dpu_task(outsize, task, outputchannel, conf,
                                             outputscale, listimage[i], imageRun,
                                             KERNEL_CONV_INPUT, KERNEL_FC_OUTPUT)
        correct, wrong = TopK(softmax, listimage[i], fo, correct, wrong)
        print("")
    fo.close()

    accuracy = correct / imagenumber
    print('Correct:', correct, ' Wrong:', wrong, ' Accuracy:', accuracy)

    n2cube.dpuDestroyTask(task)
    n2cube.dpuDestroyKernel(kernel)
    n2cube.dpuClose()
    print("")

    end = time.time()
    total_time = end - start
    print('\nAll processing time: {} seconds.'.format(total_time))
    print('\n{} ms per frame\n'.format(1000 * total_time / imagenumber))
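# A minimal sketch of the parameter() helper used above (assumption: it reads
# the input tensor geometry from the DPU task; the per-channel mean values are
# model specific and shown here as a hypothetical placeholder).
def parameter(task, input_node):
    height = n2cube.dpuGetInputTensorHeight(task, input_node)
    width = n2cube.dpuGetInputTensorWidth(task, input_node)
    channel = n2cube.dpuGetInputTensorChannel(task, input_node)
    mean = [104.0, 117.0, 123.0]  # hypothetical BGR means, not from the source
    return height, width, channel, mean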
def main():
    # Set up the DPU IP
    overlay = DpuOverlay(str(fh.dir_dpu / fh.dpu_bit_file))
    overlay.load_model(str(fh.dir_dpu / fh.dpu_assembly_file))

    # Set up the Neural Network Runtime (N2Cube)
    kernel_name = fh.kernel_name
    kernel_conv_input = fh.kernel_conv_input
    kernel_fc_output = fh.kernel_fc_output

    n2cube.dpuOpen()
    kernel = n2cube.dpuLoadKernel(kernel_name)
    task = n2cube.dpuCreateTask(kernel, 0)

    input_tensor_size = n2cube.dpuGetInputTensorSize(task, kernel_conv_input)
    output_tensor_size = n2cube.dpuGetOutputTensorSize(task, kernel_fc_output)
    output_tensor_channel = n2cube.dpuGetOutputTensorChannel(task, kernel_fc_output)
    output_tensor_address = n2cube.dpuGetOutputTensorAddress(task, kernel_fc_output)
    output_tensor_scale = n2cube.dpuGetOutputTensorScale(task, kernel_fc_output)

    # libcamera
    libcamera = ctypes.CDLL(fh.dir_cam / fh.libcamera_file)
    libcamera.get_frame_ptr.restype = ctypes.POINTER(ctypes.c_ubyte)
    libcamera.get_throw_bgn_idx.restype = ctypes.c_uint
    libcamera.get_throw_end_idx.restype = ctypes.c_uint
    libcamera.get_throw_bgn.restype = ctypes.c_bool
    libcamera.get_throw_end.restype = ctypes.c_bool
    libcamera.set_frame_rate.restype = None
    libcamera.set_buff_size.restype = None
    libcamera.set_exposure_time.restype = None
    libcamera.set_camera_gain.restype = None
    libcamera.set_avg_diffs.restype = None
    libcamera.set_threshold_mult.restype = None
    libcamera.set_frames_to_acquire.restype = None
    libcamera.initialize.restype = ctypes.c_int
    libcamera.start_acquisition.restype = ctypes.c_int
    libcamera.terminate.restype = ctypes.c_int

    # Set up the `frames` array
    frames = np.empty((fh.frames_to_consider,) + fh.bgr_shape, dtype=np.uint8)

    # Initialize the camera
    initialize = libcamera.initialize()
    if initialize != fh.ReturnCodes.SUCCESS:
        try:
            return_code = fh.ReturnCodes(initialize).name
        except ValueError:
            return_code = initialize
        print(f'Initialization failed: {return_code}')
        sys.exit()
    else:
        print('================================= READY =================================')

    # Reset the predictions
    predictions = np.zeros((fh.frames_to_consider, fh.num_objects), dtype=np.float32)

    # Start the acquisition (threaded)
    t = Thread(target=libcamera.start_acquisition)
    t.start()

    # Wait until the throw has ended
    while not libcamera.get_throw_end():
        pass

    stages = ['Get raw bayer', 'Transform color', 'Resize', 'Normalize',
              'Run inference', 'Softmax', 'Weighting']
    meas_time = {s: get_dict() for s in stages}

    throw_bgn_idx = libcamera.get_throw_bgn_idx()
    throw_end_idx = libcamera.get_throw_end_idx()
    num_frames = throw_end_idx - throw_bgn_idx - 1  # ignore the last two captured frames

    for idx, frame_id in enumerate(range(throw_bgn_idx, throw_end_idx - 1)):
        meas_time['Get raw bayer']['start'].append(datetime.now())
        frame_ptr = libcamera.get_frame_ptr(frame_id)
        raw_frame = np.ctypeslib.as_array(frame_ptr, shape=fh.raw_shape)
        meas_time['Get raw bayer']['end'].append(datetime.now())

        # Transform Baumer BayerRG8 to BGR8 (Baumer BayerRG ≙ OpenCV BayerBG)
        meas_time['Transform color']['start'].append(datetime.now())
        frames[idx] = cv2.cvtColor(raw_frame, cv2.COLOR_BayerBG2BGR)
        meas_time['Transform color']['end'].append(datetime.now())

        meas_time['Resize']['start'].append(datetime.now())
        frame_resized = cv2.resize(frames[idx], fh.inf_dsize,
                                   interpolation=fh.Interpolation.NEAREST)
        meas_time['Resize']['end'].append(datetime.now())

        meas_time['Normalize']['start'].append(datetime.now())
        frame_inference = frame_resized.astype(np.float32) / 255.0
        meas_time['Normalize']['end'].append(datetime.now())

        meas_time['Run inference']['start'].append(datetime.now())
        n2cube.dpuSetInputTensorInHWCFP32(task, kernel_conv_input,
                                          frame_inference, input_tensor_size)
        n2cube.dpuRunTask(task)
        meas_time['Run inference']['end'].append(datetime.now())

        # n2cube.dpuRunSoftmax(.) sometimes returns all zeros except one NaN;
        # this section replaces the first occurrence of NaN in the prediction
        # array with 1.0 and sets everything else to 0.0
        meas_time['Softmax']['start'].append(datetime.now())
        prediction = n2cube.dpuRunSoftmax(output_tensor_address, output_tensor_channel,
                                          output_tensor_size // output_tensor_channel,
                                          output_tensor_scale)
        nan = np.isnan(prediction)
        if nan.any():
            nan_idx = nan.argmax()  # index of the first occurrence of NaN
            prediction = np.zeros((fh.num_objects,), dtype=np.float32)
            prediction[nan_idx] = 1.0
        predictions[idx] = prediction
        meas_time['Softmax']['end'].append(datetime.now())

        if idx == fh.frames_to_consider - 1:
            break

    meas_time['Weighting']['start'].append(datetime.now())
    num_frames_considered = min(fh.frames_to_consider, num_frames)
    window = sine_window(num_frames, num_frames_considered)  # weighting
    weighted_prediction = np.matmul(window, predictions) / np.sum(window)
    meas_time['Weighting']['end'].append(datetime.now())

    # Average the per-stage latencies (in milliseconds)
    for k in meas_time:
        meas_time[k] = [(e - s).total_seconds() * 1000
                        for s, e in zip(meas_time[k]['start'], meas_time[k]['end'])]
        meas_time[k] = sum(meas_time[k]) / len(meas_time[k])

    # Create the output file
    mmax = max(len(s) for s in stages)
    output = f'Number of captured frames: {num_frames_considered}\n\n'
    for s in stages:
        output += f'{s}:{" " * (mmax - len(s))} {meas_time[s]:.3f} ms\n'
    output += f'\nSum:{" " * (mmax - len("Sum"))} {sum(meas_time.values()):.3f} ms\n'
    output += f'Frame rate:{" " * (mmax - len("Frame rate"))} {1000 / sum(meas_time.values()):.3f} fps\n'
    print(output)

    with open(fh.dir_verification / 'throughput.log', 'w') as f:
        f.write(output)

    # Wait until the camera thread (process due to ctypes) is terminated
    t.join()

    # Terminate the camera
    terminate = libcamera.terminate()

    # Clean up the DPU IP
    n2cube.dpuDestroyTask(task)
    n2cube.dpuDestroyKernel(kernel)
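# Minimal sketches of the two helpers used above (assumptions: get_dict()
# returns a fresh start/end timestamp container, and sine_window() is a plain
# sine weighting over the considered frames, zero-padded to the full length of
# the `predictions` matrix; the original implementations are not shown here).
def get_dict():
    return {'start': [], 'end': []}

def sine_window(num_frames, num_frames_considered):
    n = np.arange(1, num_frames_considered + 1, dtype=np.float32)
    window = np.sin(np.pi * n / (num_frames_considered + 1))
    return np.pad(window, (0, fh.frames_to_consider - num_frames_considered))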