def main():
    """ Attach to DPU driver and prepare for running """
    n2cube.dpuOpen()

    """ Create DPU Kernels for CONV NODE in miniResNet """
    kernel = n2cube.dpuLoadKernel(KERNEL_CONV)

    """ Create DPU Tasks for CONV NODE in miniResNet """
    task = n2cube.dpuCreateTask(kernel, 0)

    listimage = os.listdir(calib_image_dir)

    for i in range(len(listimage)):
        path = os.path.join(calib_image_dir, listimage[i])
        if os.path.splitext(path)[1] != ".png":
            continue
        print("Loading %s" % listimage[i])

        """ Load image and set image into CONV task """
        imageRun = graph_input_fn.calib_input(path)
        imageRun = imageRun.reshape((imageRun.shape[0] * imageRun.shape[1] * imageRun.shape[2]))
        input_len = len(imageRun)
        n2cube.dpuSetInputTensorInHWCFP32(task, CONV_INPUT_NODE, imageRun, input_len)

        """ Launch miniResNet task """
        n2cube.dpuRunTask(task)

        """ Get output tensor address of CONV """
        conf = n2cube.dpuGetOutputTensorAddress(task, CONV_OUTPUT_NODE)

        """ Get output channel of CONV """
        channel = n2cube.dpuGetOutputTensorChannel(task, CONV_OUTPUT_NODE)

        """ Get output size of CONV """
        size = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE)
        softmax = [0 for i in range(size)]

        """ Get output scale of CONV """
        scale = n2cube.dpuGetOutputTensorScale(task, CONV_OUTPUT_NODE)
        batchSize = size // channel

        """ Calculate softmax and show TOP5 classification result """
        n2cube.dpuRunSoftmax(conf, softmax, channel, batchSize, scale)
        TopK(softmax, calib_image_list)

    """ Destroy DPU Tasks & free resources """
    n2cube.dpuDestroyTask(task)

    """ Destroy DPU Kernels & free resources """
    rtn = n2cube.dpuDestroyKernel(kernel)

    """ Detach from DPU driver & free resources """
    n2cube.dpuClose()
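# TopK() is called above but not defined in this file. The sketch below is a
# minimal, hypothetical version of the two-argument variant used here: it
# assumes `softmax` is a flat list/array of class probabilities and that the
# second argument is the path of a text file with one class name per line.
# Both assumptions are mine; only numpy is required.
import numpy as np

def TopK(softmax, label_file, k=5):
    """Print the top-k classes of a softmax vector (illustrative sketch)."""
    with open(label_file, "r") as f:
        labels = [line.strip() for line in f]
    probs = np.asarray(softmax)
    for idx in probs.argsort()[::-1][:k]:
        name = labels[idx] if idx < len(labels) else str(idx)
        print("prob = %.4f  class = %s" % (probs[idx], name))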
def run(self):
    overlay = DpuOverlay("./bitstream/dpu.bit")
    overlay.load_model("./model/dpu_tf_efficientnet.elf")

    cv2.setUseOptimized(True)
    cv2.setNumThreads(4)
    threadnum = 4
    num_iterations = 0
    listimage = [[] for _ in range(threadnum)]
    result = [[] for _ in range(threadnum)]
    img_processed = [[] for _ in range(threadnum)]

    cnt = 0
    thread = 0
    list_image = sorted([i for i in os.listdir(image_folder) if i.endswith("JPEG")])
    picture_num = len(list_image)
    # Split the image list evenly across the worker threads.
    for i in list_image:
        listimage[thread].append(i)
        if cnt % math.ceil(picture_num / threadnum) == 0 and cnt != 0:
            thread = thread + 1
        cnt = cnt + 1

    n2cube.dpuOpen()
    kernel = n2cube.dpuLoadKernel(KERNEL_CONV)

    threadAll = []
    for i in range(threadnum):
        t1 = threading.Thread(target=self.run_dpu_task,
                              args=(kernel, i, len(listimage[i]), listimage, result))
        threadAll.append(t1)
    for x in threadAll:
        x.start()
    for x in threadAll:
        x.join()

    with open(RESULT_FILE, 'w') as result_file:
        for item in result:
            for i in item:
                result_file.write("%s\n" % i)

    rtn = n2cube.dpuDestroyKernel(kernel)
    n2cube.dpuClose()
    # Run the whole data set and write the outputs to the result file.
    # See README and "classification_result.sample" for the result file format.
    return
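# run_dpu_task() is the per-thread worker launched above; its definition is not
# shown in this file. The sketch below is one plausible shape for it, reusing
# only n2cube calls that appear elsewhere in these examples. The node names
# KERNEL_CONV_INPUT / KERNEL_FC_OUTPUT, the preprocess() helper, the global
# image_folder, and the "<image> <class-index>" result format are assumptions.
# It relies on the script's existing imports (os, cv2, numpy as np, n2cube).
def run_dpu_task(self, kernel, index, count, listimage, result):
    # Each thread owns its own DPU task created from the shared kernel.
    task = n2cube.dpuCreateTask(kernel, 0)
    for j in range(count):
        img = cv2.imread(os.path.join(image_folder, listimage[index][j]))
        data = preprocess(img)  # hypothetical resize/normalize helper
        n2cube.dpuSetInputTensorInHWCFP32(task, KERNEL_CONV_INPUT, data, len(data))
        n2cube.dpuRunTask(task)
        size = n2cube.dpuGetOutputTensorSize(task, KERNEL_FC_OUTPUT)
        channel = n2cube.dpuGetOutputTensorChannel(task, KERNEL_FC_OUTPUT)
        conf = n2cube.dpuGetOutputTensorAddress(task, KERNEL_FC_OUTPUT)
        scale = n2cube.dpuGetOutputTensorScale(task, KERNEL_FC_OUTPUT)
        softmax = n2cube.dpuRunSoftmax(conf, channel, size // channel, scale)
        result[index].append("%s %d" % (listimage[index][j], int(np.argmax(softmax))))
    n2cube.dpuDestroyTask(task)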
def main(argv):
    """Attach to DPU driver and prepare for running"""
    n2cube.dpuOpen()

    """Create DPU Kernels for GoogLeNet"""
    kernel = n2cube.dpuLoadKernel(KERNEL_CONV)

    image_path = "./../common/image_224_224/"
    listimage = os.listdir(image_path)
    path = os.path.join(image_path, listimage[0])
    print("Loading %s" % listimage[0])
    img = cv2.imread(path)

    threadAll = []
    global threadnum
    threadnum = int(argv[1])
    print("Input thread number is: %d" % threadnum)

    time1 = time.time()
    for i in range(int(threadnum)):
        t1 = threading.Thread(target=RunDPU, args=(kernel, img, i))
        threadAll.append(t1)
    for x in threadAll:
        x.start()
    for x in threadAll:
        x.join()
    time2 = time.time()

    timetotal = time2 - time1
    # 1000 is assumed to be the total number of frames processed by all threads.
    fps = float(1000 / timetotal)
    print("%.2f FPS" % fps)

    """Destroy DPU Tasks & free resources"""
    rtn = n2cube.dpuDestroyKernel(kernel)

    """Detach from DPU driver & release resources"""
    n2cube.dpuClose()
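# RunDPU() is the thread worker referenced above but not defined here. Below is
# a hedged sketch of what it might look like: each thread creates its own task
# from the shared kernel and re-runs the same preloaded image a fixed number of
# times for throughput measurement. The node name CONV_INPUT_NODE, the
# per-thread iteration count, and the omission of preprocessing are assumptions;
# the n2cube calls mirror those used in the other examples in this file.
def RunDPU(kernel, img, threadID, iterations=250):
    task = n2cube.dpuCreateTask(kernel, 0)
    # Flatten the image to a contiguous HWC FP32 buffer for the input node.
    data = img.reshape(-1).astype(np.float32)
    for _ in range(iterations):
        n2cube.dpuSetInputTensorInHWCFP32(task, CONV_INPUT_NODE, data, len(data))
        n2cube.dpuRunTask(task)
    n2cube.dpuDestroyTask(task)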
def main():
    print("STARTING UNETv2 on DPU...")

    if USE_DPU:
        # Attach to DPU driver
        n2cube.dpuOpen()
        # Load DPU Kernel and create a task
        kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
        task = n2cube.dpuCreateTask(kernel, 0)

    # load and preprocess images and load segmentation labels
    assert os.path.isdir(IMG_TEST_DIR)
    x_test, y_test, img_file, seg_file = dpu_get_data(IMG_TEST_DIR, SEG_TEST_DIR,
                                                      cfg.NUM_CLASSES, cfg.WIDTH, cfg.HEIGHT)
    y_pred = []

    # process all images
    for i in range(len(x_test)):
        # image was opened as BGR; RGB conversion is left disabled here
        # B, G, R = cv2.split(x_test[i])
        # imageRun = cv2.merge((R, G, B))
        imageRun = x_test[i]
        imageRun = imageRun.reshape((imageRun.shape[0] * imageRun.shape[1] * imageRun.shape[2]))
        input_len = len(imageRun)

        if USE_DPU:
            # load pre-processed image as DPU input
            n2cube.dpuSetInputTensorInHWCFP32(task, CONV_INPUT_NODE, imageRun, input_len)
            dpu_in = n2cube.dpuGetInputTensor(task, CONV_INPUT_NODE)
            ti_scale = n2cube.dpuGetTensorScale(dpu_in)
            ti_h = n2cube.dpuGetTensorHeight(dpu_in)
            ti_w = n2cube.dpuGetTensorWidth(dpu_in)
            ti_sz = n2cube.dpuGetTensorSize(dpu_in)
            ti_ch = n2cube.dpuGetTensorChannel(dpu_in)
            if i == 0:
                print("Input tensor=%3d ch=%3d H=%3d W=%3d Size=%6d scale=%4d"
                      % (i, ti_ch, ti_h, ti_w, ti_sz, ti_scale))

            # run DPU task
            n2cube.dpuRunTask(task)

            # get output tensor address
            dpu_out = n2cube.dpuGetOutputTensorAddress(task, CONV_OUTPUT_NODE)
            # get number of channels in output tensor
            to_ch = n2cube.dpuGetOutputTensorChannel(task, CONV_OUTPUT_NODE)
            # get size in bytes of output tensor
            to_sz = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE)
            # get width of output tensor
            to_w = n2cube.dpuGetOutputTensorWidth(task, CONV_OUTPUT_NODE)
            # get height of output tensor
            to_h = n2cube.dpuGetOutputTensorHeight(task, CONV_OUTPUT_NODE)
            # get output tensor scale
            to_scale = n2cube.dpuGetOutputTensorScale(task, CONV_OUTPUT_NODE)

            softmax = np.zeros(to_sz, dtype=np.float32)
            if i == 0:
                print("Output tensor=%3d ch=%3d H=%3d W=%3d Size=%6d"
                      % (i, to_ch, to_h, to_w, to_sz))
                print("Output tensor scaling factor", to_scale)

            softmax = n2cube.dpuRunSoftmax(dpu_out, to_ch, to_sz // to_ch, to_scale)
            prediction = softmax.reshape((to_h, to_w, to_ch))
            y_pred.append(prediction)
            if i == 0:
                print("prediction shape: ", prediction.shape)

    # Calculate intersection over union for each segmentation class
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)
    print("y_pred shape: ", y_pred.shape)
    print("y_test shape: ", y_test.shape)

    y_predi = np.argmax(y_pred, axis=3)
    y_testi = np.argmax(y_test, axis=3)
    print("shape of y_testi and y_predi ", y_testi.shape, y_predi.shape)
    dpu_IoU(y_testi, y_predi)

    # print results
    print("Processed", len(x_test), "images")
    print("FINISHED")

    if USE_DPU:
        # Destroy DPU Kernel & detach
        n2cube.dpuDestroyKernel(kernel)
        n2cube.dpuClose()
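# dpu_get_data() and dpu_IoU() are defined elsewhere in the project. As a
# reference for the metric computed above, the sketch below shows a minimal
# per-class intersection-over-union using only numpy. It assumes y_true and
# y_pred are integer class-index maps of identical shape; the function body is
# illustrative, not the project's actual implementation.
import numpy as np

def dpu_IoU(y_true, y_pred, num_classes=None):
    if num_classes is None:
        num_classes = int(max(y_true.max(), y_pred.max())) + 1
    for c in range(num_classes):
        tp = np.sum((y_true == c) & (y_pred == c))  # true positives
        fp = np.sum((y_true != c) & (y_pred == c))  # false positives
        fn = np.sum((y_true == c) & (y_pred != c))  # false negatives
        denom = tp + fp + fn
        iou = tp / denom if denom > 0 else float("nan")
        print("class %2d: IoU = %.4f" % (c, iou))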
def run(image_folder, shortsize, KERNEL_CONV, KERNEL_CONV_INPUT, KERNEL_FC_OUTPUT, inputscale):
    start = time.time()

    listimage = [i for i in os.listdir(image_folder) if i.endswith("jpg")]
    listimage.sort()

    fo = open(resultname, "w")

    n2cube.dpuOpen()
    kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
    task = n2cube.dpuCreateTask(kernel, 0)

    # Query input geometry / mean values and output tensor properties once.
    height, width, inputchannel, mean = parameter(task, KERNEL_CONV_INPUT)
    outsize = n2cube.dpuGetOutputTensorSize(task, KERNEL_FC_OUTPUT)
    outputchannel = n2cube.dpuGetOutputTensorChannel(task, KERNEL_FC_OUTPUT)
    conf = n2cube.dpuGetOutputTensorAddress(task, KERNEL_FC_OUTPUT)
    inputscale = n2cube.dpuGetInputTensorScale(task, KERNEL_CONV_INPUT)
    outputscale = n2cube.dpuGetOutputTensorScale(task, KERNEL_FC_OUTPUT)

    imagenumber = len(listimage)
    print("\nimagenumber = %d\n" % imagenumber)

    correct = 0
    wrong = 0
    for i in range(imagenumber):
        print(f"i = {i+1}")
        print(listimage[i])
        path = image_folder + listimage[i]
        img = cv2.imread(path)
        imageRun = predict_label(img, task, inputscale, mean, height, width,
                                 inputchannel, shortsize, KERNEL_CONV_INPUT)
        input_len = len(imageRun)
        softmax, listimage[i] = run_dpu_task(outsize, task, outputchannel, conf,
                                             outputscale, listimage[i], imageRun,
                                             KERNEL_CONV_INPUT, KERNEL_FC_OUTPUT)
        correct, wrong = TopK(softmax, listimage[i], fo, correct, wrong)
        print("")

    fo.close()

    accuracy = correct / imagenumber
    print('Correct:', correct, ' Wrong:', wrong, ' Accuracy:', accuracy)

    n2cube.dpuDestroyTask(task)
    n2cube.dpuDestroyKernel(kernel)
    n2cube.dpuClose()
    print("")

    end = time.time()
    total_time = end - start
    print('\nAll processing time: {} seconds.'.format(total_time))
    print('\n{} ms per frame\n'.format(1000 * total_time / imagenumber))
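# parameter() is called above to read the input geometry and mean values but is
# not defined in this file. The sketch below only wraps n2cube input-tensor
# queries that already appear in these examples; the hard-coded BGR mean values
# are a placeholder assumption and should be replaced by the model's own.
def parameter(task, input_node):
    dpu_in = n2cube.dpuGetInputTensor(task, input_node)
    height = n2cube.dpuGetTensorHeight(dpu_in)
    width = n2cube.dpuGetTensorWidth(dpu_in)
    channel = n2cube.dpuGetTensorChannel(dpu_in)
    mean = [104.0, 117.0, 123.0]  # assumed BGR mean; not taken from the model
    return height, width, channel, mean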