def main(args=None):
    """Build a fixed number of image queries, run them through a Dispatcher and print throughput.

    NOTE: the ``args`` parameter is accepted but never used; it is
    overwritten by ``xdnn_io.processCommandLine()`` right away.

    The number of queries comes from the module-level ``g_nQueries``; each
    query holds ``inshape[0]`` image paths, wrapping around the image list
    when it is exhausted.
    """
    os.environ['LS_BIND_NOW'] = "1"
    args = xdnn_io.processCommandLine()
    images = xdnn_io.getFilePaths(args['images'])

    # spawn dispatcher
    dispatcher = Dispatcher(args['vitis_rundir'], g_nFPGA, g_nDispatchers,
                            args['batch_sz'])
    inshape = dispatcher.inshape

    # send work to system: one (query id, paths, preprocessing params) tuple
    # per query
    work = [
        (
            qIdx,
            [
                images[(qIdx * inshape[0] + offset) % len(images)]
                for offset in range(inshape[0])
            ],
            (args['img_raw_scale'], args['img_mean'], args['img_input_scale']),
        )
        for qIdx in range(g_nQueries)
    ]

    startTime = timeit.default_timer()
    dispatcher.run(work)
    # The dispatcher is deleted inside the timed region, as in the original.
    del dispatcher
    elapsed = timeit.default_timer() - startTime

    summary = "Queries: %d, Elapsed: %.2fs, QPS: %.2f, FPS: %.2f" % (
        g_nQueries,
        elapsed,
        g_nQueries / elapsed,
        g_nQueries * inshape[0] / elapsed,
    )
    print(summary)
    sys.stdout.flush()
def main():
    """Classify the images named on the command line, one batch at a time.

    Each batch is loaded into the first input blob, executed through the
    Runner, then passed through ``computeFC`` and ``computeSoftmax`` on an
    ``XDNNCPUOp``. With ``--golden`` the Top-1/Top-5 hit counts are
    accumulated and an accuracy summary is printed at the end; otherwise the
    classification of every batch is printed.
    """
    args = xdnn_io.processCommandLine()

    runner = Runner(args['vitis_rundir'])
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()

    batch_sz = args['batch_sz']
    if batch_sz == -1:
        # use Runner's suggested batch size
        batch_sz = inTensors[0].dims[0]

    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])
        top5Count = 0
        top1Count = 0

    # fpgaBlobs[0] holds the input buffers, fpgaBlobs[1] the output buffers.
    # Every buffer keeps the tensor's trailing dims but uses batch_sz as the
    # leading dim.
    fpgaBlobs = [
        [
            np.empty(
                (batch_sz,) + tuple(t.dims[d] for d in range(1, t.ndims)),
                dtype=np.float32,
                order='C',
            )
            for t in tensors
        ]
        for tensors in (inTensors, outTensors)
    ]

    img_paths = xdnn_io.getFilePaths(args['images'])
    labels = xdnn_io.get_labels(args['labels'])
    xdnnCPUOp = xdnn.XDNNCPUOp("%s/weights.h5" % args['vitis_rundir'])
    fcOutput = np.empty((batch_sz, args['outsz'],), dtype=np.float32, order='C')

    fpgaInput = fpgaBlobs[0][0]
    for start in range(0, len(img_paths), batch_sz):
        batch_paths = img_paths[start:start + batch_sz]

        # fill tensor input data from image file
        for slot, path in enumerate(batch_paths):
            img, _ = xdnn_io.loadImageBlobFromFile(
                path,
                args['img_raw_scale'],
                args['img_mean'],
                args['img_input_scale'],
                fpgaInput.shape[2],
                fpgaInput.shape[3],
            )
            np.copyto(fpgaInput[slot], img)

        jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
        runner.wait(jid)

        xdnnCPUOp.computeFC(fpgaBlobs[1][0], fcOutput)
        softmaxOut = xdnnCPUOp.computeSoftmax(fcOutput)

        if args['golden']:
            for slot, path in enumerate(batch_paths):
                top1Count += xdnn_io.isTopK(softmaxOut[slot], goldenMap, path, labels, 1)
                top5Count += xdnn_io.isTopK(softmaxOut[slot], goldenMap, path, labels, 5)
        else:
            xdnn_io.printClassification(softmaxOut, list(batch_paths), labels)

    if args['golden']:
        n_images = len(img_paths)
        print(
            ("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n")
            % (
                n_images,
                float(top1Count) / float(n_images) * 100.,
                float(top5Count) / float(n_images) * 100.,
            )
        )
def pre_process(q, args):
    """Pre-process every input image with the ``pp_pipeline_accel`` kernel and enqueue it.

    Args:
        q: queue-like object; each pre-processed array is handed to ``q.put``.
        args: dict providing ``'xclbin'`` (directory holding the xclbin),
            ``'deviceid'`` and ``'images'``.

    Images are enqueued one at a time in path order. ``args['batch_sz']`` is
    deliberately not consulted: stepping over the paths in batches gives the
    same order for any positive batch size, but enqueues nothing when the
    batch size is -1 and raises ``ValueError`` when it is 0.
    """
    xclbin_p = str(args['xclbin'] + "/xdnn_v3_96x16_2pe_8b_9mb_bank03.xclbin")
    kernelName_p = "pp_pipeline_accel"
    deviceIdx_p = args['deviceid']
    fpga_pp = waa_rt.PreProcess(xclbin_p, kernelName_p, deviceIdx_p, 1)

    img_paths = xdnn_io.getFilePaths(args['images'])
    for p in img_paths:
        # preprocess_input returns a pair; only the array is forwarded.
        arr, _ = fpga_pp.preprocess_input(p)
        q.put(arr)
def pre_process(q, args):
    """Pre-process every input image with the ``pp_pipeline_accel`` kernel and enqueue it.

    Args:
        q: queue-like object; each pre-processed array is handed to ``q.put``.
        args: dict providing ``'xclbin'`` (directory holding the xclbin),
            ``'deviceid'`` and ``'images'``.

    Progress messages are printed before and after the queue is filled.

    Images are enqueued one at a time in path order. ``args['batch_sz']`` is
    deliberately not consulted: stepping over the paths in batches gives the
    same order for any positive batch size, but enqueues nothing when the
    batch size is -1 and raises ``ValueError`` when it is 0.
    """
    xclbin_p = str(args['xclbin'] + "/xdnn_v3_96x16_2pe_8b_9mb_bank03.xclbin")
    kernelName_p = "pp_pipeline_accel"
    deviceIdx_p = args['deviceid']
    fpga_pp = xplusml.PreProcess(xclbin_p, kernelName_p, deviceIdx_p)

    img_paths = xdnn_io.getFilePaths(args['images'])
    print("Pre-processing handle created. Populating Queue")
    for p in img_paths:
        # preprocess_input returns a pair; only the array is forwarded.
        arr, _ = fpga_pp.preprocess_input(p)
        q.put(arr)
    print("Queue populated")
def pre_process(q_img, q_shape, args):
    """Pre-process every input image and enqueue the array and its shape.

    Args:
        q_img: queue-like object receiving each pre-processed array.
        q_shape: queue-like object receiving the second value returned by
            ``preprocess_input`` for the same image (named ``shape`` here).
        args: dict providing ``'xclbin'`` (directory holding the xclbin),
            ``'deviceid'`` and ``'images'``.

    The process exits with status 1 if the pre-processing handle compares
    equal to -1, so a failed setup is not reported as success.

    Images are enqueued one at a time in path order. ``args['batch_sz']`` is
    deliberately not consulted: stepping over the paths in batches gives the
    same order for any positive batch size, but enqueues nothing when the
    batch size is -1 and raises ``ValueError`` when it is 0.
    """
    xclbin_p = str(args['xclbin'] + "/xdnn_v3_96x16_2pe_8b_9mb_bank03.xclbin")
    kernelName_p = "pp_pipeline_accel"
    deviceIdx_p = args['deviceid']
    handle_p = waa_rt.PreProcess(xclbin_p, kernelName_p, deviceIdx_p, 1)
    if handle_p == -1:
        print("Unable to Create handle for pre-processing kernel. Only U200 device is supported")
        # A bare sys.exit() would exit with status 0 and hide the failure.
        sys.exit(1)

    img_paths = xdnn_io.getFilePaths(args['images'])
    print("Pre-processing handle created. Populating Queue")
    for p in img_paths:
        arr, shape = handle_p.preprocess_input(p)
        q_img.put(arr)
        q_shape.put(shape)
def __init__(self):
    """Parse the command line, create the process pool and the FPGA handle.

    Sets the module-level globals ``pool`` and ``pFpgaRT``, and stores on the
    instance: ``args``, ``all_image_paths``, ``batches`` (the image paths
    sliced into lists of ``batch_sz``) and ``mpid``.
    """
    global pool
    global pFpgaRT

    # Get command line args
    self.args = xdnn_io.processCommandLine()

    # Depends on new switch, new switch is not added to xdnn_io.py as of yet
    pool = Pool(self.args['numproc'])

    # Split images into batches - list of lists
    self.all_image_paths = xdnn_io.getFilePaths(self.args['images'])
    self.batches = [
        self.all_image_paths[start:start + self.args['batch_sz']]
        for start in range(0, len(self.all_image_paths), self.args['batch_sz'])
    ]

    # Parent process gets handle
    pFpgaRT = xdnn.XDNNFPGAOp(self.args)

    # Save input shape for children: the first input descriptor with its
    # leading dimension replaced by the configured batch size.
    batch_dim = (self.args['batch_sz'],)
    first_descriptor = tuple(pFpgaRT.getInputDescriptors().values())[0]
    self.args['inShape'] = batch_dim + tuple(first_descriptor[1:])

    # Save handle to use in child processes
    self.mpid = pFpgaRT.getMPID()
def main():
    """Run ``g_nQueries`` image queries through the dispatcher/worker system and print throughput.

    Creates, in order, an xstream server, an ``FpgaMaster``, a ``Dispatcher``
    and a ``WorkerPool``. Each query holds ``inshape[0]`` image paths,
    wrapping around the image list. The dispatcher is deleted inside the
    timed region; the workers, master and server are deleted afterwards.
    """
    args = xdnn_io.processCommandLine()
    images = xdnn_io.getFilePaths(args['images'])

    # start comms
    xserver = xstream.Server()

    # acquire resources
    fmaster = FpgaMaster(args['vitis_rundir'])
    inshape = list(fmaster.inshape)
    if args['batch_sz'] != -1:
        # update batch size
        inshape[0] = args['batch_sz']

    # spawn dispatchers
    dispatcher = Dispatcher(g_nDispatchers, g_nWorkers, inshape)

    # spawn workers
    workers = WorkerPool(args['vitis_rundir'] + "_worker", g_nWorkers, args)

    # send work to system: one (query id, paths, preprocessing params) tuple
    # per query
    work = [
        (
            qIdx,
            [
                images[(qIdx * inshape[0] + offset) % len(images)]
                for offset in range(inshape[0])
            ],
            (args['img_raw_scale'], args['img_mean'], args['img_input_scale']),
        )
        for qIdx in range(g_nQueries)
    ]

    startTime = timeit.default_timer()
    dispatcher.run(work)
    del dispatcher
    elapsed = timeit.default_timer() - startTime

    summary = "Queries: %d, Elapsed: %.2fs, QPS: %.2f, FPS: %.2f" % (
        g_nQueries,
        elapsed,
        g_nQueries / elapsed,
        g_nQueries * inshape[0] / elapsed,
    )
    print(summary)

    # cleanup
    del workers
    del fmaster
    del xserver
def main():
    """Run YOLO detection over all input images via dispatchers/workers, then optionally score mAP.

    The number of queries is ``ceil(len(images) / batch)``; the last query
    wraps around to the start of the image list to fill its batch. When
    ``--golden`` is given, ``calc_detector_mAP`` is invoked on the results
    directory after the workers have been torn down.
    """
    parser = yolo_parser_args(xdnn_io.default_parser_args())
    args = xdnn_io.make_dict_args(parser.parse_args())

    # Local values; they are named like the module-level settings used by
    # sibling scripts but are assigned here from the parsed arguments.
    g_nDispatchers = args['numprepproc']
    g_nWorkers = args['numworkers']

    # Setup the environment
    images = xdnn_io.getFilePaths(args['images'])
    if args['golden'] or args['visualize']:
        assert args['labels'], "Provide --labels to compute mAP."
        assert args[
            'results_dir'], "For accuracy measurements, provide --results_dir to save the detections."

    # start comms
    xserver = xstream.Server()

    # acquire resources
    fmaster = FpgaMaster(args['vitis_rundir'])

    # update batch size
    inshape = list(fmaster.inshape)
    if args['batch_sz'] != -1:
        inshape[0] = args['batch_sz']

    args['net_h'] = inshape[2]
    args['net_w'] = inshape[3]

    # spawn dispatchers
    dispatcher = yoloDispatcher(g_nDispatchers, g_nWorkers, inshape)

    # spawn workers
    workers = yoloWorkerPool(args['vitis_rundir'] + "_worker", g_nWorkers, args)

    # send work to system
    g_nQueries = int(np.ceil(len(images) / inshape[0]))
    work = [
        (
            qIdx,
            [
                images[(qIdx * inshape[0] + offset) % len(images)]
                for offset in range(inshape[0])
            ],
            (args['img_raw_scale'], args['img_mean'], args['img_input_scale']),
        )
        for qIdx in range(g_nQueries)
    ]

    startTime = timeit.default_timer()
    dispatcher.run(work)
    del dispatcher
    elapsed = timeit.default_timer() - startTime

    summary = "Queries: %d, Elapsed: %.2fs, QPS: %.2f, FPS: %.2f" % (
        g_nQueries,
        elapsed,
        g_nQueries / elapsed,
        g_nQueries * inshape[0] / elapsed,
    )
    print(summary)
    sys.stdout.flush()

    # cleanup
    del workers
    del fmaster
    del xserver

    # mAP calculation
    if args['golden']:
        print()
        print("Computing mAP score : ")
        labels = xdnn_io.get_labels(args['labels'])
        print("Class names are : {} ".format(labels))
        mAP = calc_detector_mAP(
            args['results_dir'],
            args['golden'],
            len(labels),
            labels,
            args['prob_threshold'],
            args['mapiouthresh'],
            args['points'],
        )
        sys.stdout.flush()
def run(args=None):
    """Run the pre-process -> FPGA -> post-process pipeline across processes.

    Args:
        args: dict of options. When falsy, options are parsed from the
            command line with a few extra switches added to the default
            parser.

    A pool of pre-processing workers feeds a shared "trans" queue, one
    process runs ``fpga_process`` and one runs ``post_process``. With
    ``args['perpetual']`` the image chunks are resubmitted forever and the
    code after that loop is never reached.
    """
    if not args:
        parser = xdnn_io.default_parser_args()
        parser.add_argument(
            '--numprepproc', type=int, default=1,
            help='number of parallel processes used to decode and quantize images')
        parser.add_argument(
            '--numstream', type=int, default=16,
            help='number of FPGA streams')
        parser.add_argument(
            '--deviceID', type=int, default=0,
            help='FPGA no. -> FPGA ID to run in case multiple FPGAs')
        parser.add_argument(
            '--benchmarkmode', type=int, default=0,
            help='bypass pre/post processing for benchmarking')
        parser.add_argument(
            '--profile', action='store_true',
            help='Print average latencies for preproc/exec/postproc')
        args = xdnn_io.make_dict_args(parser.parse_args())

    sharedInputArrs = []  # not referenced again in this function
    fpgaOutputs = []

    compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])

    input_shapes = list(compilerJSONObj.getInputs().values())
    output_shapes = list(compilerJSONObj.getOutputs().values())

    # Overwrite the leading dimension of every shape with the batch size.
    for shape in input_shapes:
        shape[0] = args['batch_sz']
    for shape in output_shapes:
        shape[0] = args['batch_sz']

    # Computed but not referenced again in this function.
    input_sizes = map(lambda x: np.prod(x), input_shapes)
    output_sizes = map(lambda x: np.prod(x), output_shapes)

    num_shared_slots = args['numstream']

    # shared memory from preprocessing to fpga forward
    shared_trans_arrs = SharedMemoryQueue(
        "trans",
        num_shared_slots * (args['numprepproc'] * args['batch_sz']),
        input_shapes + [(args['batch_sz'], 4)])

    # shared memory from fpga forward to postprocessing
    shared_output_arrs = SharedMemoryQueue(
        "output",
        num_shared_slots,
        output_shapes + [(args['batch_sz'], 4)])

    # Form list of images to chunks of batch_sz
    img_paths = xdnn_io.getFilePaths(args['images'])
    imgids = list(range(len(img_paths)))
    imgid_chunks = [
        imgids[start:start + args['batch_sz']]
        for start in range(0, len(img_paths), args['batch_sz'])
    ]

    # Start all processes
    p = mp.Pool(
        initializer=init_pre_process,
        initargs=(args, img_paths, input_shapes, shared_trans_arrs,),
        processes=args['numprepproc'])

    xdnnProc = mp.Process(
        target=fpga_process,
        args=(args, len(imgid_chunks), compilerJSONObj,
              shared_trans_arrs, shared_output_arrs,))
    postProc = mp.Process(
        target=post_process,
        args=(args, img_paths, fpgaOutputs, output_shapes,
              shared_output_arrs,))

    xdnnProc.start()
    postProc.start()

    t1 = timeit.default_timer()
    if args['perpetual']:
        # Resubmit the full set of chunks forever, waiting for each pass.
        while True:
            pending = p.map_async(run_pre_process, imgid_chunks)
            pending.wait()
            del pending
    else:
        # Submitted without waiting; completion is observed through the
        # joins below.
        p.map_async(run_pre_process, imgid_chunks)

    xdnnProc.join()
    postProc.join()

    p.close()
    p.join()
    t2 = timeit.default_timer()
    total_t = t2 - t1
    if args['profile']:
        print("Total time taken: {} s\n Total images: {}\nAverage FPS: {}".format(
            total_t, len(img_paths), len(img_paths) / total_t))
def run(args=None):
    """Build and run the "yolo_v2" prep -> fpga -> post graph, then optionally score mAP.

    Args:
        args: dict of options. When falsy, options are parsed from the
            command line.

    ``args['startxstream']`` is forced to True and ``args['servermode']`` to
    False before the graph is built, whatever the caller or command line
    supplied.
    """
    if not args:
        parser = yolo_parser_args(xdnn_io.default_parser_args())
        parser.add_argument('--startxstream', default=True,
                            action='store_true',
                            help='automatically start obj store server')
        parser.add_argument('--servermode', default=False,
                            action='store_true',
                            help='accept images from another process')
        parser.add_argument("--deploymodel", type=str, default='',
                            help='Original prototxt')
        parser.add_argument("--caffemodel", type=str, default='',
                            help='Original caffemodel')
        args = xdnn_io.make_dict_args(parser.parse_args())

    # NOTE(review): the flattened source does not show whether this
    # assignment sat inside the `if not args:` guard above; it is applied
    # unconditionally here - confirm against the original layout.
    args['preprocseq'] = [
        ('resize', (224, 224)),
        ('meansub', [104.007, 116.669, 122.679]),
        ('chtranspose', (2, 0, 1)),
    ]

    if args['golden'] or args['visualize']:
        assert args['labels'], "Provide --labels to compute mAP."
        assert args[
            'results_dir'], "For accuracy measurements, provide --results_dir to save the detections."
        labels = xdnn_io.get_labels(args['labels'])
        colors = generate_colors(len(labels))

    args['startxstream'] = True
    args['servermode'] = False

    timerQ = Queue()
    args['timerQ'] = timerQ

    compJson = xdnn.CompilerJsonParser(args['netcfg'])
    firstInputShape = next(itervalues(compJson.getInputs()))
    args['net_h'] = firstInputShape[2]
    args['net_w'] = firstInputShape[3]

    # start object store
    # (make sure to 'pip install pyarrow')
    xserver = None
    if args['startxstream']:
        xserver = xstream.Server()

    graph = grapher.Graph("yolo_v2")
    graph.node("prep", yolov2_pre.Node, args)
    graph.node("fpga", yolov2_fpga.Node, args)
    graph.node("post", yolov2_post.Node, args)

    graph.edge("START", None, "prep")
    graph.edge("prep", "prep", "fpga")
    graph.edge("fpga", "fpga", "post")
    graph.edge("DONE", "post", "fpga")
    graph.edge("DONE", "post", None)

    if args['servermode']:
        print("Serving %s -> %s" % (graph._in[0], graph._out[0]))
        graph.serve()
    else:
        graph.serve(background=True)

        img_paths = xdnn_io.getFilePaths(args['images'])
        reqProc = mp.Process(
            target=request_process,
            args=(args, img_paths, graph._in[0], graph._out[0],))

        t = timeit.default_timer()
        reqProc.start()
        reqProc.join()
        graph.stop(kill=False)

        # The end timestamp is read from the timer queue rather than taken
        # here.
        t2 = args['timerQ'].get()
        full_time = t2 - t

        args['timerQ'].close()
        print("Total time : {}s for {} images".format(full_time, len(img_paths)))
        print("Average FPS : {} imgs/sec".format(len(img_paths) / full_time))

    # mAP calculation
    if args['golden']:
        print(flush=True)
        print("Computing mAP score : ", flush=True)
        print("Class names are : {} ".format(labels), flush=True)
        mAP = calc_detector_mAP(
            args['results_dir'],
            args['golden'],
            len(labels),
            labels,
            args['prob_threshold'],
            args['mapiouthresh'],
            args['points'],
        )
        sys.stdout.flush()
def main():
    """Run YOLO detection on the input images batch by batch through a Runner.

    For each batch the images are loaded into the first input blob, executed,
    and post-processed into boxes. Detections are optionally saved as text
    files (``--results_dir``) and rendered images (``--visualize``). Average
    latencies are printed with ``--profile`` and an mAP score is computed
    with ``--golden``.

    Raises:
        ValueError: if ``args['yolo_version']`` is neither ``'v2'`` nor
            ``'v3'``. This is checked before the Runner is created, so an
            unsupported version no longer fails with an unbound-name error
            after the first batch has executed.
    """
    parser = xdnn_io.default_parser_args()
    parser = yolo_parser_args(parser)
    args = parser.parse_args()
    args = xdnn_io.make_dict_args(args)

    # Setup the environment
    img_paths = xdnn_io.getFilePaths(args['images'])
    if args['golden'] or args['visualize']:
        assert args['labels'], "Provide --labels to compute mAP."
        assert args[
            'results_dir'], "For accuracy measurements, provide --results_dir to save the detections."
        labels = xdnn_io.get_labels(args['labels'])
        colors = generate_colors(len(labels))

    if args['yolo_version'] == 'v2':
        yolo_postproc = yolo.yolov2_postproc
    elif args['yolo_version'] == 'v3':
        yolo_postproc = yolo.yolov3_postproc
    else:
        raise ValueError(
            "Unsupported yolo_version {!r}; expected 'v2' or 'v3'".format(
                args['yolo_version']))

    runner = Runner(args['vitis_rundir'])

    # Setup the blobs
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()
    batch_sz = args['batch_sz']
    if batch_sz == -1:
        batch_sz = inTensors[0].dims[0]

    # fpgaBlobs[0] holds the input buffers, fpgaBlobs[1] the output buffers.
    fpgaBlobs = []
    for io in [inTensors, outTensors]:
        blobs = []
        for t in io:
            shape = (batch_sz, ) + tuple([t.dims[i] for i in range(t.ndims)][1:])
            blobs.append(np.empty((shape), dtype=np.float32, order='C'))
        fpgaBlobs.append(blobs)
    fpgaInput = fpgaBlobs[0][0]

    # Setup the YOLO config
    net_h, net_w = fpgaInput.shape[-2:]
    args['net_h'] = net_h
    args['net_w'] = net_w
    biases = bias_selector(args)

    # Setup profiling env
    prep_time = 0
    exec_time = 0
    post_time = 0

    # Start the execution
    for i in range(0, len(img_paths), batch_sz):
        pl = []
        img_shapes = []

        # Prep images
        t1 = timeit.default_timer()
        for j, p in enumerate(img_paths[i:i + batch_sz]):
            fpgaInput[j, ...], img_shape = xdnn_io.loadYoloImageBlobFromFile(
                p, net_h, net_w)
            pl.append(p)
            img_shapes.append(img_shape)
        t2 = timeit.default_timer()

        # Execute
        jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
        runner.wait(jid)

        # Post Proc
        t3 = timeit.default_timer()
        boxes = yolo_postproc(fpgaBlobs[1], args, img_shapes, biases=biases)
        t4 = timeit.default_timer()

        prep_time += (t2 - t1)
        exec_time += (t3 - t2)
        post_time += (t4 - t3)

        # Only the images actually loaded for this batch are reported. A
        # separate index `k` is used so the outer batch offset `i` is not
        # shadowed.
        for k, path in enumerate(pl):
            print("Detected {} boxes in {}".format(len(boxes[k]), path))

        # Save the result
        if args['results_dir']:
            for k, path in enumerate(pl):
                filename = os.path.splitext(os.path.basename(path))[0]
                out_file_txt = os.path.join(args['results_dir'],
                                            filename + '.txt')
                print("Saving {} boxes to {}".format(len(boxes[k]),
                                                     out_file_txt))
                sys.stdout.flush()
                saveDetectionDarknetStyle(out_file_txt, boxes[k],
                                          img_shapes[k])
                if args['visualize']:
                    out_file_png = os.path.join(args['results_dir'],
                                                filename + '.png')
                    print("Saving result to {}".format(out_file_png))
                    sys.stdout.flush()
                    draw_boxes(path, boxes[k], labels, colors, out_file_png)

    # Profiling results
    if args['profile']:
        print("\nAverage Latency in ms:")
        print("  Image Prep: {0:3f}".format(prep_time * 1000.0 / len(img_paths)))
        print("  Exec: {0:3f}".format(exec_time * 1000.0 / len(img_paths)))
        print("  Post Proc: {0:3f}".format(post_time * 1000.0 / len(img_paths)))
        sys.stdout.flush()

    # mAP calculation
    if args['golden']:
        print()
        print("Computing mAP score : ")
        print("Class names are : {} ".format(labels))
        mAP = calc_detector_mAP(args['results_dir'], args['golden'],
                                len(labels), labels, args['prob_threshold'],
                                args['mapiouthresh'], args['points'])
        sys.stdout.flush()
def yolo_gpu_inference(backend_path, image_dir, deploy_model, weights,
                       out_labels, IOU_threshold, scorethresh, mean_value,
                       pxscale, transpose, channel_swap, yolo_model,
                       num_classes, args):
    """Run YOLO detection with a Caffe net, one image at a time.

    Args:
        deploy_model: network definition passed to ``caffe.Net``.
        weights: weights file passed to ``caffe.Net``.
        args: dict of options; image paths, labels, results directory, YOLO
            version and GPU id are all read from it.
        backend_path, image_dir, out_labels, IOU_threshold, scorethresh,
        mean_value, pxscale, transpose, channel_swap, yolo_model,
        num_classes: accepted for interface compatibility but not used in
            the body.

    Returns:
        The number of images processed.

    Raises:
        ValueError: if ``args['yolo_version']`` is neither ``'v2'`` nor
            ``'v3'``. This is checked before the network is loaded, so an
            unsupported version no longer fails with an unbound-name error
            after the first forward pass.
    """
    # Setup the environment
    images = xdnn_io.getFilePaths(args['images'])
    if args['golden'] or args['visualize']:
        assert args['labels'], "Provide --labels to compute mAP."
        assert args[
            'results_dir'], "For accuracy measurements, provide --results_dir to save the detections."
        labels = xdnn_io.get_labels(args['labels'])
        colors = generate_colors(len(labels))

    # Select postproc and biases
    if args['yolo_version'] == 'v2':
        yolo_postproc = yolo.yolov2_postproc
    elif args['yolo_version'] == 'v3':
        yolo_postproc = yolo.yolov3_postproc
    else:
        raise ValueError(
            "Unsupported yolo_version {!r}; expected 'v2' or 'v3'".format(
                args['yolo_version']))

    biases = bias_selector(args)

    import caffe
    caffe.set_mode_cpu()
    print(args)
    if args['gpu'] is not None:
        caffe.set_mode_gpu()
        caffe.set_device(args['gpu'])

    net = caffe.Net(deploy_model, weights, caffe.TEST)

    net_h, net_w = net.blobs['data'].data.shape[-2:]
    args['net_h'] = net_h
    args['net_w'] = net_w

    for i, img in enumerate(images):
        if (i + 1) % 100 == 0:
            print(i + 1, "images processed")
        raw_img, img_shape = xdnn_io.loadYoloImageBlobFromFile(
            img, net_h, net_w)

        net.blobs['data'].data[...] = raw_img
        out = net.forward()

        # Outputs are ordered by the size of their last dimension before
        # being handed to post-processing.
        caffeOutput = sorted(out.values(), key=lambda item: item.shape[-1])
        boxes = yolo_postproc(caffeOutput, args, [img_shape], biases=biases)

        print("{}. Detected {} boxes in {}".format(i, len(boxes[0]), img))

        # Save the result
        boxes = boxes[0]
        if args['results_dir']:
            filename = os.path.splitext(os.path.basename(img))[0]
            out_file_txt = os.path.join(args['results_dir'],
                                        filename + '.txt')
            print("Saving {} boxes to {}".format(len(boxes), out_file_txt))
            sys.stdout.flush()
            saveDetectionDarknetStyle(out_file_txt, boxes, img_shape)
            if args['visualize']:
                out_file_png = os.path.join(args['results_dir'],
                                            filename + '.png')
                print("Saving result to {}".format(out_file_png))
                sys.stdout.flush()
                draw_boxes(img, boxes, labels, colors, out_file_png)

    return len(images)
def run(args=None):
    """Build the "imagenet" prep -> fpga -> post graph and run or serve it.

    Args:
        args: dict of options. When falsy, options are parsed from the
            command line with a few extra switches added to the default
            parser.

    Unless ``args['servermode']`` is set, the graph is served in the
    background, a request process feeds it the input images, and the graph
    is stopped once that process has finished. In server mode the graph is
    served in the foreground instead.
    """
    if not args:
        parser = xdnn_io.default_parser_args()
        parser.add_argument(
            '--numprepproc', type=int, default=1,
            help='# parallel procs to decode/quantize images')
        parser.add_argument(
            '--numstream', type=int, default=6,
            help='number of FPGA streams')
        parser.add_argument(
            '--deviceID', type=int, default=0,
            help='FPGA no. -> FPGA ID to use multiple FPGAs')
        parser.add_argument(
            '--benchmarkmode', type=int, default=0,
            help='bypass pre/post processing for benchmarking')
        parser.add_argument(
            '--startxstream', default=False, action='store_true',
            help='automatically start obj store server')
        parser.add_argument(
            '--servermode', default=False, action='store_true',
            help='accept images from another process')
        args = xdnn_io.make_dict_args(parser.parse_args())

    # NOTE(review): the flattened source does not show whether this
    # assignment sat inside the `if not args:` guard above; it is applied
    # unconditionally here - confirm against the original layout.
    args['preprocseq'] = [
        ('resize', (224, 224)),
        ('meansub', [104.007, 116.669, 122.679]),
        ('chtranspose', (2, 0, 1)),
    ]

    # start object store
    # (make sure to 'pip install pyarrow')
    xserver = None
    if args['startxstream']:
        xserver = xstream.Server()

    graph = grapher.Graph("imagenet")
    graph.node("prep", pre.Node, args)
    graph.node("fpga", fpga.Node, args)
    graph.node("post", post.Node, args)

    graph.edge("START", None, "prep")
    graph.edge("prep", "prep", "fpga")
    graph.edge("fpga", "fpga", "post")
    graph.edge("DONE", "post", "fpga")
    graph.edge("DONE", "post", None)

    if args['servermode']:
        print("Serving %s -> %s" % (graph._in[0], graph._out[0]))
        graph.serve()
    else:
        graph.serve(background=True)

        img_paths = xdnn_io.getFilePaths(args['images'])
        reqProc = mp.Process(
            target=request_process,
            args=(args, img_paths, graph._in[0], graph._out[0],))

        reqProc.start()
        reqProc.join()
        graph.stop(kill=False)