def fpga_stage(config, q_fpga, q_bbox, maxNumIters=-1):
    # createHandle returns (status_code, handles); a non-zero status means the xclbin could not be opened
    config['xdnn_handle'], handles = xdnn.createHandle(config['xclbin'], "kernelSxdnn_0")
    if config['xdnn_handle'] != 0:
        log.error("Failed to start FPGA process - could not open xclbin %s %s!"
                  % (config['xclbin'], config['xlnxlib']))
        sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, config)

    # Allocate FPGA outputs
    fpgaOutSize = config['out_w'] * config['out_h'] * config['bboxplanes'] \
        * (config['classes'] + config['coords'] + 1)
    fpgaOutput = np.empty((config['batch_sz'], fpgaOutSize,), dtype=np.float32, order='C')
    raw_img = np.empty(((config['batch_sz'],) + config['in_shape']), dtype=np.float32, order='C')

    numIters = 0
    while True:
        numIters += 1
        if maxNumIters > 0 and numIters > maxNumIters:
            break

        job = q_fpga.get()
        if job is None:
            q_bbox.put(None)  # propagate 'stop' signal downstream
            sys.exit(0)

        images = job['images']
        display = job['display']
        coco = job['coco']

        if images is not None:
            log.info("Running Image(s):")
            log.info(images)
            config['images'] = images
        else:
            log.error("Detect requires images as a parameter")
            continue

        log.info("Preparing Input...")
        shapes = []
        for i, img in enumerate(images):
            raw_img[i, ...], s = xdnn_io.loadYoloImageBlobFromFile(
                img, config['in_shape'][1], config['in_shape'][2])
            shapes.append(s)
        job['shapes'] = shapes  # pass shapes to next stage

        # EXECUTE XDNN
        log.info("Running %s image(s)" % (config['batch_sz']))
        startTime = timeit.default_timer()
        fpgaRT.execute(raw_img, fpgaOutput, config['PE'])
        elapsedTime = timeit.default_timer() - startTime

        # Only showing time for second run because first is loading script
        log.info("\nTotal FPGA: %f ms" % (elapsedTime * 1000))
        log.info("Image Time: (%f ms/img):" % (elapsedTime * 1000 / config['batch_sz']))

        q_bbox.put((job, fpgaOutput))
def main():
    args = xdnn_io.processCommandLine()

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    fpgaOutput = fpgaRT.getOutputs()
    fpgaInput = fpgaRT.getInputs()
    fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
    img_paths = xdnn_io.getFilePaths(args['images'])
    fcOutput = np.empty((args['batch_sz'], args['outsz'],), dtype=np.float32, order='C')
    inShape = (args['batch_sz'],) + tuple(
        tuple(fpgaRT.getInputDescriptors().values())[0][1:])
    labels = xdnn_io.get_labels(args['labels'])

    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])
        top5Count = 0
        top1Count = 0

    firstInput = list(fpgaInput.values())[0]
    firstOutput = list(fpgaOutput.values())[0]

    # Process images in batches of batch_sz
    for i in range(0, len(img_paths), args['batch_sz']):
        pl = []
        for j, p in enumerate(img_paths[i:i + args['batch_sz']]):
            firstInput[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, args['img_raw_scale'], args['img_mean'], args['img_input_scale'],
                inShape[2], inShape[3])
            pl.append(p)

        fpgaRT.execute(fpgaInput, fpgaOutput)
        xdnn.computeFC(fcWeight, fcBias, firstOutput, fcOutput)
        softmaxOut = xdnn.computeSoftmax(fcOutput)
        xdnn_io.printClassification(softmaxOut, pl, labels)

        if args['golden']:
            for j, p in enumerate(img_paths[i:i + args['batch_sz']]):
                top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p, labels, 1)
                top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p, labels, 5)

    xdnn.closeHandle()

    if args['golden']:
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" % (
            len(img_paths),
            float(top1Count) / float(len(img_paths)) * 100.,
            float(top5Count) / float(len(img_paths)) * 100.))
def fpga_init():
    # Parse arguments
    parser = xdnn_io.default_parser_args()
    parser.add_argument('--deviceID', type=int, default=0,
                        help='FPGA no. -> FPGA ID to run in case multiple FPGAs')
    args = parser.parse_args()
    args = xdnn_io.make_dict_args(args)

    # Create manager
    if not xdnn.createManager():
        raise Exception("Failed to create manager")

    compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])

    # Get input and output shapes
    input_shapes = list(map(lambda x: (x), compilerJSONObj.getInputs().itervalues()))
    output_shapes = list(map(lambda x: (x), compilerJSONObj.getOutputs().itervalues()))

    # Overwrite the batch dimension with the requested batch size
    for in_idx in range(len(input_shapes)):
        input_shapes[in_idx][0] = args['batch_sz']
    for out_idx in range(len(output_shapes)):
        output_shapes[out_idx][0] = args['batch_sz']

    input_node_names = list(map(lambda x: str(x), compilerJSONObj.getInputs().iterkeys()))
    output_node_names = list(map(lambda x: str(x), compilerJSONObj.getOutputs().iterkeys()))

    num_inputs = len(input_shapes)
    num_outputs = len(output_shapes)

    # Create runtime
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", [args["deviceID"]])
    if ret != 0:
        raise Exception("Failed to create handle, return value: {error}".format(error=ret))
    fpgaRT = xdnn.XDNNFPGAOp(handles, args)

    print("Batch size:", args['batch_sz'])
    print("Input shapes:", input_shapes)
    print("Input nodes:", input_node_names)
    print("Output shapes:", output_shapes)
    print("Output nodes:", output_node_names)

    # Pre-allocate one output buffer dictionary per stream
    output_buffers = []
    for _ in range(N_STREAMS):
        buffer = {name: np.empty(shape=shape, dtype=np.float32)
                  for name, shape in zip(output_node_names, output_shapes)}
        output_buffers.append(buffer)

    # fpgaRT.exec_async({input_node_names[0]: np.zeros(input_shapes[0])},
    #                   output_buffers[0], 0)
    # fpgaRT.get_result(0)

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(args)

    return fpgaRT, output_buffers, \
        {name: shape for name, shape in zip(input_node_names, input_shapes)}, \
        fcWeight, fcBias
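# --- Usage sketch (not part of the original source): a minimal, hypothetical driver for
# fpga_init() above. `batch_array` (a preprocessed image batch matching the single input
# node) and `outsz` (the FC output size) are assumed to be supplied by the caller, and
# module-level imports of numpy as np and xdnn are assumed as elsewhere in this file.
# Only calls already used in this file are reused: exec_async/get_result, computeFC,
# computeSoftmax.
def run_one_batch(batch_array, outsz):
    fpgaRT, output_buffers, input_shapes, fcWeight, fcBias = fpga_init()

    input_name = list(input_shapes.keys())[0]   # single-input network assumed
    stream_id = 0                               # use the first pre-allocated output buffer

    # Submit the batch asynchronously and wait on the same stream id
    fpgaRT.exec_async({input_name: batch_array}, output_buffers[stream_id], stream_id)
    fpgaRT.get_result(stream_id)

    # Post-process with the fully-connected layer + softmax, as in main() above
    first_out = list(output_buffers[stream_id].values())[0]
    fcOutput = np.empty((first_out.shape[0], outsz), dtype=np.float32, order='C')
    xdnn.computeFC(fcWeight, fcBias, first_out, fcOutput)
    return xdnn.computeSoftmax(fcOutput)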
def setup(self, bottom, top):
    self.param_dict = eval(self.param_str)  # Get args from prototxt
    self._args = xdnn_io.make_dict_args(self.param_dict)
    self._numPE = self._args["batch_sz"]  # Bryan hack to determine number of PEs in FPGA

    # Establish FPGA communication, load bitstream
    ret, handles = xdnn.createHandle(self._args["xclbin"], "kernelSxdnn_0")
    if ret != 0:
        raise Exception("Failed to open FPGA handle.")

    self._args["scaleB"] = 1
    self._args["PE"] = -1

    # Instantiate runtime interface object
    self.fpgaRT = xdnn.XDNNFPGAOp(handles, self._args)
    self._indictnames = self._args["input_names"]
    self._outdictnames = self._args["output_names"]
    self._parser = xdnn.CompilerJsonParser(self._args["netcfg"])
def __init__(self, params):
    self._args = xdnn_io.make_dict_args(params)
    self._numPE = self._args["batch_sz"]  # Bryan hack to determine number of PEs in FPGA

    # Establish FPGA communication, load bitstream
    ret, handles = xdnn.createHandle(self._args["xclbin"], "kernelSxdnn_0")
    if ret != 0:
        raise Exception("Failed to open FPGA handle.")

    self._args["scaleB"] = 1
    self._args["PE"] = -1
    self._streamIds = [0, 1, 2, 3, 4, 5, 6, 7]  # Allow 8 streams

    # Instantiate runtime interface object
    self.fpgaRT = xdnn.XDNNFPGAOp(handles, self._args)
    self._indictnames = self._args["input_names"]
    self._outdictnames = self._args["output_names"]
    self._parser = xdnn.CompilerJsonParser(self._args["netcfg"])
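# --- Hypothetical usage sketch (not part of the original source): one way the wrapper
# initialized above could expose asynchronous execution, pairing exec_async/get_result
# (as used elsewhere in this file) with a stream id taken from self._streamIds. The
# method names forward_async/forward_wait and the in/out dict construction are
# illustrative assumptions, not the library's API.
def forward_async(self, input_arrays, output_arrays):
    stream_id = self._streamIds.pop(0)  # take a free stream id (8 available)
    in_dict = {name: blob for name, blob in zip(self._indictnames, input_arrays)}
    out_dict = {name: blob for name, blob in zip(self._outdictnames, output_arrays)}
    self.fpgaRT.exec_async(in_dict, out_dict, stream_id)
    return stream_id

def forward_wait(self, stream_id):
    # Block until the submitted work on this stream finishes, then recycle the id
    self.fpgaRT.get_result(stream_id)
    self._streamIds.append(stream_id)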
def fpga_init():
    global PORT
    global N_STREAMS

    # Parse arguments
    parser = xdnn_io.default_parser_args()
    parser.add_argument('--device-ids', type=int, default=[0], nargs="+",
                        help='a list of device IDs for FPGA')
    parser.add_argument('--port', type=int, default=5000, help='port to listen on')
    args = parser.parse_args()
    device_ids = args.device_ids
    PORT = args.port
    N_STREAMS *= len(device_ids)
    args = xdnn_io.make_dict_args(args)

    # Create manager
    if not xdnn.createManager():
        raise Exception("Failed to create manager")

    compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])

    # Get input and output shapes
    input_shapes = list(map(lambda x: (x), compilerJSONObj.getInputs().itervalues()))
    output_shapes = list(map(lambda x: (x), compilerJSONObj.getOutputs().itervalues()))

    # Overwrite the batch dimension with the requested batch size
    for in_idx in range(len(input_shapes)):
        input_shapes[in_idx][0] = args['batch_sz']
    for out_idx in range(len(output_shapes)):
        output_shapes[out_idx][0] = args['batch_sz']

    input_node_names = list(map(lambda x: str(x), compilerJSONObj.getInputs().iterkeys()))
    output_node_names = list(map(lambda x: str(x), compilerJSONObj.getOutputs().iterkeys()))

    num_inputs = len(input_shapes)
    num_outputs = len(output_shapes)

    # Create runtime
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", device_ids)
    if ret != 0:
        raise Exception("Failed to create handle, return value: {error}".format(error=ret))
    fpgaRT = xdnn.XDNNFPGAOp(handles, args)

    print("Batch size:", args['batch_sz'])
    print("Input shapes:", input_shapes)
    print("Input nodes:", input_node_names)
    print("Output shapes:", output_shapes)
    print("Output nodes:", output_node_names)
    print("Using model {path}".format(path=args["netcfg"]))
    print("Using FPGA device:", device_ids)

    # Pre-allocate one output buffer dictionary per stream
    output_buffers = []
    for _ in range(N_STREAMS):
        buffer = {name: np.empty(shape=shape, dtype=np.float32)
                  for name, shape in zip(output_node_names, output_shapes)}
        output_buffers.append(buffer)

    # fpgaRT.exec_async({input_node_names[0]: np.zeros(input_shapes[0])},
    #                   output_buffers[0], 0)
    # fpgaRT.get_result(0)

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(args)

    return fpgaRT, output_buffers, output_node_names[0], \
        {name: shape for name, shape in zip(input_node_names, input_shapes)}, \
        fcWeight, fcBias, args['batch_sz']
def main(argv):
    args = xdnn_io.processCommandLine(argv)

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    # ret = xdnn.createHandle(g_xclbin, "kernelSxdnn_0", g_xdnnLib)
    if ret != 0:
        sys.exit(1)
    labels = xdnn_io.get_labels(args['labels'])

    # TODO dict of tuples instead?
    fpgaRT = {}
    fpgaOutputs = {}
    fcWeights = {}
    fcBiases = {}
    netFiles = {}
    confNames = []

    args = args['jsoncfg']  # we do not use other args' keys
    for netconf_args in args:
        confName = str(netconf_args['name'])
        confNames += [confName]
        # netconf_args['netcfg'] = './data/{}_{}.json'.format(netconf_args['net'], netconf_args['dsp'])
        fpgaRT[confName] = xdnn.XDNNFPGAOp(handles, netconf_args)
        netconf_args['in_shape'] = tuple((netconf_args['batch_sz'],) + tuple(
            fpgaRT[confName].getInputDescriptors().itervalues().next()[1:]))
        (fcWeights[confName],
         fcBiases[confName]) = xdnn_io.loadFCWeightsBias(netconf_args)
        fpgaOutputs[confName] = np.empty((
            netconf_args['batch_sz'],
            int(netconf_args['fpgaoutsz']),
        ), dtype=np.float32, order='C')
        netFiles[confName] = str(netconf_args['netcfg'])

    # Submit one async batch per network configuration, one stream each
    batchArrays = []
    for streamId, netconf_args in enumerate(args):
        batchArrays.append(np.empty(netconf_args['in_shape'], dtype=np.float32, order='C'))
        pl = []
        img_paths = xdnn_io.getFilePaths(netconf_args['images'])
        for j, p in enumerate(img_paths[:netconf_args['batch_sz']]):
            batchArrays[-1][j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, netconf_args['img_raw_scale'], netconf_args['img_mean'],
                netconf_args['img_input_scale'], netconf_args['in_shape'][2],
                netconf_args['in_shape'][3])
            pl.append(p)

        confName = str(netconf_args['name'])
        firstInputName = fpgaRT[confName].getInputs().iterkeys().next()
        firstOutputName = fpgaRT[confName].getOutputs().iterkeys().next()
        fpgaRT[confName].exec_async({firstInputName: batchArrays[-1]},
                                    {firstOutputName: fpgaOutputs[confName]},
                                    streamId)

    # Wait for all streams to finish
    for streamId, confName in enumerate(confNames):
        fpgaRT[confName].get_result(streamId)

    # FC + softmax post-processing per network
    for netconf_args in args:
        confName = str(netconf_args['name'])
        fcOut = np.empty((netconf_args['batch_sz'], netconf_args['outsz']),
                         dtype=np.float32, order='C')
        xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                       fpgaOutputs[confName], fcOut)
        softmaxOut = xdnn.computeSoftmax(fcOut)
        xdnn_io.printClassification(softmaxOut, netconf_args['images'], labels)

    xdnn.closeHandle()
def fpga_stage(config, q_fpga, q_bbox, maxNumIters=-1):
    # createHandle returns (status_code, handles); a non-zero status means the xclbin could not be opened
    config['xdnn_handle'], handles = xdnn.createHandle(config['xclbin'], "kernelSxdnn_0")
    if config['xdnn_handle'] != 0:
        log.error("Failed to start FPGA process - could not open xclbin %s %s!"
                  % (config['xclbin'], config['xlnxlib']))
        sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, config)
    fpgaInput = fpgaRT.getInputs()
    fpgaOutput = fpgaRT.getOutputs()

    numIters = 0
    while True:
        numIters += 1
        if maxNumIters > 0 and numIters > maxNumIters:
            break

        job = q_fpga.get()
        if job is None:
            q_bbox.put(None)  # propagate 'stop' signal downstream
            sys.exit(0)

        images = job['images']
        display = job['display']
        coco = job['coco']

        if images is not None:
            log.info("Running Image(s):")
            log.info(images)
            config['images'] = images
        else:
            log.error("Detect requires images as a parameter")
            continue

        if (config['yolo_model'] == 'xilinx_yolo_v2'
                or config['yolo_model'] == 'xilinx_prelu_yolo_v2'
                or config['yolo_model'] == 'tiny_yolo_v2_voc'):
            pass
        else:
            # Models whose tail layers run on the CPU need a Caffe net and output buffers
            out_data_shape = []
            net = caffe.Net(config['caffe_prototxt'], config['caffe_model'], caffe.TEST)
            if config['yolo_model'] == 'standard_yolo_v2':
                out_data_shape.append((config['batch_sz'],) + tuple(net.blobs['layer31-conv'].data.shape[1:4]))
            elif config['yolo_model'] == 'tiny_yolo_v2':
                out_data_shape.append((config['batch_sz'],) + tuple(net.blobs['layer15-conv'].data.shape[1:4]))
            elif config['yolo_model'] == 'tiny_yolo_v3':
                out_data_shape.append((config['batch_sz'],) + tuple(net.blobs['layer14-conv'].data.shape[1:4]))
                out_data_shape.append((config['batch_sz'],) + tuple(net.blobs['layer21-conv'].data.shape[1:4]))
            elif config['yolo_model'] == 'standard_yolo_v3':
                out_data_shape.append((config['batch_sz'],) + tuple(net.blobs['layer81-conv'].data.shape[1:4]))
                out_data_shape.append((config['batch_sz'],) + tuple(net.blobs['layer93-conv'].data.shape[1:4]))
                out_data_shape.append((config['batch_sz'],) + tuple(net.blobs['layer105-conv'].data.shape[1:4]))
            elif config['yolo_model'] == 'spp_yolo_v3':
                out_data_shape.append((config['batch_sz'],) + tuple(net.blobs['layer88-conv'].data.shape[1:4]))
                out_data_shape.append((config['batch_sz'],) + tuple(net.blobs['layer100-conv'].data.shape[1:4]))
                out_data_shape.append((config['batch_sz'],) + tuple(net.blobs['layer112-conv'].data.shape[1:4]))
            #print "out_data_shape : ", out_data_shape

            softmaxOut = []
            for list_idx in range(len(out_data_shape)):
                softmaxOut.append(np.empty(out_data_shape[list_idx]))

        firstInput = fpgaInput.itervalues().next()
        firstOutput = fpgaOutput.itervalues().next()
        maxpool_out = np.empty_like(firstOutput)

        log.info("Preparing Input...")
        shapes = []
        inputs = []
        for i, img in enumerate(images):
            firstInput[i, ...], s = xdnn_io.loadYoloImageBlobFromFile(
                img, config['in_shape'][1], config['in_shape'][2])
            shapes.append(s)
        job['shapes'] = shapes  # pass shapes to next stage

        # EXECUTE XDNN
        log.info("Running %s image(s)" % (config['batch_sz']))
        if (config['yolo_model'] == 'xilinx_yolo_v2'
                or config['yolo_model'] == 'xilinx_prelu_yolo_v2'
                or config['yolo_model'] == 'tiny_yolo_v2_voc'):
            startTime = timeit.default_timer()
            fpgaRT.execute(fpgaInput, fpgaOutput, config['PE'])
            elapsedTime = timeit.default_timer() - startTime

            # Only showing time for second run because first is loading script
            log.info("\nTotal FPGA: %f ms" % (elapsedTime * 1000))
            log.info("Image Time: (%f ms/img):" % (elapsedTime * 1000 / config['batch_sz']))
            q_bbox.put((job, firstOutput))

        elif config['yolo_model'] == 'standard_yolo_v2':
            startTime = timeit.default_timer()
            fpgaRT.execute(fpgaInput, fpgaOutput, config['PE'])
            elapsedTime = timeit.default_timer() - startTime
            #out_data_shape = (config['batch_sz'],) + tuple(net.blobs['layer31-conv'].data.shape[1:4])
            #softmaxOut = np.empty(out_data_shape)

            # Run the remaining layers on the CPU with Caffe
            startTime = timeit.default_timer()
            for bt_idx in range(config['batch_sz']):
                net.blobs['layer25-conv'].data[...] = fpgaOutput['layer25-conv'][bt_idx, ...]
                net.blobs['layer27-conv'].data[...] = fpgaOutput['layer27-conv'][bt_idx, ...]
                net.forward(start='layer28-reorg', end='layer31-conv')
                final_out = net.blobs['layer31-conv'].data[...]
                softmaxOut[0][bt_idx, ...] = final_out[...]
            elapsedTime_cpu = timeit.default_timer() - startTime

            # Only showing time for second run because first is loading script
            print(elapsedTime * 1000, (elapsedTime_cpu * 1000),
                  ((elapsedTime + elapsedTime_cpu) * 1000 / config['batch_sz']))
            log.info("\nTotal FPGA: %f ms" % (elapsedTime * 1000))
            log.info("\nTotal CPU: %f ms" % (elapsedTime_cpu * 1000))
            log.info("Image Time: (%f ms/img):"
                     % ((elapsedTime + elapsedTime_cpu) * 1000 / config['batch_sz']))
            q_bbox.put((job, softmaxOut[0]))

        elif config['yolo_model'] == 'tiny_yolo_v3':
            startTime = timeit.default_timer()
            fpgaRT.execute(fpgaInput, fpgaOutput, config['PE'])
            elapsedTime = timeit.default_timer() - startTime
            for bt_idx in range(config['batch_sz']):
                softmaxOut[0][bt_idx, ...] = fpgaOutput['layer14-conv'][bt_idx, ...]
                softmaxOut[1][bt_idx, ...] = fpgaOutput['layer21-conv'][bt_idx, ...]
            q_bbox.put((job, softmaxOut))

        elif config['yolo_model'] == 'standard_yolo_v3':
            use_fpga = 1
            if use_fpga == 1:
                startTime = timeit.default_timer()
                fpgaRT.execute(fpgaInput, fpgaOutput, config['PE'])
                elapsedTime = timeit.default_timer() - startTime

                startTime = timeit.default_timer()
                for bt_idx in range(config['batch_sz']):
                    softmaxOut[0][bt_idx, ...] = fpgaOutput['layer81-conv'][bt_idx, ...]
                    softmaxOut[1][bt_idx, ...] = fpgaOutput['layer93-conv'][bt_idx, ...]
                    softmaxOut[2][bt_idx, ...] = fpgaOutput['layer105-conv'][bt_idx, ...]
                elapsedTime_cpu = timeit.default_timer() - startTime
                print(elapsedTime * 1000, (elapsedTime_cpu * 1000),
                      ((elapsedTime + elapsedTime_cpu) * 1000 / config['batch_sz']))
            else:
                # CPU-only reference path through Caffe
                for bt_idx in range(config['batch_sz']):
                    net.blobs['data'].data[...] = firstInput[bt_idx, ...]
                    net.forward()
                    softmaxOut[0][bt_idx, ...] = net.blobs['layer81-conv'].data[...]
                    softmaxOut[1][bt_idx, ...] = net.blobs['layer93-conv'].data[...]
                    softmaxOut[2][bt_idx, ...] = net.blobs['layer105-conv'].data[...]

            # Only showing time for second run because first is loading script
            #log.info("\nTotal FPGA: %f ms" % (elapsedTime*1000))
            #log.info("\nTotal CPU: %f ms" % (elapsedTime_cpu*1000))
            #log.info("Image Time: (%f ms/img):" % ((elapsedTime+elapsedTime_cpu)*1000/config['batch_sz']))
            q_bbox.put((job, softmaxOut))

        elif config['yolo_model'] == 'spp_yolo_v3':
            startTime = timeit.default_timer()
            fpgaRT.execute(fpgaInput, fpgaOutput, config['PE'])
            elapsedTime = timeit.default_timer() - startTime

            startTime = timeit.default_timer()
            for bt_idx in range(config['batch_sz']):
                softmaxOut[0][bt_idx, ...] = fpgaOutput['layer88-conv'][bt_idx, ...]
                softmaxOut[1][bt_idx, ...] = fpgaOutput['layer100-conv'][bt_idx, ...]
                softmaxOut[2][bt_idx, ...] = fpgaOutput['layer112-conv'][bt_idx, ...]
            elapsedTime_cpu = timeit.default_timer() - startTime

            # Only showing time for second run because first is loading script
            print(elapsedTime * 1000, (elapsedTime_cpu * 1000),
                  ((elapsedTime + elapsedTime_cpu) * 1000 / config['batch_sz']))
            log.info("\nTotal FPGA: %f ms" % (elapsedTime * 1000))
            log.info("\nTotal CPU: %f ms" % (elapsedTime_cpu * 1000))
            log.info("Image Time: (%f ms/img):"
                     % ((elapsedTime + elapsedTime_cpu) * 1000 / config['batch_sz']))
            q_bbox.put((job, softmaxOut))

        elif config['yolo_model'] == 'tiny_yolo_v2':
            startTime = timeit.default_timer()
            fpgaRT.execute(fpgaInput, fpgaOutput, config['PE'])
            elapsedTime = timeit.default_timer() - startTime

            # Apply the darknet-style 2x2/stride-1 maxpool, then finish on the CPU with Caffe
            darknet_maxpool_k2x2_s1(firstOutput, maxpool_out)
            for bt_idx in range(config['batch_sz']):
                net.blobs['data'].data[...] = maxpool_out[bt_idx, ...]
                net.forward()
                final_out = net.blobs['layer15-conv'].data[...]
                softmaxOut[0][bt_idx, ...] = final_out[...]
            elapsedTime_cpu = timeit.default_timer() - startTime

            print(elapsedTime * 1000, (elapsedTime_cpu * 1000),
                  ((elapsedTime + elapsedTime_cpu) * 1000 / config['batch_sz']))
            log.info("\nTotal FPGA: %f ms" % (elapsedTime * 1000))
            log.info("\nTotal CPU: %f ms" % (elapsedTime_cpu * 1000))
            log.info("Image Time: (%f ms/img):"
                     % ((elapsedTime + elapsedTime_cpu) * 1000 / config['batch_sz']))
            q_bbox.put((job, softmaxOut[0]))

        else:
            print("model not supported")
def fpga_process(fpgaRT, args, num_img, compJson, shared_trans_arrs, shared_output_arrs):
    if fpgaRT is None:
        ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", [args["deviceID"]])
        if ret != 0:
            sys.exit(1)
        fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    else:
        print("fpga process handle was ready:")

    qWait = mp.Queue(maxsize=100)

    numStreams = args['numstream']
    bsz = args['batch_sz']
    input_ptrs = [[] for i in range(numStreams)]

    numProcessed = 0
    # Companion thread that waits for async results and recycles buffer slots
    t = threading.Thread(target=fpga_wait,
                         args=(fpgaRT, qWait, shared_output_arrs, shared_trans_arrs))
    t.start()

    input_shapes = map(lambda x: (x), compJson.getInputs().itervalues())
    output_shapes = map(lambda x: (x), compJson.getOutputs().itervalues())
    InputName_list = map(lambda x: str(x), compJson.getInputs().iterkeys())
    OutputName_list = map(lambda x: str(x), compJson.getOutputs().iterkeys())
    num_inputs = len(input_shapes)
    num_outputs = len(output_shapes)

    startTime = time.time()
    while numProcessed < num_img or args['perpetual']:
        write_slot = shared_output_arrs.openWriteId()
        write_slot_arrs = shared_output_arrs.accessNumpyBuffer(write_slot)

        in_dict = {}
        out_dict = {}
        for out_idx in range(num_outputs):
            out_dict[OutputName_list[out_idx]] = write_slot_arrs[out_idx]

        # Gather up to batch_sz preprocessed images from the shared input buffers
        read_slot_arrs_list = []
        read_slot_list = []
        for img_num in range(args['batch_sz']):
            read_slot = shared_trans_arrs.openReadId()
            if read_slot is None:
                break
            read_slot_arrs = shared_trans_arrs.accessNumpyBuffer(read_slot)
            read_slot_arrs_list.append(read_slot_arrs)
            read_slot_list.append(read_slot)
            write_slot_arrs[-1][img_num][:] = read_slot_arrs[-1][:]
            numProcessed += 1
            if (args['perpetual'] == False):
                if numProcessed == num_img:
                    break

        images_added = len(read_slot_arrs_list)
        # when the number of images available is less than the batch size,
        # fill the rest of the out buffer image-id slots with -1
        for img_num in range(images_added, args['batch_sz']):
            write_slot_arrs[-1][img_num][:] = -1

        for in_idx in range(num_inputs):
            in_dict[InputName_list[in_idx]] = []
            for img_idx in range(len(read_slot_arrs_list)):
                in_dict[InputName_list[in_idx]].append(read_slot_arrs_list[img_idx][in_idx])

        fpgaRT.exec_async(in_dict, out_dict, write_slot)
        qWait.put((write_slot, read_slot_list, img_num))
        #shared_trans_arrs.closeReadId(read_slot)

    qWait.put((None, None, None))
    t.join()
    elapsedTime = (time.time() - startTime)
    print("FPGA_process: ", float(numProcessed) / elapsedTime, "img/s")
    xdnn.closeHandle()
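# --- Hypothetical sketch (not from the original source) of the fpga_wait companion
# thread that fpga_process() starts above. It drains qWait, waits for each async FPGA
# call, and releases the consumed input slots; closeReadId is taken from the
# commented-out call above, while the step that hands the write slot to the next stage
# (shown here as closeWriteId) is an assumed method name on the shared-buffer helper.
def fpga_wait(fpgaRT, qWait, shared_output_arrs, shared_trans_arrs):
    while True:
        write_slot, read_slot_list, img_num = qWait.get()
        if write_slot is None:  # (None, None, None) is the stop sentinel
            break
        fpgaRT.get_result(write_slot)                  # wait for the async job on this slot
        for read_slot in read_slot_list:
            shared_trans_arrs.closeReadId(read_slot)   # release consumed input slots
        shared_output_arrs.closeWriteId(write_slot)    # assumed: publish results downstream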